## 🚶 Customer Behavior Prediction

Given *data about customers*, let's try to predict whether a given customer will **purchase** the product being offered.

We will use a logistic regression model to make our predictions.

Data source: https://www.kaggle.com/datasets/denisadutca/customer-behaviour

### Importing Libraries

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression

In [2]:
# Load the customer behaviour CSV (relative path, resolved against the kernel's working directory)
data = pd.read_csv('Customer_Behaviour.csv')
# Bare last expression so the notebook renders the frame
data

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


In [3]:
# Print column dtypes and non-null counts for the raw frame
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          400 non-null    int64 
 1   Gender           400 non-null    object
 2   Age              400 non-null    int64 
 3   EstimatedSalary  400 non-null    int64 
 4   Purchased        400 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 15.8+ KB


### Preprocessing

In [4]:
# Work on an independent (deep) copy so the raw `data` frame is left untouched
df = data.copy(deep=True)
df

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


In [5]:
# Remove the User ID column from the working frame
df = df.drop(columns=['User ID'])
df

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
0,Male,19,19000,0
1,Male,35,20000,0
2,Female,26,43000,0
3,Female,27,57000,0
4,Male,19,76000,0
...,...,...,...,...
395,Female,46,41000,1
396,Male,51,23000,1
397,Female,50,20000,1
398,Male,36,33000,0


In [6]:
# List the distinct Gender labels present in the column
df['Gender'].unique()

array(['Male', 'Female'], dtype=object)

In [7]:
# Binary encode Gender column (Female -> 0, Male -> 1).
# Use `.map` on the untouched raw frame instead of `.replace` on the object
# column: `.replace` relies on pandas' deprecated silent downcasting (which
# emits a FutureWarning), and reading from `data` keeps this cell safe to re-run.
gender_codes = {'Female': 0, 'Male': 1}
df['Gender'] = data['Gender'].map(gender_codes)
# `.map` yields NaN for any label missing from the mapping, so fail loudly here
assert df['Gender'].notna().all(), "unexpected Gender label"
df

  df['Gender'] = df['Gender'].replace({'Female': 0, 'Male': 1})


Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
0,1,19,19000,0
1,1,35,20000,0
2,0,26,43000,0
3,0,27,57000,0
4,1,19,76000,0
...,...,...,...,...
395,0,46,41000,1
396,1,51,23000,1
397,0,50,20000,1
398,1,36,33000,0


In [8]:
# Separate the feature columns (X) from the target column (y)
target_column = 'Purchased'
X = df.drop(columns=target_column)
y = df[target_column]

In [9]:
# Shuffle and split: 70% of rows for training, the rest for testing.
# The fixed random_state keeps the split reproducible across runs.
split = train_test_split(
    X,
    y,
    train_size=0.7,
    shuffle=True,
    random_state=1,
)
X_train, X_test, y_train, y_test = split
X_train

Unnamed: 0,Gender,Age,EstimatedSalary
39,0,27,31000
167,0,35,71000
383,1,49,28000
221,1,35,91000
351,1,37,75000
...,...,...,...
255,0,52,90000
72,0,20,23000
396,1,51,23000
235,1,46,79000


In [10]:
# Standardize the features: the scaler is fitted on the training rows only
# and then applied to both splits, preserving each frame's index and columns.
scaler = StandardScaler().fit(X_train)
X_train, X_test = [
    pd.DataFrame(scaler.transform(part), index=part.index, columns=part.columns)
    for part in (X_train, X_test)
]
X_train

Unnamed: 0,Gender,Age,EstimatedSalary
39,-0.992882,-0.979100,-1.110590
167,-0.992882,-0.203575,0.056697
383,1.007169,1.153594,-1.198137
221,1.007169,-0.203575,0.640340
351,1.007169,-0.009694,0.173426
...,...,...,...
255,-0.992882,1.444415,0.611158
72,-0.992882,-1.657685,-1.344048
396,1.007169,1.347475,-1.344048
235,1.007169,0.862772,0.290154


### Training and Results (No Feature Engineering)

In [11]:
# Fit a default logistic regression on the training split
model = LogisticRegression().fit(X_train, y_train)

# Mean accuracy on the held-out test split, reported as a percentage
acc = model.score(X_test, y_test)
report_template = "Test Accuracy: {:.3f}%"
print(report_template.format(acc * 100))

Test Accuracy: 80.833%


### Feature Engineering

In [12]:
# 95th percentile of EstimatedSalary across the working frame
df['EstimatedSalary'].quantile(0.95)

137049.99999999994

In [13]:
# Flag rows whose EstimatedSalary is at or above the 95th percentile (1) or below it (0)
income_threshold = df['EstimatedSalary'].quantile(q=0.95)
df['High Income'] = df['EstimatedSalary'].ge(income_threshold).astype('int64')
df

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased,High Income
0,1,19,19000,0,0
1,1,35,20000,0,0
2,0,26,43000,0,0
3,0,27,57000,0,0
4,1,19,76000,0,0
...,...,...,...,...,...
395,0,46,41000,1,0
396,1,51,23000,1,0
397,0,50,20000,1,0
398,1,36,33000,0,0


### Training/Results with Feature Engineering

In [14]:
# Rebuild X/y from the current working frame, then repeat the same
# 70/30 seeded split and train-fitted standardization as before.
X = df.drop(columns='Purchased')
y = df['Purchased']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, shuffle=True, random_state=1
)
scaler = StandardScaler().fit(X_train)
X_train, X_test = [
    pd.DataFrame(scaler.transform(part), index=part.index, columns=part.columns)
    for part in (X_train, X_test)
]
X_train

Unnamed: 0,Gender,Age,EstimatedSalary,High Income
39,-0.992882,-0.979100,-1.110590,-0.246183
167,-0.992882,-0.203575,0.056697,-0.246183
383,1.007169,1.153594,-1.198137,-0.246183
221,1.007169,-0.203575,0.640340,-0.246183
351,1.007169,-0.009694,0.173426,-0.246183
...,...,...,...,...
255,-0.992882,1.444415,0.611158,-0.246183
72,-0.992882,-1.657685,-1.344048,-0.246183
396,1.007169,1.347475,-1.344048,-0.246183
235,1.007169,0.862772,0.290154,-0.246183


In [15]:
# Refit a default logistic regression on the current training split
model = LogisticRegression().fit(X_train, y_train)
# Mean accuracy on the held-out test split, reported as a percentage
acc = model.score(X_test, y_test)
accuracy_pct = acc * 100
print("Test Accuracy: {:.3f}%".format(accuracy_pct))

Test Accuracy: 81.667%


In [16]:
# Display the working frame in its current state
df

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased,High Income
0,1,19,19000,0,0
1,1,35,20000,0,0
2,0,26,43000,0,0
3,0,27,57000,0,0
4,1,19,76000,0,0
...,...,...,...,...,...
395,0,46,41000,1,0
396,1,51,23000,1,0
397,0,50,20000,1,0
398,1,36,33000,0,0


In [17]:
# Age cut-offs: 75th percentile for "old", 25th percentile for "young"
old_age_threshold = df['Age'].quantile(0.75)
young_age_threshold = df['Age'].quantile(0.25)
# 1 when Age is at/above the upper cut-off (resp. at/below the lower one), else 0
df['Old Age'] = df['Age'].ge(old_age_threshold).astype('int64')
df['Young Age'] = df['Age'].le(young_age_threshold).astype('int64')

In [18]:
# Rebuild X/y from the current working frame, split 70/30 with the same seed,
# and standardize both splits with a scaler fitted on the training rows.
label_name = 'Purchased'
y = df[label_name]
X = df.drop(columns=[label_name])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    shuffle=True,
    random_state=1,
)
scaler = StandardScaler().fit(X_train)
scaled_train = scaler.transform(X_train)
scaled_test = scaler.transform(X_test)
X_train = pd.DataFrame(scaled_train, index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(scaled_test, index=X_test.index, columns=X_test.columns)
X_train

Unnamed: 0,Gender,Age,EstimatedSalary,High Income,Old Age,Young Age
39,-0.992882,-0.979100,-1.110590,-0.246183,-0.555348,1.653280
167,-0.992882,-0.203575,0.056697,-0.246183,-0.555348,-0.604858
383,1.007169,1.153594,-1.198137,-0.246183,1.800673,-0.604858
221,1.007169,-0.203575,0.640340,-0.246183,-0.555348,-0.604858
351,1.007169,-0.009694,0.173426,-0.246183,-0.555348,-0.604858
...,...,...,...,...,...,...
255,-0.992882,1.444415,0.611158,-0.246183,1.800673,-0.604858
72,-0.992882,-1.657685,-1.344048,-0.246183,-0.555348,1.653280
396,1.007169,1.347475,-1.344048,-0.246183,1.800673,-0.604858
235,1.007169,0.862772,0.290154,-0.246183,1.800673,-0.604858


In [19]:
# Refit a default logistic regression on the current training split
model = LogisticRegression().fit(X_train, y_train)
# Mean accuracy on the held-out test split, printed as a percentage
acc = model.score(X_test, y_test)
summary_line = "Test Accuracy: {:.3f}%".format(acc * 100)
print(summary_line)

Test Accuracy: 85.000%
