<a href="https://colab.research.google.com/github/binhluong84/Machine-Learning/blob/main/Customer_Behavior_Prediction_(Feature_Engineering).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Connect Google Drive
from google.colab import drive
drive.mount('/gdrive')


Mounted at /gdrive


In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression

In [3]:
data = pd.read_csv('/gdrive/MyDrive/Customer Behavior Prediction (Feature Engineering)/archive/Customer_Behaviour.csv')

In [4]:
data

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          400 non-null    int64 
 1   Gender           400 non-null    object
 2   Age              400 non-null    int64 
 3   EstimatedSalary  400 non-null    int64 
 4   Purchased        400 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 15.8+ KB


In [6]:
def preprocess_inputs(df, engineer_features=False):
    df = df.copy()
    
    # Drop User ID column
    df = df.drop('User ID', axis=1)
    
    # Binary encode
    df['Gender'] = df['Gender'].replace({'Female': 0, 'Male': 1})
    
    # Feature engineering
    if engineer_features == True:
        income_threshold = df['EstimatedSalary'].quantile(0.95)
        df['High Income'] = df['EstimatedSalary'].apply(lambda x: 1 if x >= income_threshold else 0)
        
        old_age_threshold = df['Age'].quantile(0.75)
        df['Old Age'] = df['Age'].apply(lambda x: 1 if x >= old_age_threshold else 0)
        
        young_age_threshold = df['Age'].quantile(0.25)
        df['Young Age'] = df['Age'].apply(lambda x: 1 if x <= young_age_threshold else 0)
        
    # Split df into X and y
    y = df['Purchased']
    X = df.drop('Purchased', axis=1)
    
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=1)
    
    # Scale X
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)
    
    return X_train, X_test, y_train, y_test

In [7]:
X_train, X_test, y_train, y_test = preprocess_inputs(data, engineer_features=False)
X_train

Unnamed: 0,Gender,Age,EstimatedSalary
39,-0.992882,-0.979100,-1.110590
167,-0.992882,-0.203575,0.056697
383,1.007169,1.153594,-1.198137
221,1.007169,-0.203575,0.640340
351,1.007169,-0.009694,0.173426
...,...,...,...
255,-0.992882,1.444415,0.611158
72,-0.992882,-1.657685,-1.344048
396,1.007169,1.347475,-1.344048
235,1.007169,0.862772,0.290154


In [8]:
model = LogisticRegression()
model.fit(X_train, y_train)

acc = model.score(X_test, y_test)

print("Test Accuracy: {:.3f}%".format(acc * 100))

Test Accuracy: 80.833%


In [9]:
X_train, X_test, y_train, y_test = preprocess_inputs(data, engineer_features=True)
X_train

Unnamed: 0,Gender,Age,EstimatedSalary,High Income,Old Age,Young Age
39,-0.992882,-0.979100,-1.110590,-0.246183,-0.555348,1.653280
167,-0.992882,-0.203575,0.056697,-0.246183,-0.555348,-0.604858
383,1.007169,1.153594,-1.198137,-0.246183,1.800673,-0.604858
221,1.007169,-0.203575,0.640340,-0.246183,-0.555348,-0.604858
351,1.007169,-0.009694,0.173426,-0.246183,-0.555348,-0.604858
...,...,...,...,...,...,...
255,-0.992882,1.444415,0.611158,-0.246183,1.800673,-0.604858
72,-0.992882,-1.657685,-1.344048,-0.246183,-0.555348,1.653280
396,1.007169,1.347475,-1.344048,-0.246183,1.800673,-0.604858
235,1.007169,0.862772,0.290154,-0.246183,1.800673,-0.604858


In [10]:
model = LogisticRegression()
model.fit(X_train, y_train)

acc = model.score(X_test, y_test)

print("Test Accuracy: {:.3f}%".format(acc * 100))

Test Accuracy: 85.000%
