In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
dataset = pd.read_csv('Datasets/customer_behavior_dataset.csv')
dataset.head()

Unnamed: 0,Age,Gender,Income (in $1000s),Online Hours Per Week,Purchase Frequency (Last Month),Is Loyal Customer,Preferred Shopping Platform,Ad Clicks (Last Month),Discounts Used (Last Month),Competitor Pricing Index,Economic Conditions,Distance to Nearest Store (km),Shopping Frequency,"Purchase Decision (1=Yes, 0=No)"
0,56,Female,91.97,49,7,0,Website,1,9,0.83,Neutral,43.09,1.68536,1
1,69,Male,22.29,1,2,1,Website,9,3,1.13,Neutral,16.78,15.337143,1
2,46,Female,50.59,3,7,0,Website,0,2,1.06,Favorable,9.8,17.53005,1
3,32,Male,58.63,49,14,1,In-store,0,8,0.94,Neutral,2.78,28.540342,1
4,60,Female,44.87,27,0,0,Mobile App,7,9,0.87,Unfavorable,49.22,1.0,1


In [3]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 14 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Age                              5000 non-null   int64  
 1   Gender                           5000 non-null   object 
 2   Income (in $1000s)               4800 non-null   float64
 3   Online Hours Per Week            5000 non-null   int64  
 4   Purchase Frequency (Last Month)  5000 non-null   int64  
 5   Is Loyal Customer                5000 non-null   int64  
 6   Preferred Shopping Platform      5000 non-null   object 
 7   Ad Clicks (Last Month)           5000 non-null   int64  
 8   Discounts Used (Last Month)      5000 non-null   int64  
 9   Competitor Pricing Index         5000 non-null   float64
 10  Economic Conditions              4850 non-null   object 
 11  Distance to Nearest Store (km)   5000 non-null   float64
 12  Shopping Frequency  

In [4]:
# Since missing values are low dropping is best opion
dataset = dataset.dropna()
dataset.shape

(4660, 14)

In [5]:
categorical_features = ['Gender','Preferred Shopping Platform', 'Economic Conditions']
for f in categorical_features:
    print(f'{f}: {dataset[f].unique()}')

Gender: ['Female' 'Male']
Preferred Shopping Platform: ['Website' 'In-store' 'Mobile App']
Economic Conditions: ['Neutral' 'Favorable' 'Unfavorable']


In [6]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
ohe = OneHotEncoder(sparse_output=False)
one_hot_encoded = ohe.fit_transform(dataset[['Gender']])
ohe_hot_encoded_df = pd.DataFrame(one_hot_encoded, columns=ohe.get_feature_names_out(['Gender']))
le = LabelEncoder()
encoded_dataset1 = dataset.copy()
for f in categorical_features:
    if f != 'Gender':  
        encoded_dataset1[f] = le.fit_transform(encoded_dataset1[f])

encoded_dataset = pd.concat(
    [encoded_dataset1.drop(columns=['Gender'], axis=1).reset_index(drop=True), ohe_hot_encoded_df.reset_index(drop=True)], axis=1)

In [7]:
X, y = encoded_dataset.drop(columns=['Purchase Decision (1=Yes, 0=No)'], axis=1).values, encoded_dataset['Purchase Decision (1=Yes, 0=No)'].values
X, y

(array([[56.        , 91.97      , 49.        , ...,  1.68536028,
          1.        ,  0.        ],
        [69.        , 22.29      ,  1.        , ..., 15.33714311,
          0.        ,  1.        ],
        [46.        , 50.59      ,  3.        , ..., 17.53005006,
          1.        ,  0.        ],
        ...,
        [26.        , 53.61      ,  9.        , ...,  1.        ,
          0.        ,  1.        ],
        [53.        , 42.04      ,  5.        , ..., 16.73424535,
          1.        ,  0.        ],
        [36.        , 55.16      ,  6.        , ..., 11.57171689,
          1.        ,  0.        ]], shape=(4660, 14)),
 array([1, 1, 1, ..., 1, 1, 1], shape=(4660,)))

In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
scaler = StandardScaler()
X_train_scaled, X_test_scaled = scaler.fit_transform(X_train), scaler.transform(X_test)
X_train_scaled, X_test_scaled, y_train, y_test

(array([[-0.37682941,  1.2497109 , -0.99712663, ...,  1.77213258,
         -0.99145283,  0.99145283],
        [ 0.75095395,  0.50765692, -0.92815366, ...,  1.80100262,
          1.00862085, -1.00862085],
        [ 1.34801573, -0.42815006,  0.17541377, ...,  0.55682696,
         -0.99145283,  0.99145283],
        ...,
        [-0.31048921,  0.4192696 , -0.03150512, ...,  1.31166512,
         -0.99145283,  0.99145283],
        [ 0.35291277,  0.18756499,  1.55487306, ...,  1.64263582,
          1.00862085, -1.00862085],
        [-1.04023138,  0.42725941,  0.45130563, ...,  0.78783563,
         -0.99145283,  0.99145283]], shape=(3262, 14)),
 array([[ 1.14899514, -0.77570697,  0.45130563, ...,  1.54416   ,
         -0.99145283,  0.99145283],
        [ 0.35291277,  0.90964444,  0.79617045, ..., -1.56080825,
          1.00862085, -1.00862085],
        [ 0.81729415,  0.11915221, -1.06609959, ...,  0.14621527,
         -0.99145283,  0.99145283],
        ...,
        [ 1.28167553,  0.68592964,  

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
logiregressor = LogisticRegression()
logiregressor.fit(X_train_scaled, y_train)
y_pred=logiregressor.predict(X_test_scaled)
print(f'accuracy: {accuracy_score(y_pred,y_test)}')
print(classification_report(y_pred,y_test))
print(confusion_matrix(y_pred,y_test))

accuracy: 0.9914163090128756
              precision    recall  f1-score   support

           0       0.08      1.00      0.14         1
           1       1.00      0.99      1.00      1397

    accuracy                           0.99      1398
   macro avg       0.54      1.00      0.57      1398
weighted avg       1.00      0.99      1.00      1398

[[   1    0]
 [  12 1385]]


# Lets try without dropping NAN values instead adding mode or mean

In [10]:
dataset = pd.read_csv('Datasets/customer_behavior_dataset.csv')
dataset.head()

Unnamed: 0,Age,Gender,Income (in $1000s),Online Hours Per Week,Purchase Frequency (Last Month),Is Loyal Customer,Preferred Shopping Platform,Ad Clicks (Last Month),Discounts Used (Last Month),Competitor Pricing Index,Economic Conditions,Distance to Nearest Store (km),Shopping Frequency,"Purchase Decision (1=Yes, 0=No)"
0,56,Female,91.97,49,7,0,Website,1,9,0.83,Neutral,43.09,1.68536,1
1,69,Male,22.29,1,2,1,Website,9,3,1.13,Neutral,16.78,15.337143,1
2,46,Female,50.59,3,7,0,Website,0,2,1.06,Favorable,9.8,17.53005,1
3,32,Male,58.63,49,14,1,In-store,0,8,0.94,Neutral,2.78,28.540342,1
4,60,Female,44.87,27,0,0,Mobile App,7,9,0.87,Unfavorable,49.22,1.0,1


In [11]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 14 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Age                              5000 non-null   int64  
 1   Gender                           5000 non-null   object 
 2   Income (in $1000s)               4800 non-null   float64
 3   Online Hours Per Week            5000 non-null   int64  
 4   Purchase Frequency (Last Month)  5000 non-null   int64  
 5   Is Loyal Customer                5000 non-null   int64  
 6   Preferred Shopping Platform      5000 non-null   object 
 7   Ad Clicks (Last Month)           5000 non-null   int64  
 8   Discounts Used (Last Month)      5000 non-null   int64  
 9   Competitor Pricing Index         5000 non-null   float64
 10  Economic Conditions              4850 non-null   object 
 11  Distance to Nearest Store (km)   5000 non-null   float64
 12  Shopping Frequency  

In [12]:
dataset['Income (in $1000s)'] = dataset['Income (in $1000s)'].fillna(dataset['Income (in $1000s)'].mean())
dataset['Economic Conditions'] = dataset['Economic Conditions'].fillna(dataset['Economic Conditions'].mode()[0])
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 14 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Age                              5000 non-null   int64  
 1   Gender                           5000 non-null   object 
 2   Income (in $1000s)               5000 non-null   float64
 3   Online Hours Per Week            5000 non-null   int64  
 4   Purchase Frequency (Last Month)  5000 non-null   int64  
 5   Is Loyal Customer                5000 non-null   int64  
 6   Preferred Shopping Platform      5000 non-null   object 
 7   Ad Clicks (Last Month)           5000 non-null   int64  
 8   Discounts Used (Last Month)      5000 non-null   int64  
 9   Competitor Pricing Index         5000 non-null   float64
 10  Economic Conditions              5000 non-null   object 
 11  Distance to Nearest Store (km)   5000 non-null   float64
 12  Shopping Frequency  

In [13]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
ohe = OneHotEncoder(sparse_output=False)
one_hot_encoded = ohe.fit_transform(dataset[['Gender']])
ohe_hot_encoded_df = pd.DataFrame(one_hot_encoded, columns=ohe.get_feature_names_out(['Gender']))
le = LabelEncoder()
encoded_dataset1 = dataset.copy()
for f in categorical_features:
    if f != 'Gender':  
        encoded_dataset1[f] = le.fit_transform(encoded_dataset1[f])

encoded_dataset = pd.concat(
    [encoded_dataset1.drop(columns=['Gender'], axis=1).reset_index(drop=True), ohe_hot_encoded_df.reset_index(drop=True)], axis=1)

In [14]:
encoded_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 15 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Age                              5000 non-null   int64  
 1   Income (in $1000s)               5000 non-null   float64
 2   Online Hours Per Week            5000 non-null   int64  
 3   Purchase Frequency (Last Month)  5000 non-null   int64  
 4   Is Loyal Customer                5000 non-null   int64  
 5   Preferred Shopping Platform      5000 non-null   int64  
 6   Ad Clicks (Last Month)           5000 non-null   int64  
 7   Discounts Used (Last Month)      5000 non-null   int64  
 8   Competitor Pricing Index         5000 non-null   float64
 9   Economic Conditions              5000 non-null   int64  
 10  Distance to Nearest Store (km)   5000 non-null   float64
 11  Shopping Frequency               5000 non-null   float64
 12  Purchase Decision (1

In [15]:
X, y = encoded_dataset.drop(columns=['Purchase Decision (1=Yes, 0=No)'], axis=1).values, encoded_dataset['Purchase Decision (1=Yes, 0=No)'].values
X, y

(array([[56.        , 91.97      , 49.        , ...,  1.68536028,
          1.        ,  0.        ],
        [69.        , 22.29      ,  1.        , ..., 15.33714311,
          0.        ,  1.        ],
        [46.        , 50.59      ,  3.        , ..., 17.53005006,
          1.        ,  0.        ],
        ...,
        [26.        , 53.61      ,  9.        , ...,  1.        ,
          0.        ,  1.        ],
        [53.        , 42.04      ,  5.        , ..., 16.73424535,
          1.        ,  0.        ],
        [36.        , 55.16      ,  6.        , ..., 11.57171689,
          1.        ,  0.        ]], shape=(5000, 14)),
 array([1, 1, 1, ..., 1, 1, 1], shape=(5000,)))

In [16]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
scaler = StandardScaler()
X_train_scaled, X_test_scaled = scaler.fit_transform(X_train), scaler.transform(X_test)
X_train_scaled, X_test_scaled, y_train, y_test

(array([[-0.91132783, -0.69066401, -0.97738846, ...,  0.62440367,
         -0.96628239,  0.96628239],
        [-0.84481864,  1.23577697, -0.42499096, ..., -0.96644206,
          1.03489416, -1.03489416],
        [-1.31038292, -0.84827728,  1.57744997, ..., -0.31426032,
          1.03489416, -1.03489416],
        ...,
        [ 0.48536501, -1.59312712, -0.90833877, ...,  0.34414191,
          1.03489416, -1.03489416],
        [-1.64292884,  2.03757095,  0.61075435, ..., -1.38317761,
         -0.96628239,  0.96628239],
        [ 0.48536501, -1.79599066, -0.07974252, ..., -1.10516458,
          1.03489416, -1.03489416]], shape=(3500, 14)),
 array([[ 1.01743848,  0.53160153, -0.14879221, ...,  1.44056795,
          1.03489416, -1.03489416],
        [-1.24387374, -1.56109603,  1.50840028, ...,  0.55237516,
         -0.96628239,  0.96628239],
        [-0.37925436, -0.03173883, -1.66788533, ...,  0.988138  ,
         -0.96628239,  0.96628239],
        ...,
        [ 0.41885583,  1.4503344 , -

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
logiregressor = LogisticRegression()
logiregressor.fit(X_train_scaled, y_train)
y_pred=logiregressor.predict(X_test_scaled)
print(f'accuracy: {accuracy_score(y_pred,y_test)}')
print(classification_report(y_pred,y_test))
print(confusion_matrix(y_pred,y_test))

accuracy: 0.992
              precision    recall  f1-score   support

           0       0.09      0.33      0.14         3
           1       1.00      0.99      1.00      1497

    accuracy                           0.99      1500
   macro avg       0.54      0.66      0.57      1500
weighted avg       1.00      0.99      0.99      1500

[[   1    2]
 [  10 1487]]


# Getting High accuracy means dataset might be imbalanced

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
logiregressor = LogisticRegression(class_weight='balanced')
logiregressor.fit(X_train_scaled, y_train)
y_pred=logiregressor.predict(X_test_scaled)
print(f'accuracy: {accuracy_score(y_pred,y_test)}')
print(classification_report(y_pred,y_test))
print(confusion_matrix(y_pred,y_test))

accuracy: 0.882
              precision    recall  f1-score   support

           0       0.64      0.04      0.07       180
           1       0.88      1.00      0.94      1320

    accuracy                           0.88      1500
   macro avg       0.76      0.52      0.51      1500
weighted avg       0.85      0.88      0.83      1500

[[   7  173]
 [   4 1316]]


# Now we will do hyper parameter tuning

In [19]:
from sklearn.model_selection import GridSearchCV
parameters = {
    'penalty':['l1', 'l2', 'elasticnet', None],
    'fit_intercept': [False, True],
    'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
    }
hyperlogiregressor = LogisticRegression(class_weight='balanced')
clf = GridSearchCV(hyperlogiregressor, parameters)
clf.fit(X_train_scaled, y_train)

Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Matrix is singular.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Matrix is singular.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Matrix is singular.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Matrix is singular.
Further opti

In [21]:
print(clf.best_params_)
print(clf.best_score_)

{'fit_intercept': True, 'penalty': None, 'solver': 'sag'}
0.9014285714285715


In [22]:
y_pred=clf.predict(X_test_scaled)
print(f'accuracy: {accuracy_score(y_pred,y_test)}')
print(classification_report(y_pred,y_test))
print(confusion_matrix(y_pred,y_test))

accuracy: 0.824
              precision    recall  f1-score   support

           0       0.82      0.03      0.06       271
           1       0.82      1.00      0.90      1229

    accuracy                           0.82      1500
   macro avg       0.82      0.52      0.48      1500
weighted avg       0.82      0.82      0.75      1500

[[   9  262]
 [   2 1227]]
