Shopping Data

In [196]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [197]:
# Read the data
customer_purchases = pd.read_csv("customer_purchase_data.csv")
customer_purchases

Unnamed: 0,Age,Gender,AnnualIncome,NumberOfPurchases,ProductCategory,TimeSpentOnWebsite,LoyaltyProgram,DiscountsAvailed,PurchaseStatus
0,40,1,66120.267939,8,0,30.568601,0,5,1
1,20,1,23579.773583,4,2,38.240097,0,5,0
2,27,1,127821.306432,11,2,31.633212,1,0,1
3,24,1,137798.623120,19,3,46.167059,0,4,1
4,31,1,99300.964220,19,1,19.823592,0,0,1
...,...,...,...,...,...,...,...,...,...
1495,39,1,65048.141834,13,0,34.590743,0,5,1
1496,67,1,28775.331069,18,2,17.625707,0,1,1
1497,40,1,57363.247541,7,4,12.206033,0,0,0
1498,63,0,134021.775532,16,2,37.311634,1,0,1


In [198]:
# look at nulls and data types
customer_purchases.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Age                 1500 non-null   int64  
 1   Gender              1500 non-null   int64  
 2   AnnualIncome        1500 non-null   float64
 3   NumberOfPurchases   1500 non-null   int64  
 4   ProductCategory     1500 non-null   int64  
 5   TimeSpentOnWebsite  1500 non-null   float64
 6   LoyaltyProgram      1500 non-null   int64  
 7   DiscountsAvailed    1500 non-null   int64  
 8   PurchaseStatus      1500 non-null   int64  
dtypes: float64(2), int64(7)
memory usage: 105.6 KB


In [199]:
# count the NAs
customer_purchases.isna().sum()

Age                   0
Gender                0
AnnualIncome          0
NumberOfPurchases     0
ProductCategory       0
TimeSpentOnWebsite    0
LoyaltyProgram        0
DiscountsAvailed      0
PurchaseStatus        0
dtype: int64

In [200]:
# Remove duplicates
customer_purchases = customer_purchases.drop_duplicates()
customer_purchases

Unnamed: 0,Age,Gender,AnnualIncome,NumberOfPurchases,ProductCategory,TimeSpentOnWebsite,LoyaltyProgram,DiscountsAvailed,PurchaseStatus
0,40,1,66120.267939,8,0,30.568601,0,5,1
1,20,1,23579.773583,4,2,38.240097,0,5,0
2,27,1,127821.306432,11,2,31.633212,1,0,1
3,24,1,137798.623120,19,3,46.167059,0,4,1
4,31,1,99300.964220,19,1,19.823592,0,0,1
...,...,...,...,...,...,...,...,...,...
1495,39,1,65048.141834,13,0,34.590743,0,5,1
1496,67,1,28775.331069,18,2,17.625707,0,1,1
1497,40,1,57363.247541,7,4,12.206033,0,0,0
1498,63,0,134021.775532,16,2,37.311634,1,0,1


In [201]:
# changing Dtypes for categorical variables
customer_purchases = customer_purchases.astype({'Gender': 'category', 'ProductCategory': 'category', 
           'LoyaltyProgram': 'category', 'PurchaseStatus': 'category'})

customer_purchases.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1388 entries, 0 to 1499
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   Age                 1388 non-null   int64   
 1   Gender              1388 non-null   category
 2   AnnualIncome        1388 non-null   float64 
 3   NumberOfPurchases   1388 non-null   int64   
 4   ProductCategory     1388 non-null   category
 5   TimeSpentOnWebsite  1388 non-null   float64 
 6   LoyaltyProgram      1388 non-null   category
 7   DiscountsAvailed    1388 non-null   int64   
 8   PurchaseStatus      1388 non-null   category
dtypes: category(4), float64(2), int64(3)
memory usage: 71.1 KB


In [202]:
# Select only numerical columns
numerical_cols = customer_purchases.select_dtypes(include=['float64', 'int64']).columns

# Apply StandardScaler only to numerical columns
scaler = StandardScaler()
customer_purchases[numerical_cols] = scaler.fit_transform(customer_purchases[numerical_cols])

customer_purchases

Unnamed: 0,Age,Gender,AnnualIncome,NumberOfPurchases,ProductCategory,TimeSpentOnWebsite,LoyaltyProgram,DiscountsAvailed,PurchaseStatus
0,-0.254456,1,-0.495070,-0.434443,0,-0.010544,0,1.406691,1
1,-1.546283,1,-1.628648,-1.116191,2,0.441498,0,1.406691,0
2,-1.094144,1,1.149081,0.076869,2,0.052188,1,-1.535574,1
3,-1.287918,1,1.414947,1.440365,3,0.908594,0,0.818238,1
4,-0.835778,1,0.389098,1.440365,1,-0.643694,0,-1.535574,1
...,...,...,...,...,...,...,...,...,...
1495,-0.319048,1,-0.523639,0.417743,0,0.226460,0,1.406691,1
1496,1.489510,1,-1.490202,1.269928,2,-0.773204,0,-0.947121,1
1497,-0.254456,1,-0.728418,-0.604880,4,-1.092558,0,-1.535574,0
1498,1.231144,0,1.314305,0.929054,2,0.386789,1,-1.535574,1


## Model

In [203]:
y = customer_purchases["PurchaseStatus"]
X = customer_purchases.drop(columns="PurchaseStatus")

In [204]:
# Split our training and testing
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
X_train.shape

(1041, 8)

In [205]:
# Creating logistic regression model
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=1000,
                                random_state=1)
classifier

In [206]:
# fit model with training data
classifier.fit(X_train, y_train)

In [207]:
# Score the model
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.8174831892411143
Testing Data Score: 0.8126801152737753


In [208]:
# make predictions
predictions = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)
results.head(10)

Unnamed: 0,Prediction,Actual
0,0,0
1,1,1
2,0,0
3,1,1
4,1,1
5,0,0
6,0,0
7,1,1
8,1,0
9,1,1


In [209]:
# Display the accuracy score for the test dataset.
accuracy_score(y_test, predictions)

0.8126801152737753

In [210]:
# confusion matrix
target_names = ["0", "1"]
print(classification_report(y_test, predictions, target_names=target_names))

              precision    recall  f1-score   support

           0       0.80      0.87      0.83       185
           1       0.83      0.75      0.79       162

    accuracy                           0.81       347
   macro avg       0.82      0.81      0.81       347
weighted avg       0.81      0.81      0.81       347



In [211]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

# Forward selection
sfs_forward = SFS(estimator=classifier,  # Pass an instance of the estimator
                  k_features='best',
                  forward=True,
                  floating=False,
                  scoring='accuracy',
                  cv=5)

sfs_forward = sfs_forward.fit(X_train, y_train)
print('Selected features (forward):', sfs_forward.k_feature_names_)

Selected features (forward): ('Age', 'AnnualIncome', 'NumberOfPurchases', 'TimeSpentOnWebsite', 'LoyaltyProgram', 'DiscountsAvailed')


In [212]:
# Backward elimination
sfs_backward = SFS(estimator=classifier,  # Pass an instance of the estimator
                   k_features='best',
                   forward=False,
                   floating=False,
                   scoring='accuracy',
                   cv=5)

sfs_backward = sfs_backward.fit(X_train, y_train)
print('Selected features (backward):', sfs_backward.k_feature_names_)

Selected features (backward): ('Age', 'AnnualIncome', 'NumberOfPurchases', 'TimeSpentOnWebsite', 'LoyaltyProgram', 'DiscountsAvailed')


In [213]:
customer_purchases.columns

Index(['Age', 'Gender', 'AnnualIncome', 'NumberOfPurchases', 'ProductCategory',
       'TimeSpentOnWebsite', 'LoyaltyProgram', 'DiscountsAvailed',
       'PurchaseStatus'],
      dtype='object')

In [214]:
y = customer_purchases["PurchaseStatus"]
X = customer_purchases.drop(columns=["Gender", "ProductCategory"])

In [223]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
X_test.shape

(347, 7)

In [217]:
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)

In [218]:
classifier.fit(X_train, y_train)

In [219]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 1.0
Testing Data Score: 1.0


In [220]:
predictions = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)
results.head(10)

Unnamed: 0,Prediction,Actual
0,0,0
1,1,1
2,0,0
3,1,1
4,1,1
5,0,0
6,0,0
7,1,1
8,0,0
9,1,1


In [221]:
# Display the accuracy score for the test dataset.
accuracy_score(y_test, predictions)

1.0

In [222]:
# confusion matrix
target_names = ["0", "1"]
print(classification_report(y_test, predictions, target_names=target_names))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       185
           1       1.00      1.00      1.00       162

    accuracy                           1.00       347
   macro avg       1.00      1.00      1.00       347
weighted avg       1.00      1.00      1.00       347

