In [1]:
import pandas as pd #for data preprocessing

import seaborn as sns
import matplotlib.pyplot as plt

import numpy as np

from datetime import datetime

from sklearn.model_selection import KFold,StratifiedKFold,GridSearchCV,RandomizedSearchCV,train_test_split #For splitting

from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

import warnings

from sklearn.preprocessing import LabelEncoder,OneHotEncoder 

In [2]:
# Load the training and test sets from CSV files located relative to the working directory
train_data, test_data = (pd.read_csv(csv_name) for csv_name in ('Train.csv', 'Test.csv'))

In [3]:
# Count the missing values in every column of the training set
train_data.isna().sum()

ID                          0
Year_of_Birth               0
Education_Level             0
Marital_Status              0
Disposable_Income          16
No_of_Kids_in_home          0
No_of_Teen_in_home          0
Date_Customer               0
Recency                     0
Discounted_Purchases        0
WebPurchases                0
CatalogPurchases            0
StorePurchases              0
Amount_on_Wines             0
Amount_on_Fruits            0
Amount_on_MeatProducts      0
Amount_on_FishProducts      0
Amount_on_SweetProducts     0
Amount_on_GoldProds         0
WebVisitsMonth              0
Cmp3Accepted                0
Cmp4Accepted                0
Cmp5Accepted                0
Cmp1Accepted                0
Cmp2Accepted                0
Any_Complain                0
Response                    0
dtype: int64

In [4]:
# Count the missing values in every column of the test set
test_data.isna().sum()

ID                         0
Year_of_Birth              0
Education_Level            0
Marital_Status             0
Disposable_Income          8
No_of_Kids_in_home         0
No_of_Teen_in_home         0
Date_Customer              0
Recency                    0
Discounted_Purchases       0
WebPurchases               0
CatalogPurchases           0
StorePurchases             0
Amount_on_Wines            0
Amount_on_Fruits           0
Amount_on_MeatProducts     0
Amount_on_FishProducts     0
Amount_on_SweetProducts    0
Amount_on_GoldProds        0
WebVisitsMonth             0
Cmp3Accepted               0
Cmp4Accepted               0
Cmp5Accepted               0
Cmp1Accepted               0
Cmp2Accepted               0
Any_Complain               0
dtype: int64

In [5]:
# Show the dtype and non-null count of each column in the freshly loaded training set
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1568 entries, 0 to 1567
Data columns (total 27 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ID                       1568 non-null   object 
 1   Year_of_Birth            1568 non-null   int64  
 2   Education_Level          1568 non-null   object 
 3   Marital_Status           1568 non-null   object 
 4   Disposable_Income        1552 non-null   float64
 5   No_of_Kids_in_home       1568 non-null   int64  
 6   No_of_Teen_in_home       1568 non-null   int64  
 7   Date_Customer            1568 non-null   object 
 8   Recency                  1568 non-null   int64  
 9   Discounted_Purchases     1568 non-null   int64  
 10  WebPurchases             1568 non-null   int64  
 11  CatalogPurchases         1568 non-null   int64  
 12  StorePurchases           1568 non-null   int64  
 13  Amount_on_Wines          1568 non-null   int64  
 14  Amount_on_Fruits        

In [6]:
# Show the dtype and non-null count of each column in the freshly loaded test set
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 672 entries, 0 to 671
Data columns (total 26 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ID                       672 non-null    object 
 1   Year_of_Birth            672 non-null    int64  
 2   Education_Level          672 non-null    object 
 3   Marital_Status           672 non-null    object 
 4   Disposable_Income        664 non-null    float64
 5   No_of_Kids_in_home       672 non-null    int64  
 6   No_of_Teen_in_home       672 non-null    int64  
 7   Date_Customer            672 non-null    object 
 8   Recency                  672 non-null    int64  
 9   Discounted_Purchases     672 non-null    int64  
 10  WebPurchases             672 non-null    int64  
 11  CatalogPurchases         672 non-null    int64  
 12  StorePurchases           672 non-null    int64  
 13  Amount_on_Wines          672 non-null    int64  
 14  Amount_on_Fruits         6

In [7]:
# Drop the ID column from the train and test datasets.
# The frames are reassigned (no inplace=True) and errors='ignore' is passed so that
# re-running this cell is a harmless no-op instead of raising KeyError once the
# column is already gone.
train_data = train_data.drop(columns='ID', errors='ignore')
test_data = test_data.drop(columns='ID', errors='ignore')

In [8]:
# Parse the Date_Customer column of both frames into pandas datetime values.
# NOTE(review): no explicit format/dayfirst is passed, so pandas infers the layout;
# confirm the raw strings are not ambiguous between day-first and month-first.
for frame in (train_data, test_data):
    frame['Date_Customer'] = pd.to_datetime(frame['Date_Customer'])

In [9]:
# Re-inspect the training set after dropping ID and converting Date_Customer to datetime
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1568 entries, 0 to 1567
Data columns (total 26 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   Year_of_Birth            1568 non-null   int64         
 1   Education_Level          1568 non-null   object        
 2   Marital_Status           1568 non-null   object        
 3   Disposable_Income        1552 non-null   float64       
 4   No_of_Kids_in_home       1568 non-null   int64         
 5   No_of_Teen_in_home       1568 non-null   int64         
 6   Date_Customer            1568 non-null   datetime64[ns]
 7   Recency                  1568 non-null   int64         
 8   Discounted_Purchases     1568 non-null   int64         
 9   WebPurchases             1568 non-null   int64         
 10  CatalogPurchases         1568 non-null   int64         
 11  StorePurchases           1568 non-null   int64         
 12  Amount_on_Wines          1568 non-

In [10]:
# Re-inspect the test set after dropping ID and converting Date_Customer to datetime
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 672 entries, 0 to 671
Data columns (total 25 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   Year_of_Birth            672 non-null    int64         
 1   Education_Level          672 non-null    object        
 2   Marital_Status           672 non-null    object        
 3   Disposable_Income        664 non-null    float64       
 4   No_of_Kids_in_home       672 non-null    int64         
 5   No_of_Teen_in_home       672 non-null    int64         
 6   Date_Customer            672 non-null    datetime64[ns]
 7   Recency                  672 non-null    int64         
 8   Discounted_Purchases     672 non-null    int64         
 9   WebPurchases             672 non-null    int64         
 10  CatalogPurchases         672 non-null    int64         
 11  StorePurchases           672 non-null    int64         
 12  Amount_on_Wines          672 non-nul

In [11]:
# Split the training columns into two groups based on dtype:
#   categorical_columns -> columns of object/category dtype (the date column is not one of them)
#   numerical_columns   -> every other column, in original order (this includes the date column)
categorical_columns = train_data.select_dtypes(include=['object', 'category']).columns
numerical_columns = train_data.columns[~train_data.columns.isin(categorical_columns)].tolist()

print(f'The Categorical Columns are: {categorical_columns}')
print(f'The Numerical Columns are: {numerical_columns}')

The Categorical Columns are: Index(['Education_Level', 'Marital_Status'], dtype='object')
The Numerical Columns are: ['Year_of_Birth', 'Disposable_Income', 'No_of_Kids_in_home', 'No_of_Teen_in_home', 'Date_Customer', 'Recency', 'Discounted_Purchases', 'WebPurchases', 'CatalogPurchases', 'StorePurchases', 'Amount_on_Wines', 'Amount_on_Fruits', 'Amount_on_MeatProducts', 'Amount_on_FishProducts', 'Amount_on_SweetProducts', 'Amount_on_GoldProds', 'WebVisitsMonth', 'Cmp3Accepted', 'Cmp4Accepted', 'Cmp5Accepted', 'Cmp1Accepted', 'Cmp2Accepted', 'Any_Complain', 'Response']


In [12]:
# Same dtype-based split, this time computed from the test set:
#   categorical_columns -> columns of object/category dtype (the date column is not one of them)
#   numerical_columns   -> every other column, in original order (this includes the date column)
# NOTE: this rebinds categorical_columns / numerical_columns, replacing the values
# that were computed from train_data.
categorical_columns = test_data.select_dtypes(include=['object', 'category']).columns
numerical_columns = test_data.columns[~test_data.columns.isin(categorical_columns)].tolist()

print(f'The Categorical Columns are: {categorical_columns}')
print(f'The Numerical Columns are: {numerical_columns}')

The Categorical Columns are: Index(['Education_Level', 'Marital_Status'], dtype='object')
The Numerical Columns are: ['Year_of_Birth', 'Disposable_Income', 'No_of_Kids_in_home', 'No_of_Teen_in_home', 'Date_Customer', 'Recency', 'Discounted_Purchases', 'WebPurchases', 'CatalogPurchases', 'StorePurchases', 'Amount_on_Wines', 'Amount_on_Fruits', 'Amount_on_MeatProducts', 'Amount_on_FishProducts', 'Amount_on_SweetProducts', 'Amount_on_GoldProds', 'WebVisitsMonth', 'Cmp3Accepted', 'Cmp4Accepted', 'Cmp5Accepted', 'Cmp1Accepted', 'Cmp2Accepted', 'Any_Complain']


In [13]:
# For each categorical column, report how many distinct values the train set holds and list them
for col in categorical_columns:
    train_column = train_data[col]
    print(f'Unique entries in the {col} column in the train set is: {train_column.nunique()}')
    print(f'and the unique entries are: {train_column.unique()}')

Unique entries in the Education_Level column in the train set is: 5
and the unique entries are: ['Graduation' '2n Cycle' 'Master' 'PhD' 'Basic']
Unique entries in the Marital_Status column in the train set is: 8
and the unique entries are: ['Divorced' 'Together' 'Single' 'Married' 'Widow' 'Alone' 'Absurd' 'YOLO']


In [14]:
# For each categorical column, report how many distinct values the test set holds and list them
for col in categorical_columns:
    test_column = test_data[col]
    print(f'Unique entries in the {col} column in the test set is: {test_column.nunique()}')
    print(f'and the unique entries are: {test_column.unique()}')

Unique entries in the Education_Level column in the test set is: 5
and the unique entries are: ['Graduation' 'Basic' 'Master' '2n Cycle' 'PhD']
Unique entries in the Marital_Status column in the test set is: 6
and the unique entries are: ['Single' 'Widow' 'Married' 'Together' 'Divorced' 'Absurd']


In [15]:
# Summarise the target column (Response): number of distinct labels and the labels themselves
response_values = train_data.Response
print(f'Unique entries in the Target column in  is: {response_values.nunique()}')
print(f'and the unique entries are: {response_values.unique()}')

Unique entries in the Target column in  is: 2
and the unique entries are: [0 1]


In [16]:
# Fill the missing Disposable_Income values in the training set with the column mean.
# The result is assigned back to the column rather than calling fillna(..., inplace=True)
# on the selected Series: that chained in-place form triggers a FutureWarning in
# pandas 2.x and leaves train_data untouched once Copy-on-Write is enabled.
# NOTE(review): test_data also has missing Disposable_Income values (see its null
# counts above) and is not imputed by this cell — confirm that is handled downstream.
train_data['Disposable_Income'] = train_data['Disposable_Income'].fillna(
    train_data['Disposable_Income'].mean()
)

#check again to make sure
train_data.isnull().sum()

Year_of_Birth              0
Education_Level            0
Marital_Status             0
Disposable_Income          0
No_of_Kids_in_home         0
No_of_Teen_in_home         0
Date_Customer              0
Recency                    0
Discounted_Purchases       0
WebPurchases               0
CatalogPurchases           0
StorePurchases             0
Amount_on_Wines            0
Amount_on_Fruits           0
Amount_on_MeatProducts     0
Amount_on_FishProducts     0
Amount_on_SweetProducts    0
Amount_on_GoldProds        0
WebVisitsMonth             0
Cmp3Accepted               0
Cmp4Accepted               0
Cmp5Accepted               0
Cmp1Accepted               0
Cmp2Accepted               0
Any_Complain               0
Response                   0
dtype: int64

In [18]:
#import classification module
# The star import is kept because the following cells call pycaret functions
# (compare_models, create_model) by their bare names.
from pycaret.classification import *
#intialize the setup
# session_id pins pycaret's random seed so the train/validation split and the
# cross-validation folds are reproducible on Restart & Run All. 7505 is the seed
# pycaret generated at random for the recorded run, so it is reused here.
exp_clf101 = setup(data = train_data, target = 'Response', session_id = 7505)

Unnamed: 0,Description,Value
0,session_id,7505
1,Target,Response
2,Target Type,Binary
3,Label Encoded,"0: 0, 1: 1"
4,Original Data,"(1568, 26)"
5,Missing Values,False
6,Numeric Features,9
7,Categorical Features,15
8,Ordinal Features,False
9,High Cardinality Features,False


In [19]:
# Cross-validate every available classifier and keep the top one; the ranking
# metric is spelled out here (Accuracy is pycaret's default for classification)
best_model = compare_models(sort='Accuracy')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.8852,0.8502,0.4092,0.7008,0.5094,0.4502,0.4748,0.369
ridge,Ridge Classifier,0.8851,0.0,0.3724,0.6932,0.478,0.4227,0.4506,0.022
lightgbm,Light Gradient Boosting Machine,0.8815,0.8602,0.3607,0.6966,0.4669,0.4094,0.441,0.838
catboost,CatBoost Classifier,0.8806,0.8596,0.3294,0.6977,0.4417,0.3862,0.4225,4.336
xgboost,Extreme Gradient Boosting,0.8769,0.8518,0.3974,0.6435,0.4848,0.4202,0.4392,1.393
lda,Linear Discriminant Analysis,0.876,0.8667,0.5081,0.6055,0.55,0.4791,0.483,0.041
ada,Ada Boost Classifier,0.8751,0.8354,0.3967,0.628,0.4819,0.4156,0.4319,0.133
et,Extra Trees Classifier,0.8751,0.8361,0.2746,0.72,0.3878,0.3351,0.3871,0.244
rf,Random Forest Classifier,0.8641,0.8493,0.1831,0.6094,0.2743,0.2278,0.2785,0.328
lr,Logistic Regression,0.8605,0.7842,0.1835,0.6108,0.2751,0.2231,0.2753,1.251


In [21]:
# Train a Gradient Boosting Classifier ('gbc'); pycaret displays its per-fold CV metrics
gbc = create_model(estimator='gbc')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9,0.9235,0.4706,0.8,0.5926,0.5399,0.5647
1,0.9182,0.8754,0.6471,0.7857,0.7097,0.6626,0.6668
2,0.9,0.7881,0.4118,0.875,0.56,0.5117,0.5582
3,0.8909,0.8384,0.375,0.75,0.5,0.4463,0.4802
4,0.8091,0.8052,0.25,0.3077,0.2759,0.1673,0.1685
5,0.8818,0.8404,0.3125,0.7143,0.4348,0.3799,0.4206
6,0.8818,0.8168,0.3125,0.7143,0.4348,0.3799,0.4206
7,0.8716,0.8683,0.3125,0.625,0.4167,0.3534,0.3803
8,0.9174,0.9315,0.625,0.7692,0.6897,0.6426,0.6472
9,0.8807,0.8145,0.375,0.6667,0.48,0.4185,0.4407


In [23]:
# Train a Linear Discriminant Analysis model ('lda'); pycaret displays its per-fold CV metrics
lda = create_model(estimator='lda')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9182,0.9001,0.6471,0.7857,0.7097,0.6626,0.6668
1,0.9182,0.9367,0.7059,0.75,0.7273,0.6792,0.6796
2,0.8455,0.7849,0.3529,0.5,0.4138,0.3278,0.3344
3,0.9273,0.9275,0.625,0.8333,0.7143,0.6736,0.6827
4,0.7727,0.7959,0.3125,0.2632,0.2857,0.1518,0.1526
5,0.8909,0.885,0.5,0.6667,0.5714,0.5104,0.5173
6,0.8455,0.8208,0.375,0.4615,0.4138,0.3259,0.3282
7,0.844,0.8051,0.375,0.4615,0.4138,0.325,0.3273
8,0.9266,0.9388,0.75,0.75,0.75,0.707,0.707
9,0.8716,0.8723,0.4375,0.5833,0.5,0.428,0.4339
