In [1]:
import time
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import BernoulliNB

from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier

from sklearn.neural_network import MLPClassifier

from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)


## Loading online_shoppers_intension.csv dataset

In [2]:
df = pd.read_csv("online_shoppers_intention.csv")

In [3]:
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [4]:
df.shape  # this shape is the predefine attribute comming from df check it by dir(df)

(12330, 18)

In [5]:
dir(df)

['Administrative',
 'Administrative_Duration',
 'BounceRates',
 'Browser',
 'ExitRates',
 'Informational',
 'Informational_Duration',
 'Month',
 'OperatingSystems',
 'PageValues',
 'ProductRelated',
 'ProductRelated_Duration',
 'Region',
 'Revenue',
 'SpecialDay',
 'T',
 'TrafficType',
 'VisitorType',
 'Weekend',
 '_AXIS_LEN',
 '_AXIS_ORDERS',
 '_AXIS_TO_AXIS_NUMBER',
 '_HANDLED_TYPES',
 '__abs__',
 '__add__',
 '__and__',
 '__annotations__',
 '__array__',
 '__array_priority__',
 '__array_ufunc__',
 '__bool__',
 '__class__',
 '__contains__',
 '__copy__',
 '__dataframe__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__divmod__',
 '__doc__',
 '__eq__',
 '__finalize__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__iand__',
 '__ifloordiv__',
 '__imod__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__invert__',
 '__ior__',
 '__ipow__',
 '__isub__'

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           12330 non-null  int64  
 1   Administrative_Duration  12330 non-null  float64
 2   Informational            12330 non-null  int64  
 3   Informational_Duration   12330 non-null  float64
 4   ProductRelated           12330 non-null  int64  
 5   ProductRelated_Duration  12330 non-null  float64
 6   BounceRates              12330 non-null  float64
 7   ExitRates                12330 non-null  float64
 8   PageValues               12330 non-null  float64
 9   SpecialDay               12330 non-null  float64
 10  Month                    12330 non-null  object 
 11  OperatingSystems         12330 non-null  int64  
 12  Browser                  12330 non-null  int64  
 13  Region                   12330 non-null  int64  
 14  TrafficType           

In [7]:
df.columns

Index(['Administrative', 'Administrative_Duration', 'Informational',
       'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration',
       'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay', 'Month',
       'OperatingSystems', 'Browser', 'Region', 'TrafficType', 'VisitorType',
       'Weekend', 'Revenue'],
      dtype='object')

In [8]:
len(df.columns)

18

## Feature Engineering

In [9]:
print(df['Weekend'].unique())

[False  True]


- coverting Weekend column to binary values

In [10]:
df['Weekend'] = df['Weekend'].replace((True,False),(1,0) )

In [11]:
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,0,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,0,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,0,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,0,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,1,False


In [12]:
print(df['Revenue'].unique())

[False  True]


In [13]:
df['Revenue'] = df['Revenue'].replace((True,False),(1,0))

In [14]:
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,0,0
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,0,0
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,0,0
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,0,0
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,1,0


In [15]:
df['VisitorType'].unique()

array(['Returning_Visitor', 'New_Visitor', 'Other'], dtype=object)

In [16]:
condition = df['VisitorType']=='Returning_Visitor'
df['Returning_Visitor'] = np.where(condition,1,0)


In [17]:
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue,Returning_Visitor
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,0,0,1
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,0,0,1
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,0,0,1
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,0,0,1
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,1,0,1


In [18]:
df = df.drop(columns = ['VisitorType'])
df

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,Weekend,Revenue,Returning_Visitor
0,0,0.0,0,0.0,1,0.000000,0.200000,0.200000,0.000000,0.0,Feb,1,1,1,1,0,0,1
1,0,0.0,0,0.0,2,64.000000,0.000000,0.100000,0.000000,0.0,Feb,2,2,1,2,0,0,1
2,0,0.0,0,0.0,1,0.000000,0.200000,0.200000,0.000000,0.0,Feb,4,1,9,3,0,0,1
3,0,0.0,0,0.0,2,2.666667,0.050000,0.140000,0.000000,0.0,Feb,3,2,2,4,0,0,1
4,0,0.0,0,0.0,10,627.500000,0.020000,0.050000,0.000000,0.0,Feb,3,3,1,4,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12325,3,145.0,0,0.0,53,1783.791667,0.007143,0.029031,12.241717,0.0,Dec,4,6,1,1,1,0,1
12326,0,0.0,0,0.0,5,465.750000,0.000000,0.021333,0.000000,0.0,Nov,3,2,1,8,1,0,1
12327,0,0.0,0,0.0,6,184.250000,0.083333,0.086667,0.000000,0.0,Nov,3,2,1,13,1,0,1
12328,4,75.0,0,0.0,15,346.000000,0.000000,0.021053,0.000000,0.0,Nov,2,2,3,11,0,0,1


In [19]:
df.dtypes

Administrative               int64
Administrative_Duration    float64
Informational                int64
Informational_Duration     float64
ProductRelated               int64
ProductRelated_Duration    float64
BounceRates                float64
ExitRates                  float64
PageValues                 float64
SpecialDay                 float64
Month                       object
OperatingSystems             int64
Browser                      int64
Region                       int64
TrafficType                  int64
Weekend                      int64
Revenue                      int64
Returning_Visitor            int32
dtype: object

In [20]:
df['Month'].unique()

array(['Feb', 'Mar', 'May', 'Oct', 'June', 'Jul', 'Aug', 'Nov', 'Sep',
       'Dec'], dtype=object)

In [21]:
ordinal_encoder = OrdinalEncoder()
df['Month'] = ordinal_encoder.fit_transform(df[['Month']])

In [22]:
print(df['Month'].unique())

[2. 5. 6. 8. 4. 3. 0. 7. 9. 1.]


In [23]:
df.Revenue.value_counts()   # 0 -->they didnt buy product 1---> they buy the product

Revenue
0    10422
1     1908
Name: count, dtype: int64

In [24]:
result = df[df.columns[1:]]
result

Unnamed: 0,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,Weekend,Revenue,Returning_Visitor
0,0.0,0,0.0,1,0.000000,0.200000,0.200000,0.000000,0.0,2.0,1,1,1,1,0,0,1
1,0.0,0,0.0,2,64.000000,0.000000,0.100000,0.000000,0.0,2.0,2,2,1,2,0,0,1
2,0.0,0,0.0,1,0.000000,0.200000,0.200000,0.000000,0.0,2.0,4,1,9,3,0,0,1
3,0.0,0,0.0,2,2.666667,0.050000,0.140000,0.000000,0.0,2.0,3,2,2,4,0,0,1
4,0.0,0,0.0,10,627.500000,0.020000,0.050000,0.000000,0.0,2.0,3,3,1,4,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12325,145.0,0,0.0,53,1783.791667,0.007143,0.029031,12.241717,0.0,1.0,4,6,1,1,1,0,1
12326,0.0,0,0.0,5,465.750000,0.000000,0.021333,0.000000,0.0,7.0,3,2,1,8,1,0,1
12327,0.0,0,0.0,6,184.250000,0.083333,0.086667,0.000000,0.0,7.0,3,2,1,13,1,0,1
12328,75.0,0,0.0,15,346.000000,0.000000,0.021053,0.000000,0.0,7.0,2,2,3,11,0,0,1


### Checking correlation on Revenue column

In [25]:
result = df[df.columns[1:]].corr()['Revenue']
result1 = result.sort_values(ascending = False)
result1

Revenue                    1.000000
PageValues                 0.492569
ProductRelated             0.158538
ProductRelated_Duration    0.152373
Informational              0.095200
Administrative_Duration    0.093587
Month                      0.080150
Informational_Duration     0.070345
Weekend                    0.029295
Browser                    0.023984
TrafficType               -0.005113
Region                    -0.011595
OperatingSystems          -0.014668
SpecialDay                -0.082305
Returning_Visitor         -0.103843
BounceRates               -0.150673
ExitRates                 -0.207071
Name: Revenue, dtype: float64

In [26]:
df.shape

(12330, 18)

### Preparing Features as X and Target as Y

In [27]:
X = df.drop(['Revenue'],axis=1)
y = df['Revenue']

### Preparing Train and Test Dataset

In [28]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.3,random_state = 0)

### Model Pipeline

In [29]:
def model_pipeline(X, model):
    n_c = X.select_dtypes(exclude=['object']).columns.values.tolist()
    c_c = X.select_dtypes(include=['object']).columns.values.tolist()
    numeric_columns = n_c
    categorical_columns = c_c
    numeric_pipeline = SimpleImputer(strategy = 'constant')
    categorical_pipeline = OneHotEncoder(handle_unknown = 
    'ignore')
    a = ('numeric', numeric_pipeline, numeric_columns)
    b = ('categorical', categorical_pipeline, categorical_columns)
    preprocessor = ColumnTransformer(
    transformers = [a, b], 
    remainder = 'passthrough'
    )
    c = ('preprocessor', preprocessor)
    d = ('smote', SMOTE(random_state = 1))
    e = ('scaler', MinMaxScaler())
    f = ('feature_selection', SelectKBest(score_func = chi2, k = 6))
    g = ('model', model)
    bundled_pipeline = imbpipeline(steps = [c, d, e, f, g])
    return bundled_pipeline

### Model Selection

In [30]:
def select_model(X, y, pipeline=None):
    classifiers = {}
    c_d1 = {"DummyClassifier": 
    DummyClassifier(strategy='most_frequent')}
    classifiers.update(c_d1)
    
    xgb = XGBClassifier(
    verbosity=0, 
    use_label_encoder=None,
    eval_metric='logloss',
    #objective='binary: logistic',
    )
    c_d2 = {"XGBClassifier": xgb}
    classifiers.update(c_d2)
    c_d3 = {"LGBMClassifier": LGBMClassifier()}
    classifiers.update(c_d3)
    c_d4 = {"RandomForestClassifier": 
    RandomForestClassifier()}
    classifiers.update(c_d4)
    c_d5 = {"DecisionTreeClassifier": DecisionTreeClassifier()}
    classifiers.update(c_d5)
    c_d6 = {"ExtraTreeClassifier": ExtraTreeClassifier()}
    classifiers.update(c_d6)
    c_d7 = {"ExtraTreesClassifier": ExtraTreeClassifier()}
    classifiers.update(c_d7) 
    c_d8 = {"AdaBoostClassifier": AdaBoostClassifier()}
    classifiers.update(c_d8)
    c_d9 = {"KNeighborsClassifier": KNeighborsClassifier()}
    classifiers.update(c_d9)
    c_d10 = {"RidgeClassifier": RidgeClassifier()}
    classifiers.update(c_d10)
    c_d11 = {"SGDClassifier": SGDClassifier()}
    classifiers.update(c_d11)
    c_d12 = {"BaggingClassifier": BaggingClassifier()}
    classifiers.update(c_d12)
    c_d13 = {"BernoulliNB": BernoulliNB()}
    classifiers.update(c_d13)
    c_d14 = {"SVC": SVC()}
    classifiers.update(c_d14)
    c_d15 = {"MLPClassifier": MLPClassifier()}
    classifiers.update(c_d15)
    mlpc = {
    "MLPClassifier (paper)": 
    MLPClassifier(hidden_layer_sizes=(27, 50),
    max_iter=300,
    activation='relu',
    solver='lbfgs',
    random_state=1)
    }
    c_d16 = mlpc
    classifiers.update(c_d16) 
    cols = ['model', 'run_time', 'roc_auc']
    df_models = pd.DataFrame(columns = cols)
    for key in classifiers:
        start_time = time.time()
        print()
        print("Step 12: model_pipeline run successfully on", key)
        pipeline = model_pipeline(X_train, classifiers[key])

        cv = cross_val_score(pipeline, X, y, cv=10, scoring='roc_auc')
        row = {'model': key,
        'run_time': format(round((time.time() -start_time)/60,2)),
        'roc_auc': cv.mean(),
         }
        df_models = pd.concat([df_models, pd.DataFrame([row])], ignore_index=True)

        df_models = df_models.sort_values(by='roc_auc', ascending = False)
    return df_models


### Access Model select_model function

In [31]:
models = select_model(X_train, y_train)


Step 12: model_pipeline run successfully on DummyClassifier

Step 12: model_pipeline run successfully on XGBClassifier

Step 12: model_pipeline run successfully on LGBMClassifier

Step 12: model_pipeline run successfully on RandomForestClassifier

Step 12: model_pipeline run successfully on DecisionTreeClassifier

Step 12: model_pipeline run successfully on ExtraTreeClassifier

Step 12: model_pipeline run successfully on ExtraTreesClassifier

Step 12: model_pipeline run successfully on AdaBoostClassifier

Step 12: model_pipeline run successfully on KNeighborsClassifier

Step 12: model_pipeline run successfully on RidgeClassifier

Step 12: model_pipeline run successfully on SGDClassifier

Step 12: model_pipeline run successfully on BaggingClassifier

Step 12: model_pipeline run successfully on BernoulliNB

Step 12: model_pipeline run successfully on SVC

Step 12: model_pipeline run successfully on MLPClassifier

Step 12: model_pipeline run successfully on MLPClassifier (paper)


In [32]:
models

Unnamed: 0,model,run_time,roc_auc
0,MLPClassifier,2.39,0.902524
15,MLPClassifier (paper),2.43,0.899221
1,LGBMClassifier,0.08,0.897217
2,XGBClassifier,0.16,0.891174
3,AdaBoostClassifier,0.16,0.888264
4,SGDClassifier,0.02,0.887366
5,RandomForestClassifier,0.55,0.88662
6,SVC,1.05,0.885963
7,BaggingClassifier,0.13,0.862125
8,BernoulliNB,0.01,0.857851


### Access best Model and Training

In [33]:
# selected_model = We can write code here to get best modle name from DataFrame

In [35]:
selected_model = MLPClassifier()
bundled_pipeline = model_pipeline(X_train, selected_model)
bundled_pipeline.fit(X_train,y_train)

### Model pipeline

In [36]:
y_pred = bundled_pipeline.predict(X_test)

In [39]:
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

## Roc and Auc Score

In [40]:
roc_auc = roc_auc_score(y_test,y_pred)
accuracy = accuracy_score(y_test,y_pred)
f1_score = f1_score(y_test,y_pred)

In [41]:
roc_auc

0.8369149493127623

In [42]:
accuracy

0.8769937821032712

In [43]:
f1_score

0.6798029556650246

### Classification Report

In [44]:
classif_report = classification_report(y_test,y_pred)

In [45]:
print(classif_report)

              precision    recall  f1-score   support

           0       0.95      0.90      0.92      3077
           1       0.60      0.78      0.68       622

    accuracy                           0.88      3699
   macro avg       0.78      0.84      0.80      3699
weighted avg       0.89      0.88      0.88      3699

