In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
df = pd.read_csv("online_shoppers_intention (1).csv")

In [47]:
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates in %,ExitRates in %,PageValues,SpecialDay (probability),Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [3]:
cat_col = [col for col in df.columns if df[col].dtypes == 'object']
cat_col

['Month', 'VisitorType']

In [4]:
for col in cat_col:
    le = LabelEncoder()
    df.loc[:, col] = le.fit_transform(df[col])

In [5]:
df['Revenue'] = np.where(df['Revenue'] == True, 1, 0)
df['Weekend'] = np.where(df['Weekend'] == True, 1, 0)

In [6]:
num_cols = [col for col in df.columns if df[col].dtypes != 'object'and col not in ['Weekend', 'Revenue']]
num_cols

['Administrative',
 'Administrative_Duration',
 'Informational',
 'Informational_Duration',
 'ProductRelated',
 'ProductRelated_Duration',
 'BounceRates in %',
 'ExitRates in %',
 'PageValues',
 'SpecialDay (probability)',
 'Month',
 'OperatingSystems',
 'Browser',
 'Region',
 'TrafficType',
 'VisitorType']

In [7]:
for col in num_cols:
    scaler = StandardScaler()
    df.loc[:, col] = scaler.fit_transform(df[col].values.reshape(-1, 1))

In [8]:
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates in %,ExitRates in %,PageValues,SpecialDay (probability),Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,-0.696993,-0.457191,-0.396478,-0.244931,-0.691003,-0.624348,3.667189,3.229316,-0.317178,-0.308821,-1.334959,-1.233426,-0.790293,-0.894178,-0.762629,0.407786,0,0
1,-0.696993,-0.457191,-0.396478,-0.244931,-0.668518,-0.590903,-0.457683,1.171473,-0.317178,-0.308821,-1.334959,-0.136078,-0.207952,-0.894178,-0.514182,0.407786,0,0
2,-0.696993,-0.457191,-0.396478,-0.244931,-0.691003,-0.624348,3.667189,3.229316,-0.317178,-0.308821,-1.334959,2.058618,-0.790293,2.437081,-0.265735,0.407786,0,0
3,-0.696993,-0.457191,-0.396478,-0.244931,-0.668518,-0.622954,0.573535,1.99461,-0.317178,-0.308821,-1.334959,0.96127,-0.207952,-0.477771,-0.017289,0.407786,0,0
4,-0.696993,-0.457191,-0.396478,-0.244931,-0.488636,-0.29643,-0.045196,0.142551,-0.317178,-0.308821,-1.334959,0.96127,0.374389,-0.894178,-0.017289,0.407786,1,0


In [9]:
X = df.drop('Revenue', axis=1)
y = df['Revenue']
X_constant = sm.add_constant(X)

logit_mod = sm.Logit(y, X_constant).fit()

Optimization terminated successfully.
         Current function value: 0.297033
         Iterations 9


In [55]:
logit_mod.summary()

0,1,2,3
Dep. Variable:,Revenue,No. Observations:,12330.0
Model:,Logit,Df Residuals:,12312.0
Method:,MLE,Df Model:,17.0
Date:,"Sat, 28 Jan 2023",Pseudo R-squ.:,0.3106
Time:,02:59:40,Log-Likelihood:,-3662.4
converged:,True,LL-Null:,-5312.4
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-2.2649,0.050,-44.958,0.000,-2.364,-2.166
Administrative,0.0074,0.036,0.203,0.839,-0.064,0.079
Administrative_Duration,-0.0263,0.034,-0.765,0.444,-0.094,0.041
Informational,0.0369,0.034,1.086,0.277,-0.030,0.103
Informational_Duration,0.0091,0.031,0.291,0.771,-0.052,0.070
ProductRelated,0.1548,0.051,3.053,0.002,0.055,0.254
ProductRelated_Duration,0.1109,0.052,2.121,0.034,0.008,0.213
BounceRates in %,-0.1441,0.152,-0.950,0.342,-0.442,0.153
ExitRates in %,-0.7330,0.115,-6.389,0.000,-0.958,-0.508


In [58]:
vif = pd.DataFrame()
vif["variables"] = X.columns
vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif

Unnamed: 0,variables,VIF
0,Administrative,1.944887
1,Administrative_Duration,1.691379
2,Informational,1.832295
3,Informational_Duration,1.678727
4,ProductRelated,4.430245
5,ProductRelated_Duration,4.330386
6,BounceRates in %,6.40718
7,ExitRates in %,7.139377
8,PageValues,1.053916
9,SpecialDay (probability),1.034535


In [10]:
X = df.drop('Revenue', axis=1)
y = df['Revenue']

In [11]:
from sklearn.model_selection import train_test_split

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

In [12]:
from sklearn.linear_model import LogisticRegression

my_model = LogisticRegression(class_weight='balanced').fit(train_X, train_y)
data_for_prediction = my_model.predict(val_X)

In [13]:
import eli5
from eli5.sklearn import PermutationImportance

perm = PermutationImportance(my_model, random_state=1).fit(val_X, val_y)
eli5.show_weights(perm, feature_names = val_X.columns.tolist())

Weight,Feature
0.1219  ± 0.0101,PageValues
0.0006  ± 0.0003,Informational_Duration
0.0005  ± 0.0007,Administrative_Duration
-0.0000  ± 0.0011,Region
-0.0001  ± 0.0033,VisitorType
-0.0002  ± 0.0024,Browser
-0.0003  ± 0.0010,TrafficType
-0.0005  ± 0.0025,SpecialDay (probability)
-0.0009  ± 0.0013,Weekend
-0.0010  ± 0.0026,OperatingSystems


In [17]:
from sklearn.ensemble import RandomForestClassifier

my_model = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=0).fit(train_X, train_y)
data_for_prediction = my_model.predict_proba(val_X)

In [76]:
import eli5
from eli5.sklearn import PermutationImportance

perm = PermutationImportance(my_model, random_state=1).fit(val_X, val_y)
eli5.show_weights(perm, feature_names = val_X.columns.tolist())

Weight,Feature
0.1155  ± 0.0085,PageValues
0.0111  ± 0.0045,ExitRates in %
0.0090  ± 0.0024,BounceRates in %
0.0081  ± 0.0039,Month
0.0039  ± 0.0019,ProductRelated
0.0034  ± 0.0018,Administrative
0.0013  ± 0.0018,Region
0.0006  ± 0.0020,Browser
0.0004  ± 0.0025,VisitorType
0.0003  ± 0.0011,TrafficType


In [None]:
import shap

explainer = shap.TreeExplainer(my_model)
shap_values = explainer.shap_values(data_for_prediction)

In [None]:
shap.initjs()
shap.force_plot(explainer.expected_value[1], shap_values[1], data_for_prediction)

In [14]:
possion_model = sm.GLM(y, X, family=sm.families.Poisson()).fit()
possion_model.summary()

0,1,2,3
Dep. Variable:,Revenue,No. Observations:,12330.0
Model:,GLM,Df Residuals:,12303.0
Model Family:,Poisson,Df Model:,26.0
Link Function:,Log,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-4623.7
Date:,"Tue, 24 Jan 2023",Deviance:,5431.4
Time:,19:34:23,Pearson chi2:,8720.0
No. Iterations:,9,Pseudo R-squ. (CS):,0.128
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Administrative,0.0191,0.008,2.415,0.016,0.004,0.035
Administrative_Duration,-0.0002,0.000,-1.244,0.214,-0.000,0.000
Informational,0.0336,0.019,1.766,0.077,-0.004,0.071
Informational_Duration,2.041e-05,0.000,0.120,0.905,-0.000,0.000
ProductRelated,-0.0008,0.001,-0.988,0.323,-0.002,0.001
ProductRelated_Duration,5.61e-05,1.58e-05,3.548,0.000,2.51e-05,8.71e-05
BounceRates in %,-1.8781,2.858,-0.657,0.511,-7.479,3.723
ExitRates in %,-20.6953,1.966,-10.526,0.000,-24.549,-16.842
PageValues,0.0120,0.000,30.566,0.000,0.011,0.013


In [16]:
binomial_model = sm.GLM(y, X, family=sm.families.Binomial()).fit()
binomial_model.summary()

0,1,2,3
Dep. Variable:,Revenue,No. Observations:,12330.0
Model:,GLM,Df Residuals:,12303.0
Model Family:,Binomial,Df Model:,26.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-3576.5
Date:,"Tue, 24 Jan 2023",Deviance:,7153.1
Time:,19:37:33,Pearson chi2:,131000000.0
No. Iterations:,8,Pseudo R-squ. (CS):,0.2454
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Administrative,0.0051,0.011,0.464,0.642,-0.016,0.027
Administrative_Duration,-0.0001,0.000,-0.630,0.528,-0.001,0.000
Informational,0.0334,0.027,1.236,0.216,-0.020,0.086
Informational_Duration,7.117e-05,0.000,0.321,0.748,-0.000,0.001
ProductRelated,0.0017,0.001,1.490,0.136,-0.001,0.004
ProductRelated_Duration,6.075e-05,2.71e-05,2.245,0.025,7.72e-06,0.000
BounceRates in %,-3.7879,3.254,-1.164,0.244,-10.165,2.589
ExitRates in %,-15.5895,2.399,-6.497,0.000,-20.292,-10.887
PageValues,0.0822,0.002,34.021,0.000,0.077,0.087


In [17]:
binomial_model = sm.GLM(y, X, family=sm.families.NegativeBinomial()).fit()
binomial_model.summary()

0,1,2,3
Dep. Variable:,Revenue,No. Observations:,12330.0
Model:,GLM,Df Residuals:,12303.0
Model Family:,NegativeBinomial,Df Model:,26.0
Link Function:,Log,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-4755.4
Date:,"Tue, 24 Jan 2023",Deviance:,4220.6
Time:,19:39:08,Pearson chi2:,7140.0
No. Iterations:,18,Pseudo R-squ. (CS):,0.1293
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Administrative,0.0222,0.009,2.429,0.015,0.004,0.040
Administrative_Duration,-0.0001,0.000,-0.743,0.458,-0.000,0.000
Informational,0.0354,0.022,1.593,0.111,-0.008,0.079
Informational_Duration,7.277e-05,0.000,0.377,0.706,-0.000,0.000
ProductRelated,-0.0003,0.001,-0.380,0.704,-0.002,0.001
ProductRelated_Duration,6.227e-05,1.98e-05,3.145,0.002,2.35e-05,0.000
BounceRates in %,-2.6363,2.938,-0.897,0.370,-8.395,3.123
ExitRates in %,-16.5426,2.141,-7.725,0.000,-20.740,-12.346
PageValues,0.0262,0.001,37.635,0.000,0.025,0.028
