In [1]:
import warnings
import numpy as np
import pandas as pd
import json
import requests
import seaborn as sns
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
data=pd.read_csv('online_shoppers_intention.csv')
data.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [3]:
data.isna().sum()

Administrative             0
Administrative_Duration    0
Informational              0
Informational_Duration     0
ProductRelated             0
ProductRelated_Duration    0
BounceRates                0
ExitRates                  0
PageValues                 0
SpecialDay                 0
Month                      0
OperatingSystems           0
Browser                    0
Region                     0
TrafficType                0
VisitorType                0
Weekend                    0
Revenue                    0
dtype: int64

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           12330 non-null  int64  
 1   Administrative_Duration  12330 non-null  float64
 2   Informational            12330 non-null  int64  
 3   Informational_Duration   12330 non-null  float64
 4   ProductRelated           12330 non-null  int64  
 5   ProductRelated_Duration  12330 non-null  float64
 6   BounceRates              12330 non-null  float64
 7   ExitRates                12330 non-null  float64
 8   PageValues               12330 non-null  float64
 9   SpecialDay               12330 non-null  float64
 10  Month                    12330 non-null  object 
 11  OperatingSystems         12330 non-null  int64  
 12  Browser                  12330 non-null  int64  
 13  Region                   12330 non-null  int64  
 14  TrafficType           

In [5]:
df=data.copy()
df1=data.iloc[:,:-1]

In [6]:
data_col=[feature for feature in df1.columns if df1[feature].dtype=='O']
data_num=[feature for feature in df1.columns if df1[feature].dtype!='O']
print(f'categorical columns: {data_col}')
print(f'numerical columns: {data_num}')
print('Target column: Revenue')

categorical columns: ['Month', 'VisitorType']
numerical columns: ['Administrative', 'Administrative_Duration', 'Informational', 'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration', 'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay', 'OperatingSystems', 'Browser', 'Region', 'TrafficType', 'Weekend']
Target column: Revenue


In [7]:
print(df['Month'].value_counts())
print('='*24)
print(df['VisitorType'].value_counts())

May     3364
Nov     2998
Mar     1907
Dec     1727
Oct      549
Sep      448
Aug      433
Jul      432
June     288
Feb      184
Name: Month, dtype: int64
Returning_Visitor    10551
New_Visitor           1694
Other                   85
Name: VisitorType, dtype: int64


In [8]:
df['Revenue'].value_counts()

False    10422
True      1908
Name: Revenue, dtype: int64

In [33]:
from sklearn.preprocessing import LabelEncoder,StandardScaler,OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.metrics import accuracy_score,classification_report
from sklearn.model_selection import train_test_split,GridSearchCV

In [10]:
le=LabelEncoder()
df['Revenue']=le.fit_transform(df['Revenue'])

In [11]:
df['Revenue'].value_counts()

0    10422
1     1908
Name: Revenue, dtype: int64

In [15]:
X=df.iloc[:,:-1]
y=df['Revenue']

In [19]:
X.shape

(12330, 17)

In [20]:
y.shape

(12330,)

In [16]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=.20,random_state=35)

In [21]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(9864, 17)
(2466, 17)
(9864,)
(2466,)


In [24]:
num_pipeline=Pipeline(steps=[('imputer',SimpleImputer(strategy='median')),
                            ('scale',StandardScaler())])
cat_pipeline=Pipeline(steps=[('mputer',SimpleImputer(strategy='most_frequent')),
                            ('encoder',OneHotEncoder())])
preprocessor=ColumnTransformer([('numerical_pipe',num_pipeline,data_num),
                               ('categorical_pipe',cat_pipeline,data_col)])

In [25]:
X_train=preprocessor.fit_transform(X_train)
X_test=preprocessor.transform(X_test)

In [29]:
models={'RandomForest': RandomForestClassifier(),
       'GradientBoost': GradientBoostingClassifier()}

params={'RandomForest':{'n_estimators': [100,300,500],'max_depth':[15],'min_samples_split':[5],'random_state':[1]},
       'GradientBoost':{'n_estimators': [100,300,500],'max_depth':[15],'min_samples_split' :[5],'random_state':[1]}}

In [35]:
def evaluate_models(X_train,X_test,y_train,y_test,models,params):
    report={}
    for i in range(len(list(models))):
        model=list(models.values())[i]
        param=params[list(models.keys())[i]]
        
        gs=GridSearchCV(model,param,cv=3)
        gs.fit(X_train,y_train)
        
        model.set_params(**gs.best_params_)
        model.fit(X_train,y_train)
        
        y_test_pred=model.predict(X_test)
        acc_score=accuracy_score(y_test,y_test_pred)
        
        print(f'{list(models.keys())[i]} :')
        print(classification_report(y_test,y_test_pred))
        
        report[list(models.keys())[i]]=acc_score
        
        best_score=max(list(report.values()))
        best_model=list(filter(lambda x:report[x]==best_score,report))[0]
    print(f'The Best score of models:{best_model} : {best_score}')
    return report
        

In [36]:
evaluate_models(X_train,X_test,y_train,y_test,models,params)

RandomForest :
              precision    recall  f1-score   support

           0       0.92      0.96      0.94      2085
           1       0.74      0.57      0.64       381

    accuracy                           0.90      2466
   macro avg       0.83      0.77      0.79      2466
weighted avg       0.90      0.90      0.90      2466

GradientBoost :
              precision    recall  f1-score   support

           0       0.92      0.96      0.94      2085
           1       0.70      0.57      0.63       381

    accuracy                           0.90      2466
   macro avg       0.81      0.76      0.79      2466
weighted avg       0.89      0.90      0.89      2466

The Best score of models:RandomForest : 0.9022708840227088


{'RandomForest': 0.9022708840227088, 'GradientBoost': 0.8965936739659367}

In [46]:
best_mod=models['RandomForest']
gs=GridSearchCV(best_mod,params['RandomForest'],cv=3)
gs.fit(X_train,y_train)

GridSearchCV(cv=3,
             estimator=RandomForestClassifier(max_depth=15, min_samples_split=5,
                                              n_estimators=300,
                                              random_state=1),
             param_grid={'max_depth': [15], 'min_samples_split': [5],
                         'n_estimators': [100, 300, 500], 'random_state': [1]})

In [49]:
best_mod.set_params(**gs.best_params_)
best_mod.fit(X_train,y_train)

RandomForestClassifier(max_depth=15, min_samples_split=5, n_estimators=300,
                       random_state=1)

In [50]:
y_pred=best_mod.predict(X_test)

In [53]:
y_pred_new=le.inverse_transform(y_pred)

In [92]:
revenue_new=pd.DataFrame(y_pred_new)
revenue_new.columns=['revenue']
X_set=pd.DataFrame(X_test)

In [94]:
df_new=pd.concat([X_set,revenue_new],axis='columns')

In [95]:
df_new.shape

(2466, 29)

In [96]:
df_new.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,revenue
0,-0.398573,-0.428136,-0.401219,-0.244789,-0.544483,-0.523328,-0.457344,-0.690163,-0.311384,-0.307468,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,False
1,1.401576,0.651998,-0.401219,-0.244789,-0.269566,-0.199682,-0.457344,-0.829217,-0.311384,-0.307468,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,False
2,-0.698598,-0.456011,-0.401219,-0.244789,-0.177927,-0.308184,-0.113166,-0.342529,-0.311384,-0.307468,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,False
3,0.501502,-0.115943,-0.401219,-0.244789,1.95268,0.195309,-0.386743,-0.641785,2.343948,-0.307468,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,False
4,3.501751,6.579028,9.066771,6.654864,-0.040468,0.398332,-0.393659,-0.414293,-0.311384,2.721355,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,False


predict the Revenue and whether that particular revenue is on the weekend or not

In [117]:
ss=df_new.loc[df_new['revenue']==True]

In [125]:
pred_value=ss.loc[ss[27]==1]
pred_value.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,revenue
74,0.201477,0.207401,0.387781,0.357361,-0.155017,-0.412539,-0.457344,-0.616924,0.507228,-0.307468,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,True
81,0.501502,0.082523,-0.401219,-0.244789,-0.384115,-0.484562,-0.239968,-0.452524,3.380899,-0.307468,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,True
83,-0.698598,-0.456011,-0.401219,-0.244789,-0.498663,-0.56162,0.689915,0.831247,1.105803,-0.307468,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,True
92,2.901701,5.279616,0.387781,-0.223784,2.823251,2.942309,-0.406354,-0.566428,-0.113299,-0.307468,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,True
101,-0.698598,-0.456011,-0.401219,-0.244789,-0.246656,-0.057579,-0.457344,-0.669499,0.547527,-0.307468,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,True
