In [None]:
import matplotlib.pyplot as plt
import seaborn as sns 
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.neural_network import MLPClassifier
from sklearn import set_config
# Ask scikit-learn to display estimators using its 'diagram' representation
# when they are shown as cell output.
set_config(display='diagram')

## **Read data from UCI dataset**

In [None]:
# Location of the raw CSV in the UCI machine-learning repository.
DATA_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00468/online_shoppers_intention.csv'

# The file is comma-separated, which is already the default for `read_csv`.
df = pd.read_csv(DATA_URL)

# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [None]:
# Inspect each column's dtype to see which features are numeric and which
# are stored as object/bool.
df.dtypes

Administrative               int64
Administrative_Duration    float64
Informational                int64
Informational_Duration     float64
ProductRelated               int64
ProductRelated_Duration    float64
BounceRates                float64
ExitRates                  float64
PageValues                 float64
SpecialDay                 float64
Month                       object
OperatingSystems             int64
Browser                      int64
Region                       int64
TrafficType                  int64
VisitorType                 object
Weekend                       bool
Revenue                       bool
dtype: object

After reviewing the features, we decided to use `Revenue` as the target variable for this campaign.

In [None]:
# Count the rows in each `Revenue` class to see how balanced the target is.
df['Revenue'].value_counts()

False    10422
True      1908
Name: Revenue, dtype: int64

In [None]:
# Split the frame into predictors and label: every column except `Revenue`
# is a feature, and `Revenue` itself is the target series.
X_df = df.drop(columns=['Revenue'])
y_sr = df['Revenue']

Below, we split the data set into a train set and a test set with a 70%/30% ratio. Then, from the train set, we split off a validation set with an 80%/20% (train/validation) ratio.

In [None]:
# Stage 1: hold out 30% of all rows as the test set, stratified on the label
# so both parts keep the same class proportions.
trainval_X_df, test_X_df, trainval_y_sr, test_y_sr = train_test_split(
    X_df,
    y_sr,
    test_size=0.3,
    stratify=y_sr,
    random_state=0,
)

# Stage 2: from the remaining 70%, set aside 20% as the validation set
# (again stratified); what is left is the final training set.
train_X_df, val_X_df, train_y_sr, val_y_sr = train_test_split(
    trainval_X_df,
    trainval_y_sr,
    test_size=0.2,
    stratify=trainval_y_sr,
    random_state=0,
)

In [None]:
# Report the shape of every split (features and labels) in a fixed order.
for split_name, split_data in [
    ("Train X", train_X_df),
    ("Train y", train_y_sr),
    ("Validate X", val_X_df),
    ("Validate y", val_y_sr),
    ("Test X", test_X_df),
    ("Test y", test_y_sr),
]:
    print(f"{split_name} set shape: {split_data.shape}")

Train X set shape: (6904, 17)
Train y set shape: (6904,)
Validate X set shape: (1727, 17)
Validate y set shape: (1727,)
Test X set shape: (3699, 17)
Test y set shape: (3699,)


We fix the `random_state` of `train_test_split` so that the splits are reproducible, which makes results comparable from one run to the next.

In [None]:
# Show the row labels of the first five training rows; they are the original
# labels from `df`, not a fresh 0..n-1 range.
train_X_df.index[:5]

Int64Index([8488, 9032, 4291, 3904, 9909], dtype='int64')

In [None]:
# Preview the first rows of the training features (the `Revenue` column has
# already been dropped from this frame).
train_X_df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend
8488,6,79.25,0,0.0,13,150.166667,0.0,0.012632,0.0,0.0,Dec,1,1,1,3,New_Visitor,False
9032,3,156.5,0,0.0,9,143.0,0.0,0.005556,0.0,0.0,Nov,1,1,1,2,New_Visitor,False
4291,0,0.0,0,0.0,2,0.0,0.0,0.2,0.0,0.8,May,3,2,3,13,Returning_Visitor,False
3904,0,0.0,0,0.0,41,3272.0,0.045238,0.078571,0.0,0.8,May,2,6,1,6,Returning_Visitor,False
9909,0,0.0,0,0.0,23,836.5,0.0,0.013043,0.0,0.0,Nov,3,2,3,2,Returning_Visitor,False
