In [104]:
import matplotlib.pyplot as plt
import seaborn as sns 
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.neural_network import MLPClassifier
from sklearn import set_config
set_config(display='diagram')

## **Read data from UCI dataset**

In [105]:
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/00468/online_shoppers_intention.csv', sep=',')
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [106]:
df.dtypes

Administrative               int64
Administrative_Duration    float64
Informational                int64
Informational_Duration     float64
ProductRelated               int64
ProductRelated_Duration    float64
BounceRates                float64
ExitRates                  float64
PageValues                 float64
SpecialDay                 float64
Month                       object
OperatingSystems             int64
Browser                      int64
Region                       int64
TrafficType                  int64
VisitorType                 object
Weekend                       bool
Revenue                       bool
dtype: object

After reviewing, we have made decision on the feature of `Revenue` as our targets to research in this campain.

In [107]:
df['Revenue'].value_counts()

False    10422
True      1908
Name: Revenue, dtype: int64

In [108]:
y = df['Revenue']
X = df.drop('Revenue', axis=1)

Below, we would refer to split the data set into train set and test set with ratio of (70%, 30%) of data set. Moreover, from the train set, we continue spliting a part of it for validation set with ratio of (80%, 20%) of train set.

In [109]:
X_df, test_X_df, y_sr, test_y_sr = train_test_split(X, y, test_size=0.3, stratify=y, random_state=0)
train_X_df, val_X_df, train_y_sr, val_y_sr = train_test_split(X_df, y_sr, test_size=0.25, stratify=y_sr, random_state=0)

In [110]:
print(f"Train X set shape: {train_X_df.shape}")
print(f"Train y set shape: {train_y_sr.shape}")
print(f"Validate X set shape: {val_X_df.shape}")
print(f"Validate y set shape: {val_y_sr.shape}")
print(f"Test X set shape: {test_X_df.shape}")
print(f"Test y set shape: {test_y_sr.shape}")

Train X set shape: (6473, 17)
Train y set shape: (6473,)
Validate X set shape: (2158, 17)
Validate y set shape: (2158,)
Test X set shape: (3699, 17)
Test y set shape: (3699,)


We just `randome_state` the `train_test_split` to make the training process more efficient in general.

In [111]:
train_X_df.head().index

Int64Index([7126, 3617, 4491, 3824, 10164], dtype='int64')

In [112]:
train_X_df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend
7126,2,30.2,0,0.0,29,630.326667,0.006667,0.023333,0.0,0.0,Sep,2,2,6,1,Returning_Visitor,False
3617,0,0.0,0,0.0,11,855.25,0.018182,0.063636,0.0,0.0,May,2,2,3,13,Returning_Visitor,False
4491,3,48.0,0,0.0,10,129.5,0.0,0.018182,0.0,0.0,May,2,5,9,4,Returning_Visitor,False
3824,0,0.0,0,0.0,8,202.833333,0.0,0.014286,0.0,0.0,May,2,4,1,2,Returning_Visitor,False
10164,3,52.75,0,0.0,31,3069.077778,0.006452,0.020538,5.793027,0.0,Nov,1,1,3,2,Returning_Visitor,False


In [113]:
class ColAdderDropper(BaseEstimator, TransformerMixin):
    def __init__(self, columns_list=[], num_top_values_list=[], default=-1):
        self.num_top_values_list = num_top_values_list
        self.columns_list = columns_list
        self.default = default

    def getLogicNumberTopValue(self, X_df, column):
        counts = X_df[column].value_counts()
        return sum([1 for value in list(counts.values) if value>(1*max(list(counts.values))/100)])

    def reprocessing_data(self, X_df):
        self.columns_list = [column for column in self.columns_list if len(X_df[column].value_counts())>=5]
        if len(self.num_top_values_list)==0:
            self.num_top_values_list = [self.getLogicNumberTopValue(X_df, column) for column in self.columns_list]

    def fit(self, X_df, y=None):
        self.reprocessing_data(X_df)
        self.top_values_list = []

        for column, num_top_values in zip(self.columns_list, self.num_top_values_list):
            self.counts_ = X_df[column].value_counts()
            ids = list(self.counts_.index)
            self.top_values_list.append(ids[:max(1, min(num_top_values, len(ids)))])
        return self
    
    def transform(self, X_df, y=None):
        out_df = X_df.copy()
        for top_value, column in zip(self.top_values_list, self.columns_list):
            out_df.loc[~out_df[column].isin(top_value), column] = self.default
            out_df = out_df.sort_index(axis=1)
        return out_df

In [114]:
cate_cols = ['Month', 'OperatingSystems', 'Browser', 'Region', 'TrafficType', 'VisitorType', 'Weekend']
nume_cols = X.columns.drop(cate_cols)

In [115]:
col_adderdropper = ColAdderDropper(columns_list=cate_cols)
col_adderdropper.fit(train_X_df)
new_train_X_df = col_adderdropper.transform(train_X_df)
new_train_X_df

Unnamed: 0,Administrative,Administrative_Duration,BounceRates,Browser,ExitRates,Informational,Informational_Duration,Month,OperatingSystems,PageValues,ProductRelated,ProductRelated_Duration,Region,SpecialDay,TrafficType,VisitorType,Weekend
7126,2,30.200000,0.006667,2,0.023333,0,0.0,Sep,2,0.000000,29,630.326667,6,0.0,1,Returning_Visitor,False
3617,0,0.000000,0.018182,2,0.063636,0,0.0,May,2,0.000000,11,855.250000,3,0.0,13,Returning_Visitor,False
4491,3,48.000000,0.000000,5,0.018182,0,0.0,May,2,0.000000,10,129.500000,9,0.0,4,Returning_Visitor,False
3824,0,0.000000,0.000000,4,0.014286,0,0.0,May,2,0.000000,8,202.833333,1,0.0,2,Returning_Visitor,False
10164,3,52.750000,0.006452,1,0.020538,0,0.0,Nov,1,5.793027,31,3069.077778,3,0.0,2,Returning_Visitor,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7942,8,410.900000,0.011765,2,0.024216,5,406.9,Oct,3,8.887672,24,1220.257143,2,0.0,2,Returning_Visitor,False
1552,4,63.333333,0.000000,1,0.001667,0,0.0,Mar,1,27.306367,13,556.666667,1,0.0,2,Returning_Visitor,False
8109,0,0.000000,0.005634,2,0.017425,1,6.0,Dec,2,0.000000,71,1375.264286,3,0.0,2,New_Visitor,False
1149,0,0.000000,0.003704,5,0.016667,0,0.0,Mar,2,0.000000,56,2633.869048,1,0.0,1,Returning_Visitor,False


In [119]:
numerics_trans = SimpleImputer()
categorical_trans=make_pipeline(
    SimpleImputer(missing_values=np.nan, strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore')
)

column_trans = make_column_transformer(
    (numerics_trans, nume_cols),
    (categorical_trans, cate_cols)
)

preprocess_pipeline = make_pipeline(col_adderdropper, column_trans, StandardScaler())
preprocess_pipeline
preprocess_train_X = preprocess_pipeline.fit_transform(train_X_df)
preprocess_train_X

<class 'scipy.sparse._csr.csr_matrix'>
