# Task 0: Choosing you project topic

We select the problem type 1: Focusing on Data Modelling and the dataset we choose is Online Shoppers Purchasing Intention Dataset Data Set.

# Task 1: Retrieving and Preparing the Data

<h4><b>Import library</b></h4>

In [22]:
import matplotlib.pyplot as plt
import seaborn as sns 
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.neural_network import MLPClassifier
from sklearn import set_config
from sklearn.neighbors import LocalOutlierFactor
from sklearn.feature_selection import f_regression, f_classif, mutual_info_classif
set_config(display='diagram')

<h4><b>Load data</b></h4>

In [23]:
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/00468/online_shoppers_intention.csv', sep=',')
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


<h4><b>Data description</b></h4>

<p>The dataset consists of 10 numerical and 8 categorical attributes.</p>
<p>The 'Revenue' attribute can be used as the class label.</p>

<p>Below, we would refer to split the data set into train set and test set with ratio of (70%, 30%) of data set. Moreover, from the train set, we continue splitting a part of it for validation set with ratio of (75%, 25%) of train set.</p>

In [24]:
y = df['Revenue'].replace({True: 1, False: 0})
X = df.drop('Revenue', axis=1)
X_df, test_X_df, y_sr, test_y_sr = train_test_split(X, y, test_size=0.3, stratify=y, random_state=0)
train_X_df, val_X_df, train_y_sr, val_y_sr = train_test_split(X_df, y_sr, test_size=0.25, stratify=y_sr, random_state=0)

We check the data type of each column and whether the column has missing value

In [25]:
X_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8631 entries, 5767 to 9341
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           8631 non-null   int64  
 1   Administrative_Duration  8631 non-null   float64
 2   Informational            8631 non-null   int64  
 3   Informational_Duration   8631 non-null   float64
 4   ProductRelated           8631 non-null   int64  
 5   ProductRelated_Duration  8631 non-null   float64
 6   BounceRates              8631 non-null   float64
 7   ExitRates                8631 non-null   float64
 8   PageValues               8631 non-null   float64
 9   SpecialDay               8631 non-null   float64
 10  Month                    8631 non-null   object 
 11  OperatingSystems         8631 non-null   int64  
 12  Browser                  8631 non-null   int64  
 13  Region                   8631 non-null   int64  
 14  TrafficType          

<p>Firstly, We will select the numeric columns and category columns</p>

In [26]:
cate_cols = ['Month', 'OperatingSystems', 'Browser', 'Region', 'TrafficType', 'VisitorType', 'Weekend']
nume_cols = X_df.columns.drop(cate_cols)

### a) Identify outliers


In [27]:
X_df.describe()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,OperatingSystems,Browser,Region,TrafficType
count,8631.0,8631.0,8631.0,8631.0,8631.0,8631.0,8631.0,8631.0,8631.0,8631.0,8631.0,8631.0,8631.0,8631.0
mean,2.323833,79.560976,0.510949,34.993989,31.870699,1198.888323,0.02216,0.042844,5.917538,0.062357,2.120728,2.337041,3.151894,4.099409
std,3.330882,170.278267,1.306935,142.204516,45.358193,1888.448874,0.048342,0.048401,18.508248,0.200509,0.905257,1.682179,2.397321,4.041194
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
25%,0.0,0.0,0.0,0.0,7.0,185.125,0.0,0.014198,0.0,0.0,2.0,2.0,1.0,2.0
50%,1.0,8.0,0.0,0.0,18.0,604.5,0.003125,0.025333,0.0,0.0,2.0,2.0,3.0,2.0
75%,4.0,92.100758,0.0,0.0,38.0,1474.5,0.016937,0.05,0.0,0.0,3.0,2.0,4.0,4.0
max,27.0,2657.318056,24.0,2256.916667,705.0,43171.23338,0.2,0.2,361.763742,1.0,8.0,13.0,9.0,20.0


* Standard deviation method

In [43]:
data_mean, data_std = np.mean(X_df[nume_cols]), np.std(X_df[nume_cols])
# identify outliers
cut_off = data_std * 3
lower, upper = data_mean - cut_off, data_mean + cut_off
outlier_std_df = X_df[~((X_df[nume_cols] < lower) | (X_df[nume_cols] > upper)).any(axis=1)]
print('Percentage of outliers: ',round((1-len(outlier_std_df)/len(X_df))*100, 2),'%', sep='')

Percentage of outliers: 18.73%


* Interquartile Range Method

In [44]:
Q1 = X_df[nume_cols].quantile(0.25)
Q3 = X_df[nume_cols].quantile(0.75)
IQR = Q3 - Q1

outlier_iqr_df = X_df[~((X_df[nume_cols] < (Q1 - 1.5 * IQR)) |(X_df[nume_cols] > (Q3 + 1.5 * IQR))).any(axis=1)]
print('Percentage of outliers: ',round((1-len(outlier_iqr_df)/len(X_df))*100, 2),'%', sep='')

Percentage of outliers: 57.49%


* Automatic Outlier Detection

<p>Some category columns have too many categories that will affect the model. So, we choose which columns that have more than 5 categories to process</p>

In [8]:
class ColAdderDropper(BaseEstimator, TransformerMixin):
    def __init__(self, columns_list=[], num_top_values_list=[], percentage=1, categories=5, default=-1):
        self.num_top_values_list = num_top_values_list
        self.columns_list = columns_list
        self.percentage = percentage
        self.categories = categories
        self.default = default

    def getLogicNumberTopValue(self, X_df, column):
        counts = X_df[column].value_counts()
        return sum([1 for value in list(counts.values) if value>(self.percentage*max(list(counts.values))/100)])

    def reprocessing_data(self, X_df):
        self.columns_list = [column for column in self.columns_list if len(X_df[column].value_counts())>=self.categories]
        if len(self.num_top_values_list)==0:
            self.num_top_values_list = [self.getLogicNumberTopValue(X_df, column) for column in self.columns_list]

    def fit(self, X_df, y=None):
        self.reprocessing_data(X_df)
        self.top_values_list = []
        for column, num_top_values in zip(self.columns_list, self.num_top_values_list):
            counts_ = X_df[column].value_counts()
            ids = list(counts_.index)
            self.top_values_list.append(ids[:max(1, min(num_top_values, len(ids)))])
            return self
    
    def transform(self, X_df, y=None):
        df = X_df.copy()
        for top_value, column in zip(self.top_values_list, self.columns_list):
            df.loc[~df[column].isin(top_value), column] = self.default
        return df

In [9]:
col_adderdropper = ColAdderDropper(columns_list=cate_cols)
col_adderdropper.fit(X_df)
new_df = col_adderdropper.transform(train_X_df)
new_df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend
7126,2,30.2,0,0.0,29,630.326667,0.006667,0.023333,0.0,0.0,Sep,2,2,6,1,Returning_Visitor,False
3617,0,0.0,0,0.0,11,855.25,0.018182,0.063636,0.0,0.0,May,2,2,3,13,Returning_Visitor,False
4491,3,48.0,0,0.0,10,129.5,0.0,0.018182,0.0,0.0,May,2,5,9,4,Returning_Visitor,False
3824,0,0.0,0,0.0,8,202.833333,0.0,0.014286,0.0,0.0,May,2,4,1,2,Returning_Visitor,False
10164,3,52.75,0,0.0,31,3069.077778,0.006452,0.020538,5.793027,0.0,Nov,1,1,3,2,Returning_Visitor,False


In [10]:
numeric_trans=SimpleImputer()
categorical_trans=make_pipeline(
        SimpleImputer(missing_values=np.nan, strategy='most_frequent'),
        OneHotEncoder(handle_unknown='ignore')
)
column_trans=make_column_transformer(
        (numeric_trans, nume_cols),
        (categorical_trans, cate_cols)
        )
preprocess_pipeline = make_pipeline(col_adderdropper, column_trans)
preprocessed_train_X = preprocess_pipeline.fit_transform(train_X_df)

# Task 2: Feature Engineering

In [11]:
df[cate_cols].select_dtypes(['bool', 'object'])

Unnamed: 0,Month,VisitorType,Weekend
0,Feb,Returning_Visitor,False
1,Feb,Returning_Visitor,False
2,Feb,Returning_Visitor,False
3,Feb,Returning_Visitor,False
4,Feb,Returning_Visitor,True
...,...,...,...
12325,Dec,Returning_Visitor,True
12326,Nov,Returning_Visitor,True
12327,Nov,Returning_Visitor,True
12328,Nov,Returning_Visitor,False


In [12]:
X_new = SelectKBest(chi2, k=5).fit_transform(X_df, y_sr)

NameError: name 'SelectKBest' is not defined

In [None]:
class OneHotEncoding:
    def __init__(self, columns, handle_unknown_type='ignore', isSparse=False):
        self.model = OneHotEncoder(handle_unknown=handle_unknown_type, sparse=isSparse)
        self.columns = columns
    def fit_transform(self, df, visualize=False):
        for col in self.columns:
            self.matrix = self.model.fit_transform(df[[col]])  # get a matrix of new features and values
            names = self.model.get_feature_names()  # get names for these features
            df_oh = pd.DataFrame(data=self.matrix, columns=names, index=df.index)
            if visualize: display(df_oh.plot.hist())
            df = pd.concat([df, df_oh], axis=1)
            df.drop(col, axis=1, inplace=True)
        return df

    def retransform_with_values(self, columns, values):
        dic = {}
        for col, value in zip(columns, values):
            split_col = col.rsplit('_')
            if split_col[0] not in dic:
                dic[split_col[0]] = (value, 1)
            else:
                dic[split_col[0]] = (dic[split_col[0]][0]+value, dic[split_col[0]][1]+1)
        key = list(dic.keys())
        return key, [dic[k][0]/dic[k][1] for k in key]


In [None]:
cate_df = train_X_df.drop(nume_cols, axis=1)
onehotencoder = OneHotEncoding(cate_df.select_dtypes("object").columns, handle_unknown_type='ignore', isSparse=False)
cate_ohc_df = onehotencoder.fit_transform(cate_df)
cate_ohc_df

Unnamed: 0,OperatingSystems,Browser,Region,TrafficType,Weekend,x0_Aug,x0_Dec,x0_Feb,x0_Jul,x0_June,x0_Mar,x0_May,x0_Nov,x0_Oct,x0_Sep,x0_New_Visitor,x0_Other,x0_Returning_Visitor
7126,2,2,6,1,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3617,2,2,3,13,False,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4491,2,5,9,4,False,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3824,2,4,1,2,False,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
10164,1,1,3,2,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7942,3,2,2,2,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1552,1,1,1,2,False,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
8109,2,2,3,2,False,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1149,2,5,1,1,False,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


# References
[1] https://machinelearningmastery.com/how-to-use-statistics-to-identify-outliers-in-data/