# Task 0: Choosing you project topic

We select the problem type 1: Focusing on Data Modelling and the dataset we choose is Online Shoppers Purchasing Intention Dataset Data Set.

# Task 1: Retrieving and Preparing the Data

<h4><b>Import library</b></h4>

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns 
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.neural_network import MLPClassifier
from sklearn import set_config
from sklearn.neighbors import LocalOutlierFactor
from sklearn.feature_selection import f_regression, f_classif, mutual_info_classif, chi2, SelectKBest
set_config(display='diagram')

<h4><b>Load data</b></h4>

In [None]:
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/00468/online_shoppers_intention.csv', sep=',')
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


<h4><b>Data description</b></h4>

<p>The dataset consists of 10 numerical and 8 categorical attributes.</p>
<p>The 'Revenue' attribute can be used as the class label.</p>

<p>Below, we would refer to split the data set into train set and test set with ratio of (70%, 30%) of data set. Moreover, from the train set, we continue splitting a part of it for validation set with ratio of (75%, 25%) of train set.</p>

In [None]:
y = df['Revenue'].replace({True: 1, False: 0})
X = df.drop('Revenue', axis=1)
X_df, test_X_df, y_sr, test_y_sr = train_test_split(X, y, test_size=0.3, stratify=y, random_state=0)
train_X_df, val_X_df, train_y_sr, val_y_sr = train_test_split(X_df, y_sr, test_size=0.25, stratify=y_sr, random_state=0)

We check the data type of each column and whether the column has missing value

In [None]:
train_X_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6473 entries, 7126 to 71
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           6473 non-null   int64  
 1   Administrative_Duration  6473 non-null   float64
 2   Informational            6473 non-null   int64  
 3   Informational_Duration   6473 non-null   float64
 4   ProductRelated           6473 non-null   int64  
 5   ProductRelated_Duration  6473 non-null   float64
 6   BounceRates              6473 non-null   float64
 7   ExitRates                6473 non-null   float64
 8   PageValues               6473 non-null   float64
 9   SpecialDay               6473 non-null   float64
 10  Month                    6473 non-null   object 
 11  OperatingSystems         6473 non-null   int64  
 12  Browser                  6473 non-null   int64  
 13  Region                   6473 non-null   int64  
 14  TrafficType            

<p>Firstly, We will select the numeric columns and category columns</p>

In [None]:
cate_cols = ['Month', 'OperatingSystems', 'Browser', 'Region', 'TrafficType', 'VisitorType', 'Weekend']
nume_cols = train_X_df.columns.drop(cate_cols)

### a) Identify and remove outliers


In [None]:
train_X_df.describe()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,OperatingSystems,Browser,Region,TrafficType
count,6473.0,6473.0,6473.0,6473.0,6473.0,6473.0,6473.0,6473.0,6473.0,6473.0,6473.0,6473.0,6473.0,6473.0
mean,2.326433,79.767706,0.511509,35.646408,32.050827,1209.048855,0.022154,0.042658,5.875163,0.063525,2.118183,2.335393,3.150935,4.095937
std,3.356341,172.342034,1.308158,144.039658,45.129758,1893.904016,0.048516,0.04849,17.928228,0.202411,0.900366,1.686622,2.39429,4.021409
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
25%,0.0,0.0,0.0,0.0,7.0,188.0,0.0,0.014,0.0,0.0,2.0,2.0,1.0,2.0
50%,1.0,8.0,0.0,0.0,18.0,609.541667,0.003038,0.025,0.0,0.0,2.0,2.0,3.0,2.0
75%,4.0,91.666667,0.0,0.0,38.0,1494.083333,0.016667,0.05,0.0,0.0,3.0,2.0,4.0,4.0
max,27.0,2657.318056,24.0,2256.916667,705.0,43171.23338,0.2,0.2,361.763742,1.0,8.0,13.0,9.0,20.0


* Standard deviation method

In [None]:
data_mean, data_std = np.mean(train_X_df[nume_cols]), np.std(train_X_df[nume_cols])
# identify outliers
cut_off = data_std * 3
lower, upper = data_mean - cut_off, data_mean + cut_off
outlier_std_df = train_X_df[~((train_X_df[nume_cols] < lower) | (train_X_df[nume_cols] > upper)).any(axis=1)]
print('Percentage of outliers: ',round((1-len(outlier_std_df)/len(train_X_df))*100, 2),'%', sep='')

Percentage of outliers: 19.22%


The percentage of outliers is so high that we wonder whether the values are outliers

* Interquartile Range Method

In [None]:
Q1 = train_X_df[nume_cols].quantile(0.25)
Q3 = train_X_df[nume_cols].quantile(0.75)
IQR = Q3 - Q1

outlier_iqr_df = train_X_df[~((train_X_df[nume_cols] < (Q1 - 1.5 * IQR)) |(train_X_df[nume_cols] > (Q3 + 1.5 * IQR))).any(axis=1)]
print('Percentage of outliers: ',round((1-len(outlier_iqr_df)/len(train_X_df))*100, 2),'%', sep='')

Percentage of outliers: 57.5%


The interquartile range method detects too much outliers so this is completely inefficient

* Automatic Outlier Detection

In [None]:
lof = LocalOutlierFactor()
yhat = lof.fit_predict(train_X_df[nume_cols])
mask = yhat != -1
outlier_aod_df = train_X_df.iloc[mask]
outlier_aod_y_df = train_y_sr.iloc[mask]
print('Percentage of outliers: ',round((1-len(outlier_aod_df)/len(train_X_df))*100, 2),'%', sep='')

Percentage of outliers: 10.06%


This method gets low percentage of outliers that we can accept to remove

In [None]:
train_X_df = outlier_aod_df
train_y_sr = outlier_aod_y_df
train_X_df.describe()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,OperatingSystems,Browser,Region,TrafficType
count,5822.0,5822.0,5822.0,5822.0,5822.0,5822.0,5822.0,5822.0,5822.0,5822.0,5822.0,5822.0,5822.0,5822.0
mean,2.330814,74.792171,0.433184,25.339379,31.794401,1198.582692,0.021682,0.042419,4.212721,0.06156,2.119375,2.329784,3.167296,4.064411
std,3.305934,139.332081,1.14755,97.951695,42.065412,1651.625145,0.047342,0.047466,12.607227,0.199715,0.896081,1.666634,2.402364,4.012368
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
25%,0.0,0.0,0.0,0.0,8.0,197.4375,0.0,0.014135,0.0,0.0,2.0,2.0,1.0,2.0
50%,1.0,8.0,0.0,0.0,19.0,644.6,0.003318,0.025517,0.0,0.0,2.0,2.0,3.0,2.0
75%,4.0,94.858333,0.0,0.0,38.0,1545.3125,0.017004,0.05,0.0,0.0,3.0,2.0,4.0,4.0
max,24.0,1417.5,14.0,1778.0,518.0,14988.59151,0.2,0.2,154.095539,1.0,8.0,13.0,9.0,20.0


### b)

<p>Some category columns have too many categories that will affect the model. So, we choose which columns that have more than 5 categories to process</p>

In [None]:
class ColAdderDropper(BaseEstimator, TransformerMixin):
    def __init__(self, columns_list=[], num_top_values_list=[], percentage=1, categories=5, default=-1):
        self.num_top_values_list = num_top_values_list
        self.columns_list = columns_list
        self.percentage = percentage
        self.categories = categories
        self.default = default

    def getLogicNumberTopValue(self, X_df, column):
        counts = X_df[column].value_counts()
        return sum([1 for value in list(counts.values) if value>(self.percentage*max(list(counts.values))/100)])

    def reprocessing_data(self, X_df):
        self.columns_list = [column for column in self.columns_list if len(X_df[column].value_counts())>=self.categories]
        if len(self.num_top_values_list)==0:
            self.num_top_values_list = [self.getLogicNumberTopValue(X_df, column) for column in self.columns_list]

    def fit(self, X_df, y=None):
        self.reprocessing_data(X_df)
        self.top_values_list = []
        for column, num_top_values in zip(self.columns_list, self.num_top_values_list):
            counts_ = X_df[column].value_counts()
            ids = list(counts_.index)
            self.top_values_list.append(ids[:max(1, min(num_top_values, len(ids)))])
            return self
    
    def transform(self, X_df, y=None):
        df = X_df.copy()
        for top_value, column in zip(self.top_values_list, self.columns_list):
            df.loc[~df[column].isin(top_value), column] = self.default
        return df

In [None]:
col_adderdropper = ColAdderDropper(columns_list=cate_cols)
col_adderdropper.fit(train_X_df)
new_train_X_df = col_adderdropper.transform(train_X_df)
new_train_X_df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend
7126,2,30.2,0,0.0,29,630.326667,0.006667,0.023333,0.0,0.0,Sep,2,2,6,1,Returning_Visitor,False
3617,0,0.0,0,0.0,11,855.25,0.018182,0.063636,0.0,0.0,May,2,2,3,13,Returning_Visitor,False
4491,3,48.0,0,0.0,10,129.5,0.0,0.018182,0.0,0.0,May,2,5,9,4,Returning_Visitor,False
3824,0,0.0,0,0.0,8,202.833333,0.0,0.014286,0.0,0.0,May,2,4,1,2,Returning_Visitor,False
10164,3,52.75,0,0.0,31,3069.077778,0.006452,0.020538,5.793027,0.0,Nov,1,1,3,2,Returning_Visitor,False


In [None]:
numerics_trans = SimpleImputer()
categorical_trans=make_pipeline(
    SimpleImputer(missing_values=np.nan, strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore')
)

column_trans = make_column_transformer(
    (numerics_trans, nume_cols),
    (categorical_trans, cate_cols)
)

preprocess_pipeline = make_pipeline(col_adderdropper, column_trans, StandardScaler(with_mean=False))
preprocess_train_X = preprocess_pipeline.fit_transform(train_X_df)
preprocess_pipeline

# Task 2: Feature Engineering

In [None]:
train_X_df[cate_cols].select_dtypes(['bool', 'object'])

Unnamed: 0,Month,VisitorType,Weekend
7126,Sep,Returning_Visitor,False
3617,May,Returning_Visitor,False
4491,May,Returning_Visitor,False
3824,May,Returning_Visitor,False
10164,Nov,Returning_Visitor,False
...,...,...,...
7942,Oct,Returning_Visitor,False
1552,Mar,Returning_Visitor,False
8109,Dec,New_Visitor,False
1149,Mar,Returning_Visitor,False


In [None]:
class OneHotEncoding:
    def __init__(self, columns, handle_unknown_type='ignore', isSparse=False):
        self.model = OneHotEncoder(handle_unknown=handle_unknown_type, sparse=isSparse)
        self.columns = columns
        
    def fit_transform(self, df, visualize=False):
        for col in self.columns:
            self.matrix = self.model.fit_transform(df[[col]])  # get a matrix of new features and values
            names = self.model.get_feature_names()  # get names for these features
            df_oh = pd.DataFrame(data=self.matrix, columns=names, index=df.index)
            if visualize: display(df_oh.plot.hist())
            df = pd.concat([df, df_oh], axis=1)
            df.drop(col, axis=1, inplace=True)
        return df

    def retransform_with_values(self, columns, values):
        dic = {}
        for col, value in zip(columns, values):
            split_col = col.rsplit('_')
            if split_col[0] not in dic:
                dic[split_col[0]] = (value, 1)
            else:
                dic[split_col[0]] = (dic[split_col[0]][0]+value, dic[split_col[0]][1]+1)
        key = list(dic.keys())
        return key, [dic[k][0]/dic[k][1] for k in key]


In [None]:
cate_df = train_X_df.drop(nume_cols, axis=1)
onehotencoder = OneHotEncoding(cate_df.select_dtypes("object").columns, handle_unknown_type='ignore', isSparse=False)
cate_ohc_df = onehotencoder.fit_transform(cate_df)
cate_ohc_df

Unnamed: 0,OperatingSystems,Browser,Region,TrafficType,Weekend,x0_Aug,x0_Dec,x0_Feb,x0_Jul,x0_June,x0_Mar,x0_May,x0_Nov,x0_Oct,x0_Sep,x0_New_Visitor,x0_Other,x0_Returning_Visitor
7126,2,2,6,1,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3617,2,2,3,13,False,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4491,2,5,9,4,False,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3824,2,4,1,2,False,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
10164,1,1,3,2,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7942,3,2,2,2,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1552,1,1,1,2,False,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
8109,2,2,3,2,False,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1149,2,5,1,1,False,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


Array of F-Statistic that present for the affect estimation between each numerical features with the target

In [None]:
f_statistics, p_values = f_regression(train_X_df.drop(cate_cols, axis=1), train_y_sr)
for col, value in zip(nume_cols, f_statistics):
    print(f'{col}: {value.round(5)}')

Administrative: 196.57564
Administrative_Duration: 122.78163
Informational: 69.7794
Informational_Duration: 32.58059
ProductRelated: 204.59107
ProductRelated_Duration: 219.82964
BounceRates: 106.09255
ExitRates: 210.53115
PageValues: 1689.54323
SpecialDay: 28.3438


Array of estimate mutual information of categorical features for a target variable
$$Estimate = \sum \limits_{i=1}^{n} \frac{(\chi_{i}-mean(\chi))*(\gamma-mean(\gamma))}{std(\chi_{i})*std(\gamma)}$$

In [None]:
estimate = mutual_info_classif(cate_ohc_df, train_y_sr, discrete_features='auto', copy=True, random_state=1)
cate_colums, new_estimate = onehotencoder.retransform_with_values(cate_ohc_df.columns, estimate)
for col, value in zip(cate_colums, new_estimate):
    print(f'{col}: {value.round(5)}')

OperatingSystems: 0.00132
Browser: 0.0
Region: 0.0
TrafficType: 0.01262
Weekend: 0.0
x0: 0.0029


In [None]:
train_accs = []
val_accs = []
alphas = [0,0.01,0.1, 1, 10, 100]
k_s = [3, 5, 7, 9, 15, 17]
best_val_acc = 0
best_alpha = None
best_k = None

full_pipeline = make_pipeline(
    preprocess_pipeline,
    SelectKBest(chi2),
    MLPClassifier(hidden_layer_sizes=(50), activation='relu', solver='adam', random_state=0, learning_rate_init=0.05, max_iter=100)
)
full_pipeline

In [None]:
# full_pipeline.set_params(selectkbest__k=15)
# full_pipeline.fit(train_X_df, train_y_sr)
# train_score, val_score = full_pipeline.score(train_X_df, train_y_sr)*100, full_pipeline.score(val_X_df, val_y_sr)*100
for alpha in alphas:
    for k in k_s:
        full_pipeline.set_params(selectkbest__k=k, mlpclassifier__alpha=alpha)
        full_pipeline.fit(train_X_df, train_y_sr)
        train_score, val_score = full_pipeline.score(train_X_df, train_y_sr)*100, full_pipeline.score(val_X_df, val_y_sr)*100
        train_accs.append(train_score)
        val_accs.append(val_score)
        if val_score>best_val_acc:
            best_val_acc, best_alpha, best_k = val_score, alpha, k

In [None]:
# Visualizing the result
train_accs_df = pd.DataFrame(data=np.array(train_accs).reshape(len(alphas), -1),
                             index=alphas, columns=k_s)
val_accs_df = pd.DataFrame(
    data=np.array(val_accs).reshape(len(alphas), -1), 
    index=alphas, columns=k_s)
min_err = min(min(train_accs), min(val_accs))
max_err = max(max(train_accs), max(val_accs))
plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
sns.heatmap(train_accs_df, vmin=min_err, vmax=max_err, square=True, 
            annot=True, cbar=False, fmt='.2f', cmap='Reds')
plt.title('train accuracies'); plt.xlabel('value of k'); plt.ylabel('alpha')
plt.subplot(1, 2, 2)
sns.heatmap(val_accs_df, vmin=min_err, vmax=max_err, square=True, 
            annot=True, cbar=False, fmt='.2f', cmap='Reds')
plt.title('validation accuracies'); plt.xlabel('value of k'); plt.ylabel('alpha');

# References
[1] https://machinelearningmastery.com/how-to-use-statistics-to-identify-outliers-in-data/