**Data Pre-Processing Steps**

In [None]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.discovery import all_estimators

# supress warnings
import warnings
warnings.filterwarnings('ignore')

In [11]:
# load dataset from online_shoppers_intention.csv
shoppers_df = pd.read_csv('Dataset 3 (Online Shoppers Intention)/online_shoppers_intention.csv')

In [12]:
# view first 5 rows of the dataframe
shoppers_df.head(5)

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0.0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1.0,1,1,1,Returning_Visitor,False,False
1,0,0.0,0.0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2.0,2,1,2,Returning_Visitor,False,False
2,0,0.0,0.0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4.0,1,9,3,Returning_Visitor,False,False
3,0,0.0,0.0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3.0,2,2,4,Returning_Visitor,False,False
4,0,0.0,0.0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3.0,3,1,4,Returning_Visitor,True,False


In [None]:
# handle all columns by correctly setting their data types
shoppers_df['Administrative'].astype(int)
shoppers_df['Administrative_Duration'].astype(float)
# this should have been int but csv has float values (all values are whole numbers), so for missing values, make them 0 first
shoppers_df['Informational'].fillna(0, inplace=True)
shoppers_df['Informational'].astype(int)
shoppers_df['Informational_Duration'].astype(float)
shoppers_df['ProductRelated'].astype(int)
shoppers_df['ProductRelated_Duration'].astype(float)
shoppers_df['BounceRates'].astype(float)
shoppers_df['ExitRates'].astype(float)
# same here, should be int but csv has float values (all values are whole numbers), so for missing values, make them 0 first
shoppers_df['PageValues'].fillna(0, inplace=True)
shoppers_df['PageValues'].astype(int)
shoppers_df['SpecialDay'].astype(float)
# cast Month categorical type to ints 1-12
month_map = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
                 'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}
shoppers_df['Month'] = shoppers_df['Month'].map(month_map)
# same here, should be int but csv has float values (all values are whole numbers), so for missing values, make them 0 first
shoppers_df['OperatingSystems'].fillna(0, inplace=True)
shoppers_df['OperatingSystems'].astype(int)
shoppers_df['Browser'].astype(int)
shoppers_df['Region'].astype(int)
shoppers_df['TrafficType'].astype(int)
# cast visitortype to int (0 = New_Visitor, 1 = Returning_Visitor, 2 = Other)
visitor_map = {'New_Visitor': 0, 'Returning_Visitor': 1, 'Other': 2}
shoppers_df['VisitorType'] = shoppers_df['VisitorType'].map(visitor_map)
shoppers_df['Weekend'] = shoppers_df['Weekend'].astype(bool)
shoppers_df['Revenue'] = shoppers_df['Revenue'].astype(bool)
shoppers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           12330 non-null  int64  
 1   Administrative_Duration  12330 non-null  float64
 2   Informational            12330 non-null  float64
 3   Informational_Duration   12330 non-null  float64
 4   ProductRelated           12330 non-null  int64  
 5   ProductRelated_Duration  12330 non-null  float64
 6   BounceRates              12330 non-null  float64
 7   ExitRates                12330 non-null  float64
 8   PageValues               12330 non-null  float64
 9   SpecialDay               12330 non-null  float64
 10  Month                    12042 non-null  float64
 11  OperatingSystems         12330 non-null  float64
 12  Browser                  12330 non-null  int64  
 13  Region                   12330 non-null  int64  
 14  TrafficType           

In [14]:
# count missing values in each column
shoppers_df.isnull().sum()

Administrative               0
Administrative_Duration      0
Informational                0
Informational_Duration       0
ProductRelated               0
ProductRelated_Duration      0
BounceRates                  0
ExitRates                    0
PageValues                   0
SpecialDay                   0
Month                      288
OperatingSystems             0
Browser                      0
Region                       0
TrafficType                  0
VisitorType                  0
Weekend                      0
Revenue                      0
dtype: int64

**Dataset comes with no missing values so no need for imputation or row/column drops.**

In [21]:
input = shoppers_df.drop('Revenue', axis=1)
target = shoppers_df['Revenue']

input_train, input_test, target_train, target_test = train_test_split(input, target, test_size=0.2, random_state=42)

classifier = RandomForestClassifier(n_estimators=100, random_state=42)

classifier.fit(input_train, target_train)

accuracy = classifier.score(input_test, target_test)
print(f"Mean accuracy: {accuracy}")

Mean accuracy: 0.8913219789132197
