## Setup

In [1]:
# import libraries
import os
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

# display settings
# show up to 999 columns when a DataFrame is rendered
pd.options.display.max_columns = 999
# render matplotlib figures inline in the notebook
%matplotlib inline
# load the nb_black extension; it has to be installed in the kernel's environment
%load_ext nb_black

# for reproducibility: seed NumPy's global random number generator
np.random.seed(42)

# project paths
# the project root is taken to be the parent of the current working directory
# NOTE(review): assumes the notebook is launched from a direct sub-folder of
# the project root — confirm
project_root_dir = os.path.normpath(os.getcwd() + os.sep + os.pardir)

# data directory under the project root; created if it does not exist yet
data_path = os.path.join(project_root_dir, "data")
os.makedirs(data_path, exist_ok=True)

# function for reading data
def read_data(filename, date_cols=None, file_path=data_path):
    """Read a CSV file into a DataFrame.

    Parameters
    ----------
    filename : str
        Name of the CSV file, relative to ``file_path``.
    date_cols : optional
        Forwarded unchanged to ``pandas.read_csv`` as ``parse_dates``.
    file_path : str
        Directory that contains the file; defaults to ``data_path``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    return pd.read_csv(os.path.join(file_path, filename), parse_dates=date_cols)

<IPython.core.display.Javascript object>

In [2]:
# Load the three competition files from the data directory
train, test, submission = (
    read_data(csv_name)
    for csv_name in ("Train.csv", "Test.csv", "SampleSubmission.csv")
)

<IPython.core.display.Javascript object>

In [3]:
# Preview the first rows of the raw training data
train.head()

Unnamed: 0,id,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,revenue
0,1,0,0.0,0,0.0,5,81.083333,0.04,0.05,0.0,0.0,Dec,3,2,1,2,New_Visitor,False,0
1,2,0,0.0,0,0.0,3,189.0,0.0,0.066667,0.0,0.0,Mar,3,2,8,1,Returning_Visitor,False,0
2,3,0,0.0,1,132.0,8,445.0,0.0,0.014286,0.0,0.0,Mar,3,2,4,14,Returning_Visitor,True,0
3,4,0,0.0,0,0.0,3,0.0,0.2,0.2,0.0,0.0,Mar,1,8,2,1,Returning_Visitor,False,0
4,5,0,0.0,0,0.0,4,14.0,0.1,0.15,0.0,0.0,Mar,3,2,1,1,Returning_Visitor,False,0


<IPython.core.display.Javascript object>

## Data Preprocessing Pipelines

In [4]:
# Rename the target column; a no-op if the rename has already been applied
train.rename(columns={"revenue": "target"}, inplace=True)

# Map the boolean Weekend flag to "Yes"/"No" labels in both frames.
# Values that are not keys of the mapping (missing values, or labels produced
# by an earlier run of this cell) are passed through unchanged, so re-running
# the cell does not turn the whole column into NaN as a plain dict `.map` would.
weekend_labels = {True: "Yes", False: "No"}
for frame in (train, test):
    frame["Weekend"] = frame["Weekend"].map(
        lambda flag: weekend_labels.get(flag, flag)
    )

<IPython.core.display.Javascript object>

In [5]:
# Number of distinct ids; compare with the row count from train.shape to
# check that every row has its own id
train["id"].nunique()

8631

<IPython.core.display.Javascript object>

In [6]:
# (rows, columns) of the training frame
train.shape

(8631, 19)

<IPython.core.display.Javascript object>

In [7]:
# Use the `id` column as the row index of both frames.
# The guard keeps the cell re-runnable: once `id` is the index it is no longer
# a column, and calling set_index("id") again would raise a KeyError.
for frame in (train, test):
    if frame.index.name != "id":
        frame.set_index("id", inplace=True)

<IPython.core.display.Javascript object>

In [8]:
# Confirm that `id` is now the index and the target column is named `target`
train.head()

Unnamed: 0_level_0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,0,0.0,0,0.0,5,81.083333,0.04,0.05,0.0,0.0,Dec,3,2,1,2,New_Visitor,No,0
2,0,0.0,0,0.0,3,189.0,0.0,0.066667,0.0,0.0,Mar,3,2,8,1,Returning_Visitor,No,0
3,0,0.0,1,132.0,8,445.0,0.0,0.014286,0.0,0.0,Mar,3,2,4,14,Returning_Visitor,Yes,0
4,0,0.0,0,0.0,3,0.0,0.2,0.2,0.0,0.0,Mar,1,8,2,1,Returning_Visitor,No,0
5,0,0.0,0,0.0,4,14.0,0.1,0.15,0.0,0.0,Mar,3,2,1,1,Returning_Visitor,No,0


<IPython.core.display.Javascript object>

In [9]:
# Separate the features from the target
y_train = train["target"].copy()
X_train = train.drop(columns=["target"]).copy()
X_test = test.copy()

<IPython.core.display.Javascript object>

In [10]:
# create preprocessing pipeline
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Split the feature columns by dtype: object columns are treated as
# categorical, everything else as numerical
cat_cols = list(X_train.select_dtypes(include=["object"]).columns)
num_cols = list(X_train.select_dtypes(exclude=["object"]).columns)

# Numerical features: mean-impute missing values, then standardise
num_pipe = make_pipeline(
    SimpleImputer(strategy="mean"),
    StandardScaler(),
)

# Categorical features: fill missing values with the literal "NA", then one-hot
# encode; categories unseen during fit are encoded as all zeros
cat_pipe = make_pipeline(
    SimpleImputer(fill_value="NA", strategy="constant"),
    OneHotEncoder(sparse=False, handle_unknown="ignore"),
)

# Full preprocessing step: route each column group through its own pipeline
full_pipe = ColumnTransformer(
    transformers=[
        ("num", num_pipe, num_cols),
        ("cat", cat_pipe, cat_cols),
    ]
)
full_pipe

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer()),
                                                 ('standardscaler',
                                                  StandardScaler())]),
                                 ['Administrative', 'Administrative_Duration',
                                  'Informational', 'Informational_Duration',
                                  'ProductRelated', 'ProductRelated_Duration',
                                  'BounceRates', 'ExitRates', 'PageValues',
                                  'SpecialDay', 'OperatingSystems', 'Browser',
                                  'Region', 'TrafficType']),
                                ('cat',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(fill_value='NA',
                                               

<IPython.core.display.Javascript object>

In [11]:
# fit the preprocessing on the training features only, then apply the fitted
# transformation to both sets so the test data does not influence the
# imputation values, scaling statistics or learned categories
X_train_tr = full_pipe.fit_transform(X_train)
X_test_tr = full_pipe.transform(X_test)

<IPython.core.display.Javascript object>

In [12]:
# Default feature names produced by the fitted one-hot encoder
# NOTE(review): get_feature_names() is no longer available in newer
# scikit-learn releases (replaced by get_feature_names_out()) — confirm
# against the version this notebook is pinned to
full_pipe.named_transformers_["cat"].named_steps["onehotencoder"].get_feature_names()

array(['x0_Aug', 'x0_Dec', 'x0_Feb', 'x0_Jul', 'x0_June', 'x0_Mar',
       'x0_May', 'x0_Nov', 'x0_Oct', 'x0_Sep', 'x1_New_Visitor',
       'x1_Other', 'x1_Returning_Visitor', 'x2_No', 'x2_Yes'],
      dtype=object)

<IPython.core.display.Javascript object>

In [13]:
# Category values learned by the encoder, one array per categorical column
full_pipe.named_transformers_["cat"].named_steps["onehotencoder"].categories_

[array(['Aug', 'Dec', 'Feb', 'Jul', 'June', 'Mar', 'May', 'Nov', 'Oct',
        'Sep'], dtype=object),
 array(['New_Visitor', 'Other', 'Returning_Visitor'], dtype=object),
 array(['No', 'Yes'], dtype=object)]

<IPython.core.display.Javascript object>

In [14]:
# Category values learned by the fitted one-hot encoder, one array per
# categorical column
ohe_categories = (
    full_pipe.named_transformers_.cat.named_steps.onehotencoder.categories_
)

<IPython.core.display.Javascript object>

In [15]:
# Readable one-hot feature names of the form "<column>__<category>"
new_ohe_features = [
    f"{column}__{category}"
    for column, categories in zip(cat_cols, ohe_categories)
    for category in categories
]
new_ohe_features

['Month__Aug',
 'Month__Dec',
 'Month__Feb',
 'Month__Jul',
 'Month__June',
 'Month__Mar',
 'Month__May',
 'Month__Nov',
 'Month__Oct',
 'Month__Sep',
 'VisitorType__New_Visitor',
 'VisitorType__Other',
 'VisitorType__Returning_Visitor',
 'Weekend__No',
 'Weekend__Yes']

<IPython.core.display.Javascript object>

In [16]:
# Full list of output feature names: numerical columns first, then the
# one-hot columns, matching the order of the transformers in full_pipe
all_features = [*num_cols, *new_ohe_features]

# Wrap the transformed arrays in DataFrames with named columns
X_train_tr = pd.DataFrame(data=X_train_tr, columns=all_features)
X_test_tr = pd.DataFrame(data=X_test_tr, columns=all_features)


<IPython.core.display.Javascript object>

In [17]:
# Preview the transformed training features with their readable column names
X_train_tr.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,OperatingSystems,Browser,Region,TrafficType,Month__Aug,Month__Dec,Month__Feb,Month__Jul,Month__June,Month__Mar,Month__May,Month__Nov,Month__Oct,Month__Sep,VisitorType__New_Visitor,VisitorType__Other,VisitorType__Returning_Visitor,Weekend__No,Weekend__Yes
0,-0.699615,-0.465158,-0.397205,-0.252976,-0.601617,-0.609623,0.391685,0.159557,-0.314838,-0.3047,0.978651,-0.20801,-0.892301,-0.515315,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,-0.699615,-0.465158,-0.397205,-0.252976,-0.646686,-0.5506,-0.454537,0.509468,-0.314838,-0.3047,0.978651,-0.20801,2.013749,-0.763872,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,-0.699615,-0.465158,0.383546,0.720154,-0.534014,-0.410585,-0.454537,-0.590251,-0.314838,-0.3047,0.978651,-0.20801,0.353149,2.467373,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
3,-0.699615,-0.465158,-0.397205,-0.252976,-0.646686,-0.65397,3.776574,3.308752,-0.314838,-0.3047,-1.237178,3.285991,-0.477151,-0.763872,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
4,-0.699615,-0.465158,-0.397205,-0.252976,-0.624152,-0.646313,1.661019,2.25902,-0.314838,-0.3047,0.978651,-0.20801,-0.892301,-0.763872,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


<IPython.core.display.Javascript object>

# ML Models

## Logistic Regression

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

<IPython.core.display.Javascript object>

In [22]:
# Logistic regression on top of the preprocessing step, scored with 5-fold
# cross-validated macro F1
log_reg = make_pipeline(
    full_pipe,
    LogisticRegression(max_iter=1000, random_state=42),
)
scores = cross_val_score(
    estimator=log_reg, X=X_train, y=y_train, scoring="f1_macro", cv=5
)
print("Scores:", scores)
print("Average score:", scores.mean())

Scores: [0.68932791 0.72167978 0.74813425 0.72716555 0.72006225]
Average score: 0.7212739488959479


<IPython.core.display.Javascript object>

## Support Vector Machines

In [30]:
from sklearn.svm import SVC

<IPython.core.display.Javascript object>

In [31]:
# Support vector classifier on top of the preprocessing step, scored with
# 5-fold cross-validated macro F1
svm_clf = make_pipeline(
    full_pipe,
    SVC(random_state=42),
)
scores = cross_val_score(
    estimator=svm_clf, X=X_train, y=y_train, scoring="f1_macro", cv=5
)
print("Scores:", scores)
print("Average score:", scores.mean())

Scores: [0.74459026 0.75886837 0.77945311 0.74998016 0.76607948]
Average score: 0.759794276992569


<IPython.core.display.Javascript object>

## Decision Tree

In [32]:
from sklearn.tree import DecisionTreeClassifier

<IPython.core.display.Javascript object>

In [33]:
# Decision tree on top of the preprocessing step, scored with 5-fold
# cross-validated macro F1
tree_clf = make_pipeline(
    full_pipe,
    DecisionTreeClassifier(random_state=42),
)
scores = cross_val_score(
    estimator=tree_clf, X=X_train, y=y_train, scoring="f1_macro", cv=5
)
print("Scores:", scores)
print("Average score:", scores.mean())

Scores: [0.73785542 0.73626786 0.74025352 0.72277219 0.74545276]
Average score: 0.7365203492625806


<IPython.core.display.Javascript object>

## Random Forest 

In [34]:
from sklearn.ensemble import RandomForestClassifier

<IPython.core.display.Javascript object>

In [35]:
# Random forest on top of the preprocessing step, scored with 5-fold
# cross-validated macro F1
rf_clf = make_pipeline(
    full_pipe,
    RandomForestClassifier(random_state=42),
)
scores = cross_val_score(
    estimator=rf_clf, X=X_train, y=y_train, scoring="f1_macro", cv=5
)
print("Scores:", scores)
print("Average score:", scores.mean())

Scores: [0.78011204 0.81484353 0.80738757 0.77836793 0.79697307]
Average score: 0.7955368296101


<IPython.core.display.Javascript object>

## XGBoost

In [36]:
from xgboost import XGBClassifier

<IPython.core.display.Javascript object>

In [39]:
# XGBoost classifier on top of the preprocessing step, scored with 5-fold
# cross-validated macro F1
xgb_model = XGBClassifier(
    use_label_encoder=False, eval_metric="logloss", random_state=42
)
xgb_clf = make_pipeline(full_pipe, xgb_model)
scores = cross_val_score(
    estimator=xgb_clf, X=X_train, y=y_train, scoring="f1_macro", cv=5
)
print("Scores:", scores)
print("Average score:", scores.mean())

Scores: [0.76772639 0.79936458 0.81573609 0.77942372 0.78402912]
Average score: 0.7892559826643669


<IPython.core.display.Javascript object>

## Submission

So far, the random forest model has performed best of all the models we tested, with the XGBoost model very close behind. Let's make a submission to see the results on the leaderboard, and improve these two models later.

In [40]:
# Inspect the sample submission frame (columns: id, revenue)
submission

Unnamed: 0,id,revenue
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0
...,...,...
3694,3695,0
3695,3696,0
3696,3697,0
3697,3698,0


<IPython.core.display.Javascript object>

In [44]:
# Refit the default random forest (with preprocessing) on the full training
# set and predict the test set
rf_clf = make_pipeline(full_pipe, RandomForestClassifier(random_state=42))
rf_clf.fit(X_train, y_train)
y_pred = rf_clf.predict(X_test)

# The predictions are written positionally, so the submission rows must list
# exactly the test ids in the same order; fail loudly instead of silently
# attaching predictions to the wrong ids.
if submission["id"].tolist() != X_test.index.tolist():
    raise ValueError("SampleSubmission ids do not match the Test ids row-for-row")
submission["revenue"] = y_pred

<IPython.core.display.Javascript object>

In [45]:
# function for saving data as csv file
def save_dataframe(df, filename, file_path=data_path):
    """Write a DataFrame to a CSV file, leaving out the index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to write.
    filename : str
        Name of the output file, relative to ``file_path``.
    file_path : str
        Target directory; defaults to ``data_path``.
    """
    df.to_csv(os.path.join(file_path, filename), index=False)

<IPython.core.display.Javascript object>

In [46]:
# Write the random forest predictions to rf_default_sub1.csv in the data directory
save_dataframe(submission, "rf_default_sub1.csv")

<IPython.core.display.Javascript object>

Using only the random forest model with default parameters, we are already in the `30th position`. The `f1 macro score` on the public leaderboard is `0.7776`.  