In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


import warnings
import sklearn

import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

import seaborn as sns

#to print all output from the cell, not just the last line
#from IPython.core.interactiveshell import InteractiveShell


from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline


from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.compose import ColumnTransformer


from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn import tree
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc

from imblearn.over_sampling import SMOTE

#InteractiveShell.ast_node_interactivity = "all"

#toggle visualizations because they slow things down
visualizations = False

In [2]:
df = pd.read_csv('data/online_shoppers_intention.csv')

### Business Problem

This classification project examines a dataset of online shopping sessions to predict whether or not the session ended in a purchase. The imagined audience is a web development firm looking to improve the ecommerce functionalities they offer their clients. They are hoping to develop new ecommerce tools and functionalities for their platform. Development of this classification model is the first exploratory step to understand the needs of their clients. This model might pave the way for development and A/B testing of new ecommerce features. 

This firm would like to demonstrate their return on their clients' investment by demonstrating how they might help their clients convert traffic into sales. Those clients might be small(ish) business owners, who are hoping to optimize their advertising strategies, learn what types of customers to target, and drive overall sales.

### Data Understanding

For this project, I used a dataset titled "Online Shoppers Purchasing Intention" from the UCI Machine Learning Repository. The data was used in a 2018 article on machine learning models' potential for predicting user behavior in ecommerce sites, and was donated to the repository in 2018 

After spending some time with the data, I informally organized the 18 features into several categories:

<b> Type of pages visited and session duration </b> <br>
These features include the number of Administrative, Informational, and Product-Related pages visited in the session, and the total duration spent in each of those categories in seconds.

<b> Qualities of pages visited </b> <br>
This section includes Google analytics data for the pages visited in this session, aggregated by mean. Includes Bounce Rates (frequency of sessions that enter and exit on a given page), Exit Rates (frequency of sessions that end on this page), and Page Values (rate at which the given page leads to a purchase) for each page.

<b> Season session occurred </b> <br>
These two features indicate which month the session occurred, and the session's proximity to one of two designated 'Special Day's, or days expected to yield a high number of gift purchases (Valentine's Day and Mother's Day). 

<b> User information </b> <br>
Finally, there was a handful of data about the user collected for each session: the operating system and browser type used to access the site, and the region where the user is located. Additionally, we know how the user came to the site (Traffic Type) and if they had previously visited the site (Visitor Type). Whether the session occurred on a weekend is captured in Weekend.

In [3]:
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


INTERESTING DESCRIPTIVE STATISTICS

In [4]:
for each in df.columns:
    print(each + ":")
    print(df[each].describe(), end='\n\n')

Administrative:
count    12330.000000
mean         2.315166
std          3.321784
min          0.000000
25%          0.000000
50%          1.000000
75%          4.000000
max         27.000000
Name: Administrative, dtype: float64

Administrative_Duration:
count    12330.000000
mean        80.818611
std        176.779107
min          0.000000
25%          0.000000
50%          7.500000
75%         93.256250
max       3398.750000
Name: Administrative_Duration, dtype: float64

Informational:
count    12330.000000
mean         0.503569
std          1.270156
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max         24.000000
Name: Informational, dtype: float64

Informational_Duration:
count    12330.000000
mean        34.472398
std        140.749294
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max       2549.375000
Name: Informational_Duration, dtype: float64

ProductRelated:
count    12330.000000
mean      

In [5]:
#create lists of categorical/continuous features for visualizations and piplining
categorical_feats = df.columns.tolist()[12:-1]
numerical_feats = df.columns.tolist()[:12]
#some housekeeping to keep the months in order in visualizations

months = ["Feb", "Mar", "May", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
df['Month'] = pd.Categorical(df['Month'], categories=months, ordered=True)

In [6]:
# for each in categorical_feats:
#     fig = px.histogram(df, x=each, color='Revenue', )
#     fig.show()

In [7]:
#to simplify visualizations
df["Invert_Revenue"] = (df['Revenue']==0).astype(int)

# for each in categorical_feats:
#     categories = df[each].unique().tolist()
#     categories.sort    
#     sales = df.groupby(each)['Revenue'].sum().tolist()
#     passes = df.groupby(each)['Invert_Revenue'].sum().tolist()

#     fig = go.Figure()
#     fig.add_bar(x=categories, y=passes, name='No Sale').add_bar(x=categories, y=sales, name='Sale').update_layout(title_text = each, barmode='stack')


In [8]:
# for each in categorical_feats:
#     categories = df[each].unique().tolist()
#     categories.sort()

#     total = df.groupby(each)['Revenue'].count().tolist()
#     sales = df.groupby(each)['Revenue'].sum().tolist()
#     passes = df.groupby(each)['Invert_Revenue'].sum().tolist()
#     sale_percent = []
#     pass_percent = []
#     for idx, category in enumerate(categories):
#         sale_percent.append(sales[idx] / total[idx] *100)
#         pass_percent.append(passes[idx]/total[idx] * 100)

#     fig = go.Figure()
#     fig.add_bar(x=categories, y=pass_percent, name='No Sale').add_bar(x=categories, y=sale_percent, name='Sale').update_layout(title_text = each, barmode='stack')

In [9]:
# for each in numerical_feats:
#     fig = px.scatter(df, x=each, y='Revenue')
#     fig.show()

In [10]:
#  for each in numerical_feats:
#      fig = px.scatter(df, x=each, y='Total_Duration', color='Revenue')
#      fig.show()

I added the total page views and the total time spent on the site as additional features. I expected that these features would correlate strongly with Producted Related counts and duration, but thought that it was useful to capture how the user interacted with the overall site during their session. 

In [19]:
#create new features by summing existing features
df['Total_Pages'] = df.apply(lambda row: row.Administrative + row.Informational + row.ProductRelated, axis=1)
df['Total_Duration'] = df.apply(lambda row: row.Administrative_Duration + row.Informational_Duration + row.ProductRelated_Duration, axis=1)

#move the new columns to a place in the dataset where I prefer them
cols = df.columns.tolist()
new_cols = cols[:6] + cols[-2:] + cols[6:-2]

df = df[new_cols]

I considered the possibility that the site the data represented was new, or had been newly linked to Google Analytics for the purposes of the project. I wanted to see if the Google Analytics features would change over time as more users interacted with the site. After printing out the following visualizations, it seems that the distribution of data over time parallels that of other features plotted against Month. 

In [20]:
if visualizations:    
    fig = make_subplots(rows=3, cols=1)

    fig.append_trace(go.Scatter(
        x = df["Month"], y=df["BounceRates"],
        name='Bounce Rates',
        mode='markers',
        marker_color = '#22577a'
    ), row=1, col=1)

    fig.append_trace(go.Scatter(
        x = df["Month"], y=df["ExitRates"],
        name='Exit Rates',
        mode='markers',
        marker_color='#38a3a5'
    ), row=2, col=1)

    fig.add_trace(go.Scatter(
        x = df["Month"], y=df["PageValues"],
        name='Page Values',
        mode='markers',
        marker_color='#57cc99'
    ), row=3, col=1)

    fig.update_layout(hovermode=False)
    #this line of code took me 45 minutes rip
    fig.update_xaxes(categoryorder = 'array', categoryarray=np.array(months))

After reading the scant literature on the conceptualization of the 'Special Day' feature, I wanted to explore it further. Is there a 'Special Day' for each month? Which days are 'Special'? Do they correlate with sales peaks?

My investigation was short, as it turns out there were only two 'Special Days', and they were those named in the study: Valentine's Day and Mother's Day

In [13]:
df.groupby('Month')['SpecialDay'].mean()

Month
Feb    0.233696
Mar    0.000000
May    0.212366
Jul    0.000000
Aug    0.000000
Sep    0.000000
Oct    0.000000
Nov    0.000000
Dec    0.000000
Name: SpecialDay, dtype: float64

As attentive readers may have noticed, this dataset is missing several months' worth of data! We have no data for January, April, or June. In addition to having incomplete monthly data, this also renders the overall counts from month-to-month suspect, ad we're not sure if certain months are lower count due to lower traffic, or due to errors in data collection. 

### Data Preparation

Several of my categorical variables has categories with counts so low that when the data was split into training and test data, some categories wouldn't appear in the test data. For consistency across the training and test data, I grouped low-count categories into a single 'other' (coded as '99', since the categories were all integers). 

In [21]:
#columns with low-count categories
sm_outliers = ['OperatingSystems', 'Browser', 'TrafficType']

#this loop codes any low-count category as a single replacement value. I chose 99
for each in sm_outliers:
    #code snippet below from stackoverflow
    series = pd.value_counts(df[each])
    mask = (series/series.sum() * 100).lt(1)
    # To replace df['column'] use np.where I.e 
    df[each] = np.where(df[each].isin(series[mask].index),99,df[each])
    #print(df[each].value_counts())

In [22]:
#ensure that target variable is typed correctly
df['Revenue'] = df['Revenue'].astype('int32')

#cast other binary categorical variable as the correct type
df['Weekend'] = df['Weekend'].astype(int)

#remove Invert Revenue column, otherwise it helps the models
if visualizations:
    df.drop('Invert_Revenue', axis=1, inplace=True )

Here I did the test-train split, separating out 20% of my data to test my models for overfitting.

In [16]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('Revenue', axis=1), df['Revenue'], test_size=0.2, random_state=270)

Although I initially wanted to use a Pipeline for my modeling, I found that I needed to use the SMOTE sampling strategy to balance my data. The SMOTE tool doesn't work into a sklearn Pipeline, and the imblearn Pipeline doesn't work with sklearn models. For that reason, I created dummy variables and scaled the data by hand, and saved those values in separate variables from those I entered into the Pipeline.

In [18]:
#create dummies for categorical features
processed_train = pd.get_dummies(X_train, columns=categorical_feats, drop_first = True)
processed_test = pd.get_dummies(X_test, columns=categorical_feats, drop_first = True)

#save the columns for later
processed_columns = processed_train.columns.to_list()

#fit the scaler on the training data
x_scale = StandardScaler().fit(processed_train)

#create scaled X-values
processed_train = x_scale.transform(processed_train)
processed_test = x_scale.fit_transform(processed_test)


#I'm still not sure if I should be scaling my targets or not?
#Most of the models didn't like having scaled models
#if I need it in a pinch, it's here.

#processed_y_train = y_train.to_numpy().reshape(-1,1)
#y_scale = StandardScaler().fit(processed_y_train)
#processed_y_train = y_scale.transform(processed_y_train)
#processed_y_test = y_test.to_numpy().reshape(-1,1)
#processed_y_test = y_scale.transform(processed_y_test)

ValueError: could not convert string to float: 'Nov'

I did have some occasion to use the sklearn Pipeline, so I constructed one below. The Pipeline accepts separate categorical and numerical data for preprocessing, then joins the two in the ColumnTransformer section of the pipe, which will be invoked alongside the given model. 

<i>I didn't end up using this Pipeline as much as I anticipated given the above complication with SMOTE. </i>

In [None]:
#numerical pipe, scales data
num_pipe = Pipeline(steps=[
    ('scaler', StandardScaler())
])

#categorical pipe, one-hot-encodes and scales data
cat_pipe = Pipeline(steps=[
    ('ohe', OneHotEncoder(drop='first')),
    ('scaler', StandardScaler())
])

#combines numerical and categorical data
transformer = ColumnTransformer(transformers=[
    ('num', num_pipe, numerical_feats),
    ('cat', cat_pipe, categorical_feats)
])

Here I used SMOTE to create synthetic data points with a positive target (sales) to balance out the negative values (sessions without a sale). Because the class imbalance in my model was pretty dramatic, SMOTE proved to be a really crucial tool. The difference between balanced and unbalanced data will be apparent in the modeling section.   

In [None]:
#use SMOTE to create synthetic columns so that there are an equal number of positive/negative values
smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_sample(processed_train, y_train)

### Modeling

#### Logistic Regression

In [None]:
lr_pipe = Pipeline(steps=[
    ('ColumnTransformer', transformer),
    ('Classifier', LogisticRegression(random_state=270, solver='lbfgs', max_iter=250))
])

lr_pipe.fit(X_train, y_train)

print(lr_pipe.score(X_test, y_test))

In [None]:
y_hat_test = lr_pipe.predict(X_test)
y_hat_train = lr_pipe.predict(X_train)
residuals = np.abs(y_train - y_hat_train)
print('Train residuals:')
print(pd.Series(residuals).value_counts())
print(pd.Series(residuals).value_counts(normalize=True))

In [None]:
residuals = np.abs(y_test - y_hat_test)
print('Test residuals:')
print(pd.Series(residuals).value_counts())
print(pd.Series(residuals).value_counts(normalize=True))

In [None]:
cf_train = confusion_matrix(y_train, y_hat_train)
ax = sns.heatmap(cf_train/np.sum(cf_train), annot=True, fmt='.2%', cmap = 'Blues')
ax.set(xlabel='True', ylabel='Predicted')

In [None]:
cf_test = confusion_matrix(y_test, y_hat_test)
ax = sns.heatmap(cf_test/np.sum(cf_test), annot=True, fmt='.2%', cmap = 'Blues')
ax.set(xlabel='True', ylabel='Predicted')

In [None]:
target_names = ['No Sale', 'Sale']
print(classification_report(y_train, y_hat_train, target_names=target_names))

In [None]:
lr_pipe = Pipeline(steps=[
    ('ColumnTransformer', transformer),
    ('Classifier', LogisticRegression(class_weight='balanced', random_state=270, solver='lbfgs', max_iter=250))
])

lr_pipe.fit(X_train, y_train)

print(lr_pipe.score(X_test, y_test))

In [None]:
y_hat_train = lr_pipe.predict(X_train)
cf_train = confusion_matrix(y_train, y_hat_train)
ax = sns.heatmap(cf_train/np.sum(cf_train), annot=True, fmt='.2%', cmap = 'Blues')
ax.set(xlabel='True', ylabel='Predicted')

In [None]:
manual_lr = LogisticRegression(random_state=270, solver='lbfgs', max_iter=250)
manual_lr.fit(X_train_resampled, y_train_resampled)
y_hat_train = manual_lr.predict(processed_train)
y_hat_test = manual_lr.predict(processed_test)
y_score = manual_lr.decision_function(processed_test)


In [None]:
y_score = lr_pipe.fit(X_train, y_train).decision_function(X_test)

fpr, tpr, thresholds = roc_curve(y_test, y_score)

print('AUC: {}'.format(auc(fpr, tpr)))

MAKE THIS PRETTIER

In [None]:

ax = sns.lineplot(x=fpr, y=tpr, color = 'red')
ax = sns.lineplot(x=[0,1], y=[0,1], color='navy')

Would prefer a model that predicted sales better

#### Decision Tree

In [None]:
dt_pipe = Pipeline(steps=[
    ('ColumnTransformer', transformer),
    ('Classifier', tree.DecisionTreeClassifier(criterion='entropy'))
])

dt_pipe.fit(X_train, y_train)


In [None]:
dtc = tree.DecisionTreeClassifier(criterion='entropy')

dtc.fit(processed_train, y_train.to_numpy().reshape(-1,1))

tree.plot_tree(dtc,
             feature_names = processed_columns,
             class_names = np.unique(y_train).astype('str'),
             filled=True)

In [None]:
y_hat_train = dtc.predict(processed_train)
y_hat_test = dtc.predict(processed_test)

print(classification_report(y_train, y_hat_train, target_names=target_names))
print(classification_report(y_test, y_hat_test, target_names=target_names))

In [None]:
dtc = tree.DecisionTreeClassifier(criterion='entropy', max_depth = 5)

dtc.fit(X_train_resampled, y_train_resampled)

tree.plot_tree(dtc,
             feature_names = processed_columns,
             class_names = np.unique(y_train).astype('str'),
             filled=True)

In [None]:
y_hat_train = dtc.predict(X_train_resampled)
y_hat_test = dtc.predict(processed_test)

print(classification_report(y_train_resampled, y_hat_train, target_names=target_names))
print(classification_report(y_test, y_hat_test, target_names=target_names))

In [None]:
dtc_search = tree.DecisionTreeClassifier()

param_grid = {
    'criterion' : ['gini', 'entropy'],
    'max_depth' : [2, 5, 7, 10],
    'min_samples_split' : [2, 5, 10, 20]
}

gs_tree = GridSearchCV(dtc_search, param_grid, cv=3)
gs_tree.fit(X_train_resampled, y_train_resampled)

In [None]:
print(gs_tree.best_params_)

y_hat_train = gs_tree.predict(processed_train)
y_hat_test = gs_tree.predict(processed_test)

print(classification_report(y_train, y_hat_train, target_names=target_names))
print(classification_report(y_test, y_hat_test, target_names=target_names))

#### Random Forest

### Evaluation