In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
from sklearn.feature_selection import SelectKBest, f_classif

pd.options.display.float_format = '{:20,.4f}'.format
pd.set_option('display.max_rows', 500)
%matplotlib inline

## 3. Related Work

### 3.1. Comparative Machine Learning Studies

### 3.2. Comparative Machine Learning Studies Focused on Customer Journey Prediction in E-Commerce

## 5. Experiments

### 5.3. Descriptive Statistics

In [None]:
# Per-file clickstream hit counts. Every file shares one path pattern, so the
# pattern is spelled out once and only the file tag varies.
CLICKSTREAM_HITS_PATH = '../results/descriptives/raw_and_cleaned_hits_clickstream_{}.pkl.gz'

# The individual names are kept because the next cell stacks them by name.
cs5, cs6, cs7, cs8, cs9, cs10 = (
    pd.read_pickle(CLICKSTREAM_HITS_PATH.format(tag))
    for tag in ('0516', '0616', '0716', '0816', '0916', '1016')
)

In [None]:
# Stack the per-file frames into one table.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and calling
# it in a loop re-copies the accumulated frame on every pass; pd.concat builds
# the result once. Row indices are kept as-is (no ignore_index), matching what
# the append loop produced.
cs = pd.concat([cs5, cs6, cs7, cs8, cs9, cs10])

In [None]:
# Sum of the `raw_hits` column over all stacked clickstream frames.
cs.loc[:, 'raw_hits'].sum()

In [None]:
# Sum of the `cleaned_hits` column over all stacked clickstream frames.
cs.loc[:, 'cleaned_hits'].sum()

In [None]:
# Descriptive-statistics frames, one pickle per size tag in the file name.
# The shared path pattern is written once; only the size tag varies.
DESCRIPTIVES_PATH = '../results/descriptives/descriptives_{}.pkl.gz'

# The individual names are kept because the next cell stacks them by name.
desc100k, desc200k, desc400k, desc800k, desc1600k = (
    pd.read_pickle(DESCRIPTIVES_PATH.format(size))
    for size in (100000, 200000, 400000, 800000, 1600000)
)

In [None]:
# Stack the descriptive frames into one table. pd.concat replaces the
# DataFrame.append loop (append was removed in pandas 2.0); row indices are
# kept unchanged, as append did.
desc = pd.concat([desc100k, desc200k, desc400k, desc800k, desc1600k])

# NOTE(review): the following cells read a global `df` that no visible cell
# assigns on purpose. The original `for df in [...]` loop happened to leave it
# bound to desc1600k, so that binding is reproduced explicitly here to keep
# downstream results unchanged. Confirm this is the frame the visit-level
# statistics are meant to use.
df = desc1600k

In [None]:
def _month_of(ts):
    """Return the calendar month of a timestamp."""
    return ts.month


def _iso_week_of(ts):
    """Return the ISO week number (second field of isocalendar())."""
    return ts.isocalendar()[1]


# Adds `month` and `week` columns to `df` in place, derived from the visit
# start time.
visit_start = df['visit_start_time_gmt']
df['month'] = visit_start.apply(_month_of)
df['week'] = visit_start.apply(_iso_week_of)

In [None]:
def _print_journey_stats(journeys, zero_delta_label, remaining_label):
    """Print spread statistics and visitor counts for a journey-length frame.

    Parameters
    ----------
    journeys : DataFrame with a `visitor_id` column and a timedelta column
        `delta_first_last_visit`.
    zero_delta_label : caption printed in front of the number of visitors
        whose first and last visit start at the same instant.
    remaining_label : caption printed in front of the number of all other
        visitors.
    """
    delta = journeys['delta_first_last_visit']
    print('mean: ', delta.mean())
    print('median: ', delta.median())
    print('std: ', delta.std())

    n_visitors = journeys['visitor_id'].nunique()
    # Compare against a Timedelta rather than the integer 0: current pandas
    # treats `timedelta == 0` as an invalid comparison and returns all-False,
    # which would report zero single-visit visitors.
    n_zero_delta = int((delta == pd.Timedelta(0)).sum())
    print('number of unique visitors: ', n_visitors)
    print(zero_delta_label, n_zero_delta)
    print(remaining_label, n_visitors - n_zero_delta)


# all visitors
# NOTE(review): 'first'/'last' follow row order, so this assumes `df` is sorted
# by visit_start_time_gmt within each visitor - confirm, or switch to min/max.
journey_length = df[['visitor_id', 'visit_start_time_gmt']].groupby(['visitor_id'], as_index=False).agg({'visit_start_time_gmt': ['first', 'last']})
# Flatten the two-level column labels ('visit_start_time_gmt', 'first') into
# 'visit_start_time_gmt_first'; the key column comes out as 'visitor_id_'.
journey_length.columns = ['_'.join(col) for col in journey_length.columns]
journey_length = journey_length.rename(columns={'visitor_id_': 'visitor_id'})

journey_length['delta_first_last_visit'] = journey_length['visit_start_time_gmt_last'] - journey_length['visit_start_time_gmt_first']
# Vectorised timedelta accessors instead of a per-row Python lambda.
journey_length['delta_first_last_visit_days'] = journey_length['delta_first_last_visit'].dt.days
journey_length['delta_first_last_visit_seconds'] = journey_length['delta_first_last_visit'].dt.seconds

# Journeys shorter than one day but longer than zero are counted as length 1;
# everything else uses the whole-day component of the delta.
zero_days = journey_length['delta_first_last_visit_days'] == 0
nonzero_seconds = journey_length['delta_first_last_visit_seconds'] != 0
journey_length['journey_length'] = np.where(zero_days & nonzero_seconds, 1, journey_length['delta_first_last_visit_days'])

# These two counts were bare mid-cell expressions before, so their values were
# computed and thrown away; print them so they actually appear in the output.
print('number of journeys of length > 0 and < 1 day: ', int((zero_days & nonzero_seconds).sum()))
print('number of journeys of length 0 (visitors with 1 visit): ', int((zero_days & ~nonzero_seconds).sum()))

_print_journey_stats(
    journey_length,
    'number of visitors with 1 visit: ',
    'number of visitors with more than 1 visit: ',
)

# buyers
buyers = pd.DataFrame(df[df.purchase_within_current_visit != 0]['visitor_id'].unique(), columns=['visitor_id'])
journey_length_buyers = pd.merge(journey_length, buyers, how='inner', on='visitor_id')

_print_journey_stats(
    journey_length_buyers,
    'number of visitors with 1 visit: ',
    'number of visitors with more than 1 visit: ',
)

# returning visitors
# 1-based running visit number per visitor, in row order (adds a column to `df`).
df['visit_num_period'] = df.groupby('visitor_id').cumcount() + 1
returning_visitors = pd.DataFrame(df[df.visit_num_period > 1]['visitor_id'].unique(), columns=['visitor_id'])
journey_length_returning_visitors = pd.merge(journey_length, returning_visitors, how='inner', on='visitor_id')

# NOTE(review): for returning visitors a zero delta means all of their visits
# share one start timestamp, which is not the same as "exactly 2 visits". The
# captions are kept as they were; consider rewording them.
_print_journey_stats(
    journey_length_returning_visitors,
    'number of visitors with 2 visits: ',
    'number of visitors with more than 2 visit: ',
)

In [None]:
# Number of visits per month and per week: once over every visit, once over
# visits with more than one hit only.
no_bounce_visits = df[df['hit_count'] > 1]

all_visits_per_month = df.groupby(['month']).size().reset_index(name='all_visits')
no_bounce_visits_per_month = no_bounce_visits.groupby(['month']).size().reset_index(name='no_bounce_visits')

all_visits_per_week = df.groupby(['week']).size().reset_index(name='all_visits')
no_bounce_visits_per_week = no_bounce_visits.groupby(['week']).size().reset_index(name='no_bounce_visits')

In [None]:
# Visits consisting of exactly one hit, counted per month and per week.
bounces = df[df['hit_count'] == 1]
bounces_per_month = bounces.groupby(['month']).size().reset_index(name='bounces_per_month')
bounces_per_week = bounces.groupby(['week']).size().reset_index(name='bounces_per_week')

# A notebook cell only renders its last expression, so the original bare
# `bounces_per_month` in the middle of the cell was never shown. `display` is
# supplied by the IPython kernel and renders both tables.
display(bounces_per_month)
display(bounces_per_week)

In [None]:
# Visits flagged with a purchase, counted per month and per week: once over
# all of them, once over those with more than one hit.
purchases = df[df.purchase_within_current_visit != 0]
no_bounce_purchases = purchases[purchases['hit_count'] > 1]

all_purchases_per_month = purchases.groupby(['month']).size().reset_index(name='all_purchases')
no_bounce_purchases_per_month = no_bounce_purchases.groupby(['month']).size().reset_index(name='no_bounce_purchases')

all_purchases_per_week = purchases.groupby(['week']).size().reset_index(name='all_purchases')
no_bounce_purchases_per_week = no_bounce_purchases.groupby(['week']).size().reset_index(name='no_bounce_purchases')

In [None]:
# Page-view and product-view totals per month and per week, over every visit
# and over visits with more than one hit.
no_bounce_rows = df[df['hit_count'] > 1]
page_view_total = {'page_view_boolean_sum': 'sum'}
product_view_total = {'product_view_boolean_sum': 'sum'}

# page views
all_page_views_per_month = df.groupby(['month'], as_index=False).agg(page_view_total)
all_page_views_per_week = df.groupby(['week'], as_index=False).agg(page_view_total)
no_bounce_page_views_per_month = no_bounce_rows.groupby(['month'], as_index=False).agg(page_view_total)
no_bounce_page_views_per_week = no_bounce_rows.groupby(['week'], as_index=False).agg(page_view_total)

# product views
all_product_views_per_month = df.groupby(['month'], as_index=False).agg(product_view_total)
all_product_views_per_week = df.groupby(['week'], as_index=False).agg(product_view_total)
no_bounce_product_views_per_month = no_bounce_rows.groupby(['month'], as_index=False).agg(product_view_total)
no_bounce_product_views_per_week = no_bounce_rows.groupby(['week'], as_index=False).agg(product_view_total)

In [None]:
# Histogram of journey lengths, restricted to journeys with length > 0.
# bin with journey length == 1 includes 189190 journeys of length > 0 and < 1 day (3461328 journeys of length 0)
fig, ax = plt.subplots(figsize=(12, 6))
positive_lengths = journey_length.loc[journey_length.journey_length > 0, 'journey_length']
ax.hist(positive_lengths, bins=50)
ax.set_title('journey_length')

fig.savefig('journey_length.png')

In [None]:
# 4x2 grid: one row per metric, left column per month, right column per week.
# Each panel overlays the "all" series and the "no_bounce" series.
# Tuple layout: (title, x column, all frame, all y column,
#                no_bounce frame, no_bounce y column)
panels = [
    ('visits_and_bounces_per_month', 'month',
     all_visits_per_month, 'all_visits',
     no_bounce_visits_per_month, 'no_bounce_visits'),
    ('visits_and_bounces_per_week', 'week',
     all_visits_per_week, 'all_visits',
     no_bounce_visits_per_week, 'no_bounce_visits'),
    ('purchases_per_month', 'month',
     all_purchases_per_month, 'all_purchases',
     no_bounce_purchases_per_month, 'no_bounce_purchases'),
    ('purchases_per_week', 'week',
     all_purchases_per_week, 'all_purchases',
     no_bounce_purchases_per_week, 'no_bounce_purchases'),
    ('page_views_per_month', 'month',
     all_page_views_per_month, 'page_view_boolean_sum',
     no_bounce_page_views_per_month, 'page_view_boolean_sum'),
    ('page_views_per_week', 'week',
     all_page_views_per_week, 'page_view_boolean_sum',
     no_bounce_page_views_per_week, 'page_view_boolean_sum'),
    ('product_views_per_month', 'month',
     all_product_views_per_month, 'product_view_boolean_sum',
     no_bounce_product_views_per_month, 'product_view_boolean_sum'),
    ('product_views_per_week', 'week',
     all_product_views_per_week, 'product_view_boolean_sum',
     no_bounce_product_views_per_week, 'product_view_boolean_sum'),
]

fig, axes = plt.subplots(4, 2, figsize=(10, 10))

# axes.flat walks the grid row by row, matching the order of `panels`.
for ax, (title, period, all_frame, all_col, no_bounce_frame, no_bounce_col) in zip(axes.flat, panels):
    all_line, = ax.plot(all_frame[period], all_frame[all_col])
    no_bounce_line, = ax.plot(no_bounce_frame[period], no_bounce_frame[no_bounce_col])
    ax.set_title(title)
    ax.set_ylabel('number')
    ax.set_xlabel(period)
    # Take the swatch colours from the plotted lines. The hard-coded
    # 'blue'/'orange' patches did not match the colours matplotlib actually
    # draws the lines in and would drift further under any other style.
    ax.legend(
        handles=[
            mpatches.Patch(color=all_line.get_color(), label='all'),
            mpatches.Patch(color=no_bounce_line.get_color(), label='no_bounce'),
        ],
        loc='upper center',
    )

fig.tight_layout()
fig.savefig('all_and_no_bounce_visits_purchases_page_and_product_views_per_month_and_week.png')

## 6. Evaluation

### 6.1. Predictive Accuracy

In [None]:
# Stored performance frames for the 24-hour purchase target, one pickle per
# model tag. The shared path pattern is written once; only the tag varies.
PERF_24H_PATH = '../results/models/model_performance_{}_100000_purchase_within_next_24_hours.pkl.gz'

# The individual names are kept because the next cell stacks them by name.
(lr24, dt24, nb24, knn24, rf24, svm24, boost24, bag24,
 nn124, nn324, nn524, rnn24, lstm24) = (
    pd.read_pickle(PERF_24H_PATH.format(model_tag))
    for model_tag in ('LR', 'DT', 'NB', 'KNN', 'RF', 'SVM', 'BOOST', 'BAG',
                      'NN1', 'NN3', 'NN5', 'RNN', 'LSTM')
)

In [None]:
# One table with the performance rows of every model (24-hour target).
# pd.concat replaces the DataFrame.append loop (append was removed in pandas
# 2.0) and no longer rebinds the global `df` as a loop variable. Row indices
# are kept unchanged, as append did.
perf24 = pd.concat([lr24, dt24, nb24, knn24, rf24, svm24, boost24, bag24,
                    nn124, nn324, nn524, rnn24, lstm24])

In [None]:
# Experiment settings, read from the first row of the stacked table.
# `.iloc[0]` is positional. The previous `[0]` was a label lookup, and the
# stacked table keeps each source frame's own index, so a repeated label 0
# would return a whole Series instead of one value.
# NOTE(review): assumes these four fields are the same for every model row.
first_run_24 = perf24.iloc[0]
print('training_set_size: ', first_run_24['training_set_size'])
print('test_set_size: ', first_run_24['test_set_size'])
print('target: ', first_run_24['target'])
print('features: ', first_run_24['features'])

In [None]:
# Show the performance table without the experiment-setting columns printed
# above; `perf24` itself is not modified.
perf24.drop(columns=['features', 'training_set_size', 'test_set_size', 'target'])

In [None]:
# Stored performance frames for the 7-day purchase target, one pickle per
# model tag. The shared path pattern is written once; only the tag varies.
PERF_7D_PATH = '../results/models/model_performance_{}_100000_purchase_within_next_7_days.pkl.gz'

# The individual names are kept because the next cell stacks them by name.
(lr7, dt7, nb7, knn7, rf7, svm7, boost7, bag7,
 nn17, nn37, nn57, rnn7, lstm7) = (
    pd.read_pickle(PERF_7D_PATH.format(model_tag))
    for model_tag in ('LR', 'DT', 'NB', 'KNN', 'RF', 'SVM', 'BOOST', 'BAG',
                      'NN1', 'NN3', 'NN5', 'RNN', 'LSTM')
)

In [None]:
# One table with the performance rows of every model (7-day target).
# pd.concat replaces the DataFrame.append loop (append was removed in pandas
# 2.0) and no longer rebinds the global `df` as a loop variable. Row indices
# are kept unchanged, as append did.
perf7 = pd.concat([lr7, dt7, nb7, knn7, rf7, svm7, boost7, bag7,
                   nn17, nn37, nn57, rnn7, lstm7])

In [None]:
# Experiment settings, read from the first row of the stacked table.
# `.iloc[0]` is positional. The previous `[0]` was a label lookup, and the
# stacked table keeps each source frame's own index, so a repeated label 0
# would return a whole Series instead of one value.
# NOTE(review): assumes these four fields are the same for every model row.
first_run_7 = perf7.iloc[0]
print('training_set_size: ', first_run_7['training_set_size'])
print('test_set_size: ', first_run_7['test_set_size'])
print('target: ', first_run_7['target'])
print('features: ', first_run_7['features'])

In [None]:
# Show the performance table without the experiment-setting columns printed
# above; `perf7` itself is not modified.
perf7.drop(columns=['features', 'training_set_size', 'test_set_size', 'target'])

### 6.3. Robustness

### 6.4. Interpretability

In [None]:
# Split the training frame into the two target vectors and the feature matrix.
# NOTE(review): `train` is not created by any cell visible in this notebook;
# it has to exist in the kernel before this cell runs - confirm where it is
# loaded.
target_24h = 'purchase_within_next_24_hours'
target_7d = 'purchase_within_next_7_days'

y_train_24_100k = train[target_24h]
y_train_7_100k = train[target_7d]

# Identifier, timestamp and both targets are excluded from the features.
X_train_100k = train.drop(columns=['visitor_id', 'visit_start_time_gmt', target_24h, target_7d])

In [None]:
# Score every feature against the 24-hour target with f_classif.
selector_24_100k = SelectKBest(f_classif, k='all')
X_k_best_24_100k = selector_24_100k.fit_transform(X_train_100k, y_train_24_100k)

# k='all' keeps every feature, so the support mask selects all columns; it is
# still applied so names, scores and p-values stay aligned.
support_24_100k = selector_24_100k.get_support()
features_24_100k = X_train_100k.columns.values[support_24_100k]
scores_24_100k = selector_24_100k.scores_[support_24_100k]
p_values_24_100k = selector_24_100k.pvalues_[support_24_100k]

# Highest score first; rank_24 runs from 1 for the best-scoring feature.
k_best_df_24_100k = pd.DataFrame(
    {
        'features': features_24_100k,
        'scores_24': scores_24_100k,
        'p_values_24': p_values_24_100k,
    }
).sort_values('scores_24', ascending=False)
k_best_df_24_100k['rank_24'] = range(1, k_best_df_24_100k.shape[0] + 1)

k_best_df_24_100k.to_pickle('../results/descriptives/k_best_features_24_100k.pkl.gz', compression='gzip')
k_best_df_24_100k

In [None]:
# Score every feature against the 7-day target with f_classif.
selector_7_100k = SelectKBest(f_classif, k='all')
X_k_best_7_100k = selector_7_100k.fit_transform(X_train_100k, y_train_7_100k)

# k='all' keeps every feature, so the support mask selects all columns; it is
# still applied so names, scores and p-values stay aligned.
support_7_100k = selector_7_100k.get_support()
features_7_100k = X_train_100k.columns.values[support_7_100k]
scores_7_100k = selector_7_100k.scores_[support_7_100k]
p_values_7_100k = selector_7_100k.pvalues_[support_7_100k]

# Highest score first; rank_7 runs from 1 for the best-scoring feature.
k_best_df_7_100k = pd.DataFrame(
    {
        'features': features_7_100k,
        'scores_7': scores_7_100k,
        'p_values_7': p_values_7_100k,
    }
).sort_values('scores_7', ascending=False)
k_best_df_7_100k['rank_7'] = range(1, k_best_df_7_100k.shape[0] + 1)

k_best_df_7_100k.to_pickle('../results/descriptives/k_best_features_7_100k.pkl.gz', compression='gzip')
k_best_df_7_100k

In [None]:
# Put the 24-hour and 7-day scores, p-values and ranks side by side, one row
# per feature present in both tables.
k_best_100k = k_best_df_24_100k.merge(k_best_df_7_100k, how='inner', on='features')
k_best_100k

In [None]:
# Features whose p-value is at most 0.01 for both targets.
significant_24 = k_best_100k['p_values_24'] <= 0.01
significant_7 = k_best_100k['p_values_7'] <= 0.01
k_best_features = list(k_best_100k.loc[significant_24 & significant_7, 'features'])
len(k_best_features)

In [None]:
# Stored feature importances for the 24-hour target (DT, RF and BOOST models).
# The shared path pattern is written once; only the model tag varies.
FI_24H_PATH = '../results/models/feature_importances_{}_100000_purchase_within_next_24_hours.pkl.gz'

dt24fi, rf24fi, boost24fi = (
    pd.read_pickle(FI_24H_PATH.format(model_tag))
    for model_tag in ('DT', 'RF', 'BOOST')
)

In [None]:
# First 20 rows of the DT importances (24-hour target), largest first.
dt24fi.sort_values(by='feature_importances', ascending=False).iloc[:20]

In [None]:
# First 20 rows of the RF importances (24-hour target), largest first.
rf24fi.sort_values(by='feature_importances', ascending=False).iloc[:20]

In [None]:
# First 20 rows of the BOOST importances (24-hour target), largest first.
boost24fi.sort_values(by='feature_importances', ascending=False).iloc[:20]

In [None]:
# Stored feature importances for the 7-day target (DT, RF and BOOST models).
# The shared path pattern is written once; only the model tag varies.
FI_7D_PATH = '../results/models/feature_importances_{}_100000_purchase_within_next_7_days.pkl.gz'

dt7fi, rf7fi, boost7fi = (
    pd.read_pickle(FI_7D_PATH.format(model_tag))
    for model_tag in ('DT', 'RF', 'BOOST')
)

In [None]:
# First 20 rows of the DT importances (7-day target), largest first.
dt7fi.sort_values(by='feature_importances', ascending=False).iloc[:20]

In [None]:
# First 20 rows of the RF importances (7-day target), largest first.
rf7fi.sort_values(by='feature_importances', ascending=False).iloc[:20]

In [None]:
# First 20 rows of the BOOST importances (7-day target), largest first.
boost7fi.sort_values(by='feature_importances', ascending=False).iloc[:20]

### 6.6. Algorithmic Efficiency