#### Imports

In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
import os
import math
import pandas as pd
import numpy as np
import feather

from IPython.display import display

In [3]:
import sys
sys.path.append('../src')

In [4]:
from fastai.structured import add_datepart, train_cats, proc_df, apply_cats, set_rf_samples
from workflow.log_utils import get_logger

#### Functions and Constants

In [5]:
def display_all(df):
    """Render ``df`` with pandas' row and column display limits raised to 1000.

    The raised limits apply only while the frame is being displayed; the
    option context restores the previous settings afterwards.
    """
    expanded_limits = ("display.max_rows", 1000, "display.max_columns", 1000)
    with pd.option_context(*expanded_limits):
        display(df)

In [6]:
def check_columns_and_types_same_in(a, b):
    """Report whether two DataFrames agree on column names and column dtypes.

    Parameters
    ----------
    a, b : pandas.DataFrame
        Frames to compare.

    Returns
    -------
    tuple of (bool, bool)
        ``same_names`` is True when both frames have the same column labels
        in the same order. ``same_types`` is True when every column of ``a``
        exists in ``b`` with an equal dtype.
    """
    # Compare as plain lists: an elementwise Index comparison raises
    # ValueError when the frames have a different number of columns.
    same_names = list(a.columns) == list(b.columns)
    # A column missing from ``b`` counts as a mismatch instead of raising KeyError.
    same_types = all(
        col in b.columns and a[col].dtype == b[col].dtype
        for col in a.columns
    )
    return same_names, same_types

In [75]:
def rmse(x, y):
    """Root-mean-square error between predictions ``x`` and targets ``y``.

    Parameters
    ----------
    x, y : list, tuple, numpy.ndarray or pandas.Series
        Equal-length collections of numbers.

    Returns
    -------
    float
        ``sqrt(mean((x - y) ** 2))``.
    """
    # Plain lists/tuples have no elementwise arithmetic, so rmse(list, list)
    # used to raise TypeError. Convert only those; arrays and Series keep
    # their own subtraction semantics.
    if isinstance(x, (list, tuple)):
        x = np.asarray(x)
    if isinstance(y, (list, tuple)):
        y = np.asarray(y)
    return math.sqrt(((x - y) ** 2).mean())

In [7]:
# Delimiter string: a row of '=' on its own line.
LN_SPLITTER = '\n====\n'
# Logger obtained from the project's log helper under the name 'log'.
LOGGER = get_logger('log')

#### Load data

In [8]:
%%time
# Read the raw training frame from its Feather file (path is relative to the working directory).
df_all = feather.read_dataframe('../data/tmp/raw_train')

CPU times: user 274 ms, sys: 161 ms, total: 436 ms
Wall time: 2.66 s


In [9]:
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 903653 entries, 0 to 903652
Data columns (total 48 columns):
channelGrouping                                 903653 non-null category
visitNumber                                     903653 non-null int64
visitStartTime                                  903653 non-null datetime64[ns]
device_browser                                  903653 non-null category
device_deviceCategory                           903653 non-null category
device_isMobile                                 903653 non-null bool
device_operatingSystem                          903653 non-null category
geoNetwork_city                                 903653 non-null category
geoNetwork_continent                            903653 non-null category
geoNetwork_country                              903653 non-null category
geoNetwork_metro                                903653 non-null category
geoNetwork_networkDomain                        903653 non-null category
geoNetwork_reg

#### Workflow

##### Validation 2 months

In [10]:
# Validation cutoff: two calendar months before the latest visit in the data.
valid_start_ts = df_all.visitStartTime.max() - pd.DateOffset(months=2)

In [11]:
# Validation frame: every visit at or after the cutoff, as an independent copy of those rows.
df_valid = df_all.loc[lambda visits: visits.visitStartTime >= valid_start_ts].copy()

In [12]:
df_valid.columns

Index(['channelGrouping', 'visitNumber', 'visitStartTime', 'device_browser',
       'device_deviceCategory', 'device_isMobile', 'device_operatingSystem',
       'geoNetwork_city', 'geoNetwork_continent', 'geoNetwork_country',
       'geoNetwork_metro', 'geoNetwork_networkDomain', 'geoNetwork_region',
       'geoNetwork_subContinent', 'totals_bounces', 'totals_hits',
       'totals_newVisits', 'totals_pageviews', 'trafficSource_adContent',
       'trafficSource_adwordsClickInfo.adNetworkType',
       'trafficSource_adwordsClickInfo.gclId',
       'trafficSource_adwordsClickInfo.isVideoAd',
       'trafficSource_adwordsClickInfo.page',
       'trafficSource_adwordsClickInfo.slot', 'trafficSource_campaign',
       'trafficSource_isTrueDirect', 'trafficSource_keyword',
       'trafficSource_medium', 'trafficSource_referralPath',
       'trafficSource_source', 'totals_transactionRevenue',
       'visitStartTimeYear', 'visitStartTimeMonth', 'visitStartTimeWeek',
       'visitStartTimeDay', '

In [13]:
# Total transaction revenue per visitor over the validation window.
# The aggregation is named by string: passing the builtin `sum` to `agg`
# is deprecated in recent pandas and emits a FutureWarning.
valid_ids_targets = (
    df_valid
    .groupby('fullVisitorId', as_index=False)
    .agg({'totals_transactionRevenue': 'sum'})
)

In [14]:
# Target column: log(1 + total revenue) per visitor.
valid_ids_targets['log_transactionRevenue'] = np.log1p(
    valid_ids_targets['totals_transactionRevenue']
)

In [15]:
valid_ids_targets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 108033 entries, 0 to 108032
Data columns (total 3 columns):
fullVisitorId                108033 non-null object
totals_transactionRevenue    108033 non-null float64
log_transactionRevenue       108033 non-null float64
dtypes: float64(2), object(1)
memory usage: 3.3+ MB


In [58]:
valid_ids_targets.log_transactionRevenue.max()

23.442593540690858

##### Previous 5 months 

In [17]:
# Start of the training window: five calendar months before the validation cutoff.
intrmid_start_ts = valid_start_ts - pd.DateOffset(months=5)

In [20]:
# Training frame: visits inside the half-open window [intrmid_start_ts, valid_start_ts).
train_df = df_all.loc[
    lambda visits: (visits.visitStartTime >= intrmid_start_ts)
    & (visits.visitStartTime < valid_start_ts)
].copy()

In [26]:
# Make targets: start from the distinct visitor ids seen in the training window.
ids = train_df.fullVisitorId.unique()

In [70]:
len(ids)

260397

In [28]:
# One row per training-window visitor; target columns are attached in later cells.
targets_df = pd.DataFrame({'fullVisitorId': ids})

In [31]:
# Flag training-window visitors who also appear in the following two-month
# validation window. The flag is set for any visit there, with or without revenue.
targets_df['is_future_customer'] = targets_df['fullVisitorId'].isin(
    valid_ids_targets['fullVisitorId']
)

In [68]:
# Attach validation-window revenue to each training-window visitor. Visitors
# absent from the validation window get NaN in the merged columns.
# `validate` makes pandas raise if either side has a repeated fullVisitorId,
# which would otherwise silently multiply rows and skew the baseline scores.
targets_df = targets_df.merge(
    valid_ids_targets,
    how='left',
    on='fullVisitorId',
    validate='one_to_one',
)

In [72]:
# Replace every missing value in the frame with zero.
targets_df = targets_df.fillna(value=0.)

In [73]:
targets_df.log_transactionRevenue.mean()

0.020801285535588753

##### Predict with zeros 

In [74]:
# Baseline: predict zero log-revenue for every visitor.
y = targets_df['log_transactionRevenue'].values
pred_y = np.zeros(len(targets_df))

In [76]:
rmse(pred_y, y)

0.6136467649066868

##### Predict with future mean 

In [79]:
# Baseline: predict the mean of the target itself for every visitor.
# Built as a NumPy array rather than a Python list with one float per row.
pred_y = np.full(len(targets_df), targets_df.log_transactionRevenue.mean())

In [81]:
rmse(pred_y, y)

0.6132941044886289

##### Predict with current mean

In [88]:
# log(1 + total revenue) per visitor over the whole training window.
# The aggregation is named by string: passing the builtin `sum` to `agg`
# is deprecated in recent pandas and emits a FutureWarning.
log_current = np.log1p(
    train_df
    .groupby('fullVisitorId', as_index=False)
    .agg({'totals_transactionRevenue': 'sum'})['totals_transactionRevenue']
    .values
)

In [114]:
# Baseline: predict the training-window mean log-revenue for every visitor.
# Built as a NumPy array rather than a Python list with one float per row.
pred_y = np.full(len(targets_df), log_current.mean())

In [115]:
rmse(pred_y, y)

0.6595329619964011

##### First two months mean 

In [130]:
# Exclusive end of the first two calendar months of the training window.
# NOTE(review): the name says "month" but the offset is two months.
first_month_offset = train_df.visitStartTime.min() + pd.DateOffset(months=2)

In [131]:
# Visits strictly before the two-month mark, as an independent copy of those rows.
first_month_df = train_df.loc[
    lambda visits: visits.visitStartTime < first_month_offset
].copy()

In [132]:
# log(1 + total revenue) per visitor over the first two months of training data.
# The aggregation is named by string: passing the builtin `sum` to `agg`
# is deprecated in recent pandas and emits a FutureWarning.
log_first_month_revenue = np.log1p(
    first_month_df
    .groupby('fullVisitorId', as_index=False)
    .agg({'totals_transactionRevenue': 'sum'})['totals_transactionRevenue']
    .values
)

In [133]:
# Baseline: predict the first-two-months mean log-revenue for every visitor.
# Built as a NumPy array rather than a Python list with one float per row.
pred_y = np.full(len(targets_df), log_first_month_revenue.mean())
rmse(pred_y, y)

0.6475396265730865

##### Last two months mean

In [134]:
# Inclusive start of the last two calendar months of the training window.
# NOTE(review): the name says "month" but the offset is two months.
last_month_offset = train_df.visitStartTime.max() - pd.DateOffset(months=2)

In [136]:
# Visits at or after the two-months-before-end mark, as an independent copy of those rows.
last_month_df = train_df.loc[
    lambda visits: visits.visitStartTime >= last_month_offset
].copy()

In [141]:
# log(1 + total revenue) per visitor over the last two months of training data.
# The aggregation is named by string: passing the builtin `sum` to `agg`
# is deprecated in recent pandas and emits a FutureWarning.
log_last_month_revenue = np.log1p(
    last_month_df
    .groupby('fullVisitorId', as_index=False)
    .agg({'totals_transactionRevenue': 'sum'})['totals_transactionRevenue']
    .values
)

In [144]:
# Baseline: predict the last-two-months mean log-revenue for every visitor.
# Built as a NumPy array rather than a Python list with one float per row.
pred_y = np.full(len(targets_df), log_last_month_revenue.mean())
rmse(pred_y, y)

0.6816747532352604

##### Validation Christmas Period

In [84]:
df_all.visitStartTime.min(), df_all.visitStartTime.max()

(Timestamp('2016-08-01 07:00:12'), Timestamp('2017-08-02 06:59:53'))

In [85]:
# Bounds of the Christmas validation window, as date strings.
xmas_start_ts = '2016-12-01'
# NOTE(review): the filter on this bound is a strict `<`, so visits on
# 2017-01-31 itself are excluded — confirm this is intended
# ('2017-02-01' would cover all of January).
# NOTE(review): `ens` looks like a typo for `end`.
xmas_ens_ts = '2017-01-31'

In [88]:
# Visits inside the half-open window [xmas_start_ts, xmas_ens_ts).
xmas_df = df_all.loc[
    lambda visits: (visits.visitStartTime >= xmas_start_ts)
    & (visits.visitStartTime < xmas_ens_ts)
].copy()

In [89]:
# Total transaction revenue per visitor over the Christmas window.
# The aggregation is named by string: passing the builtin `sum` to `agg`
# is deprecated in recent pandas and emits a FutureWarning.
xmas_ids_targets = (
    xmas_df
    .groupby('fullVisitorId', as_index=False)
    .agg({'totals_transactionRevenue': 'sum'})
)

In [91]:
# Target column: log(1 + total revenue) per visitor.
xmas_ids_targets['log_transactionRevenue'] = np.log1p(
    xmas_ids_targets['totals_transactionRevenue']
)

In [92]:
xmas_ids_targets.log_transactionRevenue.mean()

0.2997448918926952

##### Previous 3 months 

In [108]:
# Inclusive start of the lookback window that precedes the Christmas window.
prev_start_ts = '2016-09-01'

In [109]:
# Distinct visitors seen in the half-open window [prev_start_ts, xmas_start_ts).
ids_to_predict = df_all.loc[
    (df_all.visitStartTime >= prev_start_ts) & (df_all.visitStartTime < xmas_start_ts),
    'fullVisitorId',
].unique()

In [110]:
# Number of distinct visitors to predict for.
len(ids_to_predict)

236667

In [111]:
# Number of lookback-window visitors who also appear in the Christmas window.
len(set(xmas_ids_targets['fullVisitorId']) & set(ids_to_predict))

5136

In [112]:
# Share of the visitors to predict who also appear in the Christmas window.
# Computed from the data instead of retyping the two printed counts, so the
# ratio cannot go stale when the windows or the data change.
len(set(xmas_ids_targets.fullVisitorId).intersection(ids_to_predict)) / len(ids_to_predict)

0.02170137788538326