#### Imports

In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [10]:
import os
import math
import pandas as pd
import numpy as np
import feather

from IPython.display import display

In [5]:
import sys
sys.path.append('../src')

In [6]:
from fastai.structured import add_datepart, train_cats, proc_df, apply_cats, set_rf_samples
from workflow.log_utils import get_logger

#### Functions and Constants

In [7]:
def display_all(df):
    """Render ``df`` with pandas' row and column display limits raised to 1000.

    The raised limits are applied through ``pd.option_context`` and therefore
    only hold while ``df`` is being displayed.
    """
    widened_limits = ("display.max_rows", 1000, "display.max_columns", 1000)
    with pd.option_context(*widened_limits):
        display(df)

In [8]:
def check_columns_and_types_same_in(a, b):
    """Compare the column layout of two DataFrames.

    Parameters
    ----------
    a, b : pandas.DataFrame
        Frames to compare.

    Returns
    -------
    tuple of (bool, bool)
        ``same_names`` is True when both frames have the same column labels in
        the same order. ``same_types`` is True when every column of ``a`` also
        exists in ``b`` with an equal dtype.
    """
    # Compare as plain lists: an element-wise ``a.columns == b.columns`` raises
    # ValueError when the two frames have a different number of columns.
    same_names = list(a.columns) == list(b.columns)
    # A column missing from ``b`` counts as a type mismatch instead of raising
    # KeyError on ``b[col]``.
    same_types = all(
        col in b.columns and a[col].dtype == b[col].dtype
        for col in a.columns
    )
    return same_names, same_types

In [9]:
# Separator string: a '====' rule framed by newlines.
# NOTE(review): not referenced in the visible cells — presumably used when logging.
LN_SPLITTER = '\n====\n'

# Logger shared by the notebook, obtained from the project's log_utils helper.
LOGGER = get_logger('log')

#### Load data

In [14]:
%%time
# Read the cached raw training frame with pandas' own Feather reader; the
# stand-alone `feather` package is a deprecated wrapper around the same backend.
df_all = pd.read_feather('../data/tmp/raw_train')

CPU times: user 156 ms, sys: 75.9 ms, total: 232 ms
Wall time: 231 ms


In [15]:
# Inspect row count, column dtypes and non-null counts of the loaded frame.
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 903653 entries, 0 to 903652
Data columns (total 48 columns):
channelGrouping                                 903653 non-null category
visitNumber                                     903653 non-null int64
visitStartTime                                  903653 non-null datetime64[ns]
device_browser                                  903653 non-null category
device_deviceCategory                           903653 non-null category
device_isMobile                                 903653 non-null bool
device_operatingSystem                          903653 non-null category
geoNetwork_city                                 903653 non-null category
geoNetwork_continent                            903653 non-null category
geoNetwork_country                              903653 non-null category
geoNetwork_metro                                903653 non-null category
geoNetwork_networkDomain                        903653 non-null category
geoNetwork_reg

#### Workflow

##### Validation 2 months

In [22]:
# The validation window opens two calendar months before the latest visit.
valid_start_ts = df_all['visitStartTime'].max() - pd.DateOffset(months=2)

In [25]:
# Rows at or after the validation start; copied so df_valid is independent of df_all.
df_valid = df_all.loc[df_all['visitStartTime'] >= valid_start_ts].copy()

In [27]:
# List the columns available in the validation slice.
df_valid.columns

Index(['channelGrouping', 'visitNumber', 'visitStartTime', 'device_browser',
       'device_deviceCategory', 'device_isMobile', 'device_operatingSystem',
       'geoNetwork_city', 'geoNetwork_continent', 'geoNetwork_country',
       'geoNetwork_metro', 'geoNetwork_networkDomain', 'geoNetwork_region',
       'geoNetwork_subContinent', 'totals_bounces', 'totals_hits',
       'totals_newVisits', 'totals_pageviews', 'trafficSource_adContent',
       'trafficSource_adwordsClickInfo.adNetworkType',
       'trafficSource_adwordsClickInfo.gclId',
       'trafficSource_adwordsClickInfo.isVideoAd',
       'trafficSource_adwordsClickInfo.page',
       'trafficSource_adwordsClickInfo.slot', 'trafficSource_campaign',
       'trafficSource_isTrueDirect', 'trafficSource_keyword',
       'trafficSource_medium', 'trafficSource_referralPath',
       'trafficSource_source', 'totals_transactionRevenue',
       'visitStartTimeYear', 'visitStartTimeMonth', 'visitStartTimeWeek',
       'visitStartTimeDay', '

In [33]:
# Total revenue per visitor inside the validation window.
# The aggregation is named with the string 'sum': passing the builtin `sum`
# is deprecated in recent pandas and will stop mapping to the groupby sum.
valid_ids_targets = (
    df_valid
    .groupby('fullVisitorId', as_index=False)
    .agg({'totals_transactionRevenue': 'sum'})
)

In [35]:
# Target on the log scale: log(1 + summed revenue) per visitor.
valid_ids_targets['log_transactionRevenue'] = np.log1p(
    valid_ids_targets['totals_transactionRevenue']
)

In [58]:
# Check row count, dtypes and non-null counts of the per-visitor targets.
valid_ids_targets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 108033 entries, 0 to 108032
Data columns (total 3 columns):
fullVisitorId                108033 non-null object
totals_transactionRevenue    108033 non-null float64
log_transactionRevenue       108033 non-null float64
dtypes: float64(2), object(1)
memory usage: 3.3+ MB


In [39]:
# Mean per-visitor log revenue over the validation window.
valid_ids_targets.log_transactionRevenue.mean()

0.2957701490144469

##### Intermediate 3 months

In [79]:
# The intermediate window opens three calendar months before the validation window.
intrmid_start_ts = valid_start_ts - pd.DateOffset(months=3)

In [80]:
# Distinct visitors seen in [intrmid_start_ts, valid_start_ts).
ids_to_predict = df_all.loc[
    (df_all['visitStartTime'] >= intrmid_start_ts)
    & (df_all['visitStartTime'] < valid_start_ts),
    'fullVisitorId'
].unique()

In [81]:
# Number of distinct visitors in the intermediate window.
len(ids_to_predict)

161057

In [82]:
# Visitors from the intermediate window who also appear in the validation window.
len(set(valid_ids_targets['fullVisitorId']) & set(ids_to_predict))

3856

In [101]:
# Share of intermediate-window visitors who also appear in the validation window.
# Computed from the live variables instead of numbers copied from cell outputs,
# so the ratio stays correct when the data or the window bounds change.
len(set(valid_ids_targets['fullVisitorId']).intersection(ids_to_predict)) / len(ids_to_predict)

0.023941834257436807

##### Validation Christmas Period

In [84]:
# Time span covered by the data: earliest and latest visit start.
df_all.visitStartTime.min(), df_all.visitStartTime.max()

(Timestamp('2016-08-01 07:00:12'), Timestamp('2017-08-02 06:59:53'))

In [85]:
# Bounds of the Christmas validation window, given as date strings.
xmas_start_ts = '2016-12-01'
# NOTE(review): the filter that uses this bound compares with `<`, so visits on
# 2017-01-31 itself are excluded — confirm that is intended. "ens" looks like a
# typo for "end"; the name is kept because a later cell references it.
xmas_ens_ts = '2017-01-31'

In [88]:
# Visits inside [xmas_start_ts, xmas_ens_ts); copied so xmas_df is independent of df_all.
xmas_df = df_all.loc[
    (df_all['visitStartTime'] >= xmas_start_ts)
    & (df_all['visitStartTime'] < xmas_ens_ts)
].copy()

In [89]:
# Total revenue per visitor inside the Christmas window.
# The aggregation is named with the string 'sum': passing the builtin `sum`
# is deprecated in recent pandas and will stop mapping to the groupby sum.
xmas_ids_targets = (
    xmas_df
    .groupby('fullVisitorId', as_index=False)
    .agg({'totals_transactionRevenue': 'sum'})
)

In [91]:
# Target on the log scale: log(1 + summed revenue) per visitor.
xmas_ids_targets['log_transactionRevenue'] = np.log1p(
    xmas_ids_targets['totals_transactionRevenue']
)

In [92]:
# Mean per-visitor log revenue over the Christmas window.
xmas_ids_targets.log_transactionRevenue.mean()

0.2997448918926952

##### Previous 3 months 

In [108]:
# Start of the look-back window that precedes the Christmas window (date string).
prev_start_ts = '2016-09-01'

In [109]:
# Distinct visitors seen in [prev_start_ts, xmas_start_ts).
# NOTE: this rebinds ids_to_predict, replacing the intermediate-window ids.
ids_to_predict = df_all.loc[
    (df_all['visitStartTime'] >= prev_start_ts)
    & (df_all['visitStartTime'] < xmas_start_ts),
    'fullVisitorId'
].unique()

In [110]:
# Number of distinct visitors in the look-back window.
len(ids_to_predict)

236667

In [111]:
# Visitors from the look-back window who also appear in the Christmas window.
len(set(xmas_ids_targets['fullVisitorId']) & set(ids_to_predict))

5136

In [112]:
# Share of look-back-window visitors who also appear in the Christmas window.
# Computed from the live variables instead of numbers copied from cell outputs,
# so the ratio stays correct when the data or the window bounds change.
len(set(xmas_ids_targets['fullVisitorId']).intersection(ids_to_predict)) / len(ids_to_predict)

0.02170137788538326