### Imports

In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
import os
import math

In [3]:
import pandas as pd
import numpy as np

In [4]:
from pandas.api.types import is_string_dtype, is_numeric_dtype

In [5]:
pd.__version__

'0.23.4'

In [6]:
from sklearn.ensemble import RandomForestRegressor
from IPython.display import display

from sklearn import metrics
from sklearn.model_selection import TimeSeriesSplit

In [7]:
import sys
sys.path.append('../src')

In [8]:
from workflow.data_utils import load_csv

In [29]:
from workflow.log_utils import get_logger

In [9]:
# from workflow.structured import add_datepart, train_cats, proc_df, apply_cats

In [10]:
from fastai.structured import add_datepart, train_cats, proc_df, apply_cats

### Helper functions and constants

In [11]:
def display_all(df):
    """Display ``df`` with the pandas row and column display limits raised to 1000."""
    wide_view = ("display.max_rows", 1000, "display.max_columns", 1000)
    # the options are only active inside the context manager
    with pd.option_context(*wide_view):
        display(df)

In [12]:
def half_bool_columns_to_float_type(df):
    """Cast "half-boolean" columns of ``df`` to float, in place.

    A column qualifies when it is object- or string-typed, has exactly two
    distinct values (NaN counts as one of them) and at least one of those
    values compares equal to ``True`` or ``False`` -- e.g. a flag stored as
    ``True``/NaN. The name of every converted column is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; qualifying columns are overwritten with their
        ``astype(float)`` version.

    Returns
    -------
    None
    """
    for col in df.columns:
        # Check for object dtype explicitly: since pandas 2.0 ``is_string_dtype``
        # is False for object columns whose values are not all strings, which
        # is exactly what a True/NaN column looks like.
        if df[col].dtype == object or is_string_dtype(df[col]):
            # a plain list keeps the membership test independent of the
            # array type returned by ``unique()``
            unqs = list(df[col].unique())
            if len(unqs) == 2 and (False in unqs or True in unqs):
                print(col)
                df[col] = df[col].astype(float)

In [38]:
def check_columns_and_types_same_in(a,b):
    """Report whether two DataFrames agree on column names and column dtypes.

    Parameters
    ----------
    a, b : pandas.DataFrame
        Frames to compare.

    Returns
    -------
    tuple of (bool, bool)
        ``same_names`` is True when both frames have the same column labels
        in the same order. ``same_types`` is True when every column of ``a``
        that also exists in ``b`` has the same dtype in both frames.

    Frames with a different number of columns, or with columns missing from
    ``b``, give ``same_names == False`` instead of raising.
    """
    # comparing lists avoids the "Lengths must match to compare" error that
    # an element-wise Index comparison raises for frames of different width
    same_names = list(a.columns) == list(b.columns)
    # only columns present in both frames can have their dtypes compared
    shared = [col for col in a.columns if col in b.columns]
    same_types = all(a[col].dtype == b[col].dtype for col in shared)
    return same_names, same_types

In [46]:
# some duplicated info
drop_columns = ['date', 'sessionId', 'visitId']
LN_SPLITTER ='\n====\n'

### Loads 

In [20]:
%%time
full_train = load_csv('../data/processed_data/processed_train.csv',
                      low_memory=False, nrows=None)

CPU times: user 6.09 s, sys: 529 ms, total: 6.62 s
Wall time: 7.92 s


In [21]:
full_train.drop(columns=drop_columns, inplace=True)

In [22]:
half_bool_columns_to_float_type(full_train)

trafficSource_adwordsClickInfo.isVideoAd
trafficSource_isTrueDirect


In [53]:
full_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 903653 entries, 538675 to 64223
Data columns (total 48 columns):
channelGrouping                                 903653 non-null category
visitNumber                                     903653 non-null int64
visitStartTime                                  903653 non-null datetime64[ns]
device_browser                                  903653 non-null category
device_deviceCategory                           903653 non-null category
device_isMobile                                 903653 non-null bool
device_operatingSystem                          903653 non-null category
geoNetwork_city                                 903653 non-null category
geoNetwork_continent                            903653 non-null category
geoNetwork_country                              903653 non-null category
geoNetwork_metro                                903653 non-null category
geoNetwork_networkDomain                        903653 non-null category
geoNetwork

In [21]:
display_all(full_train.head())

Unnamed: 0,channelGrouping,fullVisitorId,visitNumber,visitStartTime,device_browser,device_deviceCategory,device_isMobile,device_operatingSystem,geoNetwork_city,geoNetwork_continent,geoNetwork_country,geoNetwork_metro,geoNetwork_networkDomain,geoNetwork_region,geoNetwork_subContinent,totals_bounces,totals_hits,totals_newVisits,totals_pageviews,trafficSource_adContent,trafficSource_adwordsClickInfo.adNetworkType,trafficSource_adwordsClickInfo.gclId,trafficSource_adwordsClickInfo.isVideoAd,trafficSource_adwordsClickInfo.page,trafficSource_adwordsClickInfo.slot,trafficSource_campaign,trafficSource_isTrueDirect,trafficSource_keyword,trafficSource_medium,trafficSource_referralPath,trafficSource_source,totals_transactionRevenue
0,Organic Search,1131660440785968503,1,1472830385,Chrome,desktop,False,Windows,Izmir,Asia,Turkey,(not set),ttnet.com.tr,Izmir,Western Asia,1.0,1,1.0,1.0,,,,,,,(not set),,(not provided),organic,,google,
1,Organic Search,377306020877927890,1,1472880147,Firefox,desktop,False,Macintosh,not available in demo dataset,Oceania,Australia,not available in demo dataset,dodo.net.au,not available in demo dataset,Australasia,1.0,1,1.0,1.0,,,,,,,(not set),,(not provided),organic,,google,
2,Organic Search,3895546263509774583,1,1472865386,Chrome,desktop,False,Windows,Madrid,Europe,Spain,(not set),unknown.unknown,Community of Madrid,Southern Europe,1.0,1,1.0,1.0,,,,,,,(not set),,(not provided),organic,,google,
3,Organic Search,4763447161404445595,1,1472881213,UC Browser,desktop,False,Linux,not available in demo dataset,Asia,Indonesia,not available in demo dataset,unknown.unknown,not available in demo dataset,Southeast Asia,1.0,1,1.0,1.0,,,,,,,(not set),,google + online,organic,,google,
4,Organic Search,27294437909732085,2,1472822600,Chrome,mobile,True,Android,not available in demo dataset,Europe,United Kingdom,not available in demo dataset,unknown.unknown,not available in demo dataset,Northern Europe,1.0,1,,1.0,,,,,,,(not set),1.0,(not provided),organic,,google,


In [24]:
%%time
test_df = load_csv('../data/processed_data/processed_test.csv', low_memory=False, nrows=None)

CPU times: user 5.43 s, sys: 404 ms, total: 5.83 s
Wall time: 6.51 s


In [25]:
test_df.drop(columns=drop_columns, inplace=True)

In [26]:
half_bool_columns_to_float_type(test_df)

trafficSource_adwordsClickInfo.isVideoAd
trafficSource_isTrueDirect


In [27]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 804684 entries, 0 to 804683
Data columns (total 31 columns):
channelGrouping                                 804684 non-null object
fullVisitorId                                   804684 non-null object
visitNumber                                     804684 non-null int64
visitStartTime                                  804684 non-null int64
device_browser                                  804684 non-null object
device_deviceCategory                           804684 non-null object
device_isMobile                                 804684 non-null bool
device_operatingSystem                          804684 non-null object
geoNetwork_city                                 804684 non-null object
geoNetwork_continent                            804684 non-null object
geoNetwork_country                              804684 non-null object
geoNetwork_metro                                804684 non-null object
geoNetwork_networkDomain                 

#### Check that column names and dtypes match between train and test

In [58]:
all(full_train.drop('totals_transactionRevenue', axis=1).columns == test_df.columns)

True

In [59]:
types_equal = []
for col_name in test_df.columns:
    types_equal.append(full_train[col_name].dtype == test_df[col_name].dtype)

all(types_equal)

True

In [37]:
check_columns_and_types_same_in(full_train.drop('totals_transactionRevenue', axis=1), test_df)

(True, True)

In [23]:
display_all(full_train.describe(include='all').T)

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
channelGrouping,903653,8.0,Organic Search,381561.0,,,,,,,
fullVisitorId,903653,714167.0,1957458976293878100,278.0,,,,,,,
visitNumber,903653,,,,2.2649,9.28373,1.0,1.0,1.0,1.0,395.0
visitStartTime,903653,,,,1485010000.0,9022120.0,1470030000.0,1477560000.0,1483950000.0,1492760000.0,1501660000.0
device_browser,903653,54.0,Chrome,620364.0,,,,,,,
device_deviceCategory,903653,3.0,desktop,664479.0,,,,,,,
device_isMobile,903653,2.0,False,664530.0,,,,,,,
device_operatingSystem,903653,20.0,Windows,350072.0,,,,,,,
geoNetwork_city,903653,649.0,not available in demo dataset,508229.0,,,,,,,
geoNetwork_continent,903653,6.0,Americas,450377.0,,,,,,,


### Workflow

In [30]:
full_train['totals_transactionRevenue'] = full_train['totals_transactionRevenue'].fillna(value=0)

In [31]:
# posix seconds to datetime
full_train['visitStartTime'] = pd.to_datetime(full_train.visitStartTime, unit='s')

In [32]:
test_df['visitStartTime'] = pd.to_datetime(test_df.visitStartTime, unit='s')

#### Take every fullVisitorId that has a visit after 2017-04-30 as a validation id

In [28]:
validationIds = full_train[full_train.visitStartTime > '2017-04-30'].fullVisitorId.unique()

In [29]:
validationIds.shape

(161118,)

In [30]:
# save for later use; create the target directory first so this cell does
# not fail when ../data/tmp does not exist yet
os.makedirs('../data/tmp', exist_ok=True)
np.save('../data/tmp/validIds.npy',validationIds)

#### Continue workflow: make time feature columns

In [34]:
# make time features
add_datepart(full_train, 'visitStartTime', time=True, drop=False)

In [36]:
# make time features
add_datepart(test_df, 'visitStartTime', time=True, drop=False)

In [37]:
set(full_train.drop('totals_transactionRevenue', axis=1).columns) - set(test_df.columns)

set()

#### Temporarily drop the fullVisitorId column so that it is not converted to a categorical

In [40]:
train_visitorIds = full_train.fullVisitorId
test_visitorIds = test_df.fullVisitorId

In [41]:
full_train.drop(columns=['fullVisitorId'], inplace=True)
test_df.drop(columns=['fullVisitorId'], inplace=True)

In [42]:
# make categorical data
train_cats(full_train)

In [43]:
apply_cats(test_df, full_train)

In [36]:
display_all(full_train.isnull().sum().sort_index()/len(full_train))

channelGrouping                                 0.000000
device_browser                                  0.000000
device_deviceCategory                           0.000000
device_isMobile                                 0.000000
device_operatingSystem                          0.000000
geoNetwork_city                                 0.000000
geoNetwork_continent                            0.000000
geoNetwork_country                              0.000000
geoNetwork_metro                                0.000000
geoNetwork_networkDomain                        0.000000
geoNetwork_region                               0.000000
geoNetwork_subContinent                         0.000000
totals_bounces                                  0.501324
totals_hits                                     0.000000
totals_newVisits                                0.221980
totals_pageviews                                0.000111
totals_transactionRevenue                       0.000000
trafficSource_adContent        

In [37]:
display_all(test_df.isnull().sum().sort_index()/len(test_df))

channelGrouping                                 0.000000
device_browser                                  0.019764
device_deviceCategory                           0.000000
device_isMobile                                 0.000000
device_operatingSystem                          0.001150
geoNetwork_city                                 0.009569
geoNetwork_continent                            0.000000
geoNetwork_country                              0.000034
geoNetwork_metro                                0.001099
geoNetwork_networkDomain                        0.031993
geoNetwork_region                               0.007047
geoNetwork_subContinent                         0.000000
totals_bounces                                  0.476878
totals_hits                                     0.000000
totals_newVisits                                0.248935
totals_pageviews                                0.000173
trafficSource_adContent                         0.994974
trafficSource_adwordsClickInfo.

In [46]:
# add back the fullVisitorId columns that were set aside before categorizing
full_train['fullVisitorId'] = train_visitorIds
test_df['fullVisitorId'] = test_visitorIds

In [50]:
# sort train dataframe by time
full_train.sort_values(by='visitStartTime', inplace=True)

In [54]:
full_train.reset_index(drop=True, inplace=True)

In [55]:
full_train.head()

Unnamed: 0,channelGrouping,visitNumber,visitStartTime,device_browser,device_deviceCategory,device_isMobile,device_operatingSystem,geoNetwork_city,geoNetwork_continent,geoNetwork_country,...,visitStartTimeIs_month_start,visitStartTimeIs_quarter_end,visitStartTimeIs_quarter_start,visitStartTimeIs_year_end,visitStartTimeIs_year_start,visitStartTimeHour,visitStartTimeMinute,visitStartTimeSecond,visitStartTimeElapsed,fullVisitorId
0,Direct,3,2016-08-01 07:00:12,Safari,mobile,True,iOS,not available in demo dataset,Americas,United States,...,True,False,False,False,False,7,0,12,1470034812,423043652415339154
1,Social,1,2016-08-01 07:04:26,Chrome,desktop,False,Windows,not available in demo dataset,Asia,Thailand,...,True,False,False,False,False,7,4,26,1470035066,8294721032567046680
2,Organic Search,1,2016-08-01 07:04:41,Amazon Silk,tablet,True,Android,not available in demo dataset,Americas,United States,...,True,False,False,False,False,7,4,41,1470035081,7718623669497357235
3,Organic Search,1,2016-08-01 07:06:01,Chrome,desktop,False,Windows,not available in demo dataset,Americas,Canada,...,True,False,False,False,False,7,6,1,1470035161,4798058133221713505
4,Direct,1,2016-08-01 07:06:10,Chrome,desktop,False,Windows,Quezon City,Asia,Philippines,...,True,False,False,False,False,7,6,10,1470035170,5076747981380011349


In [56]:
%%time
os.makedirs('../data/tmp', exist_ok=True)
full_train.to_feather('../data/tmp/raw_train')

CPU times: user 313 ms, sys: 169 ms, total: 482 ms
Wall time: 927 ms


In [57]:
%%time
os.makedirs('../data/tmp', exist_ok=True)
test_df.to_feather('../data/tmp/raw_test')

CPU times: user 249 ms, sys: 119 ms, total: 368 ms
Wall time: 206 ms


#### Preprocess

In [16]:
%time full_train = pd.read_feather('../data/tmp/raw_train', nthreads=-1)

  return feather.read_dataframe(path, nthreads=nthreads)


CPU times: user 207 ms, sys: 63.7 ms, total: 271 ms
Wall time: 270 ms


In [36]:
%time test_df = pd.read_feather('../data/tmp/raw_test', nthreads=-1)

  return feather.read_dataframe(path, nthreads=nthreads)


CPU times: user 184 ms, sys: 34 ms, total: 218 ms
Wall time: 216 ms


##### Build validation sets with TimeSeriesSplit

In [31]:
tscv = TimeSeriesSplit(n_splits=5)

In [32]:
logger = get_logger('test')

In [34]:
# sanity check: within each fold the latest training timestamp should come
# before the earliest validation timestamp
for i, (tr_index, val_index) in enumerate(tscv.split(full_train.visitStartTime)):
    logger.debug(f'Working with fold {i}\n===========\n')
    # the second value is the validation split's minimum, so label it as such
    print(f'Train max time: {full_train.iloc[tr_index].visitStartTime.max()}, Valid min time: {full_train.iloc[val_index].visitStartTime.min()}')
    print(f'train_set length: {len(tr_index)}, valid_set length: {len(val_index)}')

2018-10-05 12:24:37,581 - test - DEBUG - Working with fold 0

2018-10-05 12:24:37,581 - test - DEBUG - Working with fold 0

Train max time: 2016-10-03 21:24:49, Valid max time: 2016-10-03 21:25:22
train_set length: 150613, valid_set length: 150608
2018-10-05 12:24:37,629 - test - DEBUG - Working with fold 1

2018-10-05 12:24:37,629 - test - DEBUG - Working with fold 1

Train max time: 2016-11-16 16:56:29, Valid max time: 2016-11-16 16:56:35
train_set length: 301221, valid_set length: 150608
2018-10-05 12:24:37,698 - test - DEBUG - Working with fold 2

2018-10-05 12:24:37,698 - test - DEBUG - Working with fold 2

Train max time: 2017-01-09 08:03:25, Valid max time: 2017-01-09 08:03:30
train_set length: 451829, valid_set length: 150608
2018-10-05 12:24:37,798 - test - DEBUG - Working with fold 3

2018-10-05 12:24:37,798 - test - DEBUG - Working with fold 3

Train max time: 2017-03-19 05:10:52, Valid max time: 2017-03-19 05:13:08
train_set length: 602437, valid_set length: 150608
2018-10-

##### Working on validation pipeline

In [41]:
sample_sz = 100000
# sample_sz = len(full_train)
tscv = TimeSeriesSplit(n_splits=5)

In [42]:
def rmse(x, y):
    """Return the root of the mean squared difference between ``x`` and ``y``.

    Both arguments must support element-wise subtraction and expose
    ``.mean()`` on the result (e.g. numpy arrays or pandas Series).
    """
    residuals = x - y
    mean_sq_err = (residuals ** 2).mean()
    return math.sqrt(mean_sq_err)

In [43]:
def print_score(m):
    """Print train/validation RMSE and score for the fitted model ``m``.

    Reads the notebook globals ``X_train``, ``y_train``, ``X_valid`` and
    ``y_valid``. The printed list is
    ``[rmse_train, rmse_valid, score_train, score_valid]``, with
    ``m.oob_score_`` appended when the model has that attribute.
    """
    train_rmse = rmse(m.predict(X_train), y_train)
    valid_rmse = rmse(m.predict(X_valid), y_valid)
    train_score = m.score(X_train, y_train)
    valid_score = m.score(X_valid, y_valid)
    scores = [train_rmse, valid_rmse, train_score, valid_score]
    if hasattr(m, 'oob_score_'):
        scores += [m.oob_score_]
    print(scores)

In [44]:
full_train_smpl = full_train.iloc[:sample_sz].copy()

In [48]:
%%time
# Walk-forward validation on the sample: for every TimeSeriesSplit fold,
# preprocess the train and validation rows with proc_df, fit a small random
# forest and print its scores.
for i, (tr_index, val_index) in enumerate(tscv.split(full_train_smpl.visitStartTime)):
    
    logger.debug(f'Process train split on fold {i}{LN_SPLITTER}')
    # `nas` comes back from the training call and is handed to the validation
    # call below through na_dict
    # (presumably so both splits get the same missing-value handling -- confirm against proc_df)
    X_train, y_train, nas = proc_df(full_train_smpl.iloc[tr_index], 'totals_transactionRevenue',
                                    skip_flds=['fullVisitorId', 'visitStartTime'], max_n_cat=100)
    
    logger.debug(f'Process valid split on fold {i}{LN_SPLITTER}')
    X_valid, y_valid, _ = proc_df(full_train_smpl.iloc[val_index], 'totals_transactionRevenue',
                                  skip_flds=['fullVisitorId', 'visitStartTime'],
                                  max_n_cat=100, na_dict=nas)
    # both processed frames must expose the same columns with the same dtypes
    print('Sanity check for column names and types')
    print(f'{check_columns_and_types_same_in(X_train, X_valid)}')
    
    # the model is trained and scored on the log1p-transformed target
    y_train, y_valid = np.log1p(y_train), np.log1p(y_valid)
    
    # NOTE(review): no random_state is passed, so scores are not reproducible run to run
    m = RandomForestRegressor(n_jobs=-1, n_estimators=10)
    m.fit(X_train, y_train)
    
    logger.debug(f'Printing score for fold {i}{LN_SPLITTER}')
    # print_score reads X_train / y_train / X_valid / y_valid from the globals assigned above
    print_score(m)

2018-10-05 13:00:20,406 - test - DEBUG - Process train split on fold 0
====

2018-10-05 13:00:20,406 - test - DEBUG - Process train split on fold 0
====

2018-10-05 13:00:20,497 - test - DEBUG - Process valid split on fold 0
====

2018-10-05 13:00:20,497 - test - DEBUG - Process valid split on fold 0
====

Sanity check for column names and types
(True, True)
2018-10-05 13:00:21,090 - test - DEBUG - Printing score for fold 0
====

2018-10-05 13:00:21,090 - test - DEBUG - Printing score for fold 0
====

[0.6791501715578174, 2.7063404431660802, 0.8260798154562069, -0.3844965301278176]
2018-10-05 13:00:22,437 - test - DEBUG - Process train split on fold 1
====

2018-10-05 13:00:22,437 - test - DEBUG - Process train split on fold 1
====

2018-10-05 13:00:22,555 - test - DEBUG - Process valid split on fold 1
====

2018-10-05 13:00:22,555 - test - DEBUG - Process valid split on fold 1
====

Sanity check for column names and types
(True, True)
2018-10-05 13:00:23,545 - test - DEBUG - Printing 

In [48]:
# NOTE(review): `nas` is whatever the last fold of the CV loop above assigned,
# and 'visitStartTime' is not listed in skip_flds here although the CV loop
# skips it -- confirm this matches how `df` was built.
test, _, _ = proc_df(test_df, skip_flds=['fullVisitorId'], max_n_cat=100, na_dict=nas)

In [51]:
all(df.columns == test.columns)

True

In [46]:
y.mean()

0.2421829352036406

In [45]:
y = np.log1p(y)

In [54]:
m = RandomForestRegressor(n_jobs=-1, n_estimators=100)
m.fit(df, y)
m.score(df,y)

0.9029624185260904

#### Train valid split

In [49]:
all(full_train.index == df.index)

ValueError: Lengths must match to compare

##### First simple way to make validation 

In [56]:
# valid_index = full_train[full_train.fullVisitorId.isin(validationIds)].index

In [57]:
# train_index = full_train[~full_train.fullVisitorId.isin(validationIds)].index

In [58]:
# NOTE(review): this cell is commented out, yet `raw_valid` (read in the
# "Go to User Level" cells) is assigned nowhere else in the visible notebook,
# and X_train / X_valid / y_train / y_valid would otherwise be whatever the
# CV loop left behind. Re-enable it (together with the two index cells above)
# before a Restart & Run All -- TODO confirm.
# def split_vals_array(a, train_index, val_index):
#     return a[train_index].copy(), a[val_index].copy()

# def split_vals_df(a, train_index, val_index):
#     return a.iloc[train_index].copy(), a.iloc[val_index].copy()


# raw_train, raw_valid = split_vals_df(full_train, train_index, valid_index)
# X_train, X_valid = split_vals_df(df, train_index, valid_index)
# y_train, y_valid = split_vals_array(y, train_index, valid_index)

# X_train.shape, y_train.shape, X_valid.shape

((681849, 321), (681849,), (221804, 321))

In [59]:
# y_train.mean(), y_valid.mean()

(0.20146147902692166, 0.3059895624328279)

In [60]:
# y_train.max(), y_valid.max()

(23.497337833653027, 23.86437469605166)

#### Base Model 

In [63]:
m_val = RandomForestRegressor(n_jobs=-1, n_estimators=100)
%time m_val.fit(X_train, y_train)

CPU times: user 46min 33s, sys: 50.6 s, total: 47min 24s
Wall time: 6min 57s


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

#### Visit Level

In [64]:
print_score(m_val)

[0.5976430330813959, 2.1541521568709947, 0.8997188839647949, 0.14058583153614346]


#### Go to User Level

In [65]:
val_preds = m_val.predict(X_valid)

In [66]:
val_preds[val_preds < 0] = 0.

In [67]:
grouped = raw_valid[['fullVisitorId']].copy()

In [68]:
grouped['y'] = np.expm1(y_valid)
grouped['y_hat'] = np.expm1(val_preds)

In [69]:
grouped = grouped.groupby('fullVisitorId', as_index=False).sum()

In [70]:
grouped['y'] = np.log1p(grouped['y'])
grouped['y_hat'] = np.log1p(grouped['y_hat'])

In [71]:
rmse(grouped.y, grouped.y_hat)

2.1447894992588807

####  Test flow

In [72]:
test_preds = m.predict(test)
test_preds[test_preds < 0] = 0.

In [76]:
test_grouped = test_df[['fullVisitorId']].copy()
test_grouped['y_hat'] = np.expm1(test_preds)

In [79]:
test_grouped = test_grouped.groupby('fullVisitorId', as_index=False).sum()
test_grouped['y_hat'] = np.log1p(test_grouped['y_hat'])

In [82]:
submit = pd.read_csv('../data/sample_submission.csv')

In [83]:
test_grouped.fullVisitorId.tolist() == submit.fullVisitorId.tolist()

True

In [84]:
submit['fullVisitorId'] = test_grouped['fullVisitorId']

In [85]:
submit['PredictedLogRevenue'] = test_grouped['y_hat']

In [86]:
os.makedirs('../submissions', exist_ok=True)

In [87]:
submit.to_csv('../submissions/rf_baseline.csv', header=True, index=False)