### Imports

In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
import os
import math

In [3]:
import pandas as pd
import numpy as np

In [4]:
from collections import defaultdict

In [5]:
from pandas.api.types import is_string_dtype, is_numeric_dtype

In [6]:
pd.__version__

'0.23.1'

In [7]:
from sklearn.ensemble import RandomForestRegressor
from IPython.display import display

from sklearn import metrics
from sklearn.model_selection import TimeSeriesSplit

In [8]:
import sys
sys.path.append('../src')

In [9]:
from workflow.data_utils import load_csv

In [10]:
from workflow.log_utils import get_logger

In [11]:
# from workflow.structured import add_datepart, train_cats, proc_df, apply_cats

In [12]:
from fastai.structured import add_datepart, train_cats, proc_df, apply_cats

### Helper functions and constants

In [13]:
def display_all(df):
    """Show `df` with pandas' display limits raised to 1000 rows and 1000 columns.

    The limits are set through a temporary option context, so they only
    apply while this frame is being displayed.
    """
    wide_view = ("display.max_rows", 1000, "display.max_columns", 1000)
    with pd.option_context(*wide_view):
        display(df)

In [14]:
def half_bool_columns_to_float_type(df):
    """Convert two-valued "half-boolean" object columns of `df` to float, in place.

    A column is converted when it is string/object typed, has exactly two
    distinct values (as reported by `Series.unique()`), and one of those
    values compares equal to True or False -- e.g. a flag stored as
    True / NaN. The name of every converted column is printed.

    Returns None; `df` itself is modified.
    """
    for col in df.columns:
        # Newer pandas releases report `is_string_dtype` as False for object
        # columns whose elements are not all strings (exactly the True/NaN
        # case targeted here), so object dtype is also checked explicitly.
        if is_string_dtype(df[col]) or df[col].dtype == object:
            unqs = df[col].unique()
            if len(unqs) == 2 and (False in unqs or True in unqs):
                print(col)
                df[col] = df[col].astype(float)

In [15]:
def check_columns_and_types_same_in(a, b):
    """Compare the column layout of two DataFrames.

    Returns a tuple ``(same_names, same_types)``:

    * same_names -- True when `a` and `b` have the same column names in the
      same order. Frames with a different number of columns give False
      instead of raising on the element-wise comparison.
    * same_types -- True when every column of `a` exists in `b` with an
      equal dtype. A column missing from `b` counts as a mismatch instead
      of raising KeyError.
    """
    same_names = list(a.columns) == list(b.columns)
    same_types = all(
        col in b.columns and a[col].dtype == b[col].dtype
        for col in a.columns
    )
    return same_names, same_types

In [16]:
# Columns dropped from both the train and the test frame right after loading;
# per the original note they carry duplicated information.
drop_columns = ['date', 'sessionId', 'visitId']
# Separator appended to the end of debug log messages.
LN_SPLITTER ='\n====\n'

### Loads 

In [24]:
%%time
full_train = load_csv('../data/processed_data/processed_train.csv',
                      low_memory=False, nrows=None)

CPU times: user 6.87 s, sys: 964 ms, total: 7.83 s
Wall time: 8.6 s


In [25]:
full_train.drop(columns=drop_columns, inplace=True)

In [26]:
half_bool_columns_to_float_type(full_train)

trafficSource_adwordsClickInfo.isVideoAd
trafficSource_isTrueDirect


In [27]:
full_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 903653 entries, 0 to 903652
Data columns (total 32 columns):
channelGrouping                                 903653 non-null object
fullVisitorId                                   903653 non-null object
visitNumber                                     903653 non-null int64
visitStartTime                                  903653 non-null int64
device_browser                                  903653 non-null object
device_deviceCategory                           903653 non-null object
device_isMobile                                 903653 non-null bool
device_operatingSystem                          903653 non-null object
geoNetwork_city                                 903653 non-null object
geoNetwork_continent                            903653 non-null object
geoNetwork_country                              903653 non-null object
geoNetwork_metro                                903653 non-null object
geoNetwork_networkDomain                 

In [21]:
display_all(full_train.head())

Unnamed: 0,channelGrouping,fullVisitorId,visitNumber,visitStartTime,device_browser,device_deviceCategory,device_isMobile,device_operatingSystem,geoNetwork_city,geoNetwork_continent,geoNetwork_country,geoNetwork_metro,geoNetwork_networkDomain,geoNetwork_region,geoNetwork_subContinent,totals_bounces,totals_hits,totals_newVisits,totals_pageviews,trafficSource_adContent,trafficSource_adwordsClickInfo.adNetworkType,trafficSource_adwordsClickInfo.gclId,trafficSource_adwordsClickInfo.isVideoAd,trafficSource_adwordsClickInfo.page,trafficSource_adwordsClickInfo.slot,trafficSource_campaign,trafficSource_isTrueDirect,trafficSource_keyword,trafficSource_medium,trafficSource_referralPath,trafficSource_source,totals_transactionRevenue
0,Organic Search,1131660440785968503,1,1472830385,Chrome,desktop,False,Windows,Izmir,Asia,Turkey,(not set),ttnet.com.tr,Izmir,Western Asia,1.0,1,1.0,1.0,,,,,,,(not set),,(not provided),organic,,google,
1,Organic Search,377306020877927890,1,1472880147,Firefox,desktop,False,Macintosh,not available in demo dataset,Oceania,Australia,not available in demo dataset,dodo.net.au,not available in demo dataset,Australasia,1.0,1,1.0,1.0,,,,,,,(not set),,(not provided),organic,,google,
2,Organic Search,3895546263509774583,1,1472865386,Chrome,desktop,False,Windows,Madrid,Europe,Spain,(not set),unknown.unknown,Community of Madrid,Southern Europe,1.0,1,1.0,1.0,,,,,,,(not set),,(not provided),organic,,google,
3,Organic Search,4763447161404445595,1,1472881213,UC Browser,desktop,False,Linux,not available in demo dataset,Asia,Indonesia,not available in demo dataset,unknown.unknown,not available in demo dataset,Southeast Asia,1.0,1,1.0,1.0,,,,,,,(not set),,google + online,organic,,google,
4,Organic Search,27294437909732085,2,1472822600,Chrome,mobile,True,Android,not available in demo dataset,Europe,United Kingdom,not available in demo dataset,unknown.unknown,not available in demo dataset,Northern Europe,1.0,1,,1.0,,,,,,,(not set),1.0,(not provided),organic,,google,


In [28]:
%%time
test_df = load_csv('../data/processed_data/processed_test.csv', low_memory=False, nrows=None)

CPU times: user 6.46 s, sys: 812 ms, total: 7.28 s
Wall time: 7.99 s


In [29]:
test_df.drop(columns=drop_columns, inplace=True)

In [30]:
half_bool_columns_to_float_type(test_df)

trafficSource_adwordsClickInfo.isVideoAd
trafficSource_isTrueDirect


In [31]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 804684 entries, 0 to 804683
Data columns (total 31 columns):
channelGrouping                                 804684 non-null object
fullVisitorId                                   804684 non-null object
visitNumber                                     804684 non-null int64
visitStartTime                                  804684 non-null int64
device_browser                                  804684 non-null object
device_deviceCategory                           804684 non-null object
device_isMobile                                 804684 non-null bool
device_operatingSystem                          804684 non-null object
geoNetwork_city                                 804684 non-null object
geoNetwork_continent                            804684 non-null object
geoNetwork_country                              804684 non-null object
geoNetwork_metro                                804684 non-null object
geoNetwork_networkDomain                 

#### Check column names and types are == for train and test

In [32]:
check_columns_and_types_same_in(full_train.drop('totals_transactionRevenue', axis=1), test_df)

(True, True)

In [23]:
display_all(full_train.describe(include='all').T)

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
channelGrouping,903653,8.0,Organic Search,381561.0,,,,,,,
fullVisitorId,903653,714167.0,1957458976293878100,278.0,,,,,,,
visitNumber,903653,,,,2.2649,9.28373,1.0,1.0,1.0,1.0,395.0
visitStartTime,903653,,,,1485010000.0,9022120.0,1470030000.0,1477560000.0,1483950000.0,1492760000.0,1501660000.0
device_browser,903653,54.0,Chrome,620364.0,,,,,,,
device_deviceCategory,903653,3.0,desktop,664479.0,,,,,,,
device_isMobile,903653,2.0,False,664530.0,,,,,,,
device_operatingSystem,903653,20.0,Windows,350072.0,,,,,,,
geoNetwork_city,903653,649.0,not available in demo dataset,508229.0,,,,,,,
geoNetwork_continent,903653,6.0,Americas,450377.0,,,,,,,


### Workflow

In [33]:
full_train['totals_transactionRevenue'] = full_train['totals_transactionRevenue'].fillna(value=0)

In [34]:
# posix seconds to datetime
full_train['visitStartTime'] = pd.to_datetime(full_train.visitStartTime, unit='s')

In [35]:
test_df['visitStartTime'] = pd.to_datetime(test_df.visitStartTime, unit='s')

#### Take as validation ids every fullVisitorId that has at least one visit after 2017-04-30

In [28]:
validationIds = full_train[full_train.visitStartTime > '2017-04-30'].fullVisitorId.unique()

In [29]:
validationIds.shape

(161118,)

In [30]:
# save for later use
# Create the target directory first: on a fresh checkout '../data/tmp' is
# otherwise only created further down, when the feather files are written.
os.makedirs('../data/tmp', exist_ok=True)
np.save('../data/tmp/validIds.npy',validationIds)

#### Continue workflow: make time feature columns

In [36]:
# make time features
add_datepart(full_train, 'visitStartTime', time=True, drop=False)

In [37]:
# make time features
add_datepart(test_df, 'visitStartTime', time=True, drop=False)

In [38]:
set(full_train.drop('totals_transactionRevenue', axis=1).columns) - set(test_df.columns)

set()

#### Drop fullVisitorId columns to not categorize them

In [39]:
train_visitorIds = full_train.fullVisitorId
test_visitorIds = test_df.fullVisitorId

In [40]:
full_train.drop(columns=['fullVisitorId'], inplace=True)
test_df.drop(columns=['fullVisitorId'], inplace=True)

In [41]:
# make categorical data
train_cats(full_train)

In [42]:
apply_cats(test_df, full_train)

In [43]:
display_all(full_train.isnull().sum().sort_index()/len(full_train))

channelGrouping                                 0.000000
device_browser                                  0.000000
device_deviceCategory                           0.000000
device_isMobile                                 0.000000
device_operatingSystem                          0.000000
geoNetwork_city                                 0.000000
geoNetwork_continent                            0.000000
geoNetwork_country                              0.000000
geoNetwork_metro                                0.000000
geoNetwork_networkDomain                        0.000000
geoNetwork_region                               0.000000
geoNetwork_subContinent                         0.000000
totals_bounces                                  0.501324
totals_hits                                     0.000000
totals_newVisits                                0.221980
totals_pageviews                                0.000111
totals_transactionRevenue                       0.000000
trafficSource_adContent        

In [44]:
display_all(test_df.isnull().sum().sort_index()/len(test_df))

channelGrouping                                 0.000000
device_browser                                  0.019764
device_deviceCategory                           0.000000
device_isMobile                                 0.000000
device_operatingSystem                          0.001150
geoNetwork_city                                 0.009569
geoNetwork_continent                            0.000000
geoNetwork_country                              0.000034
geoNetwork_metro                                0.001099
geoNetwork_networkDomain                        0.031993
geoNetwork_region                               0.007047
geoNetwork_subContinent                         0.000000
totals_bounces                                  0.476878
totals_hits                                     0.000000
totals_newVisits                                0.248935
totals_pageviews                                0.000173
trafficSource_adContent                         0.994974
trafficSource_adwordsClickInfo.

In [45]:
# add back the fullVisitorId columns that were set aside before categorization
full_train['fullVisitorId'] = train_visitorIds
test_df['fullVisitorId'] = test_visitorIds

In [46]:
# sort train dataframe by time
full_train.sort_values(by='visitStartTime', inplace=True)

In [47]:
full_train.reset_index(drop=True, inplace=True)

In [48]:
full_train.head()

Unnamed: 0,channelGrouping,visitNumber,visitStartTime,device_browser,device_deviceCategory,device_isMobile,device_operatingSystem,geoNetwork_city,geoNetwork_continent,geoNetwork_country,...,visitStartTimeIs_month_start,visitStartTimeIs_quarter_end,visitStartTimeIs_quarter_start,visitStartTimeIs_year_end,visitStartTimeIs_year_start,visitStartTimeHour,visitStartTimeMinute,visitStartTimeSecond,visitStartTimeElapsed,fullVisitorId
0,Direct,3,2016-08-01 07:00:12,Safari,mobile,True,iOS,not available in demo dataset,Americas,United States,...,True,False,False,False,False,7,0,12,1470034812,423043652415339154
1,Social,1,2016-08-01 07:04:26,Chrome,desktop,False,Windows,not available in demo dataset,Asia,Thailand,...,True,False,False,False,False,7,4,26,1470035066,8294721032567046680
2,Organic Search,1,2016-08-01 07:04:41,Amazon Silk,tablet,True,Android,not available in demo dataset,Americas,United States,...,True,False,False,False,False,7,4,41,1470035081,7718623669497357235
3,Organic Search,1,2016-08-01 07:06:01,Chrome,desktop,False,Windows,not available in demo dataset,Americas,Canada,...,True,False,False,False,False,7,6,1,1470035161,4798058133221713505
4,Direct,1,2016-08-01 07:06:10,Chrome,desktop,False,Windows,Quezon City,Asia,Philippines,...,True,False,False,False,False,7,6,10,1470035170,5076747981380011349


In [49]:
check_columns_and_types_same_in(full_train.drop(columns=['totals_transactionRevenue']), test_df)

(True, True)

In [51]:
%%time
os.makedirs('../data/tmp', exist_ok=True)
full_train.to_feather('../data/tmp/raw_train')

CPU times: user 352 ms, sys: 252 ms, total: 604 ms
Wall time: 539 ms


In [52]:
%%time
os.makedirs('../data/tmp', exist_ok=True)
test_df.to_feather('../data/tmp/raw_test')

CPU times: user 232 ms, sys: 152 ms, total: 384 ms
Wall time: 281 ms


#### Preprocess

In [17]:
%time full_train = pd.read_feather('../data/tmp/raw_train', nthreads=-1)

  return feather.read_dataframe(path, nthreads=nthreads)


CPU times: user 268 ms, sys: 104 ms, total: 372 ms
Wall time: 372 ms


In [18]:
%time test_df = pd.read_feather('../data/tmp/raw_test', nthreads=-1)

  return feather.read_dataframe(path, nthreads=nthreads)


CPU times: user 196 ms, sys: 128 ms, total: 324 ms
Wall time: 319 ms


##### Building validation sets with TimeSeriesSplit

In [31]:
tscv = TimeSeriesSplit(n_splits=5)

In [19]:
logger = get_logger('new')

In [53]:
# sanity check: for each fold, show where the training part ends and where
# the validation part begins, together with the size of both parts
for i, (tr_index, val_index) in enumerate(tscv.split(full_train.visitStartTime)):
    logger.debug(f'Working with fold {i}\n===========\n')
    train_times = full_train.iloc[tr_index].visitStartTime
    valid_times = full_train.iloc[val_index].visitStartTime
    # The second value is the earliest validation timestamp, so it is
    # labelled as a minimum (the label previously said "max").
    print(f'Train max time: {train_times.max()}, Valid min time: {valid_times.min()}')
    print(f'train_set length: {len(tr_index)}, valid_set length: {len(val_index)}')

2018-10-05 13:16:48,720 - new - DEBUG - Working with fold 0

Train max time: 2016-10-03 21:24:49, Valid max time: 2016-10-03 21:25:22
train_set length: 150613, valid_set length: 150608
2018-10-05 13:16:48,768 - new - DEBUG - Working with fold 1

Train max time: 2016-11-16 16:56:29, Valid max time: 2016-11-16 16:56:35
train_set length: 301221, valid_set length: 150608
2018-10-05 13:16:48,840 - new - DEBUG - Working with fold 2

Train max time: 2017-01-09 08:03:25, Valid max time: 2017-01-09 08:03:30
train_set length: 451829, valid_set length: 150608
2018-10-05 13:16:48,937 - new - DEBUG - Working with fold 3

Train max time: 2017-03-19 05:10:52, Valid max time: 2017-03-19 05:13:08
train_set length: 602437, valid_set length: 150608
2018-10-05 13:16:49,060 - new - DEBUG - Working with fold 4

Train max time: 2017-05-25 16:53:03, Valid max time: 2017-05-25 16:53:20
train_set length: 753045, valid_set length: 150608


##### Working on validation pipeline for feature selection

In [20]:
# Sample sizes. The 100000 values are immediately overridden by the full
# dataset lengths; presumably the second assignment of each pair is meant to
# be commented out for a quick run on a subset -- TODO confirm.
trn_sample_sz = 100000
trn_sample_sz = len(full_train)

tst_sample_sz = 100000
tst_sample_sz = len(test_df)

# Number of folds for the time-ordered cross-validation splitter.
N_SPLITS = 5
tscv = TimeSeriesSplit(n_splits=N_SPLITS)

In [21]:
def rmse(x, y):
    """Return the root of the mean squared difference between `x` and `y` as a float."""
    squared_error = (x - y) ** 2
    return math.sqrt(squared_error.mean())

In [22]:
def print_score(m):
    """Score the fitted model `m` on the current train/validation split.

    Reads the notebook-level `X_train`, `y_train`, `X_valid` and `y_valid`.
    Prints and returns ``[train_rmse, valid_rmse, train_score, valid_score]``,
    with `m.oob_score_` appended when the model has that attribute.
    """
    train_rmse = rmse(m.predict(X_train), y_train)
    valid_rmse = rmse(m.predict(X_valid), y_valid)
    train_score = m.score(X_train, y_train)
    valid_score = m.score(X_valid, y_valid)

    res = [train_rmse, valid_rmse, train_score, valid_score]
    if hasattr(m, 'oob_score_'):
        res += [m.oob_score_]
    print(res)
    return res

In [23]:
train_smpl_data = full_train.iloc[:trn_sample_sz].copy()
# test_smpl_data = test_df.iloc[:tst_sample_sz].copy()

In [24]:
# Layout of the per-feature statistics table: the feature name, one
# validation RMSE and one validation score per fold, then the mean and
# variance of both across folds.
stats_cols = [
    'col_name',
    *(f'val_rmse_{i}' for i in range(N_SPLITS)),
    *(f'val_rsq_{i}' for i in range(N_SPLITS)),
    'val_mean_rmse', 'val_mean_rsq', 'val_var_rmse', 'val_var_rsq',
]

# One empty list per column; values are appended as features are evaluated.
stats_dict = dict((name, []) for name in stats_cols)

In [25]:
# The time, visitor-id and target columns are always kept; every other
# column of the sample is a candidate feature.
columns_to_use = ['visitStartTime', 'fullVisitorId', 'totals_transactionRevenue']
all_features = train_smpl_data.drop(columns=columns_to_use).columns

In [26]:
%%time
for feature_name in all_features:
    logger.debug(f'Adding feature name: {feature_name}{LN_SPLITTER}')
    # append feature name
    columns_to_use.append(feature_name)
    
    # create sample to train and validate model on
    full_train_smpl = train_smpl_data[columns_to_use].copy()
    
    # test_preds = np.zeros(len(test_smpl))

    # create array to collect metrics for every new data sample
    metrics = np.zeros((N_SPLITS, 4))

    stats_dict['col_name'].append(feature_name)

    for i, (tr_index, val_index) in enumerate(tscv.split(full_train_smpl.visitStartTime)):

        logger.debug(f'Process train split on fold {i}{LN_SPLITTER}')
        X_train, y_train, nas = proc_df(full_train_smpl.iloc[tr_index], 'totals_transactionRevenue',
                                        skip_flds=['fullVisitorId', 'visitStartTime'], max_n_cat=100)

        logger.debug(f'Process valid split on fold {i}{LN_SPLITTER}')
        X_valid, y_valid, _ = proc_df(full_train_smpl.iloc[val_index], 'totals_transactionRevenue',
                                      skip_flds=['fullVisitorId', 'visitStartTime'],
                                      max_n_cat=100, na_dict=nas)
    #     if i == 0:
    #         logger.debug(f'Process test_set on fold {i}{LN_SPLITTER}')
    #         X_test, _, _ = proc_df(test_smpl, skip_flds=['fullVisitorId', 'visitStartTime'],
    #                                max_n_cat=100, na_dict=nas)

        print('Sanity check for column names and types for train and valid')
        print(f'{check_columns_and_types_same_in(X_train, X_valid)}')

    #     print('Sanity check for column names and types for train and test')
    #     print(f'{check_columns_and_types_same_in(X_train, X_test)}')

        # log(y) for training the model IMPORTANT!!!
        y_train, y_valid = np.log1p(y_train), np.log1p(y_valid)
        
        # train model
        m = RandomForestRegressor(n_jobs=-1, n_estimators=10)
        m.fit(X_train, y_train)

        logger.debug(f'Printing score for fold {i}{LN_SPLITTER}')
        result = print_score(m)
        metrics[i] = result

        stats_dict[f'val_rmse_{i}'].append(result[1])
        stats_dict[f'val_rsq_{i}'].append(result[3])

    #     logger.debug(f'Predicting test on iteration: {i}{LN_SPLITTER}')
    #     y_hat = m.predict(X_test)
    #     y_hat[y_hat < 0] = 0 
    #     test_preds += y_hat / N_SPLITS

    stats_dict['val_mean_rmse'].append(metrics[:,1].mean())
    stats_dict['val_mean_rsq'].append(metrics[:,3].mean())
    stats_dict['val_var_rmse'].append(metrics[:,1].var())
    stats_dict['val_var_rsq'].append(metrics[:,3].var())
    


2018-10-05 12:26:34,052 - new - DEBUG - Adding feature name: channelGrouping
====

2018-10-05 12:26:34,121 - new - DEBUG - Process train split on fold 0
====

2018-10-05 12:26:34,162 - new - DEBUG - Process valid split on fold 0
====

Sanity check for column names and types for train and valid
(True, True)
2018-10-05 12:26:34,413 - new - DEBUG - Printing score for fold 0
====

[2.0551776691368198, 1.618670761376283, 0.02141865195446324, 0.014403368334703792]
2018-10-05 12:26:34,839 - new - DEBUG - Process train split on fold 1
====

2018-10-05 12:26:34,894 - new - DEBUG - Process valid split on fold 1
====

Sanity check for column names and types for train and valid
(True, True)
2018-10-05 12:26:35,246 - new - DEBUG - Printing score for fold 1
====

[1.849506140748355, 2.037474364319672, 0.0197127225316025, 0.020994898036929666]
2018-10-05 12:26:35,679 - new - DEBUG - Process train split on fold 2
====

2018-10-05 12:26:35,755 - new - DEBUG - Process valid split on fold 2
====

Sanity 

2018-10-05 12:27:38,608 - new - DEBUG - Printing score for fold 0
====

[2.024279972046746, 1.620649806275661, 0.05062160217811295, 0.01199184347941129]
2018-10-05 12:27:41,296 - new - DEBUG - Process train split on fold 1
====

2018-10-05 12:27:41,532 - new - DEBUG - Process valid split on fold 1
====

Sanity check for column names and types for train and valid
(True, True)
2018-10-05 12:27:44,991 - new - DEBUG - Printing score for fold 1
====

[1.8267698381673785, 2.0383986661576383, 0.04366626543930208, 0.020106443699137277]
2018-10-05 12:27:48,791 - new - DEBUG - Process train split on fold 2
====

2018-10-05 12:27:49,113 - new - DEBUG - Process valid split on fold 2
====

Sanity check for column names and types for train and valid
(True, True)
2018-10-05 12:27:53,868 - new - DEBUG - Printing score for fold 2
====

[1.8968020931014908, 1.882546099331648, 0.03805762271718338, 0.015526037839036945]
2018-10-05 12:27:58,901 - new - DEBUG - Process train split on fold 3
====

2018-10-05

Sanity check for column names and types for train and valid
(True, True)
2018-10-05 12:32:33,267 - new - DEBUG - Printing score for fold 1
====

[1.7230257309072559, 2.1020840372439187, 0.1492042003233197, -0.04207936798049894]
2018-10-05 12:32:39,235 - new - DEBUG - Process train split on fold 2
====

2018-10-05 12:32:39,737 - new - DEBUG - Process valid split on fold 2
====

Sanity check for column names and types for train and valid
(True, True)
2018-10-05 12:32:49,841 - new - DEBUG - Printing score for fold 2
====

[1.8050884498035422, 1.9229709942008826, 0.12883184495708733, -0.02720816140799953]
2018-10-05 12:32:57,988 - new - DEBUG - Process train split on fold 3
====

2018-10-05 12:32:58,634 - new - DEBUG - Process valid split on fold 3
====

Sanity check for column names and types for train and valid
(True, True)
2018-10-05 12:33:13,672 - new - DEBUG - Printing score for fold 3
====

[1.8068598932402973, 2.170373926755587, 0.1188588704124236, 0.0025445012732260297]
2018-10-05 

2018-10-05 12:42:59,011 - new - DEBUG - Process valid split on fold 2
====

Sanity check for column names and types for train and valid
(True, True)
2018-10-05 12:43:14,634 - new - DEBUG - Printing score for fold 2
====

[1.6801024992529823, 1.986377946746318, 0.24529613727240898, -0.0960661466941557]
2018-10-05 12:43:28,009 - new - DEBUG - Process train split on fold 3
====

2018-10-05 12:43:29,430 - new - DEBUG - Process valid split on fold 3
====

Sanity check for column names and types for train and valid
(True, True)
2018-10-05 12:43:52,603 - new - DEBUG - Printing score for fold 3
====

[1.6867583432745077, 2.2270517570120107, 0.23210427187106542, -0.05023145687687758]
2018-10-05 12:44:09,073 - new - DEBUG - Process train split on fold 4
====

2018-10-05 12:44:10,849 - new - DEBUG - Process valid split on fold 4
====

Sanity check for column names and types for train and valid
(True, True)
2018-10-05 12:44:41,662 - new - DEBUG - Printing score for fold 4
====

[1.7479026485690041

2018-10-05 12:55:45,924 - new - DEBUG - Printing score for fold 1
====

[0.7692264959152285, 1.8238548478236696, 0.8304295076587781, 0.21552123426055536]
2018-10-05 12:55:57,973 - new - DEBUG - Process train split on fold 2
====

2018-10-05 12:55:59,274 - new - DEBUG - Process valid split on fold 2
====

Sanity check for column names and types for train and valid
(True, True)
2018-10-05 12:56:18,588 - new - DEBUG - Printing score for fold 2
====

[0.7994536098652917, 1.696896859254562, 0.8291198750769415, 0.200121753110855]
2018-10-05 12:56:34,565 - new - DEBUG - Process train split on fold 3
====

2018-10-05 12:56:36,392 - new - DEBUG - Process valid split on fold 3
====

Sanity check for column names and types for train and valid
(True, True)
2018-10-05 12:57:03,666 - new - DEBUG - Printing score for fold 3
====

[0.8045654220923131, 1.89028426619089, 0.825289403941505, 0.24337856468675156]
2018-10-05 12:57:23,490 - new - DEBUG - Process train split on fold 4
====

2018-10-05 12:57:2

2018-10-05 13:08:24,004 - new - DEBUG - Process valid split on fold 2
====

Sanity check for column names and types for train and valid
(True, True)
2018-10-05 13:08:45,847 - new - DEBUG - Printing score for fold 2
====

[0.800339853041585, 1.6855312708326717, 0.8287408029611878, 0.21080082467559158]
2018-10-05 13:09:02,074 - new - DEBUG - Process train split on fold 3
====

2018-10-05 13:09:03,971 - new - DEBUG - Process valid split on fold 3
====

Sanity check for column names and types for train and valid
(True, True)
2018-10-05 13:09:34,474 - new - DEBUG - Printing score for fold 3
====

[0.8036888349484986, 1.8878871691406207, 0.8256698966372882, 0.24529631330967502]
2018-10-05 13:09:54,742 - new - DEBUG - Process train split on fold 4
====

2018-10-05 13:09:57,203 - new - DEBUG - Process valid split on fold 4
====

Sanity check for column names and types for train and valid
(True, True)
2018-10-05 13:10:35,271 - new - DEBUG - Printing score for fold 4
====

[0.8394265314856552, 1

2018-10-05 13:23:21,367 - new - DEBUG - Process valid split on fold 3
====

Sanity check for column names and types for train and valid
(True, True)
2018-10-05 13:23:56,041 - new - DEBUG - Printing score for fold 3
====

[0.7948590945812942, 1.8772279213576462, 0.829479415370297, 0.25379455769041515]
2018-10-05 13:24:19,200 - new - DEBUG - Process train split on fold 4
====

2018-10-05 13:24:21,832 - new - DEBUG - Process valid split on fold 4
====

Sanity check for column names and types for train and valid
(True, True)
2018-10-05 13:25:04,427 - new - DEBUG - Printing score for fold 4
====

[0.8305438910908894, 1.8718104550632884, 0.8235419533233023, 0.22862175422777187]
2018-10-05 13:25:31,837 - new - DEBUG - Adding feature name: trafficSource_medium
====

2018-10-05 13:25:31,966 - new - DEBUG - Process train split on fold 0
====

2018-10-05 13:25:32,392 - new - DEBUG - Process valid split on fold 0
====

Sanity check for column names and types for train and valid
(True, True)
2018-1

2018-10-05 13:39:09,036 - new - DEBUG - Process valid split on fold 4
====

Sanity check for column names and types for train and valid
(True, True)
2018-10-05 13:39:53,029 - new - DEBUG - Printing score for fold 4
====

[0.7850678380621627, 1.8673492023525586, 0.8423366816597458, 0.2322943614474723]
2018-10-05 13:40:19,689 - new - DEBUG - Adding feature name: visitStartTimeMonth
====

2018-10-05 13:40:19,829 - new - DEBUG - Process train split on fold 0
====

2018-10-05 13:40:20,181 - new - DEBUG - Process valid split on fold 0
====

Sanity check for column names and types for train and valid
(True, True)
2018-10-05 13:40:25,360 - new - DEBUG - Printing score for fold 0
====

[0.8282142246242004, 1.4754650442532173, 0.8410781692103438, 0.18108275534344198]
2018-10-05 13:40:34,442 - new - DEBUG - Process train split on fold 1
====

2018-10-05 13:40:35,382 - new - DEBUG - Process valid split on fold 1
====

Sanity check for column names and types for train and valid
(True, True)
2018-10

2018-10-05 13:54:30,159 - new - DEBUG - Process train split on fold 0
====

2018-10-05 13:54:30,566 - new - DEBUG - Process valid split on fold 0
====

Sanity check for column names and types for train and valid
(True, True)
2018-10-05 13:54:35,931 - new - DEBUG - Printing score for fold 0
====

[0.8124091149115429, 1.5216907793046048, 0.8470858187436541, 0.1289662465701027]
2018-10-05 13:54:45,157 - new - DEBUG - Process train split on fold 1
====

2018-10-05 13:54:46,223 - new - DEBUG - Process valid split on fold 1
====

Sanity check for column names and types for train and valid
(True, True)
2018-10-05 13:54:57,456 - new - DEBUG - Printing score for fold 1
====

[0.7106105165536197, 1.8501936755476762, 0.8552878246606713, 0.19269985027746284]
2018-10-05 13:55:11,071 - new - DEBUG - Process train split on fold 2
====

2018-10-05 13:55:12,702 - new - DEBUG - Process valid split on fold 2
====

Sanity check for column names and types for train and valid
(True, True)
2018-10-05 13:55:3

2018-10-05 14:10:45,806 - new - DEBUG - Process valid split on fold 3
====

Sanity check for column names and types for train and valid
(True, True)
2018-10-05 14:11:25,113 - new - DEBUG - Printing score for fold 3
====

[0.7197164299247057, 1.842848240258934, 0.860196079119491, 0.2808763920252373]
2018-10-05 14:11:51,243 - new - DEBUG - Process train split on fold 4
====

2018-10-05 14:11:54,505 - new - DEBUG - Process valid split on fold 4
====

Sanity check for column names and types for train and valid
(True, True)
2018-10-05 14:13:01,305 - new - DEBUG - Printing score for fold 4
====

[0.7306913256708576, 1.8194393641319984, 0.8634209164031982, 0.2711824489914789]
2018-10-05 14:13:38,775 - new - DEBUG - Adding feature name: visitStartTimeIs_year_end
====

2018-10-05 14:13:39,015 - new - DEBUG - Process train split on fold 0
====

2018-10-05 14:13:39,667 - new - DEBUG - Process valid split on fold 0
====

Sanity check for column names and types for train and valid
(True, True)
2018

2018-10-05 14:34:27,907 - new - DEBUG - Process train split on fold 0
====

2018-10-05 14:34:28,439 - new - DEBUG - Process valid split on fold 0
====

Sanity check for column names and types for train and valid
(True, True)
2018-10-05 14:34:34,932 - new - DEBUG - Printing score for fold 0
====

[0.8045337349605907, 1.5074213251092279, 0.8500361064860298, 0.1452256600285473]
2018-10-05 14:34:46,206 - new - DEBUG - Process train split on fold 1
====

2018-10-05 14:34:47,496 - new - DEBUG - Process valid split on fold 1
====

Sanity check for column names and types for train and valid
(True, True)
2018-10-05 14:35:02,413 - new - DEBUG - Printing score for fold 1
====

[0.7161472500271238, 1.8332522374019669, 0.8530239849467538, 0.20741637351321662]
2018-10-05 14:35:19,632 - new - DEBUG - Process train split on fold 2
====

2018-10-05 14:35:21,784 - new - DEBUG - Process valid split on fold 2
====

Sanity check for column names and types for train and valid
(True, True)
2018-10-05 14:35:5

In [27]:
# Persist the collected per-feature statistics as a feather file under ../log.
os.makedirs('../log', exist_ok=True)
stats_df = pd.DataFrame(stats_dict)
stats_df.to_feather('../log/ohe_features_log')

In [191]:
stats_df = pd.read_feather('../log/ohe_features_log')

  return feather.read_dataframe(path, nthreads=nthreads)


In [192]:
stats_df

Unnamed: 0,col_name,val_rmse_0,val_rmse_1,val_rmse_2,val_rmse_3,val_rmse_4,val_rsq_0,val_rsq_1,val_rsq_2,val_rsq_3,val_rsq_4,val_mean_rmse,val_mean_rsq,val_var_rmse,val_var_rsq
0,channelGrouping,2.284647,2.338164,2.403843,1.940202,1.957358,0.013345,0.026949,0.019899,0.02167,0.020552,2.184843,0.020483,0.038606,1.9e-05
1,visitNumber,2.296991,2.346918,2.394011,1.934502,1.954739,0.002654,0.019649,0.0279,0.02741,0.023171,2.185432,0.020157,0.039643,8.6e-05


##### First simple way to make validation 

In [56]:
# valid_index = full_train[full_train.fullVisitorId.isin(validationIds)].index

In [57]:
# train_index = full_train[~full_train.fullVisitorId.isin(validationIds)].index

In [58]:
# def split_vals_array(a, train_index, val_index):
#     return a[train_index].copy(), a[val_index].copy()

# def split_vals_df(a, train_index, val_index):
#     return a.iloc[train_index].copy(), a.iloc[val_index].copy()


# raw_train, raw_valid = split_vals_df(full_train, train_index, valid_index)
# X_train, X_valid = split_vals_df(df, train_index, valid_index)
# y_train, y_valid = split_vals_array(y, train_index, valid_index)

# X_train.shape, y_train.shape, X_valid.shape

((681849, 321), (681849,), (221804, 321))

In [59]:
# y_train.mean(), y_valid.mean()

(0.20146147902692166, 0.3059895624328279)

In [60]:
# y_train.max(), y_valid.max()

(23.497337833653027, 23.86437469605166)

#### CV and test

In [20]:
# Same settings as in the feature-selection section, repeated here --
# presumably so this section can be run on its own (TODO confirm).
# The 100000 values are immediately overridden by the full dataset lengths.
trn_sample_sz = 100000
trn_sample_sz = len(full_train)

tst_sample_sz = 100000
tst_sample_sz = len(test_df)

# Number of folds for the time-ordered cross-validation splitter.
N_SPLITS = 5
tscv = TimeSeriesSplit(n_splits=N_SPLITS)

In [21]:
def rmse(x, y):
    """Root mean squared error between `x` and `y`, returned as a float.

    Re-definition of the helper from the feature-selection section.
    """
    residuals = x - y
    mean_squared = (residuals ** 2).mean()
    return math.sqrt(mean_squared)

In [22]:
def print_score(m):
    """Print and return train/validation scores for the fitted model ``m``.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_valid`` and
    ``y_valid``. The returned list is
    ``[train_rmse, valid_rmse, train_score, valid_score]``, with
    ``m.oob_score_`` appended when the model has that attribute.
    """
    train_rmse = rmse(m.predict(X_train), y_train)
    valid_rmse = rmse(m.predict(X_valid), y_valid)
    train_score = m.score(X_train, y_train)
    valid_score = m.score(X_valid, y_valid)

    scores = [train_rmse, valid_rmse, train_score, valid_score]
    if hasattr(m, 'oob_score_'):
        scores += [m.oob_score_]

    print(scores)
    return scores

In [None]:
# Columns of one experiment-log row: the description, one validation RMSE and
# one validation score (rsq) per fold, their mean and variance across folds,
# and the submission file name.
stats_cols = [
    'experiment',
    *(f'val_rmse_{fold}' for fold in range(N_SPLITS)),
    *(f'val_rsq_{fold}' for fold in range(N_SPLITS)),
    'val_mean_rmse',
    'val_mean_rsq',
    'val_var_rmse',
    'val_var_rsq',
    'submission_fname',
]

# Every column starts out as its own empty list.
stats_dict = dict((col, []) for col in stats_cols)

In [None]:
# TODO: add the column names you want to use for the experiment here.
# The same list is used to select columns from both full_train and test_df.
# 'totals_transactionRevenue' is the target handed to proc_df, while
# 'visitStartTime' and 'fullVisitorId' are passed to it as skip_flds.
# NOTE(review): test_df is indexed with this list too, so it must contain
# every column named here, including the target - confirm.
columns_to_use = ['visitStartTime', 'fullVisitorId', 'totals_transactionRevenue']

In [23]:
# Keep only the experiment columns and the leading rows of each frame;
# .copy() gives the samples their own data, independent of the source frames.
full_train_smpl = full_train[columns_to_use].head(trn_sample_sz).copy()
test_smpl = test_df[columns_to_use].head(tst_sample_sz).copy()

In [None]:
# This pipeline is for testing selected features on test set.
# For every TimeSeriesSplit fold it fits a random forest on the log1p target,
# records train/validation scores and adds that fold's share of the test-set
# prediction to test_preds.

# Experiment metadata is appended like every other stats_dict column, so the
# entries stay lists instead of being replaced by bare strings.
# TODO: replace the placeholder with a detailed experiment description.
stats_dict['experiment'].append("""
Here should be the detailed experiment 
description
""")
# TODO!!! should change fname according to experiment
submission_fname = "unique_name.csv"
stats_dict['submission_fname'].append(submission_fname)

# Test-set predictions, averaged over the per-fold models.
test_preds = np.zeros(len(test_smpl))

# One row per fold: [train_rmse, valid_rmse, train_score, valid_score].
# Named fold_metrics so that the sklearn `metrics` module imported at the top
# of the notebook is not shadowed.
fold_metrics = np.zeros((N_SPLITS, 4))

for i, (tr_index, val_index) in enumerate(tscv.split(full_train_smpl.visitStartTime)):

    logger.debug(f'Process train split on fold {i}{LN_SPLITTER}')
    X_train, y_train, nas = proc_df(full_train_smpl.iloc[tr_index], 'totals_transactionRevenue',
                                    skip_flds=['fullVisitorId', 'visitStartTime'], max_n_cat=100)

    # The validation split reuses the na_dict returned for the train split.
    logger.debug(f'Process valid split on fold {i}{LN_SPLITTER}')
    X_valid, y_valid, _ = proc_df(full_train_smpl.iloc[val_index], 'totals_transactionRevenue',
                                  skip_flds=['fullVisitorId', 'visitStartTime'],
                                  max_n_cat=100, na_dict=nas)
    if i == 0:
        # The test set is processed once, with the na_dict of the first fold.
        logger.debug(f'Process test_set on fold {i}{LN_SPLITTER}')
        X_test, _, _ = proc_df(test_smpl, skip_flds=['fullVisitorId', 'visitStartTime'],
                               max_n_cat=100, na_dict=nas)

    print('Sanity check for column names and types for train and valid')
    print(f'{check_columns_and_types_same_in(X_train, X_valid)}')

    print('Sanity check for column names and types for train and test')
    print(f'{check_columns_and_types_same_in(X_train, X_test)}')

    # log(y) for training the model IMPORTANT!!!
    y_train, y_valid = np.log1p(y_train), np.log1p(y_valid)

    # train model
    # NOTE(review): no random_state is passed, so scores can differ between runs.
    m = RandomForestRegressor(n_jobs=-1, n_estimators=100)
    m.fit(X_train, y_train)

    logger.debug(f'Printing score for fold {i}{LN_SPLITTER}')
    result = print_score(m)
    # print_score appends oob_score_ when the model has one; keep only the four
    # train/valid values so the row assignment cannot fail on a length mismatch.
    fold_metrics[i] = result[:4]

    stats_dict[f'val_rmse_{i}'].append(result[1])
    stats_dict[f'val_rsq_{i}'].append(result[3])

    logger.debug(f'Predicting test on iteration: {i}{LN_SPLITTER}')
    y_hat = m.predict(X_test)
    # Negative predictions are clipped to zero.
    y_hat[y_hat < 0] = 0
    # Each fold contributes an equal share to the averaged test prediction.
    test_preds += y_hat / N_SPLITS

# Column 1 holds the validation RMSE, column 3 the validation score.
stats_dict['val_mean_rmse'].append(fold_metrics[:, 1].mean())
stats_dict['val_mean_rsq'].append(fold_metrics[:, 3].mean())
stats_dict['val_var_rmse'].append(fold_metrics[:, 1].var())
stats_dict['val_var_rsq'].append(fold_metrics[:, 3].var())

In [None]:
# Turn the experiment stats into a table and make sure the log directory exists.
stats_df = pd.DataFrame(data=stats_dict)
os.makedirs(name='../log', exist_ok=True)

In [None]:
# First experiment only: create the log file and write the header row.
stats_df.to_csv('../log/experiments.csv', index=False, header=True)

In [None]:
# Every later experiment: append the rows to the existing log file without
# repeating the header.
with open('../log/experiments.csv', 'a') as log_file:
    stats_df.to_csv(log_file, header=False, index=False)

In [108]:
# Sanity check for test prediction: should be around 0.20-0.25
# (test_preds is the fold-averaged output of models fitted on log1p targets).
test_preds.mean()

0.22253156325734827

####  Test flow

In [72]:
# One row per test record: the visitor id plus the prediction mapped back from
# the log1p scale with expm1.
test_grouped = test_df[['fullVisitorId']].assign(y_hat=np.expm1(test_preds))

In [73]:
# Sum the predictions of each visitor, then apply log1p to the per-visitor total.
test_grouped = (
    test_grouped
    .groupby('fullVisitorId', as_index=False)
    .sum()
    .assign(y_hat=lambda per_visitor: np.log1p(per_visitor['y_hat']))
)

In [74]:
# Load the sample submission; its columns are filled in by the cells below.
# NOTE(review): column dtypes are inferred by read_csv here - confirm that
# fullVisitorId is parsed the same way as in test_df.
submit = pd.read_csv('../data/sample_submission.csv')

In [75]:
# True only when both frames list the same visitor ids in the same order.
test_grouped.fullVisitorId.tolist() == submit.fullVisitorId.tolist()

True

In [76]:
# Overwrite the id column with the ids from the grouped predictions.
submit['fullVisitorId'] = test_grouped['fullVisitorId']

In [77]:
# Store the per-visitor prediction (log1p of the summed revenue) in the submission.
submit['PredictedLogRevenue'] = test_grouped['y_hat']

In [78]:
# Create the output directory if needed; exist_ok=True tolerates an existing one.
os.makedirs('../submissions', exist_ok=True)

In [79]:
# Write the submission under the file name chosen for this experiment
# (submission_fname), with a header row and without the index column.
submit.to_csv(f'../submissions/{submission_fname}', header=True, index=False)