### Imports

In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
import os
import math

In [3]:
import pandas as pd
import numpy as np

In [4]:
from pandas.api.types import is_string_dtype, is_numeric_dtype

In [5]:
pd.__version__

'0.23.4'

In [6]:
from sklearn.ensemble import RandomForestRegressor
from IPython.display import display

from sklearn import metrics

In [7]:
import sys
sys.path.append('../src')

In [8]:
from workflow.data_utils import load_csv, train_valid_split

In [9]:
from workflow.structured import add_datepart, train_cats, proc_df, apply_cats

### Constants

In [10]:
def display_all(df):
    with pd.option_context("display.max_rows", 1000, "display.max_columns", 1000): 
        display(df)

In [11]:
def half_bool_columns_to_float_type(df):
    for col in df.columns:
        if is_string_dtype(df[col]):
            unqs = df[col].unique()
            if len(unqs) == 2 and (False in unqs or True in unqs):
                print(col)
                df[col] = df[col].astype(float)

### Loads 

In [12]:
%%time
full_train = load_csv('../data/processed_data/processed_train.csv',
                      low_memory=False, nrows=10000)

CPU times: user 48.7 ms, sys: 11.2 ms, total: 59.9 ms
Wall time: 242 ms


In [13]:
full_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 35 columns):
channelGrouping                                 10000 non-null object
date                                            10000 non-null int64
fullVisitorId                                   10000 non-null object
sessionId                                       10000 non-null object
visitId                                         10000 non-null int64
visitNumber                                     10000 non-null int64
visitStartTime                                  10000 non-null int64
device_browser                                  10000 non-null object
device_deviceCategory                           10000 non-null object
device_isMobile                                 10000 non-null bool
device_operatingSystem                          10000 non-null object
geoNetwork_city                                 10000 non-null object
geoNetwork_continent                            10000 non-

In [14]:
half_bool_columns_to_float_type(full_train)

trafficSource_adwordsClickInfo.isVideoAd
trafficSource_isTrueDirect


In [15]:
display_all(full_train.head())

Unnamed: 0,channelGrouping,date,fullVisitorId,sessionId,visitId,visitNumber,visitStartTime,device_browser,device_deviceCategory,device_isMobile,device_operatingSystem,geoNetwork_city,geoNetwork_continent,geoNetwork_country,geoNetwork_metro,geoNetwork_networkDomain,geoNetwork_region,geoNetwork_subContinent,totals_bounces,totals_hits,totals_newVisits,totals_pageviews,trafficSource_adContent,trafficSource_adwordsClickInfo.adNetworkType,trafficSource_adwordsClickInfo.gclId,trafficSource_adwordsClickInfo.isVideoAd,trafficSource_adwordsClickInfo.page,trafficSource_adwordsClickInfo.slot,trafficSource_campaign,trafficSource_isTrueDirect,trafficSource_keyword,trafficSource_medium,trafficSource_referralPath,trafficSource_source,totals_transactionRevenue
0,Organic Search,20160902,1131660440785968503,1131660440785968503_1472830385,1472830385,1,1472830385,Chrome,desktop,False,Windows,Izmir,Asia,Turkey,(not set),ttnet.com.tr,Izmir,Western Asia,1.0,1,1.0,1.0,,,,,,,(not set),,(not provided),organic,,google,
1,Organic Search,20160902,377306020877927890,377306020877927890_1472880147,1472880147,1,1472880147,Firefox,desktop,False,Macintosh,not available in demo dataset,Oceania,Australia,not available in demo dataset,dodo.net.au,not available in demo dataset,Australasia,1.0,1,1.0,1.0,,,,,,,(not set),,(not provided),organic,,google,
2,Organic Search,20160902,3895546263509774583,3895546263509774583_1472865386,1472865386,1,1472865386,Chrome,desktop,False,Windows,Madrid,Europe,Spain,(not set),unknown.unknown,Community of Madrid,Southern Europe,1.0,1,1.0,1.0,,,,,,,(not set),,(not provided),organic,,google,
3,Organic Search,20160902,4763447161404445595,4763447161404445595_1472881213,1472881213,1,1472881213,UC Browser,desktop,False,Linux,not available in demo dataset,Asia,Indonesia,not available in demo dataset,unknown.unknown,not available in demo dataset,Southeast Asia,1.0,1,1.0,1.0,,,,,,,(not set),,google + online,organic,,google,
4,Organic Search,20160902,27294437909732085,27294437909732085_1472822600,1472822600,2,1472822600,Chrome,mobile,True,Android,not available in demo dataset,Europe,United Kingdom,not available in demo dataset,unknown.unknown,not available in demo dataset,Northern Europe,1.0,1,,1.0,,,,,,,(not set),1.0,(not provided),organic,,google,


In [15]:
%%time
test_df = load_csv('../data/processed_data/processed_test.csv', low_memory=False, nrows=10000)

CPU times: user 46.6 ms, sys: 9.58 ms, total: 56.1 ms
Wall time: 125 ms


In [16]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 34 columns):
channelGrouping                                 10000 non-null object
date                                            10000 non-null int64
fullVisitorId                                   10000 non-null object
sessionId                                       10000 non-null object
visitId                                         10000 non-null int64
visitNumber                                     10000 non-null int64
visitStartTime                                  10000 non-null int64
device_browser                                  10000 non-null object
device_deviceCategory                           10000 non-null object
device_isMobile                                 10000 non-null bool
device_operatingSystem                          10000 non-null object
geoNetwork_city                                 10000 non-null object
geoNetwork_continent                            10000 non-

In [17]:
half_bool_columns_to_float_type(test_df)

trafficSource_adwordsClickInfo.isVideoAd
trafficSource_isTrueDirect


In [18]:
all(full_train.drop('totals_transactionRevenue', axis=1).columns == test_df.columns)

True

### Workflow

In [19]:
full_train['totals_transactionRevenue'] = full_train['totals_transactionRevenue'].fillna(value=0)

In [21]:
display_all(full_train.describe(include='all').T)

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
channelGrouping,903653,8.0,Organic Search,381561.0,,,,,,,
date,903653,,,,20165900.0,4697.7,20160800.0,20161000.0,20170100.0,20170400.0,20170800.0
fullVisitorId,903653,714167.0,1957458976293878100,278.0,,,,,,,
sessionId,903653,902755.0,1198068166576036308_1493621869,2.0,,,,,,,
visitId,903653,,,,1485010000.0,9022120.0,1470030000.0,1477560000.0,1483950000.0,1492760000.0,1501660000.0
visitNumber,903653,,,,2.2649,9.28373,1.0,1.0,1.0,1.0,395.0
visitStartTime,903653,,,,1485010000.0,9022120.0,1470030000.0,1477560000.0,1483950000.0,1492760000.0,1501660000.0
device_browser,903653,54.0,Chrome,620364.0,,,,,,,
device_deviceCategory,903653,3.0,desktop,664479.0,,,,,,,
device_isMobile,903653,2.0,False,664530.0,,,,,,,


In [20]:
# posix seconds to datetime
full_train.visitStartTime = pd.to_datetime(full_train.visitStartTime, unit='s')

In [21]:
test_df.visitStartTime = pd.to_datetime(test_df.visitStartTime, unit='s')

In [22]:
# make time features
add_datepart(full_train, 'visitStartTime', time=True)

In [23]:
# make time features
add_datepart(test_df, 'visitStartTime', time=True)

In [24]:
set(full_train.drop('totals_transactionRevenue', axis=1).columns) - set(test_df.columns)

set()

In [25]:
full_train = full_train.sort_values(by='date', ascending=True)

In [26]:
full_train.drop(columns=['date'], inplace=True)
full_train.reset_index(inplace=True, drop=True)

In [27]:
test_df.drop(columns=['date'], inplace=True)

In [28]:
train_visitorIds = full_train.fullVisitorId
test_visitorIds = test_df.fullVisitorId

In [29]:
full_train.drop(columns=['fullVisitorId'], inplace=True)
test_df.drop(columns=['fullVisitorId'], inplace=True)

In [30]:
# make categorical data
train_cats(full_train)

In [31]:
apply_cats(test_df, full_train)

In [30]:
display_all(full_train.isnull().sum().sort_index()/len(full_train))

channelGrouping                                 0.000000
date                                            0.000000
device_browser                                  0.000000
device_deviceCategory                           0.000000
device_isMobile                                 0.000000
device_operatingSystem                          0.000000
fullVisitorId                                   0.000000
geoNetwork_city                                 0.000000
geoNetwork_continent                            0.000000
geoNetwork_country                              0.000000
geoNetwork_metro                                0.000000
geoNetwork_networkDomain                        0.000000
geoNetwork_region                               0.000000
geoNetwork_subContinent                         0.000000
sessionId                                       0.000000
totals_bounces                                  0.501324
totals_hits                                     0.000000
totals_newVisits               

In [31]:
display_all(test_df.isnull().sum().sort_index()/len(test_df))

channelGrouping                                 0.000000
date                                            0.000000
device_browser                                  0.019764
device_deviceCategory                           0.000000
device_isMobile                                 0.000000
device_operatingSystem                          0.001150
fullVisitorId                                   0.974793
geoNetwork_city                                 0.009569
geoNetwork_continent                            0.000000
geoNetwork_country                              0.000034
geoNetwork_metro                                0.001099
geoNetwork_networkDomain                        0.031993
geoNetwork_region                               0.007047
geoNetwork_subContinent                         0.000000
sessionId                                       0.999994
totals_bounces                                  0.476878
totals_hits                                     0.000000
totals_newVisits               

In [32]:
full_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 48 columns):
channelGrouping                                 10000 non-null category
sessionId                                       10000 non-null category
visitId                                         10000 non-null int64
visitNumber                                     10000 non-null int64
device_browser                                  10000 non-null category
device_deviceCategory                           10000 non-null category
device_isMobile                                 10000 non-null bool
device_operatingSystem                          10000 non-null category
geoNetwork_city                                 10000 non-null category
geoNetwork_continent                            10000 non-null category
geoNetwork_country                              10000 non-null category
geoNetwork_metro                                10000 non-null category
geoNetwork_networkDomain              

In [33]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 47 columns):
channelGrouping                                 10000 non-null category
sessionId                                       0 non-null category
visitId                                         10000 non-null int64
visitNumber                                     10000 non-null int64
device_browser                                  9841 non-null category
device_deviceCategory                           10000 non-null category
device_isMobile                                 10000 non-null bool
device_operatingSystem                          9983 non-null category
geoNetwork_city                                 9327 non-null category
geoNetwork_continent                            10000 non-null category
geoNetwork_country                              9977 non-null category
geoNetwork_metro                                9765 non-null category
geoNetwork_networkDomain                       

In [34]:
full_train['fullVisitorId'] = train_visitorIds
test_df['fullVisitorId'] = test_visitorIds

In [35]:
%%time
os.makedirs('../data/tmp', exist_ok=True)
full_train.to_feather('../data/tmp/raw_train')

CPU times: user 414 ms, sys: 184 ms, total: 598 ms
Wall time: 1.19 s


In [36]:
%%time
os.makedirs('../data/tmp', exist_ok=True)
test_df.to_feather('../data/tmp/raw_test')

CPU times: user 288 ms, sys: 175 ms, total: 463 ms
Wall time: 1.78 s


#### Preprocess

In [40]:
full_train = pd.read_feather('../data/tmp/raw_train')

  return feather.read_dataframe(path, nthreads=nthreads)


In [69]:
skip_columns = ['sessionId', 'visitId', 'visitNumber', 'device_deviceCategory', 'device_isMobile',
                'device_operatingSystem', 'geoNetwork_metro', 'geoNetwork_networkDomain',
                'geoNetwork_region', 'geoNetwork_subContinent', 'totals_bounces',
                'totals_hits', 'totals_newVisits', 'totals_pageviews',
                'trafficSource_adContent',
                'trafficSource_adwordsClickInfo.adNetworkType',
                'trafficSource_adwordsClickInfo.gclId',
                'trafficSource_adwordsClickInfo.isVideoAd',
                'trafficSource_adwordsClickInfo.page',
                'trafficSource_adwordsClickInfo.slot', 'trafficSource_campaign',
                'trafficSource_isTrueDirect', 'trafficSource_keyword',
                'trafficSource_medium', 'trafficSource_referralPath',
                'visitStartTimeMinute', 'visitStartTimeSecond',
                'fullVisitorId']

In [71]:
df, y, nas = proc_df(full_train, 'totals_transactionRevenue', skip_flds=skip_columns)

In [74]:
skip_columns = ['sessionId', 'visitId', 'visitNumber', 'device_deviceCategory', 'device_isMobile',
                'device_operatingSystem', 'geoNetwork_metro', 'geoNetwork_networkDomain',
                'geoNetwork_region', 'geoNetwork_subContinent', 'totals_bounces',
                'totals_hits', 'totals_newVisits', 'totals_pageviews',
                'trafficSource_adContent',
                'trafficSource_adwordsClickInfo.adNetworkType',
                'trafficSource_adwordsClickInfo.gclId',
                'trafficSource_adwordsClickInfo.isVideoAd',
                'trafficSource_adwordsClickInfo.page',
                'trafficSource_adwordsClickInfo.slot', 'trafficSource_campaign',
                'trafficSource_isTrueDirect', 'trafficSource_keyword',
                'trafficSource_medium', 'trafficSource_referralPath',
                'visitStartTimeMinute', 'visitStartTimeSecond',
                'fullVisitorId']

In [75]:
test, _, _ = proc_df(test_df, skip_flds=skip_columns, na_dict=nas)

In [76]:
all(df.columns == test.columns)

True

In [77]:
y = np.log1p(y)

In [78]:
m = RandomForestRegressor(n_jobs=-1)
m.fit(df, y)
m.score(df,y)

0.7352450717291183

#### Train valid split

In [79]:
def split_vals(a,n):
    return a[:n].copy(), a[n:].copy()

n_valid = 1000
n_trn = len(df)-n_valid
raw_train, raw_valid = split_vals(full_train, n_trn)
X_train, X_valid = split_vals(df, n_trn)
y_train, y_valid = split_vals(y, n_trn)

X_train.shape, y_train.shape, X_valid.shape

((9000, 20), (9000,), (1000, 20))

#### Base Model 

In [80]:
def rmse(x, y):
    return math.sqrt(((x-y)**2).mean())


def print_score(m):
    res = [rmse(m.predict(X_train), y_train), rmse(m.predict(X_valid), y_valid),
           m.score(X_train, y_train), m.score(X_valid, y_valid)]
    if hasattr(m, 'oob_score_'):
        res.append(m.oob_score_)
    print(res)

In [81]:
m_val = RandomForestRegressor(n_jobs=-1)
%time m_val.fit(X_train, y_train)

CPU times: user 66.7 ms, sys: 2.03 ms, total: 68.8 ms
Wall time: 120 ms


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [82]:
print_score(m_val)

[0.7930676096480118, 2.832662076265478, 0.7267493048479146, -0.027404290486198857]


In [52]:
val_preds = m_val.predict(X_valid)

In [56]:
val_preds = np.exp(val_preds) - 1

In [57]:
grouped = raw_valid[['fullVisitorId', 'totals_transactionRevenue']]

In [58]:
grouped['pred_revenue'] = val_preds

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [59]:
grouped = grouped.groupby('fullVisitorId', as_index=False).sum()

In [60]:
grouped['pred_revenue'] = np.log1p(grouped['pred_revenue'])
grouped['totals_transactionRevenue'] = np.log1p(grouped['totals_transactionRevenue'])

In [61]:
rmse(grouped.totals_transactionRevenue, grouped.pred_revenue)

2.582040202690091

####  Test flow

In [43]:
test_preds = m.predict(test)

In [44]:
test_grouped = pd.DataFrame()
test_grouped['fullVisitorId'] = test_visitorIds
test_grouped['pred_revenue'] = test_preds

In [47]:
test_grouped = test_grouped.groupby('fullVisitorId', as_index=False).sum()
test_grouped['pred_revenue'] = np.log1p(test_grouped['pred_revenue'])

In [49]:
submit = pd.read_csv('../data/sample_submission.csv')

In [52]:
test_grouped.fullVisitorId.tolist() == submit.fullVisitorId.tolist()

True

In [53]:
submit['fullVisitorId'] = test_grouped['fullVisitorId']

In [54]:
submit['PredictedLogRevenue'] = test_grouped['pred_revenue']

In [57]:
os.makedirs('../submissions', exist_ok=True)

In [58]:
submit.to_csv('../submissions/rf_baseline.csv', header=True, index=False)