### Imports

In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
import os
import math

In [3]:
import pandas as pd
import numpy as np

In [4]:
from pandas.api.types import is_string_dtype, is_numeric_dtype

In [5]:
pd.__version__

'0.22.0'

In [6]:
from sklearn.ensemble import RandomForestRegressor
from IPython.display import display

from sklearn import metrics

In [7]:
import sys
sys.path.append('../src')

In [8]:
from workflow.data_utils import load_csv, train_valid_split

In [9]:
from workflow.structured import add_datepart, train_cats, proc_df, apply_cats

### Constants

In [10]:
def display_all(df):
    with pd.option_context("display.max_rows", 1000, "display.max_columns", 1000): 
        display(df)

In [11]:
def half_bool_columns_to_float_type(df):
    for col in df.columns:
        if is_string_dtype(df[col]):
            unqs = df[col].unique()
            if len(unqs) == 2 and (False in unqs or True in unqs):
                df[col] = df[col].astype(float)

### Loads 

In [12]:
%%time
full_train = load_csv('../data/processed_data/processed_train.csv', nrows=10000,
                      low_memory=False)

CPU times: user 58.6 ms, sys: 11 ms, total: 69.5 ms
Wall time: 93.5 ms


In [15]:
full_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 35 columns):
channelGrouping                                 10000 non-null object
date                                            10000 non-null int64
fullVisitorId                                   10000 non-null object
sessionId                                       10000 non-null object
visitId                                         10000 non-null int64
visitNumber                                     10000 non-null int64
visitStartTime                                  10000 non-null int64
device_browser                                  10000 non-null object
device_deviceCategory                           10000 non-null object
device_isMobile                                 10000 non-null bool
device_operatingSystem                          10000 non-null object
geoNetwork_city                                 10000 non-null object
geoNetwork_continent                            10000 non-

In [14]:
half_bool_columns_to_float_type(full_train)

In [20]:
display_all(full_train.head())

Unnamed: 0,channelGrouping,date,fullVisitorId,sessionId,visitId,visitNumber,visitStartTime,device_browser,device_deviceCategory,device_isMobile,device_operatingSystem,geoNetwork_city,geoNetwork_continent,geoNetwork_country,geoNetwork_metro,geoNetwork_networkDomain,geoNetwork_region,geoNetwork_subContinent,totals_bounces,totals_hits,totals_newVisits,totals_pageviews,trafficSource_adContent,trafficSource_adwordsClickInfo.adNetworkType,trafficSource_adwordsClickInfo.gclId,trafficSource_adwordsClickInfo.isVideoAd,trafficSource_adwordsClickInfo.page,trafficSource_adwordsClickInfo.slot,trafficSource_campaign,trafficSource_isTrueDirect,trafficSource_keyword,trafficSource_medium,trafficSource_referralPath,trafficSource_source,totals_transactionRevenue
0,Organic Search,20160902,1131660440785968503,1131660440785968503_1472830385,1472830385,1,1472830385,Chrome,desktop,False,Windows,Izmir,Asia,Turkey,(not set),ttnet.com.tr,Izmir,Western Asia,1.0,1,1.0,1.0,,,,,,,(not set),,(not provided),organic,,google,
1,Organic Search,20160902,377306020877927890,377306020877927890_1472880147,1472880147,1,1472880147,Firefox,desktop,False,Macintosh,not available in demo dataset,Oceania,Australia,not available in demo dataset,dodo.net.au,not available in demo dataset,Australasia,1.0,1,1.0,1.0,,,,,,,(not set),,(not provided),organic,,google,
2,Organic Search,20160902,3895546263509774583,3895546263509774583_1472865386,1472865386,1,1472865386,Chrome,desktop,False,Windows,Madrid,Europe,Spain,(not set),unknown.unknown,Community of Madrid,Southern Europe,1.0,1,1.0,1.0,,,,,,,(not set),,(not provided),organic,,google,
3,Organic Search,20160902,4763447161404445595,4763447161404445595_1472881213,1472881213,1,1472881213,UC Browser,desktop,False,Linux,not available in demo dataset,Asia,Indonesia,not available in demo dataset,unknown.unknown,not available in demo dataset,Southeast Asia,1.0,1,1.0,1.0,,,,,,,(not set),,google + online,organic,,google,
4,Organic Search,20160902,27294437909732085,27294437909732085_1472822600,1472822600,2,1472822600,Chrome,mobile,True,Android,not available in demo dataset,Europe,United Kingdom,not available in demo dataset,unknown.unknown,not available in demo dataset,Northern Europe,1.0,1,,1.0,,,,,,,(not set),1.0,(not provided),organic,,google,


In [16]:
%%time
test_df = load_csv('../data/processed_data/processed_test.csv', low_memory=False, nrows=10000)

CPU times: user 57.4 ms, sys: 13.8 ms, total: 71.3 ms
Wall time: 102 ms


In [17]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 34 columns):
channelGrouping                                 10000 non-null object
date                                            10000 non-null int64
fullVisitorId                                   10000 non-null object
sessionId                                       10000 non-null object
visitId                                         10000 non-null int64
visitNumber                                     10000 non-null int64
visitStartTime                                  10000 non-null int64
device_browser                                  10000 non-null object
device_deviceCategory                           10000 non-null object
device_isMobile                                 10000 non-null bool
device_operatingSystem                          10000 non-null object
geoNetwork_city                                 10000 non-null object
geoNetwork_continent                            10000 non-

In [18]:
half_bool_columns_to_float_type(test_df)

In [19]:
all(full_train.drop('totals_transactionRevenue', axis=1).columns == test_df.columns)

True

### Workflow

In [20]:
full_train['totals_transactionRevenue'] = full_train['totals_transactionRevenue'].fillna(value=0)

In [21]:
display_all(full_train.describe(include='all').T)

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
channelGrouping,10000,7.0,Organic Search,4730.0,,,,,,,
date,10000,,,,20167900.0,4143.84,20160900.0,20160900.0,20170200.0,20170300.0,20170600.0
fullVisitorId,10000,9251.0,384990845055862114,7.0,,,,,,,
sessionId,10000,10000.0,0073226240195480402_1489356910,1.0,,,,,,,
visitId,10000,,,,1485620000.0,8864600.0,1472800000.0,1472880000.0,1486120000.0,1489370000.0,1498290000.0
visitNumber,10000,,,,2.1673,7.86228,1.0,1.0,1.0,1.0,317.0
visitStartTime,10000,,,,1485620000.0,8864600.0,1472800000.0,1472880000.0,1486120000.0,1489370000.0,1498290000.0
device_browser,10000,22.0,Chrome,6917.0,,,,,,,
device_deviceCategory,10000,3.0,desktop,7115.0,,,,,,,
device_isMobile,10000,2.0,False,7114.0,,,,,,,


In [22]:
# posix seconds to datetime
full_train.visitStartTime = pd.to_datetime(full_train.visitStartTime, unit='s')

In [23]:
test_df.visitStartTime = pd.to_datetime(test_df.visitStartTime, unit='s')

In [24]:
# make time features
add_datepart(full_train, 'visitStartTime', time=True)

In [25]:
# make time features
add_datepart(test_df, 'visitStartTime', time=True)

In [26]:
set(full_train.drop('totals_transactionRevenue', axis=1).columns) - set(test_df.columns)

set()

In [27]:
# make categorical data
train_cats(full_train)

In [31]:
apply_cats(test_df, full_train)

In [64]:
full_train = full_train.sort_values(by='date', ascending=True)

In [76]:
display_all(full_train.isnull().sum().sort_index()/len(full_train))

channelGrouping                                 0.000000
device_browser                                  0.000000
device_deviceCategory                           0.000000
device_isMobile                                 0.000000
device_operatingSystem                          0.000000
fullVisitorId                                   0.000000
geoNetwork_city                                 0.000000
geoNetwork_continent                            0.000000
geoNetwork_country                              0.000000
geoNetwork_metro                                0.000000
geoNetwork_networkDomain                        0.000000
geoNetwork_region                               0.000000
geoNetwork_subContinent                         0.000000
log_revenue                                     0.000000
sessionId                                       0.000000
totals_bounces                                  0.501324
totals_hits                                     0.000000
totals_newVisits               

In [71]:
full_train.drop(columns=['date'], inplace=True)
full_train.reset_index(inplace=True, drop=True)

In [77]:
%%time
os.makedirs('../data/tmp', exist_ok=True)
full_train.to_feather('../data/tmp/raw_train')

CPU times: user 340 ms, sys: 139 ms, total: 479 ms
Wall time: 2.75 s


#### Preprocess

In [11]:
full_train = pd.read_feather('../data/tmp/raw_train')

  return feather.read_dataframe(path, nthreads=nthreads)


In [12]:
display_all(full_train.head())

Unnamed: 0,channelGrouping,fullVisitorId,sessionId,visitId,visitNumber,device_browser,device_deviceCategory,device_isMobile,device_operatingSystem,geoNetwork_city,geoNetwork_continent,geoNetwork_country,geoNetwork_metro,geoNetwork_networkDomain,geoNetwork_region,geoNetwork_subContinent,totals_bounces,totals_hits,totals_newVisits,totals_pageviews,trafficSource_adContent,trafficSource_adwordsClickInfo.adNetworkType,trafficSource_adwordsClickInfo.gclId,trafficSource_adwordsClickInfo.isVideoAd,trafficSource_adwordsClickInfo.page,trafficSource_adwordsClickInfo.slot,trafficSource_campaign,trafficSource_isTrueDirect,trafficSource_keyword,trafficSource_medium,trafficSource_referralPath,trafficSource_source,totals_transactionRevenue,log_revenue,visitStartTimeYear,visitStartTimeMonth,visitStartTimeWeek,visitStartTimeDay,visitStartTimeDayofweek,visitStartTimeDayofyear,visitStartTimeIs_month_end,visitStartTimeIs_month_start,visitStartTimeIs_quarter_end,visitStartTimeIs_quarter_start,visitStartTimeIs_year_end,visitStartTimeIs_year_start,visitStartTimeHour,visitStartTimeMinute,visitStartTimeSecond,visitStartTimeElapsed
0,Direct,1492602573213666603,1492602573213666603_1470044332,1470044332,1,Chrome,desktop,False,Macintosh,not available in demo dataset,Asia,Japan,not available in demo dataset,i3-systems.net,not available in demo dataset,Eastern Asia,1.0,1,1.0,1.0,,,,,,,(not set),True,,(none),,(direct),0.0,0.0,2016,8,31,1,0,214,False,True,False,False,False,False,9,38,52,1470044332
1,Direct,7394165545362887055,7394165545362887055_1470044425,1470044425,3,Chrome,desktop,False,Windows,Hanoi,Asia,Vietnam,(not set),unknown.unknown,Hanoi,Southeast Asia,1.0,1,,1.0,,,,,,,(not set),True,,(none),,(direct),0.0,0.0,2016,8,31,1,0,214,False,True,False,False,False,False,9,40,25,1470044425
2,Referral,6107229716178617930,6107229716178617930_1470094529,1470094529,1,Chrome,desktop,False,Macintosh,Mountain View,Americas,United States,San Francisco-Oakland-San Jose CA,(not set),California,Northern America,1.0,1,1.0,1.0,,,,,,,(not set),,,referral,/,mall.googleplex.com,0.0,0.0,2016,8,31,1,0,214,False,True,False,False,False,False,23,35,29,1470094529
3,Direct,9459384188253198762,9459384188253198762_1470079413,1470079413,1,Chrome,desktop,False,Windows,not available in demo dataset,Americas,Brazil,not available in demo dataset,brasiltelecom.net.br,not available in demo dataset,South America,1.0,1,1.0,1.0,,,,,,,(not set),True,,(none),,(direct),0.0,0.0,2016,8,31,1,0,214,False,True,False,False,False,False,19,23,33,1470079413
4,Direct,4052177266351383392,4052177266351383392_1470111093,1470111093,1,Safari,desktop,False,Macintosh,not available in demo dataset,Asia,Thailand,not available in demo dataset,unknown.unknown,not available in demo dataset,Southeast Asia,1.0,1,1.0,1.0,,,,,,,(not set),True,,(none),,(direct),0.0,0.0,2016,8,31,2,1,215,False,False,False,False,False,False,4,11,33,1470111093


In [13]:
full_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 903653 entries, 0 to 903652
Data columns (total 50 columns):
channelGrouping                                 903653 non-null category
fullVisitorId                                   903653 non-null category
sessionId                                       903653 non-null category
visitId                                         903653 non-null int64
visitNumber                                     903653 non-null int64
device_browser                                  903653 non-null category
device_deviceCategory                           903653 non-null category
device_isMobile                                 903653 non-null bool
device_operatingSystem                          903653 non-null category
geoNetwork_city                                 903653 non-null category
geoNetwork_continent                            903653 non-null category
geoNetwork_country                              903653 non-null category
geoNetwork_metro       

In [82]:
df, y, nas = proc_df(full_train, 'totals_transactionRevenue',
                     skip_flds=['log_revenue', 'fullVisitorId'])

In [83]:
display_all(df.head())

Unnamed: 0,channelGrouping,sessionId,visitId,visitNumber,device_browser,device_deviceCategory,device_isMobile,device_operatingSystem,geoNetwork_city,geoNetwork_continent,geoNetwork_country,geoNetwork_metro,geoNetwork_networkDomain,geoNetwork_region,geoNetwork_subContinent,totals_bounces,totals_hits,totals_newVisits,totals_pageviews,trafficSource_adContent,trafficSource_adwordsClickInfo.adNetworkType,trafficSource_adwordsClickInfo.gclId,trafficSource_adwordsClickInfo.isVideoAd,trafficSource_adwordsClickInfo.page,trafficSource_adwordsClickInfo.slot,trafficSource_campaign,trafficSource_isTrueDirect,trafficSource_keyword,trafficSource_medium,trafficSource_referralPath,trafficSource_source,visitStartTimeYear,visitStartTimeMonth,visitStartTimeWeek,visitStartTimeDay,visitStartTimeDayofweek,visitStartTimeDayofyear,visitStartTimeIs_month_end,visitStartTimeIs_month_start,visitStartTimeIs_quarter_end,visitStartTimeIs_quarter_start,visitStartTimeIs_year_end,visitStartTimeIs_year_start,visitStartTimeHour,visitStartTimeMinute,visitStartTimeSecond,visitStartTimeElapsed,totals_bounces_na,totals_newVisits_na,totals_pageviews_na,trafficSource_adwordsClickInfo.page_na
0,3,133849,1470044332,1,12,1,False,8,649,4,103,94,10956,376,7,1.0,1,1.0,1.0,0,0,0,0,1.0,0,1,1,0,1,0,1,2016,8,31,1,0,214,False,True,False,False,False,False,9,38,52,1470044332,False,False,False,True
1,3,667424,1470044425,3,12,1,False,17,236,4,218,1,25842,130,17,1.0,1,1.0,1.0,0,0,0,0,1.0,0,1,1,0,1,0,1,2016,8,31,1,0,214,False,True,False,False,False,False,9,40,25,1470044425,False,True,False,True
2,7,551371,1470094529,1,12,1,False,8,367,3,213,79,1,58,13,1.0,1,1.0,1.0,0,0,0,0,1.0,0,1,0,0,7,1,268,2016,8,31,1,0,214,False,True,False,False,False,False,23,35,29,1470094529,False,False,False,True
3,3,853931,1470079413,1,12,1,False,17,649,3,29,94,3306,376,16,1.0,1,1.0,1.0,0,0,0,0,1.0,0,1,1,0,1,0,1,2016,8,31,1,0,214,False,True,False,False,False,False,19,23,33,1470079413,False,False,False,True
4,3,365644,1470111093,1,41,1,False,8,649,4,200,94,25842,376,17,1.0,1,1.0,1.0,0,0,0,0,1.0,0,1,1,0,1,0,1,2016,8,31,2,1,215,False,False,False,False,False,False,4,11,33,1470111093,False,False,False,True


In [86]:
m = RandomForestRegressor(n_jobs=-1)
m.fit(df, y)
m.score(df,y)

0.8032388861918506

#### Train valid split

In [87]:
def split_vals(a,n):
    return a[:n].copy(), a[n:].copy()

n_valid = 100000  # same as Kaggle's test set size
n_trn = len(df)-n_valid
raw_train, raw_valid = split_vals(full_train, n_trn)
X_train, X_valid = split_vals(df, n_trn)
y_train, y_valid = split_vals(y, n_trn)

X_train.shape, y_train.shape, X_valid.shape

((803653, 51), (803653,), (100000, 51))

#### Base Model 

In [88]:
def rmse(x, y):
    return math.sqrt(((x-y)**2).mean())


def print_score(m):
    res = [rmse(m.predict(X_train), y_train), rmse(m.predict(X_valid), y_valid),
           m.score(X_train, y_train), m.score(X_valid, y_valid)]
    if hasattr(m, 'oob_score_'):
        res.append(m.oob_score_)
    print(res)

In [None]:
m = RandomForestRegressor(n_jobs=-1)
%time m.fit(X_train, y_train)

In [91]:
print_score(m)

[25346455.456382327, 67279468.4143674, 0.7669483600077002, -0.4998142205356229]


In [92]:
val_preds = m.predict(X_valid)

In [94]:
grouped = raw_valid[['fullVisitorId', 'log_revenue']]

In [96]:
grouped['pred_revenue'] = val_preds

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [98]:
grouped = grouped.groupby('fullVisitorId', as_index=False).sum()

In [100]:
grouped['pred_revenue'] = np.log1p(grouped['pred_revenue'])

In [101]:
rmse(grouped.log_revenue, grouped.pred_revenue)

1.452212247340136

In [41]:
submit = pd.read_csv('../data/sample_submission.csv')

In [43]:
submit.PredictedLogRevenue = train_log_target_mean

In [45]:
submit.to_csv('../submissions/train_mean.csv', header=True, index=False)