### Imports

In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
import os
import math

In [3]:
import pandas as pd
import numpy as np

In [4]:
pd.__version__

'0.23.4'

In [None]:
from fastai.imports import *
# from fastai.structured import *

In [5]:
from sklearn.ensemble import RandomForestRegressor
from IPython.display import display

from sklearn import metrics

In [6]:
import sys
sys.path.append('../src')

In [7]:
from workflow.data_utils import load_csv, train_valid_split

In [8]:
from workflow.structured import add_datepart, train_cats, proc_df, apply_cats

### Constants

In [8]:
DATE_COLUMN ='date'
SPLIT_DATE ='2017-05-01'
ID_COLUMN = 'fullVisitorId'
TARGET_COLUMN = 'totals_transactionRevenue'

In [9]:
def display_all(df):
    with pd.option_context("display.max_rows", 1000, "display.max_columns", 1000): 
        display(df)

### Loads 

In [23]:
%%time
full_train = load_csv('../data/processed_data/processed_train.csv',
                      low_memory=False)

CPU times: user 5.73 s, sys: 859 ms, total: 6.58 s
Wall time: 15 s


In [24]:
full_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 903653 entries, 0 to 903652
Data columns (total 35 columns):
channelGrouping                                 903653 non-null object
date                                            903653 non-null int64
fullVisitorId                                   903653 non-null object
sessionId                                       903653 non-null object
visitId                                         903653 non-null int64
visitNumber                                     903653 non-null int64
visitStartTime                                  903653 non-null int64
device_browser                                  903653 non-null object
device_deviceCategory                           903653 non-null object
device_isMobile                                 903653 non-null bool
device_operatingSystem                          903653 non-null object
geoNetwork_city                                 903653 non-null object
geoNetwork_continent                       

In [25]:
display_all(full_train.head())

Unnamed: 0,channelGrouping,date,fullVisitorId,sessionId,visitId,visitNumber,visitStartTime,device_browser,device_deviceCategory,device_isMobile,device_operatingSystem,geoNetwork_city,geoNetwork_continent,geoNetwork_country,geoNetwork_metro,geoNetwork_networkDomain,geoNetwork_region,geoNetwork_subContinent,totals_bounces,totals_hits,totals_newVisits,totals_pageviews,trafficSource_adContent,trafficSource_adwordsClickInfo.adNetworkType,trafficSource_adwordsClickInfo.gclId,trafficSource_adwordsClickInfo.isVideoAd,trafficSource_adwordsClickInfo.page,trafficSource_adwordsClickInfo.slot,trafficSource_campaign,trafficSource_isTrueDirect,trafficSource_keyword,trafficSource_medium,trafficSource_referralPath,trafficSource_source,totals_transactionRevenue
0,Organic Search,20160902,1131660440785968503,1131660440785968503_1472830385,1472830385,1,1472830385,Chrome,desktop,False,Windows,Izmir,Asia,Turkey,(not set),ttnet.com.tr,Izmir,Western Asia,1.0,1,1.0,1.0,,,,,,,(not set),,(not provided),organic,,google,
1,Organic Search,20160902,377306020877927890,377306020877927890_1472880147,1472880147,1,1472880147,Firefox,desktop,False,Macintosh,not available in demo dataset,Oceania,Australia,not available in demo dataset,dodo.net.au,not available in demo dataset,Australasia,1.0,1,1.0,1.0,,,,,,,(not set),,(not provided),organic,,google,
2,Organic Search,20160902,3895546263509774583,3895546263509774583_1472865386,1472865386,1,1472865386,Chrome,desktop,False,Windows,Madrid,Europe,Spain,(not set),unknown.unknown,Community of Madrid,Southern Europe,1.0,1,1.0,1.0,,,,,,,(not set),,(not provided),organic,,google,
3,Organic Search,20160902,4763447161404445595,4763447161404445595_1472881213,1472881213,1,1472881213,UC Browser,desktop,False,Linux,not available in demo dataset,Asia,Indonesia,not available in demo dataset,unknown.unknown,not available in demo dataset,Southeast Asia,1.0,1,1.0,1.0,,,,,,,(not set),,google + online,organic,,google,
4,Organic Search,20160902,27294437909732085,27294437909732085_1472822600,1472822600,2,1472822600,Chrome,mobile,True,Android,not available in demo dataset,Europe,United Kingdom,not available in demo dataset,unknown.unknown,not available in demo dataset,Northern Europe,1.0,1,,1.0,,,,,,,(not set),True,(not provided),organic,,google,


In [10]:
%%time
test_df = load_csv('../data/processed_data/processed_test.csv', low_memory=False)

CPU times: user 4.83 s, sys: 864 ms, total: 5.7 s
Wall time: 6.81 s


In [9]:
full_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 903653 entries, 0 to 903652
Data columns (total 35 columns):
channelGrouping                                 903653 non-null object
date                                            903653 non-null int64
fullVisitorId                                   903653 non-null object
sessionId                                       903653 non-null object
visitId                                         903653 non-null int64
visitNumber                                     903653 non-null int64
visitStartTime                                  903653 non-null int64
device_browser                                  903653 non-null object
device_deviceCategory                           903653 non-null object
device_isMobile                                 903653 non-null bool
device_operatingSystem                          903653 non-null object
geoNetwork_city                                 903653 non-null object
geoNetwork_continent                       

In [10]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 804684 entries, 0 to 804683
Data columns (total 34 columns):
channelGrouping                                 804684 non-null object
date                                            804684 non-null int64
fullVisitorId                                   804684 non-null object
sessionId                                       804684 non-null object
visitId                                         804684 non-null int64
visitNumber                                     804684 non-null int64
visitStartTime                                  804684 non-null int64
device_browser                                  804684 non-null object
device_deviceCategory                           804684 non-null object
device_isMobile                                 804684 non-null bool
device_operatingSystem                          804684 non-null object
geoNetwork_city                                 804684 non-null object
geoNetwork_continent                       

In [12]:
all(full_train.drop(TARGET_COLUMN, axis=1).columns == test_df.columns)

True

### Workflow

In [26]:
full_train[TARGET_COLUMN] = full_train[TARGET_COLUMN].fillna(value=0)

In [40]:
# for experimenting
sample = train_df.sample(1000).copy()

In [55]:
full_train['log_revenue'] = np.log1p(full_train.totals_transactionRevenue)

In [35]:
display_all(sample.describe(include='all').T)

Unnamed: 0,count,unique,top,freq,first,last,mean,std,min,25%,50%,75%,max
channelGrouping,1000,7.0,Organic Search,401.0,,,,,,,,,
date,1000,271.0,2016-11-11 00:00:00,9.0,2016-08-01 00:00:00,2017-07-31 00:00:00,,,,,,,
fullVisitorId,1000,999.0,0678102787097322550,2.0,,,,,,,,,
sessionId,1000,1000.0,6352519092794696137_1478346971,1.0,,,,,,,,,
visitId,1000,,,,,,1481650000.0,6705230.0,1470060000.0,1476540000.0,1480740000.0,1487210000.0,1501530000.0
visitNumber,1000,,,,,,3.111,15.1332,1.0,1.0,1.0,1.0,284.0
visitStartTime,1000,,,,,,1481650000.0,6705230.0,1470060000.0,1476540000.0,1480740000.0,1487210000.0,1501530000.0
device_browser,1000,15.0,Chrome,676.0,,,,,,,,,
device_deviceCategory,1000,3.0,desktop,727.0,,,,,,,,,
device_isMobile,1000,2.0,False,727.0,,,,,,,,,


In [44]:
sample.visitStartTime = pd.to_datetime(sample.visitStartTime, unit='s')

In [45]:
add_datepart(sample, 'visitStartTime', time=True)

In [46]:
display_all(sample.head(10))

Unnamed: 0,channelGrouping,date,fullVisitorId,sessionId,visitId,visitNumber,device_browser,device_deviceCategory,device_isMobile,device_operatingSystem,geoNetwork_city,geoNetwork_continent,geoNetwork_country,geoNetwork_metro,geoNetwork_networkDomain,geoNetwork_region,geoNetwork_subContinent,totals_bounces,totals_hits,totals_newVisits,totals_pageviews,trafficSource_adContent,trafficSource_adwordsClickInfo.adNetworkType,trafficSource_adwordsClickInfo.gclId,trafficSource_adwordsClickInfo.isVideoAd,trafficSource_adwordsClickInfo.page,trafficSource_adwordsClickInfo.slot,trafficSource_campaign,trafficSource_isTrueDirect,trafficSource_keyword,trafficSource_medium,trafficSource_referralPath,trafficSource_source,totals_transactionRevenue,log_revenue,visitStartTimeYear,visitStartTimeMonth,visitStartTimeWeek,visitStartTimeDay,visitStartTimeDayofweek,visitStartTimeDayofyear,visitStartTimeIs_month_end,visitStartTimeIs_month_start,visitStartTimeIs_quarter_end,visitStartTimeIs_quarter_start,visitStartTimeIs_year_end,visitStartTimeIs_year_start,visitStartTimeHour,visitStartTimeMinute,visitStartTimeSecond,visitStartTimeElapsed
373219,Social,2016-09-22,754534820228363195,754534820228363195_1474543923,1474543923,1,Opera,desktop,False,Windows,not available in demo dataset,Europe,Germany,not available in demo dataset,(not set),not available in demo dataset,Western Europe,,3,1.0,3.0,,,,,,,(not set),,,referral,/yt/about/,youtube.com,0.0,0.0,2016,9,38,22,3,266,False,False,False,False,False,False,11,32,3,1474543923
65285,Direct,2016-11-30,3428088107468109877,3428088107468109877_1480528812,1480528812,1,Chrome,desktop,False,Chrome OS,Toronto,Americas,Canada,(not set),(not set),Ontario,Northern America,1.0,1,1.0,1.0,,,,,,,(not set),True,,(none),,(direct),0.0,0.0,2016,11,48,30,2,335,True,False,False,False,False,False,18,0,12,1480528812
236565,Social,2016-11-06,1234775652180107660,1234775652180107660_1478505573,1478505573,1,Safari,desktop,False,Macintosh,Thanh Hoa,Asia,Vietnam,(not set),unknown.unknown,Thanh Hoa,Southeast Asia,1.0,1,1.0,1.0,,,,,,,(not set),,,referral,/yt/about/vi/,youtube.com,0.0,0.0,2016,11,45,7,0,312,False,False,False,False,False,False,7,59,33,1478505573
651463,Organic Search,2017-02-21,5867443882754804789,5867443882754804789_1487728338,1487728338,1,Safari,desktop,False,Macintosh,not available in demo dataset,Oceania,Australia,not available in demo dataset,dsl.net.au,not available in demo dataset,Australasia,1.0,1,1.0,1.0,,,,,,,(not set),,(not provided),organic,,google,0.0,0.0,2017,2,8,22,2,53,False,False,False,False,False,False,1,52,18,1487728338
554637,Organic Search,2017-02-06,3047983816895254458,3047983816895254458_1486446298,1486446298,2,Chrome,desktop,False,Linux,San Francisco,Americas,United States,San Francisco-Oakland-San Jose CA,att.net,California,Northern America,1.0,1,,1.0,,,,,,,(not set),True,(not provided),organic,,google,0.0,0.0,2017,2,6,7,1,38,False,False,False,False,False,False,5,44,58,1486446298
120204,Organic Search,2017-03-29,3915579193808543140,3915579193808543140_1490838482,1490838482,1,Safari,mobile,True,iOS,(not set),Asia,Hong Kong,(not set),twowincolimited-hk.com,(not set),Eastern Asia,1.0,1,1.0,1.0,,,,,,,(not set),,(not provided),organic,,google,0.0,0.0,2017,3,13,30,3,89,False,False,False,False,False,False,1,48,2,1490838482
537674,Organic Search,2016-08-27,7406034241760755547,7406034241760755547_1472284721,1472284721,1,Chrome,desktop,False,Chrome OS,not available in demo dataset,Americas,United States,not available in demo dataset,telepacific.net,not available in demo dataset,Northern America,,14,1.0,12.0,,,,,,,(not set),,(not provided),organic,,google,0.0,0.0,2016,8,34,27,5,240,False,False,False,False,False,False,7,58,41,1472284721
13458,Social,2017-01-13,2755348753280811333,2755348753280811333_1484332331,1484332331,2,Chrome,desktop,False,Chrome OS,not available in demo dataset,Americas,United States,not available in demo dataset,treca.org,not available in demo dataset,Northern America,1.0,1,,1.0,,,,,,,(not set),True,,referral,/yt/about/,youtube.com,0.0,0.0,2017,1,2,13,4,13,False,False,False,False,False,False,18,32,11,1484332331
304542,Organic Search,2016-09-16,1547122358071132267,1547122358071132267_1474067389,1474067389,1,Chrome,desktop,False,Windows,not available in demo dataset,Americas,Canada,not available in demo dataset,(not set),not available in demo dataset,Northern America,,2,1.0,2.0,,,,,,,(not set),,(not provided),organic,,google,0.0,0.0,2016,9,37,16,4,260,False,False,False,False,False,False,23,9,49,1474067389
534991,Direct,2017-03-16,4890458816338828012,4890458816338828012_1489662913,1489662913,1,Chrome,desktop,False,Windows,Moscow,Europe,Russia,(not set),misp.ru,Moscow,Eastern Europe,,11,1.0,11.0,,,,,,,(not set),True,,(none),,(direct),0.0,0.0,2017,3,11,16,3,75,False,False,False,False,False,False,11,15,13,1489662913


In [59]:
# posix seconds to datetime
full_train.visitStartTime = pd.to_datetime(full_train.visitStartTime, unit='s')

In [14]:
test_df.visitStartTime = pd.to_datetime(test_df.visitStartTime, unit='s')

In [60]:
# make time features
add_datepart(full_train, 'visitStartTime', time=True)

In [15]:
# make time features
add_datepart(test_df, 'visitStartTime', time=True)

In [61]:
# make categorical data
train_cats(full_train)

In [18]:
apply_cats(test_df, full_train.iloc[:,25:])

ValueError: Buffer dtype mismatch, expected 'Python object' but got 'unsigned long'

In [64]:
full_train = full_train.sort_values(by='date', ascending=True)

In [76]:
display_all(full_train.isnull().sum().sort_index()/len(full_train))

channelGrouping                                 0.000000
device_browser                                  0.000000
device_deviceCategory                           0.000000
device_isMobile                                 0.000000
device_operatingSystem                          0.000000
fullVisitorId                                   0.000000
geoNetwork_city                                 0.000000
geoNetwork_continent                            0.000000
geoNetwork_country                              0.000000
geoNetwork_metro                                0.000000
geoNetwork_networkDomain                        0.000000
geoNetwork_region                               0.000000
geoNetwork_subContinent                         0.000000
log_revenue                                     0.000000
sessionId                                       0.000000
totals_bounces                                  0.501324
totals_hits                                     0.000000
totals_newVisits               

In [108]:
full_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 903653 entries, 0 to 903652
Data columns (total 50 columns):
channelGrouping                                 903653 non-null category
fullVisitorId                                   903653 non-null category
sessionId                                       903653 non-null category
visitId                                         903653 non-null int64
visitNumber                                     903653 non-null int64
device_browser                                  903653 non-null category
device_deviceCategory                           903653 non-null category
device_isMobile                                 903653 non-null bool
device_operatingSystem                          903653 non-null category
geoNetwork_city                                 903653 non-null category
geoNetwork_continent                            903653 non-null category
geoNetwork_country                              903653 non-null category
geoNetwork_metro       

In [71]:
full_train.drop(columns=['date'], inplace=True)
full_train.reset_index(inplace=True, drop=True)

In [77]:
%%time
os.makedirs('../data/tmp', exist_ok=True)
full_train.to_feather('../data/tmp/raw_train')

CPU times: user 340 ms, sys: 139 ms, total: 479 ms
Wall time: 2.75 s


#### Preprocess

In [11]:
full_train = pd.read_feather('../data/tmp/raw_train')

  return feather.read_dataframe(path, nthreads=nthreads)


In [12]:
display_all(full_train.head())

Unnamed: 0,channelGrouping,fullVisitorId,sessionId,visitId,visitNumber,device_browser,device_deviceCategory,device_isMobile,device_operatingSystem,geoNetwork_city,geoNetwork_continent,geoNetwork_country,geoNetwork_metro,geoNetwork_networkDomain,geoNetwork_region,geoNetwork_subContinent,totals_bounces,totals_hits,totals_newVisits,totals_pageviews,trafficSource_adContent,trafficSource_adwordsClickInfo.adNetworkType,trafficSource_adwordsClickInfo.gclId,trafficSource_adwordsClickInfo.isVideoAd,trafficSource_adwordsClickInfo.page,trafficSource_adwordsClickInfo.slot,trafficSource_campaign,trafficSource_isTrueDirect,trafficSource_keyword,trafficSource_medium,trafficSource_referralPath,trafficSource_source,totals_transactionRevenue,log_revenue,visitStartTimeYear,visitStartTimeMonth,visitStartTimeWeek,visitStartTimeDay,visitStartTimeDayofweek,visitStartTimeDayofyear,visitStartTimeIs_month_end,visitStartTimeIs_month_start,visitStartTimeIs_quarter_end,visitStartTimeIs_quarter_start,visitStartTimeIs_year_end,visitStartTimeIs_year_start,visitStartTimeHour,visitStartTimeMinute,visitStartTimeSecond,visitStartTimeElapsed
0,Direct,1492602573213666603,1492602573213666603_1470044332,1470044332,1,Chrome,desktop,False,Macintosh,not available in demo dataset,Asia,Japan,not available in demo dataset,i3-systems.net,not available in demo dataset,Eastern Asia,1.0,1,1.0,1.0,,,,,,,(not set),True,,(none),,(direct),0.0,0.0,2016,8,31,1,0,214,False,True,False,False,False,False,9,38,52,1470044332
1,Direct,7394165545362887055,7394165545362887055_1470044425,1470044425,3,Chrome,desktop,False,Windows,Hanoi,Asia,Vietnam,(not set),unknown.unknown,Hanoi,Southeast Asia,1.0,1,,1.0,,,,,,,(not set),True,,(none),,(direct),0.0,0.0,2016,8,31,1,0,214,False,True,False,False,False,False,9,40,25,1470044425
2,Referral,6107229716178617930,6107229716178617930_1470094529,1470094529,1,Chrome,desktop,False,Macintosh,Mountain View,Americas,United States,San Francisco-Oakland-San Jose CA,(not set),California,Northern America,1.0,1,1.0,1.0,,,,,,,(not set),,,referral,/,mall.googleplex.com,0.0,0.0,2016,8,31,1,0,214,False,True,False,False,False,False,23,35,29,1470094529
3,Direct,9459384188253198762,9459384188253198762_1470079413,1470079413,1,Chrome,desktop,False,Windows,not available in demo dataset,Americas,Brazil,not available in demo dataset,brasiltelecom.net.br,not available in demo dataset,South America,1.0,1,1.0,1.0,,,,,,,(not set),True,,(none),,(direct),0.0,0.0,2016,8,31,1,0,214,False,True,False,False,False,False,19,23,33,1470079413
4,Direct,4052177266351383392,4052177266351383392_1470111093,1470111093,1,Safari,desktop,False,Macintosh,not available in demo dataset,Asia,Thailand,not available in demo dataset,unknown.unknown,not available in demo dataset,Southeast Asia,1.0,1,1.0,1.0,,,,,,,(not set),True,,(none),,(direct),0.0,0.0,2016,8,31,2,1,215,False,False,False,False,False,False,4,11,33,1470111093


In [13]:
full_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 903653 entries, 0 to 903652
Data columns (total 50 columns):
channelGrouping                                 903653 non-null category
fullVisitorId                                   903653 non-null category
sessionId                                       903653 non-null category
visitId                                         903653 non-null int64
visitNumber                                     903653 non-null int64
device_browser                                  903653 non-null category
device_deviceCategory                           903653 non-null category
device_isMobile                                 903653 non-null bool
device_operatingSystem                          903653 non-null category
geoNetwork_city                                 903653 non-null category
geoNetwork_continent                            903653 non-null category
geoNetwork_country                              903653 non-null category
geoNetwork_metro       

In [82]:
df, y, nas = proc_df(full_train, 'totals_transactionRevenue',
                     skip_flds=['log_revenue', 'fullVisitorId'])

In [83]:
display_all(df.head())

Unnamed: 0,channelGrouping,sessionId,visitId,visitNumber,device_browser,device_deviceCategory,device_isMobile,device_operatingSystem,geoNetwork_city,geoNetwork_continent,geoNetwork_country,geoNetwork_metro,geoNetwork_networkDomain,geoNetwork_region,geoNetwork_subContinent,totals_bounces,totals_hits,totals_newVisits,totals_pageviews,trafficSource_adContent,trafficSource_adwordsClickInfo.adNetworkType,trafficSource_adwordsClickInfo.gclId,trafficSource_adwordsClickInfo.isVideoAd,trafficSource_adwordsClickInfo.page,trafficSource_adwordsClickInfo.slot,trafficSource_campaign,trafficSource_isTrueDirect,trafficSource_keyword,trafficSource_medium,trafficSource_referralPath,trafficSource_source,visitStartTimeYear,visitStartTimeMonth,visitStartTimeWeek,visitStartTimeDay,visitStartTimeDayofweek,visitStartTimeDayofyear,visitStartTimeIs_month_end,visitStartTimeIs_month_start,visitStartTimeIs_quarter_end,visitStartTimeIs_quarter_start,visitStartTimeIs_year_end,visitStartTimeIs_year_start,visitStartTimeHour,visitStartTimeMinute,visitStartTimeSecond,visitStartTimeElapsed,totals_bounces_na,totals_newVisits_na,totals_pageviews_na,trafficSource_adwordsClickInfo.page_na
0,3,133849,1470044332,1,12,1,False,8,649,4,103,94,10956,376,7,1.0,1,1.0,1.0,0,0,0,0,1.0,0,1,1,0,1,0,1,2016,8,31,1,0,214,False,True,False,False,False,False,9,38,52,1470044332,False,False,False,True
1,3,667424,1470044425,3,12,1,False,17,236,4,218,1,25842,130,17,1.0,1,1.0,1.0,0,0,0,0,1.0,0,1,1,0,1,0,1,2016,8,31,1,0,214,False,True,False,False,False,False,9,40,25,1470044425,False,True,False,True
2,7,551371,1470094529,1,12,1,False,8,367,3,213,79,1,58,13,1.0,1,1.0,1.0,0,0,0,0,1.0,0,1,0,0,7,1,268,2016,8,31,1,0,214,False,True,False,False,False,False,23,35,29,1470094529,False,False,False,True
3,3,853931,1470079413,1,12,1,False,17,649,3,29,94,3306,376,16,1.0,1,1.0,1.0,0,0,0,0,1.0,0,1,1,0,1,0,1,2016,8,31,1,0,214,False,True,False,False,False,False,19,23,33,1470079413,False,False,False,True
4,3,365644,1470111093,1,41,1,False,8,649,4,200,94,25842,376,17,1.0,1,1.0,1.0,0,0,0,0,1.0,0,1,1,0,1,0,1,2016,8,31,2,1,215,False,False,False,False,False,False,4,11,33,1470111093,False,False,False,True


In [86]:
m = RandomForestRegressor(n_jobs=-1)
m.fit(df, y)
m.score(df,y)

0.8032388861918506

#### Train valid split

In [87]:
def split_vals(a,n):
    return a[:n].copy(), a[n:].copy()

n_valid = 100000  # same as Kaggle's test set size
n_trn = len(df)-n_valid
raw_train, raw_valid = split_vals(full_train, n_trn)
X_train, X_valid = split_vals(df, n_trn)
y_train, y_valid = split_vals(y, n_trn)

X_train.shape, y_train.shape, X_valid.shape

((803653, 51), (803653,), (100000, 51))

#### Base Model 

In [88]:
def rmse(x, y):
    return math.sqrt(((x-y)**2).mean())


def print_score(m):
    res = [rmse(m.predict(X_train), y_train), rmse(m.predict(X_valid), y_valid),
           m.score(X_train, y_train), m.score(X_valid, y_valid)]
    if hasattr(m, 'oob_score_'):
        res.append(m.oob_score_)
    print(res)

In [None]:
m = RandomForestRegressor(n_jobs=-1)
%time m.fit(X_train, y_train)

In [91]:
print_score(m)

[25346455.456382327, 67279468.4143674, 0.7669483600077002, -0.4998142205356229]


In [92]:
val_preds = m.predict(X_valid)

In [94]:
grouped = raw_valid[['fullVisitorId', 'log_revenue']]

In [96]:
grouped['pred_revenue'] = val_preds

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [98]:
grouped = grouped.groupby('fullVisitorId', as_index=False).sum()

In [100]:
grouped['pred_revenue'] = np.log1p(grouped['pred_revenue'])

In [101]:
rmse(grouped.log_revenue, grouped.pred_revenue)

1.452212247340136

In [41]:
submit = pd.read_csv('../data/sample_submission.csv')

In [43]:
submit.PredictedLogRevenue = train_log_target_mean

In [45]:
submit.to_csv('../submissions/train_mean.csv', header=True, index=False)