### Imports

In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
import os
import math

In [3]:
import pandas as pd
import numpy as np

In [4]:
from pandas.api.types import is_string_dtype, is_numeric_dtype

In [5]:
pd.__version__

'0.23.4'

In [6]:
from sklearn.ensemble import RandomForestRegressor
from IPython.display import display

from sklearn import metrics
from sklearn.model_selection import TimeSeriesSplit

In [7]:
import sys
sys.path.append('../src')

In [8]:
from workflow.data_utils import load_csv

In [9]:
# from workflow.structured import add_datepart, train_cats, proc_df, apply_cats

In [10]:
from fastai.structured import add_datepart, train_cats, proc_df, apply_cats

### Constants

In [11]:
def display_all(df):
    with pd.option_context("display.max_rows", 1000, "display.max_columns", 1000): 
        display(df)

In [12]:
def half_bool_columns_to_float_type(df):
    for col in df.columns:
        if is_string_dtype(df[col]):
            unqs = df[col].unique()
            if len(unqs) == 2 and (False in unqs or True in unqs):
                print(col)
                df[col] = df[col].astype(float)

In [13]:
# some duplicated info
drop_columns = ['date', 'sessionId', 'visitId']

### Loads 

In [14]:
%%time
full_train = load_csv('../data/processed_data/processed_train.csv',
                      low_memory=False, nrows=None)

CPU times: user 5.07 s, sys: 835 ms, total: 5.9 s
Wall time: 5.92 s


In [15]:
full_train.drop(columns=drop_columns, inplace=True)

In [16]:
half_bool_columns_to_float_type(full_train)

trafficSource_adwordsClickInfo.isVideoAd
trafficSource_isTrueDirect


In [41]:
full_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 903653 entries, 0 to 903652
Data columns (total 47 columns):
channelGrouping                                 903653 non-null category
visitNumber                                     903653 non-null int64
visitStartTime                                  903653 non-null datetime64[ns]
device_browser                                  903653 non-null category
device_deviceCategory                           903653 non-null category
device_isMobile                                 903653 non-null bool
device_operatingSystem                          903653 non-null category
geoNetwork_city                                 903653 non-null category
geoNetwork_continent                            903653 non-null category
geoNetwork_country                              903653 non-null category
geoNetwork_metro                                903653 non-null category
geoNetwork_networkDomain                        903653 non-null category
geoNetwork_reg

In [21]:
display_all(full_train.head())

Unnamed: 0,channelGrouping,fullVisitorId,visitNumber,visitStartTime,device_browser,device_deviceCategory,device_isMobile,device_operatingSystem,geoNetwork_city,geoNetwork_continent,geoNetwork_country,geoNetwork_metro,geoNetwork_networkDomain,geoNetwork_region,geoNetwork_subContinent,totals_bounces,totals_hits,totals_newVisits,totals_pageviews,trafficSource_adContent,trafficSource_adwordsClickInfo.adNetworkType,trafficSource_adwordsClickInfo.gclId,trafficSource_adwordsClickInfo.isVideoAd,trafficSource_adwordsClickInfo.page,trafficSource_adwordsClickInfo.slot,trafficSource_campaign,trafficSource_isTrueDirect,trafficSource_keyword,trafficSource_medium,trafficSource_referralPath,trafficSource_source,totals_transactionRevenue
0,Organic Search,1131660440785968503,1,1472830385,Chrome,desktop,False,Windows,Izmir,Asia,Turkey,(not set),ttnet.com.tr,Izmir,Western Asia,1.0,1,1.0,1.0,,,,,,,(not set),,(not provided),organic,,google,
1,Organic Search,377306020877927890,1,1472880147,Firefox,desktop,False,Macintosh,not available in demo dataset,Oceania,Australia,not available in demo dataset,dodo.net.au,not available in demo dataset,Australasia,1.0,1,1.0,1.0,,,,,,,(not set),,(not provided),organic,,google,
2,Organic Search,3895546263509774583,1,1472865386,Chrome,desktop,False,Windows,Madrid,Europe,Spain,(not set),unknown.unknown,Community of Madrid,Southern Europe,1.0,1,1.0,1.0,,,,,,,(not set),,(not provided),organic,,google,
3,Organic Search,4763447161404445595,1,1472881213,UC Browser,desktop,False,Linux,not available in demo dataset,Asia,Indonesia,not available in demo dataset,unknown.unknown,not available in demo dataset,Southeast Asia,1.0,1,1.0,1.0,,,,,,,(not set),,google + online,organic,,google,
4,Organic Search,27294437909732085,2,1472822600,Chrome,mobile,True,Android,not available in demo dataset,Europe,United Kingdom,not available in demo dataset,unknown.unknown,not available in demo dataset,Northern Europe,1.0,1,,1.0,,,,,,,(not set),1.0,(not provided),organic,,google,


In [17]:
%%time
test_df = load_csv('../data/processed_data/processed_test.csv', low_memory=False, nrows=None)

CPU times: user 4.43 s, sys: 862 ms, total: 5.3 s
Wall time: 5.36 s


In [18]:
test_df.drop(columns=drop_columns, inplace=True)

In [19]:
half_bool_columns_to_float_type(test_df)

trafficSource_adwordsClickInfo.isVideoAd
trafficSource_isTrueDirect


In [34]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 804684 entries, 0 to 804683
Data columns (total 46 columns):
channelGrouping                                 804684 non-null category
visitNumber                                     804684 non-null int64
visitStartTime                                  804684 non-null int64
device_browser                                  788780 non-null category
device_deviceCategory                           804684 non-null category
device_isMobile                                 804684 non-null bool
device_operatingSystem                          803759 non-null category
geoNetwork_city                                 796984 non-null category
geoNetwork_continent                            804684 non-null category
geoNetwork_country                              804657 non-null category
geoNetwork_metro                                803800 non-null category
geoNetwork_networkDomain                        778940 non-null category
geoNetwork_region      

#### Check column names and types are == for train and test

In [32]:
all(full_train.drop('totals_transactionRevenue', axis=1).columns == test_df.columns)

True

In [33]:
types_equal = []
for col_name in test_df.columns:
    types_equal.append(full_train[col_name].dtype == test_df[col_name].dtype)

all(types_equal)

True

In [23]:
display_all(full_train.describe(include='all').T)

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
channelGrouping,903653,8.0,Organic Search,381561.0,,,,,,,
fullVisitorId,903653,714167.0,1957458976293878100,278.0,,,,,,,
visitNumber,903653,,,,2.2649,9.28373,1.0,1.0,1.0,1.0,395.0
visitStartTime,903653,,,,1485010000.0,9022120.0,1470030000.0,1477560000.0,1483950000.0,1492760000.0,1501660000.0
device_browser,903653,54.0,Chrome,620364.0,,,,,,,
device_deviceCategory,903653,3.0,desktop,664479.0,,,,,,,
device_isMobile,903653,2.0,False,664530.0,,,,,,,
device_operatingSystem,903653,20.0,Windows,350072.0,,,,,,,
geoNetwork_city,903653,649.0,not available in demo dataset,508229.0,,,,,,,
geoNetwork_continent,903653,6.0,Americas,450377.0,,,,,,,


### Workflow

In [22]:
full_train['totals_transactionRevenue'] = full_train['totals_transactionRevenue'].fillna(value=0)

In [23]:
# posix seconds to datetime
full_train['visitStartTime_dt'] = pd.to_datetime(full_train.visitStartTime, unit='s')

In [24]:
test_df['visitStartTime_dt'] = pd.to_datetime(test_df.visitStartTime, unit='s')

#### Take fullVisitorId -> validation ids, if it has transaction after 2017-04-30 

In [28]:
validationIds = full_train[full_train.visitStartTime > '2017-04-30'].fullVisitorId.unique()

In [29]:
validationIds.shape

(161118,)

In [30]:
# save for later use
np.save('../data/tmp/validIds.npy',validationIds)

#### Continue workflow

In [25]:
# make time features
add_datepart(full_train, 'visitStartTime_dt', time=True, drop=True)

In [26]:
# make time features
add_datepart(test_df, 'visitStartTime_dt', time=True, drop=True)

In [27]:
set(full_train.drop('totals_transactionRevenue', axis=1).columns) - set(test_df.columns)

set()

#### Drop fullVisitorId columns to not categorize them

In [28]:
train_visitorIds = full_train.fullVisitorId
test_visitorIds = test_df.fullVisitorId

In [29]:
full_train.drop(columns=['fullVisitorId'], inplace=True)
test_df.drop(columns=['fullVisitorId'], inplace=True)

In [30]:
# make categorical data
train_cats(full_train)

In [31]:
apply_cats(test_df, full_train)

In [36]:
display_all(full_train.isnull().sum().sort_index()/len(full_train))

channelGrouping                                 0.000000
device_browser                                  0.000000
device_deviceCategory                           0.000000
device_isMobile                                 0.000000
device_operatingSystem                          0.000000
geoNetwork_city                                 0.000000
geoNetwork_continent                            0.000000
geoNetwork_country                              0.000000
geoNetwork_metro                                0.000000
geoNetwork_networkDomain                        0.000000
geoNetwork_region                               0.000000
geoNetwork_subContinent                         0.000000
totals_bounces                                  0.501324
totals_hits                                     0.000000
totals_newVisits                                0.221980
totals_pageviews                                0.000111
totals_transactionRevenue                       0.000000
trafficSource_adContent        

In [37]:
display_all(test_df.isnull().sum().sort_index()/len(test_df))

channelGrouping                                 0.000000
device_browser                                  0.019764
device_deviceCategory                           0.000000
device_isMobile                                 0.000000
device_operatingSystem                          0.001150
geoNetwork_city                                 0.009569
geoNetwork_continent                            0.000000
geoNetwork_country                              0.000034
geoNetwork_metro                                0.001099
geoNetwork_networkDomain                        0.031993
geoNetwork_region                               0.007047
geoNetwork_subContinent                         0.000000
totals_bounces                                  0.476878
totals_hits                                     0.000000
totals_newVisits                                0.248935
totals_pageviews                                0.000173
trafficSource_adContent                         0.994974
trafficSource_adwordsClickInfo.

In [38]:
# add back fullVisitosIds
full_train['fullVisitorId'] = train_visitorIds
test_df['fullVisitorId'] = test_visitorIds

In [39]:
%%time
os.makedirs('../data/tmp', exist_ok=True)
full_train.to_feather('../data/tmp/raw_train')

CPU times: user 200 ms, sys: 108 ms, total: 308 ms
Wall time: 2.41 s


In [40]:
%%time
os.makedirs('../data/tmp', exist_ok=True)
test_df.to_feather('../data/tmp/raw_test')

CPU times: user 172 ms, sys: 83.7 ms, total: 255 ms
Wall time: 1.66 s


#### Preprocess

In [40]:
full_train = pd.read_feather('../data/tmp/raw_train')

  return feather.read_dataframe(path, nthreads=nthreads)


In [41]:
%%time
df, y, nas = proc_df(full_train, 'totals_transactionRevenue',
                     ignore_flds=['fullVisitorId', 'visitStartTime'],
                     max_n_cat=100, subset=80000)

CPU times: user 308 ms, sys: 68.1 ms, total: 376 ms
Wall time: 377 ms


In [42]:
df.columns.shape

(326,)

In [43]:
display_all(df.head(10))

Unnamed: 0,fullVisitorId,visitStartTime,visitNumber,device_isMobile,geoNetwork_city,geoNetwork_country,geoNetwork_networkDomain,geoNetwork_region,totals_bounces,totals_hits,totals_newVisits,totals_pageviews,trafficSource_adwordsClickInfo.gclId,trafficSource_adwordsClickInfo.isVideoAd,trafficSource_adwordsClickInfo.page,trafficSource_isTrueDirect,trafficSource_keyword,trafficSource_referralPath,trafficSource_source,visitStartTime_dtYear,visitStartTime_dtMonth,visitStartTime_dtWeek,visitStartTime_dtDay,visitStartTime_dtDayofweek,visitStartTime_dtDayofyear,visitStartTime_dtIs_month_end,visitStartTime_dtIs_month_start,visitStartTime_dtIs_quarter_end,visitStartTime_dtIs_quarter_start,visitStartTime_dtIs_year_end,visitStartTime_dtIs_year_start,visitStartTime_dtHour,visitStartTime_dtMinute,visitStartTime_dtSecond,visitStartTime_dtElapsed,totals_bounces_na,totals_newVisits_na,totals_pageviews_na,trafficSource_adwordsClickInfo.isVideoAd_na,trafficSource_adwordsClickInfo.page_na,trafficSource_isTrueDirect_na,channelGrouping_(Other),channelGrouping_Affiliates,channelGrouping_Direct,channelGrouping_Display,channelGrouping_Organic Search,channelGrouping_Paid Search,channelGrouping_Referral,channelGrouping_Social,channelGrouping_nan,device_browser_(not set),device_browser_0,device_browser_ADM,device_browser_Amazon Silk,device_browser_Android Browser,device_browser_Android Runtime,device_browser_Android Webview,device_browser_Apple-iPhone7C2,device_browser_BlackBerry,device_browser_CSM Click,device_browser_Changa 99695759,device_browser_Chrome,device_browser_Coc Coc,device_browser_DASH_JR_3G,device_browser_DoCoMo,device_browser_Edge,device_browser_Firefox,device_browser_HTC802t_TD,device_browser_Hisense M20-M_LTE,device_browser_IE with Chrome Frame,device_browser_Internet Explorer,device_browser_Iron,device_browser_Konqueror,device_browser_LYF_LS_4002_11,device_browser_LYF_LS_4002_12,device_browser_Lunascape,device_browser_M5,device_browser_MQQBrowser,device_browser_MRCHROME,device_browser_Maxthon,device_browser_Mozilla,device_browser_Mozilla Compatible Agent,device_browser_Nichrome,device_browser_Nintendo Browser,device_browser_Nokia Browser,device_browser_NokiaE52-1,device_browser_Opera,device_browser_Opera Mini,device_browser_Puffin,device_browser_Reddit,device_browser_Safari,device_browser_Safari (in-app),device_browser_SeaMonkey,device_browser_Seznam,device_browser_TCL P500M,device_browser_ThumbSniper,device_browser_UC Browser,device_browser_User Agent,device_browser_YE,device_browser_YaBrowser,device_browser_[Use default User-agent string] LIVRENPOCHE,device_browser_no-ua,device_browser_osee2unifiedRelease,device_browser_subjectAgent: NoticiasBoom,device_browser_nan,device_deviceCategory_desktop,device_deviceCategory_mobile,device_deviceCategory_tablet,device_deviceCategory_nan,device_operatingSystem_(not set),device_operatingSystem_Android,device_operatingSystem_BlackBerry,device_operatingSystem_Chrome OS,device_operatingSystem_Firefox OS,device_operatingSystem_FreeBSD,device_operatingSystem_Linux,device_operatingSystem_Macintosh,device_operatingSystem_NTT DoCoMo,device_operatingSystem_Nintendo 3DS,device_operatingSystem_Nintendo Wii,device_operatingSystem_Nintendo WiiU,device_operatingSystem_Nokia,device_operatingSystem_OpenBSD,device_operatingSystem_Samsung,device_operatingSystem_SunOS,device_operatingSystem_Windows,device_operatingSystem_Windows Phone,device_operatingSystem_Xbox,device_operatingSystem_iOS,device_operatingSystem_nan,geoNetwork_continent_(not set),geoNetwork_continent_Africa,geoNetwork_continent_Americas,geoNetwork_continent_Asia,geoNetwork_continent_Europe,geoNetwork_continent_Oceania,geoNetwork_continent_nan,geoNetwork_metro_(not set),geoNetwork_metro_Abilene-Sweetwater TX,geoNetwork_metro_Albany-Schenectady-Troy NY,geoNetwork_metro_Atlanta GA,geoNetwork_metro_Augusta GA,geoNetwork_metro_Austin TX,geoNetwork_metro_Baltimore MD,geoNetwork_metro_Boise ID,geoNetwork_metro_Boston MA-Manchester NH,geoNetwork_metro_Butte-Bozeman MT,geoNetwork_metro_Central Scotland,geoNetwork_metro_Charleston SC,geoNetwork_metro_Charlotte NC,geoNetwork_metro_Charlottesville VA,geoNetwork_metro_Chattanooga TN,geoNetwork_metro_Chicago IL,geoNetwork_metro_Chico-Redding CA,geoNetwork_metro_Cincinnati OH,geoNetwork_metro_Cleveland-Akron (Canton) OH,geoNetwork_metro_Colorado Springs-Pueblo CO,geoNetwork_metro_Columbus OH,geoNetwork_metro_Dallas-Ft. Worth TX,geoNetwork_metro_Denver CO,geoNetwork_metro_Detroit MI,geoNetwork_metro_El Paso TX,geoNetwork_metro_Fresno-Visalia CA,geoNetwork_metro_Grand Rapids-Kalamazoo-Battle Creek MI,geoNetwork_metro_Green Bay-Appleton WI,geoNetwork_metro_Greenville-Spartanburg-Asheville-Anderson,geoNetwork_metro_HTV Wales,geoNetwork_metro_HTV West,geoNetwork_metro_Harlingen-Weslaco-Brownsville-McAllen TX,geoNetwork_metro_Hartford & New Haven CT,geoNetwork_metro_Honolulu HI,geoNetwork_metro_Houston TX,geoNetwork_metro_Idaho Falls-Pocatello ID,geoNetwork_metro_Indianapolis IN,geoNetwork_metro_JP_KANTO,geoNetwork_metro_JP_KINKI,geoNetwork_metro_JP_OTHER,geoNetwork_metro_Jacksonville FL,geoNetwork_metro_Kansas City MO,geoNetwork_metro_La Crosse-Eau Claire WI,geoNetwork_metro_Lansing MI,geoNetwork_metro_Las Vegas NV,geoNetwork_metro_Lexington KY,geoNetwork_metro_London,geoNetwork_metro_Los Angeles CA,geoNetwork_metro_Louisville KY,geoNetwork_metro_Madison WI,geoNetwork_metro_Mankato MN,geoNetwork_metro_Memphis TN,geoNetwork_metro_Meridian (exc. Channel Islands),geoNetwork_metro_Miami-Ft. Lauderdale FL,geoNetwork_metro_Midlands,geoNetwork_metro_Milwaukee WI,geoNetwork_metro_Minneapolis-St. Paul MN,geoNetwork_metro_Nashville TN,geoNetwork_metro_New Orleans LA,geoNetwork_metro_New York NY,geoNetwork_metro_Norfolk-Portsmouth-Newport News VA,geoNetwork_metro_North Scotland,geoNetwork_metro_North West,geoNetwork_metro_Omaha NE,geoNetwork_metro_Orlando-Daytona Beach-Melbourne FL,geoNetwork_metro_Panama City FL,geoNetwork_metro_Philadelphia PA,geoNetwork_metro_Phoenix AZ,geoNetwork_metro_Pittsburgh PA,geoNetwork_metro_Portland OR,"geoNetwork_metro_Providence-New Bedford,MA",geoNetwork_metro_Raleigh-Durham (Fayetteville) NC,geoNetwork_metro_Roanoke-Lynchburg VA,"geoNetwork_metro_Rochester-Mason City-Austin,IA",geoNetwork_metro_Sacramento-Stockton-Modesto CA,geoNetwork_metro_Salt Lake City UT,geoNetwork_metro_San Antonio TX,geoNetwork_metro_San Diego CA,geoNetwork_metro_San Francisco-Oakland-San Jose CA,geoNetwork_metro_Seattle-Tacoma WA,geoNetwork_metro_Springfield MO,geoNetwork_metro_Springfield-Holyoke MA,geoNetwork_metro_St. Louis MO,geoNetwork_metro_Syracuse NY,geoNetwork_metro_Tallahassee FL-Thomasville GA,geoNetwork_metro_Tampa-St. Petersburg (Sarasota) FL,geoNetwork_metro_Tri-Cities TN-VA,geoNetwork_metro_Tucson (Sierra Vista) AZ,geoNetwork_metro_Tulsa OK,geoNetwork_metro_Utica NY,geoNetwork_metro_Washington DC (Hagerstown MD),geoNetwork_metro_Wheeling WV-Steubenville OH,geoNetwork_metro_Yorkshire,geoNetwork_metro_not available in demo dataset,geoNetwork_metro_nan,geoNetwork_subContinent_(not set),geoNetwork_subContinent_Australasia,geoNetwork_subContinent_Caribbean,geoNetwork_subContinent_Central America,geoNetwork_subContinent_Central Asia,geoNetwork_subContinent_Eastern Africa,geoNetwork_subContinent_Eastern Asia,geoNetwork_subContinent_Eastern Europe,geoNetwork_subContinent_Melanesia,geoNetwork_subContinent_Micronesian Region,geoNetwork_subContinent_Middle Africa,geoNetwork_subContinent_Northern Africa,geoNetwork_subContinent_Northern America,geoNetwork_subContinent_Northern Europe,geoNetwork_subContinent_Polynesia,geoNetwork_subContinent_South America,geoNetwork_subContinent_Southeast Asia,geoNetwork_subContinent_Southern Africa,geoNetwork_subContinent_Southern Asia,geoNetwork_subContinent_Southern Europe,geoNetwork_subContinent_Western Africa,geoNetwork_subContinent_Western Asia,geoNetwork_subContinent_Western Europe,geoNetwork_subContinent_nan,trafficSource_adContent_20% discount,trafficSource_adContent_Ad from 11/3/16,trafficSource_adContent_Ad from 11/7/16,trafficSource_adContent_Ad from 12/13/16,trafficSource_adContent_Ad from 2/17/17,trafficSource_adContent_Display Ad created 11/17/14,trafficSource_adContent_Display Ad created 3/11/14,trafficSource_adContent_Display Ad created 3/11/15,trafficSource_adContent_First Full Auto Template Test Ad,trafficSource_adContent_Free Shipping!,trafficSource_adContent_Full auto ad IMAGE ONLY,trafficSource_adContent_Full auto ad NATIVE ONLY,trafficSource_adContent_Full auto ad TEXT ONLY,trafficSource_adContent_Full auto ad TEXT/NATIVE,trafficSource_adContent_Full auto ad with Primary Color,trafficSource_adContent_GA Help Center,trafficSource_adContent_Google Merchandise,trafficSource_adContent_Google Merchandise Collection,trafficSource_adContent_Google Online Store,trafficSource_adContent_Google Paraphernalia,trafficSource_adContent_Google Store,trafficSource_adContent_Google store,trafficSource_adContent_JD_5a_v1,trafficSource_adContent_LeEco_1a,trafficSource_adContent_Men's-Outerwear Google Apparel,trafficSource_adContent_Official Google Merchandise - Fast Shipping,trafficSource_adContent_Swag w/ Google Logos,trafficSource_adContent_Swag with Google Logos,trafficSource_adContent_Want Google Sunglasses,trafficSource_adContent_free shipping,trafficSource_adContent_google store,trafficSource_adContent_url_builder,trafficSource_adContent_visit us again,trafficSource_adContent_{KeyWord:Google Brand Items},trafficSource_adContent_{KeyWord:Google Branded Apparel},trafficSource_adContent_{KeyWord:Google Branded Gear},trafficSource_adContent_{KeyWord:Google Branded Kit},trafficSource_adContent_{KeyWord:Google Branded Outerwear},trafficSource_adContent_{KeyWord:Google Drinkware},trafficSource_adContent_{KeyWord:Google Men's T-Shirts},trafficSource_adContent_{KeyWord:Google Merchandise},trafficSource_adContent_{KeyWord:Looking for Google Bags?},trafficSource_adContent_{KeyWord:Want Google Pet Toys?},trafficSource_adContent_{KeyWord:Want Google Stickers?},trafficSource_adContent_nan,trafficSource_adwordsClickInfo.adNetworkType_Google Search,trafficSource_adwordsClickInfo.adNetworkType_Search partners,trafficSource_adwordsClickInfo.adNetworkType_nan,trafficSource_adwordsClickInfo.slot_RHS,trafficSource_adwordsClickInfo.slot_Top,trafficSource_adwordsClickInfo.slot_nan,trafficSource_campaign_(not set),trafficSource_campaign_AW - Accessories,trafficSource_campaign_AW - Apparel,trafficSource_campaign_AW - Dynamic Search Ads Whole Site,trafficSource_campaign_AW - Electronics,trafficSource_campaign_All Products,trafficSource_campaign_Data Share,trafficSource_campaign_Data Share Promo,trafficSource_campaign_Retail (DO NOT EDIT owners nophakun and tianyu),trafficSource_campaign_test-liyuhz,trafficSource_campaign_nan,trafficSource_medium_(none),trafficSource_medium_(not set),trafficSource_medium_affiliate,trafficSource_medium_cpc,trafficSource_medium_cpm,trafficSource_medium_organic,trafficSource_medium_referral,trafficSource_medium_nan
2,3895546263509774583,1472865386,1,False,326,182,25842,79,1.0,1,1.0,1.0,0,0.0,1.0,1.0,12,0,150,2016,9,35,3,5,247,False,False,False,False,False,False,1,16,26,1472865386,False,False,False,True,True,True,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
17,2222266935962032743,1472826820,1,False,432,70,16600,145,1.0,1,1.0,1.0,0,0.0,1.0,1.0,12,0,150,2016,9,35,2,4,246,False,False,False,False,False,False,14,33,40,1472826820,False,False,False,True,True,True,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
19,3696906537737368442,1472856874,1,False,94,10,18831,54,1.0,1,1.0,1.0,0,0.0,1.0,1.0,12,0,150,2016,9,35,2,4,246,False,False,False,False,False,False,22,54,34,1472856874,False,False,False,True,True,True,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
37,62441254657008214,1472875520,1,False,649,13,26131,376,1.0,1,1.0,1.0,0,0.0,1.0,1.0,12,0,150,2016,9,35,3,5,247,False,False,False,False,False,False,4,5,20,1472875520,False,False,False,True,True,True,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
41,1397786681184924394,1472822131,1,True,649,56,121,376,1.0,1,1.0,1.0,0,0.0,1.0,1.0,12,0,150,2016,9,35,2,4,246,False,False,False,False,False,False,13,15,31,1472822131,False,False,False,True,True,True,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
70,3956355672499179103,1472839218,1,False,2,18,25842,1,1.0,1,1.0,1.0,0,0.0,1.0,1.0,12,0,150,2016,9,35,2,4,246,False,False,False,False,False,False,18,0,18,1472839218,False,False,False,True,True,True,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
80,5223324341104792776,1472820730,1,True,649,212,6630,376,1.0,1,1.0,1.0,0,0.0,1.0,1.0,0,0,78,2016,9,35,2,4,246,False,False,False,False,False,False,12,52,10,1472820730,False,False,False,True,True,True,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
85,1754209820046771603,1472830062,1,False,649,44,28047,376,1.0,1,1.0,1.0,0,0.0,1.0,1.0,12,0,150,2016,9,35,2,4,246,False,False,False,False,False,False,15,27,42,1472830062,False,False,False,True,True,True,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
94,113895825226302532,1472881883,1,True,391,94,18010,90,1.0,1,1.0,1.0,0,0.0,1.0,1.0,340,0,150,2016,9,35,3,5,247,False,False,False,False,False,False,5,51,23,1472881883,False,False,False,True,True,True,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
102,2870162892603227525,1472858877,1,True,649,213,1,376,1.0,1,1.0,1.0,11045,0.0,1.0,1.0,96,0,150,2016,9,35,2,4,246,False,False,False,False,False,False,23,27,57,1472858877,False,False,False,False,False,True,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0


In [48]:
test, _, _ = proc_df(test_df, skip_flds=['fullVisitorId'], max_n_cat=100, na_dict=nas)

In [49]:
test.columns.shape

(321,)

In [50]:
display_all(test.head())

Unnamed: 0,visitNumber,device_isMobile,geoNetwork_city,geoNetwork_country,geoNetwork_networkDomain,geoNetwork_region,totals_bounces,totals_hits,totals_newVisits,totals_pageviews,trafficSource_adwordsClickInfo.gclId,trafficSource_adwordsClickInfo.isVideoAd,trafficSource_adwordsClickInfo.page,trafficSource_isTrueDirect,trafficSource_keyword,trafficSource_referralPath,trafficSource_source,visitStartTimeYear,visitStartTimeMonth,visitStartTimeWeek,visitStartTimeDay,visitStartTimeDayofweek,visitStartTimeDayofyear,visitStartTimeIs_month_end,visitStartTimeIs_month_start,visitStartTimeIs_quarter_end,visitStartTimeIs_quarter_start,visitStartTimeIs_year_end,visitStartTimeIs_year_start,visitStartTimeElapsed,totals_bounces_na,totals_newVisits_na,totals_pageviews_na,trafficSource_adwordsClickInfo.isVideoAd_na,trafficSource_adwordsClickInfo.page_na,trafficSource_isTrueDirect_na,channelGrouping_(Other),channelGrouping_Affiliates,channelGrouping_Direct,channelGrouping_Display,channelGrouping_Organic Search,channelGrouping_Paid Search,channelGrouping_Referral,channelGrouping_Social,channelGrouping_nan,device_browser_(not set),device_browser_0,device_browser_ADM,device_browser_Amazon Silk,device_browser_Android Browser,device_browser_Android Runtime,device_browser_Android Webview,device_browser_Apple-iPhone7C2,device_browser_BlackBerry,device_browser_CSM Click,device_browser_Changa 99695759,device_browser_Chrome,device_browser_Coc Coc,device_browser_DASH_JR_3G,device_browser_DoCoMo,device_browser_Edge,device_browser_Firefox,device_browser_HTC802t_TD,device_browser_Hisense M20-M_LTE,device_browser_IE with Chrome Frame,device_browser_Internet Explorer,device_browser_Iron,device_browser_Konqueror,device_browser_LYF_LS_4002_11,device_browser_LYF_LS_4002_12,device_browser_Lunascape,device_browser_M5,device_browser_MQQBrowser,device_browser_MRCHROME,device_browser_Maxthon,device_browser_Mozilla,device_browser_Mozilla Compatible Agent,device_browser_Nichrome,device_browser_Nintendo Browser,device_browser_Nokia Browser,device_browser_NokiaE52-1,device_browser_Opera,device_browser_Opera Mini,device_browser_Puffin,device_browser_Reddit,device_browser_Safari,device_browser_Safari (in-app),device_browser_SeaMonkey,device_browser_Seznam,device_browser_TCL P500M,device_browser_ThumbSniper,device_browser_UC Browser,device_browser_User Agent,device_browser_YE,device_browser_YaBrowser,device_browser_[Use default User-agent string] LIVRENPOCHE,device_browser_no-ua,device_browser_osee2unifiedRelease,device_browser_subjectAgent: NoticiasBoom,device_browser_nan,device_deviceCategory_desktop,device_deviceCategory_mobile,device_deviceCategory_tablet,device_deviceCategory_nan,device_operatingSystem_(not set),device_operatingSystem_Android,device_operatingSystem_BlackBerry,device_operatingSystem_Chrome OS,device_operatingSystem_Firefox OS,device_operatingSystem_FreeBSD,device_operatingSystem_Linux,device_operatingSystem_Macintosh,device_operatingSystem_NTT DoCoMo,device_operatingSystem_Nintendo 3DS,device_operatingSystem_Nintendo Wii,device_operatingSystem_Nintendo WiiU,device_operatingSystem_Nokia,device_operatingSystem_OpenBSD,device_operatingSystem_Samsung,device_operatingSystem_SunOS,device_operatingSystem_Windows,device_operatingSystem_Windows Phone,device_operatingSystem_Xbox,device_operatingSystem_iOS,device_operatingSystem_nan,geoNetwork_continent_(not set),geoNetwork_continent_Africa,geoNetwork_continent_Americas,geoNetwork_continent_Asia,geoNetwork_continent_Europe,geoNetwork_continent_Oceania,geoNetwork_continent_nan,geoNetwork_metro_(not set),geoNetwork_metro_Abilene-Sweetwater TX,geoNetwork_metro_Albany-Schenectady-Troy NY,geoNetwork_metro_Atlanta GA,geoNetwork_metro_Augusta GA,geoNetwork_metro_Austin TX,geoNetwork_metro_Baltimore MD,geoNetwork_metro_Boise ID,geoNetwork_metro_Boston MA-Manchester NH,geoNetwork_metro_Butte-Bozeman MT,geoNetwork_metro_Central Scotland,geoNetwork_metro_Charleston SC,geoNetwork_metro_Charlotte NC,geoNetwork_metro_Charlottesville VA,geoNetwork_metro_Chattanooga TN,geoNetwork_metro_Chicago IL,geoNetwork_metro_Chico-Redding CA,geoNetwork_metro_Cincinnati OH,geoNetwork_metro_Cleveland-Akron (Canton) OH,geoNetwork_metro_Colorado Springs-Pueblo CO,geoNetwork_metro_Columbus OH,geoNetwork_metro_Dallas-Ft. Worth TX,geoNetwork_metro_Denver CO,geoNetwork_metro_Detroit MI,geoNetwork_metro_El Paso TX,geoNetwork_metro_Fresno-Visalia CA,geoNetwork_metro_Grand Rapids-Kalamazoo-Battle Creek MI,geoNetwork_metro_Green Bay-Appleton WI,geoNetwork_metro_Greenville-Spartanburg-Asheville-Anderson,geoNetwork_metro_HTV Wales,geoNetwork_metro_HTV West,geoNetwork_metro_Harlingen-Weslaco-Brownsville-McAllen TX,geoNetwork_metro_Hartford & New Haven CT,geoNetwork_metro_Honolulu HI,geoNetwork_metro_Houston TX,geoNetwork_metro_Idaho Falls-Pocatello ID,geoNetwork_metro_Indianapolis IN,geoNetwork_metro_JP_KANTO,geoNetwork_metro_JP_KINKI,geoNetwork_metro_JP_OTHER,geoNetwork_metro_Jacksonville FL,geoNetwork_metro_Kansas City MO,geoNetwork_metro_La Crosse-Eau Claire WI,geoNetwork_metro_Lansing MI,geoNetwork_metro_Las Vegas NV,geoNetwork_metro_Lexington KY,geoNetwork_metro_London,geoNetwork_metro_Los Angeles CA,geoNetwork_metro_Louisville KY,geoNetwork_metro_Madison WI,geoNetwork_metro_Mankato MN,geoNetwork_metro_Memphis TN,geoNetwork_metro_Meridian (exc. Channel Islands),geoNetwork_metro_Miami-Ft. Lauderdale FL,geoNetwork_metro_Midlands,geoNetwork_metro_Milwaukee WI,geoNetwork_metro_Minneapolis-St. Paul MN,geoNetwork_metro_Nashville TN,geoNetwork_metro_New Orleans LA,geoNetwork_metro_New York NY,geoNetwork_metro_Norfolk-Portsmouth-Newport News VA,geoNetwork_metro_North Scotland,geoNetwork_metro_North West,geoNetwork_metro_Omaha NE,geoNetwork_metro_Orlando-Daytona Beach-Melbourne FL,geoNetwork_metro_Panama City FL,geoNetwork_metro_Philadelphia PA,geoNetwork_metro_Phoenix AZ,geoNetwork_metro_Pittsburgh PA,geoNetwork_metro_Portland OR,"geoNetwork_metro_Providence-New Bedford,MA",geoNetwork_metro_Raleigh-Durham (Fayetteville) NC,geoNetwork_metro_Roanoke-Lynchburg VA,"geoNetwork_metro_Rochester-Mason City-Austin,IA",geoNetwork_metro_Sacramento-Stockton-Modesto CA,geoNetwork_metro_Salt Lake City UT,geoNetwork_metro_San Antonio TX,geoNetwork_metro_San Diego CA,geoNetwork_metro_San Francisco-Oakland-San Jose CA,geoNetwork_metro_Seattle-Tacoma WA,geoNetwork_metro_Springfield MO,geoNetwork_metro_Springfield-Holyoke MA,geoNetwork_metro_St. Louis MO,geoNetwork_metro_Syracuse NY,geoNetwork_metro_Tallahassee FL-Thomasville GA,geoNetwork_metro_Tampa-St. Petersburg (Sarasota) FL,geoNetwork_metro_Tri-Cities TN-VA,geoNetwork_metro_Tucson (Sierra Vista) AZ,geoNetwork_metro_Tulsa OK,geoNetwork_metro_Utica NY,geoNetwork_metro_Washington DC (Hagerstown MD),geoNetwork_metro_Wheeling WV-Steubenville OH,geoNetwork_metro_Yorkshire,geoNetwork_metro_not available in demo dataset,geoNetwork_metro_nan,geoNetwork_subContinent_(not set),geoNetwork_subContinent_Australasia,geoNetwork_subContinent_Caribbean,geoNetwork_subContinent_Central America,geoNetwork_subContinent_Central Asia,geoNetwork_subContinent_Eastern Africa,geoNetwork_subContinent_Eastern Asia,geoNetwork_subContinent_Eastern Europe,geoNetwork_subContinent_Melanesia,geoNetwork_subContinent_Micronesian Region,geoNetwork_subContinent_Middle Africa,geoNetwork_subContinent_Northern Africa,geoNetwork_subContinent_Northern America,geoNetwork_subContinent_Northern Europe,geoNetwork_subContinent_Polynesia,geoNetwork_subContinent_South America,geoNetwork_subContinent_Southeast Asia,geoNetwork_subContinent_Southern Africa,geoNetwork_subContinent_Southern Asia,geoNetwork_subContinent_Southern Europe,geoNetwork_subContinent_Western Africa,geoNetwork_subContinent_Western Asia,geoNetwork_subContinent_Western Europe,geoNetwork_subContinent_nan,trafficSource_adContent_20% discount,trafficSource_adContent_Ad from 11/3/16,trafficSource_adContent_Ad from 11/7/16,trafficSource_adContent_Ad from 12/13/16,trafficSource_adContent_Ad from 2/17/17,trafficSource_adContent_Display Ad created 11/17/14,trafficSource_adContent_Display Ad created 3/11/14,trafficSource_adContent_Display Ad created 3/11/15,trafficSource_adContent_First Full Auto Template Test Ad,trafficSource_adContent_Free Shipping!,trafficSource_adContent_Full auto ad IMAGE ONLY,trafficSource_adContent_Full auto ad NATIVE ONLY,trafficSource_adContent_Full auto ad TEXT ONLY,trafficSource_adContent_Full auto ad TEXT/NATIVE,trafficSource_adContent_Full auto ad with Primary Color,trafficSource_adContent_GA Help Center,trafficSource_adContent_Google Merchandise,trafficSource_adContent_Google Merchandise Collection,trafficSource_adContent_Google Online Store,trafficSource_adContent_Google Paraphernalia,trafficSource_adContent_Google Store,trafficSource_adContent_Google store,trafficSource_adContent_JD_5a_v1,trafficSource_adContent_LeEco_1a,trafficSource_adContent_Men's-Outerwear Google Apparel,trafficSource_adContent_Official Google Merchandise - Fast Shipping,trafficSource_adContent_Swag w/ Google Logos,trafficSource_adContent_Swag with Google Logos,trafficSource_adContent_Want Google Sunglasses,trafficSource_adContent_free shipping,trafficSource_adContent_google store,trafficSource_adContent_url_builder,trafficSource_adContent_visit us again,trafficSource_adContent_{KeyWord:Google Brand Items},trafficSource_adContent_{KeyWord:Google Branded Apparel},trafficSource_adContent_{KeyWord:Google Branded Gear},trafficSource_adContent_{KeyWord:Google Branded Kit},trafficSource_adContent_{KeyWord:Google Branded Outerwear},trafficSource_adContent_{KeyWord:Google Drinkware},trafficSource_adContent_{KeyWord:Google Men's T-Shirts},trafficSource_adContent_{KeyWord:Google Merchandise},trafficSource_adContent_{KeyWord:Looking for Google Bags?},trafficSource_adContent_{KeyWord:Want Google Pet Toys?},trafficSource_adContent_{KeyWord:Want Google Stickers?},trafficSource_adContent_nan,trafficSource_adwordsClickInfo.adNetworkType_Google Search,trafficSource_adwordsClickInfo.adNetworkType_Search partners,trafficSource_adwordsClickInfo.adNetworkType_nan,trafficSource_adwordsClickInfo.slot_RHS,trafficSource_adwordsClickInfo.slot_Top,trafficSource_adwordsClickInfo.slot_nan,trafficSource_campaign_(not set),trafficSource_campaign_AW - Accessories,trafficSource_campaign_AW - Apparel,trafficSource_campaign_AW - Dynamic Search Ads Whole Site,trafficSource_campaign_AW - Electronics,trafficSource_campaign_All Products,trafficSource_campaign_Data Share,trafficSource_campaign_Data Share Promo,trafficSource_campaign_Retail (DO NOT EDIT owners nophakun and tianyu),trafficSource_campaign_test-liyuhz,trafficSource_campaign_nan,trafficSource_medium_(none),trafficSource_medium_(not set),trafficSource_medium_affiliate,trafficSource_medium_cpc,trafficSource_medium_cpm,trafficSource_medium_organic,trafficSource_medium_referral,trafficSource_medium_nan
0,2,False,2,175,16269,1,1.0,4,1.0,4.0,0,0.0,1.0,1.0,12,0,150,2017,10,42,16,0,289,False,False,False,False,False,False,1508151024,True,True,False,True,True,False,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,1,False,643,182,20382,18,1.0,5,1.0,5.0,0,0.0,1.0,1.0,12,0,150,2017,10,42,16,0,289,False,False,False,False,False,False,1508175522,True,False,False,True,True,True,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,1,False,649,70,21582,376,1.0,7,1.0,7.0,0,0.0,1.0,1.0,12,0,150,2017,10,42,16,0,289,False,False,False,False,False,False,1508143220,True,False,False,True,True,True,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,1,True,367,213,1,58,1.0,8,1.0,4.0,0,0.0,1.0,1.0,12,0,150,2017,10,42,16,0,289,False,False,False,False,False,False,1508193530,True,False,False,True,True,True,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,1,False,511,213,1,58,1.0,9,1.0,4.0,0,0.0,1.0,1.0,12,0,150,2017,10,42,17,1,290,False,False,False,False,False,False,1508217442,True,False,False,True,True,True,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [51]:
all(df.columns == test.columns)

True

In [46]:
y.mean()

0.2421829352036406

In [45]:
y = np.log1p(y)

In [54]:
m = RandomForestRegressor(n_jobs=-1, n_estimators=100)
m.fit(df, y)
m.score(df,y)

0.9029624185260904

#### Train valid split

In [49]:
all(full_train.index == df.index)

ValueError: Lengths must match to compare

##### First simple way to make validation 

In [56]:
valid_index = full_train[full_train.fullVisitorId.isin(validationIds)].index

In [57]:
train_index = full_train[~full_train.fullVisitorId.isin(validationIds)].index

In [58]:
def split_vals_array(a, train_index, val_index):
    return a[train_index].copy(), a[val_index].copy()

def split_vals_df(a, train_index, val_index):
    return a.iloc[train_index].copy(), a.iloc[val_index].copy()


raw_train, raw_valid = split_vals_df(full_train, train_index, valid_index)
X_train, X_valid = split_vals_df(df, train_index, valid_index)
y_train, y_valid = split_vals_array(y, train_index, valid_index)

X_train.shape, y_train.shape, X_valid.shape

((681849, 321), (681849,), (221804, 321))

#####  Second way to validate

In [51]:
# dataframe should be sorted by time
# TODO

False

In [47]:
tscv = TimeSeriesSplit(n_splits=5)

In [48]:
for tr_index, val_index in tscv.split(df.visitStartTime):
    print(f'Train max time: {df.iloc[tr_index].visitStartTime.max()}, Valid max time: {df.iloc[val_index].visitStartTime.max()}')

Train max time: 1501657193, Valid max time: 1501311350
Train max time: 1501657193, Valid max time: 1500879240
Train max time: 1501657193, Valid max time: 1501138795
Train max time: 1501657193, Valid max time: 1501570458
Train max time: 1501657193, Valid max time: 1501479440


In [49]:
1501657193 < 1501311350

False

In [59]:
y_train.mean(), y_valid.mean()

(0.20146147902692166, 0.3059895624328279)

In [60]:
y_train.max(), y_valid.max()

(23.497337833653027, 23.86437469605166)

#### Example of TimeSeriesSplit

In [89]:
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([1, 2, 3, 4, 5, 6])
tscv = TimeSeriesSplit(n_splits=5)
print(tscv)  

for train_index, test_index in tscv.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

TimeSeriesSplit(max_train_size=None, n_splits=5)
TRAIN: [0] TEST: [1]
TRAIN: [0 1] TEST: [2]
TRAIN: [0 1 2] TEST: [3]
TRAIN: [0 1 2 3] TEST: [4]
TRAIN: [0 1 2 3 4] TEST: [5]


In [90]:
tscv = TimeSeriesSplit(n_splits=5)

#### Base Model 

In [61]:
def print_score(m):
    res = [rmse(m.predict(X_train), y_train), rmse(m.predict(X_valid), y_valid),
           m.score(X_train, y_train), m.score(X_valid, y_valid)]
    if hasattr(m, 'oob_score_'):
        res.append(m.oob_score_)
    print(res)

In [62]:
def rmse(x, y):
    return math.sqrt(((x-y)**2).mean())

In [63]:
m_val = RandomForestRegressor(n_jobs=-1, n_estimators=100)
%time m_val.fit(X_train, y_train)

CPU times: user 46min 33s, sys: 50.6 s, total: 47min 24s
Wall time: 6min 57s


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

#### Visit Level

In [64]:
print_score(m_val)

[0.5976430330813959, 2.1541521568709947, 0.8997188839647949, 0.14058583153614346]


#### Go to User Level

In [65]:
val_preds = m_val.predict(X_valid)

In [66]:
val_preds[val_preds < 0] = 0.

In [67]:
grouped = raw_valid[['fullVisitorId']].copy()

In [68]:
grouped['y'] = np.expm1(y_valid)
grouped['y_hat'] = np.expm1(val_preds)

In [69]:
grouped = grouped.groupby('fullVisitorId', as_index=False).sum()

In [70]:
grouped['y'] = np.log1p(grouped['y'])
grouped['y_hat'] = np.log1p(grouped['y_hat'])

In [71]:
rmse(grouped.y, grouped.y_hat)

2.1447894992588807

####  Test flow

In [72]:
test_preds = m.predict(test)
test_preds[test_preds < 0] = 0.

In [76]:
test_grouped = test_df[['fullVisitorId']].copy()
test_grouped['y_hat'] = np.expm1(test_preds)

In [79]:
test_grouped = test_grouped.groupby('fullVisitorId', as_index=False).sum()
test_grouped['y_hat'] = np.log1p(test_grouped['y_hat'])

In [82]:
submit = pd.read_csv('../data/sample_submission.csv')

In [83]:
test_grouped.fullVisitorId.tolist() == submit.fullVisitorId.tolist()

True

In [84]:
submit['fullVisitorId'] = test_grouped['fullVisitorId']

In [85]:
submit['PredictedLogRevenue'] = test_grouped['y_hat']

In [86]:
os.makedirs('../submissions', exist_ok=True)

In [87]:
submit.to_csv('../submissions/rf_baseline.csv', header=True, index=False)