### Imports 

In [1]:
import sys
import numpy as np
import pandas as pd

In [2]:
pd.set_option('display.max_columns', 500)

In [3]:
sys.path.append('../src/')

In [4]:
from data_utils import process_data, load_data, align_frames, preprocess_pipeline
from cleaner import Cleaner

### Loads

In [4]:
# uncomment if there are no processed (flattened) files
# process_data(nrows=10000)

In [13]:
%%time
train_df, test_df = load_data()

CPU times: user 311 ms, sys: 32.2 ms, total: 343 ms
Wall time: 384 ms


In [5]:
%%time
# check preprocessing pipeline
train_df, test_df = preprocess_pipeline(nrows=10000, nan_fraction=0.999)

2018-09-24 16:47:09,604 - data_utils - DEBUG - Preprocessing pipeline started..
2018-09-24 16:47:09,606 - data_utils - DEBUG - start processing
2018-09-24 16:47:09,608 - data_utils - DEBUG - flatten json data at path: ../src/../data/train.csv
2018-09-24 16:47:13,161 - data_utils - DEBUG - done flattening
2018-09-24 16:47:14,200 - data_utils - DEBUG - flatten json data at path: ../src/../data/test.csv
2018-09-24 16:47:17,505 - data_utils - DEBUG - done flattening
2018-09-24 16:47:18,088 - data_utils - DEBUG - done processing
2018-09-24 16:47:18,089 - data_utils - DEBUG - loading flatenned data..
2018-09-24 16:47:18,752 - data_utils - DEBUG - cleaning data..
2018-09-24 16:47:19,315 - data_utils - DEBUG - aligning data..
2018-09-24 16:47:19,352 - data_utils - DEBUG - finally saving preprocessed data
2018-09-24 16:47:19,862 - data_utils - DEBUG - Preprocessing pipeline - done.
CPU times: user 6.18 s, sys: 665 ms, total: 6.85 s
Wall time: 10.3 s


In [6]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 35 columns):
channelGrouping                                 10000 non-null object
date                                            10000 non-null int64
fullVisitorId                                   10000 non-null object
sessionId                                       10000 non-null object
visitId                                         10000 non-null int64
visitNumber                                     10000 non-null int64
visitStartTime                                  10000 non-null int64
device_browser                                  10000 non-null object
device_deviceCategory                           10000 non-null object
device_isMobile                                 10000 non-null bool
device_operatingSystem                          10000 non-null object
geoNetwork_city                                 10000 non-null object
geoNetwork_continent                            10000 non-

In [7]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 34 columns):
channelGrouping                                 10000 non-null object
date                                            10000 non-null int64
fullVisitorId                                   10000 non-null object
sessionId                                       10000 non-null object
visitId                                         10000 non-null int64
visitNumber                                     10000 non-null int64
visitStartTime                                  10000 non-null int64
device_browser                                  10000 non-null object
device_deviceCategory                           10000 non-null object
device_isMobile                                 10000 non-null bool
device_operatingSystem                          10000 non-null object
geoNetwork_city                                 10000 non-null object
geoNetwork_continent                            10000 non-

In [8]:
train_df.head()

Unnamed: 0,channelGrouping,date,fullVisitorId,sessionId,socialEngagementType,visitId,visitNumber,visitStartTime,device_browser,device_browserSize,device_browserVersion,device_deviceCategory,device_flashVersion,device_isMobile,device_language,device_mobileDeviceBranding,device_mobileDeviceInfo,device_mobileDeviceMarketingName,device_mobileDeviceModel,device_mobileInputSelector,device_operatingSystem,device_operatingSystemVersion,device_screenColors,device_screenResolution,geoNetwork_city,geoNetwork_cityId,geoNetwork_continent,geoNetwork_country,geoNetwork_latitude,geoNetwork_longitude,geoNetwork_metro,geoNetwork_networkDomain,geoNetwork_networkLocation,geoNetwork_region,geoNetwork_subContinent,totals_bounces,totals_hits,totals_newVisits,totals_pageviews,totals_transactionRevenue,totals_visits,trafficSource_adContent,trafficSource_adwordsClickInfo.adNetworkType,trafficSource_adwordsClickInfo.criteriaParameters,trafficSource_adwordsClickInfo.gclId,trafficSource_adwordsClickInfo.isVideoAd,trafficSource_adwordsClickInfo.page,trafficSource_adwordsClickInfo.slot,trafficSource_campaign,trafficSource_isTrueDirect,trafficSource_keyword,trafficSource_medium,trafficSource_referralPath,trafficSource_source
0,Organic Search,20160902,1131660440785968503,1131660440785968503_1472830385,Not Socially Engaged,1472830385,1,1472830385,Chrome,not available in demo dataset,not available in demo dataset,desktop,not available in demo dataset,False,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Windows,not available in demo dataset,not available in demo dataset,not available in demo dataset,Izmir,not available in demo dataset,Asia,Turkey,not available in demo dataset,not available in demo dataset,(not set),ttnet.com.tr,not available in demo dataset,Izmir,Western Asia,1.0,1,1.0,1,,1,,,not available in demo dataset,,,,,(not set),,(not provided),organic,,google
1,Organic Search,20160902,377306020877927890,377306020877927890_1472880147,Not Socially Engaged,1472880147,1,1472880147,Firefox,not available in demo dataset,not available in demo dataset,desktop,not available in demo dataset,False,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Macintosh,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Oceania,Australia,not available in demo dataset,not available in demo dataset,not available in demo dataset,dodo.net.au,not available in demo dataset,not available in demo dataset,Australasia,1.0,1,1.0,1,,1,,,not available in demo dataset,,,,,(not set),,(not provided),organic,,google
2,Organic Search,20160902,3895546263509774583,3895546263509774583_1472865386,Not Socially Engaged,1472865386,1,1472865386,Chrome,not available in demo dataset,not available in demo dataset,desktop,not available in demo dataset,False,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Windows,not available in demo dataset,not available in demo dataset,not available in demo dataset,Madrid,not available in demo dataset,Europe,Spain,not available in demo dataset,not available in demo dataset,(not set),unknown.unknown,not available in demo dataset,Community of Madrid,Southern Europe,1.0,1,1.0,1,,1,,,not available in demo dataset,,,,,(not set),,(not provided),organic,,google
3,Organic Search,20160902,4763447161404445595,4763447161404445595_1472881213,Not Socially Engaged,1472881213,1,1472881213,UC Browser,not available in demo dataset,not available in demo dataset,desktop,not available in demo dataset,False,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Linux,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Asia,Indonesia,not available in demo dataset,not available in demo dataset,not available in demo dataset,unknown.unknown,not available in demo dataset,not available in demo dataset,Southeast Asia,1.0,1,1.0,1,,1,,,not available in demo dataset,,,,,(not set),,google + online,organic,,google
4,Organic Search,20160902,27294437909732085,27294437909732085_1472822600,Not Socially Engaged,1472822600,2,1472822600,Chrome,not available in demo dataset,not available in demo dataset,mobile,not available in demo dataset,True,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Android,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Europe,United Kingdom,not available in demo dataset,not available in demo dataset,not available in demo dataset,unknown.unknown,not available in demo dataset,not available in demo dataset,Northern Europe,1.0,1,,1,,1,,,not available in demo dataset,,,,,(not set),True,(not provided),organic,,google


In [9]:
test_df.head()

Unnamed: 0,channelGrouping,date,fullVisitorId,sessionId,socialEngagementType,visitId,visitNumber,visitStartTime,device_browser,device_browserSize,device_browserVersion,device_deviceCategory,device_flashVersion,device_isMobile,device_language,device_mobileDeviceBranding,device_mobileDeviceInfo,device_mobileDeviceMarketingName,device_mobileDeviceModel,device_mobileInputSelector,device_operatingSystem,device_operatingSystemVersion,device_screenColors,device_screenResolution,geoNetwork_city,geoNetwork_cityId,geoNetwork_continent,geoNetwork_country,geoNetwork_latitude,geoNetwork_longitude,geoNetwork_metro,geoNetwork_networkDomain,geoNetwork_networkLocation,geoNetwork_region,geoNetwork_subContinent,totals_bounces,totals_hits,totals_newVisits,totals_pageviews,totals_visits,trafficSource_adContent,trafficSource_adwordsClickInfo.adNetworkType,trafficSource_adwordsClickInfo.criteriaParameters,trafficSource_adwordsClickInfo.gclId,trafficSource_adwordsClickInfo.isVideoAd,trafficSource_adwordsClickInfo.page,trafficSource_adwordsClickInfo.slot,trafficSource_campaign,trafficSource_isTrueDirect,trafficSource_keyword,trafficSource_medium,trafficSource_referralPath,trafficSource_source
0,Organic Search,20171016,6167871330617112363,6167871330617112363_1508151024,Not Socially Engaged,1508151024,2,1508151024,Chrome,not available in demo dataset,not available in demo dataset,desktop,not available in demo dataset,False,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Macintosh,not available in demo dataset,not available in demo dataset,not available in demo dataset,(not set),not available in demo dataset,Asia,Singapore,not available in demo dataset,not available in demo dataset,(not set),myrepublic.com.sg,not available in demo dataset,(not set),Southeast Asia,,4,,4.0,1,,,not available in demo dataset,,,,,(not set),True,(not provided),organic,,google
1,Organic Search,20171016,643697640977915618,0643697640977915618_1508175522,Not Socially Engaged,1508175522,1,1508175522,Chrome,not available in demo dataset,not available in demo dataset,desktop,not available in demo dataset,False,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Windows,not available in demo dataset,not available in demo dataset,not available in demo dataset,Zaragoza,not available in demo dataset,Europe,Spain,not available in demo dataset,not available in demo dataset,(not set),rima-tde.net,not available in demo dataset,Aragon,Southern Europe,,5,1.0,5.0,1,,,not available in demo dataset,,,,,(not set),,(not provided),organic,,google
2,Organic Search,20171016,6059383810968229466,6059383810968229466_1508143220,Not Socially Engaged,1508143220,1,1508143220,Chrome,not available in demo dataset,not available in demo dataset,desktop,not available in demo dataset,False,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Macintosh,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Europe,France,not available in demo dataset,not available in demo dataset,not available in demo dataset,sfr.net,not available in demo dataset,not available in demo dataset,Western Europe,,7,1.0,7.0,1,,,not available in demo dataset,,,,,(not set),,(not provided),organic,,google
3,Organic Search,20171016,2376720078563423631,2376720078563423631_1508193530,Not Socially Engaged,1508193530,1,1508193530,Safari,not available in demo dataset,not available in demo dataset,mobile,not available in demo dataset,True,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,iOS,not available in demo dataset,not available in demo dataset,not available in demo dataset,Mountain View,not available in demo dataset,Americas,United States,not available in demo dataset,not available in demo dataset,San Francisco-Oakland-San Jose CA,(not set),not available in demo dataset,California,Northern America,,8,1.0,4.0,1,,,not available in demo dataset,,,,,(not set),,(not provided),organic,,google
4,Organic Search,20171016,2314544520795440038,2314544520795440038_1508217442,Not Socially Engaged,1508217442,1,1508217442,Safari,not available in demo dataset,not available in demo dataset,desktop,not available in demo dataset,False,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Macintosh,not available in demo dataset,not available in demo dataset,not available in demo dataset,San Jose,not available in demo dataset,Americas,United States,not available in demo dataset,not available in demo dataset,San Francisco-Oakland-San Jose CA,(not set),not available in demo dataset,California,Northern America,,9,1.0,4.0,1,,,not available in demo dataset,,,,,(not set),,(not provided),organic,,google


### Working with processing pipeline

In [16]:
cleaner = Cleaner()

In [17]:
%%time
train_df = cleaner.clean(train_df, nan_fraction=0.999)
test_df = cleaner.clean(test_df, nan_fraction=0.999)

CPU times: user 293 ms, sys: 7.23 ms, total: 301 ms
Wall time: 316 ms


In [18]:
%%time
train_df, test_df = align_frames(train_df, test_df, 'totals_transactionRevenue')

CPU times: user 11 ms, sys: 1.17 ms, total: 12.2 ms
Wall time: 12.4 ms


In [8]:
set(train_df.columns).difference(test_df.columns)

{'totals_transactionRevenue'}

### Exploration

`sessionId` is `fullVisitorId` + '_' + `visitStartTime`.

In [8]:
# A result of 1 means every sessionId in train_df is unique;
# anything larger means at least one sessionId occurs more than once.
train_df.sessionId.value_counts().max()

2

In [29]:
test_df.sessionId.value_counts().max()

2

In [15]:
# True  -> number of distinct sessionIds that occur exactly twice
# False -> number of distinct sessionIds with any other count
train_df.sessionId.value_counts().eq(2).value_counts()

False    901857
True        898
Name: sessionId, dtype: int64

In [30]:
# True  -> number of distinct sessionIds that occur exactly twice
# False -> number of distinct sessionIds with any other count
test_df.sessionId.value_counts().eq(2).value_counts()

False    803042
True        821
Name: sessionId, dtype: int64

In [16]:
# Number of unique visitors
train_df.fullVisitorId.nunique()

714167

In [31]:
test_df.fullVisitorId.nunique()

617242

#### Find and remove columns with one unique value 

In [10]:
# Print every train_df column that holds a single distinct value, together
# with that value; these are the candidates for removal in this section.
for column in train_df.columns:
    uniqs = train_df[column].unique()
    if len(uniqs) == 1:   # .unique() keeps NaN as a value; .nunique() skips NaN by default
        print(column, uniqs)

socialEngagementType ['Not Socially Engaged']
device_browserSize ['not available in demo dataset']
device_browserVersion ['not available in demo dataset']
device_flashVersion ['not available in demo dataset']
device_language ['not available in demo dataset']
device_mobileDeviceBranding ['not available in demo dataset']
device_mobileDeviceInfo ['not available in demo dataset']
device_mobileDeviceMarketingName ['not available in demo dataset']
device_mobileDeviceModel ['not available in demo dataset']
device_mobileInputSelector ['not available in demo dataset']
device_operatingSystemVersion ['not available in demo dataset']
device_screenColors ['not available in demo dataset']
device_screenResolution ['not available in demo dataset']
geoNetwork_cityId ['not available in demo dataset']
geoNetwork_latitude ['not available in demo dataset']
geoNetwork_longitude ['not available in demo dataset']
geoNetwork_networkLocation ['not available in demo dataset']
totals_visits [1]
trafficSource_adwo

In [11]:
# Print every test_df column that holds a single distinct value, together
# with that value; these are the candidates for removal in this section.
for column in test_df.columns:
    uniqs = test_df[column].unique()
    if len(uniqs) == 1:   # .unique() keeps NaN as a value; .nunique() skips NaN by default
        print(column, uniqs)

socialEngagementType ['Not Socially Engaged']
device_browserSize ['not available in demo dataset']
device_browserVersion ['not available in demo dataset']
device_flashVersion ['not available in demo dataset']
device_language ['not available in demo dataset']
device_mobileDeviceBranding ['not available in demo dataset']
device_mobileDeviceInfo ['not available in demo dataset']
device_mobileDeviceMarketingName ['not available in demo dataset']
device_mobileDeviceModel ['not available in demo dataset']
device_mobileInputSelector ['not available in demo dataset']
device_operatingSystemVersion ['not available in demo dataset']
device_screenColors ['not available in demo dataset']
device_screenResolution ['not available in demo dataset']
geoNetwork_cityId ['not available in demo dataset']
geoNetwork_latitude ['not available in demo dataset']
geoNetwork_longitude ['not available in demo dataset']
geoNetwork_networkLocation ['not available in demo dataset']
totals_visits [1]
trafficSource_adwo

In [12]:
def drop_col_with_one_val(df):
    """
    Return a new dataframe without the columns
    that hold a single distinct value.

    NaN counts as a value here (`.unique()` keeps it, while
    `.nunique()` skips it by default), so an all-NaN column is
    dropped and a column mixing one value with NaNs is kept.
    The input dataframe is not modified.
    """
    # collect the labels first and drop once: dropping inside the
    # loop copies the whole frame for every removed column
    constant_columns = [
        column for column in df.columns
        if len(df[column].unique()) == 1
    ]
    return df.drop(constant_columns, axis=1)

In [13]:
%%time
train_df = drop_col_with_one_val(train_df)

CPU times: user 123 ms, sys: 13.1 ms, total: 137 ms
Wall time: 166 ms


In [14]:
%%time
test_df = drop_col_with_one_val(test_df)

CPU times: user 138 ms, sys: 5.9 ms, total: 144 ms
Wall time: 166 ms


In [19]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 28 columns):
channelGrouping               10000 non-null object
date                          10000 non-null int64
fullVisitorId                 10000 non-null object
sessionId                     10000 non-null object
visitId                       10000 non-null int64
visitNumber                   10000 non-null int64
visitStartTime                10000 non-null int64
device_browser                10000 non-null object
device_deviceCategory         10000 non-null object
device_isMobile               10000 non-null bool
device_operatingSystem        10000 non-null object
geoNetwork_city               10000 non-null object
geoNetwork_continent          10000 non-null object
geoNetwork_country            10000 non-null object
geoNetwork_metro              10000 non-null object
geoNetwork_networkDomain      10000 non-null object
geoNetwork_region             10000 non-null object
geoNetwork_sub

In [16]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 34 columns):
channelGrouping                                 10000 non-null object
date                                            10000 non-null int64
fullVisitorId                                   10000 non-null object
sessionId                                       10000 non-null object
visitId                                         10000 non-null int64
visitNumber                                     10000 non-null int64
visitStartTime                                  10000 non-null int64
device_browser                                  10000 non-null object
device_deviceCategory                           10000 non-null object
device_isMobile                                 10000 non-null bool
device_operatingSystem                          10000 non-null object
geoNetwork_city                                 10000 non-null object
geoNetwork_continent                            10000 non-

In [17]:
def remove_nan_columns(df, fraction):
    """
    Remove column in dataframe
    if fraction of NaNs in that column
    is >= `fraction`

    Returns a new dataframe; the input is not modified.
    """
    # Compare the NaN share itself with `fraction`. The previous check
    # (`notna share < 1 - fraction`) was a strict `>` in disguise, so it
    # disagreed with the documented `>=`, could never drop an all-NaN
    # column for fraction=1.0, and depended on the rounding of `1 - fraction`.
    nan_share = df.isna().mean()
    columns_to_drop = nan_share[nan_share >= fraction].index
    return df.drop(columns_to_drop, axis=1)

In [18]:
train_df = remove_nan_columns(train_df, 0.85)

In [20]:
test_df = remove_nan_columns(test_df, 0.85)

In [21]:
train_df.head()

Unnamed: 0,channelGrouping,date,fullVisitorId,sessionId,visitId,visitNumber,visitStartTime,device_browser,device_deviceCategory,device_isMobile,device_operatingSystem,geoNetwork_city,geoNetwork_continent,geoNetwork_country,geoNetwork_metro,geoNetwork_networkDomain,geoNetwork_region,geoNetwork_subContinent,totals_bounces,totals_hits,totals_newVisits,totals_pageviews,trafficSource_campaign,trafficSource_isTrueDirect,trafficSource_keyword,trafficSource_medium,trafficSource_referralPath,trafficSource_source
0,Organic Search,20160902,1131660440785968503,1131660440785968503_1472830385,1472830385,1,1472830385,Chrome,desktop,False,Windows,Izmir,Asia,Turkey,(not set),ttnet.com.tr,Izmir,Western Asia,1.0,1,1.0,1,(not set),,(not provided),organic,,google
1,Organic Search,20160902,377306020877927890,377306020877927890_1472880147,1472880147,1,1472880147,Firefox,desktop,False,Macintosh,not available in demo dataset,Oceania,Australia,not available in demo dataset,dodo.net.au,not available in demo dataset,Australasia,1.0,1,1.0,1,(not set),,(not provided),organic,,google
2,Organic Search,20160902,3895546263509774583,3895546263509774583_1472865386,1472865386,1,1472865386,Chrome,desktop,False,Windows,Madrid,Europe,Spain,(not set),unknown.unknown,Community of Madrid,Southern Europe,1.0,1,1.0,1,(not set),,(not provided),organic,,google
3,Organic Search,20160902,4763447161404445595,4763447161404445595_1472881213,1472881213,1,1472881213,UC Browser,desktop,False,Linux,not available in demo dataset,Asia,Indonesia,not available in demo dataset,unknown.unknown,not available in demo dataset,Southeast Asia,1.0,1,1.0,1,(not set),,google + online,organic,,google
4,Organic Search,20160902,27294437909732085,27294437909732085_1472822600,1472822600,2,1472822600,Chrome,mobile,True,Android,not available in demo dataset,Europe,United Kingdom,not available in demo dataset,unknown.unknown,not available in demo dataset,Northern Europe,1.0,1,,1,(not set),True,(not provided),organic,,google


In [22]:
test_df.head()

Unnamed: 0,channelGrouping,date,fullVisitorId,sessionId,visitId,visitNumber,visitStartTime,device_browser,device_deviceCategory,device_isMobile,device_operatingSystem,geoNetwork_city,geoNetwork_continent,geoNetwork_country,geoNetwork_metro,geoNetwork_networkDomain,geoNetwork_region,geoNetwork_subContinent,totals_bounces,totals_hits,totals_newVisits,totals_pageviews,trafficSource_campaign,trafficSource_isTrueDirect,trafficSource_keyword,trafficSource_medium,trafficSource_referralPath,trafficSource_source
0,Organic Search,20171016,6167871330617112363,6167871330617112363_1508151024,1508151024,2,1508151024,Chrome,desktop,False,Macintosh,(not set),Asia,Singapore,(not set),myrepublic.com.sg,(not set),Southeast Asia,,4,,4.0,(not set),True,(not provided),organic,,google
1,Organic Search,20171016,643697640977915618,0643697640977915618_1508175522,1508175522,1,1508175522,Chrome,desktop,False,Windows,Zaragoza,Europe,Spain,(not set),rima-tde.net,Aragon,Southern Europe,,5,1.0,5.0,(not set),,(not provided),organic,,google
2,Organic Search,20171016,6059383810968229466,6059383810968229466_1508143220,1508143220,1,1508143220,Chrome,desktop,False,Macintosh,not available in demo dataset,Europe,France,not available in demo dataset,sfr.net,not available in demo dataset,Western Europe,,7,1.0,7.0,(not set),,(not provided),organic,,google
3,Organic Search,20171016,2376720078563423631,2376720078563423631_1508193530,1508193530,1,1508193530,Safari,mobile,True,iOS,Mountain View,Americas,United States,San Francisco-Oakland-San Jose CA,(not set),California,Northern America,,8,1.0,4.0,(not set),,(not provided),organic,,google
4,Organic Search,20171016,2314544520795440038,2314544520795440038_1508217442,1508217442,1,1508217442,Safari,desktop,False,Macintosh,San Jose,Americas,United States,San Francisco-Oakland-San Jose CA,(not set),California,Northern America,,9,1.0,4.0,(not set),,(not provided),organic,,google


In [23]:
train_cols = set(train_df.columns)
test_cols = set(test_df.columns)

In [24]:
train_cols.difference(test_cols)

set()

In [31]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 27 columns):
channelGrouping               10000 non-null object
date                          10000 non-null int64
fullVisitorId                 10000 non-null object
sessionId                     10000 non-null object
visitId                       10000 non-null int64
visitNumber                   10000 non-null int64
visitStartTime                10000 non-null int64
device_browser                10000 non-null object
device_deviceCategory         10000 non-null object
device_isMobile               10000 non-null bool
device_operatingSystem        10000 non-null object
geoNetwork_city               10000 non-null object
geoNetwork_continent          10000 non-null object
geoNetwork_country            10000 non-null object
geoNetwork_metro              10000 non-null object
geoNetwork_networkDomain      10000 non-null object
geoNetwork_region             10000 non-null object
geoNetwork_sub

In [32]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 27 columns):
channelGrouping               10000 non-null object
date                          10000 non-null int64
fullVisitorId                 10000 non-null object
sessionId                     10000 non-null object
visitId                       10000 non-null int64
visitNumber                   10000 non-null int64
visitStartTime                10000 non-null int64
device_browser                10000 non-null object
device_deviceCategory         10000 non-null object
device_isMobile               10000 non-null bool
device_operatingSystem        10000 non-null object
geoNetwork_city               10000 non-null object
geoNetwork_continent          10000 non-null object
geoNetwork_country            10000 non-null object
geoNetwork_metro              10000 non-null object
geoNetwork_networkDomain      10000 non-null object
geoNetwork_region             10000 non-null object
geoNetwork_sub

In [34]:
# Index.equals compares labels and order and returns False when the two
# frames have a different number of columns; an element-wise `==` on
# Index objects of different length raises instead.
train_df.columns.equals(test_df.columns)

True

In [28]:
# errors='ignore' keeps this cell re-runnable: without it a second run
# raises KeyError because the column is already gone.
test_df = test_df.drop('totals_bounces', axis=1, errors='ignore')

In [30]:
# Inner-join the two frames on their column labels (axis=1): afterwards both
# frames keep only the columns they have in common.
# NOTE(review): any train-only column still present (e.g. the target) would be
# dropped from train_df here — confirm this is intended.
test_df, train_df = test_df.align(train_df, join='inner', axis=1)

#### Change datatypes 

In [38]:
# a column is treated as categorical when it has more than 2 and fewer than
# `cat_num` unique values (an absolute count, not a share of the number of rows)
# TODO: find number of uniques to make the column categorical
cat_num = 50

In [56]:
cat_columns_train = []

In [57]:
# Collect the train_df columns whose number of distinct values (NaN included)
# is strictly between 2 and `cat_num`, printing each one with its values.
for column in train_df.columns:
    uniqs = train_df[column].unique()
    # columns with at most 2 distinct values are left out on purpose by this bound
    if (len(uniqs) < cat_num) and (len(uniqs) > 2):
        print(column, uniqs)
        cat_columns_train.append(column)
        print()  # blank line between columns in the printed report

channelGrouping ['Organic Search' 'Referral' 'Paid Search' 'Affiliates' 'Direct' 'Display'
 'Social' '(Other)']

device_deviceCategory ['desktop' 'mobile' 'tablet']

device_operatingSystem ['Windows' 'Macintosh' 'Linux' 'Android' 'iOS' 'Chrome OS' 'BlackBerry'
 '(not set)' 'Samsung' 'Windows Phone' 'Xbox' 'Nintendo Wii' 'Firefox OS'
 'Nintendo WiiU' 'FreeBSD' 'Nokia' 'NTT DoCoMo' 'Nintendo 3DS' 'SunOS'
 'OpenBSD']

geoNetwork_continent ['Asia' 'Oceania' 'Europe' 'Americas' 'Africa' '(not set)']

geoNetwork_subContinent ['Western Asia' 'Australasia' 'Southern Europe' 'Southeast Asia'
 'Northern Europe' 'Southern Asia' 'Western Europe' 'South America'
 'Eastern Asia' 'Eastern Europe' 'Northern America' 'Western Africa'
 'Central America' 'Eastern Africa' '(not set)' 'Caribbean'
 'Southern Africa' 'Northern Africa' 'Central Asia' 'Middle Africa'
 'Melanesia' 'Micronesian Region' 'Polynesia']

trafficSource_adContent [nan 'Full auto ad IMAGE ONLY' 'First Full Auto Template Test Ad'
 '{KeyW

In [58]:
cat_columns_train = set(cat_columns_train)

In [59]:
cat_columns_test = []

In [60]:
# Collect the test_df columns whose number of distinct values (NaN included)
# is strictly between 2 and `cat_num`, printing each one with its values.
for column in test_df.columns:
    uniqs = test_df[column].unique()
    # columns with at most 2 distinct values are left out on purpose by this bound
    if (len(uniqs) < cat_num) and (len(uniqs) > 2):
        print(column, uniqs)
        cat_columns_test.append(column)
        print()  # blank line between columns in the printed report

channelGrouping ['Organic Search' 'Paid Search' 'Display' 'Direct' 'Referral' 'Social'
 'Affiliates' '(Other)']

device_deviceCategory ['desktop' 'mobile' 'tablet']

device_operatingSystem ['Macintosh' 'Windows' 'iOS' 'Linux' 'Android' 'Chrome OS' '(not set)'
 'Windows Phone' 'Samsung' 'Tizen' 'Playstation Vita' 'OS/2' 'Xbox'
 'Nintendo 3DS' 'BlackBerry' 'Nintendo WiiU' 'SymbianOS' 'Firefox OS'
 'FreeBSD' 'OpenBSD' 'SunOS' 'Nokia']

geoNetwork_continent ['Asia' 'Europe' 'Americas' 'Africa' 'Oceania' '(not set)']

geoNetwork_subContinent ['Southeast Asia' 'Southern Europe' 'Western Europe' 'Northern America'
 'Central America' 'South America' 'Northern Europe' 'Eastern Asia'
 'Western Asia' 'Eastern Europe' 'Southern Asia' 'Northern Africa'
 'Eastern Africa' 'Western Africa' 'Australasia' 'Caribbean'
 'Central Asia' '(not set)' 'Southern Africa' 'Middle Africa'
 'Micronesian Region' 'Polynesia' 'Melanesia']

trafficSource_adwordsClickInfo.adNetworkType [nan 'Google Search' 'Content' 'Se

In [61]:
cat_columns_test = set(cat_columns_test)

In [62]:
cat_columns_test.difference(cat_columns_train)

set()

In [63]:
cat_columns_test

{'channelGrouping',
 'device_deviceCategory',
 'device_operatingSystem',
 'geoNetwork_continent',
 'geoNetwork_subContinent',
 'trafficSource_adwordsClickInfo.adNetworkType',
 'trafficSource_adwordsClickInfo.page',
 'trafficSource_adwordsClickInfo.slot',
 'trafficSource_campaign',
 'trafficSource_medium'}

In [64]:
cat_columns_train

{'channelGrouping',
 'device_deviceCategory',
 'device_operatingSystem',
 'geoNetwork_continent',
 'geoNetwork_subContinent',
 'trafficSource_adContent',
 'trafficSource_adwordsClickInfo.adNetworkType',
 'trafficSource_adwordsClickInfo.page',
 'trafficSource_adwordsClickInfo.slot',
 'trafficSource_campaign',
 'trafficSource_medium'}

In [65]:
# Cast both frames with one shared dtype per column. Casting each frame on its
# own derives the categories from that frame only, so train and test can end up
# with different category sets and therefore different integer codes.
for column in cat_columns_train:
    combined = pd.concat([train_df[column], test_df[column]], ignore_index=True)
    # categories = every non-NaN value seen in either frame
    shared_dtype = pd.api.types.CategoricalDtype(combined.astype('category').cat.categories)
    train_df[column] = train_df[column].astype(shared_dtype)
    test_df[column] = test_df[column].astype(shared_dtype)

In [66]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 903653 entries, 0 to 903652
Data columns (total 35 columns):
channelGrouping                                 903653 non-null category
date                                            903653 non-null int64
fullVisitorId                                   903653 non-null object
sessionId                                       903653 non-null object
visitId                                         903653 non-null int64
visitNumber                                     903653 non-null int64
visitStartTime                                  903653 non-null int64
device_browser                                  903653 non-null object
device_deviceCategory                           903653 non-null category
device_isMobile                                 903653 non-null bool
device_operatingSystem                          903653 non-null category
geoNetwork_city                                 903653 non-null object
geoNetwork_continent                 

In [67]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 804684 entries, 0 to 804683
Data columns (total 34 columns):
channelGrouping                                 804684 non-null category
date                                            804684 non-null int64
fullVisitorId                                   804684 non-null object
sessionId                                       804684 non-null object
visitId                                         804684 non-null int64
visitNumber                                     804684 non-null int64
visitStartTime                                  804684 non-null int64
device_browser                                  804684 non-null object
device_deviceCategory                           804684 non-null category
device_isMobile                                 804684 non-null bool
device_operatingSystem                          804684 non-null category
geoNetwork_city                                 804684 non-null object
geoNetwork_continent                 

In [68]:
# Draw a small sample of train_df for the HDF5 round-trip check in the cells
# below. The seed is fixed so that Restart & Run All picks the same 100 rows
# and the check is reproducible (an unseeded sample could pass or fail
# depending on which rows happen to be drawn).
sample_df = train_df.sample(n=100, random_state=42)

In [73]:
# Cast the isVideoAd column of the sample to float.
# NOTE(review): presumably done so the column can be written with
# format='table' in the to_hdf call further down — confirm.
video_ad_col = 'trafficSource_adwordsClickInfo.isVideoAd'
sample_df[video_ad_col] = sample_df[video_ad_col].astype(float)

In [77]:
# Cast the isTrueDirect column of the sample to float.
# NOTE(review): presumably for the same HDF5 'table' storage reason as the
# isVideoAd cast — confirm.
true_direct_col = 'trafficSource_isTrueDirect'
sample_df[true_direct_col] = sample_df[true_direct_col].astype(float)

In [79]:
# Inspect the sample's dtypes after the float casts; this is the reference to
# compare against once the frame has been read back from disk.
sample_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 104103 to 99635
Data columns (total 35 columns):
channelGrouping                                 100 non-null category
date                                            100 non-null int64
fullVisitorId                                   100 non-null object
sessionId                                       100 non-null object
visitId                                         100 non-null int64
visitNumber                                     100 non-null int64
visitStartTime                                  100 non-null int64
device_browser                                  100 non-null object
device_deviceCategory                           100 non-null category
device_isMobile                                 100 non-null bool
device_operatingSystem                          100 non-null category
geoNetwork_city                                 100 non-null object
geoNetwork_continent                            100 non-null category
ge

In [78]:
# Write the 100-row sample to HDF5 in 'table' format under the key 'sample'.
sample_path = '../data/processed_data/sample.h5'
sample_df.to_hdf(sample_path, key='sample', format='table')

In [80]:
# Remove the in-memory sample; the next cell rebinds sample_df from the file
# on disk, so what is inspected afterwards is the stored copy.
del sample_df

In [81]:
# Read the sample back from disk. The key is passed explicitly: pd.read_hdf
# raises a ValueError when the key is omitted and the file holds more than one
# dataset, and to_hdf appends to an existing file by default. 'sample' is the
# key the frame was written under.
sample_df = pd.read_hdf('../data/processed_data/sample.h5', key='sample')

In [82]:
# Inspect the frame that was read back; its index, column count and dtypes
# should match the info() output shown before the write.
sample_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 104103 to 99635
Data columns (total 35 columns):
channelGrouping                                 100 non-null category
date                                            100 non-null int64
fullVisitorId                                   100 non-null object
sessionId                                       100 non-null object
visitId                                         100 non-null int64
visitNumber                                     100 non-null int64
visitStartTime                                  100 non-null int64
device_browser                                  100 non-null object
device_deviceCategory                           100 non-null category
device_isMobile                                 100 non-null bool
device_operatingSystem                          100 non-null category
geoNetwork_city                                 100 non-null object
geoNetwork_continent                            100 non-null category
ge

In [83]:
# Cast the isVideoAd column to float in both full frames, the same cast that
# was applied to sample_df earlier in the notebook.
# NOTE(review): presumably needed so the column can be stored with
# format='table' — confirm.
video_ad_col = 'trafficSource_adwordsClickInfo.isVideoAd'
for frame in (train_df, test_df):
    frame[video_ad_col] = frame[video_ad_col].astype(float)

In [84]:
# Cast the isTrueDirect column to float in both full frames, the same cast
# that was applied to sample_df earlier in the notebook.
# NOTE(review): presumably needed so the column can be stored with
# format='table' — confirm.
true_direct_col = 'trafficSource_isTrueDirect'
for frame in (train_df, test_df):
    frame[true_direct_col] = frame[true_direct_col].astype(float)

In [85]:
# Persist the full training frame to HDF5 in 'table' format under the key
# 'train_df'.
train_df.to_hdf('../data/processed_data/train.h5', key='train_df', format='table')