In [100]:
import os
import json
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
import ast
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model.stochastic_gradient import SGDRegressor
from sklearn.metrics import mean_squared_error

In [101]:
def load_df(csv_path, nrows=None):
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']

    df = pd.read_csv(csv_path, 
                     converters={column: json.loads for column in JSON_COLUMNS}, 
                     dtype={'fullVisitorId': 'str'}, # Important!!
                     nrows=nrows)

    for column in JSON_COLUMNS:
        column_as_df = json_normalize(df[column])
        column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
    print(f"Loaded {os.path.basename(csv_path)}. Shape: {df.shape}")
    return df

def flatten_hits(df):
    df_ = pd.DataFrame()
    for index, row in df.iterrows():
        initial_id = df['fullVisitorId'][index]
        s = json.dumps(df['hits'][index])
        r = json.loads(s)
        d = ast.literal_eval(r)
        for each in d:
            each['fullVisitorId'] = initial_id
        column_as_df = json_normalize(d)
        if 'product' in column_as_df.columns:
            column_as_df['v2ProductName'] = column_as_df['product'].apply(lambda x: [p['v2ProductName'] for p in x] if type(x) == list else [])
            column_as_df['v2ProductCategory'] = column_as_df['product'].apply(lambda x: [p['v2ProductCategory'] for p in x] if type(x) == list else [])
            del column_as_df['product']
        if 'promotion' in column_as_df.columns:
            column_as_df['promoId']  = column_as_df['promotion'].apply(lambda x: [p['promoId'] for p in x] if type(x) == list else [])
            column_as_df['promoName']  = column_as_df['promotion'].apply(lambda x: [p['promoName'] for p in x] if type(x) == list else [])
            del column_as_df['promotion']
        df_ = df_.append(column_as_df)
    df = df.merge(df_, on='fullVisitorId')
    return df


In [102]:
### Loading TRAIN Data
df_train = load_df('D:\\Documents\\ga-customer-revenue-prediction\\train_v2.csv', nrows=450)
for each in df_train.columns:
    print(each)
print('-'*20)
df_train = flatten_hits(df_train)
for each in df_train.columns:
    print(each)
### Loading TEST Data
df_test = load_df('D:\\Documents\\ga-customer-revenue-prediction\\test_v2.csv', nrows=450)
df_test = flatten_hits(df_test)


Loaded train_v2.csv. Shape: (450, 59)
channelGrouping
customDimensions
date
fullVisitorId
hits
socialEngagementType
visitId
visitNumber
visitStartTime
device.browser
device.browserSize
device.browserVersion
device.deviceCategory
device.flashVersion
device.isMobile
device.language
device.mobileDeviceBranding
device.mobileDeviceInfo
device.mobileDeviceMarketingName
device.mobileDeviceModel
device.mobileInputSelector
device.operatingSystem
device.operatingSystemVersion
device.screenColors
device.screenResolution
geoNetwork.city
geoNetwork.cityId
geoNetwork.continent
geoNetwork.country
geoNetwork.latitude
geoNetwork.longitude
geoNetwork.metro
geoNetwork.networkDomain
geoNetwork.networkLocation
geoNetwork.region
geoNetwork.subContinent
totals.bounces
totals.hits
totals.newVisits
totals.pageviews
totals.sessionQualityDim
totals.timeOnSite
totals.totalTransactionRevenue
totals.transactionRevenue
totals.transactions
totals.visits
trafficSource.adContent
trafficSource.adwordsClickInfo.adNetwork

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


channelGrouping
customDimensions_x
date
fullVisitorId
hits
socialEngagementType
visitId
visitNumber
visitStartTime
device.browser
device.browserSize
device.browserVersion
device.deviceCategory
device.flashVersion
device.isMobile
device.language
device.mobileDeviceBranding
device.mobileDeviceInfo
device.mobileDeviceMarketingName
device.mobileDeviceModel
device.mobileInputSelector
device.operatingSystem
device.operatingSystemVersion
device.screenColors
device.screenResolution
geoNetwork.city
geoNetwork.cityId
geoNetwork.continent
geoNetwork.country
geoNetwork.latitude
geoNetwork.longitude
geoNetwork.metro
geoNetwork.networkDomain
geoNetwork.networkLocation
geoNetwork.region
geoNetwork.subContinent
totals.bounces
totals.hits
totals.newVisits
totals.pageviews
totals.sessionQualityDim
totals.timeOnSite
totals.totalTransactionRevenue
totals.transactionRevenue
totals.transactions
totals.visits
trafficSource.adContent
trafficSource.adwordsClickInfo.adNetworkType
trafficSource.adwordsClickInfo.

In [103]:
ones = []
for each in df_train.columns:
#     print(str(each) + ': ' + str(df_train[each].nunique()))
    try:
        if df_train[each].nunique() == 1:
            ones.append(each)
    except:
        print(each)
cols_to_remove = [x for x in ones if set(df_train[x].unique()) == set(['not available in demo dataset'])]
cols_to_remove.append('hits')
cols_to_remove.append('customDimensions_x')
cols_to_remove.append('customDimensions_y')

cols_to_remove.append('customVariables')
cols_to_remove.append('customMetrics')
cols_to_remove.append('experiment')
cols_to_remove.append('promoId')
cols_to_remove.append('promoName')
cols_to_remove.append('promotionActionInfo.promoIsView')
cols_to_remove.append('publisher_infos')
cols_to_remove.append('v2ProductCategory')
cols_to_remove.append('v2ProductName')


customDimensions_y
customMetrics
customVariables
experiment
promoId
promoName
publisher_infos
v2ProductCategory
v2ProductName


In [104]:
#### CHANGED THIS TO GET AVERAGE ####
average = df_train['totals.totalTransactionRevenue'].dropna().mean()

y = df_train['totals.totalTransactionRevenue'].fillna(average).astype(float)
y = y.apply(lambda x: np.log1p(x))
df_train = df_train.drop('totals.totalTransactionRevenue', axis=1)


In [105]:
### Removing columns that contain no data
df_train = df_train.drop(list(cols_to_remove), axis=1)
df_test = df_test.drop(list(cols_to_remove), axis=1)

In [121]:
pd.set_option('display.max_rows', -1)
df_train.iloc[19]

                                                3                  
channelGrouping                                 20171016           
date                                            7390444353235629134
fullVisitorId                                   0                  
socialEngagementType                            1508157285         
visitId                                         1                  
visitNumber                                     1508157285         
visitStartTime                                  1                  
device.browser                                  0                  
device.deviceCategory                           False              
device.isMobile                                 1                  
device.operatingSystem                          84                 
geoNetwork.city                                 4                  
geoNetwork.continent                            58                 
geoNetwork.country                              

In [115]:
cat_columns = ['channelGrouping',
               'socialEngagementType',
               'device.browser',
               'device.deviceCategory',
               'device.operatingSystem',
               'geoNetwork.city',
               'geoNetwork.continent',
               'geoNetwork.country',
               'geoNetwork.metro', 
               'geoNetwork.networkDomain',
               'geoNetwork.region', 
               'geoNetwork.subContinent',
               'trafficSource.adContent',
               'trafficSource.adwordsClickInfo.adNetworkType',
               'trafficSource.adwordsClickInfo.gclId',
               'trafficSource.adwordsClickInfo.page',
               'trafficSource.adwordsClickInfo.slot', 
               'trafficSource.campaign',
               'trafficSource.keyword',
               'trafficSource.referralPath',
               'trafficSource.source',
               'trafficSource.medium',
               'appInfo.exitScreenName',
               'appInfo.landingScreenName',
               'appInfo.screenName',
               'contentGroup.contentGroup1',
               'contentGroup.contentGroup2',
               'contentGroup.contentGroup3',
               'contentGroup.contentGroup4',
               'contentGroup.contentGroup5',
               'contentGroup.previousContentGroup1',
               'contentGroup.previousContentGroup2',
               'contentGroup.previousContentGroup3',
               'contentGroup.previousContentGroup4',
               'contentGroup.previousContentGroup5',
               'dataSource',
               'item.currencyCode',
               'transaction.currencyCode',
               'page.hostname',
               'page.pagePath',
               'page.pagePathLevel1',
               'page.pagePathLevel2',
               'page.pagePathLevel3',
               'page.pagePathLevel4',
               'page.pageTitle',
#                'page.searchCategory',
#                'page.searchKeyword',
               'referer',
               'social.socialNetwork',
               'social.socialInteractionNetworkAction',
               'social.hasSocialSourceReferral',
               'type'
              ]


'\ncustomVariables              []                                                                                                                                                                              \ncustomDimensions_y                              []\ncustomDimensions_x\ncustomMetrics                                   []\nexperiment                                      []                                                                                                                                                                              \npromoId\npromoName\npromotionActionInfo.promoIsView                 []                                                                                                                                                                              \npublisher_infos\nv2ProductCategory\nv2ProductName\n'

In [116]:
from sklearn import preprocessing
for each in cat_columns:
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(df_train[each].values) + list(df_test[each].values))
    df_train[each] = lbl.transform(list(df_train[each].values))
    df_test[each] = lbl.transform(list(df_test[each].values))

In [109]:
# Getting our vailidation y for scoring
y_true = df_test['totals.totalTransactionRevenue'].fillna(0).astype(float)
y_true = y_true.apply(lambda x: np.log1p(x))
df_test = df_test.drop('totals.totalTransactionRevenue', axis=1)

y_mean = np.mean(y)
y_base = np.full_like(y_true, y_mean)

In [110]:
# df_train = df_train.drop('customDimensions', axis=1)
# df_test = df_test.drop('customDimensions', axis=1)
df_train = df_train.drop('totals.transactionRevenue', axis=1)
df_test = df_test.drop('totals.transactionRevenue', axis=1)
df_train = df_train.drop('totals.transactions', axis=1)
df_test = df_test.drop('totals.transactions', axis=1)

In [111]:
def preprocess(df):
#     df['totals.bounces'] = df['totals.bounces'].fillna(0).astype(np.float)
    df['totals.newVisits'] = df['totals.newVisits'].fillna(0).astype(np.float)
    df['totals.timeOnSite'] = df['totals.newVisits'].fillna(0).astype(np.float)
#     df['totals.transactionRevenue'] = df['totals.transactionRevenue'].fillna(0).astype(np.float)
#     df['totals.transactions'] = df['totals.transactions'].fillna(0).astype(np.float)
    df['trafficSource.adwordsClickInfo.isVideoAd'] = df['trafficSource.adwordsClickInfo.isVideoAd'].fillna(0).astype(np.float)
    df['trafficSource.isTrueDirect'] = df['trafficSource.isTrueDirect'].fillna(0).astype(np.float)
    
    return df
df_train = preprocess(df_train)
df_test = preprocess(df_test)

In [112]:
#### IMPORTANT ####
# Find the issue here before proceeding on Flux

df_train = df_train.fillna(0)
df_test = df_test.fillna(0)

In [113]:
from sklearn import tree
clf_tree = tree.DecisionTreeRegressor()
clf_tree = clf_tree.fit(df_train, y)

ValueError: could not convert string to float: 'Billing and Shipping'

In [None]:
y_pred = clf_tree.predict(df_test)

In [None]:
RMSE = sqrt(mean_squared_error(y_true, y_pred))

In [None]:
for idx, each in enumerate(clf_tree.feature_importances_):
    print(idx, each*1e5)

print('-'*10)

for idx, each in enumerate(df_train.columns):
    print(idx, each)