<a href="https://colab.research.google.com/github/svarogjk/shop_revenue_prediction/blob/master/gstore_competition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Imports and Readings

In [0]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
current_palette = sns.color_palette()
# from tqdm import tqdm_notebook
%matplotlib inline 
import io
from google.colab import files

In [0]:
import json
from pandas.io.json import json_normalize
import random

In [0]:
from itertools import product
from sklearn.preprocessing import LabelEncoder

In [0]:
# !pip install fbprophet
# from fbprophet import Prophet

In [0]:
from xgboost import XGBRegressor
from xgboost import plot_importance

In [0]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import r2_score, make_scorer

In [0]:
!pip install -U -q PyDrive
 
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
 
# 1. Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [8]:
file_list = drive.ListFile({'q': "'19ry360g_dRMf_i9Gi4A2WrOzE682GA7O' in parents and trashed=false"}).GetList()
for file1 in file_list:
  print('title: %s, id: %s' % (file1['title'], file1['id']))

title: gstore_competition.ipynb, id: 1gRRqwzC-VDFe_fOQW9Kc-5YX3Gmofxnt
title: train.csv, id: 1-9OX7E7uSovNTI4-h43oMQpZavJai6sb
title: test.csv, id: 1SohvLEveFD4yhhR_alpJtfgtY5QIYMV4
title: sample_submission.csv, id: 1WLB9e5uswsrR8NWq4yLwFKlXE9uYUZKB
title: all.zip (Unzipped Files), id: 13XaW-vTIJKNDKMrbhamaduANnnPFojkQ
title: all.zip, id: 1Mi8GWSji2h1FGzWkQHYXmRuQNqHcIUlj


In [0]:
train = drive.CreateFile({'id': '1-9OX7E7uSovNTI4-h43oMQpZavJai6sb'})
train.GetContentFile('train.csv')

test_data = drive.CreateFile({'id': '1SohvLEveFD4yhhR_alpJtfgtY5QIYMV4'})
test_data.GetContentFile('test.csv')

submission = drive.CreateFile({'id': '1WLB9e5uswsrR8NWq4yLwFKlXE9uYUZKB'})
test_data.GetContentFile('sample_submission.csv')

### Here we read our DataFrame and transform its json columns into normal ones

In [0]:
def json_read(df):
    
    columns = ['device', 'geoNetwork', 'totals', 'trafficSource']
    p = 0.07
    
    data_frame = df
    
    #Importing the dataset
    df = pd.read_csv(data_frame, 
                     converters={column: json.loads for column in columns}, # loading the json columns properly
                     dtype={'fullVisitorId': 'str'},
                    skiprows=lambda i: i>0 and random.random() > p
                    ) # transforming this column to string
    
    for column in columns: #loop to finally transform the columns in data frame
        #It will normalize and set the json to a table
        column_as_df = json_normalize(df[column]) 
        # here will be set the name using the category and subcategory of json columns
        column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns] 
        # after extracting the values, let drop the original columns
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
        
    # Printing the shape of dataframes that was imported     
    print(f"Loaded {os.path.basename(data_frame)}. Shape: {df.shape}")
    return df # returning the df after importing and transforming

In [0]:
train = pd.read_csv('train.csv', low_memory=False)
test_data = pd.read_csv('test.csv', low_memory=False)
submission = pd.read_csv('sample_submission.csv', low_memory=False)

In [16]:
train.head(2)

Unnamed: 0,channelGrouping,date,device,fullVisitorId,geoNetwork,sessionId,socialEngagementType,totals,trafficSource,visitId,visitNumber,visitStartTime
0,Organic Search,20160902,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",1131660440785968503,"{""continent"": ""Asia"", ""subContinent"": ""Western...",1131660440785968503_1472830385,Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""1"", ""pageviews"": ""1"",...","{""campaign"": ""(not set)"", ""source"": ""google"", ...",1472830385,1,1472830385
1,Organic Search,20160902,"{""browser"": ""Firefox"", ""browserVersion"": ""not ...",377306020877927890,"{""continent"": ""Oceania"", ""subContinent"": ""Aust...",377306020877927890_1472880147,Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""1"", ""pageviews"": ""1"",...","{""campaign"": ""(not set)"", ""source"": ""google"", ...",1472880147,1,1472880147


In [13]:
test_data.head(2)

Unnamed: 0,channelGrouping,date,device,fullVisitorId,geoNetwork,sessionId,socialEngagementType,totals,trafficSource,visitId,visitNumber,visitStartTime
0,Organic Search,20171016,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",6167871330617112363,"{""continent"": ""Asia"", ""subContinent"": ""Southea...",6167871330617112363_1508151024,Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""4"", ""pageviews"": ""4""}","{""campaign"": ""(not set)"", ""source"": ""google"", ...",1508151024,2,1508151024
1,Organic Search,20171016,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",643697640977915618,"{""continent"": ""Europe"", ""subContinent"": ""South...",0643697640977915618_1508175522,Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""5"", ""pageviews"": ""5"",...","{""campaign"": ""(not set)"", ""source"": ""google"", ...",1508175522,1,1508175522


In [14]:
submission.head(2)

Unnamed: 0,channelGrouping,date,device,fullVisitorId,geoNetwork,sessionId,socialEngagementType,totals,trafficSource,visitId,visitNumber,visitStartTime
0,Organic Search,20171016,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",6167871330617112363,"{""continent"": ""Asia"", ""subContinent"": ""Southea...",6167871330617112363_1508151024,Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""4"", ""pageviews"": ""4""}","{""campaign"": ""(not set)"", ""source"": ""google"", ...",1508151024,2,1508151024
1,Organic Search,20171016,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",643697640977915618,"{""continent"": ""Europe"", ""subContinent"": ""South...",0643697640977915618_1508175522,Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""5"", ""pageviews"": ""5"",...","{""campaign"": ""(not set)"", ""source"": ""google"", ...",1508175522,1,1508175522


### Extract target

In [0]:
train_totals = train.totals.apply(json.loads).apply(pd.Series)

In [18]:
train_totals.head(2)

Unnamed: 0,bounces,hits,newVisits,pageviews,transactionRevenue,visits
0,1,1,1,1,,1
1,1,1,1,1,,1


In [0]:
train = pd.concat([train, train_totals], axis=1)
train.drop(["totals"], axis=1, inplace=True)

In [21]:
train.head(2)

Unnamed: 0,channelGrouping,date,device,fullVisitorId,geoNetwork,sessionId,socialEngagementType,trafficSource,visitId,visitNumber,visitStartTime,bounces,hits,newVisits,pageviews,transactionRevenue,visits
0,Organic Search,20160902,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",1131660440785968503,"{""continent"": ""Asia"", ""subContinent"": ""Western...",1131660440785968503_1472830385,Not Socially Engaged,"{""campaign"": ""(not set)"", ""source"": ""google"", ...",1472830385,1,1472830385,1,1,1,1,,1
1,Organic Search,20160902,"{""browser"": ""Firefox"", ""browserVersion"": ""not ...",377306020877927890,"{""continent"": ""Oceania"", ""subContinent"": ""Aust...",377306020877927890_1472880147,Not Socially Engaged,"{""campaign"": ""(not set)"", ""source"": ""google"", ...",1472880147,1,1472880147,1,1,1,1,,1


In [0]:
train_rev = train[~train.transactionRevenue.isnull()]
train_no_rev = train[train.transactionRevenue.isnull()]

In [24]:
train_rev.head(2)

Unnamed: 0,channelGrouping,date,device,fullVisitorId,geoNetwork,sessionId,socialEngagementType,trafficSource,visitId,visitNumber,visitStartTime,bounces,hits,newVisits,pageviews,transactionRevenue,visits
752,Direct,20160902,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",6194193421514403509,"{""continent"": ""Americas"", ""subContinent"": ""Nor...",6194193421514403509_1472843572,Not Socially Engaged,"{""campaign"": ""(not set)"", ""source"": ""(direct)""...",1472843572,1,1472843572,,11,1.0,11,37860000,1
753,Organic Search,20160902,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",5327166854580374902,"{""continent"": ""Americas"", ""subContinent"": ""Nor...",5327166854580374902_1472844906,Not Socially Engaged,"{""campaign"": ""(not set)"", ""source"": ""google"", ...",1472844906,3,1472844906,,11,,10,306670000,1


In [26]:
train_rev.shape

(11515, 17)

In [25]:
train_no_rev.head(2)

Unnamed: 0,channelGrouping,date,device,fullVisitorId,geoNetwork,sessionId,socialEngagementType,trafficSource,visitId,visitNumber,visitStartTime,bounces,hits,newVisits,pageviews,transactionRevenue,visits
0,Organic Search,20160902,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",1131660440785968503,"{""continent"": ""Asia"", ""subContinent"": ""Western...",1131660440785968503_1472830385,Not Socially Engaged,"{""campaign"": ""(not set)"", ""source"": ""google"", ...",1472830385,1,1472830385,1,1,1,1,,1
1,Organic Search,20160902,"{""browser"": ""Firefox"", ""browserVersion"": ""not ...",377306020877927890,"{""continent"": ""Oceania"", ""subContinent"": ""Aust...",377306020877927890_1472880147,Not Socially Engaged,"{""campaign"": ""(not set)"", ""source"": ""google"", ...",1472880147,1,1472880147,1,1,1,1,,1


In [27]:
train_no_rev.shape

(892138, 17)

In [0]:
train_no_rev = train_no_rev.sample(n=train_rev.shape[0])

### Merge rows with target with random number of samples without target for further classification

In [0]:
train_clf = pd.concat([train_rev, train_no_rev])

In [30]:
train_clf.shape

(23030, 17)

In [31]:
train_clf.head(2)

Unnamed: 0,channelGrouping,date,device,fullVisitorId,geoNetwork,sessionId,socialEngagementType,trafficSource,visitId,visitNumber,visitStartTime,bounces,hits,newVisits,pageviews,transactionRevenue,visits
752,Direct,20160902,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",6194193421514403509,"{""continent"": ""Americas"", ""subContinent"": ""Nor...",6194193421514403509_1472843572,Not Socially Engaged,"{""campaign"": ""(not set)"", ""source"": ""(direct)""...",1472843572,1,1472843572,,11,1.0,11,37860000,1
753,Organic Search,20160902,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",5327166854580374902,"{""continent"": ""Americas"", ""subContinent"": ""Nor...",5327166854580374902_1472844906,Not Socially Engaged,"{""campaign"": ""(not set)"", ""source"": ""google"", ...",1472844906,3,1472844906,,11,,10,306670000,1


In [0]:
cols_json = ["device", "geoNetwork", "trafficSource"]

In [0]:
df_lst = []
for col in cols_json:
  df_cur = train_clf[col].apply(json.loads).apply(pd.Series)
  df_lst.append(df_cur)

In [0]:
train_clf.drop(cols_json, axis=1, inplace=True)

In [0]:
train_clf = pd.concat([train_clf, *df_lst], axis=1)

In [38]:
train_clf.head(2)

Unnamed: 0,channelGrouping,date,fullVisitorId,sessionId,socialEngagementType,visitId,visitNumber,visitStartTime,bounces,hits,...,region,subContinent,adContent,adwordsClickInfo,campaign,isTrueDirect,keyword,medium,referralPath,source
752,Direct,20160902,6194193421514403509,6194193421514403509_1472843572,Not Socially Engaged,1472843572,1,1472843572,,11,...,Michigan,Northern America,,{'criteriaParameters': 'not available in demo ...,(not set),True,,(none),,(direct)
753,Organic Search,20160902,5327166854580374902,5327166854580374902_1472844906,Not Socially Engaged,1472844906,3,1472844906,,11,...,New York,Northern America,,{'criteriaParameters': 'not available in demo ...,(not set),True,(not provided),organic,,google


In [0]:
train_ad = train_clf.adwordsClickInfo.apply(pd.Series)
train_clf.drop(["adwordsClickInfo"], axis=1, inplace=True)
train_clf = pd.concat([train_clf, train_ad], axis=1)

## Process the resulting cols

In [41]:
train_clf.head(2)

Unnamed: 0,channelGrouping,date,fullVisitorId,sessionId,socialEngagementType,visitId,visitNumber,visitStartTime,bounces,hits,...,medium,referralPath,source,adNetworkType,criteriaParameters,gclId,isVideoAd,page,slot,targetingCriteria
752,Direct,20160902,6194193421514403509,6194193421514403509_1472843572,Not Socially Engaged,1472843572,1,1472843572,,11,...,(none),,(direct),,not available in demo dataset,,,,,
753,Organic Search,20160902,5327166854580374902,5327166854580374902_1472844906,Not Socially Engaged,1472844906,3,1472844906,,11,...,organic,,google,,not available in demo dataset,,,,,


In [42]:
train_clf.columns

Index(['channelGrouping', 'date', 'fullVisitorId', 'sessionId',
       'socialEngagementType', 'visitId', 'visitNumber', 'visitStartTime',
       'bounces', 'hits', 'newVisits', 'pageviews', 'transactionRevenue',
       'visits', 'browser', 'browserSize', 'browserVersion', 'deviceCategory',
       'flashVersion', 'isMobile', 'language', 'mobileDeviceBranding',
       'mobileDeviceInfo', 'mobileDeviceMarketingName', 'mobileDeviceModel',
       'mobileInputSelector', 'operatingSystem', 'operatingSystemVersion',
       'screenColors', 'screenResolution', 'city', 'cityId', 'continent',
       'country', 'latitude', 'longitude', 'metro', 'networkDomain',
       'networkLocation', 'region', 'subContinent', 'adContent', 'campaign',
       'isTrueDirect', 'keyword', 'medium', 'referralPath', 'source',
       'adNetworkType', 'criteriaParameters', 'gclId', 'isVideoAd', 'page',
       'slot', 'targetingCriteria'],
      dtype='object')

### Replace train_clf.channelGrouping with categorical numbering

In [43]:
train_clf.channelGrouping.unique()

array(['Direct', 'Organic Search', 'Referral', 'Display', 'Paid Search',
       'Social', 'Affiliates', '(Other)'], dtype=object)

In [0]:
le = LabelEncoder()

In [46]:
train_clf["channelGrouping"] = le.fit_transform(train_clf.channelGrouping)
train_clf.channelGrouping[:3]

752    2
753    4
799    6
Name: channelGrouping, dtype: int64

### Treat train_clf.date