In [226]:
# all required imports
import numpy as np
import pandas as pd
from geopy.distance import geodesic
import json
import os
import sys
import math
import random
import datetime
import scipy as sp
from scipy.stats.stats import pearsonr
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

In [227]:
# Load the training data.
# `lines_percent` is the probability with which each data row is kept (1 keeps
# every row); row 0 is the header line and is never skipped.
# NOTE(review): `error_bad_lines` is deprecated in newer pandas releases in
# favour of `on_bad_lines` -- confirm the pandas version this notebook targets.
lines_percent = 1
train_data = pd.read_csv(
    "./Data/train_small.csv",
    header=0,
    low_memory=False,
    error_bad_lines=False,
    skiprows=lambda row_idx: row_idx > 0 and random.random() > lines_percent,
)

#test_data = pd.read_csv("./Data/test.csv", usecols=range(0,7), low_memory=False, error_bad_lines=False)

In [228]:
# Expand the JSON-encoded columns into one new column per JSON key.

# Names of the columns that are passed to add_json_keys_as_new_columns below.
json_cols = ['device', 'geoNetwork', 'totals', 'trafficSource']

#parse if valid json and return value for key
def get_value(jstr, key):
    """Return the value stored under ``key`` in a JSON object.

    Parameters
    ----------
    jstr : str or dict
        JSON text, or an already-decoded mapping (used as-is).
    key : hashable
        Key to look up in the decoded object.

    Returns
    -------
    object or None
        The value under ``key``; ``None`` when ``jstr`` cannot be decoded
        (malformed JSON, or a non-string such as NaN) or has no such key.
    """
    try:
        record = jstr if isinstance(jstr, dict) else json.loads(jstr)
        return record[key]
    except (TypeError, ValueError, KeyError, IndexError):
        # TypeError  -> input is not str/bytes, or decoded value can't be indexed by key
        # ValueError -> malformed JSON (json.JSONDecodeError is a ValueError)
        # KeyError / IndexError -> key is absent
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return None

# add new columns for each key in the json
def add_json_keys_as_new_columns(col_name):
    """Replace a JSON column of ``train_data`` with one column per JSON key.

    The set of keys is taken from the first row only; keys that appear only
    in later rows are not expanded. Cells that cannot be decoded, or that
    lack a given key, produce ``None`` in the new column. The original
    column is removed from ``train_data`` afterwards.

    Parameters
    ----------
    col_name : str
        Name of the column in the global ``train_data`` frame whose cells
        hold JSON text or already-decoded dicts.

    Raises
    ------
    TypeError, ValueError
        If the first row's cell is neither a dict nor valid JSON text.
    """
    def _decode(cell):
        # Decode one cell; undecodable cells (malformed JSON, NaN, ...) become None.
        if isinstance(cell, dict):
            return cell
        try:
            return json.loads(cell)
        except (TypeError, ValueError):
            return None

    # Positional access: the first row need not carry index label 0.
    first_cell = train_data[col_name].iloc[0]
    first_record = first_cell if isinstance(first_cell, dict) else json.loads(first_cell)

    # Decode every cell once, instead of once per key.
    records = train_data[col_name].apply(_decode)

    for key in list(first_record.keys()):
        print("New column added: '%s'" % (key))
        train_data[key] = records.apply(
            lambda record, key=key: record.get(key) if isinstance(record, dict) else None
        )
    del train_data[col_name]

# Expand each column listed in json_cols, in order; the call removes the
# original column from train_data after adding the per-key columns.
for col in json_cols:
    add_json_keys_as_new_columns(col)
    

New column added: 'browser'
New column added: 'browserVersion'
New column added: 'browserSize'
New column added: 'operatingSystem'
New column added: 'operatingSystemVersion'
New column added: 'isMobile'
New column added: 'mobileDeviceBranding'
New column added: 'mobileDeviceModel'
New column added: 'mobileInputSelector'
New column added: 'mobileDeviceInfo'
New column added: 'mobileDeviceMarketingName'
New column added: 'flashVersion'
New column added: 'language'
New column added: 'screenColors'
New column added: 'screenResolution'
New column added: 'deviceCategory'
New column added: 'continent'
New column added: 'subContinent'
New column added: 'country'
New column added: 'region'
New column added: 'metro'
New column added: 'city'
New column added: 'cityId'
New column added: 'networkDomain'
New column added: 'latitude'
New column added: 'longitude'
New column added: 'networkLocation'
New column added: 'visits'
New column added: 'hits'
New column added: 'pageviews'
New column added: 'bo

In [229]:
# Remove the columns whose values are censored.

col_to_del = [
    'adwordsClickInfo',
    'browserVersion',
    'browserSize',
    'networkLocation',
    'operatingSystemVersion',
    'longitude',
    'latitude',
]

# Dropped one at a time, so a missing column raises at that column.
for col_name in col_to_del:
    train_data.drop(columns=[col_name], inplace=True)

In [230]:
# Drop the candidate columns that hold exactly one distinct value.

col_with_one_unique_val = ['socialEngagementType'] # checked: ['campaign', 'source', 'medium']

for col_name in col_with_one_unique_val:
    # Show the distinct values so the decision below can be checked by eye.
    print(train_data[col_name].unique())
    if train_data[col_name].unique().size != 1:
        continue
    print("Deleted: '%s'" % (col_name))
    train_data.drop(columns=[col_name], inplace=True)

['Not Socially Engaged']
Deleted: 'socialEngagementType'


In [231]:
# functions to map string to int

# Per-column cache: column name -> {original value: integer code}.
unique_val_map = {}

def map_channel_grouping(column_name):
    """Replace the values of ``train_data[column_name]`` with integer codes.

    Codes are assigned in order of first appearance (the order returned by
    ``Series.unique``). The mapping is built on the first call for a column,
    cached in ``unique_val_map`` and reused on later calls; it is printed on
    every call.

    Parameters
    ----------
    column_name : str
        Name of the column in the global ``train_data`` frame to encode.
        The encoded values are written back to this same column.

    Raises
    ------
    KeyError
        If the column contains a value that is not in the cached mapping
        (for example when the function is run again on an already encoded
        column).
    """
    if column_name not in unique_val_map:
        unique_val_map[column_name] = {
            value: code
            for code, value in enumerate(train_data[column_name].unique())
        }
    mapping = unique_val_map[column_name]
    print("Mapped Values:")
    print(mapping)
    # Write back to the column that was asked for, not a fixed one.
    train_data[column_name] = train_data[column_name].apply(lambda val: mapping[val])

In [232]:
# mapping values of columns having string values to int

# Column names handed to map_channel_grouping, one call per name.
col_to_map = ['channelGrouping']

for col_name in col_to_map:
    map_channel_grouping(col_name)

Mapped Values:
{'Organic Search': 0, 'Referral': 1, 'Paid Search': 2, 'Affiliates': 3, 'Direct': 4, 'Display': 5}


In [234]:
# Scratch inspection calls, left commented out:
#print(train_data_copy.head())
#print(train_data.visitNumber.unique())
#train_data.describe().transpose()
#train_data.info()
#train_data['adwordsClickInfo'][0]
# Display the first row of train_data as a raw array of values.
train_data.values[0]

array([0, 20160902, 1.13166e+18, '1131660440785968503_1472830385',
       1472830385, 1, 1472830385, 'Chrome', 'Windows', False,
       'not available in demo dataset', 'not available in demo dataset',
       'not available in demo dataset', 'not available in demo dataset',
       'not available in demo dataset', 'not available in demo dataset',
       'not available in demo dataset', 'not available in demo dataset',
       'not available in demo dataset', 'desktop', 'Asia', 'Western Asia',
       'Turkey', 'Izmir', '(not set)', 'Izmir',
       'not available in demo dataset', 'ttnet.com.tr', '1', '1', '1',
       '1', '1', '(not set)', 'google', 'organic', '(not provided)'],
      dtype=object)