In [65]:
import json
from pandas.io.json import json_normalize
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import warnings

warnings.filterwarnings("ignore")
color = sns.color_palette()

In [66]:
# Loading the data
def load_df(csv_path='/Users/manideepattanti/Acads/SEM1/DSF/HW_3/all/train.csv', nrows=None):
    columns_to_normalize = ['device', 'geoNetwork', 'totals', 'trafficSource']

    df = pd.read_csv(csv_path,
                     converters={column: json.loads for column in columns_to_normalize},
                     dtype={'fullVisitorId': 'str'},
                     nrows=nrows)

    for column in columns_to_normalize:
        column_as_df = json_normalize(df[column])
        column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
    return df


train = load_df()
print(f"Loaded Training Data. Shape: {train.shape}")
test = load_df('/Users/manideepattanti/Acads/SEM1/DSF/HW_3/all/test.csv')
print(f"Loaded Test Data. Shape: {test.shape}")
sub = pd.read_csv('/Users/manideepattanti/Acads/SEM1/DSF/HW_3/all/sample_submission.csv')

print(train.shape)
print(test.shape)

train_init = train
test_init = test

Loaded Training Data. Shape: (903653, 55)
Loaded Test Data. Shape: (804684, 53)
(903653, 55)
(804684, 53)


In [67]:
print(train.head())
for i in train.columns.values:
    if i not in test.columns:
        print(i)

  channelGrouping      date        fullVisitorId  \
0  Organic Search  20160902  1131660440785968503   
1  Organic Search  20160902   377306020877927890   
2  Organic Search  20160902  3895546263509774583   
3  Organic Search  20160902  4763447161404445595   
4  Organic Search  20160902    27294437909732085   

                        sessionId  socialEngagementType     visitId  \
0  1131660440785968503_1472830385  Not Socially Engaged  1472830385   
1   377306020877927890_1472880147  Not Socially Engaged  1472880147   
2  3895546263509774583_1472865386  Not Socially Engaged  1472865386   
3  4763447161404445595_1472881213  Not Socially Engaged  1472881213   
4    27294437909732085_1472822600  Not Socially Engaged  1472822600   

   visitNumber  visitStartTime device.browser             device.browserSize  \
0            1      1472830385         Chrome  not available in demo dataset   
1            1      1472880147        Firefox  not available in demo dataset   
2            1      

In [68]:
train = train_init
test = test_init
def unique_columns(frame):
    cols = []
    for col in frame.columns:
        if len(frame[col].unique()) == 1:
            cols.append(col)
    return cols


target = np.nan_to_num(np.array([float(i) for i in train['totals.transactionRevenue']]))
print('The ratio of customers with transaction revenue is', str((target != 0).mean()))

train["totals.transactionRevenue"] = train["totals.transactionRevenue"].astype('float')
log_target = np.log1p(train.groupby("fullVisitorId")["totals.transactionRevenue"].sum())
print('The ratio of customers with transaction revenue is', str((log_target != 0).mean()))

u_cols = unique_columns(train)
print(u_cols)
number_cols = ['totals.hits', 'totals.pageviews', 'visitNumber', 'visitStartTime', 'totals.bounces', 'totals.newVisits']
drop_cols = ['fullVisitorId', 'sessionId', 'visitId']
non_common_cols = ['trafficSource.campaignCode']
train = train.drop(u_cols + drop_cols + non_common_cols, axis=1)
test = test.drop(u_cols + drop_cols, axis=1)
for i in number_cols:
    train[i] = train[i].astype('float').fillna(0)
    test[i] = test[i].astype('float').fillna(0)

The ratio of customers with transaction revenue is 0.0127427231470487
The ratio of customers with transaction revenue is 0.013996726255903731
['socialEngagementType', 'device.browserSize', 'device.browserVersion', 'device.flashVersion', 'device.language', 'device.mobileDeviceBranding', 'device.mobileDeviceInfo', 'device.mobileDeviceMarketingName', 'device.mobileDeviceModel', 'device.mobileInputSelector', 'device.operatingSystemVersion', 'device.screenColors', 'device.screenResolution', 'geoNetwork.cityId', 'geoNetwork.latitude', 'geoNetwork.longitude', 'geoNetwork.networkLocation', 'totals.visits', 'trafficSource.adwordsClickInfo.criteriaParameters']


In [69]:
print(train.columns)

def date_format(data):
    data['date'] = data['date'].astype("str")
    data['date'] = data['date'].apply(lambda x:x[:4] + "-" + x[4:6] + "-"+ x[6:] )
    data['date'] = pd.to_datetime(data['date'])
    data['weekday'] = data['date'].dt.weekday
    data['day'] = data['date'].dt.day
    data['year'] = data['date'].dt.year
    data['month']= data['date'].dt.month
    return data

final_train = date_format(train)
print(final_train.columns)
final_train = final_train.drop(["totals.transactionRevenue"], axis = 1)
final_target = log_target
final_test = date_format(test)

Index(['channelGrouping', 'date', 'visitNumber', 'visitStartTime',
       'device.browser', 'device.deviceCategory', 'device.isMobile',
       'device.operatingSystem', 'geoNetwork.city', 'geoNetwork.continent',
       'geoNetwork.country', 'geoNetwork.metro', 'geoNetwork.networkDomain',
       'geoNetwork.region', 'geoNetwork.subContinent', 'totals.bounces',
       'totals.hits', 'totals.newVisits', 'totals.pageviews',
       'totals.transactionRevenue', 'trafficSource.adContent',
       'trafficSource.adwordsClickInfo.adNetworkType',
       'trafficSource.adwordsClickInfo.gclId',
       'trafficSource.adwordsClickInfo.isVideoAd',
       'trafficSource.adwordsClickInfo.page',
       'trafficSource.adwordsClickInfo.slot', 'trafficSource.campaign',
       'trafficSource.isTrueDirect', 'trafficSource.keyword',
       'trafficSource.medium', 'trafficSource.referralPath',
       'trafficSource.source'],
      dtype='object')
Index(['channelGrouping', 'date', 'visitNumber', 'visitStartTime'

In [70]:
print(final_train.head())

  channelGrouping       date  visitNumber  visitStartTime device.browser  \
0  Organic Search 2016-09-02          1.0    1.472830e+09         Chrome   
1  Organic Search 2016-09-02          1.0    1.472880e+09        Firefox   
2  Organic Search 2016-09-02          1.0    1.472865e+09         Chrome   
3  Organic Search 2016-09-02          1.0    1.472881e+09     UC Browser   
4  Organic Search 2016-09-02          2.0    1.472823e+09         Chrome   

  device.deviceCategory  device.isMobile device.operatingSystem  \
0               desktop            False                Windows   
1               desktop            False              Macintosh   
2               desktop            False                Windows   
3               desktop            False                  Linux   
4                mobile             True                Android   

                 geoNetwork.city geoNetwork.continent  ...   \
0                          Izmir                 Asia  ...    
1  not availab

In [71]:
import lightgbm as lgb
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder

final_train = final_train.drop(['date', 'trafficSource.source'], axis=1)
print(final_test.columns)
print(final_test.shape)
final_test = final_test.drop(['date', 'trafficSource.source'], axis=1)
cat_cols=[c for c in final_train.columns]
print(cat_cols)
for col in cat_cols:
    print(col)
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(final_train[col].values.astype('str')) + list(final_test[col].values.astype('str')))
    final_train[col] = lbl.transform(list(final_train[col].values.astype('str')))
    final_test[col] = lbl.transform(list(final_test[col].values.astype('str')))

Index(['channelGrouping', 'date', 'visitNumber', 'visitStartTime',
       'device.browser', 'device.deviceCategory', 'device.isMobile',
       'device.operatingSystem', 'geoNetwork.city', 'geoNetwork.continent',
       'geoNetwork.country', 'geoNetwork.metro', 'geoNetwork.networkDomain',
       'geoNetwork.region', 'geoNetwork.subContinent', 'totals.bounces',
       'totals.hits', 'totals.newVisits', 'totals.pageviews',
       'trafficSource.adContent',
       'trafficSource.adwordsClickInfo.adNetworkType',
       'trafficSource.adwordsClickInfo.gclId',
       'trafficSource.adwordsClickInfo.isVideoAd',
       'trafficSource.adwordsClickInfo.page',
       'trafficSource.adwordsClickInfo.slot', 'trafficSource.campaign',
       'trafficSource.isTrueDirect', 'trafficSource.keyword',
       'trafficSource.medium', 'trafficSource.referralPath',
       'trafficSource.source', 'weekday', 'day', 'year', 'month'],
      dtype='object')
(804684, 35)
['channelGrouping', 'visitNumber', 'visitStart

In [72]:
from sklearn.model_selection import KFold

folds = KFold(n_splits=5, random_state=6)
oof_preds = np.zeros(final_train.shape[0])
sub_preds = np.zeros(final_test.shape[0])

valid_score = 0
print(final_train.shape, final_target.shape)
final_target = np.log1p(target)
for n_fold, (trn_idx, val_idx) in enumerate(folds.split(final_train, final_target)):
    trn_x, trn_y = final_train.iloc[trn_idx], final_target[trn_idx]
    val_x, val_y = final_train.iloc[val_idx], final_target[val_idx]    
    
    train_data = lgb.Dataset(data=trn_x, label=trn_y)
    valid_data = lgb.Dataset(data=val_x, label=val_y)
    
    params = {"objective" : "regression", "metric" : "rmse", 'n_estimators':10000, 'early_stopping_rounds':100,
              "num_leaves" : 60, "learning_rate" : 0.01, "bagging_fraction" : 0.9,
              "feature_fraction" : 0.3, "bagging_seed" : 0}
    
    lgb_model = lgb.train(params, train_data, valid_sets=[train_data, valid_data], verbose_eval=1000) 
    
    oof_preds[val_idx] = lgb_model.predict(val_x, num_iteration=lgb_model.best_iteration)
    oof_preds[oof_preds<0] = 0
    sub_pred = lgb_model.predict(final_test, num_iteration=lgb_model.best_iteration) / folds.n_splits
    sub_pred[sub_pred<0] = 0 # should be greater or equal to 0
    sub_preds += sub_pred
    print('Fold %2d RMSE : %.6f' % (n_fold + 1, np.sqrt(mean_squared_error(val_y, oof_preds[val_idx]))))
    valid_score += np.sqrt(mean_squared_error(val_y, oof_preds[val_idx]))

(903653, 33) (714167,)
Training until validation scores don't improve for 100 rounds.
[1000]	training's rmse: 1.61685	valid_1's rmse: 1.67012
[2000]	training's rmse: 1.55867	valid_1's rmse: 1.64931
[3000]	training's rmse: 1.52196	valid_1's rmse: 1.64177
[4000]	training's rmse: 1.49268	valid_1's rmse: 1.63754
Early stopping, best iteration is:
[4196]	training's rmse: 1.48676	valid_1's rmse: 1.63701
Fold  1 RMSE : 1.636057
Training until validation scores don't improve for 100 rounds.
[1000]	training's rmse: 1.61967	valid_1's rmse: 1.64502
[2000]	training's rmse: 1.56096	valid_1's rmse: 1.63112
[3000]	training's rmse: 1.52331	valid_1's rmse: 1.62587
[4000]	training's rmse: 1.4923	valid_1's rmse: 1.62377
Early stopping, best iteration is:
[4039]	training's rmse: 1.49093	valid_1's rmse: 1.62359
Fold  2 RMSE : 1.622462
Training until validation scores don't improve for 100 rounds.
[1000]	training's rmse: 1.60922	valid_1's rmse: 1.68839
[2000]	training's rmse: 1.5503	valid_1's rmse: 1.67232


In [73]:
test_pred = pd.DataFrame({"fullVisitorId":test_init.fullVisitorId})
test_pred["PredictedLogRevenue"] = np.expm1(sub_preds)
test_pred = test_pred.groupby("fullVisitorId")["PredictedLogRevenue"].sum().reset_index()
test_pred.columns = ["fullVisitorId", "PredictedLogRevenue"]
test_pred["PredictedLogRevenue"] = np.log1p(test_pred["PredictedLogRevenue"])
test_pred.to_csv("submission.csv", index=False)