In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Print the names of the entries found in ../input (resolved relative to the working directory).
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
import json
import gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
color = sns.color_palette()

# Import plotly
from plotly import tools
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.figure_factory as ff

import lightgbm as lgb

import warnings
warnings.filterwarnings('ignore')

from pandas.io.json import json_normalize
from datetime import datetime
from sklearn import preprocessing

In [None]:
# Make sure automatic garbage collection is switched on.
gc.enable()

# Columns kept from the flattened visit records; one name per line so diffs stay readable.
features = [
    'channelGrouping',
    'date',
    'fullVisitorId',
    'visitId',
    'visitNumber',
    'visitStartTime',
    'device.browser',
    'device.deviceCategory',
    'device.isMobile',
    'device.operatingSystem',
    'geoNetwork.city',
    'geoNetwork.continent',
    'geoNetwork.country',
    'geoNetwork.metro',
    'geoNetwork.networkDomain',
    'geoNetwork.region',
    'geoNetwork.subContinent',
    'totals.bounces',
    'totals.hits',
    'totals.newVisits',
    'totals.pageviews',
    'totals.transactionRevenue',
    'trafficSource.adContent',
    'trafficSource.campaign',
    'trafficSource.isTrueDirect',
    'trafficSource.keyword',
    'trafficSource.medium',
    'trafficSource.referralPath',
    'trafficSource.source',
    'customDimensions',
]

def load_df(csv_path, chunksize=100000, columns=None):
    """Read a visits CSV in chunks, flatten its JSON columns and keep selected columns.

    The columns ``device``, ``geoNetwork``, ``totals`` and ``trafficSource`` are
    parsed with ``json.loads`` and expanded into ``"<column>.<key>"`` columns.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.
    chunksize : int, optional
        Number of rows parsed per chunk (default 100000, the previous fixed value).
    columns : list of str, optional
        Columns to keep after flattening. Defaults to the module-level
        ``features`` list.

    Returns
    -------
    pandas.DataFrame
        The selected columns of all chunks stacked under a fresh 0..n-1 index;
        an empty DataFrame if the reader yields no chunks.
    """
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']
    if columns is None:
        columns = features

    reader = pd.read_csv(csv_path, sep=',',
            converters={column: json.loads for column in JSON_COLUMNS},
            dtype={'fullVisitorId': 'str'}, # Important!! ids must not be parsed as numbers
            chunksize=chunksize)

    pieces = []
    for df in reader:
        # Chunks keep the row numbers of the file; restart at 0 so the index-based
        # merge with the json_normalize output lines up.
        df.reset_index(drop=True, inplace=True)
        for column in JSON_COLUMNS:
            column_as_df = json_normalize(df[column])
            column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
            df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)

        # A JSON key that never occurs inside this chunk produces no column at all;
        # add it as all-missing so the selection below cannot raise KeyError.
        for name in columns:
            if name not in df.columns:
                df[name] = float("nan")

        pieces.append(df[columns])
        del df
        gc.collect()

    if not pieces:
        return pd.DataFrame()
    # Concatenate once at the end: growing a frame inside the loop re-copies
    # every previously loaded row on each iteration.
    return pd.concat(pieces, axis=0).reset_index(drop=True)


In [None]:
%%time
# Parse both competition files with load_df (train first, then test).
train_df, test_df = [
    load_df(path)
    for path in ('../input/train_v2.csv', '../input/test_v2.csv')
]

In [None]:
# Display the first rows of the loaded training frame as the cell output.
train_df.head()

In [None]:
# For each split: distinct visitor ids, number of rows, and their ratio.
for frame, entries_msg, ratio_msg in (
    (train_df,
     "Number of entries in training data: %d",
     "Ratio of unique visitorids per entries in train: %s"),
    (test_df,
     "Number of entries in test data: %d",
     "Ratio of unique visitorids per entries in test: %s"),
):
    n_visitors = frame.fullVisitorId.unique().shape[0]
    n_rows = frame.shape[0]
    print("Number of unique full VisitorIds: %d" % n_visitors)
    print(entries_msg % n_rows)
    print(ratio_msg % str(n_visitors * 1.0 / n_rows))

In [None]:
# Columns handled as categorical inputs.
cat_cols = [
    "channelGrouping",
    "device.browser",
    "device.deviceCategory",
    "device.operatingSystem",
    "geoNetwork.city",
    "geoNetwork.continent",
    "geoNetwork.country",
    "geoNetwork.metro",
    "geoNetwork.networkDomain",
    "geoNetwork.region",
    "geoNetwork.subContinent",
    "trafficSource.adContent",
    "trafficSource.campaign",
    "trafficSource.source",
    "trafficSource.isTrueDirect",
]

# Columns handled as numeric inputs.
num_cols = [
    "totals.hits",
    "totals.pageviews",
    "visitNumber",
    "visitStartTime",
    'totals.bounces',
    'totals.newVisits',
]

# Columns that are discarded from both frames.
cols_to_drop = [
    "device.isMobile",
    "customDimensions",
    "visitId",
    "trafficSource.referralPath",
    "trafficSource.medium",
    "trafficSource.keyword",
]

# Sanity check 1: categorical names that are absent from the training frame.
for name in (c for c in cat_cols if c not in train_df.columns):
    print(name)
print("-"* 30)
# Sanity check 2: training columns that belong to neither input group.
for name in train_df.columns:
    if not (name in cat_cols or name in num_cols):
        print(name)
print("-" * 30)
# Sanity check 3: the same listing for the test frame.
for name in [c for c in test_df.columns if c not in cat_cols and c not in num_cols]:
    print(name)


In [None]:
# Cast the revenue target to float and treat missing revenue as 0.
# The result is assigned back to the frame: calling fillna(..., inplace=True) on
# the Series returned by df[col] is a chained in-place update, which does not
# modify the frame when pandas Copy-on-Write is active.
for frame in (train_df, test_df):
    frame["totals.transactionRevenue"] = (
        frame["totals.transactionRevenue"].astype(float).fillna(0)
    )

# Visitor ids of the test rows, in row order; used when building submissions.
test_id = test_df["fullVisitorId"].values

In [None]:
# Same preparation for both frames: parse YYYYMMDD dates, then remove unused columns in place.
for frame in (train_df, test_df):
    frame['date'] = pd.to_datetime(frame['date'], format='%Y%m%d')
    frame.drop(columns=cols_to_drop, inplace=True)

In [None]:
%%time
# Integer-encode every categorical column with one encoder fitted on the
# string form of the train and test values together.
for col in cat_cols:
    print(col)
    train_labels = list(train_df[col].values.astype('str'))
    test_labels = list(test_df[col].values.astype('str'))
    lbl = preprocessing.LabelEncoder()
    lbl.fit(train_labels + test_labels)
    train_df[col] = lbl.transform(train_labels)
    test_df[col] = lbl.transform(test_labels)

In [None]:
# Cast every numeric input column to float in both frames.
for frame in (train_df, test_df):
    for col in num_cols:
        frame[col] = frame[col].astype(float)
    

In [None]:
# Print the (rows, columns) shape, then pandas' column summary of the training frame.
print(train_df.shape)
train_df.info()

In [None]:
# Time-based hold-out: rows dated up to and including 2017-06-30 form the
# development set, later rows form the validation set.
split_date = datetime(2017, 6, 30)
dev_df = train_df.loc[train_df['date'] <= split_date]
val_df = train_df.loc[train_df['date'] > split_date]
for part in (dev_df, val_df):
    print(part.shape)

# Targets are log1p of the transaction revenue.
dev_y, val_y = [
    np.log1p(part["totals.transactionRevenue"].values) for part in (dev_df, val_df)
]

# Model inputs: categorical columns followed by numeric columns.
model_cols = cat_cols + num_cols
dev_X, val_X, test_X = [frame[model_cols] for frame in (dev_df, val_df, test_df)]

In [None]:
# custom function to run light gbm model
def run_lgb(train_X, train_y, val_X, val_y, test_X):
    """Train a LightGBM regressor with early stopping and predict on two sets.

    Parameters
    ----------
    train_X, train_y
        Training features and target.
    val_X, val_y
        Validation features and target; passed as the only entry of
        ``valid_sets`` and therefore the data early stopping is evaluated on.
    test_X
        Features to score with the trained model.

    Returns
    -------
    tuple
        ``(pred_test_y, model, pred_val_y)``: predictions for ``test_X``, the
        trained booster, and predictions for ``val_X``. Both predictions use
        ``model.best_iteration``.
    """
    params = {
        "objective" : "regression",
        "metric" : "rmse",
        "num_leaves" : 30,
        "min_child_samples" : 100,
        "learning_rate" : 0.1,
        "bagging_fraction" : 0.7,
        "feature_fraction" : 0.5,
        # The LightGBM key is `bagging_freq`. The previous spelling
        # "bagging_frequency" is not a recognised name or alias, which left
        # bagging disabled and made bagging_fraction / bagging_seed no-ops.
        "bagging_freq" : 5,
        "bagging_seed" : 2018,
        "verbosity" : -1
    }

    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)
    # Up to 1000 rounds; stop when the validation metric has not improved for
    # 100 rounds; log every 100 rounds.
    # NOTE(review): early_stopping_rounds / verbose_eval keyword arguments depend
    # on the installed LightGBM version — confirm before upgrading the library.
    model = lgb.train(params, lgtrain, 1000, valid_sets=[lgval], early_stopping_rounds=100, verbose_eval=100)

    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    pred_val_y = model.predict(val_X, num_iteration=model.best_iteration)
    return pred_test_y, model, pred_val_y

# Fit LightGBM on the development split; keep the test predictions, the model
# and the validation predictions.
pred_test, model, pred_val = run_lgb(
    train_X=dev_X,
    train_y=dev_y,
    val_X=val_X,
    val_y=val_y,
    test_X=test_X,
)

In [None]:
from sklearn import metrics

# Negative log-revenue predictions are clipped to zero in place.
pred_val[pred_val<0] = 0

# One row per validation session: actual revenue and predicted revenue
# (predictions are converted back from log1p space with expm1).
val_pred_df = pd.DataFrame({"fullVisitorId":val_df["fullVisitorId"].values})
val_pred_df["transactionRevenue"] = val_df["totals.transactionRevenue"].values
val_pred_df["PredictedRevenue"] = np.expm1(pred_val)
#print(np.sqrt(metrics.mean_squared_error(np.log1p(val_pred_df["transactionRevenue"].values), np.log1p(val_pred_df["PredictedRevenue"].values))))

# Sum per visitor. The columns are selected with a list: the former
# groupby(...)["a", "b"] tuple form is deprecated and rejected by pandas >= 2.0.
val_pred_df = val_pred_df.groupby("fullVisitorId")[["transactionRevenue", "PredictedRevenue"]].sum().reset_index()

# RMSE between log1p of the summed actual and summed predicted revenue per visitor.
print(np.sqrt(metrics.mean_squared_error(np.log1p(val_pred_df["transactionRevenue"].values), np.log1p(val_pred_df["PredictedRevenue"].values))))

In [None]:
# Clip negative log-revenue predictions to zero (in place, as before).
negative = pred_test < 0
pred_test[negative] = 0

# Convert each session prediction back to revenue, sum per visitor, then take
# log1p of the per-visitor total and write the submission file.
sub_df = (
    pd.DataFrame({"fullVisitorId": test_id, "PredictedLogRevenue": np.expm1(pred_test)})
    .groupby("fullVisitorId")["PredictedLogRevenue"]
    .sum()
    .reset_index()
)
sub_df.columns = ["fullVisitorId", "PredictedLogRevenue"]
sub_df["PredictedLogRevenue"] = np.log1p(sub_df["PredictedLogRevenue"])
sub_df.to_csv("baseline_lgb.csv", index=False)

In [None]:
# Display the first rows of the submission frame as the cell output.
sub_df.head()


In [None]:
# Bar chart of the LightGBM feature importances (at most 50 features).
fig, ax = plt.subplots(figsize=(12, 18))
lgb.plot_importance(model, ax=ax, height=0.8, max_num_features=50)
ax.grid(False)
ax.set_title("LightGBM - Feature Importance", fontsize=15)
plt.show()

In [None]:
# Raw revenue values of the full train and test frames.
# NOTE(review): y_val is read from test_df, not from the validation split — confirm intent.
y_train, y_val = [
    frame["totals.transactionRevenue"].values for frame in (train_df, test_df)
]

In [None]:
# Feature-only copies of both frames: target, visitor id and date removed.
non_feature_cols = ["totals.transactionRevenue", "fullVisitorId", "date"]
df_train_x, df_test_x = [
    frame.drop(columns=non_feature_cols) for frame in (train_df, test_df)
]

In [None]:
# Display the first rows of the feature-only training frame as the cell output.
df_train_x.head()

In [None]:
#from keras.models import Sequential
#from keras.layers import Dense
#from keras.layers import LSTM, Bidirectional, Dropout
#from keras.callbacks import ReduceLROnPlateau

#X_train = df_train_x.values
#X_val = df_test_x.values
#y_train = y_train
#y_val = y_val
#X_train = X_train.reshape(X_train.shape[0],1,X_train.shape[1])
#X_val = X_val.reshape(X_val.shape[0],1,X_val.shape[1])

In [None]:
#from keras.layers import Input
#from keras.models import Model

#inputs = Input(shape=(1,21))
#x = Bidirectional(LSTM(200,recurrent_dropout=0.2, kernel_initializer='lecun_normal', return_sequences=True))(inputs)
#x = Bidirectional(LSTM(120,recurrent_dropout=0.2, kernel_initializer='lecun_normal'))(x)
#x = Dense(50, activation='sigmoid')(inputs)
#x = Dropout(0.1)(x)
#x = Dense(20,activation='elu')(x)
#output = Dense(1,activation='linear')(x)

#model2 = Model(inputs=inputs, outputs=output)
#model2.compile(loss='mse', optimizer='adam')
#model2.fit(X_train, y_train, epochs=4, batch_size=64, validation_data=(X_val, y_val), verbose=1, shuffle=False)

In [None]:
import xgboost as xgb
def run_xgb(X_train, y_train, X_val, y_val, X_test):
    """Train an XGBoost regressor with early stopping and predict on ``X_test``.

    Parameters
    ----------
    X_train, y_train
        Training features and target.
    X_val, y_val
        Validation features and target; registered as the ``'valid'`` entry of
        ``evals`` (the last entry, which XGBoost uses for early stopping).
    X_test
        Features to score with the trained model.

    Returns
    -------
    tuple
        ``(y_pred_submit, model)``: predictions for ``X_test`` limited to
        ``model.best_ntree_limit`` trees, and the trained booster.
    """
    # NOTE(review): 'reg:linear' and 'silent' are older parameter spellings —
    # confirm they are still accepted by the installed XGBoost version.
    params = {'objective': 'reg:linear',
              'eval_metric': 'rmse',
              'eta': 0.001,
              'max_depth': 10,
              'subsample': 0.6,
              'colsample_bytree': 0.6,
              'alpha':0.001,
              'random_state': 42,
              'silent': True}

    xgb_train_data = xgb.DMatrix(X_train, y_train)
    xgb_val_data = xgb.DMatrix(X_val, y_val)
    xgb_submit_data = xgb.DMatrix(X_test)

    # Up to 1000 rounds; stop after 100 rounds without improvement; log every 500 rounds.
    model = xgb.train(params, xgb_train_data,
                      num_boost_round=1000,
                      evals= [(xgb_train_data, 'train'), (xgb_val_data, 'valid')],
                      early_stopping_rounds=100,
                      verbose_eval=500
                     )

    # Only the submission predictions are returned, so the train/validation
    # predictions that used to be computed here (and then discarded) are gone.
    y_pred_submit = model.predict(xgb_submit_data, ntree_limit=model.best_ntree_limit)

    return y_pred_submit, model

In [None]:
%%time
# Fit XGBoost on the same development/validation split and score the test rows.
xgb_preds, xgb_model = run_xgb(
    X_train=dev_X,
    y_train=dev_y,
    X_val=val_X,
    y_val=val_y,
    X_test=test_X,
)

In [None]:
# Blend the LightGBM (weight 0.6) and XGBoost (weight 0.4) predictions in
# log1p space and write the ensemble submission.
sub_df = pd.DataFrame({"fullVisitorId":test_id})

# Clip BOTH models at zero before blending. Previously only pred_test was
# clipped, so a negative XGBoost output could pull the blend below zero; the
# per-visitor sum of expm1 values could then reach -1 or less and log1p would
# return -inf / NaN in the submission.
pred_test[pred_test<0] = 0
xgb_preds[xgb_preds<0] = 0

# Back to revenue per session, sum per visitor, then log1p of the total.
sub_df["PredictedLogRevenue"] = np.expm1(pred_test * 0.6 + xgb_preds * 0.4)
sub_df = sub_df.groupby("fullVisitorId")["PredictedLogRevenue"].sum().reset_index()
sub_df.columns = ["fullVisitorId", "PredictedLogRevenue"]
sub_df["PredictedLogRevenue"] = np.log1p(sub_df["PredictedLogRevenue"])
sub_df.to_csv("prediction_ensemble_xgb_lgb.csv", index=False)