In [None]:
import numpy as np
import pandas as pd
import csv
from tqdm import tqdm

In [None]:
# Load the preprocessed data: feature tables are read from CSV,
# targets and test ids from NumPy .npy files.
X_train, X_val, test = (
    pd.read_csv(csv_name)
    for csv_name in ('X_train.csv', 'X_val.csv', 'test.csv')
)
y_train, y_val, test_ids = (
    np.load(npy_name)
    for npy_name in ('Y_train.npy', 'Y_val.npy', 'test_ids.npy')
)

In [None]:
# Sanity check: print the shape of every loaded object, one per line.
print(
    *(loaded.shape for loaded in (X_train, X_val, test, y_train, y_val, test_ids)),
    sep='\n',
)

In [None]:
from sklearn.metrics import mean_squared_error

def rmse(y_true, y_pred):
    """Return the root-mean-squared error of `y_pred` against `y_true`.

    The result is rounded to 5 decimal places.
    """
    mse = mean_squared_error(y_true, y_pred)
    root = np.sqrt(mse)
    return round(root, 5)

In [None]:
import xgboost as xgb
import lightgbm as lgb

In [None]:
import gc

from skopt.space import Real, Integer
from skopt.utils import use_named_args
import itertools
from sklearn.metrics import roc_auc_score
from skopt import gp_minimize

In [None]:
# Search space: lower/upper bound for each tuned hyperparameter.
# The order matters: `objective` and `run_xgb` read the sampled values by
# position (index 0..4), not by name.
XGB_space = [
    Integer(low=100, high=500, name='num_boost_round'),
    Integer(low=100, high=500, name='early_stopping_rounds'),
    Real(low=0.001, high=0.5, name='learning_rate'),
    Real(low=0.3, high=0.8, name='bagging_fraction'),  # passed to XGBoost as 'subsample'
    Real(low=0.3, high=0.8, name='feature_fraction'),  # passed to XGBoost as 'colsample_bytree'
]

In [None]:
def objective(values):
    """Train XGBoost at one search-space point and return the validation RMSE.

    This is the function minimised by ``gp_minimize``.

    Parameters
    ----------
    values : sequence
        Hyperparameter values in the order of ``XGB_space``:
        ``num_boost_round``, ``early_stopping_rounds``, ``learning_rate``,
        ``bagging_fraction``, ``feature_fraction``.

    Returns
    -------
    float
        ``rmse(y_val, y_pred_val)`` computed on the notebook-level
        ``X_val`` / ``y_val`` split; lower is better.
    """
    # The search space is positional: name the entries once instead of
    # indexing `values` with magic numbers further down.
    n_rounds, stopping_rounds, eta, subsample, colsample_bytree = values[:5]

    # NOTE(review): 'reg:linear', 'silent' and `ntree_limit` are deprecated
    # in recent XGBoost releases; they are kept so this function stays in
    # sync with `run_xgb` and the installed version — confirm before upgrading.
    params = {'objective': 'reg:linear',
              'eval_metric': 'rmse',
              'eta': eta,
              'subsample': subsample,  # 0-1
              'colsample_bytree': colsample_bytree,  # 0-1
              'random_state': 70,
              'silent': True}

    xgb_train_data = xgb.DMatrix(X_train, y_train)
    xgb_val_data = xgb.DMatrix(X_val, y_val)

    # 'valid' is listed last in `evals`; per the xgboost.train docs the last
    # entry is the one used for early stopping.
    model = xgb.train(params, xgb_train_data,
                      num_boost_round=n_rounds,
                      evals=[(xgb_train_data, 'train'), (xgb_val_data, 'valid')],
                      early_stopping_rounds=stopping_rounds,
                      verbose_eval=500
                     )

    # Only the validation score is returned, so only the validation split is
    # predicted; this function runs once per optimisation call.
    y_pred_val = model.predict(xgb_val_data, ntree_limit=model.best_ntree_limit)
    gc.collect()

    return rmse(y_val, y_pred_val)

In [None]:
# Hyperparameter tuning with Bayesian optimisation (Gaussian-process surrogate).
# n_calls is the total number of `objective` evaluations; the first
# n_random_starts of them are taken at random points.
res_gp = gp_minimize(
    func=objective,
    dimensions=XGB_space,
    n_calls=400,
    n_random_starts=10,
    random_state=70,
)

print("Best score=%.4f" % res_gp.fun)

In [None]:
# Best hyperparameter values found, listed in the same order as XGB_space
res_gp.x

In [None]:
# Lowest objective value found, i.e. the validation RMSE returned by `objective`
res_gp.fun

In [None]:
from skopt.plots import plot_convergence

# Convergence trace of the optimisation run stored in res_gp
plot_convergence(res_gp)

In [None]:
# Uses the same random_state value as the Bayesian optimization objective.
def run_xgb(X_train, y_train, X_val, y_val, X_test, parameters):
    """Train an XGBoost regressor and predict on the test features.

    Parameters
    ----------
    X_train, y_train : training features and targets.
    X_val, y_val : validation features and targets; also used for early stopping.
    X_test : features to produce submission predictions for.
    parameters : sequence read by position as
        ``[num_boost_round, early_stopping_rounds, eta, subsample,
        colsample_bytree]``.

    Returns
    -------
    tuple
        ``(test predictions, trained booster, validation RMSE, train RMSE)``.
    """
    boost_rounds = parameters[0]
    stopping_rounds = parameters[1]

    # Optional: 'tree_method' : 'gpu_hist'
    booster_params = {
        'objective': 'reg:linear',
        'eval_metric': 'rmse',
        'eta': parameters[2],
        'subsample': parameters[3],         # 0-1
        'colsample_bytree': parameters[4],  # 0-1
        'random_state': 70,
        'silent': True,
    }

    dtrain = xgb.DMatrix(X_train, y_train)
    dval = xgb.DMatrix(X_val, y_val)
    dtest = xgb.DMatrix(X_test)

    watchlist = [(dtrain, 'train'), (dval, 'valid')]
    model = xgb.train(
        booster_params,
        dtrain,
        num_boost_round=boost_rounds,
        evals=watchlist,
        early_stopping_rounds=stopping_rounds,
        verbose_eval=500,
    )

    def predict_best(dmatrix):
        # Predict with the trees up to the booster's best_ntree_limit.
        return model.predict(dmatrix, ntree_limit=model.best_ntree_limit)

    train_pred = predict_best(dtrain)
    val_pred = predict_best(dval)
    submit_pred = predict_best(dtest)

    # Score once and reuse the numbers for both the log line and the return value.
    val_score = rmse(y_val, val_pred)
    train_score = rmse(y_train, train_pred)

    print(f"XGB : RMSE val: {val_score}  - RMSE train: {train_score}")
    return submit_pred, model, val_score, train_score

In [None]:
# Retrain with the best hyperparameters found by the optimiser (res_gp.x)
# and predict on the test set.
xgb_preds, xgb_model, rmse_val, rmse_train = run_xgb(
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    X_test=test,
    parameters=res_gp.x,
)

In [None]:
# Test-set predictions keyed by a label; the key becomes part of the
# submission file name.
predictions = {'xgb_preds': xgb_preds}

In [None]:
# Write one submission CSV per entry in `predictions`.
for model_name, raw_preds in predictions.items():
    # Clip negative predictions to 0 on a copy: np.clip returns a new array,
    # so the array stored in `predictions` is not modified in place.
    clipped_preds = np.clip(raw_preds, 0, None)

    # NOTE(review): per-row predictions are summed per visitor as-is. If the
    # targets are log1p-transformed revenue (the column name suggests log
    # space — TODO confirm against the preprocessing), the sum should
    # probably be taken on np.expm1(preds) and mapped back with np.log1p.
    # The original code re-assigned the column to itself after the groupby,
    # which looks like a leftover of that step.
    submission = (
        pd.DataFrame({"fullVisitorId": test_ids,
                      "PredictedLogRevenue": clipped_preds})
        .groupby("fullVisitorId")["PredictedLogRevenue"]
        .sum()
        .reset_index()
    )
    submission.to_csv("submission_%s.csv" % model_name, index=False)