In [None]:
import numpy as np
import pandas as pd
import csv
from tqdm import tqdm
import itertools

In [None]:
# Load the preprocessed inputs written by an earlier step:
# feature tables come from CSV, targets and test ids from .npy files.
test, X_train, X_val = (
    pd.read_csv(csv_path)
    for csv_path in ('test.csv', 'X_train.csv', 'X_val.csv')
)
Y_train, Y_val, test_ids = (
    np.load(npy_path)
    for npy_path in ('Y_train.npy', 'Y_val.npy', 'test_ids.npy')
)

In [None]:
# Sanity check: print the shape of every loaded object, one per line,
# in the order X_train, X_val, test, Y_train, Y_val, test_ids.
for loaded in (X_train, X_val, test, Y_train, Y_val, test_ids):
    print(loaded.shape)

In [None]:
from sklearn.metrics import mean_squared_error

def rmse(y_true, y_pred):
    """Return the root-mean-squared error of `y_pred` against `y_true`.

    The value is the square root of `mean_squared_error(y_true, y_pred)`,
    rounded to 5 decimal places.
    """
    mse = mean_squared_error(y_true, y_pred)
    root = np.sqrt(mse)
    return round(root, 5)

In [None]:
import xgboost as xgb
import lightgbm as lgb

In [None]:
#XGB parameter explanations

#eta : learning rate, in [0,1]
#silent : suppress log output (0 or 1)
#alpha : L1 regularization, lambda : L2 regularization
#tree_method : exact or hist (hist is faster because it uses binning and caching)
#max_depth : maximum tree depth; high values can cause overfitting
#subsample : fraction of observations sampled for each tree. Lower values make the algorithm more conservative and prevent overfitting, but values that are too small might lead to under-fitting.
#colsample_bytree : fraction of columns sampled for each tree
#random_state : random seed, for reproducible results

#num_boost_round : Number of boosting iterations.
#early_stopping_rounds : Validation error needs to decrease at least every early_stopping_rounds round(s) to continue training. 

In [None]:
# Candidate values for each axis of the XGBoost grid search.
# Boosting budget and early-stopping patience (in rounds).
num_boost_round = [
    100,
    300,
    500,
]
early_stopping_rounds = [
    100,
    300,
    500,
]
# Step sizes, from aggressive to very small.
learning_rate = [
    0.5,
    0.3,
    0.1,
    0.01,
    0.005,
    0.001,
]
# Row and column sampling fractions.
bagging_fraction = [
    0.25,
    0.5,
    0.75,
]
feature_fraction = [
    0.3,
    0.5,
    0.8,
]

In [None]:
# Exhaustive grid: one tuple per combination, ordered as
# (num_boost_round, early_stopping_rounds, learning_rate, bagging_fraction, feature_fraction).
parameter_space_xgb = [
    combo
    for combo in itertools.product(
        num_boost_round,
        early_stopping_rounds,
        learning_rate,
        bagging_fraction,
        feature_fraction,
    )
]

In [None]:
def run_xgb(X_train, y_train, X_val, y_val, X_test, parameters):
    """Train one XGBoost regressor for a single grid point and score it.

    Parameters
    ----------
    X_train, y_train : training features and targets.
    X_val, y_val : validation features and targets.
    X_test : features to predict for the submission.
    parameters : sequence
        Read positionally as (num_boost_round, early_stopping_rounds,
        eta, subsample, colsample_bytree).

    Returns
    -------
    tuple
        (test predictions, trained model, validation RMSE, training RMSE).
    """
    n_rounds, patience, eta, row_fraction, col_fraction = parameters[:5]

    # NOTE(review): 'reg:linear', 'silent' and `ntree_limit` look like legacy
    # XGBoost spellings -- confirm against the installed version.
    params = {
        'objective': 'reg:linear',
        'eval_metric': 'rmse',
        'eta': eta,
        'max_depth': 10,
        'subsample': row_fraction,  # 0-1
        'colsample_bytree': col_fraction,  # 0-1
        'alpha': 0.001,
        'lambda': 0.001,
        'random_state': 70,
        'silent': True,
    }

    dtrain = xgb.DMatrix(X_train, y_train)
    dvalid = xgb.DMatrix(X_val, y_val)
    dsubmit = xgb.DMatrix(X_test)

    # Both sets are monitored during training; validation is listed last.
    watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=n_rounds,
        evals=watchlist,
        early_stopping_rounds=patience,
        verbose_eval=500,
    )

    def _predict(dmatrix):
        # Predict with the trees up to the best iteration found in training.
        return model.predict(dmatrix, ntree_limit=model.best_ntree_limit)

    y_pred_train = _predict(dtrain)
    y_pred_val = _predict(dvalid)
    y_pred_submit = _predict(dsubmit)

    val_score = rmse(y_val, y_pred_val)
    train_score = rmse(y_train, y_pred_train)
    print(f"XGB : RMSE val: {val_score}  - RMSE train: {train_score}")
    return y_pred_submit, model, val_score, train_score

In [None]:
# Start a fresh in-memory list of test predictions and make sure the
# results log exists with a header row.
xgb_predictions = []
filename = "xgb.csv"
with open(filename, 'a', newline='') as resultFile:
    wr = csv.writer(resultFile, dialect='excel')
    # A file opened for append is positioned at its end, so position 0 means
    # the log is new or empty. Writing the header only then keeps re-runs of
    # this cell from inserting duplicate header rows between result rows.
    if resultFile.tell() == 0:
        wr.writerow(["index", "num_boost_round", "early_stopping_rounds", "learning_rate", "bagging_fraction", "feature_fraction","rmse_val", "rmse_train"])

In [None]:
# Run every XGBoost grid point; keep its test predictions in memory and
# append its scores to the results CSV.
print("parameter_space_xgb : %d"%len(parameter_space_xgb))
# tqdm wraps the list itself (not the enumerate object) so it knows the total
# and can show percentage and ETA.
for index, p in enumerate(tqdm(parameter_space_xgb)):
    try:
        xgb_preds, xgb_model, rmse_val, rmse_train = run_xgb(X_train, Y_train, X_val, Y_val, test, p)
        xgb_predictions.append(xgb_preds)
        # The file is reopened for each row so finished results are on disk
        # even if a later grid point fails.
        with open(filename,'a', newline='') as resultFile:
            wr = csv.writer(resultFile, dialect='excel')
            wr.writerow([index, p[0], p[1], p[2], p[3], p[4], rmse_val, rmse_train])

    except Exception as e:
        # Best effort: report the failing grid point and continue.
        # NOTE(review): a failed point is skipped in both the CSV and
        # xgb_predictions, so the CSV "index" column can stop matching list
        # positions after a failure.
        print(e)
        print(index)

In [None]:
# Persist the per-grid-point XGBoost test predictions as one array,
# then drop the in-memory list.
np.save(file="xgb_predictions.npy", arr=np.array(xgb_predictions))
del xgb_predictions

In [None]:
# Candidate values for each axis of the LightGBM grid search.
# These assignments rebind the names used for the XGBoost grid above.
# NOTE(review): LightGBM documents num_boost_round as an alias of
# num_iterations, so sweeping both likely repeats identical runs -- confirm.
num_iterations = [
    500,
    1000,
    2500,
]
num_boost_round = [
    100,
    300,
    500,
]
early_stopping_rounds = [
    100,
    300,
    500,
]
learning_rate = [
    0.5,
    0.3,
    0.1,
    0.01,
    0.005,
    0.001,
]
num_leaves = [
    20,
    30,
    50,
]
bagging_fraction = [
    0.25,
    0.5,
    0.75,
]
feature_fraction = [
    0.3,
    0.5,
    0.8,
]

In [None]:
# Exhaustive grid: one tuple per combination, ordered as
# (num_iterations, num_boost_round, early_stopping_rounds, learning_rate,
#  num_leaves, bagging_fraction, feature_fraction).
parameter_space_lgb = [
    combo
    for combo in itertools.product(
        num_iterations,
        num_boost_round,
        early_stopping_rounds,
        learning_rate,
        num_leaves,
        bagging_fraction,
        feature_fraction,
    )
]

In [None]:
def run_lgb(X_train, y_train, X_val, y_val, X_test, parameters):
    """Train one LightGBM regressor for a single grid point and score it.

    Parameters
    ----------
    X_train, y_train : training features and targets.
    X_val, y_val : validation features and targets.
    X_test : features to predict for the submission.
    parameters : sequence
        Read positionally as (num_iterations, num_boost_round,
        early_stopping_rounds, learning_rate, num_leaves,
        bagging_fraction, feature_fraction). The second entry is accepted
        for compatibility with the grid tuples but is not used (see below).

    Returns
    -------
    tuple
        (test predictions, trained model, validation RMSE, training RMSE).
    """
    (n_iter, _ignored_num_boost_round, patience,
     lr, leaves, bag_fraction, feat_fraction) = parameters[:7]

    params = {
        "objective" : "regression_l1",
        "learning_rate" : lr,
        "num_leaves" : leaves,
        "num_threads" : 4,
        "metric" : "rmse",
        "bagging_fraction" : bag_fraction, #subsample
        # LightGBM only applies bagging_fraction when bagging_freq is non-zero;
        # without it the bagging_fraction axis of the grid has no effect.
        "bagging_freq" : 1,
        "feature_fraction" : feat_fraction, #colsample_bytree
        "verbosity" : -1,
    }

    lgb_train_data = lgb.Dataset(X_train, label=y_train)
    lgb_val_data = lgb.Dataset(X_val, label=y_val)

    # "num_iterations" in params and the num_boost_round argument are aliases,
    # and LightGBM uses the params entry when both are given. The effective
    # value (parameters[0]) is therefore passed once, through the argument.
    # NOTE(review): early_stopping_rounds/verbose_eval as keyword arguments
    # look like the legacy lgb.train API -- confirm against the installed version.
    model = lgb.train(params, lgb_train_data,
                      num_boost_round=n_iter,
                      valid_sets=[lgb_train_data, lgb_val_data],
                      early_stopping_rounds=patience,
                      verbose_eval=500
                     )

    # Predict with the best iteration recorded during training.
    y_pred_train = model.predict(X_train, num_iteration=model.best_iteration)
    y_pred_val = model.predict(X_val, num_iteration=model.best_iteration)
    y_pred_submit = model.predict(X_test, num_iteration=model.best_iteration)

    return y_pred_submit, model, rmse(y_val, y_pred_val), rmse(y_train, y_pred_train)

In [None]:
# Start a fresh in-memory list of test predictions and make sure the
# results log exists with a header row.
lgb_predictions = []
filename = "lgb.csv"

with open(filename, 'a', newline='') as resultFile:
    wr = csv.writer(resultFile, dialect='excel')
    # A file opened for append is positioned at its end, so position 0 means
    # the log is new or empty. Writing the header only then keeps re-runs of
    # this cell from inserting duplicate header rows between result rows.
    if resultFile.tell() == 0:
        wr.writerow(["index", "num_iterations", "num_boost_round", "early_stopping_rounds", "learning_rate", "num_leaves", "bagging_fraction", "feature_fraction","rmse_val", "rmse_train"])

In [None]:
# Run every LightGBM grid point; keep its test predictions in memory and
# append its scores to the results CSV.
# tqdm wraps the list itself (not the enumerate object) so it knows the total
# and can show percentage and ETA.
for index, p in enumerate(tqdm(parameter_space_lgb)):
    try:
        lgb_preds, lgb_model, rmse_val, rmse_train = run_lgb(X_train, Y_train, X_val, Y_val, test, p)
        lgb_predictions.append(lgb_preds)
        # The file is reopened for each row so finished results are on disk
        # even if a later grid point fails.
        with open(filename,'a', newline='') as resultFile:
            wr = csv.writer(resultFile, dialect='excel')
            wr.writerow([index, p[0], p[1], p[2], p[3], p[4], p[5], p[6], rmse_val, rmse_train])

    except Exception as e:
        # Best effort: report the failing grid point and continue.
        # NOTE(review): a failed point is skipped in both the CSV and
        # lgb_predictions, so the CSV "index" column can stop matching list
        # positions after a failure.
        print(e)
        print(index)

In [None]:
# Persist the per-grid-point LightGBM test predictions as one array,
# then drop the in-memory list.
np.save(file="lgb_predictions.npy", arr=np.array(lgb_predictions))
del lgb_predictions

In [None]:
# Reload the saved grid-search predictions from disk.
xgb_predictions, lgb_predictions = (
    np.load(saved_path)
    for saved_path in ("xgb_predictions.npy", "lgb_predictions.npy")
)

# Intended to hold the lowest-loss run of each model; for now the first
# saved entry of each is taken as a sample.
xgb_pred, lgb_pred = xgb_predictions[0], lgb_predictions[0]

In [None]:
# Weighted blends of the selected LightGBM and XGBoost predictions.
# These use lgb_pred / xgb_pred (chosen in the loading cell) rather than
# lgb_preds / xgb_preds, which are only leftovers of the last grid-search
# iteration and do not exist on a fresh kernel that skips the search.
ensemble_preds_lgb_90_xgb_10 = 0.90 * lgb_pred + 0.10 * xgb_pred
ensemble_preds_lgb_80_xgb_20 = 0.80 * lgb_pred + 0.20 * xgb_pred
ensemble_preds_lgb_70_xgb_30 = 0.70 * lgb_pred + 0.30 * xgb_pred
ensemble_preds_lgb_60_xgb_40 = 0.60 * lgb_pred + 0.40 * xgb_pred
ensemble_preds_lgb_50_xgb_50 = 0.50 * lgb_pred + 0.50 * xgb_pred
ensemble_preds_lgb_40_xgb_60 = 0.40 * lgb_pred + 0.60 * xgb_pred
ensemble_preds_lgb_30_xgb_70 = 0.30 * lgb_pred + 0.70 * xgb_pred
ensemble_preds_lgb_20_xgb_80 = 0.20 * lgb_pred + 0.80 * xgb_pred
ensemble_preds_lgb_10_xgb_90 = 0.10 * lgb_pred + 0.90 * xgb_pred

In [None]:
# Every candidate to submit, keyed by the suffix used in its output file name.
# The two single-model entries use the selected xgb_pred / lgb_pred rather
# than xgb_preds / lgb_preds, which are only leftovers of the last grid-search
# iteration. The keys are unchanged so output file names stay the same.
predictions={
    'xgb_preds' : xgb_pred,
    'lgb_preds' : lgb_pred,
    'ensemble_preds_lgb_90_xgb_10' : ensemble_preds_lgb_90_xgb_10,
    'ensemble_preds_lgb_80_xgb_20' : ensemble_preds_lgb_80_xgb_20,
    'ensemble_preds_lgb_70_xgb_30' : ensemble_preds_lgb_70_xgb_30,
    'ensemble_preds_lgb_60_xgb_40' : ensemble_preds_lgb_60_xgb_40,
    'ensemble_preds_lgb_50_xgb_50' : ensemble_preds_lgb_50_xgb_50,
    'ensemble_preds_lgb_40_xgb_60' : ensemble_preds_lgb_40_xgb_60,
    'ensemble_preds_lgb_30_xgb_70' : ensemble_preds_lgb_30_xgb_70,
    'ensemble_preds_lgb_20_xgb_80' : ensemble_preds_lgb_20_xgb_80,
    'ensemble_preds_lgb_10_xgb_90' : ensemble_preds_lgb_10_xgb_90
}

In [None]:
# Write one submission CSV per candidate prediction vector.
for k, v in predictions.items():
    # Clip negatives to zero on a copy, so the arrays stored in `predictions`
    # are not modified and re-running the cell is side-effect free.
    clipped = np.clip(v, 0, None)
    submission = pd.DataFrame({"fullVisitorId": test_ids})
    submission["PredictedLogRevenue"] = clipped
    # Collapse to one row per visitor by summing that visitor's predictions.
    # NOTE(review): the values are summed as-is. If the targets are
    # log-transformed revenue, the sum probably belongs in revenue space
    # (expm1 -> sum -> log1p) -- confirm against the preprocessing step.
    submission = submission.groupby("fullVisitorId")["PredictedLogRevenue"].sum().reset_index()
    submission.columns = ["fullVisitorId", "PredictedLogRevenue"]
    submission.to_csv("submission_%s.csv"%k, index=False)