In [1]:
# importing the libraries

import os
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.model_selection import StratifiedKFold

import xgboost as xgb
import tensorflow as tf

from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK, STATUS_RUNNING

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# setting up the parameters

root_dir = os.path.dirname(os.path.abspath(os.getcwd()))
pd.set_option("display.max_rows", 16)
pd.set_option("display.max_columns", None)

## Helper Functions

In [3]:
def train_test_split(dataframe, validation_ratio):
    """
    randomly splits the dataset into train and valid sets
    
    input: dataframe, validation ratio (the %age of data to feed into the )
    output: trainset and validset
    """
    num_train = len(dataframe)
    indices = list(range(num_train))
    np.random.shuffle(indices)
    split = int(np.floor(validation_ratio*num_train))
    train_idx, valid_idx = indices[split:], indices[:split]
    train_df, valid_df = dataframe.iloc[train_idx], dataframe.iloc[valid_idx]
    return train_df, valid_df

In [4]:
# importing the dataset
df = pd.read_csv(os.path.join(root_dir, "data", "processed_data", "train_aggdf.csv"))
df.head()

Unnamed: 0,impression_id,impression_time,user_id,app_code,os_version,is_4G,is_click,diff_time_mean,diff_time_max,diff_time_min,count_unique_app,hour,minute,vc_app_code,vc_user_id,app_code_count_unique_user,click_count_user_mean,time_elapsed_user,click_count_app_mean,time_elapsed_app,inst_count,user_unique_sessions,user_unique_item_ids,user_unique_category_1,user_unique_category_2,user_unique_category_3,user_unique_product_type,user_item_id_mode,user_category_1_mode,user_category_2_mode,user_category_3_mode,user_product_type_mode
0,c4ca4238a0b923820dcc509a6f75849b,2018-11-15 00:00:00,87862,422,2,0,0,74670.0,148200.0,1140.0,1,0,0,395,3,51,-1.0,-1.0,-1.0,-1.0,1,1,1,1,1,1,1,43886,11.0,35.0,20.0,5622.0
1,c81e728d9d4c2f636f067f89cc14862c,2018-11-15 00:00:00,89464,129,0,0,0,101901.818182,347520.0,180.0,1,0,0,7050,23,2000,-1.0,-1.0,-1.0,-1.0,226,40,150,15,50,90,132,38517,17.0,9.0,62.0,8028.0
2,eccbc87e4b5ce2fe28308fd9f2a7baf3,2018-11-15 00:00:00,58442,127,1,0,0,68812.727273,165720.0,8400.0,1,0,0,10851,34,2039,-1.0,-1.0,-1.0,-1.0,32,15,12,8,10,10,12,73224,1.0,64.0,263.0,5164.0
3,a87ff679a2f3e71d9181a67b7542122c,2018-11-15 00:00:00,4238,371,1,0,0,540.0,540.0,540.0,1,0,0,9343,2,1819,-1.0,-1.0,-1.0,-1.0,109,30,52,12,28,40,50,37336,17.0,8.0,84.0,231.0
4,45c48cce2e2d7fbdea1afc51c7c6ad26,2018-11-15 00:01:00,63410,467,1,1,1,45745.882353,167340.0,360.0,2,0,1,422,52,24,-1.0,-1.0,-1.0,-1.0,7,5,3,2,2,2,3,43209,4.0,74.0,292.0,577.0


In [9]:
# spliting the dataset
train_df, test_df = train_test_split(df, validation_ratio = 0.2)

print("shape of train_df = {}".format(train_df.shape))
print("shape of test_df = {}".format(test_df.shape))

# define predictors and targets in trainset and testset
x_train, y_train = train_df.drop(columns = ["impression_id", "impression_time", "is_click"]), train_df["is_click"]
x_test, y_test = test_df.drop(columns = ["impression_id", "impression_time", "is_click"]), test_df["is_click"]

print("-"*10)
print("Shape of x_train = {} ... y_train = {}".format(x_train.shape, y_train.shape))
print("Shape of x_test = {} ... y_test = {}".format(x_test.shape, y_test.shape))

shape of train_df = (190088, 32)
shape of test_df = (47521, 32)
----------
Shape of x_train = (190088, 29) ... y_train = (190088,)
Shape of x_test = (47521, 29) ... y_test = (47521,)


---
# Machine Learning Models
## 1. XGBoost

In [13]:
def hyperparameter_tuning(params):
    """
    hypertunes a XGBoost model
    
    inp: parameters
    outp: score per fold
    """
    params = {
        "max_depth": int(params["max_depth"]),                              # max depth of the tree
        "gamma": "{:.3f}".format(params["gamma"]),                          # min loss reduction required to make a split
        "subsample": "{:.2f}".format(params["subsample"]),                  # denotes the fraction of observations to be randomly samples for each tree
        "reg_alpha": "{:.3f}".format(params["reg_alpha"]),                  # L1 regularization weight
        "reg_lambda": "{:.3f}".format(params["reg_lambda"]),                # L2 regularization weight
        "learning_rate": "{:.3f}".format(params["learning_rate"]),          # learning rate of XGB
        "num_leaves": "{:.3f}".format(params["num_leaves"]),                 
        "colsample_bytree": "{:.3f}".format(params["colsample_bytree"]),
        "min_child_samples": "{:.3f}".format(params["min_child_samples"]),
        "feature_fraction": "{:.3f}".format(params["feature_fraction"]),
        "bagging_fraction": "{:.3f}".format(params["bagging_fraction"])
    }
    
    print("#"*25)
    print("Params = {}".format(params))
    FOLDS = 10  # defining the folds required
    count = 1   # count of HPT cycles
    skf = StratifiedKFold(n_splits = FOLDS, shuffle = True, random_state = 42)
    y_oof = np.zeros(x_train.shape[0])
    score_mean = 0
    for trn_idx, val_idx in skf.split(x_train, y_train):
        # define the classifier
        clf = xgb.XGBClassifier(random_state = 42, 
                                verbose = True, 
                                tree_method = "gpu_hist", 
                                **params)
        # spliting data into train and valid sets
        train_x, valid_x = x_train.iloc[trn_idx], x_train.iloc[val_idx]
        train_y, valid_y = y_train.iloc[trn_idx], y_train.iloc[val_idx]
        
        # fit the estimator and predict
        clf.fit(train_x, train_y)
        pred = clf.predict(valid_x)
        
        # eval metrics
        score = make_scorer(roc_auc_score, needs_proba = True)(clf, valid_x, valid_y)
        score_mean += score
        print("Count = {} ... score = {}".format(count, score))
        count += 1
    
    gc.collect()
    print("Mean ROC_AUC = {}".format(score_mean / FOLDS))
    del train_x, valid_x, train_y, valid_y, clf, score
    
    return -(score_mean/FOLDS)

In [14]:
space = {
    "max_depth": hp.quniform("max_depth", 3, 8, 1),
    "reg_alpha": hp.uniform("reg_alpha", 0.01, 0.05),
    "reg_lambda": hp.uniform("reg_lambda", 0.01, 0.05),
    "learning_rate": hp.uniform("learning_rate", 0.01, 0.2),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.3, 0.9),
    "gamma": hp.uniform("gamma", 0.01, 0.7),
    "num_leaves": hp.choice("num_leaves", list(range(20, 250, 10))),
    "min_child_samples": hp.choice("min_child_samples", list(range(100, 250, 10))),
    "subsample": hp.choice('subsample', [0.2, 0.4, 0.5, 0.6, 0.7, .8, .9]),
    'feature_fraction': hp.uniform('feature_fraction', 0.4, .8),
    'bagging_fraction': hp.uniform('bagging_fraction', 0.4, .9)
}

In [15]:
%%time

# Set algoritm parameters
best = fmin(fn = hyperparameter_tuning,
            space = space,
            algo = tpe.suggest,
            max_evals = 25)

# Print best parameters
best_params = space_eval(space, best)

#########################                           
Params = {'max_depth': 4, 'gamma': '0.161', 'subsample': '0.50', 'reg_alpha': '0.048', 'reg_lambda': '0.013', 'learning_rate': '0.109', 'num_leaves': '230.000', 'colsample_bytree': '0.504', 'min_child_samples': '220.000', 'feature_fraction': '0.625', 'bagging_fraction': '0.661'}
  0%|          | 0/25 [00:00<?, ?it/s, best loss: ?]


XGBoostError: [14:34:08] /workspace/src/tree/updater_gpu_hist.cu:1407: Exception in gpu_hist: NCCL failure :unhandled cuda error /workspace/src/tree/../common/device_helpers.cuh(896)

Stack trace:
  [bt] (0) /home/arjun/.local/lib/python3.6/site-packages/xgboost/./lib/libxgboost.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x24) [0x7fbd158a5cb4]
  [bt] (1) /home/arjun/.local/lib/python3.6/site-packages/xgboost/./lib/libxgboost.so(xgboost::tree::GPUHistMakerSpecialised<xgboost::detail::GradientPairInternal<double> >::Update(xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal<float> >*, xgboost::DMatrix*, std::vector<xgboost::RegTree*, std::allocator<xgboost::RegTree*> > const&)+0x1270) [0x7fbd15ae17f0]
  [bt] (2) /home/arjun/.local/lib/python3.6/site-packages/xgboost/./lib/libxgboost.so(xgboost::gbm::GBTree::BoostNewTrees(xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal<float> >*, xgboost::DMatrix*, int, std::vector<std::unique_ptr<xgboost::RegTree, std::default_delete<xgboost::RegTree> >, std::allocator<std::unique_ptr<xgboost::RegTree, std::default_delete<xgboost::RegTree> > > >*)+0xa81) [0x7fbd1592b791]
  [bt] (3) /home/arjun/.local/lib/python3.6/site-packages/xgboost/./lib/libxgboost.so(xgboost::gbm::GBTree::DoBoost(xgboost::DMatrix*, xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal<float> >*, xgboost::ObjFunction*)+0xd65) [0x7fbd1592cc95]
  [bt] (4) /home/arjun/.local/lib/python3.6/site-packages/xgboost/./lib/libxgboost.so(xgboost::LearnerImpl::UpdateOneIter(int, xgboost::DMatrix*)+0x396) [0x7fbd1593f556]
  [bt] (5) /home/arjun/.local/lib/python3.6/site-packages/xgboost/./lib/libxgboost.so(XGBoosterUpdateOneIter+0x35) [0x7fbd158a2aa5]
  [bt] (6) /usr/lib/x86_64-linux-gnu/libffi.so.6(ffi_call_unix64+0x4c) [0x7fbd654e3dae]
  [bt] (7) /usr/lib/x86_64-linux-gnu/libffi.so.6(ffi_call+0x22f) [0x7fbd654e371f]
  [bt] (8) /usr/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(_ctypes_callproc+0x2b4) [0x7fbd656f75d4]



In [16]:
import torch
torch.cuda.is_available()

False