In [1]:
# importing the libraries

import os
import gc
import random
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import roc_auc_score, make_scorer, matthews_corrcoef, f1_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold

import xgboost as xgb
import shap
import tensorflow as tf

from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK, STATUS_RUNNING

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# setting up the parameters

root_dir = os.path.dirname(os.path.abspath(os.getcwd()))
pd.set_option("display.max_rows", 16)
pd.set_option("display.max_columns", None)
warnings.filterwarnings("ignore")

## Helper Functions

In [3]:
def train_test_split(dataframe, validation_ratio):
    """
    randomly splits the dataset into train and valid sets
    
    input: dataframe, validation ratio (the %age of data to feed into the )
    output: trainset and validset
    """
    num_train = len(dataframe)
    indices = list(range(num_train))
    np.random.shuffle(indices)
    split = int(np.floor(validation_ratio*num_train))
    train_idx, valid_idx = indices[split:], indices[:split]
    train_df, valid_df = dataframe.iloc[train_idx], dataframe.iloc[valid_idx]
    return train_df, valid_df

In [4]:
# importing the dataset
df = pd.read_csv(os.path.join(root_dir, "data", "processed_data", "train_aggdf.csv"))
df.head()

Unnamed: 0,impression_id,impression_time,user_id,app_code,os_version,is_4G,is_click,diff_time_mean,diff_time_max,diff_time_min,count_unique_app,hour,minute,vc_app_code,vc_user_id,app_code_count_unique_user,click_count_user_mean,time_elapsed_user,click_count_app_mean,time_elapsed_app,inst_count,user_unique_sessions,user_unique_item_ids,user_unique_category_1,user_unique_category_2,user_unique_category_3,user_unique_product_type,user_item_id_mode,user_category_1_mode,user_category_2_mode,user_category_3_mode,user_product_type_mode
0,c4ca4238a0b923820dcc509a6f75849b,2018-11-15 00:00:00,87862,422,2,0,0,74670.0,148200.0,1140.0,1,0,0,395,3,51,-1.0,-1.0,-1.0,-1.0,1,1,1,1,1,1,1,43886,11.0,35.0,20.0,5622.0
1,c81e728d9d4c2f636f067f89cc14862c,2018-11-15 00:00:00,89464,129,0,0,0,101901.818182,347520.0,180.0,1,0,0,7050,23,2000,-1.0,-1.0,-1.0,-1.0,226,40,150,15,50,90,132,38517,17.0,9.0,62.0,8028.0
2,eccbc87e4b5ce2fe28308fd9f2a7baf3,2018-11-15 00:00:00,58442,127,1,0,0,68812.727273,165720.0,8400.0,1,0,0,10851,34,2039,-1.0,-1.0,-1.0,-1.0,32,15,12,8,10,10,12,73224,1.0,64.0,263.0,5164.0
3,a87ff679a2f3e71d9181a67b7542122c,2018-11-15 00:00:00,4238,371,1,0,0,540.0,540.0,540.0,1,0,0,9343,2,1819,-1.0,-1.0,-1.0,-1.0,109,30,52,12,28,40,50,37336,17.0,8.0,84.0,231.0
4,45c48cce2e2d7fbdea1afc51c7c6ad26,2018-11-15 00:01:00,63410,467,1,1,1,45745.882353,167340.0,360.0,2,0,1,422,52,24,-1.0,-1.0,-1.0,-1.0,7,5,3,2,2,2,3,43209,4.0,74.0,292.0,577.0


In [5]:
# spliting the dataset
train_df, test_df = train_test_split(df, validation_ratio = 0.2)

print("shape of train_df = {}".format(train_df.shape))
print("shape of test_df = {}".format(test_df.shape))

# define predictors and targets in trainset and testset
x_train, y_train = train_df.drop(columns = ["impression_id", "user_id", "impression_time", "is_click"]), train_df["is_click"]
x_test, y_test = test_df.drop(columns = ["impression_id", "user_id", "impression_time", "is_click"]), test_df["is_click"]

print("-"*10)
print("Shape of x_train = {} ... y_train = {}".format(x_train.shape, y_train.shape))
print("Shape of x_test = {} ... y_test = {}".format(x_test.shape, y_test.shape))

shape of train_df = (190088, 32)
shape of test_df = (47521, 32)
----------
Shape of x_train = (190088, 28) ... y_train = (190088,)
Shape of x_test = (47521, 28) ... y_test = (47521,)


---
# Machine Learning Models
## 1. XGBoost

In [16]:
def f1_eval(y_pred, y_true):
    """
    custom eval metric for XGBClassifier
    """
    err = 1 - f1_score(y_true, np.round(y_pred))
    return "f1_err", err

In [21]:
def hyperparameter_tuning(params):
    """
    hypertunes a XGBoost model
    
    inp: parameters
    outp: score per fold
    """
    params = {
        "max_depth": int(params["max_depth"]),                              # max depth of the tree
        "gamma": "{:.3f}".format(params["gamma"]),                          # min loss reduction required to make a split
        "subsample": "{:.2f}".format(params["subsample"]),                  # denotes the fraction of observations to be randomly samples for each tree
        "reg_alpha": "{:.3f}".format(params["reg_alpha"]),                  # L1 regularization weight
        "reg_lambda": "{:.3f}".format(params["reg_lambda"]),                # L2 regularization weight
        "learning_rate": "{:.3f}".format(params["learning_rate"]),          # learning rate of XGB
        "num_leaves": "{:.3f}".format(params["num_leaves"]),                 
        "colsample_bytree": "{:.3f}".format(params["colsample_bytree"]),
        "min_child_samples": "{:.3f}".format(params["min_child_samples"]),
        "feature_fraction": "{:.3f}".format(params["feature_fraction"]),
        "bagging_fraction": "{:.3f}".format(params["bagging_fraction"]),
        "objective": "binary:logistic",
        #"f_eval": f1_eval,
        "eval_metric": "error"
    }
    
    print("#"*25)
    print("Params = {}".format(params))
    FOLDS = 10  # defining the folds required
    count = 1   # count of HPT cycles
    skf = StratifiedKFold(n_splits = FOLDS, shuffle = True, random_state = 42)
    y_oof = np.zeros(x_train.shape[0])
    ROC_mean = 0
    for trn_idx, val_idx in skf.split(x_train, y_train):
        # define the classifier
        clf = xgb.XGBClassifier(random_state = 42, 
                                verbose = True, 
                                tree_method = "gpu_hist",
                                **params)
        # spliting data into train and valid sets
        train_x, valid_x = x_train.iloc[trn_idx], x_train.iloc[val_idx]
        train_y, valid_y = y_train.iloc[trn_idx], y_train.iloc[val_idx]
        
        # fit the estimator and predict
        clf.fit(train_x, train_y)
        pred = clf.predict(valid_x)
        # eval metrics
        score_ROC = make_scorer(roc_auc_score, needs_proba = True)(clf, valid_x, valid_y)
        score_F1 = f1_score(valid_y.values, pred, average = "binary")
        score_MCC = matthews_corrcoef(valid_y.values, pred)
        ROC_mean += score_ROC
        print("Count = {} ... score_ROC = {:.4f} ... score_F1 = {:.4f} ... score_MCC = {:.4f}".format(count, score_ROC, score_F1, score_MCC))
        count += 1
    
    gc.collect()
    print("Mean ROC_AUC = {:.4}".format(ROC_mean / FOLDS))
    del train_x, valid_x, train_y, valid_y, clf, score_ROC
    
    return -(ROC_mean/FOLDS)

In [22]:
space = {
    "max_depth": hp.quniform("max_depth", 6, 8, 1),
    "reg_alpha": hp.uniform("reg_alpha", 0.01, 0.05),
    "reg_lambda": hp.uniform("reg_lambda", 0.01, 0.05),
    "learning_rate": hp.uniform("learning_rate", 0.001, 0.2),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.3, 0.9),
    "gamma": hp.uniform("gamma", 0.01, 0.7),
    "num_leaves": hp.choice("num_leaves", list(range(20, 250, 10))),
    "min_child_samples": hp.choice("min_child_samples", list(range(100, 250, 10))),
    "subsample": hp.choice('subsample', [0.2, 0.4, 0.5, 0.6, 0.7, .8, .9]),
    'feature_fraction': hp.uniform('feature_fraction', 0.4, .8),
    'bagging_fraction': hp.uniform('bagging_fraction', 0.4, .9)
}

In [23]:
%%time

# Set algoritm parameters
best = fmin(fn = hyperparameter_tuning,
            space = space,
            algo = tpe.suggest,
            max_evals = 50)

# Print best parameters
best_params = space_eval(space, best)

#########################                           
Params = {'max_depth': 8, 'gamma': '0.643', 'subsample': '0.20', 'reg_alpha': '0.015', 'reg_lambda': '0.026', 'learning_rate': '0.179', 'num_leaves': '120.000', 'colsample_bytree': '0.417', 'min_child_samples': '210.000', 'feature_fraction': '0.440', 'bagging_fraction': '0.746', 'objective': 'binary:logistic', 'eval_metric': 'error'}
Count = 1 ... score_ROC = 0.6783 ... score_F1 = 0.0247 ... score_MCC = 0.0256
Count = 2 ... score_ROC = 0.6669 ... score_F1 = 0.0360 ... score_MCC = 0.0540
Count = 3 ... score_ROC = 0.6925 ... score_F1 = 0.0344 ... score_MCC = 0.0575
Count = 4 ... score_ROC = 0.6805 ... score_F1 = 0.0277 ... score_MCC = 0.0398
Count = 5 ... score_ROC = 0.6735 ... score_F1 = 0.0190 ... score_MCC = 0.0205
Count = 6 ... score_ROC = 0.6751 ... score_F1 = 0.0168 ... score_MCC = 0.0156
Count = 7 ... score_ROC = 0.6810 ... score_F1 = 0.0382 ... score_MCC = 0.0592
Count = 8 ... score_ROC = 0.6790 ... score_F1 = 0.0391 ... score_

In [7]:
# final chosen parameter
params = {
    'max_depth': 6, 
    'gamma': '0.255', 
    'subsample': '0.90', 
    'reg_alpha': '0.027',
    'reg_lambda': '0.030', 
    'learning_rate': '0.071', 
    'num_leaves': '100.000', 
    'colsample_bytree': '0.313', 
    'min_child_samples': '200.000', 
    'feature_fraction': '0.429', 
    'bagging_fraction': '0.798', 
    'objective': 'binary:logistic', 
    'eval_metric': 'error'
}

# define the classifier
xgb_clf = xgb.XGBClassifier(random_state = 42, 
                            verbose = True, 
                            tree_method = "gpu_hist",
                            **params)

# fit the classifier
xgb_clf.fit(x_train, y_train)

# predict with the trained classifier
preds = xgb_clf.predict(x_test)

# printing the confusion matrix
con_mat = confusion_matrix(y_test.values, preds)
print(con_mat)

[[45339     0]
 [ 2180     2]]


Looking at the above *confusion matrix*, two conclusions can be drawn:-
- the number of false negatives is insanely high.
- the hyperparameter tuning may not give us the good performance. So, we need to explore new other techniques.

As, we already have tried the **StratifiedKFold**, we need to now explore other sampling algorithms, that can allow our model to learn better. One solution to this problem could be using *Majority Under-Sampling, Minority Over-Sampling*. The way we can implement this is by **SMOTE**, which stands for Synthetic Minority Over-sampling Technique. The following code snippet will implement SMOTE from scratch (to understand the algorithm better, though a library called imblearn is also available). 

This algorithm is attributed to [this post](https://medium.com/@breya.heysoftware/synthetic-minority-over-sampling-technique-smote-from-scratch-e1167f788434):-
- Identify the feature vector and its nearest neighbour
- Take the difference between the two
- Multiply the difference with a random number between 0 and 1
- Identify a new point on the line segment by adding the random number to feature vector
- Repeat the process for identified feature vector

**NOTE** - for the best performance of the SMOTE, the data should be normalized. As I used XGBoost algorithm, I didn't scaled the data. That's because, tree based algorithm are immune to scale of the dataset.

In [8]:
def n_neighbors(x_trainset):
    """
    inp: trainset without labels
    outp: indices for nearest neighbors
    """
    neighbors = NearestNeighbors(n_neighbors = 5, metric = "euclidean", algorithm = "kd_tree").fit(x_trainset)
    euclidean, indices = neighbors.kneighbors(x_trainset)
    return indices

In [9]:
def smote(x_trainset, minclass_arr):
    """
    inp: trainset without labels,
         minority class array
    outp: matrix
    """
    # get indices from n_neighbors function defined earlier
    indices = n_neighbors(x_trainset)
    matrix = []                          # initialize a matrix
    for m in range(0, len(indices)):
        temp = minclass_arr[indices[m]]
        temp = pd.DataFrame(temp)
        matrix.append([])
        for j in range(0, len(temp.columns)):
            matrix[m].append(random.choice[temp[j]])
        return matrix

In [12]:
# getting minority classes instances in the trainset
unique, counts = np.unique(y_train, return_counts = True)

In [16]:
# Create a minority_shape variable which contains the dimension of the minority class
minority_class = dict(zip(unique, counts))[1]

In [17]:
# normalizing the data


8680