# XGBoost

## Import Libraries and Data Loading

In [50]:
import pandas as pd
import numpy as np
from sklearn.metrics import log_loss, precision_score
from sklearn.model_selection import  GridSearchCV
from sklearn.preprocessing import StandardScaler

train = pd.read_feather('Final_train_dataset.feather')
validation = pd.read_feather('Final_validation_dataset.feather')
test = pd.read_feather('Final_test_dataset.feather')

## Split Data into X(Feature) and Y(Class)

In [51]:
X_train = train.drop(['is_duplicate'],  axis=1)
y_train = train['is_duplicate']
X_val = validation.drop(['is_duplicate'],  axis=1)
y_val = validation['is_duplicate']
X_test = test.drop(['is_duplicate'],  axis=1)
y_test = test['is_duplicate']

## Drop Unecessary Columns (Non Important Features) 

In [52]:
impt_feat = ['freq_q2', 'freq_q1+q2', 'freq_q1', 'jaccard_dist', 'Levenshtein',
       'diff_tfidf_L2', 'diff_tfidf_L1', 'common_ratio', 'levenshtein',
       'fuzz_qratio', 'dist_canberra', 'length_diff', 'lc_substring',
       'lc_subsequence', 'freq_q1-q2', 'same_ending', 'wmdistance',
       'dist_cosine', 'dist_cityblock', 'dist_euclidean', 'dist_minkowski',
       'q1_vec_0', 'q1_vec_1', 'q1_vec_2', 'q1_vec_3', 'q1_vec_4', 'q2_vec_0',
       'q2_vec_1', 'q2_vec_2', 'q2_vec_3', 'q2_vec_4', 'diff_tfidf_L2_norm', 'diff_tfidf_L1_norm',
       'q2_word_to_vec', 'total_length']

X_train = X_train[impt_feat]
X_val = X_val[impt_feat]
X_test = X_test[impt_feat]

## Scale Data

In [53]:
sc = StandardScaler()
sc.fit(X_train)
scale = X_train.columns.tolist()

X_train_scaled = X_train.copy()
X_val_scaled = X_val.copy()
X_test_scaled = X_test.copy()
X_train_scaled[scale] = StandardScaler().fit_transform(X_train[scale])
X_val_scaled[scale] = StandardScaler().fit_transform(X_val_scaled[scale])
X_test_scaled[scale] = StandardScaler().fit_transform(X_test_scaled[scale])

## 1. Building Base XGBoost

In [54]:
xgb = XGBClassifier(random_state = 1)
xgb.fit(X_train_scaled, y_train)

In [64]:
def evaluate(model, x_train, y_train, x_val, y_val, x_test, y_test):
    preds_train = model.predict(x_train)
    preds_prob_train = model.predict_proba(x_train)
    preds_val = model.predict(x_val)
    preds_prob_val = model.predict_proba(x_val)
    preds_test = model.predict(x_test)
    preds_prob_test = model.predict_proba(x_test)
    print("The train log loss is:", log_loss(y_train, preds_prob_train))
    print("The train precision is:", precision_score(y_train, preds_train))
    print("The validation log loss is:", log_loss(y_val, preds_prob_val))
    print("The validation precision is:", precision_score(y_val, preds_val))
    print("The test log loss is:", log_loss(y_test, preds_prob_test))
    print("The test precision is:", precision_score(y_test, preds_test))
    return preds_train, preds_prob_train, preds_val, preds_prob_val, preds_test, preds_prob_test


preds_train, preds_prob_train, preds_val, preds_prob_val, preds_test, preds_prob_test = evaluate(xgb, X_train_scaled, y_train, X_val_scaled, y_val, X_test_scaled, y_test)

The train log loss is: 0.28676031712518535
The train precision is: 0.8407997462242038
The validation log loss is: 0.36527218022652663
The validation precision is: 0.8354134865762772
The test log loss is: 0.36119589164132737
The test precision is: 0.8092410314813309


## 2 Tune hyperparameter to boost predictive power

### Set initial value

In [56]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
params = {}
params["objective"] = "binary:logistic"
params["eval_metric"] = "logloss"
params["eta"] = 0.1
params["gamma"] = 2
params["tree_method"] = "gpu_hist"
params["max_bin"] = 256
params["max_depth"] = 6 
params["min_child_weight"] = 3
params["subsample"] = 0.9
params["colsample_bytree"] = 0.7
params['learning_rate'] = 0.1

dtrain = xgb.DMatrix(X_train_scaled, label = y_train)
model1 = xgb.cv(params, dtrain, num_boost_round = 1000, nfold = 10, 
                early_stopping_rounds = 200, verbose_eval = 50)

[0]	train-logloss:0.65071+0.00009	test-logloss:0.65083+0.00023
[50]	train-logloss:0.36455+0.00024	test-logloss:0.36925+0.00311
[100]	train-logloss:0.34782+0.00043	test-logloss:0.35691+0.00301
[150]	train-logloss:0.33789+0.00049	test-logloss:0.35140+0.00301
[200]	train-logloss:0.33076+0.00038	test-logloss:0.34849+0.00303
[250]	train-logloss:0.32432+0.00044	test-logloss:0.34640+0.00304
[300]	train-logloss:0.31846+0.00049	test-logloss:0.34496+0.00308
[350]	train-logloss:0.31319+0.00044	test-logloss:0.34390+0.00302
[400]	train-logloss:0.30838+0.00045	test-logloss:0.34318+0.00308
[450]	train-logloss:0.30402+0.00058	test-logloss:0.34257+0.00311
[500]	train-logloss:0.30038+0.00056	test-logloss:0.34217+0.00309
[550]	train-logloss:0.29739+0.00055	test-logloss:0.34188+0.00313
[600]	train-logloss:0.29509+0.00053	test-logloss:0.34171+0.00320
[650]	train-logloss:0.29319+0.00065	test-logloss:0.34159+0.00320
[700]	train-logloss:0.29160+0.00075	test-logloss:0.34146+0.00321
[750]	train-logloss:0.29007+

## Tune max_depth and min_child_weight

In [57]:
params = {}
params["objective"] = "binary:logistic"
params["eval_metric"] = "logloss"
params["eta"] = 0.1
params["gamma"] = 2
params["tree_method"] = "gpu_hist"
params["max_bin"] = 256
#params["max_depth"] = 6 
#params["min_child_weight"] = 3
params["subsample"] = 0.9
params["colsample_bytree"] = 0.7
params['learning_rate'] = 0.1

evaluation_list = []
for depth in [5, 6]:
    for child_weight in [1, 3, 4]:
        params = {**params, **{"max_depth": depth, "min_child_weight": child_weight}}
        evaluation = xgb.cv(params, dtrain, num_boost_round = 1000, nfold = 6, early_stopping_rounds = 100)
        evaluation_list.append(evaluation)
        
evaluation_panel = pd.DataFrame()
for evaluation in evaluation_list:
    evaluation_panel = pd.concat([evaluation_panel, evaluation.iloc[-1, :]], axis = 1)
evaluation_panel

Unnamed: 0,999,999.1,999.2,999.3,999.4,999.5
train-logloss-mean,0.299813,0.301003,0.301562,0.280293,0.283133,0.283877
train-logloss-std,0.000373,0.000531,0.000602,0.001268,0.000756,0.00076
test-logloss-mean,0.342355,0.342515,0.342678,0.341741,0.341739,0.34182
test-logloss-std,0.002013,0.002036,0.001845,0.00202,0.001931,0.00182


Max depth = 6
min weight = 3

## Tune gamma

In [58]:
params = {}
params["objective"] = "binary:logistic"
params["eval_metric"] = "logloss"
params["eta"] = 0.1
#params["gamma"] = 2
params["tree_method"] = "gpu_hist"
params["max_bin"] = 256
params["max_depth"] = 6 
params["min_child_weight"] = 4
params["subsample"] = 0.9
params["colsample_bytree"] = 0.7
params['learning_rate'] = 0.1

evaluation_list = []
for gamma in [0, 1, 2, 3, 4, 5]:
    params = {**params, **{"gamma": gamma}}
    evaluation = xgb.cv(params, dtrain, num_boost_round = 1000, nfold = 6, early_stopping_rounds = 100)
    evaluation_list.append(evaluation)
        
evaluation_panel = pd.DataFrame()
for evaluation in evaluation_list:
    evaluation_panel = pd.concat([evaluation_panel, evaluation.iloc[-1, :]], axis = 1)
evaluation_panel

Unnamed: 0,999,830,999.1,999.2,999.3,999.4
train-logloss-mean,0.258529,0.270993,0.283877,0.30337,0.31518,0.322529
train-logloss-std,0.000493,0.00025,0.00076,0.000843,0.000449,0.000463
test-logloss-mean,0.341388,0.341488,0.34182,0.343123,0.344749,0.346142
test-logloss-std,0.001851,0.001869,0.00182,0.002048,0.001908,0.002


Gamma = 0

##  Tune subsample and colsample_bytree

In [59]:
params = {}
params["objective"] = "binary:logistic"
params["eval_metric"] = "logloss"
params["eta"] = 0.1
params["gamma"] = 0
params["tree_method"] = "gpu_hist"
params["max_bin"] = 256
params["max_depth"] = 6 
params["min_child_weight"] = 4
#params["subsample"] = 0.9
#params["colsample_bytree"] = 0.8
params['learning_rate'] = 0.1

evaluation_list = []
for row in [0.7, 0.8, 0.9]:
    for col in [0.7, 0.8, 0.9]:
        params = {**params, **{"subsample": row, "colsample_bytree": col}}
        evaluation = xgb.cv(params, dtrain, num_boost_round = 1000, nfold = 6, 
                            early_stopping_rounds = 100)
        evaluation_list.append(evaluation)

evaluation_panel = pd.DataFrame()
for evaluation in evaluation_list:
    evaluation_panel = pd.concat([evaluation_panel, evaluation.iloc[-1, :]], axis = 1)
evaluation_panel

Unnamed: 0,652,638,658,813,806,750,888,894,788
train-logloss-mean,0.283113,0.282735,0.280057,0.269761,0.268515,0.271965,0.264927,0.262667,0.270172
train-logloss-std,0.000421,0.000328,0.000446,0.00067,0.000437,0.000522,0.000638,0.000486,0.000335
test-logloss-mean,0.342636,0.342925,0.342477,0.342159,0.342126,0.341971,0.341612,0.341395,0.341673
test-logloss-std,0.001979,0.001941,0.002241,0.002194,0.001861,0.002266,0.001908,0.002147,0.001973


subsample = 0.9, colsample_bytree = 0.8

## Tune Learning Rate

In [60]:
params = {}
params["objective"] = "binary:logistic"
params["eval_metric"] = "logloss"
params["eta"] = 0.1
params["gamma"] = 2
params["tree_method"] = "gpu_hist"
params["max_bin"] = 256
params["max_depth"] = 6 
params["min_child_weight"] = 4
params["subsample"] = 0.9
params["colsample_bytree"] = 0.8
#params['learning_rate'] = 0.1

evaluation_list = []
for learning_rate in [0.01, 0.1, 0.2, 0.3]:
    params = {**params, **{"learning_rate": learning_rate}}
    evaluation = xgb.cv(params, dtrain, num_boost_round = 1000, nfold = 6, early_stopping_rounds = 100)
    evaluation_list.append(evaluation)

evaluation_panel = pd.DataFrame()
for evaluation in evaluation_list:
    evaluation_panel = pd.concat([evaluation_panel, evaluation.iloc[-1, :]], axis = 1)
evaluation_panel

Unnamed: 0,999,999.1,306,168
train-logloss-mean,0.347222,0.281152,0.290207,0.299174
train-logloss-std,0.000333,0.000581,0.00069,0.000539
test-logloss-mean,0.356846,0.341734,0.344974,0.347961
test-logloss-std,0.00158,0.001936,0.002162,0.001714


learning_rate = 0.1

## Tune Max bin

In [62]:
params = {}
params["objective"] = "binary:logistic"
params["eval_metric"] = "logloss"
params["eta"] = 0.1
params["gamma"] = 2
params["tree_method"] = "gpu_hist"
#params["max_bin"] = 256
params["max_depth"] = 6 
params["min_child_weight"] = 4
params["subsample"] = 0.9
params["colsample_bytree"] = 0.8
params['learning_rate'] = 0.1

evaluation_list = []
for bin in [200, 230, 256, 280]:
    params = {**params, **{"max_bin": bin}}
    evaluation = xgb.cv(params, dtrain, num_boost_round = 1000, nfold = 6, early_stopping_rounds = 100)
    evaluation_list.append(evaluation)

evaluation_panel = pd.DataFrame()
for evaluation in evaluation_list:
    evaluation_panel = pd.concat([evaluation_panel, evaluation.iloc[-1, :]], axis = 1)
evaluation_panel

Unnamed: 0,999,999.1,999.2,999.3
train-logloss-mean,0.283256,0.282271,0.281919,0.281207
train-logloss-std,0.000838,0.000828,0.00082,0.001068
test-logloss-mean,0.341643,0.341637,0.341871,0.34195
test-logloss-std,0.001943,0.001898,0.002005,0.002137


max depth = 230

## 3. Building Final XGBoost Model

In [63]:
xgb = XGBClassifier(random_state = 1, objective = "binary:logistic", eval_metric = "logloss", eta = 0.1, gamma = 2 , tree_method = "gpu_hist", 
                    max_bin = 230, max_depth = 6, min_child_weight = 4, subsample = .9, colsample_bytree = .8, learning_rate = .1,
                    n_estimators = 1000)
    
xgb.fit(X_train_scaled, y_train)

In [55]:
    evaluate(xgb, X_train_scaled, y_train, X_val_scaled, y_val, X_test_scaled, y_test)

The train log loss is: 0.3191114700789557
The train precision is: 0.8093476144109055
The validation log loss is: 0.3475345079939476
The validation precision is: 0.7850367306183582
The test log loss is: 0.35228207652931276
The test precision is: 0.7791057367829022


(array([0, 1, 1, ..., 0, 0, 0]),
 array([[9.4799471e-01, 5.2005291e-02],
        [4.5498025e-01, 5.4501975e-01],
        [3.1273252e-01, 6.8726748e-01],
        ...,
        [7.8219104e-01, 2.1780893e-01],
        [7.7900147e-01, 2.2099854e-01],
        [9.9940223e-01, 5.9778703e-04]], dtype=float32),
 array([0, 0, 1, ..., 0, 0, 1]),
 array([[0.5817892 , 0.4182108 ],
        [0.5515315 , 0.4484685 ],
        [0.36105275, 0.63894725],
        ...,
        [0.9772966 , 0.02270339],
        [0.7361076 , 0.26389238],
        [0.44250673, 0.55749327]], dtype=float32),
 array([0, 1, 0, ..., 0, 1, 1]),
 array([[7.8975135e-01, 2.1024866e-01],
        [4.2213517e-01, 5.7786483e-01],
        [9.9991572e-01, 8.4295418e-05],
        ...,
        [6.6229057e-01, 3.3770946e-01],
        [7.3259056e-02, 9.2674094e-01],
        [3.4408629e-01, 6.5591371e-01]], dtype=float32))

## Output predicted probabilities for final model 

In [42]:
train_probs_0 = [x[0] for x in preds_prob_train]
test_probs_0 = [x[0] for x in preds_prob_test]

pd.DataFrame(train_probs_0).to_csv('predictions_full_xgboost_train.csv', index = False)    
pd.DataFrame(test_probs_0).to_csv('predictions_full_xgboost_test.csv', index = False)     