In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt

import xgboost as xgb

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [None]:
# Run configuration: artifact version suffix, input CSV paths, and target column.
_v = '_v0'                                   # suffix embedded in the saved model's file name
fname, bname = 'data.csv', 'data_blind.csv'  # main dataset / separate "blind" evaluation set
label_name = 'Label'                         # column that holds the class label

In [None]:
# Read the full labelled dataset from the CSV configured in `fname`.
data = pd.read_csv(filepath_or_buffer=fname)

In [None]:
# Sanity check: print (rows, columns), then display the first five rows.
print(data.shape)
data.head(n=5)

In [None]:
# Separate the feature columns from the target column.
X = data.drop(columns=[label_name])
Y = data[label_name]
# Sorted distinct class labels; reused later for reports and plot tick labels.
labels = np.unique(Y)

# Stratified 70/30 split. The seed is pinned (matching the seed=0 used for
# xgb.cv) so that Restart & Run All reproduces the same train/test partition.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, stratify=Y, random_state=0
)

# Wrap both partitions in XGBoost's DMatrix container (features + labels).
dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test, label=y_test)

## CV

In [None]:
# Cross-validation runs on the complete dataset (features X, labels Y).
dcross_val = xgb.DMatrix(data=X, label=Y)

In [None]:
# Search/training budget shared by every xgb.cv and xgb.train call below.
num_round = 50       # maximum number of boosting rounds
es = 4               # early-stopping patience, in rounds
metric = 'merror'    # evaluation metric (multiclass error rate)
folds = 3            # number of CV folds

# Starting hyper-parameters; the tuning cells below overwrite max_depth,
# min_child_weight, subsample, colsample_bytree and eta with the CV winners.
params = {
    'eta': 0.1,
    'subsample': 0.5,
    'colsample_bytree': 0.3,
    'max_depth': 10,
    'min_child_weight': 1,
    'objective': 'multi:softmax',
    # multi:softmax cannot run without num_class. XGBoost expects labels to be
    # integer codes in [0, num_class), hence max + 1.
    # NOTE(review): assumes `Label` is already encoded that way -- confirm.
    'num_class': int(np.max(labels)) + 1,
    'nthread': 8,
    # 'silent' is deprecated; verbosity=0 is its replacement for quiet output.
    'verbosity': 0
}

In [None]:
# Baseline cross-validation with the starting parameters.
# Per-round progress is requested through verbose_eval/show_stdv rather than
# xgb.callback.print_evaluation, which no longer exists in current XGBoost;
# the printed output (mean metric per round, no std-dev) is the same.
cv_results = xgb.cv(
    params=params,
    dtrain=dcross_val,
    num_boost_round=num_round,
    nfold=folds,
    metrics=metric,
    early_stopping_rounds=es,
    verbose_eval=True,
    show_stdv=False,
    as_pandas=True,
    seed=0,
)

In [None]:
# Show the first five rounds of the cross-validation history.
cv_results.head(n=5)

In [None]:
# Candidate (max_depth, min_child_weight) pairs:
# depth 1..14 crossed with child weight 0.5, 1.0, ..., 9.5 (steps of one half).
gridsearch_params = [
    (depth, half_steps / 2)
    for depth in range(1, 15)
    for half_steps in range(1, 20)
]

In [None]:
# Grid search over (max_depth, min_child_weight): keep the pair with the
# lowest mean held-out score across the CV folds.
min_error = float("Inf")
best_params = None
# xgb.cv names its result columns after the metric, so derive the column from
# `metric` instead of hard-coding 'merror'.
score_col = 'test-{}-mean'.format(metric)

for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))

    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight

    # Run CV
    cv_results = xgb.cv(
        params,
        dcross_val,
        num_boost_round=num_round,
        seed=0,
        nfold=folds,
        metrics=metric,
        early_stopping_rounds=es
    )

    # Update best score
    scores = cv_results[score_col]
    mean_error = scores.min()
    # Positional argmin is 0-based; +1 converts it to a number of rounds.
    boost_rounds = int(scores.values.argmin()) + 1
    print("\terror {} for {} rounds".format(mean_error, boost_rounds))
    if mean_error < min_error:
        min_error = mean_error
        best_params = (max_depth,min_child_weight)

print("Best params: {}, {}, ERROR: {}".format(best_params[0], best_params[1], min_error))

In [None]:
# Lock in the winning tree-shape settings, then display the parameter dict.
params.update({
    'max_depth': best_params[0],
    'min_child_weight': best_params[1],
})
params

In [None]:
# Candidate (subsample, colsample_bytree) pairs: every combination of
# 0.3, 0.4, ..., 0.9 for both ratios.
gridsearch_params = [
    (row_tenths / 10., col_tenths / 10.)
    for row_tenths in range(3, 10)
    for col_tenths in range(3, 10)
]

In [None]:
# Grid search over (subsample, colsample_bytree): keep the pair with the
# lowest mean held-out score across the CV folds.
min_error = float("Inf")
best_params = None
# xgb.cv names its result columns after the metric, so derive the column from
# `metric` instead of hard-coding 'merror'.
score_col = 'test-{}-mean'.format(metric)

# We start by the largest values and go down to the smallest
for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))

    # We update our parameters
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample

    # Run CV
    cv_results = xgb.cv(
        params,
        dcross_val,
        num_boost_round=num_round,
        seed=0,
        nfold=folds,
        metrics=metric,
        early_stopping_rounds=es
    )

    # Update best score
    scores = cv_results[score_col]
    mean_error = scores.min()
    # Positional argmin is 0-based; +1 converts it to a number of rounds.
    boost_rounds = int(scores.values.argmin()) + 1
    print("\terror {} for {} rounds".format(mean_error, boost_rounds))
    if mean_error < min_error:
        min_error = mean_error
        best_params = (subsample,colsample)

print("Best params: {}, {}, ERROR: {}".format(best_params[0], best_params[1], min_error))

In [None]:
# Lock in the winning sampling ratios, then display the parameter dict.
params.update({
    'subsample': best_params[0],
    'colsample_bytree': best_params[1],
})
params

In [None]:
# Line search over the learning rate (eta): keep the value with the lowest
# mean held-out score across the CV folds.
min_error = float("Inf")
best_params = None
# xgb.cv names its result columns after the metric, so derive the column from
# `metric` instead of hard-coding 'merror'.
score_col = 'test-{}-mean'.format(metric)

for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))

    # We update our parameters
    params['eta'] = eta

    # Run CV
    cv_results = xgb.cv(
        params,
        dcross_val,
        num_boost_round=num_round,
        seed=0,
        nfold=folds,
        metrics=metric,
        early_stopping_rounds=es
    )

    # Update best score
    scores = cv_results[score_col]
    mean_error = scores.min()
    # Positional argmin is 0-based; +1 converts it to a number of rounds.
    boost_rounds = int(scores.values.argmin()) + 1
    print("\terror {} for {} rounds\n".format(mean_error, boost_rounds))
    if mean_error < min_error:
        min_error = mean_error
        best_params = eta

print("Best params: {}, ERROR: {}".format(best_params, min_error))

In [None]:
# Lock in the winning learning rate, then display the final parameter dict.
params.update({'eta': best_params})
params

In [None]:
# Fit on the training split with the tuned parameters, stopping early when the
# "Test" evaluation set has not improved for `es` rounds.
# NOTE(review): early stopping monitors the same split that is used for the
# final evaluation below -- consider a separate validation split.
model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=num_round,
    early_stopping_rounds=es,
    evals=[(dtest, "Test")],
)

In [None]:
# Re-train using the round count found by early stopping.
# best_iteration + 1 is taken as the number of rounds (best_iteration is
# presumably a 0-based index -- verify against the XGBoost version in use).
# Note this rebinds the notebook-wide `num_round`.
num_round = model.best_iteration + 1

best_model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=num_round,
    early_stopping_rounds=es,
    evals=[(dtest, "Test")],
)

In [None]:
# Persist the final booster; the file name embeds the version suffix `_v`.
best_model.save_model("".join(("xgb", _v, ".model")))

In [None]:
# Feature importance using importance_type='weight', on a 10x10 inch figure.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 10))
xgb.plot_importance(
    best_model,
    ax=ax,
    importance_type='weight',
    title='Feature importance (Weight)',
    show_values=False,
)

In [None]:
# Feature importance using importance_type='gain', on a 10x10 inch figure.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 10))
xgb.plot_importance(
    best_model,
    ax=ax,
    importance_type='gain',
    title='Feature importance (Gain)',
    show_values=False,
)

## Test

In [None]:
# Reload the saved booster from disk; this rebinds `model` to the loaded copy.
model = xgb.Booster(params={'nthread': 8})
model.load_model("".join(("xgb", _v, ".model")))

In [None]:
# Predictions of the reloaded model on the held-out test split.
y_res = model.predict(data=dtest)

In [None]:
# scikit-learn metrics take (y_true, y_pred) in that order. With the arguments
# swapped, precision and recall trade places in the report and the confusion
# matrix comes out transposed (rows would be predictions, not true classes).
acc = accuracy_score(y_test, y_res)
report = classification_report(
    y_test,
    y_res,
    labels=labels,   # pin the class order so it lines up with target_names
    digits=3,
    # target_names must be strings; `labels` holds the raw label values.
    target_names=[str(name) for name in labels],
)
# Rows are true classes, columns are predicted classes, both in `labels` order.
cf = confusion_matrix(y_test, y_res, labels=labels)

In [None]:
# Per-class report first, then the overall accuracy on the test split.
print(report)
print("Accuracy", acc, sep=" ")

In [None]:
# Confusion-matrix heatmap for the test split.
# (The original call was missing the comma between xticklabels and
# yticklabels, which is a SyntaxError.)
# NOTE(review): the axis titles assume `cf` rows are true classes and columns
# are predictions, i.e. cf = confusion_matrix(y_true, y_pred) -- confirm.
sns.heatmap(
    cf,
    xticklabels=labels,
    yticklabels=labels,
    annot=True,
    fmt="d",
)
plt.title("Confusion matrix")
plt.ylabel('True Class')
plt.xlabel('Predicted class')
plt.show()

## Blind

In [None]:
# Load the blind set and split it into features / labels the same way as the
# main dataset, then wrap it in a DMatrix for prediction.
blind = pd.read_csv(filepath_or_buffer=bname)
y_blind = blind[label_name]
x_blind = blind.drop(columns=[label_name])
dblind = xgb.DMatrix(data=x_blind, label=y_blind)

In [None]:
# Reload the saved booster from disk so this section does not depend on the
# in-memory model from the cells above.
model = xgb.Booster(params={'nthread': 8})
model.load_model("".join(("xgb", _v, ".model")))

In [None]:
# Predictions of the reloaded model on the blind set.
y_pred = model.predict(data=dblind)

In [None]:
# Score the blind-set predictions. The original referenced an undefined name
# (`y_b`); the predictions computed for the blind set are in `y_pred`.
# scikit-learn metrics take (y_true, y_pred) in that order, so the true labels
# come first -- otherwise precision/recall swap and the matrix is transposed.
acc = accuracy_score(y_blind, y_pred)
report = classification_report(
    y_blind,
    y_pred,
    labels=labels,   # pin the class order so it lines up with target_names,
                     # even if some class is absent from the blind set
    digits=3,
    # target_names must be strings; `labels` holds the raw label values.
    target_names=[str(name) for name in labels],
)
# Rows are true classes, columns are predicted classes, both in `labels` order.
cf = confusion_matrix(y_blind, y_pred, labels=labels)

In [None]:
# Per-class report first, then the overall accuracy on the blind set.
print(report)
print("Accuracy", acc, sep=" ")

In [None]:
# Confusion-matrix heatmap for the blind set, annotated with integer counts.
sns.heatmap(
    cf,
    annot=True,
    fmt="d",
    xticklabels=labels,
    yticklabels=labels,
)
plt.xlabel('Predicted class')
plt.ylabel('True Class')
plt.title("Confusion matrix")
plt.show()