In [None]:
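# XGBoost multiclass classification (MNIST data loaded via load_mnist)
# Python API reference: https://xgboost.readthedocs.io/en/latest/python/index.html
# Reference example: https://github.com/dmlc/xgboost/blob/master/demo/multiclass_classification/train.py#L35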

In [1]:
from sklearn.model_selection import KFold
from load_mnist import load_mnist
import xgboost as xgb
import numpy as np
from sklearn.preprocessing import StandardScaler
import time
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import itertools
from sklearn.metrics import precision_recall_fscore_support

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Draw a confusion matrix as an annotated heat map on the current pyplot figure.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix. Cell ``cm[i, j]`` is drawn at row ``i`` (y axis,
        labelled "True label") and column ``j`` (x axis, "Predicted label").
    classes : sequence
        Tick labels for both axes, one entry per class.
    normalize : bool, default False
        When True, every row is divided by its row sum before plotting.
        Rows whose sum is zero are left as all zeros instead of becoming NaN.
    title : str
        Title placed above the plot.
    cmap : matplotlib colormap
        Colormap passed to ``plt.imshow``.

    Returns
    -------
    None
        The function only draws; it neither creates a figure nor calls
        ``plt.show()``.
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Divide only where the row total is non-zero; the pre-zeroed `out`
        # keeps empty rows at 0.0 so the colour threshold below stays finite.
        cm = np.divide(cm.astype(float), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # 'd' is only valid for integer cells; any float matrix (normalized or
    # not) is annotated with two decimals instead of raising ValueError.
    fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'
    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()




# Load the "train" and "t10k" splits through the project-local load_mnist helper.
train_data, train_label = load_mnist("", "train")
test_data, test_label = load_mnist("", "t10k")

# Fit the scaler on the training split only, then push both splits through the
# object returned by fit() so the test split reuses the training statistics.
sc = StandardScaler()
X_main_std = sc.fit(train_data)
train_data, test_data = (X_main_std.transform(split)
                         for split in (train_data, test_data))





In [2]:

# The data-loading cell already fits `sc` on the training split and applies it
# to both splits. Fitting a second scaler here on those already-standardized
# arrays was a redundant pass that also replaced `sc` / `X_main_std` with a
# scaler fitted on scaled data, so the refit is dropped. Cheap sanity checks on
# the arrays the model will consume take its place.
assert train_data.shape[1] == test_data.shape[1], "train/test feature counts differ"
assert np.isfinite(train_data).all(), "non-finite values in scaled training data"
assert np.isfinite(test_data).all(), "non-finite values in scaled test data"

In [3]:
# Booster parameters and the number of boosting rounds, gathered in one place.
# NOTE(review): 'gpu_exact' and 'silent' may not be accepted by newer XGBoost
# releases -- confirm against the installed version before upgrading.
param = {
    'tree_method': 'gpu_exact',  # CPU alternative: 'exact'
    'objective': 'multi:softmax',
    'eta': 0.3,
    'num_class': 10,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'eval_metric': 'merror',
    'silent': 1,
    'max_depth': 2,
}
num_round = 2

In [10]:
# K-fold cross-validation over the training data.
# Each fold trains a booster on the fold's training part, monitors the fold's
# validation part, and is then scored against the fixed test set. Accuracy is
# collected per fold in `accs`; per-class precision / recall / F1 are summed
# across folds into `precisions`, `recalls` and `f1s`.
# `time` is already imported in the first cell, so it is not re-imported here.
n_splits = 10
n_classes = param['num_class']
# Rounds without validation improvement before training stops; this can only
# take effect when num_round is larger than it.
early_stopping = 5

kf = KFold(n_splits=n_splits)
dtest = xgb.DMatrix(test_data, label=test_label)

accs = []
precisions = np.zeros((n_classes,))
recalls = np.zeros((n_classes,))
f1s = np.zeros((n_classes,))

time1 = time.perf_counter()  # monotonic clock: safe for measuring elapsed time
for train_index, val_index in kf.split(train_data):
    x_train = train_data[train_index]
    y_train = train_label[train_index]
    x_valid = train_data[val_index]
    y_valid = train_label[val_index]
    dtrain = xgb.DMatrix(x_train, label=y_train)
    dval = xgb.DMatrix(x_valid, label=y_valid)
    evallist = [(dtrain, 'train'), (dval, 'validation')]

    # training
    bst = xgb.train(param, dtrain, num_round, evallist,
                    early_stopping_rounds=early_stopping)

    # predicting (`pred` from the last fold stays available after the loop)
    pred = bst.predict(dtest)
    # Drop the only reference so the booster is released before the next fold,
    # instead of invoking the __del__ dunder by hand.
    del bst

    acc_rate = np.mean(pred == test_label)
    accs.append(acc_rate)

    # Explicit labels pin the result length to n_classes even if some class is
    # missing from both the labels and the predictions, so the accumulators
    # below always line up.
    pr, re, f1, _ = precision_recall_fscore_support(
        test_label, pred, labels=np.arange(n_classes))
    precisions += pr
    recalls += re
    f1s += f1

time2 = time.perf_counter()
total_time = time2 - time1

[0]	train-merror:0.322704	validation-merror:0.327167
Multiple eval metrics have been passed: 'validation-merror' will be used for early stopping.

Will train until validation-merror hasn't improved in 5 rounds.
[1]	train-merror:0.281926	validation-merror:0.28
[0]	train-merror:0.324241	validation-merror:0.3255
Multiple eval metrics have been passed: 'validation-merror' will be used for early stopping.

Will train until validation-merror hasn't improved in 5 rounds.
[1]	train-merror:0.284611	validation-merror:0.2855
[0]	train-merror:0.328352	validation-merror:0.325833
Multiple eval metrics have been passed: 'validation-merror' will be used for early stopping.

Will train until validation-merror hasn't improved in 5 rounds.
[1]	train-merror:0.284667	validation-merror:0.288833
[0]	train-merror:0.325926	validation-merror:0.3295
Multiple eval metrics have been passed: 'validation-merror' will be used for early stopping.

Will train until validation-merror hasn't improved in 5 rounds.
[1]	tra

In [11]:
# Show the elapsed seconds measured around the cross-validation loop.
total_time
# Hand-recorded values, presumably timings from earlier runs -- TODO confirm:
# 49.29649209976196
# 46.22644758224487

46.104421615600586

In [12]:
# Average the per-fold metrics accumulated by the cross-validation cell.
# The averages go into new names rather than dividing the accumulators in
# place, so re-running this cell prints the same numbers every time.
# The fold count is taken from `accs` (one entry per fold) instead of being
# hard-coded.
n_folds = len(accs)
mean_accuracy = sum(accs) / n_folds
mean_precisions = precisions / n_folds
mean_recalls = recalls / n_folds
mean_f1s = f1s / n_folds

print(mean_accuracy)
print(mean_precisions)
print(mean_recalls)
print(mean_f1s)

0.7051000000000001
[0.76748962 0.96232645 0.62671287 0.57747694 0.52137085 0.85139799
 0.46966386 0.76415181 0.79968081 0.80060026]
[0.6265 0.8478 0.6873 0.8195 0.4557 0.7024 0.4512 0.8593 0.6952 0.9061]
[0.68974685 0.9014337  0.65557932 0.67750915 0.48622879 0.76970446
 0.46015818 0.80892258 0.74364037 0.85001219]


In [None]:
# Confusion matrix and per-class scores for whatever predictions `pred`
# currently holds, compared against the test labels.
digit_classes = list(range(10))
cnf_matrix = confusion_matrix(test_label, pred)
print(cnf_matrix)

# From here on numpy prints floats with three decimals.
np.set_printoptions(precision=3)
prf_support = precision_recall_fscore_support(test_label, pred)
print(prf_support)

# Raw-count heat map (no row normalization).
fig_size = (8, 6)
plt.figure(figsize=fig_size)
plot_confusion_matrix(
    cnf_matrix,
    classes=digit_classes,
    title='Confusion matrix, without normalization',
)

plt.show()
