# XGBoost

This notebook will train a simple XGBoost model, state-of-the-art gradient boosting algorithm, to see how well it performs on MNIST. Tree-based models are not the best choice for image data, but it's interesting to see how well it performs.

In [1]:
import numpy as np
import pandas as pd
from torchvision.datasets import MNIST
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
import joblib

SEED = 42

In [2]:
train_ds = MNIST(root="../data", train=True, download=True)
test_ds = MNIST(root="../data", train=False, download=True)

In [3]:
def dataset_to_Xy(ds):
    flattened_data = np.array([np.array(image).flatten() for image, label in ds]) / 255

    X = pd.DataFrame(flattened_data, columns=[f'pixel_{i}' for i in range(flattened_data.shape[1])])
    y = ds.targets

    return X, y

In [5]:
X, y = dataset_to_Xy(train_ds)
X_test, y_test = dataset_to_Xy(test_ds)

X.sample(5)

Unnamed: 0,pixel_0,pixel_1,pixel_2,pixel_3,pixel_4,pixel_5,pixel_6,pixel_7,pixel_8,pixel_9,...,pixel_774,pixel_775,pixel_776,pixel_777,pixel_778,pixel_779,pixel_780,pixel_781,pixel_782,pixel_783
48170,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.023529,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2256,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19457,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
56079,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=SEED)
print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)

(54000, 784) torch.Size([54000])
(6000, 784) torch.Size([6000])


In [18]:
model = XGBClassifier(
    n_estimators=200,
    learning_rate=0.08,
    max_depth=6,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.005,
    reg_lambda=0.01,
    early_stopping_rounds=10,
    random_state=SEED
)
eval_set = [(X_test, y_test)]
model.fit(X_train, y_train, eval_set=eval_set, verbose=True)

[0]	validation_0-mlogloss:2.02498
[1]	validation_0-mlogloss:1.82266
[2]	validation_0-mlogloss:1.65953
[3]	validation_0-mlogloss:1.52474
[4]	validation_0-mlogloss:1.41106
[5]	validation_0-mlogloss:1.31235
[6]	validation_0-mlogloss:1.22399
[7]	validation_0-mlogloss:1.14487
[8]	validation_0-mlogloss:1.07486
[9]	validation_0-mlogloss:1.01000
[10]	validation_0-mlogloss:0.95110
[11]	validation_0-mlogloss:0.89771
[12]	validation_0-mlogloss:0.84801
[13]	validation_0-mlogloss:0.80316
[14]	validation_0-mlogloss:0.76174
[15]	validation_0-mlogloss:0.72315
[16]	validation_0-mlogloss:0.68766
[17]	validation_0-mlogloss:0.65429
[18]	validation_0-mlogloss:0.62433
[19]	validation_0-mlogloss:0.59580
[20]	validation_0-mlogloss:0.56970
[21]	validation_0-mlogloss:0.54463
[22]	validation_0-mlogloss:0.52139
[23]	validation_0-mlogloss:0.49964
[24]	validation_0-mlogloss:0.47927
[25]	validation_0-mlogloss:0.46016
[26]	validation_0-mlogloss:0.44227
[27]	validation_0-mlogloss:0.42531
[28]	validation_0-mlogloss:0.4

In [19]:
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

0.9762

In [20]:
joblib.dump(model, '../models/xgb_baseline.joblib')

['../models/xgb_baseline.joblib']