# Searching for the optimal parameters

In [1]:
#!conda install -y xgboost numpy pandas scipy

In [2]:
#pip install -U joblib

In [3]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn import metrics
import numpy as np
import pandas as pd
import pickle
from sklearn.datasets import fetch_openml

  from pandas import MultiIndex, Int64Index


In [4]:
from sklearn.model_selection import GridSearchCV

In [5]:
# Fetch version 1 of the OpenML dataset "mnist_784" and list the fields of the returned object.
mnist = fetch_openml(name='mnist_784', version=1)
mnist.keys()

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [6]:
# Separate the pixel features from the digit labels.
X = mnist["data"]
# NOTE(review): the OpenML copy of MNIST is expected to deliver the labels as
# string categories ('0'..'9') -- confirm with mnist["target"].dtype. Casting to
# small integers gives every downstream estimator and metric numeric class
# labels 0..9 (recent XGBoost releases reject non-integer class labels).
y = mnist["target"].astype(np.uint8)
print(X.shape)
print(y.shape)

(70000, 784)
(70000,)


In [7]:
# Hold out 10% of the samples as the test set; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

## Data Augmentation

In [8]:
from scipy.ndimage import shift

# View each 784-pixel row as a 28x28 image and move it by +1 along axis 1
# (the image rows); pixels shifted in from outside the image are filled with 0.
# reshape(-1, ...) infers the number of images instead of hard-coding 63000,
# so the cell keeps working if the split size or the dataset changes.
# Run this before X_train is rebound to the augmented set further down.
X_aug_down = shift(np.array(X_train).reshape(-1, 28, 28), [0, 1, 0], cval=0)
X_aug_down.shape

(63000, 28, 28)

In [9]:
# Same augmentation with a shift of -1 along axis 1 (the image rows); vacated
# pixels are filled with 0. reshape(-1, ...) infers the number of images
# instead of hard-coding 63000.
X_aug_up = shift(np.array(X_train).reshape(-1, 28, 28), [0, -1, 0], cval=0)
X_aug_up.shape

(63000, 28, 28)

In [10]:
# Shift every image by +1 along axis 2 (the image columns); vacated pixels are
# filled with 0. reshape(-1, ...) infers the number of images instead of
# hard-coding 63000.
X_aug_right = shift(np.array(X_train).reshape(-1, 28, 28), [0, 0, 1], cval=0)
X_aug_right.shape

(63000, 28, 28)

In [11]:
# Shift every image by -1 along axis 2 (the image columns); vacated pixels are
# filled with 0. reshape(-1, ...) infers the number of images instead of
# hard-coding 63000.
X_aug_left = shift(np.array(X_train).reshape(-1, 28, 28), [0, 0, -1], cval=0)
X_aug_left.shape

(63000, 28, 28)

In [12]:
# Stack the four shifted copies along the sample axis, in the order
# down, up, right, left.
shifted_copies = [X_aug_down, X_aug_up, X_aug_right, X_aug_left]
X_temp = np.concatenate(shifted_copies, axis=0)
X_temp.shape

(252000, 28, 28)

In [13]:
# Augmented training features: the original rows followed by the shifted copies.
# Each shifted image is flattened back to one row; the row count is taken from
# X_temp itself rather than hard-coded, so the cell adapts to any split size.
X_aug = np.concatenate((X_train, X_temp.reshape(len(X_temp), -1)))
X_aug.shape

(315000, 784)

In [14]:
# Labels for the augmented set: one copy of y_train for the original rows plus
# one for each of the four shifted copies, in the same order as X_aug.
y_aug = np.concatenate([y_train] * 5)
y_aug.shape

(315000,)

In [15]:
# From here on the training set IS the augmented set. Because the names are
# rebound, the augmentation cells above must not be re-run after this cell.
X_train, y_train = X_aug, y_aug

# Sanity check: features and labels must have the same number of rows.
print(len(X_train))
print(len(y_train))

315000
315000


## Preprocessing

In [16]:
# Standardize: learn per-feature mean/variance on the training data only, then
# apply the same transformation to the test data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))
# X_train is a plain ndarray after augmentation while X_test appears to still be
# the DataFrame slice from train_test_split (confirm); converting it to an
# ndarray keeps both inputs the same kind of object and avoids a feature-name
# mismatch between fit and transform.
X_test_scaled = scaler.transform(np.asarray(X_test, dtype=np.float64))

# Alternative kept from the original notebook: skip scaling altogether.
#X_train_scaled = X_train
#X_test_scaled = X_test



## XGBoost

There are in general two ways that you can control overfitting in XGBoost:

- The first way is to directly control model complexity.

    - This includes max_depth, min_child_weight and gamma.

- The second way is to add randomness to make training robust to noise.

    - This includes subsample and colsample_bytree.

    - You can also reduce stepsize eta. Remember to increase num_round when you do so.

[source](https://xgboost.readthedocs.io/en/stable/tutorials/param_tuning.html#:~:text=There%20are%20in,you%20do%20so.)

In [17]:
# Base estimator for the search: an XGBoost classifier with library defaults;
# the hyper-parameters listed in the grid are overridden during the search.
model = XGBClassifier()

Gridsearch suggestions:  
https://towardsdatascience.com/doing-xgboost-hyper-parameter-tuning-the-smart-way-part-1-of-2-f6d255a45dde

This notebook follows that example but uses a much smaller grid to save time.


In [18]:
# Hyper-parameter grid: every combination of the listed values is evaluated
# (3 * 2 * 1 * 1 * 2 = 12 candidates).
param_grid = dict(
    learning_rate=[0.10, 0.15, 0.30],
    max_depth=[3, 15],
    min_child_weight=[1],
    gamma=[0.0],
    colsample_bytree=[0.3, 0.5],
)

In [19]:
# Exhaustive grid search over param_grid with 2-fold cross-validation (kept
# low to limit run time), using every available core.
xgb_grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=2,
    n_jobs=-1,
    verbose=0,
)

In [None]:
# Run the grid search on the scaled, augmented training set (slow: every
# candidate in the grid is fitted once per cross-validation fold).
xgb_grid.fit(X=X_train_scaled, y=y_train)

In [None]:
# Report the winning parameter combination, its cross-validated score, and
# the score of the refitted best model on the held-out test set.
print("Best parameters:", xgb_grid.best_params_, sep="\n")
print("Best score in grid search:", xgb_grid.best_score_, sep="\n")
print("best model from grid search:", xgb_grid.score(X_test_scaled, y_test), sep="\n")

In [None]:
# Predict the test set with the best model found by the search, then show the
# first ten predictions followed by the first ten true labels.
y_preds = xgb_grid.predict(X_test_scaled)
for labels in (y_preds, y_test):
    print(list(labels[:10]))

In [None]:
# Evaluate on the held-out test set: overall accuracy, then macro-averaged
# (unweighted mean over the classes) precision, recall and F1.
print('Accuracy:', metrics.accuracy_score(y_test, y_preds))
macro_scorers = {
    'Precision:': metrics.precision_score,
    'Recall:': metrics.recall_score,
    'F1 Score:': metrics.f1_score,
}
for label, scorer in macro_scorers.items():
    print(label, scorer(y_test, y_preds, average='macro'))

## Pickle the model

In [None]:
# xgboost
# Persist the fitted grid-search object. The context manager guarantees the
# file is closed even if pickling raises part-way through.
with open('xgb_grid_model.pkl', 'wb') as f:
    pickle.dump(xgb_grid, f)

## END