# Problem 3

In [1]:
import numpy as np
import pandas as pd
import pickle

from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb

import matplotlib.pyplot as plt

%matplotlib inline

already_loaded = 0

In [2]:
if already_loaded:
    openml_mnist = pickle.load(open('mnist.pkl', 'rb'))
else:
    openml_mnist = fetch_openml('mnist_784', as_frame=True)
    pickle.dump(openml_mnist, open('mnist.pkl', 'wb'))

In [3]:
X_full = openml_mnist.data.values
Y_full = openml_mnist.target.values

X_train, X_test, Y_train, Y_test = train_test_split(X_full, Y_full, test_size=0.25)

We first use randomized search over a grid of hyperparameters to train a Random Forest Classifier.

In [4]:
#Grid of parameters to train the Random Forest over

param_grid = {
    'rfc__n_estimators': [10,50,100,200,300,500],
    'rfc__max_depth': [10,20,30,40,50,None],
    'rfc__min_samples_split': [2, 5, 10],
    'rfc__min_samples_leaf': [1, 2, 4]
}
pipe = Pipeline([('scaler', StandardScaler()), ('rfc', RandomForestClassifier())])
rforest = RandomizedSearchCV(pipe, param_grid, cv=3, n_iter=100, verbose=2, n_jobs=2)

In [5]:
rforest.fit(X_train, Y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed: 22.5min
[Parallel(n_jobs=2)]: Done 158 tasks      | elapsed: 99.4min
[Parallel(n_jobs=2)]: Done 300 out of 300 | elapsed: 198.2min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [6]:
print(rforest.best_params_)

{'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 40}


In [7]:
rforest_preds = rforest.predict(X_test)

acc = accuracy_score(Y_test, rforest_preds)
print('Accuracy with best Random Forest: {0:.3f}'.format(acc))

Accuracy with best Random Forest: 0.969


We can see that the results are similar to the previous methods. We do the same, to evaluate our XGB model:

In [4]:
param_grid = {
    'xgb__n_estimators': np.arange(10,51,10),
    'xgb__learning_rate': [0.3, 0.4, 0.5],
    'xgb__max_depth' : [2, 5, 10, 15],
    'xgb__colsample_bytree': [0.3,0.5],
}
pipe = Pipeline([('scaler', StandardScaler()), ('xgb', xgb.XGBClassifier())])
xgb_model = RandomizedSearchCV(pipe, param_grid, cv=3, n_iter=100, verbose=2, n_jobs=2)

In [5]:
xgb_model.fit(X_train,Y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed: 39.5min
[Parallel(n_jobs=2)]: Done 158 tasks      | elapsed: 196.2min
[Parallel(n_jobs=2)]: Done 300 out of 300 | elapsed: 379.0min finished


RandomizedSearchCV(cv=3,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints=None,
                                           n_estimators=100,...obs=None,
                                           num_parallel_tree=None,
                                           random_state=None, reg_alpha=None,
                           

In [6]:
print(xgb_model.best_params_)

{'n_estimators': 50, 'max_depth': 10, 'learning_rate': 0.3, 'colsample_bytree': 0.5}


In [7]:
xgb_preds = xgb_model.predict(X_test)

acc = accuracy_score(Y_test, xgb_preds)
print('Accuracy with best Gradient Boosting: {0:.3f}'.format(acc))

Accuracy with best Gradient Boosting: 0.974


The results are slightly better in this case, though this is also affected by the hyperparameter search. In any case, the accuracy for the MNIST dataset is very high.