# Problem 4

The process used here is the same as the one used in problem 3, except for CIFAR instead of MNIST. The results can be seen in the following:

In [1]:
import numpy as np
import pandas as pd
import pickle

from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb

import matplotlib.pyplot as plt

%matplotlib inline

already_loaded = 1

In [2]:
if already_loaded:
    openml_mnist = pickle.load(open('cifar.pkl', 'rb'))
else:
    openml_mnist = fetch_openml('CIFAR_10_small', as_frame=True)
    pickle.dump(openml_mnist, open('cifar.pkl', 'wb'))

In [3]:
X_full = openml_mnist.data.values
Y_full = openml_mnist.target.values

X_train, X_test, Y_train, Y_test = train_test_split(X_full, Y_full, test_size=0.25)

In [4]:
#Grid of parameters to train the Random Forest over
param_grid = {
    'rfc__n_estimators': [10,50,100,200,300,500],
    'rfc__max_depth': [10,20,30,40,50,None],
    'rfc__min_samples_split': [2, 5, 10],
    'rfc__min_samples_leaf': [1, 2, 4]
}
pipe = Pipeline([('scaler', StandardScaler()), ('rfc', RandomForestClassifier())])
rforest = RandomizedSearchCV(pipe, param_grid, cv=3, n_iter=50, verbose=2, n_jobs=2)

In [5]:
rforest.fit(X_train, Y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed: 19.4min
[Parallel(n_jobs=2)]: Done 150 out of 150 | elapsed: 97.2min finished


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=50,
                   n_jobs=2,
                   param_distributions={'max_depth': [10, 20, 30, 40, 50, None],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [10, 50, 100, 200, 300,
                                                         500]},
                   verbose=2)

In [6]:
print(rforest.best_params_)

{'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_depth': 40}


In [7]:
rforest_preds = rforest.predict(X_test)

acc = accuracy_score(Y_test, rforest_preds)
print('Accuracy with best Random Forest: {0:.3f}'.format(acc))

Accuracy with best Random Forest: 0.457


In [4]:
param_grid = {
    'xgb__n_estimators': np.arange(10,51,10),
    'xgb__learning_rate': [0.3, 0.4, 0.5],
    'xgb__max_depth' : [2, 5, 10, 15],
    'xgb__colsample_bytree': [0.3,0.5],
}
pipe = Pipeline([('scaler', StandardScaler()), ('xgb', xgb.XGBClassifier())])
xgb_model = RandomizedSearchCV(pipe, param_grid, cv=3, n_iter=50, verbose=2, n_jobs=2)

In [5]:
xgb_model.fit(X_train,Y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed: 104.2min
[Parallel(n_jobs=2)]: Done 150 out of 150 | elapsed: 369.4min finished


RandomizedSearchCV(cv=3,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints=None,
                                           n_estimators=100, n_jobs=None,
                                           num_parallel_tree=None,
                                           random_state=None, reg_alpha=None,
                          

In [6]:
print(xgb_model.best_params_)

{'n_estimators': 50, 'max_depth': 5, 'learning_rate': 0.3, 'colsample_bytree': 0.5}


In [8]:
xgb_preds = xgb_model.predict(X_test)

acc = accuracy_score(Y_test, xgb_preds)
print('Accuracy with best Gradient Boosting: {0:.3f}'.format(acc))

Accuracy with best Gradient Boosting: 0.465


Again, we see a small increase in performance of gradient boosting over random forests. In any case, the performance of both is similar, and lower than MNIST since here the dataset is much more complex.