In [7]:
!conda install -y xgboost numpy pandas scipy

Collecting package metadata (current_repodata.json): done
Solving environment: / 
The environment is inconsistent, please check the package plan carefully
The following packages are causing the inconsistency:

  - conda-forge/noarch::tqdm==4.62.3=pyhd8ed1ab_0
  - conda-forge/noarch::black==21.11b1=pyhd8ed1ab_0
  - conda-forge/linux-64::conda-package-handling==1.7.3=py38h497a2fe_1
  - conda-forge/noarch::dask-core==2021.11.2=pyhd8ed1ab_0
  - conda-forge/noarch::imageio==2.9.0=py_0
  - conda-forge/linux-64::pytest==6.2.5=py38h578d9bd_1
  - conda-forge/linux-64::watchdog==2.1.6=py38h578d9bd_1
  - conda-forge/linux-64::aiohttp==3.8.1=py38h497a2fe_0
  - conda-forge/linux-64::astropy==5.0=py38h6c62de6_0
  - conda-forge/linux-64::bokeh==2.4.2=py38h578d9bd_0
  - conda-forge/linux-64::distributed==2021.11.2=py38h578d9bd_0
  - conda-forge/noarch::flask==2.0.2=pyhd8ed1ab_0
  - conda-forge/linux-64::matplotlib-base==3.5.0=py38hf4fb855_0
  - conda-forge/noarch::nbformat==5.1.3=pyhd8ed1ab_0
  - cond

In [27]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn import metrics
import numpy as np
import pandas as pd
import pickle
from sklearn.datasets import fetch_openml

In [28]:
from sklearn.model_selection import GridSearchCV

In [13]:
# import the mnist dataset
mnist = fetch_openml('mnist_784', version=1)
mnist.keys()

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [29]:
# separate features and target
X, y = mnist["data"], mnist["target"]
print(X.shape)
print(y.shape)

(70000, 784)
(70000,)


In [30]:
# train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                       test_size=0.1, 
                                       random_state=42)

In [32]:
print(X_train.shape)
print(X_test.shape)

(63000, 784)
(7000, 784)


In [33]:
print(y_train.shape)
print(y_test.shape)

(63000,)
(7000,)


## Data Augmentation

In [34]:
# Referred from https://github.com/austinlasseter/handson-ml2/blob/master/03_classification.ipynb
# https://towardsdatascience.com/improving-accuracy-on-mnist-using-data-augmentation-b5c38eb5a903
# https://stackoverflow.com/questions/67195851/attributeerror-str-object-has-no-attribute-reshape
from scipy.ndimage import shift

# Method to shift the image by given dimension
def shift_image(image, dx, dy):
    image = image.reshape((28, 28))
    shifted_image = shift(image, [dy, dx], cval=0, mode="constant")
    return shifted_image.reshape([-1])

In [35]:
# Creating Augmented Dataset
X_train_augmented = [image for image in X_train]
y_train_augmented = [image for image in y_train]

for dx, dy in ((1,0), (-1,0), (0,1), (0,-1)):
    for image, label in zip(X_train.values, y_train):
        X_train_augmented.append(shift_image(image, dx, dy))
        y_train_augmented.append(label)


In [36]:
print(len(X_train_augmented))
print(len(y_train_augmented))

252784
315000


In [37]:
# Shuffle the dataset
shuffle_idx = np.random.permutation(len(X_train_augmented))
X_train_augmented = np.array(X_train_augmented)[shuffle_idx]
y_train_augmented = np.array(y_train_augmented)[shuffle_idx]

In [41]:
print(len(X_train_augmented))
print(len(y_train_augmented))

252784
252784


In [43]:
#X_train_augmented

In [45]:
X_train_scaled = X_train_augmented
y_train = y_train_augmented

## Preprocessing

In [40]:
# standardize
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

ValueError: setting an array element with a sequence.

## Random Forest

In [46]:
# modeling: random forest (arbitrary hyperparameters)
model = RandomForestClassifier()

In [47]:
# define your parameter grid 
param_grid = {
                'max_depth': [5, 10, 15],
                #'criterion': ['entropy', 'gini'],
                'criterion': ['entropy'],
                'min_samples_leaf': [6, 10, 20],
                'class_weight':['balanced', None],
                'n_estimators': [50, 100, 200]
            }

In [48]:
# establish cross-validation and gridsearch
rf_grid = GridSearchCV(model, param_grid, cv=10, verbose=1,n_jobs=-1)

In [None]:
# conduct the search (this cell will take a very long time!)
rf_grid.fit(X_train_scaled, y_train)

Fitting 10 folds for each of 54 candidates, totalling 540 fits


In [None]:
# see the best parameters and their score
print("Best parameters:")
print(rf_grid.best_params_)
print("Best score in grid search:")
print(rf_grid.best_score_)
print("best model from grid search:")
print(rf_grid.score(X_test_scaled, y_test))

In [None]:
# predict
y_preds=rf_grid.predict(X_test_scaled)
print(list(y_preds[:10]))
print(list(y_test[:10]))

In [None]:
# evaluate
print('Accuracy:', metrics.accuracy_score(y_test, y_preds))
print('Precision:', metrics.precision_score(y_test, y_preds,average='macro'))
print('Recall:', metrics.recall_score(y_test, y_preds,average='macro'))
print('F1 Score:', metrics.f1_score(y_test, y_preds,average='macro'))

## Pickle the model

In [None]:
# random forest
f = open('rf_grid_model.pkl', 'wb')
pickle.dump(rf_grid, f)
f.close() 

## END