# Data Pull and Train / Test Sets 

In [1]:
import numpy as np
np.random.seed(42)
import matplotlib as mpl
from matplotlib import pyplot as plt

In [2]:
from sklearn.datasets import fetch_openml
mnist = fetch_openml('MNIST_784', version=1, cache=True)
mnist.target = mnist.target.astype(np.int8)

In [3]:
X, y = mnist["data"], mnist["target"]
X.shape, y.shape

((70000, 784), (70000,))

In [4]:
split_index = 60000
X_train, X_test = X[:split_index], X[split_index:]
y_train, y_test = y[:split_index], y[split_index:]
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((60000, 784), (60000,), (10000, 784), (10000,))

# Random Forest Model Fitting to Training Set, Accuracy Calculation

In [5]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(random_state=42, n_estimators = 10)

In [6]:
forest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [7]:
from sklearn.metrics import accuracy_score
y_pred = forest.predict(X_test)
accuracy_score(y_test, y_pred)

0.94920000000000004

# Data Standardisation

In [8]:
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler(copy=True)

In [9]:
scaler.fit(X)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [10]:
X_train_scaled.min(), X_train_scaled.max()

(0.0, 1.0)

In [12]:
X_test_scaled.min(), X_test_scaled.max()

(0.0, 1.0)

# Testing Random Forest Classifier with Scaled Inputs

In [13]:
forest.fit(X_train_scaled, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [14]:
y_pred = forest.predict(X_test_scaled)
accuracy_score(y_test, y_pred) #no preformance advantage to scaling

0.94920000000000004

# Image Augmentation Functions to Expand Training Set

In [15]:
def plot_digit(data):
    image = data.reshape(28, 28)
    plt.imshow(image, cmap = mpl.cm.binary,
               interpolation="nearest")
    plt.axis("off")

In [16]:
def shift_up(bmp, side_length=28):
    img = np.reshape(bmp, (side_length, side_length))
    img = np.delete(img, 0, 0)
    img = np.append(img, [np.zeros(side_length)], axis=0)
    return np.reshape(img, side_length**2)

In [17]:
def shift_down(bmp, side_length=28):
    img = np.reshape(bmp, (side_length, side_length))
    img = np.delete(img, side_length-1, 0)
    img = np.insert(img, 0, [np.zeros(side_length)], axis=0)
    return np.reshape(img, side_length**2)

In [18]:
def shift_left(bmp, side_length=28):
    img = np.reshape(bmp, (side_length, side_length))
    img = np.delete(img, 0, 1)
    zeroes = np.zeros((side_length, side_length))
    zeroes[:,:-1] = img
    return np.reshape(zeroes, side_length**2)

In [19]:
def shift_right(bmp, side_length=28):
    img = np.reshape(bmp, (side_length, side_length))
    img = np.delete(img, side_length-1, 1)
    zeroes = np.zeros((side_length, side_length))
    zeroes[:,1:] = img
    return np.reshape(zeroes, side_length**2)

# Training Set Augmentation

In [20]:
X_train_up = np.array([shift_up(shift_up(img)) for img in X_train])
X_train_down = np.array([shift_down(shift_down(img)) for img in X_train])
X_train_left = np.array([shift_left(shift_left(img)) for img in X_train])
X_train_right = np.array([shift_right(shift_right(img)) for img in X_train])

X_train_up.shape, X_train_down.shape, X_train_left.shape, X_train_right.shape

((60000, 784), (60000, 784), (60000, 784), (60000, 784))

In [21]:
X_train_augmented = X_train
X_train_augmented = np.append(X_train_augmented, X_train_up, axis=0)
X_train_augmented = np.append(X_train_augmented, X_train_down, axis=0)
X_train_augmented = np.append(X_train_augmented, X_train_left, axis=0)
X_train_augmented = np.append(X_train_augmented, X_train_right, axis=0)
X_train_augmented.shape

(300000, 784)

In [22]:
y_train_augmented = y_train
for _ in range(4):
    y_train_augmented = np.append(y_train_augmented, y_train, axis=0)
y_train_augmented.shape

(300000,)

# Training RF on Augmented Dataset

In [23]:
forest.fit(X_train_augmented, y_train_augmented)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [24]:
y_pred = forest.predict(X_test)
accuracy_score(y_test, y_pred)

0.95779999999999998

# Hyperparameter Tuning Phase

In [25]:
forest_hp_tuning=RandomForestClassifier(random_state=42, n_estimators = 10)
forest_hp_tuning.get_params()

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [38]:
from sklearn.model_selection import RandomizedSearchCV
parameters = {'n_estimators':[5,25,125,625],
              'bootstrap':[False, True],
              'min_samples_split':[2,4,8,16],
              'max_depth':[10,100,1000,None]}
clf = RandomizedSearchCV(forest_hp_tuning, param_distributions=parameters,
                                n_iter=1, cv=2, scoring='accuracy', random_state=42)

In [39]:
clf.fit(X_train, y_train)

KeyboardInterrupt: 

In [35]:
cvres = clf.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)

0.961666666667 {'n_estimators': 125, 'max_depth': 1000, 'bootstrap': True, 'min_samples_split': 4}


# Re-tuned Classifier

In [40]:
forest_tuned = RandomForestClassifier(min_samples_split=4, bootstrap=True, random_state=42, n_estimators = 125)

In [41]:
forest_tuned.fit(X_train_augmented, y_train_augmented)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=4,
            min_weight_fraction_leaf=0.0, n_estimators=125, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [43]:
y_pred_tuned = forest_tuned.predict(X_test)
accuracy_score(y_test, y_pred_tuned)

0.9758