In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.misc import imresize

In [2]:
# a user-defined function that rescale 28x28 images to 20x20
def rescale(ims_in):
    ims_out = np.empty([ims_in.shape[0],400])
    for i in range(ims_in.shape[0]):
        im_in = np.reshape(ims_in[i], (28,28))
        x_min = (min(np.nonzero(im_in)[1]))
        x_max = (max(np.nonzero(im_in)[1]))
        y_min = (min(np.nonzero(im_in)[0]))
        y_max = (max(np.nonzero(im_in)[0]))
        im_in = im_in[y_min:y_max, x_min:x_max]
        im_in = imresize(im_in, (20,20))
        ims_out[i] = np.ravel(im_in)
    return ims_out

# pre-processing training, validation and testing data

train_data = pd.read_csv('train.csv', header=0).values
train_feature = train_data[:,2:]
train_label = train_data[:,1]

val_data = pd.read_csv('val.csv', header=0).values
val_feature = val_data[:,1:]
val_label = val_data[:,0]

test_data = pd.read_csv('test.csv', header=None).values
test_feature = test_data

# rescale each 28x28 image to 20x20 such that every digits fully fit into the bounding box
# if the pixel value is greater than or equal to 128, scale to 1
# otherwise, scale to 0

train_feature = rescale(train_feature)
train_feature[np.where(train_feature < 128)]-=train_feature[np.where(train_feature < 128)]
train_feature[np.where(train_feature >= 128)]/=train_feature[np.where(train_feature >= 128)]

val_feature = rescale(val_feature)
val_feature[np.where(val_feature < 128)]-=val_feature[np.where(val_feature < 128)]
val_feature[np.where(val_feature >= 128)]/=val_feature[np.where(val_feature >= 128)]

test_feature = rescale(test_feature)
test_feature[np.where(test_feature < 128)]-=test_feature[np.where(test_feature < 128)]
test_feature[np.where(test_feature >= 128)]/=test_feature[np.where(test_feature >= 128)]

`imresize` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use ``skimage.transform.resize`` instead.
  # This is added back by InteractiveShellApp.init_path()


In [3]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=10, max_depth=4)
clf.fit(train_feature, train_label)
print('10 trees and 4 depth: ', clf.score(val_feature, val_label))
test_label = clf.predict(test_feature)
test_label = np.asarray(test_label, dtype = np.int32)

clf = RandomForestClassifier(n_estimators=10, max_depth=16)
clf.fit(train_feature, train_label)
print('10 trees and 16 depth: ', clf.score(val_feature, val_label))
test_label = clf.predict(test_feature)
test_label = np.asarray(test_label, dtype = np.int32)

clf = RandomForestClassifier(n_estimators=30, max_depth=4)
clf.fit(train_feature, train_label)
print('30 trees and 4 depth: ', clf.score(val_feature, val_label))
test_label = clf.predict(test_feature)
test_label = np.asarray(test_label, dtype = np.int32)

clf = RandomForestClassifier(n_estimators=30, max_depth=16)
clf.fit(train_feature, train_label)
print('30 trees and 16 depth: ', clf.score(val_feature, val_label))
test_label = clf.predict(test_feature)
test_label = np.asarray(test_label, dtype = np.int32)

10 trees and 4 depth:  0.688
10 trees and 16 depth:  0.973
30 trees and 4 depth:  0.7325
30 trees and 16 depth:  0.98
