## Read Data

In [1]:
# Provided code to read dataset for the Kaggle competition
import pandas as pd
from sklearn.model_selection import train_test_split

### read dataset
train = pd.read_csv("../input/fashion-mnist_train.csv")
train_x = train[list(train.columns)[1:]].values
train_y = train['label'].values

## normalize and reshape the predictors
train_x = train_x / 255

## create train and validation datasets
train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.2)

## reshape the inputs
train_x = train_x.reshape(-1, 784)
val_x = val_x.reshape(-1, 784)

## Train Model

In [8]:
# Mostly just looked through the sklearn docs to see what was available.
# https://scikit-learn.org/stable/supervised_learning.html#supervised-learning

# from sklearn.tree import DecisionTreeRegressor
# from sklearn.linear_model import SGDRegressor
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.neighbors import KNeighborsRegressor
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.ensemble import AdaBoostClassifier
# from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

# model = DecisionTreeRegressor(random_state=1, criterion='friedman_mse')
# model = SGDRegressor(max_iter=1000, tol=1e-3, loss='huber')
# model = RandomForestRegressor(random_state=0, n_estimators=100)
# model = KNeighborsRegressor(weights='uniform', algorithm='kd_tree', n_jobs=-1) # 83%
# model = KNeighborsClassifier(weights='uniform', n_jobs=-1) # 83%
# model = AdaBoostClassifier() # 57%
# model = MLPClassifier(verbose=True) # 88%
model = RandomForestClassifier(random_state=0, n_estimators=100, n_jobs=-1, criterion='entropy',
                              max_features='auto') # 88%

model.fit(train_x, train_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

## Predict & Check Accuracy from Training Data Split

In [9]:
from sklearn.metrics import accuracy_score

predicted_y = model.predict(val_x)

print('Accuracy is : {0:.2f}%'.format(accuracy_score(val_y, predicted_y) * 100))

Accuracy is : 87.80%


## Predict & Check Accuracy from Test Data

In [14]:
from sklearn.metrics import accuracy_score

### read dataset
test = pd.read_csv("../input/fashion-mnist_test.csv")
test_x = test[list(test.columns)[1:]].values
test_y = test['label'].values
## normalize and reshape the predictors
test_x = test_x / 255
## reshape the inputs
test_x = test_x.reshape(-1, 784)

predicted_y = model.predict(test_x)

print('Accuracy is : {0:.2f}%'.format(accuracy_score(test_y, predicted_y) * 100))

Accuracy is : 88.19%


## Save Predictions to File

In [15]:
# Based on:
# https://stackoverflow.com/questions/52411992/how-to-produce-a-kaggle-submission-csv-file-with-specific-entries
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html

# Assumes predicted_y has been defined in a prior cell.
submission = pd.DataFrame({'Label': predicted_y})
submission.to_csv('../submission.csv', index=True, index_label='ID')