In [14]:
import time

import torch
from torchvision import datasets
from torchvision.transforms import ToTensor

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [3]:
# Build both FashionMNIST splits with identical settings; only the `train`
# flag differs (True -> training split, False -> test split).
# download=True asks torchvision to fetch the files into root="data".
training_data, test_data = (
    datasets.FashionMNIST(
        root="data",
        train=is_train_split,
        download=True,
        transform=ToTensor(),
    )
    for is_train_split in (True, False)
)

In [6]:
# Turn every split into NumPy arrays: each 28x28 image is flattened into a
# 784-element row, and the raw pixel values are divided by 255.0.
X_train, X_test = (
    split.data.numpy().reshape(len(split), -1) / 255.0
    for split in (training_data, test_data)
)

# Class labels, one entry per image, in the same order as the rows above.
y_train, y_test = (
    split.targets.numpy() for split in (training_data, test_data)
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

Train shape: (60000, 784)
Test shape: (10000, 784)


In [15]:
# Hyper-parameter candidates: only the number of trees in the forest varies.
param_grid = dict(n_estimators=[10, 50, 100, 200, 300])

# Base forest: no depth cap (max_depth=None) and a fixed seed so repeated
# runs build the same trees.
clf = RandomForestClassifier(random_state=42, n_jobs=-1, max_depth=None)

# Exhaustive search over param_grid, scored with 10-fold cross validation.
# NOTE(review): both the forest and the search ask for every core
# (n_jobs=-1 at two levels); consider parallelising only one level if the
# machine ends up oversubscribed.
grid = GridSearchCV(clf, param_grid, cv=10, n_jobs=-1, verbose=1)

In [16]:
print("Running cross validation...")

# Time the whole search with a monotonic clock: perf_counter() is meant for
# measuring intervals and, unlike time.time(), is unaffected by system clock
# adjustments during this multi-minute run.
# Note: with GridSearchCV's default refit=True, fit() also retrains the best
# candidate on all of X_train, so cv_time includes that final refit.
t_cv = time.perf_counter()
grid.fit(X_train, y_train)
cv_time = time.perf_counter() - t_cv

print("\n===== Cross validation results =====")
print(f"Best params:      {grid.best_params_}")
print(f"Best CV accuracy: {grid.best_score_:.4f}")
print(f"CV time:          {cv_time:.2f} sec")

# GridSearchCV (default refit=True) has already retrained the winning
# configuration on the complete training set, so best_estimator_ is ready to
# use. Fitting it again would rebuild the same forest (the seed is fixed) and
# only cost time; refit_time_ reports how long that retraining took.
best_clf = grid.best_estimator_
train_time = grid.refit_time_

# evaluate on the test set (monotonic clock for the interval measurement)
t_pred = time.perf_counter()
test_preds = best_clf.predict(X_test)
pred_time = time.perf_counter() - t_pred

test_accuracy = accuracy_score(y_test, test_preds)

print("\n===== Final results =====")
print(f"Test accuracy:  {test_accuracy:.4f}")
print(f"Train time:     {train_time:.4f} sec")
print(f"Predict time:   {pred_time:.4f} sec")

Running cross validation...
Fitting 10 folds for each of 5 candidates, totalling 50 fits

===== Cross validation results =====
Best params:      {'n_estimators': 300}
Best CV accuracy: 0.8858
CV time:          634.27 sec

===== Final results =====
Test accuracy:  0.8792
Train time:     33.5247 sec
Predict time:   0.1664 sec
