Math 5750/6880: Mathematics of Data Science \
Project 3

# 1. Fashion-MNIST image classification using sklearn

In [3]:
from tensorflow.keras.datasets import fashion_mnist
from sklearn.preprocessing import StandardScaler

# Load Fashion-MNIST
# Classes (0-9): T-shirt/top, Trouser, Pullover, Dress, Coat, Sandal, Shirt, Sneaker, Bag, Ankle boot
(X_train, y_train), (X_test, y_test) = fashion_mnist.load_data()
X_train = X_train.reshape(len(X_train), -1)
X_test  = X_test.reshape(len(X_test), -1)

# Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
[1m29515/29515[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
[1m26421880/26421880[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
[1m5148/5148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz
[1m4422102/4422102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [25]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import time

# Base Model
# mlp_baseline = MLPClassifier(hidden_layer_sizes=(100,),
#                              activation='logistic',
#                              solver='adam',
#                              learning_rate_init=0.001, random_state=42,
#                              verbose=True, early_stopping=True)

# Test Model (two hidden layers)
mlp_baseline = MLPClassifier(hidden_layer_sizes=(256, 128),
                             activation='relu',
                             learning_rate='adaptive',
                             learning_rate_init=0.001,
                             momentum=0.9,
                             early_stopping=True,
                             random_state=42,
                             verbose=True)

# Test Model (three hidden layers)
# mlp_baseline = MLPClassifier(hidden_layer_sizes=(512, 256, 128),
#                              activation='logistic',
#                              solver='adam',
#                              learning_rate_init=0.001,
#                              early_stopping=True,
#                              random_state=42,
#                              verbose=True)

start_time = time.time()
mlp_baseline.fit(X_train, y_train)
train_time = time.time() - start_time

# Evaluate
y_predict = mlp_baseline.predict(X_test)
acc = accuracy_score(y_test, y_predict)
cm = confusion_matrix(y_test, y_predict)

print(f"Baseline Accuracy: {acc:.4f}")
print(f"Training Time: {train_time:.2f} seconds")
print("Confusion Matrix:\n", cm)

Iteration 1, loss = 0.45783715
Validation score: 0.872000
Iteration 2, loss = 0.32062740
Validation score: 0.875167
Iteration 3, loss = 0.27955216
Validation score: 0.877500
Iteration 4, loss = 0.24878781
Validation score: 0.882500
Iteration 5, loss = 0.22653632
Validation score: 0.888500
Iteration 6, loss = 0.20629701
Validation score: 0.890333
Iteration 7, loss = 0.19099525
Validation score: 0.887500
Iteration 8, loss = 0.17115681
Validation score: 0.887833
Iteration 9, loss = 0.15745049
Validation score: 0.892833
Iteration 10, loss = 0.14635416
Validation score: 0.887833
Iteration 11, loss = 0.13654782
Validation score: 0.891500
Iteration 12, loss = 0.12984302
Validation score: 0.892500
Iteration 13, loss = 0.11400859
Validation score: 0.892167
Iteration 14, loss = 0.11108869
Validation score: 0.897000
Iteration 15, loss = 0.09853518
Validation score: 0.885000
Iteration 16, loss = 0.09247732
Validation score: 0.890333
Iteration 17, loss = 0.08379185
Validation score: 0.885833
Iterat

Layer Notes:
---
*   Baseline Model:
Decent improvement. 50 iterations did not converge, Accuracy = .8808, Training Time = 117.15 seconds. With early stopping: Accuracy = .8832, Training Time = 58.39 seconds.
*   Two Layers: Loss seemed to be converging at around 0.03 when 50 iterations completed, which took 306.88 seconds. Accuracy = .8832. Confusion matrix much better than in single layer case. With early stopping: Accuracy = .8894, Training Time = 130.20 seconds.
*   Three Layers: Loss once again seemed to be converging around 0.03 after 50 iterations, which took 700.83 seconds (around 12 minutes). Accuracy = 0.8873. Confusion matrix nearing ideal. With early stopping: Accuracy = .8883, Training Time = 254.61 seconds. Overall, not significantly better than two layers for almost double runtime.
---
Notes on Activation Functions (Ran on Two-Layer w/ Early Stopping):
---
*   Relu: Accuracy = .8894, Training Time = 130.20 seconds
*   Logistic: Accuracy = .8895, Training Time = 148.72 seconds
*   Identity: Accuracy = .8381, Training Time = 116.75 seconds
*   Tanh: Accuracy = .8854, Training Time = 213.36 seconds
---
Notes on Optimization Method (Two-Layer, Early Stopping, Relu)
---
*   ADAM: Accuracy = .8894, Training Time = 130.20 seconds
*   SGD: Accuracy = .8828, Training Time = 347.90 seconds
*   SGD w/ learning rate adjustments: Accuracy = .8894, Training Time = 151.33 seconds








# 3. Fashion-MNIST image classification  using pytorch

In [8]:
import numpy as np
from tensorflow.keras.datasets import fashion_mnist
import torch
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Load Fashion-MNIST
# Classes (0-9): T-shirt/top, Trouser, Pullover, Dress, Coat, Sandal, Shirt, Sneaker, Bag, Ankle boot
(X_train, y_train), (X_test, y_test) = fashion_mnist.load_data()

# scale to [0,1], add channel dimension -> (N, 1, 28, 28)
X_train = (X_train.astype("float32") / 255.0)[:, None, :, :]
X_test  = (X_test.astype("float32")  / 255.0)[:,  None, :, :]

y_train = y_train.astype(np.int64)
y_test  = y_test.astype(np.int64)

# train/val split: last 10k of train as validation
X_tr, X_val = X_train[:50000], X_train[50000:]
y_tr, y_val = y_train[:50000], y_train[50000:]

# wrap in PyTorch TensorDatasets and DataLoaders
train_ds = TensorDataset(torch.from_numpy(X_tr),  torch.from_numpy(y_tr))
val_ds   = TensorDataset(torch.from_numpy(X_val), torch.from_numpy(y_val))
test_ds  = TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test))

train_loader = DataLoader(train_ds, batch_size=128, shuffle=True)
val_loader   = DataLoader(val_ds,   batch_size=256, shuffle=False)
test_loader  = DataLoader(test_ds,  batch_size=256, shuffle=False)

In [11]:
import torch.nn as nn
import torch.optim as optim
import time

# In colab, you should ``change runtime type'' to GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Feedforward (MLP) Model
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(28*28, 256)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(256, 128)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.flatten(x)
        x = self.relu1(self.fc1(x))
        x = self.relu2(self.fc2(x))
        return self.fc3(x)
# CNN Model
class CNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.dropout = nn.Dropout(0.5)
        self.fc1 = nn.Linear(128 * 7 * 7, 256)
        self.fc2 = nn.Linear(256, 10)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.conv1(x))
        x = self.pool(x)
        x = self.relu(self.conv2(x))
        x = self.pool(x)
        x = self.relu(self.conv3(x))
        x = torch.flatten(x, 1)
        x = self.dropout(self.relu(self.fc1(x)))
        x = self.fc2(x)
        return x
# Training
def train_model(model, train_loader, val_loader, optimizer, criterion, epochs=10):
    model.to(device)
    for epoch in range(epochs):
        model.train()
        running_loss = 0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

            running_loss += loss.item() * X_batch.size(0)

        val_loss, val_acc = evaluate(model, val_loader, criterion)
        print(f"Iteration {epoch+1}, " + f"Train Loss: {running_loss/len(train_loader.dataset):.4f} ")
        print(f"Validation Loss: {val_loss:.4f}")
        print(f"Validation score: {val_acc:.4f}")

def evaluate(model, loader, criterion):
    model.eval()
    correct, total, loss_total = 0, 0, 0
    with torch.no_grad():
        for X_batch, y_batch in loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss_total += loss.item() * X_batch.size(0)
            preds = outputs.argmax(dim=1)
            correct += (preds == y_batch).sum().item()
            total += y_batch.size(0)
    return loss_total / total, correct / total

def get_all_preds(model, loader):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for X_batch, y_batch in loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)
            preds = outputs.argmax(dim=1)
            all_preds.append(preds.cpu().numpy())
            all_labels.append(y_batch.cpu().numpy())
    return np.concatenate(all_labels), np.concatenate(all_preds)

model = CNN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

start_time = time.time()
train_model(model, train_loader, val_loader, optimizer, criterion, epochs=10)
train_time = time.time() - start_time

y_true, y_pred = get_all_preds(model, test_loader)

cm = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:\n", cm)

print(f"Training Time: {train_time:.2f} seconds")




Using device: cuda
Iteration 1, Train Loss: 0.5984 
Validation Loss: 0.3682
Validation score: 0.8648
Iteration 2, Train Loss: 0.3615 
Validation Loss: 0.2985
Validation score: 0.8869
Iteration 3, Train Loss: 0.3084 
Validation Loss: 0.2691
Validation score: 0.8989
Iteration 4, Train Loss: 0.2716 
Validation Loss: 0.2486
Validation score: 0.9067
Iteration 5, Train Loss: 0.2449 
Validation Loss: 0.2361
Validation score: 0.9125
Iteration 6, Train Loss: 0.2273 
Validation Loss: 0.2221
Validation score: 0.9177
Iteration 7, Train Loss: 0.2054 
Validation Loss: 0.2231
Validation score: 0.9171
Iteration 8, Train Loss: 0.1912 
Validation Loss: 0.2109
Validation score: 0.9235
Iteration 9, Train Loss: 0.1758 
Validation Loss: 0.2167
Validation score: 0.9210
Iteration 10, Train Loss: 0.1615 
Validation Loss: 0.2123
Validation score: 0.9231
Confusion Matrix:
 [[882   0  11  12   3   1  88   0   3   0]
 [  1 978   0  15   2   0   4   0   0   0]
 [ 18   1 884   7  47   0  41   0   2   0]
 [  8   0  1

Notes (Baseline MLP Model):
---
*   Final accuracy: .8864, Training Time = 11.32 seconds (way faster than sklearn!)
---
Notes (CNN Model):
---
*   Two-layer: Final accuracy: .9175, Training Time = 22.10 seconds.
*   Three-layer: Final accuracy: .9231, Training Time = 24.87 seconds.


