
# Fashion MNIST Neural Network Comprehensive Experiments

- This notebook consolidates all experiments into a single workflow
- Designed to run on Google Colab

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import time
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import fetch_openml
import warnings
warnings.filterwarnings('ignore')

## Load Fashion MNIST data

In [None]:
# ============================================================================
# 1. LOAD FASHION MNIST DATA
# ============================================================================

print("Loading Fashion MNIST dataset...")
# Keras bundles a Fashion MNIST loader, so it is imported here where it is used.
from tensorflow import keras

(x_train_full, y_train_full), (x_test_full, y_test_full) = (
    keras.datasets.fashion_mnist.load_data()
)


def _as_flat_unit_rows(images):
    """Reshape images to rows of 784 float64 values and divide by 255."""
    return images.reshape(-1, 784).astype('float64') / 255.0


# One 784-long row per image; the 1/255 scaling is meant to map pixel
# intensities into [0, 1].
x_train_full = _as_flat_unit_rows(x_train_full)
x_test_full = _as_flat_unit_rows(x_test_full)

for split_name, split in (("Training", x_train_full), ("Test", x_test_full)):
    print(f"{split_name} set shape: {split.shape}")

### Utility Functions

In [None]:
# ============================================================================
# UTILITY FUNCTIONS
# ============================================================================

def count_params(clf):
    """Return the number of trainable parameters of a fitted network.

    Sums rows x columns over every weight matrix in ``clf.coefs_`` and adds
    the length of every bias vector in ``clf.intercepts_``.
    """
    n_weights = sum(w.shape[0] * w.shape[1] for w in clf.coefs_)
    n_biases = sum(b.shape[0] for b in clf.intercepts_)
    return n_weights + n_biases

def train_and_evaluate(x_train, y_train, x_test, y_test, clf):
    """Fit ``clf`` and report its test score, final loss, size and fit time.

    Args:
        x_train, y_train: data passed to ``clf.fit``.
        x_test, y_test: data passed to ``clf.score``.
        clf: estimator exposing ``fit``, ``score``, ``loss_``, ``coefs_`` and
            ``intercepts_``.

    Returns:
        Tuple ``(score, loss, params, elapsed)`` where ``elapsed`` is the
        duration of the ``fit`` call in seconds.
    """
    # perf_counter is monotonic, so the measured duration cannot be distorted
    # by system clock adjustments the way time.time() differences can.
    start = time.perf_counter()
    clf.fit(x_train, y_train)
    elapsed = time.perf_counter() - start
    loss = clf.loss_
    params = count_params(clf)
    score = clf.score(x_test, y_test)
    return score, loss, params, elapsed

def train_epochwise(x_train, y_train, x_test, y_test, clf, max_epochs):
    """Fit ``clf`` repeatedly, one iteration per call, recording metrics.

    ``clf`` is modified in place: ``max_iter`` is set to 1 and ``warm_start``
    to True so that each ``fit`` call continues from the previous state.

    Returns:
        Tuple ``(train_loss, train_err, val_err)`` of lists with one entry per
        epoch; the error values are ``1 - clf.score(...)`` on the training and
        test data respectively.
    """
    clf.max_iter = 1
    clf.warm_start = True
    # NOTE(review): repeated fit() calls may rebuild the solver's optimizer
    # state (e.g. momentum velocities) on every call -- confirm against the
    # installed scikit-learn version if exact momentum behaviour matters.

    # Progress is reported roughly ten times over the whole run.
    report_every = max(1, max_epochs // 10)
    losses, train_errors, val_errors = [], [], []

    for epoch in range(1, max_epochs + 1):
        clf.fit(x_train, y_train)
        losses.append(clf.loss_)
        train_errors.append(1.0 - clf.score(x_train, y_train))
        val_errors.append(1.0 - clf.score(x_test, y_test))
        if epoch % report_every == 0:
            print(f"  Epoch {epoch}/{max_epochs}: val_err = {val_errors[-1]:.5f}")

    return losses, train_errors, val_errors

## Experiment 1: Basic Training with Different Architectures and Activations

In [None]:
# ============================================================================
# EXPERIMENT 1: BASIC TRAINING WITH DIFFERENT ARCHITECTURES & ACTIVATIONS
# ============================================================================
# For every (activation, architecture) pair, several independently initialised
# SGD models are trained on a subset of the data; the mean test score and
# training loss are reported with their standard errors.

print("\n" + "="*70)
print("EXPERIMENT 1: BASIC TRAINING - Architecture & Activation Comparison")
print("="*70)

N = 5000  # Use subset for faster training
x_train = x_train_full[:N]
y_train = y_train_full[:N]
x_test = x_test_full[:N]
y_test = y_test_full[:N]

layer_configs = [
    (1,), (500,), (800,), (1000,), (2000,),
    (1000, 500), (3000, 1500),
    (2, 2, 2), (1000, 500, 250), (2000, 1000, 500),
]

activation_functions = ["relu", "logistic", "tanh"]

n_runs = 3  # independent runs averaged per configuration

results_basic = {act: [] for act in activation_functions}

for act in activation_functions:
    print(f"\n{act.upper()}:")
    for layer in layer_configs:
        scores = []
        losses = []
        times = []

        for _ in range(n_runs):
            clf = MLPClassifier(
                solver="sgd", verbose=False, tol=1e-8,
                nesterovs_momentum=False, early_stopping=False,
                learning_rate_init=0.001, momentum=0.9, max_iter=200,
                hidden_layer_sizes=layer, activation=act
            )
            # params depends only on the architecture, so the value from the
            # last run is representative of all runs.
            score, loss, params, elapsed = train_and_evaluate(
                x_train, y_train, x_test, y_test, clf
            )
            scores.append(score)
            losses.append(loss)
            times.append(elapsed)

        s_arr = np.array(scores)
        l_arr = np.array(losses)
        t_arr = np.array(times)
        # Standard error of the mean uses the sample standard deviation
        # (ddof=1); the population form (ddof=0) understates the spread,
        # noticeably so with only a handful of runs.
        se = s_arr.std(ddof=1) / np.sqrt(len(scores))
        le = l_arr.std(ddof=1) / np.sqrt(len(losses))

        results_basic[act].append({
            'layers': layer, 'score': s_arr.mean(), 'score_err': se,
            'loss': l_arr.mean(), 'loss_err': le, 'params': params,
            'time': t_arr.mean()
        })

        print(f"  layers: {str(layer):20s} | score: {s_arr.mean():.4f} ± {se:.4f} | "
              f"loss: {l_arr.mean():.4f} ± {le:.4f} | params: {params:6d} | time: {t_arr.mean():.2f}s")

## Experiment 2: Batch Size Effects

In [None]:
# ============================================================================
# EXPERIMENT 2: BATCH SIZE EFFECTS
# ============================================================================
# The number of epochs is scaled with the batch size so that every setting
# performs about the same number (M) of minibatch updates.

print("\n" + "="*70)
print("EXPERIMENT 2: BATCH SIZE EFFECTS")
print("="*70)

N = 16384
x_train = x_train_full[:N]
y_train = y_train_full[:N]
# NOTE(review): slicing silently yields fewer than N rows if the test split is
# shorter than N -- confirm the intended evaluation-set size.
x_test = x_test_full[:N]
y_test = y_test_full[:N]

batch_sizes = [16384, 8192, 4096, 2048, 1024, 512, 256, 128, 64, 32, 16, 8, 4, 2]
M = 8192  # Total minibatches to keep constant
n_runs = 3  # independent runs averaged per batch size

batch_results = {'batch_size': [], 'score': [], 'score_err': [], 'loss': [], 'loss_err': []}

for bz in batch_sizes:
    print(f"\nbatch_size = {bz}:")
    # N / bz updates per epoch, so M updates need M * bz / N epochs (at least 1).
    epochs = max(1, (M * bz) // N)

    scores = []
    losses = []

    for run in range(1, n_runs + 1):
        clf = MLPClassifier(
            solver="sgd", verbose=False, tol=1e-8,
            nesterovs_momentum=False, early_stopping=False,
            learning_rate_init=0.001, momentum=0.9, max_iter=epochs,
            hidden_layer_sizes=(1000, 500), activation="relu",
            batch_size=bz
        )
        score, loss, params, _ = train_and_evaluate(
            x_train, y_train, x_test, y_test, clf
        )
        scores.append(score)
        losses.append(loss)
        print(f"  Run {run}: score = {score:.5f}, loss = {loss:.5f}")

    s_arr = np.array(scores)
    l_arr = np.array(losses)
    # Standard error of the mean from the sample standard deviation (ddof=1);
    # ddof=0 would understate the error bars for so few runs.
    se = s_arr.std(ddof=1) / np.sqrt(len(scores))
    le = l_arr.std(ddof=1) / np.sqrt(len(losses))

    batch_results['batch_size'].append(bz)
    batch_results['score'].append(s_arr.mean())
    batch_results['score_err'].append(se)
    batch_results['loss'].append(l_arr.mean())
    batch_results['loss_err'].append(le)

    print(f"  Mean: score = {s_arr.mean():.5f} ± {se:.5f}, loss = {l_arr.mean():.5f} ± {le:.5f}")

# Plot batch size results
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
ax1.errorbar(batch_results['batch_size'], batch_results['score'],
             batch_results['score_err'], marker='o', color='k', fillstyle='none')
ax1.set_xlabel('Minibatch Size', fontsize=12)
ax1.set_ylabel('Test Score', fontsize=12)
ax1.set_xscale('log')
ax1.grid(True, alpha=0.3)

ax2.errorbar(batch_results['batch_size'], batch_results['loss'],
             batch_results['loss_err'], marker='s', color='k', fillstyle='none')
ax2.set_xlabel('Minibatch Size', fontsize=12)
ax2.set_ylabel('Training Loss', fontsize=12)
ax2.set_xscale('log')
ax2.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('batch_size_results.png', dpi=150, bbox_inches='tight')
plt.show()

## Experiment 3: Base Learning Rate

In [None]:
# ============================================================================
# EXPERIMENT 3: BASE LEARNING RATE
# ============================================================================
# Two sweeps over the same learning rates: one with a fixed epoch budget and
# one where the epoch count grows as the learning rate shrinks.

print("\n" + "="*70)
print("EXPERIMENT 3: BASE LEARNING RATE")
print("="*70)

N = 10000
x_train, y_train = x_train_full[:N], y_train_full[:N]
x_test, y_test = x_test_full[:N], y_test_full[:N]

base_lrs = [0.2, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001]
lr_results = {'lr': [], 'score_fixed': [], 'score_scaled': [], 'time_scaled': []}


def _make_lr_classifier(learning_rate_init, n_epochs):
    """Build the (1000, 500) ReLU network shared by both sweeps."""
    return MLPClassifier(
        solver="sgd", verbose=False, tol=1e-8,
        nesterovs_momentum=False, early_stopping=False,
        learning_rate_init=learning_rate_init, momentum=0.9, max_iter=n_epochs,
        hidden_layer_sizes=(1000, 500), activation="relu",
        learning_rate="constant", batch_size=64
    )


print("\nFixed epochs (50):")
for lr in base_lrs:
    clf = _make_lr_classifier(lr, 50)
    score, loss, _, _ = train_and_evaluate(
        x_train, y_train, x_test, y_test, clf
    )
    lr_results['lr'].append(lr)
    lr_results['score_fixed'].append(score)
    print(f"  lr = {lr:.5f}: score = {score:.5f}, loss = {loss:.5f}")

print("\nScaled epochs (lr * epochs ≈ 1.5):")
# One epoch budget per entry of base_lrs, in the same order.
epochs_list = [8, 15, 30, 150, 300, 1500, 3000, 15000]
score_scaled = []
time_scaled = []

for lr, n_epochs in zip(base_lrs, epochs_list):
    clf = _make_lr_classifier(lr, n_epochs)
    score, loss, _, elapsed = train_and_evaluate(
        x_train, y_train, x_test, y_test, clf
    )
    score_scaled.append(score)
    time_scaled.append(elapsed)
    print(f"  lr = {lr:.5f} (epochs={n_epochs}): score = {score:.5f}, time = {elapsed:.2f}s")

lr_results['score_scaled'] = score_scaled
lr_results['time_scaled'] = time_scaled

# Plot learning rate results
fig, ax = plt.subplots(figsize=(10, 6))
curves = [
    ('score_fixed', 'o', 'Fixed epochs (50)'),
    ('score_scaled', 's', 'Scaled epochs (lr×epoch≈1.5)'),
]
for key, marker, label in curves:
    ax.semilogx(lr_results['lr'], lr_results[key], marker=marker,
                label=label, color='k', fillstyle='none')
ax.set_xlabel('Learning Rate (η)', fontsize=12)
ax.set_ylabel('Test Score', fontsize=12)
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('learning_rate_results.png', dpi=150, bbox_inches='tight')
plt.show()

## Experiment 4: Training Set Size

In [None]:
# ============================================================================
# EXPERIMENT 4: TRAINING SET SIZE
# ============================================================================
# Trains the same network on growing prefixes of the training data while
# keeping the total number of SGD updates roughly constant.

print("\n" + "="*70)
print("EXPERIMENT 4: TRAINING SET SIZE")
print("="*70)

x_test = x_test_full[:10000]
y_test = y_test_full[:10000]

train_sizes = [100, 200, 300, 500, 750, 1000, 1500, 2000, 3000, 5000, 7500, 10000]
size_results = {'sizes': [], 'scores': [], 'scores_err': []}

batch_size = 100
target_steps = 1000  # approximate number of SGD updates per model
n_runs = 3           # independent runs averaged per training-set size

for n in train_sizes:
    print(f"\nTraining set size = {n}")
    x_train = x_train_full[:n]
    y_train = y_train_full[:n]

    # n / batch_size updates per epoch, so target_steps updates need
    # target_steps * batch_size / n epochs. Integer arithmetic avoids
    # floating-point truncation surprises; computed once per size because it
    # does not depend on the run.
    epochs = max(1, (target_steps * batch_size) // n)

    scores = []
    for _ in range(n_runs):
        clf = MLPClassifier(
            solver="sgd", verbose=False, tol=1e-8,
            nesterovs_momentum=False, early_stopping=False,
            learning_rate_init=0.05, momentum=0.9, max_iter=epochs,
            hidden_layer_sizes=(1000, 500), activation="relu",
            learning_rate="constant", batch_size=batch_size
        )
        score, loss, _, _ = train_and_evaluate(
            x_train, y_train, x_test, y_test, clf
        )
        scores.append(score)

    s_arr = np.array(scores)
    # Standard error of the mean from the sample standard deviation (ddof=1);
    # ddof=0 would understate the error bars for so few runs.
    se = s_arr.std(ddof=1) / np.sqrt(len(scores))
    size_results['sizes'].append(n)
    size_results['scores'].append(s_arr.mean())
    size_results['scores_err'].append(se)
    print(f"  Mean score: {s_arr.mean():.5f} ± {se:.5f}")

# Plot training set size results
fig, ax = plt.subplots(figsize=(10, 6))
ax.errorbar(size_results['sizes'], size_results['scores'],
            size_results['scores_err'], marker='o', color='k', fillstyle='none', capsize=5)
ax.set_xlabel('Number of Training Samples', fontsize=12)
ax.set_ylabel('Test Score', fontsize=12)
ax.grid(True, alpha=0.3)
ax.set_xscale('log')
plt.tight_layout()
plt.savefig('training_size_results.png', dpi=150, bbox_inches='tight')
plt.show()

## Experiment 5: L2 Regularization

In [None]:
# ============================================================================
# EXPERIMENT 5: L2 REGULARIZATION
# ============================================================================
# One small network per alpha value, trained epoch by epoch; each validation
# error curve is drawn in its own panel.

print("\n" + "="*70)
print("EXPERIMENT 5: L2 REGULARIZATION")
print("="*70)

x_train, y_train = x_train_full[:3000], y_train_full[:3000]
x_test, y_test = x_test_full[:3000], y_test_full[:3000]

alphas = [0.0, 0.0001, 0.001, 0.01, 0.1]
max_epochs = 200
l2_results = {alpha: {'val_err': [], 'train_err': []} for alpha in alphas}

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

for idx, alpha in enumerate(alphas):
    print(f"\nAlpha (L2) = {alpha}:")
    clf = MLPClassifier(
        solver="sgd", verbose=False, tol=0,
        nesterovs_momentum=False, early_stopping=False,
        learning_rate_init=0.01, momentum=0.0,
        hidden_layer_sizes=(100, 50), activation="relu",
        alpha=alpha, learning_rate="constant", batch_size=64, max_iter=1
    )

    train_loss, train_err, val_err = train_epochwise(
        x_train, y_train, x_test, y_test, clf, max_epochs
    )

    curves = l2_results[alpha]
    curves['train_err'] = train_err
    curves['val_err'] = val_err

    print(f"  Final: train_err = {train_err[-1]:.5f}, val_err = {val_err[-1]:.5f}")

    # Panel idx shows the curve for this alpha in the matching cycle colour.
    ax = axes[idx]
    ax.plot(val_err, linewidth=2, color=f'C{idx}')
    ax.set_title(f'Alpha = {alpha}', fontsize=11)
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Validation Error')
    ax.grid(True, alpha=0.3)

# The 2x3 grid has one more panel than there are alpha values.
axes[-1].remove()
plt.tight_layout()
plt.savefig('l2_regularization_results.png', dpi=150, bbox_inches='tight')
plt.show()

## Experiment 6: Momentum

In [None]:
# ============================================================================
# EXPERIMENT 6: MOMENTUM
# ============================================================================
# One small network per momentum value, trained epoch by epoch; all validation
# error curves share a single plot.

print("\n" + "="*70)
print("EXPERIMENT 6: MOMENTUM")
print("="*70)

x_train, y_train = x_train_full[:5000], y_train_full[:5000]
x_test, y_test = x_test_full[:5000], y_test_full[:5000]

momentums = [0.0, 0.3, 0.5, 0.7, 0.9, 0.99]
max_epochs = 100
momentum_results = {m: {'val_err': [], 'train_err': []} for m in momentums}

fig, ax = plt.subplots(figsize=(12, 7))

for m in momentums:
    print(f"\nMomentum = {m}:")
    clf = MLPClassifier(
        solver="sgd", verbose=False, tol=0,
        nesterovs_momentum=False, early_stopping=False,
        learning_rate_init=0.01, momentum=m,
        hidden_layer_sizes=(100, 50), activation="relu",
        alpha=0.0001, learning_rate="constant", batch_size=64, max_iter=1
    )

    train_loss, train_err, val_err = train_epochwise(
        x_train, y_train, x_test, y_test, clf, max_epochs
    )

    curves = momentum_results[m]
    curves['train_err'] = train_err
    curves['val_err'] = val_err

    print(f"  Final: train_err = {train_err[-1]:.5f}, val_err = {val_err[-1]:.5f}")

    ax.plot(val_err, marker='', linewidth=2, label=f'momentum = {m}', alpha=0.8)

ax.set_xlabel('Epoch', fontsize=12)
ax.set_ylabel('Validation Error', fontsize=12)
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('momentum_results.png', dpi=150, bbox_inches='tight')
plt.show()

## Experiment 7: Weight Initialization

In [None]:
# ============================================================================
# EXPERIMENT 7: WEIGHT INITIALIZATION
# ============================================================================

banner = "=" * 70
print(f"\n{banner}")
print("EXPERIMENT 7: WEIGHT INITIALIZATION")
print(banner)

# Imported under an alias to serve as the base class of the custom classifier.
from sklearn.neural_network import MLPClassifier as SklearnMLP

class CustomMLPClassifier(SklearnMLP):
    """MLPClassifier whose weight initialization is selected by ``init_scheme``.

    Schemes:
        0: the base class's own initialization.
        1: uniform weights in [-0.005, 0.005), zero biases.
        2: Gaussian weights with standard deviation 0.005, zero biases.
        3: Gaussian weights scaled by sqrt(2 / fan_in) (He), zero biases.
        4: Gaussian weights scaled by sqrt(1 / fan_in), zero biases.

    Schemes 1-4 draw from NumPy's global random state, so they are not
    controlled by the estimator's ``random_state`` parameter.

    The constructor deliberately has no ``*args``: scikit-learn's parameter
    introspection (``get_params``) raises ``RuntimeError`` for estimators whose
    ``__init__`` accepts varargs. Remaining base-class options are forwarded
    through ``**kwargs``; because they are not named in the signature they are
    not reported by ``get_params``, so this class is not meant to be cloned.
    """

    def __init__(self, hidden_layer_sizes=(100,), activation="relu", *,
                 init_scheme=0, **kwargs):
        super().__init__(hidden_layer_sizes=hidden_layer_sizes,
                         activation=activation, **kwargs)
        self.init_scheme = init_scheme

    def _init_coef(self, fan_in, fan_out, dtype):
        """Return ``(weights, biases)`` for one layer under ``init_scheme``.

        Raises:
            ValueError: if ``init_scheme`` is not one of 0, 1, 2, 3, 4.
        """
        scheme = self.init_scheme
        shape = (fan_in, fan_out)

        if scheme == 0:
            # Default initialization provided by the base class
            weights, biases = super()._init_coef(fan_in, fan_out, dtype)
        elif scheme == 1:
            # Small uniform
            weights = 0.01 * (np.random.random(shape) - 0.5)
            biases = np.zeros(fan_out)
        elif scheme == 2:
            # Small Gaussian
            weights = 0.005 * np.random.normal(size=shape)
            biases = np.zeros(fan_out)
        elif scheme == 3:
            # He initialization: variance 2 / fan_in
            weights = np.random.normal(size=shape) * np.sqrt(2.0 / fan_in)
            biases = np.zeros(fan_out)
        elif scheme == 4:
            # Fan-in-only scaling: variance 1 / fan_in
            weights = np.random.normal(size=shape) * np.sqrt(1.0 / fan_in)
            biases = np.zeros(fan_out)
        else:
            # Fail with a clear message instead of an UnboundLocalError below.
            raise ValueError(
                f"Unknown init_scheme {scheme!r}; expected one of 0, 1, 2, 3, 4"
            )

        return weights.astype(dtype, copy=False), biases.astype(dtype, copy=False)

# Compare test-error curves of the same network under different weight
# initialization schemes.
x_train = x_train_full[:6000]
y_train = y_train_full[:6000]
x_test = x_test_full[:6000]
y_test = y_test_full[:6000]

init_names = ["Glorot", "Small Uniform", "Small Gaussian", "He", "Xavier"]
init_schemes = [0, 1, 2, 3, 4]
max_epochs = 300

fig, ax = plt.subplots(figsize=(12, 7))

for scheme, name in zip(init_schemes, init_names):
    print(f"\nInitialization scheme: {name}")
    test_errs = []

    # Build ONE classifier per scheme and keep re-fitting it. Constructing a
    # new estimator every epoch would discard the learned weights (warm_start
    # has nothing to resume from on a fresh instance), so every point on the
    # curve would be a network trained for a single epoch from scratch.
    clf = CustomMLPClassifier(
        solver="sgd", verbose=False, tol=0,
        nesterovs_momentum=False, early_stopping=False,
        learning_rate_init=0.01, momentum=0.9,
        hidden_layer_sizes=(100, 50), activation="relu",
        alpha=0.2, learning_rate="constant", batch_size=64,
        max_iter=1, warm_start=True, init_scheme=scheme
    )

    for epoch in range(max_epochs):
        # With max_iter=1 and warm_start=True each call continues training
        # from the previous call's weights.
        clf.fit(x_train, y_train)
        test_err = 1.0 - clf.score(x_test, y_test)
        test_errs.append(test_err)

        if (epoch + 1) % 50 == 0:
            print(f"  Epoch {epoch+1}: test_err = {test_err:.5f}")

    # Smooth the curve for visualization
    window = 21
    if len(test_errs) >= window:
        smoothed = np.convolve(test_errs, np.ones(window)/window, mode='valid')
        # 'valid' mode drops window - 1 points; place each average at the
        # centre of its window so the curve lines up with the epoch axis.
        centres = np.arange(len(smoothed)) + window // 2
        ax.plot(centres, smoothed, linewidth=2, label=name, alpha=0.8)
    else:
        ax.plot(test_errs, linewidth=2, label=name, alpha=0.8)

ax.set_xlabel('Epoch', fontsize=12)
ax.set_ylabel('Test Error', fontsize=12)
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('weight_init_results.png', dpi=150, bbox_inches='tight')
plt.show()

print("\n" + "="*70)
print("ALL EXPERIMENTS COMPLETED!")
print("="*70)