### Download dataset

In [None]:
import sys
from pathlib import Path
import os

FOLDERNAME = "numpy/assignment1"

personal_dir = Path(os.getcwd())
sys.path.append(personal_dir)
repo_dir = Path(os.getcwd()).parents[2]
sys.path.append(Path(repo_dir, FOLDERNAME))

# This downloads the CIFAR-10 dataset to your Drive
# if it doesn't already exist.
%cd $repo_dir/$FOLDERNAME/cs231n/datasets/
!bash get_datasets.sh
%cd $repo_dir/$FOLDERNAME

In [3]:
import numpy as np
from cs231n.data_utils import load_CIFAR10

In [None]:
def get_CIFAR10_data(
    num_training=49000, num_validation=1000, num_test=1000, num_dev=500
):
    """
    Load the CIFAR-10 dataset from disk and perform preprocessing to prepare
    it for the linear classifier. These are the same steps as we used for the
    SVM, but condensed to a single function.

    Args:
        num_training: number of leading training samples kept as the train
            split.
        num_validation: number of samples, taken immediately after the train
            split, kept as the validation split.
        num_test: number of leading test samples kept as the test split.
        num_dev: size of the development split, drawn at random (without
            replacement) from the train split.

    Returns:
        Tuple ``(X_train, y_train, X_val, y_val, X_test, y_test, X_dev,
        y_dev)``. Every ``X_*`` array holds one flattened sample per row,
        centred with the train-split mean, followed by a trailing column of
        ones (the bias feature). Every ``y_*`` array holds the matching labels.
    """
    # Relative path: resolved against the current working directory.
    cifar10_dir = "cs231n/datasets/cifar-10-batches-py"

    # NOTE: an earlier `try: del X_train, ... except: pass` "cleanup" was
    # removed here. Those names are locals of this function and are still
    # unbound at this point, so `del` always raised UnboundLocalError, which
    # the bare `except` silently swallowed; the cleanup never actually ran.
    X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)

    # Subsample the data. The validation split must be carved out first,
    # because X_train / y_train are overwritten by the train split right after.
    mask = list(range(num_training, num_training + num_validation))
    X_val = X_train[mask]
    y_val = y_train[mask]
    mask = list(range(num_training))
    X_train = X_train[mask]
    y_train = y_train[mask]
    mask = list(range(num_test))
    X_test = X_test[mask]
    y_test = y_test[mask]
    # The dev split is a random subset of the (already reduced) train split.
    mask = np.random.choice(num_training, num_dev, replace=False)
    X_dev = X_train[mask]
    y_dev = y_train[mask]

    # Preprocessing: flatten every sample into a single row.
    X_train = np.reshape(X_train, (X_train.shape[0], -1))
    X_val = np.reshape(X_val, (X_val.shape[0], -1))
    X_test = np.reshape(X_test, (X_test.shape[0], -1))
    X_dev = np.reshape(X_dev, (X_dev.shape[0], -1))

    # Normalize the data: subtract the mean image. The mean is computed on the
    # train split only and reused for every other split.
    mean_image = np.mean(X_train, axis=0)
    X_train -= mean_image
    X_val -= mean_image
    X_test -= mean_image
    X_dev -= mean_image

    # Append a constant column of ones as the bias feature.
    X_train = np.hstack([X_train, np.ones((X_train.shape[0], 1))])
    X_val = np.hstack([X_val, np.ones((X_val.shape[0], 1))])
    X_test = np.hstack([X_test, np.ones((X_test.shape[0], 1))])
    X_dev = np.hstack([X_dev, np.ones((X_dev.shape[0], 1))])

    return X_train, y_train, X_val, y_val, X_test, y_test, X_dev, y_dev


# Load and preprocess every split, then print each array's shape as a quick
# sanity check.
X_train, y_train, X_val, y_val, X_test, y_test, X_dev, y_dev = get_CIFAR10_data()

for label, array in (
    ("Train data shape: ", X_train),
    ("Train labels shape: ", y_train),
    ("Validation data shape: ", X_val),
    ("Validation labels shape: ", y_val),
    ("Test data shape: ", X_test),
    ("Test labels shape: ", y_test),
    ("dev data shape: ", X_dev),
    ("dev labels shape: ", y_dev),
):
    print(label, array.shape)

In [None]:
# Switch back to the directory the kernel was started in (stored in
# `personal_dir`). Presumably this lets the local modules imported below be
# found via the working directory -- TODO confirm.
%cd $personal_dir

### Finding the best hyperparameters

In [6]:
import time
from itertools import product
from math import floor

from linear_classifier import Softmax
from logistic_regression_classifier import LogisticRegression

In [7]:
# Grid of values explored by the hyperparameter search.
learning_rates = [1e-7, 5e-7]
regularization_strengths = [2.5e4, 5e4]

# Training budget: `epochs` passes over the training set in mini-batches of
# `batch_size` samples, rounded down to a whole number of iterations.
epochs = 2
batch_size = 200
total_samples_seen = X_train.shape[0] * epochs
num_iters = floor(total_samples_seen / batch_size)

# Classifier type instantiated for every point of the grid.
CLASSIFIER_CLASS = Softmax

In [8]:
# TODO: add time measurement for each epoch

# TODO: add augmentation for each epoch

# TODO: use sigmoid instead of softmax

In [None]:
# Grid search over every (learning rate, regularization strength) pair.
# `results` maps (lr, reg) -> (train_accuracy, val_accuracy).
results = {}
best_val = -1  # sentinel lower than any achievable accuracy
# Classifier with the highest validation accuracy so far. The name is kept for
# the cells below even when CLASSIFIER_CLASS is not Softmax.
best_softmax = None
time_stats = []  # training duration in seconds, one entry per combination
total_loss_history = []  # first value returned by train(), one entry per combination
total_weights_history = []  # second value returned by train(), one entry per combination
hyperparams_comb = list(product(learning_rates, regularization_strengths))

for lr, reg in hyperparams_comb:
    classifier = CLASSIFIER_CLASS()

    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() follows the wall clock and can jump if the system
    # time is adjusted while training is running.
    start_time = time.perf_counter()
    loss_history, weights_history = classifier.train(
        X_train, y_train, lr, reg, num_iters=num_iters, batch_size=batch_size
    )
    end_time = time.perf_counter()
    time_stats.append(end_time - start_time)
    total_loss_history.append(loss_history)
    total_weights_history.append(weights_history)

    y_train_pred = classifier.predict(X_train)
    y_val_pred = classifier.predict(X_val)

    # Accuracy = fraction of predictions equal to the labels.
    train_accuracy = np.mean(y_train == y_train_pred)
    val_accuracy = np.mean(y_val == y_val_pred)
    results[(lr, reg)] = train_accuracy, val_accuracy

    # Model selection uses validation accuracy only. With a strict `>`, the
    # earliest combination wins ties.
    if val_accuracy > best_val:
        best_val = val_accuracy
        best_softmax = classifier

# Report every combination, ordered by (lr, reg).
for lr, reg in sorted(results):
    train_accuracy, val_accuracy = results[(lr, reg)]
    print(
        "lr %e reg %e train accuracy: %f val accuracy: %f"
        % (lr, reg, train_accuracy, val_accuracy)
    )

print(
    "best validation accuracy achieved during cross-validation: %f" % best_val
)

In [None]:
# Score the classifier selected on validation accuracy against the test split.
y_test_pred = best_softmax.predict(X_test)
is_correct = y_test_pred == y_test
test_accuracy = np.mean(is_correct)
print("softmax on raw pixels final test set accuracy: %f" % (test_accuracy,))

### Process statistics and show metrics

In [11]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Use seaborn's "whitegrid" style for the figures drawn below.
sns.set_theme(style="whitegrid")

In [12]:
# For every hyperparameter combination, walk through the recorded weight
# snapshots and measure train / test accuracy at each one.
total_train_acc_history, total_test_acc_history = [], []

for weights_history in total_weights_history:
    train_acc_history, test_acc_history = [], []

    for w in weights_history:
        # Build a classifier from the snapshot (presumably the constructor
        # accepts initial weights -- confirm against the classifier module).
        classifier = CLASSIFIER_CLASS(w)

        y_train_pred = classifier.predict(X_train)
        train_acc = np.mean(y_train_pred == y_train)
        train_acc_history.append(train_acc)

        y_test_pred = classifier.predict(X_test)
        test_acc = np.mean(y_test_pred == y_test)
        test_acc_history.append(test_acc)

    total_train_acc_history.append(train_acc_history)
    total_test_acc_history.append(test_acc_history)

In [None]:
# One row per hyperparameter combination; the history columns hold whole
# per-iteration sequences in each cell.
n_runs = len(total_loss_history)
# A single shared iteration axis, sized from the first run's loss history
# (so every run is assumed to have recorded the same number of losses).
iteration_index = np.arange(len(total_loss_history[0]))

df = pd.DataFrame(
    {
        "LR & Reg": hyperparams_comb,
        "Loss": total_loss_history,
        "Train accuracy": total_train_acc_history,
        "Test accuracy": total_test_acc_history,
        "Index": [iteration_index] * n_runs,
    }
)
df

In [45]:
# Reshape `df` into long format for plotting: one row per
# (hyperparameter combination, dataset type, sampled iteration).
frequency = 10  # keep every `frequency`-th recorded point
iteration_col = f"Iterations ({frequency=})"

# Collect the per-run frames and concatenate once at the end. Calling
# pd.concat inside the loop re-copies all accumulated rows on every pass.
frames = []
for _row_label, row in df.iterrows():
    run_label = str(row["LR & Reg"])
    for dataset_type, accuracy_col in (
        ("Train", "Train accuracy"),
        ("Test", "Test accuracy"),
    ):
        frames.append(
            pd.DataFrame(
                {
                    iteration_col: row["Index"][::frequency],
                    "Accuracy": row[accuracy_col][::frequency],
                    "Dataset Type": dataset_type,
                    "LR & Reg": run_label,
                }
            )
        )

# pd.concat raises on an empty list, so fall back to an empty frame when `df`
# has no rows (the same result the incremental version produced).
transformed_data = (
    pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
)

In [16]:
# TODO: loss vs iters, acc vs iters

In [None]:
# Column names for the two plotted axes. The x name is rebuilt from
# `frequency` so it matches the column created with `transformed_data`.
x_col = f"Iterations ({frequency=})"
y_col = "Accuracy"

# One panel per hyperparameter combination, two panels per row, with the
# train and test curves distinguished by colour.
g = sns.FacetGrid(
    transformed_data,
    col="LR & Reg",
    hue="Dataset Type",
    height=5,
    aspect=1.5,
    col_wrap=2,
)
g.map(sns.lineplot, x_col, y_col)
g.add_legend()
g.set_axis_labels(x_col, y_col)

# Leave headroom at the top of the figure for the overall title.
plt.subplots_adjust(top=0.88, wspace=0.2, hspace=0.2)
g.figure.suptitle(
    "Train and Test Accuracy Data for each learning rate and regularization strength"
)
plt.show()