In [30]:
import pickle
import time
import pandas as pd
import numpy as np
import random


In [31]:
train_file = "data/fii-nn-2025-homework-2/extended_mnist_train.pkl"
test_file = "data/fii-nn-2025-homework-2/extended_mnist_test.pkl"

with open(train_file, "rb") as fp:
    train = pickle.load(fp)

with open(test_file, "rb") as fp:
    test = pickle.load(fp)

In [32]:
train_data = []
train_labels = []
for image, label in train:
    train_data.append(image.flatten() / 255.0)
    train_labels.append(label)


In [34]:
test_data = []
for image, label in test:
    test_data.append(image.flatten() / 255.0)


In [35]:
def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def sigmoid_prime(z):
    return sigmoid(z) * (1 - sigmoid(z))

def relu(z):
    return np.maximum(0, z)
def relu_prime(z):
    return (z > 0).astype(float)
def softmax(z):
    exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))
    return exp_z / np.sum(exp_z, axis=1, keepdims=True)

In [64]:
X = np.array(train_data)
Y = np.array(train_labels)


n_inputs = 784
n_hidden = 100
n_outputs = 10
n_epochs = 100
batch_size = 64
lr = 0.01
dropout_rate = 0.5

limit_W1 = np.sqrt(6 / (n_inputs + n_hidden))
W1 = np.random.uniform(-limit_W1, limit_W1, (n_inputs, n_hidden))
B1 = np.zeros(n_hidden)

limit_W2 = np.sqrt(6 / (n_hidden + n_outputs))
W2 = np.random.uniform(-limit_W2, limit_W2, (n_hidden, n_outputs))
B2 = np.zeros(n_outputs)
Y_binary = np.zeros((Y.shape[0], n_outputs))


for i in range(n_outputs):
    Y_binary[:, i] = np.where(Y == i, 1, 0)

start_time = time.time()
for epoch in range(n_epochs):
    indices = np.arange(X.shape[0])
    np.random.shuffle(indices)
    X_shuffled = X[indices]
    Y_shuffled = Y_binary[indices]



    for i in range(0, X.shape[0], batch_size):
        X_batch =X_shuffled[i : i + batch_size]
        Y_batch = Y_shuffled[i : i + batch_size]
        dim_batch = X_batch.shape[0]

        z1 = np.dot(X_batch, W1) + B1
        a1 = relu(z1)

        z2 = np.dot(a1, W2) + B2
        a2 = softmax(z2)

        dropout_mask = (np.random.rand(*a1.shape) > dropout_rate).astype(float)
        a1 *= dropout_mask
        a1 /= (1.0 - dropout_rate)

        loss = -np.sum(Y_batch * np.log(a2 + 1e-9)) / dim_batch

        d2 = a2 - Y_batch
        dW2 = np.dot(a1.T, d2) / dim_batch
        dB2 = np.sum(d2, axis=0) / dim_batch

        d1_prop = np.dot(d2, W2.T)
        d1 = d1_prop * relu_prime(z1)

        d1 *= dropout_mask
        d1 /= (1.0 - dropout_rate)

        dW1 = np.dot(X_batch.T, d1) / dim_batch
        dB1 = np.sum(d1, axis=0) / dim_batch

        W2 -= lr * dW2
        B2 -= lr * dB2
        W1 -= lr * dW1
        B1 -= lr * dB1

    if epoch % 10 == 0 or epoch == n_epochs - 1:
        print(f"Epoch {epoch}, Loss: {loss:.4f}")
    W = W2
    B = B2
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Total training time: {elapsed_time:.2f} seconds")


Epoch 0, Loss: 0.5210
Epoch 10, Loss: 0.0294
Epoch 20, Loss: 0.1611
Epoch 30, Loss: 0.1129
Epoch 40, Loss: 0.0494
Epoch 50, Loss: 0.1395
Epoch 60, Loss: 0.0696
Epoch 70, Loss: 0.0448
Epoch 80, Loss: 0.0461
Epoch 90, Loss: 0.0165
Epoch 99, Loss: 0.0927
Total training time: 152.41 seconds


In [65]:
def predict(X_test, W1, B1, W2, B2):
    z1 = np.dot(X_test, W1) + B1
    a1 = relu(z1)

    z2 = np.dot(a1, W2) + B2
    P = softmax(z2)

    return np.argmax(P, axis=1)

In [66]:
predictions = predict(test_data, W1, B1, W2, B2)


In [67]:
# This is how you prepare a submission for the competition
predictions_csv = {
    "ID": [],
    "target": [],
}

for i, label in enumerate(predictions):
    predictions_csv["ID"].append(i)
    predictions_csv["target"].append(label)

df = pd.DataFrame(predictions_csv)
df.to_csv("submission.csv", index=False)