In [1]:
import pickle
import time
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as f
import torch.utils.data as data
import torch.optim as optim

from datasets import load_dataset



In [2]:
# Load the "mnist" dataset and pull its two splits out by name.
mnist = load_dataset("mnist")
train = mnist.get("train")
test = mnist.get("test")

In [3]:
# Return the image/label columns as NumPy arrays when rows are accessed.
train.set_format(type="numpy", columns=["image", "label"])
test.set_format(type="numpy", columns=["image", "label"])
num_train_samples = 10000
num_test_samples = 1000

# Sample without replacement from the *whole* split. Drawing from
# range(num_*_samples) instead would merely permute the first N rows, so the
# rest of the split could never be selected. A seeded generator keeps the
# chosen subset identical across kernel restarts.
rng = np.random.default_rng(0)
train_indices = rng.choice(len(train), size=num_train_samples, replace=False)
test_indices = rng.choice(len(test), size=num_test_samples, replace=False)
train = train.rename_column("image", "input").select(train_indices)
test = test.rename_column("image", "input").select(test_indices)

In [4]:
def preprocess(example):
    """Flatten ``example["input"]`` to one dimension, in place.

    The same mapping object is returned with its ``"input"`` entry replaced by
    the flattened array; all other entries are left untouched.
    """
    example["input"] = np.reshape(example["input"], (-1,))
    return example

# Apply `preprocess` to every example of both splits, two worker processes each.
train, test = (split.map(preprocess, num_proc=2) for split in (train, test))

Map (num_proc=2):   0%|          | 0/10000 [00:00<?, ? examples/s]



Map (num_proc=2):   0%|          | 0/1000 [00:00<?, ? examples/s]



In [5]:
# Fall back to the CPU unless PyTorch reports a usable CUDA device.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Inputs become float32 tensors (singleton dimensions squeezed away);
# labels become int64 tensors.
train_inputs = torch.from_numpy(train["input"]).to(torch.float32).squeeze()
test_inputs = torch.from_numpy(test["input"]).to(torch.float32).squeeze()
train_labels = torch.from_numpy(train["label"]).to(torch.int64)
test_labels = torch.from_numpy(test["label"]).to(torch.int64)


In [6]:
# Pair each input tensor with its label tensor so they can be batched together.
train_dataset, test_dataset = (
    data.TensorDataset(inputs, labels)
    for inputs, labels in (
        (train_inputs, train_labels),
        (test_inputs, test_labels),
    )
)

In [7]:
class Model(nn.Module):
    """Fully connected classifier mapping 784 input features to 10 logits.

    Three hidden stages of width 512, each computed as Linear -> LayerNorm ->
    ReLU. Dropout follows the first (p=0.4) and second (p=0.2) stages, and a
    final linear layer produces the 10 outputs.
    """

    def __init__(self):
        super().__init__()
        in_features = 28 * 28
        hidden = 512
        num_classes = 10

        # Stage 1
        self.linear_1 = nn.Linear(in_features, hidden)
        self.norm_1 = nn.LayerNorm(hidden)
        self.drop_1 = nn.Dropout(p=0.4)
        # Stage 2
        self.linear_2 = nn.Linear(hidden, hidden)
        self.norm_2 = nn.LayerNorm(hidden)
        self.drop_2 = nn.Dropout(p=0.2)
        # Stage 3 (no dropout)
        self.linear_3 = nn.Linear(hidden, hidden)
        self.norm_3 = nn.LayerNorm(hidden)
        # Output projection
        self.linear_4 = nn.Linear(hidden, num_classes)

    def forward(self, x):
        """Return the raw (un-normalised) outputs of the final layer for ``x``."""
        hidden = f.relu(self.norm_1(self.linear_1(x)))
        hidden = self.drop_1(hidden)

        hidden = f.relu(self.norm_2(self.linear_2(hidden)))
        hidden = self.drop_2(hidden)

        hidden = f.relu(self.norm_3(self.linear_3(hidden)))
        return self.linear_4(hidden)


In [8]:
def create_dataloader(dataset, batch_size, shuffle=True, num_workers=2):
    """Wrap ``dataset`` in a ``DataLoader`` that keeps the final partial batch.

    Args:
        dataset: Dataset to iterate over.
        batch_size: Number of samples per batch.
        shuffle: Whether to reshuffle the samples every epoch. Defaults to
            ``True``; pass ``False`` for a fixed iteration order.
        num_workers: Number of loader worker processes (``0`` loads in the
            calling process). Defaults to ``2``.

    Returns:
        The configured ``torch.utils.data.DataLoader``.
    """
    return data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=False,
        num_workers=num_workers,
        # Pinned host memory only helps host-to-GPU copies; requesting it
        # without CUDA just triggers a warning.
        pin_memory=torch.cuda.is_available(),
    )

def train(model, optimizer, train_dataloader, test_dataloader, epochs=10, device=None, verbose=False):
    """Train ``model`` with cross-entropy loss, evaluating after every epoch.

    NOTE: defining this function rebinds the notebook-level name ``train``,
    which earlier cells use for the training split; those earlier cells cannot
    be re-run after this one until the dataset is loaded again.

    Args:
        model: Module to optimise; it is moved to ``device`` first.
        optimizer: Optimiser already bound to ``model``'s parameters.
        train_dataloader: Sized iterable of ``(inputs, labels)`` batches used
            for the optimisation steps.
        test_dataloader: Sized iterable of ``(inputs, labels)`` batches used
            for evaluation only (no gradients are computed).
        epochs: Number of passes over ``train_dataloader``.
        device: Device that the model and every batch are moved to.
        verbose: When true, print start/end messages and a progress line
            about every quarter of the run.

    Returns:
        ``(train_losses, test_losses)``: two lists holding one value per
        epoch, each the mean of the per-batch losses for that epoch.
    """
    if verbose:
        print("Training has started")

    model = model.to(device)
    loss_fn = nn.CrossEntropyLoss()
    train_losses = []
    test_losses = []

    # Progress is reported every quarter of the run. Clamp the interval to at
    # least 1: int(epochs * 0.25) is 0 for epochs < 4, which would otherwise
    # raise ZeroDivisionError in the modulo below.
    report_every = max(1, int(epochs * 0.25))

    for epoch in range(epochs):
        train_loss = 0
        model.train()

        for inputs, labels in train_dataloader:
            inputs, labels = inputs.to(device, non_blocking=True), labels.to(device, non_blocking=True)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        test_loss = 0
        model.eval()
        with torch.no_grad():
            for inputs, labels in test_dataloader:
                inputs, labels = inputs.to(device, non_blocking=True), labels.to(device, non_blocking=True)

                outputs = model(inputs)
                loss = loss_fn(outputs, labels)
                test_loss += loss.item()

        # Average over the number of batches, not the number of samples.
        train_loss /= len(train_dataloader)
        test_loss /= len(test_dataloader)
        train_losses.append(train_loss)
        test_losses.append(test_loss)
        # The first epoch (epoch == 0) is never reported.
        if verbose and epoch and ((epoch + 1) % report_every == 0):
            print(f"Epoch {epoch + 1} complete, train loss: {train_loss:.3f}, test loss: {test_loss:.3f}")

    if verbose:
        print("Training is complete")

    return train_losses, test_losses

## Batch Gradient Descent

In [9]:
# Seed the global RNG so this run's weight initialisation and batch shuffling
# are repeatable after a kernel restart.
torch.manual_seed(0)

# Full-batch gradient descent: a single batch holds the whole training set.
train_dataloader = create_dataloader(train_dataset, batch_size=len(train_dataset))
test_dataloader = create_dataloader(test_dataset, batch_size=len(test_dataset))

model = Model()
optimizer = optim.SGD(model.parameters(), lr=1e-1, momentum=0)

In [10]:
# Count only the parameters that receive gradients.
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Parameters: {trainable_params}")

Parameters: 935434


In [11]:
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted mid-run.
start_time = time.perf_counter()
train_loss, test_loss = train(model, optimizer, train_dataloader, test_dataloader, epochs=100, device=device, verbose=True)
end_time = time.perf_counter()
elapsed_time = end_time - start_time

Training has started
Epoch 25 complete, train loss: 0.666, test loss: 0.643
Epoch 50 complete, train loss: 0.346, test loss: 0.364
Epoch 75 complete, train loss: 0.254, test loss: 0.297
Epoch 100 complete, train loss: 0.201, test loss: 0.260
Training is complete


In [12]:
# Persist the per-epoch loss histories and the elapsed training time.
for path, payload in (
    ("./batch_metrics.pkl", (train_loss, test_loss)),
    ("./batch_time.pkl", elapsed_time),
):
    with open(path, "wb") as file:
        pickle.dump(payload, file)

## Mini-batch Gradient Descent (512)

In [13]:
# Seed the global RNG so this run's weight initialisation and batch shuffling
# are repeatable after a kernel restart.
torch.manual_seed(0)

# Mini-batches of 512 for training; the test set is evaluated in one batch.
train_dataloader = create_dataloader(train_dataset, batch_size=512)
test_dataloader = create_dataloader(test_dataset, batch_size=len(test_dataset))

model = Model()
optimizer = optim.SGD(model.parameters(), lr=1e-1, momentum=0)

In [14]:
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted mid-run.
start_time = time.perf_counter()
train_loss, test_loss = train(model, optimizer, train_dataloader, test_dataloader, epochs=100, device=device, verbose=True)
end_time = time.perf_counter()
elapsed_time = end_time - start_time

Training has started
Epoch 25 complete, train loss: 0.066, test loss: 0.159
Epoch 50 complete, train loss: 0.022, test loss: 0.158
Epoch 75 complete, train loss: 0.013, test loss: 0.163
Epoch 100 complete, train loss: 0.007, test loss: 0.178
Training is complete


In [15]:
# Persist the per-epoch loss histories and the elapsed training time.
for path, payload in (
    ("./mini_batch_512_metrics.pkl", (train_loss, test_loss)),
    ("./mini_batch_512_time.pkl", elapsed_time),
):
    with open(path, "wb") as file:
        pickle.dump(payload, file)

## Mini-batch Gradient Descent (256)

In [17]:
# Seed the global RNG so this run's weight initialisation and batch shuffling
# are repeatable after a kernel restart.
torch.manual_seed(0)

# Mini-batches of 256 for training; the test set is evaluated in one batch.
train_dataloader = create_dataloader(train_dataset, batch_size=256)
test_dataloader = create_dataloader(test_dataset, batch_size=len(test_dataset))

model = Model()
optimizer = optim.SGD(model.parameters(), lr=1e-1, momentum=0)

In [18]:
# 100 epochs; with verbose=True a progress line is printed every 25 epochs.
train_loss, test_loss = train(
    model,
    optimizer,
    train_dataloader,
    test_dataloader,
    epochs=100,
    device=device,
    verbose=True,
)

Training has started
Epoch 25 complete, train loss: 0.109, test loss: 0.741
Epoch 50 complete, train loss: 0.015, test loss: 0.144
Epoch 75 complete, train loss: 0.009, test loss: 0.238
Epoch 100 complete, train loss: 0.008, test loss: 0.157
Training is complete


In [19]:
# Persist the (train, test) per-epoch loss histories of this run.
metrics = (train_loss, test_loss)
with open("./mini_batch_256_metrics.pkl", mode="wb") as file:
    pickle.dump(metrics, file)

## Mini-batch Gradient Descent (128)

In [21]:
# Seed the global RNG so this run's weight initialisation and batch shuffling
# are repeatable after a kernel restart.
torch.manual_seed(0)

# Mini-batches of 128 for training; the test set is evaluated in one batch.
train_dataloader = create_dataloader(train_dataset, batch_size=128)
test_dataloader = create_dataloader(test_dataset, batch_size=len(test_dataset))

model = Model()
optimizer = optim.SGD(model.parameters(), lr=1e-1, momentum=0)

In [22]:
# 100 epochs; with verbose=True a progress line is printed every 25 epochs.
train_loss, test_loss = train(
    model,
    optimizer,
    train_dataloader,
    test_dataloader,
    epochs=100,
    device=device,
    verbose=True,
)

Training has started
Epoch 25 complete, train loss: 0.040, test loss: 0.202
Epoch 50 complete, train loss: 0.009, test loss: 0.151
Epoch 75 complete, train loss: 0.011, test loss: 0.163
Epoch 100 complete, train loss: 0.006, test loss: 0.241
Training is complete


In [23]:
# Persist the (train, test) per-epoch loss histories of this run.
metrics = (train_loss, test_loss)
with open("./mini_batch_128_metrics.pkl", mode="wb") as file:
    pickle.dump(metrics, file)

## Mini-batch Gradient Descent (64)

In [25]:
# Seed the global RNG so this run's weight initialisation and batch shuffling
# are repeatable after a kernel restart.
torch.manual_seed(0)

# Mini-batches of 64 for training; the test set is evaluated in one batch.
train_dataloader = create_dataloader(train_dataset, batch_size=64)
test_dataloader = create_dataloader(test_dataset, batch_size=len(test_dataset))

model = Model()
optimizer = optim.SGD(model.parameters(), lr=1e-1, momentum=0)

In [26]:
# 100 epochs; with verbose=True a progress line is printed every 25 epochs.
train_loss, test_loss = train(
    model,
    optimizer,
    train_dataloader,
    test_dataloader,
    epochs=100,
    device=device,
    verbose=True,
)

Training has started
Epoch 25 complete, train loss: 0.035, test loss: 0.224
Epoch 50 complete, train loss: 0.012, test loss: 0.172
Epoch 75 complete, train loss: 0.009, test loss: 0.168
Epoch 100 complete, train loss: 0.004, test loss: 0.145
Training is complete


In [27]:
# Persist the (train, test) per-epoch loss histories of this run.
metrics = (train_loss, test_loss)
with open("./mini_batch_64_metrics.pkl", mode="wb") as file:
    pickle.dump(metrics, file)

## Mini-batch Gradient Descent (32)

In [29]:
# Seed the global RNG so this run's weight initialisation and batch shuffling
# are repeatable after a kernel restart.
torch.manual_seed(0)

# Mini-batches of 32 for training; the test set is evaluated in one batch.
train_dataloader = create_dataloader(train_dataset, batch_size=32)
test_dataloader = create_dataloader(test_dataset, batch_size=len(test_dataset))

model = Model()
optimizer = optim.SGD(model.parameters(), lr=1e-1, momentum=0)

In [30]:
# 100 epochs; with verbose=True a progress line is printed every 25 epochs.
train_loss, test_loss = train(
    model,
    optimizer,
    train_dataloader,
    test_dataloader,
    epochs=100,
    device=device,
    verbose=True,
)

Training has started
Epoch 25 complete, train loss: 0.036, test loss: 0.143
Epoch 50 complete, train loss: 0.008, test loss: 0.163
Epoch 75 complete, train loss: 0.006, test loss: 0.177
Epoch 100 complete, train loss: 0.003, test loss: 0.168
Training is complete


In [31]:
# Persist the (train, test) per-epoch loss histories of this run.
metrics = (train_loss, test_loss)
with open("./mini_batch_32_metrics.pkl", mode="wb") as file:
    pickle.dump(metrics, file)

## Stochastic Gradient Descent

In [32]:
# Seed the global RNG so this run's weight initialisation and sample shuffling
# are repeatable after a kernel restart.
torch.manual_seed(0)

# Stochastic gradient descent: one sample per optimisation step; the test set
# is still evaluated in one batch.
train_dataloader = create_dataloader(train_dataset, batch_size=1)
test_dataloader = create_dataloader(test_dataset, batch_size=len(test_dataset))

model = Model()
optimizer = optim.SGD(model.parameters(), lr=1e-1, momentum=0)

In [33]:
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted mid-run.
start_time = time.perf_counter()
train_loss, test_loss = train(model, optimizer, train_dataloader, test_dataloader, epochs=100, device=device, verbose=True)
end_time = time.perf_counter()
elapsed_time = end_time - start_time

Training has started
Epoch 25 complete, train loss: 0.058, test loss: 0.155
Epoch 50 complete, train loss: 0.033, test loss: 0.213
Epoch 75 complete, train loss: 0.024, test loss: 0.207
Epoch 100 complete, train loss: 0.014, test loss: 0.220
Training is complete


In [34]:
# Persist the per-epoch loss histories and the elapsed training time.
for path, payload in (
    ("./stochastic_metrics.pkl", (train_loss, test_loss)),
    ("./stochastic_time.pkl", elapsed_time),
):
    with open(path, "wb") as file:
        pickle.dump(payload, file)