# Assignment 1, Yifan Han, Oct 17 2024

## Question 1

In [7]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
import time

training_data = datasets.FashionMNIST(
    root="data",
    train=True,
    download=True,
    transform=ToTensor()
)

test_data = datasets.FashionMNIST(
    root="data",
    train=False,
    download=True,
    transform=ToTensor()
)

train_dataloader = DataLoader(training_data, batch_size=64)
test_dataloader = DataLoader(test_data, batch_size=64)

class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28*28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = NeuralNetwork().to(device)  # Move the model to the GPU/CPU

+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.90.07              Driver Version: 550.90.07      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  Tesla T4                       On  |   00000000:00:04.0 Off |                    0 |
| N/A   46C    P8              9W /   70W |       1MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                                                         
+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|  No running processes found                                                             |
+-----------------------------------------------------------------------------------------+

In [2]:
# Which GPU you used? How much memory does it have?

print('The GPU is an NVIDIA Tesla T4. The memory of the GPU is 15,360 MiB (15 GB), as seen in the "Memory-Usage" section.')

The GPU is an NVIDIA Tesla T4. The memory of the GPU is 15,360 MiB (15 GB), as seen in the "Memory-Usage" section.


In [3]:
# Confirm inside jupyter or python that your PyTorch installation supports GPU.
print(torch.cuda.is_available())  

True


In [10]:
learning_rate = 1e-3
batch_size = 64

# Initialize the loss function
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    # Set the model to training mode - important for batch normalization and dropout layers
    # Unnecessary in this situation but added for best practices
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)
        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            loss, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")


def test_loop(dataloader, model, loss_fn):
    # Set the model to evaluation mode - important for batch normalization and dropout layers
    # Unnecessary in this situation but added for best practices
    model.eval()
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    # Evaluating the model with torch.no_grad() ensures that no gradients are computed during test mode
    # also serves to reduce unnecessary gradient computations and memory usage for tensors with requires_grad=True
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [6]:
# Run using GPU
# Increase the number of epochs to at least 30.
epochs = 30
total_time_gpu = 0

for t in range(epochs):
    start_time = time.time()  # Start timer

    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_dataloader, model, loss_fn, optimizer)
    test_loop(test_dataloader, model, loss_fn)

    end_time = time.time()  # End timer
    epoch_time = end_time - start_time  # Time taken for this epoch
    total_time_gpu += epoch_time

print("Done!")

Epoch 1
-------------------------------
loss: 2.308072  [   64/60000]
loss: 2.290076  [ 6464/60000]
loss: 2.271990  [12864/60000]
loss: 2.268460  [19264/60000]
loss: 2.243478  [25664/60000]
loss: 2.221152  [32064/60000]
loss: 2.224496  [38464/60000]
loss: 2.192322  [44864/60000]
loss: 2.193143  [51264/60000]
loss: 2.159572  [57664/60000]
Test Error: 
 Accuracy: 50.5%, Avg loss: 2.153347 

Epoch 2
-------------------------------
loss: 2.168822  [   64/60000]
loss: 2.156839  [ 6464/60000]
loss: 2.094479  [12864/60000]
loss: 2.117905  [19264/60000]
loss: 2.053223  [25664/60000]
loss: 1.998184  [32064/60000]
loss: 2.026289  [38464/60000]
loss: 1.944067  [44864/60000]
loss: 1.952070  [51264/60000]
loss: 1.877596  [57664/60000]
Test Error: 
 Accuracy: 54.5%, Avg loss: 1.874624 

Epoch 3
-------------------------------
loss: 1.915162  [   64/60000]
loss: 1.884786  [ 6464/60000]
loss: 1.754298  [12864/60000]
loss: 1.807019  [19264/60000]
loss: 1.685389  [25664/60000]
loss: 1.635760  [32064/600

Confirm with “nvidia-smi -l” that you were indeed using GPU when the program was running.
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.90.07              Driver Version: 550.90.07      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  Tesla T4                       On  |   00000000:00:04.0 Off |                    0 |
| N/A   62C    P0             29W /   70W |     161MiB /  15360MiB |      5%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                                                         
+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
+-----------------------------------------------------------------------------------------+


In [11]:
# Run using CPU
device = 'cpu'

# Move the model to the CPU
model = model.to(device)

# Initialize the loss function
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

total_time_cpu = 0

for t in range(epochs):
    start_time = time.time()  # Start timer

    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_dataloader, model, loss_fn, optimizer)
    test_loop(test_dataloader, model, loss_fn)

    end_time = time.time()  # End timer
    epoch_time = end_time - start_time  # Time taken for this epoch
    total_time_cpu += epoch_time

print("Done!")

Epoch 1
-------------------------------
loss: 2.306399  [   64/60000]
loss: 2.294306  [ 6464/60000]
loss: 2.285729  [12864/60000]
loss: 2.279481  [19264/60000]
loss: 2.271997  [25664/60000]
loss: 2.234895  [32064/60000]
loss: 2.237722  [38464/60000]
loss: 2.212082  [44864/60000]
loss: 2.204960  [51264/60000]
loss: 2.180465  [57664/60000]
Test Error: 
 Accuracy: 45.6%, Avg loss: 2.174642 

Epoch 2
-------------------------------
loss: 2.180748  [   64/60000]
loss: 2.166188  [ 6464/60000]
loss: 2.122184  [12864/60000]
loss: 2.140046  [19264/60000]
loss: 2.095920  [25664/60000]
loss: 2.032320  [32064/60000]
loss: 2.056299  [38464/60000]
loss: 1.987244  [44864/60000]
loss: 1.986756  [51264/60000]
loss: 1.925987  [57664/60000]
Test Error: 
 Accuracy: 58.6%, Avg loss: 1.920554 

Epoch 3
-------------------------------
loss: 1.949208  [   64/60000]
loss: 1.906153  [ 6464/60000]
loss: 1.809785  [12864/60000]
loss: 1.852741  [19264/60000]
loss: 1.747474  [25664/60000]
loss: 1.691912  [32064/600

In [13]:
# How much faster does it run on GPU?
print(round(total_time_cpu-total_time_gpu,2),'seconds')

21.02 seconds


In [10]:
# What accuracy is achieved after 30 epochs?
print('The accuracy is 80.9% after 30 epochs')

The accuracy is 80.9% after 30 epochs


## Question 2

In [18]:
import torch

device = "cuda"

# Get the total memory available on the GPU
gpu_total_memory = torch.cuda.get_device_properties(device).total_memory

# Loop to increase matrix size until GPU memory runs out
for iteration in range(0,500):
    matrix_size = 100 + iteration * 1000  # Increase matrix size progressively
    
    # Generate two random matrices of the current size
    matrix_A = torch.randn(matrix_size, matrix_size, device=device)
    matrix_B = torch.randn(matrix_size, matrix_size, device=device)
    
    # Perform matrix multiplication
    matrix_product = torch.matmul(matrix_A, matrix_B)
    
    # Get the currently allocated memory on the GPU
    current_memory_usage = torch.cuda.memory_allocated(device)
    
    # Calculate the percentage of memory used
    memory_usage_percentage = current_memory_usage / gpu_total_memory * 100
    
    # Print the current matrix size and memory usage
    print(f"Matrix Size: {matrix_size}x{matrix_size}, "
          f"Memory Usage: {current_memory_usage} / {gpu_total_memory} "
          f"({memory_usage_percentage:.4f}%)")
    
    # Check if the GPU memory usage exceeds 99%
    if memory_usage_percentage > 99:
        print(f"Memory limit reached at matrix size: {matrix_size}")
        break
    
    # Clean up by deleting the matrix product and freeing memory
    del matrix_product
    torch.cuda.empty_cache()

Matrix Size: 100x100, Memory Usage: 8641024 / 15642329088 (0.0552%)
Matrix Size: 1100x1100, Memory Usage: 23041024 / 15642329088 (0.1473%)
Matrix Size: 2100x2100, Memory Usage: 61441024 / 15642329088 (0.3928%)
Matrix Size: 3100x3100, Memory Usage: 123841024 / 15642329088 (0.7917%)
Matrix Size: 4100x4100, Memory Usage: 210241024 / 15642329088 (1.3441%)
Matrix Size: 5100x5100, Memory Usage: 323092480 / 15642329088 (2.0655%)
Matrix Size: 6100x6100, Memory Usage: 455213056 / 15642329088 (2.9101%)
Matrix Size: 7100x7100, Memory Usage: 613441024 / 15642329088 (3.9217%)
Matrix Size: 8100x8100, Memory Usage: 795841024 / 15642329088 (5.0877%)
Matrix Size: 9100x9100, Memory Usage: 1002569728 / 15642329088 (6.4093%)
Matrix Size: 10100x10100, Memory Usage: 1235353600 / 15642329088 (7.8975%)
Matrix Size: 11100x11100, Memory Usage: 1487041024 / 15642329088 (9.5065%)
Matrix Size: 12100x12100, Memory Usage: 1765441024 / 15642329088 (11.2863%)
Matrix Size: 13100x13100, Memory Usage: 2067841024 / 156423

OutOfMemoryError: CUDA out of memory. Tried to allocate 4.86 GiB (GPU 0; 14.57 GiB total capacity; 9.72 GiB already allocated; 4.73 GiB free; 9.73 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [3]:
# Set device (GPU or CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# List of 5 matrix sizes (including the maximum size that GPU can handle)
matrix_sizes = [1000, 5000, 10000, 20000, 35100]  # Modify this based on your GPU limits

# Function to measure time for matrix multiplication on GPU and CPU
def measure_time(matrix_size):
    # Generate random matrices on the GPU
    matrix_A = torch.randn(matrix_size, matrix_size, device=device)
    matrix_B = torch.randn(matrix_size, matrix_size, device=device)

    # GPU multiplication
    start_gpu = time.time()
    result_gpu = torch.matmul(matrix_A, matrix_B)
    end_gpu = time.time()
    gpu_time = end_gpu - start_gpu
    
    torch.cuda.empty_cache()

    # Move matrices to CPU for CPU multiplication
    A_cpu = matrix_A.to('cpu')
    B_cpu = matrix_B.to('cpu')

    # CPU multiplication
    start_cpu = time.time()
    result_cpu = torch.matmul(A_cpu, B_cpu)
    end_cpu = time.time()
    cpu_time = end_cpu - start_cpu

    return gpu_time, cpu_time

# Run the experiment for each matrix size and repeat 3 times
for matrix_size in matrix_sizes:
    gpu_times = []
    cpu_times = []
    for _ in range(3):
        gpu_time, cpu_time = measure_time(matrix_size)
        gpu_times.append(gpu_time)
        cpu_times.append(cpu_time)
    
    # Calculate and print the average time for GPU and CPU
    print(f"Matrix size: {matrix_size}x{matrix_size}")
    print(f"Average GPU duration: {sum(gpu_times) / 3:.2f} seconds")
    print(f"Average CPU duration: {sum(cpu_times) / 3:.2f} seconds")
    print('----------------------------')


Matrix size: 1000x1000
Average GPU duration: 0.01 seconds
Average CPU duration: 0.01 seconds
----------------------------
Matrix size: 5000x5000
Average GPU duration: 0.00 seconds
Average CPU duration: 0.40 seconds
----------------------------
Matrix size: 10000x10000
Average GPU duration: 0.00 seconds
Average CPU duration: 3.15 seconds
----------------------------
Matrix size: 20000x20000
Average GPU duration: 0.00 seconds
Average CPU duration: 25.34 seconds
----------------------------
Matrix size: 35100x35100
Average GPU duration: 0.08 seconds
Average CPU duration: 136.83 seconds
----------------------------


+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.90.07              Driver Version: 550.90.07      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  Tesla T4                       On  |   00000000:00:04.0 Off |                    0 |
| N/A   82C    P0             70W /   70W |   14235MiB /  15360MiB |    100%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                                                         
+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
+-----------------------------------------------------------------------------------------+


In [4]:
# What was the load on GPU? How much GPU memory did you program use?
print('GPU load is 100%.')
print('Memory Usage: 14,235 MiB / 15,360 MiB')

GPU load is 100%.
Memory Usage: 14,235 MiB / 15,360 MiB


## Question 3

In [8]:
!wget -nc https://download.pytorch.org/tutorial/hymenoptera_data.zip

In [7]:
!rm -rf data; mkdir -p data; cd data; unzip ../hymenoptera_data.zip

In [10]:
# Data augmentation and normalization for training
# Just normalization for validation
data_transforms = {
    'train': transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    'val': transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}

data_dir = 'data/hymenoptera_data'
image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x),
                                          data_transforms[x])
                  for x in ['train', 'val']}
dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=4,
                                             shuffle=True, num_workers=4)
              for x in ['train', 'val']}
dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}
class_names = image_datasets['train'].classes

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data
from torchvision import datasets, transforms
import time
import os
from tempfile import TemporaryDirectory

# Define the custom CNN architecture
class CustomCNN(nn.Module):
    def __init__(self, num_classes=2):
        super(CustomCNN, self).__init__()
        # Convolutional layers
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        
        # Pooling layer
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        
        # Batch normalization layers
        self.bn1 = nn.BatchNorm2d(32)
        self.bn2 = nn.BatchNorm2d(64)
        self.bn3 = nn.BatchNorm2d(128)
        
        # Dropout layer
        self.dropout = nn.Dropout(p=0.5)
        
        # Fully connected layers
        self.fc1 = nn.Linear(128 * 28 * 28, 512)  # Assuming input image size is 224x224
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, num_classes)

    def forward(self, x):
        # Forward pass through the network
        x = self.pool(self.bn1(torch.relu(self.conv1(x))))
        x = self.pool(self.bn2(torch.relu(self.conv2(x))))
        x = self.pool(self.bn3(torch.relu(self.conv3(x))))
        
        # Flatten the output
        x = x.view(-1, 128 * 28 * 28)
        
        # Fully connected layers with dropout
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        
        return x

# Data augmentation and normalization
data_transforms = {
    'train': transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    'val': transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}

data_dir = 'data/hymenoptera_data'
image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x),
                                          data_transforms[x])
                  for x in ['train', 'val']}
dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=4,
                                             shuffle=True, num_workers=4)
              for x in ['train', 'val']}
dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}
class_names = image_datasets['train'].classes

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Instantiate the model, define criterion, optimizer, and scheduler
model = CustomCNN(num_classes=len(class_names))
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

# Function to train the model
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    since = time.time()

    # Create a temporary directory to save training checkpoints
    with TemporaryDirectory() as tempdir:
        best_model_params_path = os.path.join(tempdir, 'best_model_params.pt')
    
        best_acc = 0.0

        for epoch in range(num_epochs):
            print(f'Epoch {epoch}/{num_epochs - 1}')
            print('-' * 10)

            # Each epoch has a training and validation phase
            for phase in ['train', 'val']:
                if phase == 'train':
                    model.train()  # Set model to training mode
                else:
                    model.eval()   # Set model to evaluate mode

                running_loss = 0.0
                running_corrects = 0

                # Iterate over data.
                for inputs, labels in dataloaders[phase]:
                    inputs = inputs.to(device)
                    labels = labels.to(device)

                    # zero the parameter gradients
                    optimizer.zero_grad()

                    # forward
                    # track history if only in train
                    with torch.set_grad_enabled(phase == 'train'):
                        outputs = model(inputs)
                        _, preds = torch.max(outputs, 1)
                        loss = criterion(outputs, labels)

                        # backward + optimize only if in training phase
                        if phase == 'train':
                            loss.backward()
                            optimizer.step()

                    # statistics
                    running_loss += loss.item() * inputs.size(0)
                    running_corrects += torch.sum(preds == labels.data)
                if phase == 'train':
                    scheduler.step()

                epoch_loss = running_loss / dataset_sizes[phase]
                epoch_acc = running_corrects.double() / dataset_sizes[phase]

                print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

                # deep copy the model
                if phase == 'val' and epoch_acc > best_acc:
                    best_acc = epoch_acc
                    torch.save(model.state_dict(), best_model_params_path)

            print()

        time_elapsed = time.time() - since
        print(f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
        print(f'Best val Acc: {best_acc:4f}')

        # load best model weights
        model.load_state_dict(torch.load(best_model_params_path))
    return model

# Train the model
num_epochs = 25
trained_model = train_model(model, criterion, optimizer, scheduler, num_epochs=num_epochs)


Epoch 0/24
----------
train Loss: 0.8942 Acc: 0.5000
val Loss: 0.6939 Acc: 0.5621

Epoch 1/24
----------
train Loss: 0.7341 Acc: 0.5820
val Loss: 0.7231 Acc: 0.5686

Epoch 2/24
----------
train Loss: 0.7771 Acc: 0.5779
val Loss: 0.6783 Acc: 0.5425

Epoch 3/24
----------
train Loss: 0.6935 Acc: 0.6066
val Loss: 0.6131 Acc: 0.6797

Epoch 4/24
----------
train Loss: 0.6814 Acc: 0.5902
val Loss: 0.6164 Acc: 0.7255

Epoch 5/24
----------
train Loss: 0.7380 Acc: 0.5492
val Loss: 0.6299 Acc: 0.6667

Epoch 6/24
----------
train Loss: 0.6694 Acc: 0.5902
val Loss: 0.6025 Acc: 0.7059

Epoch 7/24
----------
train Loss: 0.6183 Acc: 0.6189
val Loss: 0.5987 Acc: 0.7451

Epoch 8/24
----------
train Loss: 0.6026 Acc: 0.6721
val Loss: 0.5920 Acc: 0.7320

Epoch 9/24
----------
train Loss: 0.6033 Acc: 0.6680
val Loss: 0.5895 Acc: 0.7190

Epoch 10/24
----------
train Loss: 0.6086 Acc: 0.6680
val Loss: 0.5705 Acc: 0.7320

Epoch 11/24
----------
train Loss: 0.5994 Acc: 0.7049
val Loss: 0.5872 Acc: 0.7255

Ep

In [1]:
# What accuracy did you manage to achieve?
print('75.8%')

75.8%


## Question 4

In [7]:
import pandas as pd
import torch
import random
import string
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Input, LSTM, Dense, Attention, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from tqdm import tqdm

# Generate date-based data
date_range = pd.date_range(start='1950-01-01', end='2050-12-31', freq='D')

# Input data: list of date strings
input_dates = date_range.strftime('%Y-%m-%d').tolist()

# Output data: formatted date strings with '<eos>' token
target_dates = [date.strftime('%d-%m-%Y') + '<eos>' for date in date_range]

# Shuffle the data
shuffled_data = list(zip(input_dates, target_dates))
random.shuffle(shuffled_data)
input_dates, target_dates = zip(*shuffled_data)
input_dates = list(input_dates)
target_dates = list(target_dates)

# Split into train, validation, and test sets
total_samples = len(input_dates)
test_size = int(0.1 * total_samples)
val_size = int(0.1 * total_samples)
train_size = total_samples - test_size - val_size

train_inputs = input_dates[:train_size]
train_targets = target_dates[:train_size]
val_inputs = input_dates[train_size:train_size + val_size]
val_targets = target_dates[train_size:train_size + val_size]
test_inputs = input_dates[train_size + val_size:]
test_targets = target_dates[train_size + val_size:]

special_tokens = ['<pad>', '<eos>'] 

# Tokenize characters, add special tokens, and create mappings
unique_characters = set(''.join(train_inputs + train_targets + val_inputs + val_targets + test_inputs + test_targets))
unique_tokens = sorted(unique_characters.union(set(special_tokens)))
unique_tokens = sorted(set(unique_tokens) - {'<pad>'})

token2idx = {'<pad>': 0}
for idx, token in enumerate(unique_tokens):
    token2idx[token] = idx + 1  # Start from 1
idx2token = {idx: token for token, idx in token2idx.items()}

num_tokens = len(token2idx)
embedding_dim = 128
hidden_size = 256

# Function to encode date strings into token indices
def encode_dates(dates, token2idx, special_tokens):
    encoded_sequences = []

    for date in dates:
        tokens = []
        i = 0
        while i < len(date):
            matched = False
            for token in special_tokens:
                if date[i:i+len(token)] == token:
                    tokens.append(token)
                    i += len(token)
                    matched = True
                    break
            if not matched:
                tokens.append(date[i])
                i += 1
        encoded_sequences.append([token2idx[token] for token in tokens])

    return encoded_sequences

# Encode and pad the sequences
train_inputs_encoded = encode_dates(train_inputs, token2idx, special_tokens)
train_targets_encoded = encode_dates(train_targets, token2idx, special_tokens)

max_input_length = max([len(seq) for seq in train_inputs_encoded])
max_target_length = max([len(seq) for seq in train_targets_encoded])

train_inputs_padded = pad_sequences(
    train_inputs_encoded, maxlen=max_input_length, padding='post', value=token2idx['<pad>']
)
train_targets_padded = pad_sequences(
    train_targets_encoded, maxlen=max_target_length, padding='post', value=token2idx['<pad>']
)

train_decoder_inputs = np.full_like(train_targets_padded, fill_value=token2idx['<pad>'])
train_decoder_inputs[:, 1:] = train_targets_padded[:, :-1]

train_decoder_targets = np.expand_dims(train_targets_padded, -1)

# Validation encoding and padding
val_inputs_encoded = encode_dates(val_inputs, token2idx, special_tokens)
val_targets_encoded = encode_dates(val_targets, token2idx, special_tokens)

val_inputs_padded = pad_sequences(
    val_inputs_encoded, maxlen=max_input_length, padding='post', value=token2idx['<pad>']
)
val_targets_padded = pad_sequences(
    val_targets_encoded, maxlen=max_target_length, padding='post', value=token2idx['<pad>']
)

val_decoder_inputs = np.full_like(val_targets_padded, fill_value=token2idx['<pad>'])
val_decoder_inputs[:, 1:] = val_targets_padded[:, :-1]
val_decoder_targets = np.expand_dims(val_targets_padded, -1)

# Test encoding and padding
test_inputs_encoded = encode_dates(test_inputs, token2idx, special_tokens)
test_targets_encoded = encode_dates(test_targets, token2idx, special_tokens)

test_inputs_padded = pad_sequences(
    test_inputs_encoded, maxlen=max_input_length, padding='post', value=token2idx['<pad>']
)
test_targets_padded = pad_sequences(
    test_targets_encoded, maxlen=max_target_length, padding='post', value=token2idx['<pad>']
)

# Dataset class to handle batching
from torch.utils.data import DataLoader, Dataset

class DateTranslationDataset(Dataset):
    def __init__(self, encoder_inputs, decoder_inputs=None, decoder_targets=None):
        self.encoder_inputs = encoder_inputs
        self.decoder_inputs = decoder_inputs
        self.decoder_targets = decoder_targets

    def __len__(self):
        return len(self.encoder_inputs)

    def __getitem__(self, idx):
        sample = {'encoder_input': self.encoder_inputs[idx]}
        
        if self.decoder_inputs is not None and self.decoder_targets is not None:
            sample['decoder_input'] = self.decoder_inputs[idx]
            sample['decoder_target'] = self.decoder_targets[idx]
        
        return sample




In [9]:
import torch.nn as nn

class LSTMEncoder(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, dropout=0.2):
        super(LSTMEncoder, self).__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.dropout = nn.Dropout(dropout)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=1,
            batch_first=True,
            dropout=dropout
        )

    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, (hidden, cell) = self.lstm(embedded)
        return hidden, cell

class LSTMDecoder(nn.Module):
    def __init__(self, output_dim, embedding_dim, hidden_dim, dropout=0.2):
        super(LSTMDecoder, self).__init__()
        self.embedding = nn.Embedding(output_dim, embedding_dim)
        self.dropout = nn.Dropout(dropout)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=1,
            batch_first=True,
            dropout=dropout
        )
        self.fc_out = nn.Linear(hidden_dim, output_dim)

    def forward(self, trg, hidden, cell):
        embedded = self.dropout(self.embedding(trg))
        outputs, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        predictions = self.fc_out(outputs)
        return predictions

class Seq2SeqLSTM(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2SeqLSTM, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, trg):
        hidden, cell = self.encoder(src)
        outputs = self.decoder(trg, hidden, cell)
        return outputs

# Instantiate model components
encoder = LSTMEncoder(input_dim=num_tokens, embedding_dim=embedding_dim, hidden_dim=hidden_size)
decoder = LSTMDecoder(output_dim=num_tokens, embedding_dim=embedding_dim, hidden_dim=hidden_size)

model = Seq2SeqLSTM(encoder, decoder).to(device)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=token2idx['<pad>'])
optimizer = torch.optim.Adam(model.parameters())

In [10]:
# Convert data to PyTorch tensors
train_inputs_padded = torch.tensor(train_inputs_padded, dtype=torch.long)
train_targets_padded = torch.tensor(train_targets_padded, dtype=torch.long)
train_decoder_inputs = torch.tensor(train_decoder_inputs, dtype=torch.long)
train_decoder_targets = torch.tensor(train_decoder_targets, dtype=torch.long)

val_inputs_padded = torch.tensor(val_inputs_padded, dtype=torch.long)
val_targets_padded = torch.tensor(val_targets_padded, dtype=torch.long)
val_decoder_inputs = torch.tensor(val_decoder_inputs, dtype=torch.long)
val_decoder_targets = torch.tensor(val_decoder_targets, dtype=torch.long)

test_inputs_padded = torch.tensor(test_inputs_padded, dtype=torch.long)
test_targets_padded = torch.tensor(test_targets_padded, dtype=torch.long)

# Create datasets
train_dataset = DateTranslationDataset(
    encoder_inputs=train_inputs_padded,
    decoder_inputs=train_decoder_inputs,
    decoder_targets=train_decoder_targets
)

val_dataset = DateTranslationDataset(
    encoder_inputs=val_inputs_padded,
    decoder_inputs=val_decoder_inputs,
    decoder_targets=val_decoder_targets
)

test_dataset = DateTranslationDataset(
    encoder_inputs=test_inputs_padded,
    decoder_inputs=None,
    decoder_targets=None
)

batch_size = 64

train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2
)

val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=2
)

test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=2
)



  train_inputs_padded = torch.tensor(train_inputs_padded, dtype=torch.long)
  train_targets_padded = torch.tensor(train_targets_padded, dtype=torch.long)
  train_decoder_inputs = torch.tensor(train_decoder_inputs, dtype=torch.long)
  train_decoder_targets = torch.tensor(train_decoder_targets, dtype=torch.long)
  val_inputs_padded = torch.tensor(val_inputs_padded, dtype=torch.long)
  val_targets_padded = torch.tensor(val_targets_padded, dtype=torch.long)
  val_decoder_inputs = torch.tensor(val_decoder_inputs, dtype=torch.long)
  val_decoder_targets = torch.tensor(val_decoder_targets, dtype=torch.long)
  test_inputs_padded = torch.tensor(test_inputs_padded, dtype=torch.long)
  test_targets_padded = torch.tensor(test_targets_padded, dtype=torch.long)


In [11]:
# Training and validation loop
def train_epoch(model, train_loader, criterion, optimizer, device):
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader):
        src = batch['encoder_input'].to(device)
        trg = batch['decoder_input'].to(device)
        trg_labels = batch['decoder_target'].to(device)

        optimizer.zero_grad()

        output = model(src, trg)

        output = output.view(-1, output.shape[-1])
        trg_labels = trg_labels.view(-1)

        loss = criterion(output, trg_labels)
        loss.backward()

        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(train_loader)

def validate_epoch(model, val_loader, criterion, device):
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for batch in tqdm(val_loader):
            src = batch['encoder_input'].to(device)
            trg = batch['decoder_input'].to(device)
            trg_labels = batch['decoder_target'].to(device)

            output = model(src, trg)

            output = output.view(-1, output.shape[-1])
            trg_labels = trg_labels.view(-1)

            loss = criterion(output, trg_labels)
            total_loss += loss.item()

    return total_loss / len(val_loader)

# Training loop with validation
num_epochs = 10

for epoch in range(num_epochs):
    train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
    val_loss = validate_epoch(model, val_loader, criterion, device)

    print(f"Epoch {epoch+1}/{num_epochs}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}")


100% 462/462 [00:02<00:00, 189.98it/s]
100% 58/58 [00:00<00:00, 240.90it/s]


Epoch 1/10: Train Loss = 0.2471, Val Loss = 0.0038


100% 462/462 [00:02<00:00, 198.57it/s]
100% 58/58 [00:00<00:00, 230.32it/s]


Epoch 2/10: Train Loss = 0.0036, Val Loss = 0.0008


100% 462/462 [00:02<00:00, 200.53it/s]
100% 58/58 [00:00<00:00, 243.19it/s]


Epoch 3/10: Train Loss = 0.0012, Val Loss = 0.0004


100% 462/462 [00:02<00:00, 190.51it/s]
100% 58/58 [00:00<00:00, 243.63it/s]


Epoch 4/10: Train Loss = 0.0006, Val Loss = 0.0002


100% 462/462 [00:02<00:00, 195.49it/s]
100% 58/58 [00:00<00:00, 240.07it/s]


Epoch 5/10: Train Loss = 0.0007, Val Loss = 0.0021


100% 462/462 [00:02<00:00, 201.19it/s]
100% 58/58 [00:00<00:00, 237.03it/s]


Epoch 6/10: Train Loss = 0.0009, Val Loss = 0.0001


100% 462/462 [00:02<00:00, 199.05it/s]
100% 58/58 [00:00<00:00, 258.62it/s]


Epoch 7/10: Train Loss = 0.0001, Val Loss = 0.0001


100% 462/462 [00:02<00:00, 194.90it/s]
100% 58/58 [00:00<00:00, 232.12it/s]


Epoch 8/10: Train Loss = 0.0001, Val Loss = 0.0000


100% 462/462 [00:02<00:00, 196.09it/s]
100% 58/58 [00:00<00:00, 237.60it/s]


Epoch 9/10: Train Loss = 0.0001, Val Loss = 0.0000


100% 462/462 [00:02<00:00, 195.62it/s]
100% 58/58 [00:00<00:00, 239.17it/s]

Epoch 10/10: Train Loss = 0.0000, Val Loss = 0.0000





In [12]:
# Test a specific date
def test_specific_date(model, date, token2idx, idx2token, device):
    model.eval()
    
    # Encode the input date
    encoded_input = encode_dates([date], token2idx, special_tokens)
    input_padded = pad_sequences(encoded_input, maxlen=max_input_length, padding='post', value=token2idx['<pad>'])
    input_tensor = torch.tensor(input_padded, dtype=torch.long).to(device)

    # Prepare the decoder input with the start token (<eos>)
    start_token = token2idx['<eos>']
    trg = torch.full((input_tensor.size(0), 1), start_token, dtype=torch.long).to(device)  # Start with <eos>

    with torch.no_grad():
        hidden, cell = model.encoder(input_tensor)  # Get the hidden state from the encoder
        outputs = []

        for _ in range(max_target_length):  # Generate up to the maximum target length
            output = model.decoder(trg, hidden, cell)  # Pass the target sequence and the hidden state
            top1 = output[:, -1, :].argmax(1)  # Get the most probable next token
            outputs.append(top1.item())  # Collect the output
            trg = torch.cat((trg, top1.unsqueeze(1)), dim=1)  # Append predicted token to the target sequence

        # Decode the output tokens back to strings
        decoded_dates = ''.join([idx2token[idx] for idx in outputs if idx not in {token2idx['<pad>'], token2idx['<eos>']}])
        print(f"Input date: {date} -> Predicted date: {decoded_dates}")

# Run the test for 2025-01-05
test_specific_date(model, '2025-01-05', token2idx, idx2token, device)


Input date: 2025-01-05 -> Predicted date: 05-01-2025
