In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.backends.cudnn as cudnn
import torchvision
import torchvision.transforms as transforms
import pandas as pd
import os
import pickle
import numpy as np
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import matplotlib.pyplot as plt

## Resnet Architecture Setup

In [2]:
class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion*planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion*planes)
            )

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += self.shortcut(x)
        out = F.relu(out)
        return out


class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, in_planes, planes, stride=1):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, self.expansion * planes, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(self.expansion*planes)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion*planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion*planes,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion*planes)
            )

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = F.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))
        out += self.shortcut(x)
        out = F.relu(out)
        return out

In [3]:
class ResNet(nn.Module):
    def __init__(self, block, num_blocks, num_classes=10):
        super(ResNet, self).__init__()
        self.in_planes = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512*block.expansion, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        strides = [stride] + [1]*(num_blocks-1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_planes, planes, stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = F.avg_pool2d(out, 4)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out

In [4]:
# One of our first target models, otherwise known as ResNet10.
# We use 1 block 
def ResNet_v1():
    return ResNet(BasicBlock, [1, 1, 1, 1])

def ResNet18():
    return ResNet(BasicBlock, [2, 2, 2, 2])


def ResNet34():
    return ResNet(BasicBlock, [3, 4, 6, 3])


def ResNet50():
    return ResNet(Bottleneck, [3, 4, 6, 3])


def ResNet101():
    return ResNet(Bottleneck, [3, 4, 23, 3])


def ResNet152():
    return ResNet(Bottleneck, [3, 8, 36, 3])

## Batching trainset and testsets

In [5]:
# Function to load one CIFAR batch (pickle file)
def load_cifar_batch(file):
    with open(file, 'rb') as fo:
        dict_data = pickle.load(fo, encoding='bytes')
    return dict_data

# Define the directory where the CIFAR-10 batch files are stored
cifar10_dir = '/kaggle/input/deep-learning-spring-2025-project-1/cifar-10-python/cifar-10-batches-py'

# Load label names from batches.meta
meta_data_dict = load_cifar_batch(os.path.join(cifar10_dir, 'batches.meta'))
label_names = meta_data_dict[b'label_names']  # labels are stored as bytes

# Load and combine all training batches (data_batch_1 to data_batch_5)
train_data_list = []
train_labels = []
for i in range(1, 6):
    batch_file = os.path.join(cifar10_dir, f'data_batch_{i}')
    batch_dict = load_cifar_batch(batch_file)
    # Reshape data from (10000, 3072) to (10000, 32, 32, 3)
    batch_data = batch_dict[b'data']
    batch_data = batch_data.reshape((10000, 3, 32, 32)).transpose(0, 2, 3, 1)
    train_data_list.append(batch_data)
    train_labels += batch_dict[b'labels']

# Concatenate training batches to create the full training set
train_images = np.concatenate(train_data_list, axis=0)
print("Training set:", train_images.shape)

# Load the test batch
test_batch = load_cifar_batch(os.path.join(cifar10_dir, 'test_batch'))
test_images = test_batch[b'data']
test_images = test_images.reshape((10000, 3, 32, 32)).transpose(0, 2, 3, 1)
test_labels = test_batch[b'labels']
print("Test set:", test_images.shape)

# Define a custom dataset that applies transforms
class CIFAR10CustomDataset(Dataset):
    def __init__(self, images, labels, transform=None):
        self.images = images  # Expected shape: (N, 32, 32, 3)
        self.labels = labels
        self.transform = transform
        
    def __len__(self):
        return len(self.images)
    
    def __getitem__(self, index):
        # Convert the numpy image to a PIL image for transformation
        img = Image.fromarray(self.images[index])
        label = self.labels[index]
        if self.transform:
            img = self.transform(img)
        return img, label

# Define transforms for training and testing (same as before)
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), 
                         (0.2023, 0.1994, 0.2010)),
])
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), 
                         (0.2023, 0.1994, 0.2010)),
])

# Create dataset objects
train_dataset = CIFAR10CustomDataset(train_images, train_labels, transform=transform_train)
test_dataset = CIFAR10CustomDataset(test_images, test_labels, transform=transform_test)

# Create DataLoaders
trainloader = DataLoader(train_dataset, batch_size=2048, shuffle=True, num_workers=2)
testloader = DataLoader(test_dataset, batch_size=750, shuffle=False, num_workers=2)

Training set: (50000, 32, 32, 3)
Test set: (10000, 32, 32, 3)


# Model Assumptions

In [6]:
# Parameters (instead of argparse)
lr = 0.001            # learning rate
resume = True      # whether to resume from checkpoint
num_new_epochs = 100  # number of new epochs to run
start_epoch = 0     # start from epoch 0 or last checkpoint epoch

# Define file paths in /kaggle/working
csv_file = '/kaggle/working/history/resnet1_metrics.csv'
current_checkpoint_file = '/kaggle/input/checkpoint/ckpt_resnetv1_e200.pth'
checkpoint_dir = '/kaggle/working/checkpoint'


# Set device and beggining best acc
device = 'cuda' if torch.cuda.is_available() else 'cpu'
best_acc = 0        # best test accuracy

# Create directories if they don't exist
os.makedirs(os.path.dirname(csv_file), exist_ok=True)
os.makedirs(checkpoint_dir, exist_ok=True)
checkpoint_file = os.path.join(checkpoint_dir, 'ckpt.pth')


## Define Model

In [7]:
print('==> Building model..')
net = ResNet_v1()  # Use the ResNet18 model from your models module
net = net.to(device)
if device == 'cuda':
    net = torch.nn.DataParallel(net)
    cudnn.benchmark = True

# Optionally resume from checkpoint
if resume:
    print('==> Resuming from checkpoint..')
    if os.path.isfile(current_checkpoint_file):
        checkpoint = torch.load(current_checkpoint_file)
        net.load_state_dict(checkpoint['net'])
        best_acc = checkpoint['acc']
        start_epoch = checkpoint['epoch']
    else:
        print("No checkpoint file found. Starting from scratch.")

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=lr, momentum=0.9, weight_decay=5e-4)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200)

==> Building model..
==> Resuming from checkpoint..


  checkpoint = torch.load(current_checkpoint_file)


### Training Function

In [8]:
# Define training function
def train(epoch):
    print('\nEpoch: %d' % epoch)
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()

        #print('Loss: %.3f | Acc: %.3f%% (%d/%d)' % (train_loss/(batch_idx+1), 100.*correct/total, correct, total))

    print('Total Loss: %.3f | Total Acc: %.3f%% (%d/%d)' % (train_loss/(batch_idx+1), 100.*correct/total, correct, total))
        
    
    train_avg_loss = train_loss / len(trainloader)
    train_accuracy = 100. * correct / total
    return train_avg_loss, train_accuracy



### Testset Function

In [9]:
# Define test function
def test(epoch):
    global best_acc
    net.eval()
    test_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(testloader):
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = net(inputs)
            loss = criterion(outputs, targets)

            test_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

            
            #print('Loss: %.3f | Acc: %.3f%% (%d/%d)' % (train_loss/(batch_idx+1), 100.*correct/total, correct, total))

        print('Total Loss: %.3f | Total Acc: %.3f%% (%d/%d)' % (test_loss/(batch_idx+1), 100.*correct/total, correct, total))
    
    test_avg_loss = test_loss / len(testloader)
    test_accuracy = 100. * correct / total

    # Save checkpoint if accuracy improves
    acc = 100. * correct / total
    if acc > best_acc:
        print('Saving checkpoint..')
        state = {
            'net': net.state_dict(),
            'acc': acc,
            'epoch': epoch,
        }
        torch.save(state, checkpoint_file)
        best_acc = acc

    return test_avg_loss, test_accuracy

In [10]:
if os.path.isfile(csv_file):
    df_existing = pd.read_csv(csv_file)
    last_epoch = df_existing["epoch"].iloc[-1]
    print(f"Found existing CSV. Last recorded epoch: {last_epoch}")
    current_epoch = last_epoch + 1
else:
    print(f"CSV file not found. Starting at epoch: {start_epoch}")
    current_epoch = start_epoch

CSV file not found. Starting at epoch: 199


In [11]:
for epoch in range(current_epoch, current_epoch + num_new_epochs):
    train_avg_loss, train_accuracy = train(epoch)
    test_avg_loss, test_accuracy = test(epoch)
    scheduler.step()

    # Record metrics for the epoch.
    df_epoch = pd.DataFrame([{
        "epoch": epoch,
        "train_avg_loss": train_avg_loss,
        "train_accuracy": train_accuracy,
        "test_avg_loss": test_avg_loss,
        "test_accuracy": test_accuracy
    }])
    
    # Append to CSV history.
    if os.path.isfile(csv_file):
        df_epoch.to_csv(csv_file, mode='a', header=False, index=False)
    else:
        df_epoch.to_csv(csv_file, mode='w', header=True, index=False)


Epoch: 199
Total Loss: 0.003 | Total Acc: 99.986% (49993/50000)
Total Loss: 0.281 | Total Acc: 92.660% (9266/10000)
Saving checkpoint..

Epoch: 200
Total Loss: 0.003 | Total Acc: 99.990% (49995/50000)
Total Loss: 0.279 | Total Acc: 92.710% (9271/10000)
Saving checkpoint..

Epoch: 201
Total Loss: 0.003 | Total Acc: 99.994% (49997/50000)
Total Loss: 0.278 | Total Acc: 92.790% (9279/10000)
Saving checkpoint..

Epoch: 202
Total Loss: 0.003 | Total Acc: 99.996% (49998/50000)
Total Loss: 0.276 | Total Acc: 92.850% (9285/10000)
Saving checkpoint..

Epoch: 203
Total Loss: 0.003 | Total Acc: 99.994% (49997/50000)
Total Loss: 0.276 | Total Acc: 92.830% (9283/10000)

Epoch: 204
Total Loss: 0.003 | Total Acc: 99.998% (49999/50000)
Total Loss: 0.274 | Total Acc: 92.850% (9285/10000)

Epoch: 205
Total Loss: 0.003 | Total Acc: 99.992% (49996/50000)
Total Loss: 0.274 | Total Acc: 92.830% (9283/10000)

Epoch: 206
Total Loss: 0.003 | Total Acc: 99.996% (49998/50000)
Total Loss: 0.274 | Total Acc: 92.80

In [12]:
import pandas as pd

# Define the file path
file_path = "/kaggle/working/history/resnet1_metrics.csv"

# Read the CSV file
df = pd.read_csv(file_path)

# Print the contents of the dataframe
print(df)


    epoch  train_avg_loss  train_accuracy  test_avg_loss  test_accuracy
0     199        0.003357          99.986       0.281433          92.66
1     200        0.003081          99.990       0.278881          92.71
2     201        0.003026          99.994       0.277577          92.79
3     202        0.002877          99.996       0.276467          92.85
4     203        0.002756          99.994       0.275557          92.83
..    ...             ...             ...            ...            ...
95    294        0.002028         100.000       0.266346          92.89
96    295        0.001997          99.998       0.265762          92.89
97    296        0.002044         100.000       0.266294          92.87
98    297        0.002009         100.000       0.266666          92.92
99    298        0.002063          99.998       0.265882          92.87

[100 rows x 5 columns]


In [13]:
checkpoint_file = '/kaggle/working/checkpoint/ckpt.pth'
if os.path.isfile(checkpoint_file):
    # Load the checkpoint (using map_location='cpu' to ensure it loads even if CUDA isn't available)
    checkpoint = torch.load(checkpoint_file, map_location='cpu')
    
    # Print the keys and a summary of the checkpoint content
    print("Checkpoint keys:", list(checkpoint.keys()))
    print("\nFull checkpoint contents:")
    for key, value in checkpoint.items():
        if key == 'net':
            print(f"{key}: state_dict with {len(value)} keys")
        else:
            print(f"{key}: {value}")
else:
    print("Checkpoint file not found.")

Checkpoint keys: ['net', 'acc', 'epoch']

Full checkpoint contents:
net: state_dict with 74 keys
acc: 92.95
epoch: 260


  checkpoint = torch.load(checkpoint_file, map_location='cpu')


In [14]:
import os
import pickle
import pandas as pd
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import numpy as np

# Create predictions directory if it doesn't exist.
predictions_dir = '/kaggle/working/predictions'
os.makedirs(predictions_dir, exist_ok=True)
predictions_filename = os.path.join(predictions_dir, 'resnetv1_e300_r003.csv')

# Define paths for the test set and checkpoint.
testset_path = '/kaggle/input/deep-learning-spring-2025-project-1/cifar_test_nolabel.pkl'
checkpoint_path = '/kaggle/working/checkpoint/ckpt.pth'

# Set device for inference.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Function to load a CIFAR batch from a pickle file.
def load_cifar_batch(file):
    with open(file, 'rb') as fo:
        batch = pickle.load(fo, encoding='bytes')
    return batch

# Load the hidden test batch.
cifar10_batch = load_cifar_batch(testset_path)

# Extract images; the test data is in (N x W x H x C) format.
images = cifar10_batch[b'data']

# Ensure images are uint8 if they are in a numpy array.
if isinstance(images, np.ndarray) and images.dtype != 'uint8':
    images = images.astype('uint8')

# Define the test transform.
transform_test = transforms.Compose([
    transforms.ToPILImage(),  # Convert numpy array (H x W x C) to a PIL Image.
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

# Create a custom Dataset for the test images.
class TestDataset(Dataset):
    def __init__(self, images, transform=None):
        self.images = images
        self.transform = transform

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        image = self.images[idx]
        if self.transform:
            image = self.transform(image)
        return image, idx  # Return index as the image ID.

# Create the test dataset and DataLoader.
test_dataset = TestDataset(images, transform=transform_test)
test_loader = DataLoader(test_dataset, batch_size=100, shuffle=False)

# Load the best model checkpoint.
assert os.path.isfile(checkpoint_path), "Checkpoint file not found!"
checkpoint = torch.load(checkpoint_path, map_location=device)
state_dict = checkpoint['net']

from collections import OrderedDict
# Initialize the model and load the state.
net = ResNet_v1()
model = net.to(device)   
new_state_dict = OrderedDict()
for k, v in state_dict.items():
    new_key = k.replace("module.", "")  # remove "module." prefix
    new_state_dict[new_key] = v

model.load_state_dict(new_state_dict)
model.eval()

# Generate predictions on the test set.
predictions = []
with torch.no_grad():
    for imgs, ids in test_loader:
        imgs = imgs.to(device)
        outputs = model(imgs)
        _, preds = outputs.max(1)
        preds = preds.cpu().numpy()
        for image_id, pred in zip(ids.numpy(), preds):
            predictions.append({'ID': image_id, 'Label': int(pred)})

# Convert predictions to a DataFrame and save as CSV.
df_predictions = pd.DataFrame(predictions)
df_predictions.to_csv(predictions_filename, index=False)
print(f"Predictions saved to {predictions_filename}")

  checkpoint = torch.load(checkpoint_path, map_location=device)


Predictions saved to /kaggle/working/predictions/resnetv1_e300_r003.csv
