# IIT 4316 Deep Learning<br>Homework #2-1: CNN

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
from tqdm import tqdm as tq


DIGITS     = ['0','1','2','3','4','5','6','7','8','9']
VOCAB_SIZE = len(DIGITS)
DEVICE     = torch.device('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')

# Fixed hyperparameters
INPUT_LEN     = 4
OUTPUT_LEN    = 3
LEARNING_RATE = 1e-4
NUM_EPOCH     = 1000
BATCH_SIZE    = 128

# Changable hyperparameters
EMBED_DIM      = 16
CONV_LAYER_NUM = 2
NUM_CHANNELS   = 32

char_to_idx = {c: i for i, c in enumerate(DIGITS)}
idx_to_char = {i: c for i, c in enumerate(DIGITS)}

# batch data generation
def generate_batch(batch_size=100):
    inputs, targets = [], []
    for _ in range(batch_size):
        tens1, ones1 = random.randint(0, 9), random.randint(0, 9)
        tens2, ones2 = random.randint(0, 9), random.randint(0, 9)
        num1, num2 = tens1 * 10 + ones1, tens2 * 10 + ones2
        s = num1 + num2
        inputs.append([tens1, ones1, tens2, ones2])
        sum_str = f"{s:03d}"
        targets.append([char_to_idx[c] for c in sum_str])
    return (torch.tensor(inputs, dtype=torch.long).to(DEVICE),
            torch.tensor(targets, dtype=torch.long).to(DEVICE))

# ReLU
def my_relu(x):
    return torch.clamp(x, min=0.0)

#------------------------------------
# 4 digits to one-hot vectors
#------------------------------------
def MyOneHot(x, vocab_size):    
    B, L = x.size()     # batch size x INPUT_LEN

    ############################################################################
    # TODO: Convert x to one-hot
    ############################################################################

    # Create zero tensor of shape (B, L, vocab_size)
    out = torch.zeros(B, L, vocab_size, device=x.device)
    # Use scatter to fill in the one-hot encoding
    out.scatter_(2, x.unsqueeze(2), 1.0)

    ############################################################################
    # END TODO
    ############################################################################

    return out   # B x L x vocab_size

#------------------------------------
# Embedding layer
#------------------------------------
class MyEmbedding(nn.Module):
    def __init__(self, vocab_size, dim):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(vocab_size, dim) * 0.1)

    def forward(self, x):
        B, L = x.size()

        ############################################################################
        # TODO: Convert x to one-hot and then embedding
        #    Use MyOneHot() function above
        ############################################################################

        # Convert to one-hot: (B, L, vocab_size)
        one_hot = MyOneHot(x, self.weight.size(0))
        # Matrix multiply with embedding weight: (B, L, vocab_size) @ (vocab_size, dim) = (B, L, dim)
        out = torch.matmul(one_hot, self.weight)
        
        ############################################################################
        # END TODO
        ############################################################################

        return out  # B x L x dim

#------------------------------------
# Linear layer
#------------------------------------
class MyLinear(nn.Module):
    def __init__(self, in_dim, out_dim):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(out_dim, in_dim) * 0.1)
        self.bias   = nn.Parameter(torch.zeros(out_dim))

    def forward(self, x):

        ############################################################################
        # TODO: Compute linear layer output
        #   You should implement your own linear layer operation.
        ############################################################################
            
        # x: (..., in_dim), weight: (out_dim, in_dim), bias: (out_dim)
        # out = x @ W^T + bias
        out = torch.matmul(x, self.weight.t()) + self.bias
        
        ############################################################################
        # END TODO
        ############################################################################

        return out  

#------------------------------------
# Conv layer
#------------------------------------
class MyConv2D(nn.Module):
    def __init__(self, in_channel, out_channel, kernel_size=3, padding=1):
        super().__init__()

        self.in_channel  = in_channel
        self.out_channel = out_channel
        self.kernel      = kernel_size
        self.pad         = padding
        
        self.weight = nn.Parameter(torch.randn(out_channel, in_channel, kernel_size, kernel_size) * 0.1)
        self.bias   = nn.Parameter(torch.zeros(out_channel))

    def forward(self, x):
        B, C, H, W = x.shape

        ############################################################################
        # TODO: 2D convolution operation
        #   You should implement your own convolution operation.
        ############################################################################

        # Pad the input
        if self.pad > 0:
            x_padded = torch.zeros(B, C, H + 2 * self.pad, W + 2 * self.pad, device=x.device)
            x_padded[:, :, self.pad:self.pad + H, self.pad:self.pad + W] = x
        else:
            x_padded = x
        
        H_pad, W_pad = x_padded.shape[2], x_padded.shape[3]
        
        # Output dimensions (stride=1)
        H_out = H_pad - self.kernel + 1
        W_out = W_pad - self.kernel + 1
        
        # Initialize output
        out = torch.zeros(B, self.out_channel, H_out, W_out, device=x.device)
        
        # Perform convolution
        for i in range(H_out):
            for j in range(W_out):
                # Extract patch: (B, C, kernel, kernel)
                patch = x_padded[:, :, i:i + self.kernel, j:j + self.kernel]
                # patch: (B, in_channel, kernel, kernel)
                # weight: (out_channel, in_channel, kernel, kernel)
                # Compute convolution for all output channels at once
                for oc in range(self.out_channel):
                    # Sum over in_channel, kernel_h, kernel_w
                    out[:, oc, i, j] = (patch * self.weight[oc]).sum(dim=(1, 2, 3)) + self.bias[oc]

        ############################################################################
        # END TODO
        ############################################################################

        return out  # B x out_ch x H x W

#------------------------------------
# CNN model
#------------------------------------
class MyCNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_channels, num_conv_layers):
        super().__init__()

        ############################################################################
        # TODO: Define each layer (embedding, conv1, conv2, ..., fc)
        #     Use MyEmbedding, MyConv2D, MyLinear classes defined above.
        ############################################################################
        
        self.embed_dim       = embed_dim
        self.num_channels    = num_channels
        self.num_conv_layers = num_conv_layers
        
        # Embedding layer
        self.embedding = MyEmbedding(vocab_size, embed_dim)
        
        # Conv layers using nn.ModuleList for dynamic number of layers
        self.conv_layers = nn.ModuleList()
        for i in range(num_conv_layers):
            if i == 0:
                # First conv: in_channel = embed_dim, out_channel = num_channels
                self.conv_layers.append(MyConv2D(embed_dim, num_channels, kernel_size=3, padding=1))
            else:
                # Subsequent convs: in_channel = num_channels, out_channel = num_channels
                self.conv_layers.append(MyConv2D(num_channels, num_channels, kernel_size=3, padding=1))
        
        # FC layer: input is flattened conv output (2 * 2 * num_channels), output is 3 * VOCAB_SIZE
        self.fc = MyLinear(2 * 2 * num_channels, OUTPUT_LEN * vocab_size)

        ############################################################################
        # END TODO
        ############################################################################

    def forward(self, x):
        B = x.size(0)

        ############################################################################
        # TODO: Define forward path: 
        #    embedding -> reshape -> conv1 -> ReLU -> conv2 -> ReLU -> ... -> flatten -> fc
        ############################################################################

        # Embedding: (B, 4) -> (B, 4, embed_dim)
        out = self.embedding(x)

        # Reshape to image-like: (B, 4, embed_dim) -> (B, embed_dim, 2, 2)
        out = out.permute(0, 2, 1)  # (B, embed_dim, 4)
        out = out.view(B, self.embed_dim, 2, 2)  # (B, embed_dim, 2, 2)
        
        # Apply conv layers with ReLU
        for conv in self.conv_layers:
            out = conv(out)  # (B, num_channels, 2, 2)
            out = my_relu(out)
        
        # Flatten
        out = out.view(B, -1)  # (B, 2*2*num_channels)
        
        # FC layer
        out = self.fc(out)  # (B, 3*VOCAB_SIZE)
        
        # Reshape to (B, 3, VOCAB_SIZE)
        out = out.view(B, OUTPUT_LEN, VOCAB_SIZE)

        ############################################################################
        # END TODO
        ############################################################################

        return out    # B x 3 x VOCAB_SIZE



In [None]:
# -----------------------------------------------------------
# Training & test
# -----------------------------------------------------------
def main():
    torch.manual_seed(42)
    random.seed(42)

    model     = MyCNN(VOCAB_SIZE, EMBED_DIM, NUM_CHANNELS, CONV_LAYER_NUM).to(DEVICE)
    optim     = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    criterion = nn.CrossEntropyLoss()

    for epoch in tq(range(1, NUM_EPOCH+1)):   
        model.train()
        src, tgt = generate_batch(BATCH_SIZE)
        logits   = model(src)
        loss     = criterion(logits.view(-1, VOCAB_SIZE), tgt.view(-1))

        optim.zero_grad()
        loss.backward()
        optim.step()

        if epoch % 100 == 0 or epoch == 1:
            print(f"Epoch {epoch:03d}  Loss: {loss.item():.4f}")

    # Evaluation
    model.eval()
    correct = 0
    total   = 0
    with torch.no_grad():
        src, tgt = generate_batch(100)
        logits   = model(src)
        preds    = logits.argmax(-1)
        for s, t, p in zip(src.cpu().tolist(), tgt.cpu().tolist(), preds.cpu().tolist()):
            s_str = f"{s[0]}{s[1]} + {s[2]}{s[3]}"
            t_str = ''.join(idx_to_char[x] for x in t)
            p_str = ''.join(idx_to_char[x] for x in p)
            print(f"src: {s_str} = {t_str} | pred: {p_str}")
            if t_str == p_str:
                correct += 1
            total += 1
    print(f"Correct: {correct}/{total}")


if __name__ == '__main__':
    main()

In [None]:
import matplotlib.pyplot as plt
import time

# ============================================================
# Hyperparameter Experiment for CNN
# ============================================================

def train_and_evaluate(vocab_size, embed_dim, num_channels, num_conv_layers, 
                       num_epochs=1000, batch_size=128, lr=1e-4, test_size=1000):
    """Train model and return final loss, test accuracy, and training time."""
    torch.manual_seed(42)
    random.seed(42)
    
    model     = MyCNN(vocab_size, embed_dim, num_channels, num_conv_layers).to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    
    # Training with time measurement
    start_time = time.time()
    final_loss = 0
    for epoch in range(1, num_epochs + 1):
        model.train()
        src, tgt = generate_batch(batch_size)
        logits   = model(src)
        loss     = criterion(logits.view(-1, VOCAB_SIZE), tgt.view(-1))
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        final_loss = loss.item()
    train_time = time.time() - start_time
    
    # Evaluation
    model.eval()
    correct = 0
    with torch.no_grad():
        src, tgt = generate_batch(test_size)
        logits   = model(src)
        preds    = logits.argmax(-1)
        for t, p in zip(tgt.cpu().tolist(), preds.cpu().tolist()):
            if t == p:
                correct += 1
    
    return final_loss, correct, train_time

# Default values
DEFAULT_EMBED_DIM      = 16
DEFAULT_CONV_LAYER_NUM = 2
DEFAULT_NUM_CHANNELS   = 32

# ============================================================
# Experiment 1: Varying EMBED_DIM
# ============================================================
print("=" * 50)
print("Experiment 1: Varying EMBED_DIM")
print("=" * 50)

embed_dims   = [4, 8, 12, 16, 24, 32, 48, 64]

losses_embed = []
accs_embed   = []
times_embed  = []

for ed in tq(embed_dims, desc="EMBED_DIM"):
    loss, acc, t = train_and_evaluate(VOCAB_SIZE, ed, DEFAULT_NUM_CHANNELS, DEFAULT_CONV_LAYER_NUM)
    losses_embed.append(loss)
    accs_embed.append(acc)
    times_embed.append(t)
    print(f"EMBED_DIM={ed:2d} | Loss: {loss:.4f} | Correct: {acc}/1000 | Time: {t:.2f}s")

# ============================================================
# Experiment 2: Varying CONV_LAYER_NUM
# ============================================================
print("\n" + "=" * 50)
print("Experiment 2: Varying CONV_LAYER_NUM")
print("=" * 50)

conv_layers = [1, 2, 3, 4, 5, 6, 7, 8]
losses_conv = []
accs_conv   = []
times_conv  = []

for nl in tq(conv_layers, desc="CONV_LAYER_NUM"):
    loss, acc, t = train_and_evaluate(VOCAB_SIZE, DEFAULT_EMBED_DIM, DEFAULT_NUM_CHANNELS, nl)
    losses_conv.append(loss)
    accs_conv.append(acc)
    times_conv.append(t)
    print(f"CONV_LAYER_NUM={nl} | Loss: {loss:.4f} | Correct: {acc}/1000 | Time: {t:.2f}s")

# ============================================================
# Experiment 3: Varying NUM_CHANNELS
# ============================================================
print("\n" + "=" * 50)
print("Experiment 3: Varying NUM_CHANNELS")
print("=" * 50)

num_channels_list = [8, 16, 24, 32, 48, 64, 96, 128]
losses_channels   = []
accs_channels     = []
times_channels    = []

for nc in tq(num_channels_list, desc="NUM_CHANNELS"):
    loss, acc, t = train_and_evaluate(VOCAB_SIZE, DEFAULT_EMBED_DIM, nc, DEFAULT_CONV_LAYER_NUM)
    losses_channels.append(loss)
    accs_channels.append(acc)
    times_channels.append(t)
    print(f"NUM_CHANNELS={nc:3d} | Loss: {loss:.4f} | Correct: {acc}/1000 | Time: {t:.2f}s")



Experiment 1: Varying EMBED_DIM


EMBED_DIM:  12%|█▎        | 1/8 [00:45<05:20, 45.82s/it]

EMBED_DIM= 4 | Loss: 1.6839 | Correct: 4/1000 | Time: 45.25s


EMBED_DIM:  25%|██▌       | 2/8 [01:29<04:28, 44.75s/it]

EMBED_DIM= 8 | Loss: 1.5850 | Correct: 21/1000 | Time: 43.97s


EMBED_DIM:  38%|███▊      | 3/8 [02:13<03:41, 44.40s/it]

EMBED_DIM=12 | Loss: 1.5841 | Correct: 19/1000 | Time: 43.95s


EMBED_DIM:  50%|█████     | 4/8 [03:00<03:01, 45.32s/it]

EMBED_DIM=16 | Loss: 1.5736 | Correct: 14/1000 | Time: 46.68s


EMBED_DIM:  62%|██████▎   | 5/8 [03:48<02:19, 46.43s/it]

EMBED_DIM=24 | Loss: 1.5421 | Correct: 30/1000 | Time: 48.34s


EMBED_DIM:  75%|███████▌  | 6/8 [04:39<01:35, 47.74s/it]

EMBED_DIM=32 | Loss: 1.5344 | Correct: 38/1000 | Time: 50.21s


EMBED_DIM:  88%|████████▊ | 7/8 [05:52<00:56, 56.24s/it]

EMBED_DIM=48 | Loss: 1.5251 | Correct: 43/1000 | Time: 73.69s


EMBED_DIM: 100%|██████████| 8/8 [06:39<00:00, 49.97s/it]
EMBED_DIM: 100%|██████████| 8/8 [06:39<00:00, 49.97s/it]


EMBED_DIM=64 | Loss: 1.4730 | Correct: 65/1000 | Time: 46.74s

Experiment 2: Varying CONV_LAYER_NUM


CONV_LAYER_NUM:  12%|█▎        | 1/8 [00:28<03:17, 28.19s/it]

CONV_LAYER_NUM=1 | Loss: 1.7076 | Correct: 10/1000 | Time: 28.17s


CONV_LAYER_NUM:  25%|██▌       | 2/8 [01:17<04:03, 40.57s/it]

CONV_LAYER_NUM=2 | Loss: 1.5736 | Correct: 14/1000 | Time: 49.19s


CONV_LAYER_NUM:  38%|███▊      | 3/8 [02:24<04:24, 52.81s/it]

CONV_LAYER_NUM=3 | Loss: 1.5055 | Correct: 40/1000 | Time: 67.32s


CONV_LAYER_NUM:  50%|█████     | 4/8 [03:52<04:27, 66.76s/it]

CONV_LAYER_NUM=4 | Loss: 1.2434 | Correct: 80/1000 | Time: 88.07s


CONV_LAYER_NUM:  62%|██████▎   | 5/8 [05:58<04:23, 87.82s/it]

CONV_LAYER_NUM=5 | Loss: 1.1295 | Correct: 84/1000 | Time: 125.08s


In [None]:
# ============================================================
# Plotting - Single Figure with 3 y-axes
# ============================================================
fig, ax1 = plt.subplots(figsize=(12, 7))

# Create additional y-axes
ax2 = ax1.twinx()
ax3 = ax1.twinx()
ax3.spines['right'].set_position(('outward', 60))

# X-axis: use indices (0 to n-1)
n = len(embed_dims)
x_indices = list(range(n))

# Plot Train Loss (RED) - Left y-axis
# Solid: EMBED_DIM, Dotted: CONV_LAYER_NUM, Dashed: NUM_CHANNELS
l1 = ax1.plot(x_indices, losses_embed, 'r-o', linewidth=2, markersize=6, label='EMBED_DIM (Loss)')
l2 = ax1.plot(x_indices, losses_conv, 'r:s', linewidth=2, markersize=6, label='CONV_LAYER_NUM (Loss)')
l3 = ax1.plot(x_indices, losses_channels, 'r--^', linewidth=2, markersize=6, label='NUM_CHANNELS (Loss)')

# Plot Test Accuracy (GREEN) - Right y-axis 1
l4 = ax2.plot(x_indices, accs_embed, 'g-o', linewidth=2, markersize=6, label='EMBED_DIM (Acc)')
l5 = ax2.plot(x_indices, accs_conv, 'g:s', linewidth=2, markersize=6, label='CONV_LAYER_NUM (Acc)')
l6 = ax2.plot(x_indices, accs_channels, 'g--^', linewidth=2, markersize=6, label='NUM_CHANNELS (Acc)')

# Plot Training Time (BLUE) - Right y-axis 2
l7 = ax3.plot(x_indices, times_embed, 'b-o', linewidth=2, markersize=6, alpha=0.7, label='EMBED_DIM (Time)')
l8 = ax3.plot(x_indices, times_conv, 'b:s', linewidth=2, markersize=6, alpha=0.7, label='CONV_LAYER_NUM (Time)')
l9 = ax3.plot(x_indices, times_channels, 'b--^', linewidth=2, markersize=6, alpha=0.7, label='NUM_CHANNELS (Time)')

# Labels
ax1.set_xlabel('Index (see legend for actual values)', fontsize=12)
ax1.set_ylabel('Train Loss (RED)', fontsize=12, color='red')
ax2.set_ylabel('Test Correct /1000 (GREEN)', fontsize=12, color='green')
ax3.set_ylabel('Training Time in sec (BLUE)', fontsize=12, color='blue')

# Set x-axis ticks
ax1.set_xticks(x_indices)
ax1.set_xticklabels([f'{i}' for i in x_indices])

# Add text annotation for actual values
embed_str   = f"EMBED_DIM: {embed_dims}"
conv_str    = f"CONV_LAYER_NUM: {conv_layers}"
channel_str = f"NUM_CHANNELS: {num_channels_list}"
fig.text(0.5, -0.02, f"{embed_str}\n{conv_str}\n{channel_str}", 
         ha='center', fontsize=9, family='monospace')

# Combine legends
lines  = l1 + l2 + l3 + l4 + l5 + l6 + l7 + l8 + l9
labels = [l.get_label() for l in lines]
ax1.legend(lines, labels, loc='upper left', fontsize=8, ncol=3)

ax1.grid(True, alpha=0.3)
plt.title('CNN Hyperparameter Analysis\n(Color: Metric, Style: Hyperparameter | Solid: EMBED, Dotted: LAYER, Dashed: CHANNEL)', 
          fontsize=12, fontweight='bold')
plt.tight_layout()
plt.subplots_adjust(bottom=0.15)
plt.show()