CRNN Base Structure (TODO Training and Accuracy)

In [1]:
# Import required libraries
import numpy as np
import torch
import torch.nn as nn

In [73]:
# Base structure of CRNN
class CRNN(nn.Module):
    def __init__(self):
        super(CRNN, self).__init__()
        self.features = nn.Sequential (
            nn.Conv2d(in_channels=1, out_channels=64, kernel_size=(3,3)),
            nn.BatchNorm2d(64),
            nn.ELU(),
            nn.MaxPool2d((2,2)),

            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=(3,3)),
            nn.BatchNorm2d(128),
            nn.ELU(),
            nn.MaxPool2d((2, 2)),

            nn.Conv2d(in_channels=128, out_channels=256, kernel_size=(3,3)),
            nn.BatchNorm2d(256),
            nn.ELU(),
            nn.MaxPool2d((2, 2)),

            nn.Conv2d(in_channels=256, out_channels=256, kernel_size=(3,3)),
            nn.BatchNorm2d(256),
            nn.ELU(),
            nn.MaxPool2d((2, 2)),
        )

        final_conv_height = 6
        final_conv_width = 5
        gru_input_size = 256 * final_conv_width

        totalInstruments = 3

        self.gru = nn.GRU(
            input_size=768, 
            hidden_size=256, 
            batch_first=True
        )
        # out_features is technically 18 in paper, but realistically 3 at the beginning
        self.fc = nn.Linear(in_features=256, out_features=totalInstruments)
        
    def forward(self, input):
        # Pass input through convolutional layers
        conv_output = self.features(input)
        
        # Get the batch size (B), and the number of feature maps (C), height (H), and width (W)
        B, C, H, W = conv_output.size()

        # Reshape the output to treat the height 'H' as the sequence and combine the channels
        # and width 'W' as features, which should have been calculated as 256*5
        # Here we assume each timestep corresponds to a row in the feature maps
        x = conv_output.view(B, H, C * W)  # Shape: (B, H, C*W)

        # Pass the reshaped conv_output to the GRU
        gru_output, _ = self.gru(x)

        # Usually, you'd take just the last time step
        last_time_step_output = gru_output[:, -1, :]

        # Pass that through your fully connected layer
        output = self.fc(last_time_step_output)

        return output

In [74]:
# Future stuff for training and accuracy of CNN
import torch.optim as optim

model = CRNN()
lossAlg = nn.BCEWithLogitsLoss()
optimizer = optim.SGD(model.parameters(), lr = 0.0001, momentum=0.9)

In [75]:
from torch.utils.data import DataLoader, Dataset, random_split
import logging, sys

sys.path.append('/Users/adarshbharathwaj/Desktop/eng100/project3/ENGR_100_Project_3/src')
from utils import *

class FrameDataset(Dataset):
    def __init__(self, npz_path):
        self.data = load_npz_file_with_condition(npz_path, max_size=1024**3)
        self.keys = [k for k in self.data.keys() if "_data" in k]

    def __len__(self):
        return len(self.keys)

    def __getitem__(self, idx):
        data_key = self.keys[idx]
        data = self.data[data_key]

        C, H, W = 1, 96, 87
        data_reshaped = data.reshape(C, H, W)
        data_tensor = torch.tensor(data_reshaped, dtype=torch.float32)

        labels_key = f'{data_key.split("_data_")[0]}_labels'
        labels_tensor = torch.tensor(self.data[labels_key], dtype=torch.float32)

        return data_tensor, labels_tensor
    
def load_npz_file_with_condition(file_path, max_size: int):
    file_size = os.path.getsize(file_path)

    if file_size > max_size:
        logging.info(
            f"File size is {file_size / (1024**2):.2f}MB. Using mmap_mode='r'."
        )
        data = np.load(file_path, mmap_mode="r", allow_pickle=True)
    else:
        logging.info(f"File size is {file_size / (1024**2):.2f}MB. Loading normally.")
        data = np.load(file_path, allow_pickle=True)

    return data



In [76]:
# Training the model
npz_path = "data.npz"
full_dataset = FrameDataset(npz_path)

train_size = int(0.8 * len(full_dataset))
validation_size = int(0.1 * len(full_dataset))
test_size = len(full_dataset) - train_size - validation_size
batch_size = 32

train_dataset, validation_dataset, test_dataset = random_split(
    full_dataset, [train_size, validation_size, test_size]
)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
validation_dataloader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model.to(device)

num_epochs = 20
for epoch in range(num_epochs):
    runningLoss = 0.0
    for batch in train_dataloader:
        input, labels = batch
        input = input.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        output = model(input)
        loss = lossAlg(output.view(-1), labels.view(-1))
        loss.backward()
        optimizer.step()
    
        runningLoss += loss.item()
    epoch_loss = runningLoss / len(train_dataloader)
    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [1/20], Loss: 0.5437
Epoch [2/20], Loss: 0.5294
Epoch [3/20], Loss: 0.4976
Epoch [4/20], Loss: 0.5224
Epoch [5/20], Loss: 0.4577
Epoch [6/20], Loss: 0.5226
Epoch [7/20], Loss: 0.4721


KeyboardInterrupt: 