# IDM architecture 

### Step 1: Load the image dataset 

In [92]:
import torchvision 
import os
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import ExponentialLR
import gc
import numpy as np
from torch.utils.tensorboard import SummaryWriter

Flag to run on GPU or CPU

In [93]:
run_on_gpu = True

Adding tensorboard summary writer for better visualization

In [94]:
writer = SummaryWriter('runs/experiment-3')

No of frames the model will process at a time. For CPU 128 frames work if you have GPU with 12 GB VRAM it will eat up all memory. I tested it with 32 frames and that works.

In [95]:
# No of frame we will process at a time
no_of_frames = 32

In [96]:
device = torch.device("cuda:0" if run_on_gpu and torch.cuda.is_available() else "cpu")
print(device)

cuda:0


Loading the images from the data. 
First getting the filenames

In [97]:
relative_path = 'data'
data_dir = os.path.abspath(relative_path)

# This below code is only for Anirudha's PC because after restarting Python kernel the abspath gets a bit messed up
data_dir = 'C:\\Users\\aniru\\Desktop\\Code\\VPTAirsim\\data'
filenames = [name for name in os.listdir(data_dir) if os.path.splitext(name)[-1] == '.png']


Same thing for validation data

In [98]:
validation_data_dir = 'C:\\Users\\aniru\\Desktop\\Code\\VPTAirsim\\validation'
validation_filenames = [name for name in os.listdir(validation_data_dir) if os.path.splitext(name)[-1] == '.png']



Sort the filenames by the number in the string

In [99]:
def get_number_from_string(string):
    # Extracts the number from a string by filtering out non-digit characters
    return int(''.join(filter(str.isdigit, string)))

def sort_array_by_number(array):
    # Sorts the array based on the number appearing in each string
    return sorted(array, key=get_number_from_string)



In [100]:
sorted_filenames = sort_array_by_number(filenames)
# sorted_filenames = sorted_filenames[0:12800]

Same thing for Validation set

In [101]:
sorted_val_filenames = sort_array_by_number(validation_filenames)


Generating the labels

In [102]:
labels = []

for name in sorted_filenames:
    
    if "w" in name:
        labels.append(0)
    elif "a" in name:
        labels.append(1)
    elif "d" in name:
        labels.append(2)


In [103]:
val_labels = []

for name in sorted_val_filenames:
    
    if "w" in name:
        val_labels.append(0)
    elif "a" in name:
        val_labels.append(1)
    elif "d" in name:
        val_labels.append(2)

Making one hot label

In [104]:
labels = torch.tensor(labels)

# Perform one-hot encoding
one_hot_labels = torch.eye(3)[labels]

In [105]:
val_labels = torch.tensor(val_labels)

# Perform one-hot encoding
val_one_hot_labels = torch.eye(3)[val_labels]

Then using the filenames to load images.


In [106]:
# Making sure the total data is a multiplication of no of frames 
dataset_size = len(sorted_filenames) - (len(sorted_filenames) % no_of_frames)


dataset = torch.zeros(dataset_size, 3, 128, 128)
for i in range(dataset_size):
 dataset[i] = torchvision.io.read_image(os.path.join(data_dir, sorted_filenames[i]))
 

In [107]:
# Making sure the total data is a multiplication of no of frames 
val_dataset_size = len(sorted_val_filenames) - (len(sorted_val_filenames) % no_of_frames)


val_dataset = torch.zeros(val_dataset_size, 3, 128, 128)
for i in range(val_dataset_size):
 val_dataset[i] = torchvision.io.read_image(os.path.join(validation_data_dir, sorted_val_filenames[i]))
 

Normalizing the pixel value by dividing it by 255. Now it is between 0 and 1

In [108]:
print(dataset.size())
dataset = dataset / 255.0


torch.Size([43808, 3, 128, 128])


In [109]:
print(val_dataset.size())
val_dataset = val_dataset / 255.0

torch.Size([1280, 3, 128, 128])


This is to make no_of_frames size chunks of frames

In [110]:
# Create a TensorDataset with data and their label
one_hot_labels = one_hot_labels[0:dataset_size]
data = TensorDataset(dataset, one_hot_labels)
# Create a DataLoader
dataloader = DataLoader(data, batch_size=no_of_frames, shuffle=False)


In [111]:
# Create a TensorDataset with data and their label
val_one_hot_labels = val_one_hot_labels[0:val_dataset_size]
val_data = TensorDataset(val_dataset, val_one_hot_labels)
# Create a DataLoader
val_dataloader = DataLoader(val_data, batch_size=no_of_frames, shuffle=False)


### Step 2 Pass the data through 3D Convolution

This is not a final model. I am writing this to make proper data shape. 

In [112]:
class Temporal3DConv(nn.Module):
    def __init__(self):
        super(Temporal3DConv, self).__init__()

        # 3 is input channel because of RGB images. 
        # 128 is the output channel or learnable filters
        # Kernel size 5 is temporal kernel width 
        # (1*1) is spatial kernel width
        # 2 Depth padding for initial and end frames
        self.conv3d = nn.Conv3d(3, 128, kernel_size=(5, 1, 1), padding=(2,0,0))
        self.relu = nn.ReLU()

    def forward(self, x):
        out = self.conv3d(x)
        out = self.relu(out)
        return out

### Step 3 Pass the 3D Convolution layer outcome through ResNet

This is the middle Res Net layer. So the ResNetBlock represents the Deep residual learning for image
recognition paper architecture. 

ResNetBlocksWithPooling represents the Resnet stack mentioned in the VPT paper. We will use three stacks consecuvely then flatten it before passing it to attention layer. 

In [113]:
class ResNetBlock(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(ResNetBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)

        self.shortcut = nn.Sequential()
        if in_channels != out_channels:
            self.shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1)

    def forward(self, x):
        residual = self.shortcut(x)

        out = self.conv1(x)
        out = self.relu(out)

        out = self.conv2(out)
        out += residual
        out = self.relu(out)

        return out

class ResNetBlocksWithPooling(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(ResNetBlocksWithPooling, self).__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.resnet_block1 = ResNetBlock(out_channels, out_channels)
        self.resnet_block2 = ResNetBlock(out_channels, out_channels)

    def forward(self, x):
        out = self.conv(x)
        out = self.pool(out)
        out = self.resnet_block1(out)
        out = self.resnet_block2(out)
        return out

### Step 4 Pass ResNet outcome through Multiheaded Residual Transformer

In [114]:

class FrameWiseDense(nn.Module):
    def __init__(self, in_features, out_features):
        super(FrameWiseDense, self).__init__()
        self.linear = nn.Linear(in_features, out_features)
        self.relu = nn.ReLU()

    def forward(self, x):
        out = self.linear(x)
        out = self.relu(out)
        return out

class ResidualTransformerBlock(nn.Module):
    def __init__(self, embedding_dim, num_heads, dropout=0.1):
        super(ResidualTransformerBlock, self).__init__()
        self.attention = nn.MultiheadAttention(embedding_dim, num_heads, dropout=dropout)
        # self.dropout1 = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(embedding_dim)
        self.dense1 = FrameWiseDense(embedding_dim, 16384)
        # self.dropout2 = nn.Dropout(dropout)
        self.norm2 = nn.LayerNorm(embedding_dim)
        self.dense2 = FrameWiseDense(16384, embedding_dim)

    def forward(self, x):
        residual = x
        out, _ = self.attention(x, x, x)
        # out = self.dropout1(out)
        out = self.norm1(out + residual)
        residual = out
        out = self.dense1(out)
        # out = self.dropout2(out)
        out = self.dense2(out)
        out = self.norm2(out + residual)
        return out

class ActionPredictionModel(nn.Module):
    def __init__(self, num_actions):
        super(ActionPredictionModel, self).__init__()
        # the initial value is 16384 because it is the flattened output dimension for ResNet.
        #  For 1 ResNet it is 262144
        #  For 2 ResNet it is 65536
        # For 3 ResNet it is 16384
        self.dense1 = FrameWiseDense(65536, 256) 
        self.dense2 = FrameWiseDense(256, 4096)  # first 4096 was 256
        self.residual_transformer_blocks = nn.Sequential(
            ResidualTransformerBlock(embedding_dim=4096, num_heads=32),
            # ResidualTransformerBlock(embedding_dim=4096, num_heads=32),
            # ResidualTransformerBlock(embedding_dim=4096, num_heads=32),
            # ResidualTransformerBlock(embedding_dim=4096, num_heads=32)
        )
        self.dense3 = FrameWiseDense(4096, 16384)
        self.dense4 = FrameWiseDense(16384, 4096)
        self.action_head = nn.Linear(4096, num_actions)

    def forward(self, x):
        out = self.dense1(x)
        out = self.dense2(out)
        # out = out.permute(1, 0, 2)
        out = self.residual_transformer_blocks(out)
        # out = out.permute(1, 0, 2)
        out = self.dense3(out)
        out = self.dense4(out)
        # out = out.mean(dim=1)
        out = self.action_head(out)
        out = F.softmax(out, dim=1)
        return out

### Step 5 Combining everything in a single model 

In [115]:
class IDM(nn.Module):
    def __init__(self):
        super(IDM, self).__init__()
        self.temporal3DConv = Temporal3DConv()
        self.ResNetStack1 = ResNetBlocksWithPooling(128, 64)
        self.ResNetStack2 = ResNetBlocksWithPooling(64, 64)
        # self.ResNetStack3 = ResNetBlocksWithPooling(64, 64)
        self.flattenLayer = nn.Flatten()
        self.transformerLayer = ActionPredictionModel(num_actions=3)

    def forward(self, input):
        # For Conv3D the input format is (batch_size, num_channels, num_frames, height, width)
        # So I am using unsqueeze to increase the outer dimension to make batch_size = 1 . 
        # Then using the permute to make the dimension in proper shape.
        out = input.unsqueeze(0).permute(0, 2, 1, 3, 4)
        out = self.temporal3DConv(out)
        
        # To match the expected input shape of the ResNet model, we need to reshape the output tensor. 
        # First, we use permute to rearrange the dimensions of the tensor, swapping the second and third dimensions. 
        # Then, we use contiguous to ensure the tensor's memory is laid out contiguously. 
        # Finally, we use view to reshape the tensor into a 4D tensor with dimensions (batch_size * num_frames, num_channels, height, width).

        out = out.permute(0, 2, 1, 3, 4).contiguous().view(1 * no_of_frames, 128, 128, 128)
        out = self.ResNetStack1(out)
        out = self.ResNetStack2(out)
        # out = self.ResNetStack3(out)
        out = self.flattenLayer(out)
        out = self.transformerLayer(out)
        return out

    

Some codes to check GPU memory status

In [116]:
# print(torch.cuda.mem_get_info())
# torch.cuda.empty_cache()
# torch.cuda.mem_get_info()

### Step 6 Declaring the model

Loading and printing the model 

In [117]:

# model = IDM().to(device)
# print(model)

Checkinng number of trainable parameters

In [118]:
# total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
# print(f"Total Trainable Parameters: {total_params}")
# print(next(model.parameters()).is_cuda)

### Step 7 Doing Forward pass (NEED TO MODIFY TO SUPPORT FULL TRAINING)

In [119]:
# output = any

# for framechunk in dataloader:
#     # Access the batched tensor data
#     # Pass the input through the model
#     input = framechunk[0].to(device)
#     true_label = framechunk[1].to(device)
#     output = model(input)
#     del input
#     torch.cuda.empty_cache()
#     gc.collect()
#     print(output.shape)

Writing the training loop 

In [120]:
model = IDM().to(device)

# Define your loss function
crossLoss = nn.CrossEntropyLoss()

# Define your optimizer with weight decay
# optimizer = optim.AdamW(model.parameters(), lr=0.003, weight_decay=0.01)

# Define the learning rate scheduler
# scheduler = ExponentialLR(optimizer, gamma=0.95)

# Track the initial model parameters
# initial_params = [param.clone().detach() for param in model.parameters()]
optimizer = optim.SGD(model.parameters(), lr=0.001)
# Training loop
num_epochs = 20
for epoch in range(num_epochs):
    model.train()  # Set the model in training mode
    for framechunk in dataloader:
        # Access the batched tensor data
        # Pass the input through the model
        input = framechunk[0].to(device)
        true_label = framechunk[1].to(device)
        output = model(input)
        # print("True = ", true_label[0])
        # print("Output = ", output[0])
         # Compute the loss
        loss = crossLoss(output, true_label)
        optimizer.zero_grad()
        # Backward pass
        loss.backward()

        # Update the model's parameters
        optimizer.step()

        # del input
        # torch.cuda.empty_cache()
        # gc.collect()
        # print(output.shape)
        # Convert softmax predictions to class labels by taking the argmax
        # softmax_predictions_labels = np.argmax(output.cpu().detach().numpy(), axis=1)

        # Compare the predicted labels with the one-hot encoded labels
        # correct_predictions = np.sum(np.equal(softmax_predictions_labels, np.argmax(true_label.cpu().detach().numpy(), axis=1)))

        # Calculate accuracy
        # accuracy = correct_predictions / float(softmax_predictions_labels.shape[0])
        # print("Accuracy:", accuracy)
        # print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")
    
    # Check if the model parameters have been updated
    # for i, param in enumerate(model.parameters()):
    #     if not torch.all(torch.eq(initial_params[i], param)):
    #         print(f"Parameters {i} have been updated.")

    # Update the initial parameters for the next epoch
    # initial_params = [param.clone().detach() for param in model.parameters()]

    # Update the learning rate
    # scheduler.step()
    # Validation
    model.eval()  # Set the model in evaluation mode
    accuracy_list = []
    with torch.no_grad():
        for framechunk in val_dataloader:
            # Access the batched tensor data
            # Pass the input through the model
            val_input = framechunk[0].to(device)
            val_true_label = framechunk[1].to(device)
            val_output = model(val_input)

            # Convert softmax predictions to class labels by taking the argmax
            val_softmax_predictions_labels = np.argmax(val_output.cpu().detach().numpy(), axis=1)

            # Compare the predicted labels with the one-hot encoded labels
            val_correct_predictions = np.sum(np.equal(val_softmax_predictions_labels, np.argmax(val_true_label.cpu().detach().numpy(), axis=1)))

            # Calculate accuracy
            val_accuracy = val_correct_predictions / float(val_softmax_predictions_labels.shape[0])
            # print("val Accuracy:", val_accuracy)
            accuracy_list.append(val_accuracy)

        

    # Print the loss for this epoch
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")
    # Print accuracy on validation set 
    avg_val_accuracy = sum(accuracy_list)/ len(accuracy_list)
    print("Validation accuracy = ", avg_val_accuracy)
    # Log the loss
    writer.add_scalar('Loss/train', loss.item(), epoch)

    # Log the accuracy
    writer.add_scalar('Accuracy/train', avg_val_accuracy, epoch)
writer.close()

Epoch 1/20, Loss: 1.060563325881958
Validation accuracy =  0.33359375
Epoch 2/20, Loss: 1.0504696369171143
Validation accuracy =  0.33671875
Epoch 3/20, Loss: 1.0332612991333008
Validation accuracy =  0.35546875
Epoch 4/20, Loss: 1.0074260234832764
Validation accuracy =  0.3515625
Epoch 5/20, Loss: 1.0026075839996338
Validation accuracy =  0.4875
Epoch 6/20, Loss: 0.9102625250816345
Validation accuracy =  0.61640625
Epoch 7/20, Loss: 0.7579656839370728
Validation accuracy =  0.66484375
Epoch 8/20, Loss: 0.7054595351219177
Validation accuracy =  0.79765625
Epoch 9/20, Loss: 0.7472288608551025
Validation accuracy =  0.61328125
Epoch 10/20, Loss: 0.6568589806556702
Validation accuracy =  0.87265625
Epoch 11/20, Loss: 0.9920228719711304
Validation accuracy =  0.8
Epoch 12/20, Loss: 0.8556854724884033
Validation accuracy =  0.7765625
Epoch 13/20, Loss: 0.6098064184188843
Validation accuracy =  0.88515625
Epoch 14/20, Loss: 0.5727171897888184
Validation accuracy =  0.90390625
Epoch 15/20, Lo

This theoritically should clear up the memory but this is not working properly

In [121]:
# del model
# torch.cuda.empty_cache()
# gc.collect()