# IDM architecture 

### Step 1: Load the image dataset 

In [161]:
import torchvision 
import os
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import torch.nn.functional as F
import gc

Flag to run on GPU or CPU

In [162]:
# Set to True to use the first CUDA GPU when one is available; False always selects the CPU.
run_on_gpu = False

Number of frames the model will process at a time. On CPU, 128 frames works. On a GPU with 12 GB of VRAM, 128 frames will eat up all the memory; I tested it with 32 frames and that works.

In [163]:
# Number of frames processed at a time. It is also used as the DataLoader batch
# size and to trim the dataset to a whole number of chunks.
no_of_frames = 128

In [164]:
# Pick the first CUDA device only when it is both requested and actually present;
# otherwise run on the CPU.
if run_on_gpu and torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cpu


Loading the images from the data. 
First getting the filenames

In [165]:
relative_path = 'data'
data_dir = os.path.abspath(relative_path)

# On Anirudha's PC the working directory (and therefore abspath) can change after
# a kernel restart, so fall back to the known absolute location there -- but only
# when the relative path does not resolve, so the notebook still runs elsewhere.
fallback_data_dir = 'C:\\Users\\aniru\\Desktop\\Code\\VPTAirsim\\data'
if not os.path.isdir(data_dir) and os.path.isdir(fallback_data_dir):
    data_dir = fallback_data_dir

# Keep only the PNG frames. os.listdir gives no ordering guarantee.
filenames = [name for name in os.listdir(data_dir) if os.path.splitext(name)[-1] == '.png']


Sort the filenames by the number in the string

In [166]:
def get_number_from_string(string):
    """Return the integer formed by concatenating every digit character in ``string``.

    Raises ValueError when ``string`` contains no digits.
    """
    digits = [char for char in string if char.isdigit()]
    return int(''.join(digits))

def sort_array_by_number(array):
    """Return a new list with the strings of ``array`` ordered by their embedded number."""
    ordered = list(array)
    ordered.sort(key=get_number_from_string)
    return ordered

# Order the frame filenames numerically (by the digits in each name) rather than lexically.
sorted_filenames = sort_array_by_number(filenames)


Generating the labels

In [167]:
# Action keys in priority order; a key's position is its class index
# (w -> 0, a -> 1, s -> 2, d -> 3). The first key found in the filename wins.
ACTION_KEYS = ("w", "a", "s", "d")

labels = []

for name in sorted_filenames:
    label = next((index for index, key in enumerate(ACTION_KEYS) if key in name), None)
    if label is None:
        # Skipping the file silently would leave fewer labels than frames and
        # shift every later label onto the wrong frame, so fail loudly instead.
        raise ValueError(f"No action key (w/a/s/d) found in filename: {name!r}")
    labels.append(label)


Making one hot label

In [168]:
# as_tensor leaves an existing tensor untouched, so re-running this cell is safe,
# and the explicit integer dtype keeps an empty label list usable as an index.
labels = torch.as_tensor(labels, dtype=torch.long)

# Perform one-hot encoding: row i of the 4x4 identity matrix is the one-hot vector for class i
one_hot_labels = torch.eye(4)[labels]

Then using the filenames to load images.


In [169]:
# Making sure the total data is a multiple of no_of_frames (leftover frames are dropped)
dataset_size = len(sorted_filenames) - (len(sorted_filenames) % no_of_frames)

# One (3, 128, 128) slot per frame; every image is expected to be 128x128.
dataset = torch.zeros(dataset_size, 3, 128, 128)
for i in range(dataset_size):
    image_path = os.path.join(data_dir, sorted_filenames[i])
    # Force 3-channel RGB decoding so PNGs saved with an alpha channel (RGBA)
    # still fit the slot instead of raising a shape mismatch.
    dataset[i] = torchvision.io.read_image(image_path, mode=torchvision.io.ImageReadMode.RGB)
 

Normalizing the pixel values by dividing them by 255, so that they lie between 0 and 1.

In [170]:
print(dataset.shape)
# Scale the 0-255 pixel values into [0, 1].
# NOTE: this rebinds `dataset`, so running the cell twice divides by 255 twice.
dataset = torch.div(dataset, 255.0)


torch.Size([256, 3, 128, 128])


This splits the frames into chunks of `no_of_frames` frames each.

In [171]:
# Create a TensorDataset with data and their label.
# `dataset` was trimmed to a whole number of chunks while `one_hot_labels` covers
# every file, so trim the labels to the same length; TensorDataset requires all
# tensors to have the same first dimension.
data = TensorDataset(dataset, one_hot_labels[:dataset.size(0)])
# Create a DataLoader. shuffle=False keeps the frames in temporal order within each chunk.
dataloader = DataLoader(data, batch_size=no_of_frames, shuffle=False)


### Step 2 Pass the data through 3D Convolution

This is not the final model. I am writing it to get the data into the proper shape.

In [172]:
class Temporal3DConv(nn.Module):
    """Temporal convolution over a frame sequence followed by a ReLU.

    The kernel spans 5 frames and a single pixel spatially, so each output
    position mixes information across time only.
    """

    def __init__(self):
        super().__init__()

        # in_channels=3    : RGB images
        # out_channels=128 : number of learnable filters
        # kernel (5, 1, 1) : 5 frames wide in time, 1x1 in space
        # padding (2, 0, 0): pads the depth (time) axis so the first and last
        #                    frames are covered and the frame count is preserved
        self.conv3d = nn.Conv3d(3, 128, kernel_size=(5, 1, 1), padding=(2, 0, 0))
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply the temporal convolution and the ReLU to ``x`` and return the result."""
        return self.relu(self.conv3d(x))

### Step 3 Pass the 3D Convolution layer outcome through ResNet

This is the middle ResNet layer. `ResNetBlock` represents the residual block architecture from the paper "Deep Residual Learning for Image Recognition".

`ResNetBlocksWithPooling` represents the ResNet stack described in the VPT paper. We use three stacks consecutively, then flatten the result before passing it to the attention layer.

In [173]:
class ResNetBlock(nn.Module):
    """Residual block: two 3x3 convolutions with a skip connection added before the final ReLU."""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        conv_kwargs = dict(kernel_size=3, stride=1, padding=1)
        self.conv1 = nn.Conv2d(in_channels, out_channels, **conv_kwargs)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, **conv_kwargs)

        # The skip path is a pass-through when the channel counts already match;
        # otherwise a 1x1 convolution projects the input to `out_channels`.
        if in_channels == out_channels:
            self.shortcut = nn.Sequential()
        else:
            self.shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1)

    def forward(self, x):
        """Return ``relu(conv2(relu(conv1(x))) + shortcut(x))``."""
        skip = self.shortcut(x)

        hidden = self.relu(self.conv1(x))
        hidden = self.conv2(hidden)
        hidden += skip

        return self.relu(hidden)

class ResNetBlocksWithPooling(nn.Module):
    """One ResNet stack: 3x3 convolution, stride-2 max pooling, then two residual blocks."""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
        # Stride-2 pooling reduces the spatial resolution before the residual blocks.
        self.pool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.resnet_block1 = ResNetBlock(out_channels, out_channels)
        self.resnet_block2 = ResNetBlock(out_channels, out_channels)

    def forward(self, x):
        """Run ``x`` through conv -> pool -> resnet_block1 -> resnet_block2."""
        features = self.pool(self.conv(x))
        for block in (self.resnet_block1, self.resnet_block2):
            features = block(features)
        return features

### Step 4 Pass ResNet outcome through Multiheaded Residual Transformer

In [174]:

class FrameWiseDense(nn.Module):
    """Linear layer followed by a ReLU, applied over the last dimension of the input."""

    def __init__(self, in_features, out_features):
        super().__init__()
        self.linear = nn.Linear(in_features, out_features)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return ``relu(linear(x))``."""
        return self.relu(self.linear(x))

class ResidualTransformerBlock(nn.Module):
    """Self-attention followed by a two-layer feed-forward network, each with a residual add and LayerNorm.

    Args:
        embedding_dim: Size of the feature (last) dimension of the input and output.
        num_heads: Number of attention heads; must divide ``embedding_dim``.
        dropout: Dropout probability used inside the attention and after the
            attention output and the first dense layer.
        hidden_dim: Width of the intermediate feed-forward layer. Defaults to
            16384, the value that was previously hard-coded.
    """

    def __init__(self, embedding_dim, num_heads, dropout=0.1, hidden_dim=16384):
        super(ResidualTransformerBlock, self).__init__()
        # batch_first is left at its default, so the attention layer uses
        # PyTorch's default (sequence-first / unbatched) input layout.
        self.attention = nn.MultiheadAttention(embedding_dim, num_heads, dropout=dropout)
        self.dropout1 = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(embedding_dim)
        self.dense1 = FrameWiseDense(embedding_dim, hidden_dim)
        self.dropout2 = nn.Dropout(dropout)
        self.norm2 = nn.LayerNorm(embedding_dim)
        self.dense2 = FrameWiseDense(hidden_dim, embedding_dim)

    def forward(self, x):
        """Return the block output, which has the same shape as ``x``."""
        # Self-attention sub-layer: x is used as query, key and value.
        residual = x
        out, _ = self.attention(x, x, x)
        out = self.dropout1(out)
        out = self.norm1(out + residual)

        # Feed-forward sub-layer: expand to hidden_dim, then project back.
        residual = out
        out = self.dense1(out)
        out = self.dropout2(out)
        out = self.dense2(out)
        out = self.norm2(out + residual)
        return out

class ActionPredictionModel(nn.Module):
    """Transformer head that maps flattened per-frame features to action probabilities.

    Args:
        num_actions: Number of action classes produced by the final linear layer.
    """

    def __init__(self, num_actions):
        super(ActionPredictionModel, self).__init__()
        # the initial value is 16384 because it is the flattened output dimension for ResNet
        self.dense1 = FrameWiseDense(16384, 256)
        self.dense2 = FrameWiseDense(256, 4096)
        self.residual_transformer_blocks = nn.Sequential(
            ResidualTransformerBlock(embedding_dim=4096, num_heads=32),
            ResidualTransformerBlock(embedding_dim=4096, num_heads=32),
            ResidualTransformerBlock(embedding_dim=4096, num_heads=32),
            ResidualTransformerBlock(embedding_dim=4096, num_heads=32)
        )
        self.dense3 = FrameWiseDense(4096, 16384)
        self.dense4 = FrameWiseDense(16384, 4096)
        self.action_head = nn.Linear(4096, num_actions)

    def forward(self, x):
        """Return per-frame action probabilities; the last dimension has size ``num_actions``."""
        out = self.dense1(x)
        out = self.dense2(out)
        out = self.residual_transformer_blocks(out)
        out = self.dense3(out)
        out = self.dense4(out)
        out = self.action_head(out)
        # Normalise over the action (last) axis. For the 2-D input used in this
        # notebook this is the same axis as dim=1, and it stays correct if a
        # leading batch dimension is added later.
        out = F.softmax(out, dim=-1)
        return out

### Step 5 Combining everything in a single model 

In [175]:
class IDM(nn.Module):
    """Full model: temporal 3D convolution -> three ResNet stacks -> flatten -> transformer head.

    The transformer head expects 16384 flattened features per frame. With 64
    output channels and three stride-2 poolings that corresponds to 128x128
    input frames (128 -> 64 -> 32 -> 16, and 64 * 16 * 16 = 16384).
    """

    def __init__(self):
        super(IDM, self).__init__()
        self.temporal3DConv = Temporal3DConv()
        self.ResNetStack1 = ResNetBlocksWithPooling(128, 64)
        self.ResNetStack2 = ResNetBlocksWithPooling(64, 64)
        self.ResNetStack3 = ResNetBlocksWithPooling(64, 64)
        self.flattenLayer = nn.Flatten()
        self.transformerLayer = ActionPredictionModel(num_actions=4)

    def forward(self, input):
        """Run one chunk of frames through the model.

        Args:
            input: Frame tensor laid out as (num_frames, num_channels, height, width).

        Returns:
            The output of the transformer head for the chunk.
        """
        # For Conv3D the input format is (batch_size, num_channels, num_frames, height, width).
        # unsqueeze adds an outer dimension so that batch_size = 1, and permute
        # swaps the frame and channel axes into that order.
        out = input.unsqueeze(0).permute(0, 2, 1, 3, 4)
        out = self.temporal3DConv(out)

        # The ResNet stacks are 2-D, so fold the frame axis into the batch axis:
        # (batch, channels, frames, H, W) -> (batch * frames, channels, H, W).
        # The sizes are read from the tensor itself rather than from the
        # notebook-level chunk size, so any chunk length works.
        batch_size, channels, num_frames, height, width = out.shape
        out = out.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width)
        out = self.ResNetStack1(out)
        out = self.ResNetStack2(out)
        out = self.ResNetStack3(out)
        out = self.flattenLayer(out)
        out = self.transformerLayer(out)
        return out

    

Some code to check the GPU memory status

In [176]:
# print(torch.cuda.mem_get_info())
# torch.cuda.empty_cache()
# torch.cuda.mem_get_info()

### Step 6 Declaring the model

Loading and printing the model 

In [177]:

# Build the model, then move its parameters to the selected device.
model = IDM()
model = model.to(device)
print(model)

IDM(
  (temporal3DConv): Temporal3DConv(
    (conv3d): Conv3d(3, 128, kernel_size=(5, 1, 1), stride=(1, 1, 1), padding=(2, 0, 0))
    (relu): ReLU()
  )
  (ResNetStack1): ResNetBlocksWithPooling(
    (conv): Conv2d(128, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (pool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (resnet_block1): ResNetBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (shortcut): Sequential()
    )
    (resnet_block2): ResNetBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (shortcut): Sequential()
    )
  )
  (ResNetStack2): ResNetBlocksWithPooling(
    (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1

Checking the number of trainable parameters

In [178]:
# Count the elements of every parameter that takes part in gradient updates.
total_params = sum(
    map(torch.Tensor.numel, filter(lambda p: p.requires_grad, model.parameters()))
)
print(f"Total Trainable Parameters: {total_params}")
# Report whether the model's first parameter lives on a CUDA device.
print(next(model.parameters()).is_cuda)

Total Trainable Parameters: 945614020
False


### Step 7 Doing Forward pass (NEED TO MODIFY TO SUPPORT FULL TRAINING)

In [179]:
# Placeholder so `output` exists even if the dataloader yields no chunks.
output = None

# This is a forward pass only, so disable autograd: without no_grad every chunk
# records a graph that keeps its activations alive for as long as `output` is
# referenced. Remove the no_grad context once a loss and backward pass are added.
with torch.no_grad():
    for framechunk in dataloader:
        # Each chunk is a (frames, one-hot labels) pair from the TensorDataset.
        frames = framechunk[0].to(device)
        true_label = framechunk[1].to(device)
        # Pass the input through the model
        output = model(frames)
        del frames
        # Collect garbage first so that the memory it releases can be handed
        # back by empty_cache().
        gc.collect()
        torch.cuda.empty_cache()
        print(output.shape)

torch.Size([128, 4])
torch.Size([128, 4])


This should theoretically clear up the memory, but it is not working properly.

In [180]:
# Drop the notebook's reference to the model, then collect garbage BEFORE emptying
# the CUDA cache: empty_cache() can only return blocks that are already free.
# Any other live reference to model tensors (for example `output` from the
# forward pass) will still keep its memory allocated.
del model
gc.collect()
torch.cuda.empty_cache()

0