In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

In [3]:
print(torch.__version__)

2.4.1+cu121


In [4]:
print(torch.cuda.is_available())

True


### LSTM Implementation

In [5]:
# Not Using
class LSTMCell(nn.Module):
    def __init__(self, input_size, hidden_size, bias=True):
        super(LSTMCell, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        
        # Weights for input gate
        self.Wi = nn.Parameter(torch.Tensor(hidden_size, input_size + hidden_size))
        # Weights for forget gate
        self.Wf = nn.Parameter(torch.Tensor(hidden_size, input_size + hidden_size))
        # Weights for cell state
        self.Wc = nn.Parameter(torch.Tensor(hidden_size, input_size + hidden_size))
        # Weights for output gate
        self.Wo = nn.Parameter(torch.Tensor(hidden_size, input_size + hidden_size))
        
        # Biases
        if bias:
            self.bi = nn.Parameter(torch.Tensor(hidden_size))
            self.bf = nn.Parameter(torch.Tensor(hidden_size))
            self.bc = nn.Parameter(torch.Tensor(hidden_size))
            self.bo = nn.Parameter(torch.Tensor(hidden_size))
        else:
            self.register_parameter('bi', None)
            self.register_parameter('bf', None)
            self.register_parameter('bc', None)
            self.register_parameter('bo', None)
        
        self.reset_parameters()
    
    def reset_parameters(self):
        # Xavier/Glorot initialization
        std = 1.0 / np.sqrt(self.hidden_size)
        for p in self.parameters():
            p.data.uniform_(-std, std)
    
    def forward(self, x, state):
        # Unpack previous hidden and cell states
        h_prev, c_prev = state
        
        # Concatenate input and previous hidden state
        combined = torch.cat((x, h_prev), dim=1)
        
        # Input gate
        i = torch.sigmoid(F.linear(combined, self.Wi, self.bi))
        
        # Forget gate
        f = torch.sigmoid(F.linear(combined, self.Wf, self.bf))
        
        # Cell state candidate
        c_candidate = torch.tanh(F.linear(combined, self.Wc, self.bc))
        
        # Output gate
        o = torch.sigmoid(F.linear(combined, self.Wo, self.bo))
        
        # Update cell state
        c_next = f * c_prev + i * c_candidate
        
        # Update hidden state
        h_next = o * torch.tanh(c_next)
        
        return h_next, c_next

class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=1, bias=True):
        super(LSTM, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        # Create LSTM cells for each layer
        self.lstm_cells = nn.ModuleList([
            LSTMCell(input_size if layer == 0 else hidden_size, 
                     hidden_size, bias) 
            for layer in range(num_layers)
        ])
    
    def forward(self, x, states=None):
        # x shape: (batch_size, seq_len, input_size)
        batch_size, seq_len, _ = x.size()
        
        # Initialize hidden states if not provided
        if states is None:
            h0 = torch.zeros(self.num_layers, batch_size, self.hidden_size, device=x.device)
            c0 = torch.zeros(self.num_layers, batch_size, self.hidden_size, device=x.device)
            states = (h0, c0)
        
        # Unpack initial states
        h_prev, c_prev = states
        
        # Output tensor to store hidden states for each timestep
        outputs = []
        
        # Process each timestep
        for t in range(seq_len):
            layer_input = x[:, t, :]
            
            # Process through each LSTM layer
            layer_h_states = []
            for layer in range(self.num_layers):
                # Get cell state for this layer
                h_prev_layer = h_prev[layer]
                c_prev_layer = c_prev[layer]
                
                # Update hidden and cell states
                h_next, c_next = self.lstm_cells[layer](layer_input, (h_prev_layer, c_prev_layer))
                
                # Update layer input for next layer
                layer_input = h_next
                
                # Store updated states
                layer_h_states.append(h_next)
                h_prev[layer] = h_next
                c_prev[layer] = c_next
            
            # Store final layer's hidden state
            outputs.append(layer_h_states[-1])
        
        # Stack outputs and return
        return torch.stack(outputs, dim=1)

In [6]:
num_layers = 2
model = LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers)

# Print the model structure
print(model)

NameError: name 'input_size' is not defined

In [7]:
class CustomLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super(CustomLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        # Create LSTM cells for each layer
        self.lstm_cells = nn.ModuleList([
            nn.LSTMCell(input_size if i == 0 else hidden_size, hidden_size) 
            for i in range(num_layers)
        ])
        
        # Output layer
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        """
        Args:
            x: Input tensor of shape (batch_size, seq_len, input_size)
        Returns:
            output: Output tensor of shape (batch_size, seq_len, output_size)
        """
        batch_size, seq_len, _ = x.size()
        
        # Initialize hidden and cell states for each layer
        h_t = [torch.zeros(batch_size, self.hidden_size).to(x.device) for _ in range(self.num_layers)]
        c_t = [torch.zeros(batch_size, self.hidden_size).to(x.device) for _ in range(self.num_layers)]
        
        # To store the output at each time step
        outputs = []
        
        for t in range(seq_len):
            # Extract the time step t input
            x_t = x[:, t, :]
            
            # Pass through each layer
            for layer in range(self.num_layers):
                h_t[layer], c_t[layer] = self.lstm_cells[layer](
                    x_t, (h_t[layer], c_t[layer])
                )
                # The input to the next layer is the output of the current layer
                x_t = h_t[layer]
            
            # Pass the final layer's output through the fully connected layer
            output_t = self.fc(h_t[-1])
            outputs.append(output_t)
        
        # Stack the outputs to form the final output tensor
        outputs = torch.stack(outputs, dim=1)
        return outputs

# Example usage
input_size = 10  # Dimensionality of input features
hidden_size = 20  # Number of hidden units
output_size = 5  # Dimensionality of output features
seq_len = 7  # Length of the input sequence
batch_size = 3  # Batch size
num_layers = 4  # Number of LSTM layers

model = CustomLSTM(input_size, hidden_size, output_size, num_layers)

# Random input tensor
x = torch.randn(batch_size, seq_len, input_size)

# Forward pass
output = model(x)
print("Output shape:", output.shape)  # Should be (batch_size, seq_len, output_size)

Output shape: torch.Size([3, 7, 5])


In [8]:
model

CustomLSTM(
  (lstm_cells): ModuleList(
    (0): LSTMCell(10, 20)
    (1-3): 3 x LSTMCell(20, 20)
  )
  (fc): Linear(in_features=20, out_features=5, bias=True)
)

### ConvLSTM

https://github.com/rogertrullo/pytorch_convlstm/blob/master/conv_lstm.py


https://github.com/Atcold/pytorch-CortexNet/blob/master/model/ConvLSTMCell.py

https://sladewinter.medium.com/video-frame-prediction-using-convlstm-network-in-pytorch-b5210a6ce582

https://towardsdatascience.com/video-prediction-using-convlstm-with-pytorch-lightning-27b195fd21a2

https://www.kaggle.com/code/nguyenmanhcuongg/pytorch-video-classification-with-conv2d-lstm/notebook

https://www.kaggle.com/code/lonnieqin/video-classification

In [9]:
class ConvLSTMCell(nn.Module):
    def __init__(self, input_dim, hidden_dim, kernel_size):
        super(ConvLSTMCell, self).__init__()
        self.hidden_dim = hidden_dim
        self.padding = kernel_size // 2
        self.conv = nn.Conv2d(input_dim + hidden_dim, 4 * hidden_dim, kernel_size, padding=self.padding)
    
    def forward(self, x, state):
        h_prev, c_prev = state
        combined = torch.cat([x, h_prev], dim=1)
        gates = self.conv(combined)
        i, f, o, g = torch.chunk(gates, 4, dim=1)
        i, f, o, g = torch.sigmoid(i), torch.sigmoid(f), torch.sigmoid(o), torch.tanh(g)
        c = f * c_prev + i * g
        h = o * torch.tanh(c)
        return h, (h, c)

class ConvLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, kernel_size, num_layers):
        super(ConvLSTM, self).__init__()
        self.num_layers = num_layers
        self.cells = nn.ModuleList(
            [ConvLSTMCell(input_dim if i == 0 else hidden_dim, hidden_dim, kernel_size) for i in range(num_layers)]
        )
    
    def forward(self, x, state=None):
        b, t, c, h, w = x.size()
        if state is None:
            state = [(torch.zeros(b, self.cells[i].hidden_dim, h, w).to(x.device),
                      torch.zeros(b, self.cells[i].hidden_dim, h, w).to(x.device)) for i in range(self.num_layers)]
        
        outputs = []
        for t in range(t):
            x_t = x[:, t]
            for i, cell in enumerate(self.cells):
                x_t, state[i] = cell(x_t, state[i])
            outputs.append(x_t)
        return torch.stack(outputs, dim=1), state

In [10]:
# Example ConvLSTM initialization
input_dim = 3        # Number of input channels (e.g., RGB)
hidden_dim = 64      # Number of hidden channels
kernel_size = 3      # Size of the convolution kernel
num_layers = 2       # Number of ConvLSTM layers

model = ConvLSTM(input_dim, hidden_dim, kernel_size, num_layers)

# Print the model structure
print(model)

ConvLSTM(
  (cells): ModuleList(
    (0): ConvLSTMCell(
      (conv): Conv2d(67, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    )
    (1): ConvLSTMCell(
      (conv): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    )
  )
)


In [11]:
import os
from torchvision import transforms
from PIL import Image
import torch

In [12]:
class KTHProcessedDataset(torch.utils.data.Dataset):
    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.data = []
        
        # Gather all image paths and their corresponding labels
        for label, category in enumerate(os.listdir(root_dir)):
            category_path = os.path.join(root_dir, category)
            for img_file in os.listdir(category_path):
                self.data.append((os.path.join(category_path, img_file), label))
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        img_path, label = self.data[idx]
        img = Image.open(img_path).convert("RGB")
        if self.transform:
            img = self.transform(img)
        return img, label


In [13]:
# Example usage
dataset = KTHProcessedDataset(
    root_dir="/home/nfs/inf6/data/datasets/kth_actions/processed",
    transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5], std=[0.5])
    ])
)

dataloader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)

In [14]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import random

In [33]:
class KTHProcessedDataset(Dataset):
    def __init__(self, root_dir, sequence_length, transform=None):
        self.root_dir = root_dir
        self.sequence_length = sequence_length
        self.transform = transform
        self.data = []

        # Traverse through action categories and their subfolders
        for label, category in enumerate(os.listdir(root_dir)):
            category_path = os.path.join(root_dir, category)
            if not os.path.isdir(category_path):
                continue
            for subfolder in os.listdir(category_path):
                subfolder_path = os.path.join(category_path, subfolder)
                if os.path.isdir(subfolder_path):
                    frames = sorted(os.listdir(subfolder_path))  # Ensure frames are ordered
                    if len(frames) >= sequence_length:
                        self.data.append((subfolder_path, frames, label))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        subfolder_path, frames, label = self.data[idx]

        # Randomly select a starting point for the sequence
        # start_idx = random.randint(0, len(frames) - self.sequence_length)
        # selected_frames = frames[start_idx:start_idx + self.sequence_length]

        # Select frames sequentially from the start, up to sequence_length
        selected_frames = frames[:self.sequence_length]


        # Load and transform the frames
        sequence = []
        for frame_file in selected_frames:
            frame_path = os.path.join(subfolder_path, frame_file)
            img = Image.open(frame_path).convert("L")  # Convert to grayscale
            if self.transform:
                img = self.transform(img)
            # print("Image shape:",  img.shape)
            # Resize the image to 64x64 if needed
            # img = img.resize((64, 64))  # Ensure size is 64x64
            # Flatten the image to a 1D tensor
            img = img.view(-1)  # Flattening the image to size 4096 (64x64)
            sequence.append(img)

        # Stack frames into a tensor of shape [seq_len, channels, height, width]
        sequence = torch.stack(sequence, dim=0)
        return sequence, label


# Transformations and DataLoader
transform = transforms.Compose([
    # transforms.RandomResizedCrop((64, 64)),  # Randomly crop and resize
    # transforms.RandomHorizontalFlip(),       # Flip image horizontally
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5], std=[0.5])
])

In [34]:
sequence_length = 40
batch_size = 16

dataset = KTHProcessedDataset(root_dir="/home/nfs/inf6/data/datasets/kth_actions/processed", sequence_length=sequence_length, transform=transform)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True) # using drop_last because the last batch size is causing problems so using data with only full batch sizes

In [35]:
import torch.optim as optim
from tqdm import tqdm
import random

In [42]:
!pip install tqdm

Collecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 537 kB/s eta 0:00:011
[?25hInstalling collected packages: tqdm
Successfully installed tqdm-4.67.1


In [36]:
# Define the CustomLSTM model
input_size = 64 * 64  # Flattened image size (for grayscale images)
hidden_size = 256
output_size = len(os.listdir("/home/nfs/inf6/data/datasets/kth_actions/processed"))  # Number of action classes
num_layers = 4

model = CustomLSTM(input_size, hidden_size, output_size, num_layers)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 25

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    correct = 0
    total = 0
    
    for sequences, labels in tqdm(dataloader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        # print("Seq shape: ", sequences.shape)
        # Prepare input and labels
        sequences = sequences.view(batch_size, sequence_length, -1).to(device)  # Flatten each frame
        labels = labels.to(device)
        # print(sequences.shape)
        
        # Forward pass
        outputs = model(sequences)  # Shape: [batch_size, seq_len, output_size]
        outputs = outputs[:, -1, :]  # Use the output of the last time step for classification
        
        # Calculate loss
        loss = criterion(outputs, labels)
        epoch_loss += loss.item()
        
        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        # Calculate accuracy
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    
    accuracy = 100 * correct / total
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}, Accuracy: {accuracy:.2f}%")

Epoch 1/25: 100%|███████████████████████████████████████████| 37/37 [00:22<00:00,  1.65it/s]


Epoch 1/25, Loss: 58.5711, Accuracy: 23.31%


Epoch 2/25: 100%|███████████████████████████████████████████| 37/37 [00:23<00:00,  1.58it/s]


Epoch 2/25, Loss: 53.8805, Accuracy: 27.36%


Epoch 3/25: 100%|███████████████████████████████████████████| 37/37 [00:21<00:00,  1.69it/s]


Epoch 3/25, Loss: 53.6750, Accuracy: 28.38%


Epoch 4/25: 100%|███████████████████████████████████████████| 37/37 [00:22<00:00,  1.64it/s]


Epoch 4/25, Loss: 50.0498, Accuracy: 30.41%


Epoch 5/25: 100%|███████████████████████████████████████████| 37/37 [00:21<00:00,  1.70it/s]


Epoch 5/25, Loss: 48.5600, Accuracy: 29.39%


Epoch 6/25: 100%|███████████████████████████████████████████| 37/37 [00:22<00:00,  1.65it/s]


Epoch 6/25, Loss: 50.5933, Accuracy: 31.76%


Epoch 7/25: 100%|███████████████████████████████████████████| 37/37 [00:22<00:00,  1.64it/s]


Epoch 7/25, Loss: 48.3554, Accuracy: 33.61%


Epoch 8/25: 100%|███████████████████████████████████████████| 37/37 [00:22<00:00,  1.67it/s]


Epoch 8/25, Loss: 51.0739, Accuracy: 28.89%


Epoch 9/25: 100%|███████████████████████████████████████████| 37/37 [00:23<00:00,  1.59it/s]


Epoch 9/25, Loss: 50.6373, Accuracy: 34.97%


Epoch 10/25: 100%|██████████████████████████████████████████| 37/37 [00:22<00:00,  1.64it/s]


Epoch 10/25, Loss: 48.1520, Accuracy: 35.98%


Epoch 11/25: 100%|██████████████████████████████████████████| 37/37 [00:21<00:00,  1.69it/s]


Epoch 11/25, Loss: 46.7283, Accuracy: 31.42%


Epoch 12/25: 100%|██████████████████████████████████████████| 37/37 [00:22<00:00,  1.67it/s]


Epoch 12/25, Loss: 43.8926, Accuracy: 41.39%


Epoch 13/25: 100%|██████████████████████████████████████████| 37/37 [00:22<00:00,  1.65it/s]


Epoch 13/25, Loss: 43.6088, Accuracy: 41.22%


Epoch 14/25: 100%|██████████████████████████████████████████| 37/37 [00:22<00:00,  1.63it/s]


Epoch 14/25, Loss: 49.0614, Accuracy: 41.22%


Epoch 15/25: 100%|██████████████████████████████████████████| 37/37 [00:22<00:00,  1.67it/s]


Epoch 15/25, Loss: 44.5216, Accuracy: 45.95%


Epoch 16/25: 100%|██████████████████████████████████████████| 37/37 [00:23<00:00,  1.60it/s]


Epoch 16/25, Loss: 42.3128, Accuracy: 44.59%


Epoch 17/25: 100%|██████████████████████████████████████████| 37/37 [00:22<00:00,  1.66it/s]


Epoch 17/25, Loss: 41.1004, Accuracy: 46.28%


Epoch 18/25: 100%|██████████████████████████████████████████| 37/37 [00:22<00:00,  1.64it/s]


Epoch 18/25, Loss: 41.1744, Accuracy: 45.44%


Epoch 19/25: 100%|██████████████████████████████████████████| 37/37 [00:21<00:00,  1.70it/s]


Epoch 19/25, Loss: 39.3515, Accuracy: 46.96%


Epoch 20/25: 100%|██████████████████████████████████████████| 37/37 [00:21<00:00,  1.68it/s]


Epoch 20/25, Loss: 40.5433, Accuracy: 46.79%


Epoch 21/25: 100%|██████████████████████████████████████████| 37/37 [00:21<00:00,  1.71it/s]


Epoch 21/25, Loss: 40.5336, Accuracy: 46.79%


Epoch 22/25: 100%|██████████████████████████████████████████| 37/37 [00:22<00:00,  1.67it/s]


Epoch 22/25, Loss: 36.7278, Accuracy: 50.00%


Epoch 23/25: 100%|██████████████████████████████████████████| 37/37 [00:22<00:00,  1.66it/s]


Epoch 23/25, Loss: 34.6514, Accuracy: 51.52%


Epoch 24/25: 100%|██████████████████████████████████████████| 37/37 [00:21<00:00,  1.69it/s]


Epoch 24/25, Loss: 36.9132, Accuracy: 52.53%


Epoch 25/25: 100%|██████████████████████████████████████████| 37/37 [00:22<00:00,  1.68it/s]

Epoch 25/25, Loss: 36.5339, Accuracy: 53.04%



