In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from torch.optim.lr_scheduler import StepLR
import numpy as np
import matplotlib.pyplot as plt
import os
from PIL import Image
import random
from tqdm import tqdm
from sklearn.metrics import confusion_matrix
import seaborn as sns
import wandb

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
class KTHProcessedDataset(Dataset):
    def __init__(self, root_dir, sequence_length, transform=None):
        self.root_dir = root_dir
        self.sequence_length = sequence_length
        self.transform = transform
        self.data = []
        self.class_mapping = {}  # To store class ID to name mapping

        # Traverse through action categories and their subfolders
        for label, category in enumerate(sorted(os.listdir(root_dir))):
            category_path = os.path.join(root_dir, category)
            if not os.path.isdir(category_path):
                continue
            self.class_mapping[label] = category  # Map class ID to category name
            for subfolder in os.listdir(category_path):
                subfolder_path = os.path.join(category_path, subfolder)
                if os.path.isdir(subfolder_path):
                    frames = sorted(os.listdir(subfolder_path))  # Ensure frames are ordered

                    # Check if there are enough frames
                    if len(frames) >= sequence_length:
                        self.data.append((subfolder_path, frames, label))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        subfolder_path, frames, label = self.data[idx]

        # Select frames sequentially from the start, up to sequence_length
        selected_frames = frames[:self.sequence_length]

        sequence = []
        for frame_file in selected_frames:
            frame_path = os.path.join(subfolder_path, frame_file)
            try:
                # Try to open the image
                img = Image.open(frame_path).convert("L")  # Convert to grayscale
                if self.transform:
                    img = self.transform(img)

                # Flatten the image to a 1D tensor
                img = img.view(-1)  # Flattening the image to size 4096 (64x64)
                sequence.append(img)

            except (IOError, OSError) as e:
                # Log the error and skip the corrupted frame
                print(f"Warning: Skipping corrupted image {frame_path} due to error: {e}")
                return self.__getitem__((idx + 1) % len(self))  # Skip to the next sample

        # Stack frames into a tensor of shape [sequence_length, 4096]
        sequence = torch.stack(sequence, dim=0)
        return sequence, label


In [4]:
# Define augmentations and transformations
transform = transforms.Compose([
    # Spatial augmentations
    # RandomHorizontalFlip(p=0.5),                # Flip frames horizontally with 50% probability
    # RandomRotation(degrees=15),                # Random rotation within ±15 degrees
    transforms.RandomCrop(size=(64, 64), pad_if_needed=True),  # Random crop to 64x64, pad if needed

    transforms.RandomApply([transforms.ColorJitter(brightness=0.2, contrast=0.2)], p=0.3),  # Adjust brightness/contrast
    transforms.GaussianBlur(3, sigma=(0.1, 2.0)),  # Random blur
    
    # Conversion and normalization
    transforms.ToTensor(),                                # Convert PIL image to tensor
    transforms.Normalize(mean=[0.5], std=[0.5])           # Normalize to [-1, 1]
])

In [5]:
# Paths and hyperparameters
root_dir = '/home/nfs/inf6/data/datasets/kth_actions/processed'
sequence_length = 45
batch_size = 16
train_ratio = 0.8  # 80% for training, 20% for validation


In [6]:
# Load the full dataset
dataset = KTHProcessedDataset(root_dir=root_dir, sequence_length=sequence_length, transform=transform)

# Calculate train and validation sizes
dataset_size = len(dataset)
train_size = int(train_ratio * dataset_size)
val_size = dataset_size - train_size

# Split the dataset
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Create DataLoaders for train and validation sets
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

print(f"Dataset size: {dataset_size}")
print(f"Train set size: {train_size}")
print(f"Validation set size: {val_size}")

Dataset size: 599
Train set size: 479
Validation set size: 120


In [7]:
print("Dataset Classes:")
for class_id, class_name in dataset.class_mapping.items():
    print(f"Class ID: {class_id}, Class Name: {class_name}")

Dataset Classes:
Class ID: 0, Class Name: boxing
Class ID: 1, Class Name: handclapping
Class ID: 2, Class Name: handwaving
Class ID: 3, Class Name: jogging
Class ID: 4, Class Name: running
Class ID: 5, Class Name: walking


In [8]:
def train_nn(model, train_loader, criterion, optimizer, device):
    """Train the LSTM model for one epoch"""
    model.train()
    running_loss = 0.0
    correct_labels = 0
    total_labels = 0

    for inputs, labels in train_loader:
        inputs = inputs.view(-1, 45, 1, height, width)
        inputs, labels = inputs.to(device), labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(inputs)  # Get outputs for the full sequence
        
        # Only use the last time step's output for classification
        loss = criterion(outputs, labels)  # Use the last timestep output
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # Compute accuracy for the last time step
        _, predicted_labels = torch.max(outputs, dim=1)
        correct_labels += (predicted_labels == labels).sum().item()
        total_labels += labels.size(0)

    avg_loss = running_loss / len(train_loader)
    accuracy = correct_labels / total_labels
    return avg_loss, accuracy

def evaluate_nn(model, eval_loader, criterion, device):
    """Evaluate the LSTM model"""
    model.eval()
    running_loss = 0.0
    correct_labels = 0
    total_labels = 0

    with torch.no_grad():
        for inputs, labels in eval_loader:
            inputs = inputs.view(-1, 45, 1, height, width)
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)  # Get outputs for the full sequence
            
            loss = criterion(outputs, labels)  # Use the last timestep output
            running_loss += loss.item()

            # Compute accuracy for the last time step
            _, predicted_labels = torch.max(outputs, dim=1)
            correct_labels += (predicted_labels == labels).sum().item()
            total_labels += labels.size(0)

    avg_loss = running_loss / len(eval_loader)
    accuracy = correct_labels / total_labels
    return avg_loss, accuracy

def run_training(model, train_loader, eval_loader, criterion, device, num_epochs, learning_rate=0.001, step_size=5, gamma=0.5,
                 project='lstm_training', name='test_run'):
    """Train and evaluate the LSTM model for a given number of epochs with W&B logging"""
    # Initialize W&B logging
    wandb.init(project=project, name=name, config={
        "learning_rate": learning_rate,
        "num_epochs": num_epochs,
        "step_size": step_size,
        "gamma": gamma,
        "optimizer": "Adam",
    },
    )
    config = wandb.config

    optimizer = optim.Adam(model.parameters(), lr=learning_rate)  # Adam optimizer
    scheduler = StepLR(optimizer, step_size=step_size, gamma=gamma)  # Learning rate scheduler
    
    train_losses = []
    eval_losses = []
    train_accuracies = []
    eval_accuracies = []
    learning_rates = []

    for epoch in tqdm(range(num_epochs)):
        # Train for one epoch
        train_loss, train_accuracy = train_nn(model, train_loader, criterion, optimizer, device)
        # Evaluate after each epoch
        eval_loss, eval_accuracy = evaluate_nn(model, eval_loader, criterion, device)

        # Get current learning rate
        current_lr = scheduler.get_last_lr()[0]

        # Record the metrics
        train_losses.append(train_loss)
        eval_losses.append(eval_loss)
        train_accuracies.append(train_accuracy)
        eval_accuracies.append(eval_accuracy)
        learning_rates.append(current_lr)

        # Log metrics to W&B
        wandb.log({
            "epoch": epoch + 1,
            "train_loss": train_loss,
            "train_accuracy": train_accuracy,
            "eval_loss": eval_loss,
            "eval_accuracy": eval_accuracy,
            "learning_rate": current_lr
        })

        # Print epoch statistics
        print(f"Epoch {epoch+1}/{num_epochs}")
        print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")
        print(f"Eval Loss: {eval_loss:.4f}, Eval Accuracy: {eval_accuracy:.4f}")
        print(f"Learning Rate: {current_lr:.6f}")
        
        # Step the learning rate scheduler
        scheduler.step()

    # Finish W&B run
    wandb.finish()

    return model, train_losses, train_accuracies, eval_losses, eval_accuracies, learning_rates

### Pytorch LSTM model

In [9]:
class CustomLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super(CustomLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        # Create LSTM cells for each layer
        self.lstm_cells = nn.ModuleList([
            nn.LSTMCell(input_size if i == 0 else hidden_size, hidden_size) 
            for i in range(num_layers)
        ])
        
        # Output layer
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        """
        Args:
            x: Input tensor of shape (batch_size, seq_len, input_size)
        Returns:
            output: Output tensor of shape (batch_size, seq_len, output_size)
        """
        batch_size, seq_len, _ = x.size()
        
        # Initialize hidden and cell states for each layer
        h_t = [torch.zeros(batch_size, self.hidden_size).to(x.device) for _ in range(self.num_layers)]
        c_t = [torch.zeros(batch_size, self.hidden_size).to(x.device) for _ in range(self.num_layers)]
        
        # To store the output at each time step
        outputs = []
        
        for t in range(seq_len):
            # Extract the time step t input
            x_t = x[:, t, :]
            
            # Pass through each layer
            for layer in range(self.num_layers):
                h_t[layer], c_t[layer] = self.lstm_cells[layer](
                    x_t, (h_t[layer], c_t[layer])
                )
                # The input to the next layer is the output of the current layer
                x_t = h_t[layer]
            
            # Pass the final layer's output through the fully connected layer
            output_t = self.fc(h_t[-1])
            outputs.append(output_t)
        
        # Stack the outputs to form the final output tensor
        outputs = torch.stack(outputs, dim=1)
        return outputs

In [10]:
class ComplexActionRecognitionModel(nn.Module):
    def __init__(self, sequence_length, input_dim=64*64, num_classes=6):
        super(ComplexActionRecognitionModel, self).__init__()
        self.sequence_length = sequence_length

        # Convolutional Encoder
        self.encoder = nn.Sequential(
            # Block 1
            nn.Conv2d(1, 64, kernel_size=3, stride=1, padding=1),  # Output: (64, 64, 64)
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),  # Output: (64, 64, 64)
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),  # Output: (64, 32, 32)
            nn.Dropout(0.2),

            # Block 2
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),  # Output: (128, 32, 32)
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1),  # Output: (128, 32, 32)
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),  # Output: (128, 16, 16)
            nn.Dropout(0.3),

            # Block 3
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),  # Output: (256, 16, 16)
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),  # Output: (256, 16, 16)
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),  # Output: (256, 8, 8)
            nn.Dropout(0.4),
        )

        # Replace PyTorch LSTM with Custom LSTM
        self.rnn = CustomLSTM(
            input_size=256 * 8 * 8,  # Flattened size from encoder
            hidden_size=256,         # Hidden state size
            output_size=256,         # Output size per time step
            num_layers=2             # Number of LSTM layers
        )

        # Classifier
        self.classifier = nn.Sequential(
            nn.Conv1d(256, 128, kernel_size=1),  # Temporal conv
            nn.ReLU(),
            nn.AdaptiveAvgPool1d(1),  # Reduce sequence length to 1
            nn.Flatten(),  # Flatten the output
            nn.Linear(128, num_classes)  # Final classification layer
        )

    def forward(self, x):
        # x: [batch_size, sequence_length, 64*64]
        batch_size, seq_len, _ = x.size()
        
        # Reshape and encode each frame
        x = x.view(batch_size * seq_len, 1, 64, 64)  # Reshape to (batch_size * sequence_length, 1, 64, 64)
        x = self.encoder(x)  # Pass through convolutional encoder
        x = x.view(batch_size, seq_len, -1)  # Reshape to (batch_size, sequence_length, 256*8*8)
        
        # Recurrent Module
        x = self.rnn(x)  # Custom LSTM output shape: (batch_size, sequence_length, 256)
        
        # Classifier
        x = x.transpose(1, 2)  # Change shape to (batch_size, 256, sequence_length) for Conv1d
        x = self.classifier(x)  # Output shape: (batch_size, num_classes)
        
        return x

In [10]:
num_classes = 6
sequence_length = 45
input_dim = 64 * 64
num_epochs = 10
learning_rate = 0.001
step_size = 5
gamma = 0.5

# Define the criterion
criterion = nn.CrossEntropyLoss()

In [12]:
model = ComplexActionRecognitionModel(input_dim=input_dim, sequence_length=sequence_length, num_classes=num_classes).to(device)
model

ComplexActionRecognitionModel(
  (encoder): Sequential(
    (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU()
    (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (7): Dropout(p=0.2, inplace=False)
    (8): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): ReLU()
    (11): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (12): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (13): ReLU()
    (14): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (15)

In [13]:
# Run training
model, train_losses, train_accuracies, eval_losses, eval_accuracies, learning_rates = run_training(
    model, train_loader, val_loader, criterion, device,
    num_epochs=num_epochs, learning_rate=learning_rate, step_size=step_size, gamma=gamma, name='pth_lstm', project='assignment_4'
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mahmadjaved97[0m. Use [1m`wandb login --relogin`[0m to force relogin


 10%|█████▌                                                  | 1/10 [00:17<02:39, 17.77s/it]

Epoch 1/10
Train Loss: 1.3945, Train Accuracy: 0.3215
Eval Loss: 1.2967, Eval Accuracy: 0.3333
Learning Rate: 0.001000


 20%|███████████▏                                            | 2/10 [00:32<02:07, 15.98s/it]

Epoch 2/10
Train Loss: 1.1502, Train Accuracy: 0.3737
Eval Loss: 1.0825, Eval Accuracy: 0.4417
Learning Rate: 0.001000


 30%|████████████████▊                                       | 3/10 [00:47<01:48, 15.46s/it]

Epoch 3/10
Train Loss: 1.1374, Train Accuracy: 0.4363
Eval Loss: 1.0816, Eval Accuracy: 0.4250
Learning Rate: 0.001000


 40%|██████████████████████▍                                 | 4/10 [01:02<01:31, 15.20s/it]

Epoch 4/10
Train Loss: 1.1175, Train Accuracy: 0.3758
Eval Loss: 0.9975, Eval Accuracy: 0.4667
Learning Rate: 0.001000


 50%|████████████████████████████                            | 5/10 [01:17<01:15, 15.12s/it]

Epoch 5/10
Train Loss: 1.0168, Train Accuracy: 0.4593
Eval Loss: 0.9421, Eval Accuracy: 0.4417
Learning Rate: 0.001000


 60%|█████████████████████████████████▌                      | 6/10 [01:32<01:00, 15.04s/it]

Epoch 6/10
Train Loss: 0.9499, Train Accuracy: 0.4843
Eval Loss: 0.8649, Eval Accuracy: 0.5667
Learning Rate: 0.000500


 70%|███████████████████████████████████████▏                | 7/10 [01:46<00:44, 14.99s/it]

Epoch 7/10
Train Loss: 0.8751, Train Accuracy: 0.5678
Eval Loss: 0.7853, Eval Accuracy: 0.5917
Learning Rate: 0.000500


 80%|████████████████████████████████████████████▊           | 8/10 [02:01<00:29, 14.98s/it]

Epoch 8/10
Train Loss: 0.8598, Train Accuracy: 0.5637
Eval Loss: 0.7806, Eval Accuracy: 0.6000
Learning Rate: 0.000500


 90%|██████████████████████████████████████████████████▍     | 9/10 [02:16<00:14, 15.00s/it]

Epoch 9/10
Train Loss: 0.7797, Train Accuracy: 0.6326
Eval Loss: 0.7810, Eval Accuracy: 0.6583
Learning Rate: 0.000500


100%|███████████████████████████████████████████████████████| 10/10 [02:31<00:00, 15.18s/it]

Epoch 10/10
Train Loss: 0.8333, Train Accuracy: 0.5866
Eval Loss: 0.7435, Eval Accuracy: 0.6000
Learning Rate: 0.000500





0,1
epoch,▁▂▃▃▄▅▆▆▇█
eval_accuracy,▁▃▃▄▃▆▇▇█▇
eval_loss,█▅▅▄▄▃▂▁▁▁
learning_rate,█████▁▁▁▁▁
train_accuracy,▁▂▄▂▄▅▇▆█▇
train_loss,█▅▅▅▄▃▂▂▁▂

0,1
epoch,10.0
eval_accuracy,0.6
eval_loss,0.74352
learning_rate,0.0005
train_accuracy,0.58664
train_loss,0.83326


### Pytorch GRU

In [14]:
class CustomGRU(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super(CustomGRU, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        # Create GRU cells for each layer
        self.gru_cells = nn.ModuleList([
            nn.GRUCell(input_size if i == 0 else hidden_size, hidden_size)
            for i in range(num_layers)
        ])
        
        # Output layer
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        """
        Args:
            x: Input tensor of shape (batch_size, seq_len, input_size)
        Returns:
            output: Output tensor of shape (batch_size, seq_len, output_size)
        """
        batch_size, seq_len, _ = x.size()
        
        # Initialize hidden states for each layer
        h_t = [torch.zeros(batch_size, self.hidden_size).to(x.device) for _ in range(self.num_layers)]
        
        # To store the output at each time step
        outputs = []
        
        for t in range(seq_len):
            # Extract the time step t input
            x_t = x[:, t, :]
            
            # Pass through each layer
            for layer in range(self.num_layers):
                h_t[layer] = self.gru_cells[layer](x_t, h_t[layer])
                # The input to the next layer is the output of the current layer
                x_t = h_t[layer]
            
            # Pass the final layer's output through the fully connected layer
            output_t = self.fc(h_t[-1])
            outputs.append(output_t)
        
        # Stack the outputs to form the final output tensor
        outputs = torch.stack(outputs, dim=1)
        return outputs

In [18]:
class ComplexActionRecognitionModelGRU(nn.Module):
    def __init__(self, sequence_length, input_dim=64*64, num_classes=6):
        super(ComplexActionRecognitionModelGRU, self).__init__()
        self.sequence_length = sequence_length

        # Convolutional Encoder
        self.encoder = nn.Sequential(
            # Block 1
            nn.Conv2d(1, 64, kernel_size=3, stride=1, padding=1),  # Output: (64, 64, 64)
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),  # Output: (64, 64, 64)
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),  # Output: (64, 32, 32)
            nn.Dropout(0.2),

            # Block 2
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),  # Output: (128, 32, 32)
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1),  # Output: (128, 32, 32)
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),  # Output: (128, 16, 16)
            nn.Dropout(0.3),

            # Block 3
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),  # Output: (256, 16, 16)
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),  # Output: (256, 16, 16)
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),  # Output: (256, 8, 8)
            nn.Dropout(0.4),
        )

        # Replace PyTorch LSTM with Custom GRU
        self.rnn = CustomGRU(
            input_size=256 * 8 * 8,  # Flattened size from encoder
            hidden_size=256,         # Hidden state size
            output_size=256,         # Output size per time step
            num_layers=2             # Number of GRU layers
        )

        # Classifier
        self.classifier = nn.Sequential(
            nn.Conv1d(256, 128, kernel_size=1),  # Temporal conv
            nn.ReLU(),
            nn.AdaptiveAvgPool1d(1),  # Reduce sequence length to 1
            nn.Flatten(),  # Flatten the output
            nn.Linear(128, num_classes)  # Final classification layer
        )

    def forward(self, x):
        # x: [batch_size, sequence_length, 64*64]
        batch_size, seq_len, _ = x.size()
        
        # Reshape and encode each frame
        x = x.view(batch_size * seq_len, 1, 64, 64)  # Reshape to (batch_size * sequence_length, 1, 64, 64)
        x = self.encoder(x)  # Pass through convolutional encoder
        x = x.view(batch_size, seq_len, -1)  # Reshape to (batch_size, sequence_length, 256*8*8)
        
        # Recurrent Module
        x = self.rnn(x)  # Custom GRU output shape: (batch_size, sequence_length, 256)
        
        # Classifier
        x = x.transpose(1, 2)  # Change shape to (batch_size, 256, sequence_length) for Conv1d
        x = self.classifier(x)  # Output shape: (batch_size, num_classes)
        
        return x

In [19]:
model = ComplexActionRecognitionModelGRU(input_dim=input_dim, sequence_length=sequence_length, num_classes=num_classes).to(device)
model

ComplexActionRecognitionModelGRU(
  (encoder): Sequential(
    (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU()
    (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (7): Dropout(p=0.2, inplace=False)
    (8): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): ReLU()
    (11): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (12): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (13): ReLU()
    (14): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (

In [21]:
# Run training
model, train_losses, train_accuracies, eval_losses, eval_accuracies, learning_rates = run_training(
    model, train_loader, val_loader, criterion, device,
    num_epochs=num_epochs, learning_rate=learning_rate, step_size=step_size, gamma=gamma, name='pth_GRU', project='assignment_4'
)

 10%|█████▌                                                  | 1/10 [00:14<02:11, 14.61s/it]

Epoch 1/10
Train Loss: 1.2009, Train Accuracy: 0.3820
Eval Loss: 1.1400, Eval Accuracy: 0.3833
Learning Rate: 0.001000


 20%|███████████▏                                            | 2/10 [00:29<01:56, 14.59s/it]

Epoch 2/10
Train Loss: 1.0525, Train Accuracy: 0.4405
Eval Loss: 0.9315, Eval Accuracy: 0.5333
Learning Rate: 0.001000


 30%|████████████████▊                                       | 3/10 [00:43<01:42, 14.69s/it]

Epoch 3/10
Train Loss: 0.9382, Train Accuracy: 0.5219
Eval Loss: 0.8583, Eval Accuracy: 0.5750
Learning Rate: 0.001000


 40%|██████████████████████▍                                 | 4/10 [00:58<01:28, 14.68s/it]

Epoch 4/10
Train Loss: 0.8790, Train Accuracy: 0.5720
Eval Loss: 0.7472, Eval Accuracy: 0.6083
Learning Rate: 0.001000


 50%|████████████████████████████                            | 5/10 [01:13<01:13, 14.69s/it]

Epoch 5/10
Train Loss: 0.7939, Train Accuracy: 0.6221
Eval Loss: 0.7250, Eval Accuracy: 0.6333
Learning Rate: 0.001000


 60%|█████████████████████████████████▌                      | 6/10 [01:28<00:58, 14.68s/it]

Epoch 6/10
Train Loss: 0.6489, Train Accuracy: 0.6848
Eval Loss: 0.6460, Eval Accuracy: 0.7333
Learning Rate: 0.000500


 70%|███████████████████████████████████████▏                | 7/10 [01:42<00:44, 14.70s/it]

Epoch 7/10
Train Loss: 0.5795, Train Accuracy: 0.7035
Eval Loss: 0.6520, Eval Accuracy: 0.7083
Learning Rate: 0.000500


 80%|████████████████████████████████████████████▊           | 8/10 [01:57<00:29, 14.68s/it]

Epoch 8/10
Train Loss: 0.6026, Train Accuracy: 0.7015
Eval Loss: 0.5555, Eval Accuracy: 0.7083
Learning Rate: 0.000500


 90%|██████████████████████████████████████████████████▍     | 9/10 [02:12<00:14, 14.68s/it]

Epoch 9/10
Train Loss: 0.5262, Train Accuracy: 0.7370
Eval Loss: 0.5993, Eval Accuracy: 0.6917
Learning Rate: 0.000500


100%|███████████████████████████████████████████████████████| 10/10 [02:26<00:00, 14.69s/it]

Epoch 10/10
Train Loss: 0.5116, Train Accuracy: 0.7599
Eval Loss: 0.6588, Eval Accuracy: 0.7000
Learning Rate: 0.000500





0,1
epoch,▁▂▃▃▄▅▆▆▇█
eval_accuracy,▁▄▅▆▆█▇▇▇▇
eval_loss,█▆▅▃▃▂▂▁▂▂
learning_rate,█████▁▁▁▁▁
train_accuracy,▁▂▄▅▅▇▇▇██
train_loss,█▆▅▅▄▂▂▂▁▁

0,1
epoch,10.0
eval_accuracy,0.7
eval_loss,0.65878
learning_rate,0.0005
train_accuracy,0.75992
train_loss,0.5116


### Own LSTM

In [12]:
class LSTMCell(nn.Module):
    """
    Custom LSTM Cell implementation from scratch
    
    Args:
    - input_size: Number of input features
    - hidden_size: Number of hidden units
    - bias: Whether to use bias terms (default: True)
    """
    def __init__(self, input_size, hidden_size, bias=True):
        super(LSTMCell, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        
        # Combine all gates' weights into a single matrix for efficiency
        self.weight_ih = nn.Parameter(torch.randn(4 * hidden_size, input_size))
        self.weight_hh = nn.Parameter(torch.randn(4 * hidden_size, hidden_size))
        
        # Bias terms
        if bias:
            self.bias_ih = nn.Parameter(torch.randn(4 * hidden_size))
            self.bias_hh = nn.Parameter(torch.randn(4 * hidden_size))
        else:
            self.register_parameter('bias_ih', None)
            self.register_parameter('bias_hh', None)
        
        # Initialize weights
        self.reset_parameters()
    
    def reset_parameters(self):
        """
        Xavier uniform initialization for weights
        """
        std = 1.0 / np.sqrt(self.hidden_size)
        for weight in self.parameters():
            nn.init.uniform_(weight, -std, std)
    
    def forward(self, x, hidden_state=None):
        """
        Forward pass for LSTM Cell
        
        Args:
        - x: input tensor of shape (batch_size, input_size)
        - hidden_state: tuple of (h, c) - previous hidden and cell states
        
        Returns:
        - new_h: new hidden state
        - new_c: new cell state
        """
        # Initialize hidden state if not provided
        if hidden_state is None:
            batch_size = x.size(0)
            h = x.new_zeros(batch_size, self.hidden_size)
            c = x.new_zeros(batch_size, self.hidden_size)
        else:
            h, c = hidden_state
        
        # Compute gate inputs
        gates = F.linear(x, self.weight_ih, self.bias_ih) + \
                F.linear(h, self.weight_hh, self.bias_hh)
        
        # Split gates
        ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)
        
        # Apply activation functions
        ingate = torch.sigmoid(ingate)
        forgetgate = torch.sigmoid(forgetgate)
        cellgate = torch.tanh(cellgate)
        outgate = torch.sigmoid(outgate)
        
        # Update cell state
        new_c = (forgetgate * c) + (ingate * cellgate)
        
        # Compute new hidden state
        new_h = outgate * torch.tanh(new_c)
        
        return new_h, new_c

class LSTM(nn.Module):
    """
    Full LSTM implementation using LSTMCell
    
    Args:
    - input_size: Number of input features
    - hidden_size: Number of hidden units
    - num_layers: Number of LSTM layers
    - num_classes: Number of output classes
    - batch_first: Whether input is batch first (default: True)
    """
    def __init__(self, input_size, hidden_size, num_layers, num_classes, batch_first=True):
        print("here it is")
        super(LSTM, self).__init__()
        
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.batch_first = batch_first
        
        # Create LSTM cells for each layer
        self.lstm_cells = nn.ModuleList([
            LSTMCell(input_size if i == 0 else hidden_size, hidden_size)
            for i in range(num_layers)
        ])
        
        # Classification layer
        self.classifier = nn.Linear(hidden_size, num_classes)
    
    def forward(self, x):
        """
        Forward pass for multi-layer LSTM
        
        Args:
        - x: input tensor of shape (batch_size, sequence_length, input_size)
        
        Returns:
        - output: LSTM outputs for all time steps
        """
        # Ensure input is batch first
        if not self.batch_first:
            x = x.transpose(0, 1)
        
        batch_size, seq_len, _ = x.size()
        
        # Initialize hidden states for all layers
        h_list = [torch.zeros(batch_size, self.hidden_size, device=x.device) 
                  for _ in range(self.num_layers)]
        c_list = [torch.zeros(batch_size, self.hidden_size, device=x.device) 
                  for _ in range(self.num_layers)]
        
        # Collect outputs for all time steps
        outputs = []
        for t in range(seq_len):
            input_t = x[:, t, :]
            
            for layer in range(self.num_layers):
                cell = self.lstm_cells[layer]
                h_prev = input_t if layer == 0 else h_list[layer - 1]
                h_list[layer], c_list[layer] = cell(h_prev, (h_list[layer], c_list[layer]))
            
            # Append the output of the final layer at time t
            outputs.append(h_list[-1])
        
        # Stack outputs to form the full sequence
        outputs = torch.stack(outputs, dim=1)  # Shape: (batch_size, sequence_length, hidden_size)
        
        return outputs


In [22]:
class ComplexActionRecognitionModelLSTMO(nn.Module):
    def __init__(self, sequence_length, input_dim=64*64, num_classes=6):
        super(ComplexActionRecognitionModelLSTMO, self).__init__()
        self.sequence_length = sequence_length

        # Convolutional Encoder
        self.encoder = nn.Sequential(
            # Block 1
            nn.Conv2d(1, 64, kernel_size=3, stride=1, padding=1),  # Output: (64, 64, 64)
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),  # Output: (64, 64, 64)
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),  # Output: (64, 32, 32)
            nn.Dropout(0.2),

            # Block 2
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),  # Output: (128, 32, 32)
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1),  # Output: (128, 32, 32)
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),  # Output: (128, 16, 16)
            nn.Dropout(0.3),

            # Block 3
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),  # Output: (256, 16, 16)
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),  # Output: (256, 16, 16)
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),  # Output: (256, 8, 8)
            nn.Dropout(0.4),
        )

        # Custom LSTM Module
        self.rnn = LSTM(
            input_size=256 * 8 * 8,  # Flattened size from encoder
            hidden_size=256,         # Hidden state size
            num_layers=2,            # Number of LSTM layers
            num_classes=256,         # Output size per time step (matches hidden size)
            batch_first=True         # Input format: (batch_size, seq_len, input_size)
        )


        # Classifier
        self.classifier = nn.Sequential(
            nn.Conv1d(256, 128, kernel_size=1),  # Temporal conv
            nn.ReLU(),
            nn.AdaptiveAvgPool1d(1),  # Reduce sequence length to 1
            nn.Flatten(),  # Flatten the output
            nn.Linear(128, num_classes)  # Final classification layer
        )

    def forward(self, x):
        batch_size, seq_len, _ = x.size()
        
        # Reshape and encode each frame
        x = x.view(batch_size * seq_len, 1, 64, 64)  # Reshape to (batch_size * sequence_length, 1, 64, 64)
        x = self.encoder(x)  # Pass through convolutional encoder
        x = x.view(batch_size, seq_len, -1)  # Reshape to (batch_size, sequence_length, 256*8*8)
        
        # Recurrent Module
        x = self.rnn(x)  # Custom LSTM output shape: (batch_size, sequence_length, 256)
        
        # Verify shape before transpose
        # print("Post-RNN Shape:", x.shape)  # Add this debug statement
        
        # Classifier
        x = x.transpose(1, 2)  # Change shape to (batch_size, 256, sequence_length) for Conv1d
        x = self.classifier(x)  # Output shape: (batch_size, num_classes)
        
        return x

In [21]:
model = ComplexActionRecognitionModelLSTMO(input_dim=input_dim, sequence_length=sequence_length, num_classes=num_classes).to(device)
model

here it is


ComplexActionRecognitionModelLSTMO(
  (encoder): Sequential(
    (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU()
    (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (7): Dropout(p=0.2, inplace=False)
    (8): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): ReLU()
    (11): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (12): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (13): ReLU()
    (14): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
   

In [15]:
# Run training
model, train_losses, train_accuracies, eval_losses, eval_accuracies, learning_rates = run_training(
    model, train_loader, val_loader, criterion, device,
    num_epochs=num_epochs, learning_rate=learning_rate, step_size=step_size, gamma=gamma, name='own_lstm', project='assignment_4'
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mahmadjaved97[0m. Use [1m`wandb login --relogin`[0m to force relogin


 10%|█████▌                                                  | 1/10 [00:17<02:41, 17.90s/it]

Epoch 1/10
Train Loss: 1.3967, Train Accuracy: 0.2985
Eval Loss: 1.2756, Eval Accuracy: 0.3417
Learning Rate: 0.001000


 20%|███████████▏                                            | 2/10 [00:32<02:09, 16.14s/it]

Epoch 2/10
Train Loss: 1.1504, Train Accuracy: 0.3236
Eval Loss: 1.2039, Eval Accuracy: 0.3583
Learning Rate: 0.001000


 30%|████████████████▊                                       | 3/10 [00:47<01:48, 15.57s/it]

Epoch 3/10
Train Loss: 1.1103, Train Accuracy: 0.3716
Eval Loss: 1.0655, Eval Accuracy: 0.3833
Learning Rate: 0.001000


 40%|██████████████████████▍                                 | 4/10 [01:02<01:32, 15.35s/it]

Epoch 4/10
Train Loss: 1.0656, Train Accuracy: 0.4050
Eval Loss: 1.0343, Eval Accuracy: 0.4833
Learning Rate: 0.001000


 50%|████████████████████████████                            | 5/10 [01:18<01:17, 15.43s/it]

Epoch 5/10
Train Loss: 0.9926, Train Accuracy: 0.4447
Eval Loss: 0.9737, Eval Accuracy: 0.4750
Learning Rate: 0.001000


 60%|█████████████████████████████████▌                      | 6/10 [01:33<01:01, 15.32s/it]

Epoch 6/10
Train Loss: 0.9628, Train Accuracy: 0.5010
Eval Loss: 0.9079, Eval Accuracy: 0.5167
Learning Rate: 0.000500


 70%|███████████████████████████████████████▏                | 7/10 [01:48<00:45, 15.19s/it]

Epoch 7/10
Train Loss: 0.9033, Train Accuracy: 0.5470
Eval Loss: 0.9197, Eval Accuracy: 0.5250
Learning Rate: 0.000500


 80%|████████████████████████████████████████████▊           | 8/10 [02:03<00:30, 15.12s/it]

Epoch 8/10
Train Loss: 0.8658, Train Accuracy: 0.5846
Eval Loss: 0.8267, Eval Accuracy: 0.5500
Learning Rate: 0.000500


 90%|██████████████████████████████████████████████████▍     | 9/10 [02:18<00:15, 15.09s/it]

Epoch 9/10
Train Loss: 0.8296, Train Accuracy: 0.6096
Eval Loss: 0.8568, Eval Accuracy: 0.5667
Learning Rate: 0.000500


100%|███████████████████████████████████████████████████████| 10/10 [02:33<00:00, 15.33s/it]

Epoch 10/10
Train Loss: 0.7501, Train Accuracy: 0.6618
Eval Loss: 0.7451, Eval Accuracy: 0.6250
Learning Rate: 0.000500





0,1
epoch,▁▂▃▃▄▅▆▆▇█
eval_accuracy,▁▁▂▄▄▅▆▆▇█
eval_loss,█▇▅▅▄▃▃▂▂▁
learning_rate,█████▁▁▁▁▁
train_accuracy,▁▁▂▃▄▅▆▇▇█
train_loss,█▅▅▄▄▃▃▂▂▁

0,1
epoch,10.0
eval_accuracy,0.625
eval_loss,0.74508
learning_rate,0.0005
train_accuracy,0.6618
train_loss,0.75011


### Own ConvLSTM

In [11]:
class ConvLSTMCell(nn.Module):
    def __init__(self, input_channels, hidden_channels, kernel_size, bias=True):
        super(ConvLSTMCell, self).__init__()
        self.input_channels = input_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.padding = kernel_size // 2
        self.bias = bias

        # Gates: i, f, o, g
        self.conv = nn.Conv2d(
            input_channels + hidden_channels, 
            4 * hidden_channels, 
            kernel_size, 
            padding=self.padding, 
            bias=self.bias
        )

    def forward(self, x, hidden):
        h_prev, c_prev = hidden

        # Concatenate along channel dimension
        combined = torch.cat([x, h_prev], dim=1)
        conv_output = self.conv(combined)

        # Split into gates
        i, f, o, g = torch.split(conv_output, self.hidden_channels, dim=1)
        i = torch.sigmoid(i)
        f = torch.sigmoid(f)
        o = torch.sigmoid(o)
        g = torch.tanh(g)

        # Update cell state and hidden state
        c_next = f * c_prev + i * g
        h_next = o * torch.tanh(c_next)

        return h_next, c_next

    def init_hidden(self, batch_size, height, width):
        h = torch.zeros(batch_size, self.hidden_channels, height, width, device=self.conv.weight.device)
        c = torch.zeros(batch_size, self.hidden_channels, height, width, device=self.conv.weight.device)
        return h, c


class ConvLSTM(nn.Module):
    def __init__(self, input_channels, hidden_channels, kernel_size, num_layers, num_classes, height, width, bias=True):
        super(ConvLSTM, self).__init__()
        self.num_layers = num_layers
        self.hidden_channels = hidden_channels
        self.height = height
        self.width = width

        # ConvLSTM layers
        self.layers = nn.ModuleList([
            ConvLSTMCell(
                input_channels=input_channels if i == 0 else hidden_channels[i - 1],
                hidden_channels=hidden_channels[i],
                kernel_size=kernel_size,
                bias=bias
            ) for i in range(num_layers)
        ])

        # Fully connected layer for classification
        self.fc = nn.Linear(hidden_channels[-1] * height * width, num_classes)

    def forward(self, x):
        """
        x: [batch_size, sequence_length, channels, height, width]
        Returns logits for classification.
        """
        batch_size, seq_len, _, height, width = x.size()
        assert height == self.height and width == self.width, "Input size mismatch with initialized height and width."

        # Initialize hidden states for all layers
        hidden_states = [layer.init_hidden(batch_size, height, width) for layer in self.layers]

        # Process sequence through ConvLSTM layers
        for t in range(seq_len):
            current_input = x[:, t]
            for l, layer in enumerate(self.layers):
                hidden_states[l] = layer(current_input, hidden_states[l])
                current_input = hidden_states[l][0]  # Use the hidden state (h_t)

        # Last layer's hidden state at the last time step
        last_hidden_state = hidden_states[-1][0]  # Shape: [batch_size, hidden_channels[-1], height, width]
        return last_hidden_state  # Return the last hidden state

In [12]:
class ComplexActionRecognitionModelConvLSTM(nn.Module):
    def __init__(self, num_classes):
        super(ComplexActionRecognitionModelConvLSTM, self).__init__()
        self.num_classes = num_classes

        # Convolutional Encoder
        self.encoder = nn.Sequential(
            # Block 1
            nn.Conv2d(1, 64, kernel_size=3, stride=1, padding=1),  # Output: (64, 64, 64)
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),  # Output: (64, 64, 64)
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),  # Output: (64, 32, 32)
            nn.Dropout(0.2),

            # Block 2
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),  # Output: (128, 32, 32)
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1),  # Output: (128, 32, 32)
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),  # Output: (128, 16, 16)
            nn.Dropout(0.3),

            # Block 3
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),  # Output: (256, 16, 16)
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),  # Output: (256, 16, 16)
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),  # Output: (256, 8, 8)
            nn.Dropout(0.4),
        )

        # ConvLSTM
        self.convlstm = ConvLSTM(
            input_channels=256,
            hidden_channels=[64, 64],  # Two layers with 64 channels each
            kernel_size=3,
            num_layers=2,
            num_classes=num_classes,
            height=8,  # Adjusted based on encoder output size
            width=8
        )

        # Final classifier (after ConvLSTM)
        self.classifier = nn.Linear(64 * 8 * 8, num_classes)

    def forward(self, x):
        batch_size, seq_len, _, height, width = x.size()

        # Process each frame through the encoder
        x = x.view(batch_size * seq_len, 1, height, width)  # Combine batch and sequence dimensions
        x = self.encoder(x)  # Shape: (batch_size * seq_len, 256, 8, 8)
        _, channels, height, width = x.size()
        x = x.view(batch_size, seq_len, channels, height, width)  # Reshape for ConvLSTM

        # Pass through ConvLSTM
        x = self.convlstm(x)  # Last hidden state shape: (batch_size, hidden_channels[-1], height, width)

        # Flatten the last hidden state for classification
        x = x.view(batch_size, -1)  # Shape: (batch_size, hidden_channels[-1] * height * width)
        x = self.classifier(x)  # Shape: (batch_size, num_classes)

        return x

In [31]:
# Initialize the model
model = ComplexActionRecognitionModelConvLSTM(num_classes=10)

# Move to device
model = model.to(device)

In [32]:
model

ComplexActionRecognitionModelConvLSTM(
  (encoder): Sequential(
    (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU()
    (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (7): Dropout(p=0.2, inplace=False)
    (8): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): ReLU()
    (11): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (12): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (13): ReLU()
    (14): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)


In [33]:
# Run training
model, train_losses, train_accuracies, eval_losses, eval_accuracies, learning_rates = run_training(
    model, train_loader, val_loader, criterion, device,
    num_epochs=50, learning_rate=learning_rate, step_size=step_size, gamma=gamma, name='own_convlstm2', project='assignment_4'
)

  2%|█                                                       | 1/50 [00:14<11:58, 14.66s/it]

Epoch 1/50
Train Loss: 1.4242, Train Accuracy: 0.3800
Eval Loss: 2.0564, Eval Accuracy: 0.2667
Learning Rate: 0.001000


  4%|██▏                                                     | 2/50 [00:29<11:38, 14.56s/it]

Epoch 2/50
Train Loss: 0.9252, Train Accuracy: 0.5866
Eval Loss: 0.8451, Eval Accuracy: 0.6083
Learning Rate: 0.001000


  6%|███▎                                                    | 3/50 [00:43<11:25, 14.58s/it]

Epoch 3/50
Train Loss: 0.7879, Train Accuracy: 0.6347
Eval Loss: 0.8245, Eval Accuracy: 0.6833
Learning Rate: 0.001000


  8%|████▍                                                   | 4/50 [00:58<11:11, 14.59s/it]

Epoch 4/50
Train Loss: 0.7394, Train Accuracy: 0.6806
Eval Loss: 0.6501, Eval Accuracy: 0.7333
Learning Rate: 0.001000


 10%|█████▌                                                  | 5/50 [01:13<10:58, 14.63s/it]

Epoch 5/50
Train Loss: 0.5893, Train Accuracy: 0.7474
Eval Loss: 0.7111, Eval Accuracy: 0.6500
Learning Rate: 0.001000


 12%|██████▋                                                 | 6/50 [01:27<10:42, 14.60s/it]

Epoch 6/50
Train Loss: 0.5226, Train Accuracy: 0.7745
Eval Loss: 0.6526, Eval Accuracy: 0.7500
Learning Rate: 0.000500


 14%|███████▊                                                | 7/50 [01:42<10:27, 14.59s/it]

Epoch 7/50
Train Loss: 0.4175, Train Accuracy: 0.8205
Eval Loss: 0.6238, Eval Accuracy: 0.7583
Learning Rate: 0.000500


 16%|████████▉                                               | 8/50 [01:56<10:13, 14.60s/it]

Epoch 8/50
Train Loss: 0.3705, Train Accuracy: 0.8434
Eval Loss: 0.6709, Eval Accuracy: 0.7833
Learning Rate: 0.000500


 18%|██████████                                              | 9/50 [02:11<09:59, 14.62s/it]

Epoch 9/50
Train Loss: 0.3033, Train Accuracy: 0.8747
Eval Loss: 0.4952, Eval Accuracy: 0.8000
Learning Rate: 0.000500


 20%|███████████                                            | 10/50 [02:26<09:45, 14.64s/it]

Epoch 10/50
Train Loss: 0.2668, Train Accuracy: 0.9040
Eval Loss: 0.5566, Eval Accuracy: 0.8417
Learning Rate: 0.000500


 22%|████████████                                           | 11/50 [02:40<09:30, 14.62s/it]

Epoch 11/50
Train Loss: 0.2057, Train Accuracy: 0.9353
Eval Loss: 0.5320, Eval Accuracy: 0.8417
Learning Rate: 0.000250


 24%|█████████████▏                                         | 12/50 [02:55<09:16, 14.64s/it]

Epoch 12/50
Train Loss: 0.1573, Train Accuracy: 0.9478
Eval Loss: 0.4889, Eval Accuracy: 0.8583
Learning Rate: 0.000250


 26%|██████████████▎                                        | 13/50 [03:10<09:01, 14.63s/it]

Epoch 13/50
Train Loss: 0.1094, Train Accuracy: 0.9749
Eval Loss: 0.5505, Eval Accuracy: 0.8333
Learning Rate: 0.000250


 28%|███████████████▍                                       | 14/50 [03:24<08:47, 14.65s/it]

Epoch 14/50
Train Loss: 0.0896, Train Accuracy: 0.9749
Eval Loss: 0.4895, Eval Accuracy: 0.8500
Learning Rate: 0.000250


 30%|████████████████▌                                      | 15/50 [03:39<08:32, 14.65s/it]

Epoch 15/50
Train Loss: 0.0767, Train Accuracy: 0.9791
Eval Loss: 0.4983, Eval Accuracy: 0.8333
Learning Rate: 0.000250


 32%|█████████████████▌                                     | 16/50 [03:53<08:17, 14.64s/it]

Epoch 16/50
Train Loss: 0.0541, Train Accuracy: 0.9916
Eval Loss: 0.4656, Eval Accuracy: 0.8500
Learning Rate: 0.000125


 34%|██████████████████▋                                    | 17/50 [04:08<08:03, 14.64s/it]

Epoch 17/50
Train Loss: 0.0432, Train Accuracy: 0.9937
Eval Loss: 0.4701, Eval Accuracy: 0.8333
Learning Rate: 0.000125


 36%|███████████████████▊                                   | 18/50 [04:23<07:48, 14.64s/it]

Epoch 18/50
Train Loss: 0.0363, Train Accuracy: 0.9979
Eval Loss: 0.4569, Eval Accuracy: 0.8417
Learning Rate: 0.000125


 38%|████████████████████▉                                  | 19/50 [04:38<07:36, 14.71s/it]

Epoch 19/50
Train Loss: 0.0321, Train Accuracy: 0.9979
Eval Loss: 0.4829, Eval Accuracy: 0.8250
Learning Rate: 0.000125


 40%|██████████████████████                                 | 20/50 [04:52<07:22, 14.74s/it]

Epoch 20/50
Train Loss: 0.0268, Train Accuracy: 1.0000
Eval Loss: 0.5532, Eval Accuracy: 0.8333
Learning Rate: 0.000125


 42%|███████████████████████                                | 21/50 [05:07<07:07, 14.74s/it]

Epoch 21/50
Train Loss: 0.0244, Train Accuracy: 1.0000
Eval Loss: 0.5288, Eval Accuracy: 0.8333
Learning Rate: 0.000063


 44%|████████████████████████▏                              | 22/50 [05:22<06:51, 14.71s/it]

Epoch 22/50
Train Loss: 0.0229, Train Accuracy: 1.0000
Eval Loss: 0.4834, Eval Accuracy: 0.8667
Learning Rate: 0.000063


 46%|█████████████████████████▎                             | 23/50 [05:36<06:36, 14.68s/it]

Epoch 23/50
Train Loss: 0.0208, Train Accuracy: 1.0000
Eval Loss: 0.5322, Eval Accuracy: 0.8333
Learning Rate: 0.000063


 48%|██████████████████████████▍                            | 24/50 [05:51<06:21, 14.68s/it]

Epoch 24/50
Train Loss: 0.0201, Train Accuracy: 1.0000
Eval Loss: 0.4878, Eval Accuracy: 0.8417
Learning Rate: 0.000063


 50%|███████████████████████████▌                           | 25/50 [06:06<06:07, 14.69s/it]

Epoch 25/50
Train Loss: 0.0187, Train Accuracy: 1.0000
Eval Loss: 0.5003, Eval Accuracy: 0.8417
Learning Rate: 0.000063


 52%|████████████████████████████▌                          | 26/50 [06:21<05:52, 14.70s/it]

Epoch 26/50
Train Loss: 0.0166, Train Accuracy: 1.0000
Eval Loss: 0.5016, Eval Accuracy: 0.8167
Learning Rate: 0.000031


 54%|█████████████████████████████▋                         | 27/50 [06:35<05:37, 14.69s/it]

Epoch 27/50
Train Loss: 0.0167, Train Accuracy: 1.0000
Eval Loss: 0.5312, Eval Accuracy: 0.8333
Learning Rate: 0.000031


 56%|██████████████████████████████▊                        | 28/50 [06:50<05:24, 14.73s/it]

Epoch 28/50
Train Loss: 0.0166, Train Accuracy: 1.0000
Eval Loss: 0.5142, Eval Accuracy: 0.8417
Learning Rate: 0.000031


 58%|███████████████████████████████▉                       | 29/50 [07:05<05:09, 14.73s/it]

Epoch 29/50
Train Loss: 0.0149, Train Accuracy: 1.0000
Eval Loss: 0.5159, Eval Accuracy: 0.8333
Learning Rate: 0.000031


 60%|█████████████████████████████████                      | 30/50 [07:19<04:54, 14.71s/it]

Epoch 30/50
Train Loss: 0.0159, Train Accuracy: 1.0000
Eval Loss: 0.4912, Eval Accuracy: 0.8333
Learning Rate: 0.000031


 62%|██████████████████████████████████                     | 31/50 [07:34<04:39, 14.69s/it]

Epoch 31/50
Train Loss: 0.0152, Train Accuracy: 1.0000
Eval Loss: 0.4872, Eval Accuracy: 0.8417
Learning Rate: 0.000016


 64%|███████████████████████████████████▏                   | 32/50 [07:49<04:24, 14.70s/it]

Epoch 32/50
Train Loss: 0.0145, Train Accuracy: 1.0000
Eval Loss: 0.4811, Eval Accuracy: 0.8333
Learning Rate: 0.000016


 66%|████████████████████████████████████▎                  | 33/50 [08:04<04:09, 14.70s/it]

Epoch 33/50
Train Loss: 0.0142, Train Accuracy: 1.0000
Eval Loss: 0.4924, Eval Accuracy: 0.8417
Learning Rate: 0.000016


 68%|█████████████████████████████████████▍                 | 34/50 [08:18<03:55, 14.72s/it]

Epoch 34/50
Train Loss: 0.0147, Train Accuracy: 1.0000
Eval Loss: 0.5122, Eval Accuracy: 0.8333
Learning Rate: 0.000016


 70%|██████████████████████████████████████▌                | 35/50 [08:33<03:40, 14.69s/it]

Epoch 35/50
Train Loss: 0.0152, Train Accuracy: 1.0000
Eval Loss: 0.5238, Eval Accuracy: 0.8417
Learning Rate: 0.000016


 72%|███████████████████████████████████████▌               | 36/50 [08:48<03:25, 14.70s/it]

Epoch 36/50
Train Loss: 0.0129, Train Accuracy: 1.0000
Eval Loss: 0.5322, Eval Accuracy: 0.8417
Learning Rate: 0.000008


 74%|████████████████████████████████████████▋              | 37/50 [09:02<03:11, 14.71s/it]

Epoch 37/50
Train Loss: 0.0148, Train Accuracy: 1.0000
Eval Loss: 0.5163, Eval Accuracy: 0.8500
Learning Rate: 0.000008


 76%|█████████████████████████████████████████▊             | 38/50 [09:17<02:56, 14.70s/it]

Epoch 38/50
Train Loss: 0.0131, Train Accuracy: 1.0000
Eval Loss: 0.5000, Eval Accuracy: 0.8333
Learning Rate: 0.000008


 78%|██████████████████████████████████████████▉            | 39/50 [09:32<02:41, 14.71s/it]

Epoch 39/50
Train Loss: 0.0135, Train Accuracy: 1.0000
Eval Loss: 0.5107, Eval Accuracy: 0.8500
Learning Rate: 0.000008


 80%|████████████████████████████████████████████           | 40/50 [09:47<02:27, 14.73s/it]

Epoch 40/50
Train Loss: 0.0123, Train Accuracy: 1.0000
Eval Loss: 0.5541, Eval Accuracy: 0.8250
Learning Rate: 0.000008


 82%|█████████████████████████████████████████████          | 41/50 [10:01<02:12, 14.74s/it]

Epoch 41/50
Train Loss: 0.0134, Train Accuracy: 1.0000
Eval Loss: 0.5320, Eval Accuracy: 0.8333
Learning Rate: 0.000004


 84%|██████████████████████████████████████████████▏        | 42/50 [10:16<01:57, 14.73s/it]

Epoch 42/50
Train Loss: 0.0136, Train Accuracy: 1.0000
Eval Loss: 0.5276, Eval Accuracy: 0.8333
Learning Rate: 0.000004


 86%|███████████████████████████████████████████████▎       | 43/50 [10:31<01:43, 14.72s/it]

Epoch 43/50
Train Loss: 0.0125, Train Accuracy: 1.0000
Eval Loss: 0.5009, Eval Accuracy: 0.8250
Learning Rate: 0.000004


 88%|████████████████████████████████████████████████▍      | 44/50 [10:45<01:28, 14.71s/it]

Epoch 44/50
Train Loss: 0.0122, Train Accuracy: 1.0000
Eval Loss: 0.5310, Eval Accuracy: 0.8417
Learning Rate: 0.000004


 90%|█████████████████████████████████████████████████▌     | 45/50 [11:00<01:13, 14.70s/it]

Epoch 45/50
Train Loss: 0.0118, Train Accuracy: 1.0000
Eval Loss: 0.5149, Eval Accuracy: 0.8500
Learning Rate: 0.000004


 92%|██████████████████████████████████████████████████▌    | 46/50 [11:15<00:58, 14.71s/it]

Epoch 46/50
Train Loss: 0.0123, Train Accuracy: 1.0000
Eval Loss: 0.5234, Eval Accuracy: 0.8667
Learning Rate: 0.000002


 94%|███████████████████████████████████████████████████▋   | 47/50 [11:30<00:44, 14.73s/it]

Epoch 47/50
Train Loss: 0.0125, Train Accuracy: 1.0000
Eval Loss: 0.5138, Eval Accuracy: 0.8333
Learning Rate: 0.000002


 96%|████████████████████████████████████████████████████▊  | 48/50 [11:44<00:29, 14.69s/it]

Epoch 48/50
Train Loss: 0.0126, Train Accuracy: 1.0000
Eval Loss: 0.5360, Eval Accuracy: 0.8333
Learning Rate: 0.000002


 98%|█████████████████████████████████████████████████████▉ | 49/50 [11:59<00:14, 14.73s/it]

Epoch 49/50
Train Loss: 0.0125, Train Accuracy: 1.0000
Eval Loss: 0.5110, Eval Accuracy: 0.8250
Learning Rate: 0.000002


100%|███████████████████████████████████████████████████████| 50/50 [12:14<00:00, 14.68s/it]

Epoch 50/50
Train Loss: 0.0116, Train Accuracy: 1.0000
Eval Loss: 0.4885, Eval Accuracy: 0.8417
Learning Rate: 0.000002





0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇██
eval_accuracy,▁▅▆▆▅▇▇▇████████████▇███████████████████
eval_loss,█▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
learning_rate,█████▄▄▄▃▃▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_accuracy,▁▃▄▄▅▆▆▇▇▇██████████████████████████████
train_loss,█▆▅▅▄▃▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,50.0
eval_accuracy,0.84167
eval_loss,0.48846
learning_rate,0.0
train_accuracy,1.0
train_loss,0.01159


In [15]:
height=64
width=64

In [21]:
import math

https://github.com/kenshohara/video-classification-3d-cnn-pytorch/blob/master/models/pre_act_resnet.py#L10

In [26]:
class ResNet34VideoClassifier(nn.Module):
    def __init__(self, num_classes=6, sample_size=64, sample_duration=16):
        super(ResNet34VideoClassifier, self).__init__()

        self.inplanes = 64  # Initialize inplanes

        # Adjust the first convolution layer to handle 45 channels (for video frames)
        self.conv1 = nn.Conv3d(45, 64, kernel_size=7, stride=(1, 2, 2),
                               padding=(3, 3, 3), bias=False)
        self.bn1 = nn.BatchNorm3d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1)

        # Residual layers
        self.layer1 = self._make_layer(64, 3)
        self.layer2 = self._make_layer(128, 4, stride=2)
        self.layer3 = self._make_layer(256, 6, stride=2)
        self.layer4 = self._make_layer(512, 3, stride=2)

        # Final adaptive average pooling
        last_duration = math.ceil(sample_duration / 16)
        last_size = math.ceil(sample_size / 32)
        self.avgpool = nn.AvgPool3d((last_duration, last_size, last_size), stride=1)
        
        # Fully connected layer for classification
        self.fc = nn.Linear(512, num_classes)

        self._initialize_weights()

    def _make_layer(self, planes, blocks, stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes:
            downsample = nn.Sequential(
                nn.Conv3d(self.inplanes, planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm3d(planes)
            )

        layers = []
        layers.append(self._basic_block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes
        for _ in range(1, blocks):
            layers.append(self._basic_block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def _basic_block(self, inplanes, planes, stride=1, downsample=None):
        block = nn.Sequential(
            nn.BatchNorm3d(inplanes),
            nn.ReLU(inplace=True),
            nn.Conv3d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False),
            nn.BatchNorm3d(planes),
            nn.ReLU(inplace=True),
            nn.Conv3d(planes, planes, kernel_size=3, padding=1, bias=False)
        )
        if downsample is not None:
            block.add_module("downsample", downsample)
        
        return block

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv3d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm3d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def forward(self, x):
        x = self.conv1(x)  # Input shape should be [batch_size, 45, 64, 64, 64]
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)

        return x

In [27]:
# Initialize the ResNet34 model
model = ResNet34VideoClassifier(num_classes=num_classes, sample_size=64, sample_duration=45).to(device)

In [28]:
model

ResNet34VideoClassifier(
  (conv1): Conv3d(45, 64, kernel_size=(7, 7, 7), stride=(1, 2, 2), padding=(3, 3, 3), bias=False)
  (bn1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Sequential(
      (0): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (1): ReLU(inplace=True)
      (2): Conv3d(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
      (3): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (4): ReLU(inplace=True)
      (5): Conv3d(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
    )
    (1): Sequential(
      (0): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (1): ReLU(inplace=True)
      (2): Conv3d(64, 64, kernel

In [29]:
# Run training
model, train_losses, train_accuracies, eval_losses, eval_accuracies, learning_rates = run_training(
    model, train_loader, val_loader, criterion, device,
    num_epochs=num_epochs, learning_rate=learning_rate, step_size=step_size, gamma=gamma, name='resent3d', project='assignment_4'
)

  0%|                                                                | 0/10 [00:02<?, ?it/s]


RuntimeError: Given groups=1, weight of size [128, 64, 1, 1, 1], expected input[16, 128, 1, 8, 8] to have 64 channels, but got 128 channels instead