In [15]:
import os
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

class FrameDataset(Dataset):
    def __init__(self, root_dir, transform=None):
        """
        Args:
            root_dir (str): Path to the dataset root (e.g., 'Train/').
            transform (callable, optional): Optional transforms for frames.
        """
        self.root_dir = root_dir
        self.transform = transform
        self.videos = []  # List of videos with their frame paths and labels

        # Traverse the root directory and organize frames by video
        print("Initializing dataset...")
        for label in os.listdir(root_dir):  # Class labels (e.g., Fighting, Shoplifting)
            class_dir = os.path.join(root_dir, label)
            if os.path.isdir(class_dir):
                # Group frames by video prefix (e.g., Fighting001, Fighting002)
                video_frames = {}
                for frame_name in os.listdir(class_dir):  # List all frames in class folder
                    prefix = "_".join(frame_name.split("_")[:-1])  # Extract video prefix
                    if prefix not in video_frames:
                        video_frames[prefix] = []
                    video_frames[prefix].append(os.path.join(class_dir, frame_name))

                # Sort frames for each video by frame index
                for prefix, frames in video_frames.items():
                    frames = sorted(
                        frames, key=lambda x: int(x.split("_")[-1].split(".")[0])  # Sort by frame index
                    )
                    self.videos.append((frames, label))  # (list of frame paths, label)

        print(f"Dataset initialized. Found {len(self.videos)} videos.")

    def __len__(self):
        return len(self.videos)

    def __getitem__(self, idx):
        frames, label = self.videos[idx]

        # Load frames as PIL images
        images = [Image.open(frame).convert("RGB") for frame in frames]

        if self.transform:
            images = [self.transform(img) for img in images]

        # Map labels to integers
        label_map = {"Fighting": 0, "Shoplifting": 1, "RoadAccidents": 2}
        label_tensor = torch.tensor(label_map[label])

        return torch.stack(images), label_tensor  # Return sequence of frames, label


In [16]:
transform = transforms.Compose([
    transforms.ToTensor(),          # Convert to Tensor
    transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])  # Normalize
])


In [17]:
dataset = FrameDataset(root_dir="Train", transform=transform)
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

print("\nInspecting first few videos...")
for i, (video_frames, label) in enumerate(dataloader):
    print(f"\nBatch {i + 1}:")
    print(f"Video Frames Shape: {video_frames.shape} (Batch Size, Frames, Channels, Height, Width)")
    print(f"Label: {label}")
    if i == 2:  # Inspect only the first 3 videos
        break


Initializing dataset...
Dataset initialized. Found 201 videos.

Inspecting first few videos...

Batch 1:
Video Frames Shape: torch.Size([1, 117, 3, 64, 64]) (Batch Size, Frames, Channels, Height, Width)
Label: tensor([2])

Batch 2:
Video Frames Shape: torch.Size([1, 143, 3, 64, 64]) (Batch Size, Frames, Channels, Height, Width)
Label: tensor([1])

Batch 3:
Video Frames Shape: torch.Size([1, 460, 3, 64, 64]) (Batch Size, Frames, Channels, Height, Width)
Label: tensor([2])


In [18]:
import torchvision.models as models
from torch import nn

# Load a pre-trained ResNet18
cnn_model = models.resnet18(pretrained=True)
cnn_model.fc = nn.Identity()  # Remove the final classification layer
cnn_model = cnn_model.cuda()  # Move the model to GPU

print("\nLoaded CNN backbone (ResNet18). Output feature size:", 512)

def extract_spatial_features(video_frames, cnn_model):
    
    batch_size, num_frames, c, h, w = video_frames.size()

    # Reshape to process each frame independently
    video_frames = video_frames.view(batch_size * num_frames, c, h, w)
    features = cnn_model(video_frames)  # Extract features for all frames

    # Reshape back to (batch_size, num_frames, feature_dim)
    features = features.view(batch_size, num_frames, -1)

    return features



Loaded CNN backbone (ResNet18). Output feature size: 512


In [19]:
from torch.nn.utils.rnn import pad_sequence

def create_mask(sequences):
  
    # Pad the sequences to the length of the longest sequence
    padded_batch = pad_sequence(sequences, batch_first=True)  # Shape: (batch_size, max_seq_len, feature_dim)

    # Create a mask: True for padding, False for valid tokens
    batch_size = len(sequences)
    max_seq_len = padded_batch.size(1)
    mask = torch.ones((batch_size, max_seq_len), dtype=torch.bool)  # Shape: (batch_size, max_seq_len)

    for i, seq in enumerate(sequences):
        mask[i, :seq.size(0)] = False  # Valid tokens are False, padding is True

    # Debugging mask shape

    return padded_batch, mask


In [20]:
class PositionalEncoding(nn.Module):
    def __init__(self, feature_dim, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.encoding = nn.Parameter(torch.zeros(1, max_len, feature_dim))

    def forward(self, x):
        seq_len = x.size(1)
        return x + self.encoding[:, :seq_len, :]

from torch.nn import TransformerEncoder, TransformerEncoderLayer

import torch.nn as nn

class TemporalLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, num_classes):
      
        super(TemporalLSTM, self).__init__()
        self.lstm = nn.LSTM(
            input_size=input_dim, 
            hidden_size=hidden_dim, 
            num_layers=num_layers,
            batch_first=True,  # Input shape: (batch_size, seq_len, input_dim)
            dropout=0.1,       # Dropout between LSTM layers
        )
        self.fc = nn.Linear(hidden_dim, num_classes)  # Fully connected layer for classification

    def forward(self, features):
    


        # LSTM forward pass
        lstm_out, _ = self.lstm(features)  # lstm_out shape: (batch_size, seq_len, hidden_dim)

        # Take the output from the last time step
        last_hidden_state = lstm_out[:, -1, :]  # (batch_size, hidden_dim)

        # Fully connected layer
        logits = self.fc(last_hidden_state)  # (batch_size, num_classes)

        return logits





In [7]:
import torch.optim as optim

# Initialize model
feature_dim = 512
num_heads = 8
num_layers = 4
num_classes = 3
model = TemporalLSTM(feature_dim, num_heads, num_layers, num_classes).cuda()

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

scaler = torch.cuda.amp.GradScaler()

for epoch in range(10):
    model.train()
    total_loss = 0

    for video_frames, labels in dataloader:
        video_frames, labels = video_frames.cuda(), labels.cuda()

        # Extract spatial features using the CNN
        features = extract_spatial_features(video_frames, cnn_model)

        # Pad sequences
        padded_features, _ = create_mask([f.squeeze(0) for f in features])
        padded_features = padded_features.cuda()

        with torch.cuda.amp.autocast():  # Enable mixed precision
            outputs = model(padded_features)
            loss = criterion(outputs, labels)

        # Backward pass with scaling
        optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()

    print(f"Epoch [{epoch + 1}], Loss: {total_loss:.4f}")



  scaler = torch.cuda.amp.GradScaler()


Label for video 51: 2


  with torch.cuda.amp.autocast():  # Enable mixed precision


Label for video 74: 2
Label for video 154: 2
Label for video 178: 1


KeyboardInterrupt: 

In [27]:
import torch

def save_checkpoint(model, optimizer, epoch, loss, file_path):
    """
    Save a checkpoint of the model for continued training.
    
    Args:
        model (nn.Module): The model to save.
        optimizer (torch.optim.Optimizer): The optimizer used during training.
        epoch (int): The current epoch.
        loss (float): The training loss at the time of saving.
        file_path (str): The path to save the checkpoint file.
    """
    checkpoint = {
        "epoch": epoch,
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        "loss": loss,
    }
    torch.save(checkpoint, file_path)
    print(f"Checkpoint saved to {file_path}")


In [None]:
# Assume `model`, `optimizer`, `epoch`, and `loss` are defined
save_checkpoint(model, optimizer, epoch, loss, file_path="lstm/checkpoint_epoch.pth")


Checkpoint saved to lstm/checkpoint_epoch30.pth


In [24]:
def load_checkpoint(model, optimizer, file_path="lstm/model_checkpoint2.pth"):
    """
    Load a saved model checkpoint for continued training.
    
    Args:
        model (nn.Module): The model to load the checkpoint into.
        optimizer (torch.optim.Optimizer): The optimizer to load the checkpoint into.
        file_path (str): The path to the checkpoint file.
    
    Returns:
        tuple: epoch (int), loss (float)
    """
    checkpoint = torch.load(file_path)
    model.load_state_dict(checkpoint["model_state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
    epoch = checkpoint["epoch"]
    loss = checkpoint["loss"]
    print(f"Checkpoint loaded from {file_path}, starting from epoch {epoch + 1}")
    return epoch, loss


In [25]:
# Assume `model` and `optimizer` are already defined
start_epoch, prev_loss = load_checkpoint(model, optimizer, file_path="lstm/checkpoint_epoch2.pth")

# Resume training
for epoch in range(30):
    model.train()
    total_loss = 0

    for video_frames, labels in dataloader:
        video_frames, labels = video_frames.cuda(), labels.cuda()

        # Extract spatial features using the CNN
        features = extract_spatial_features(video_frames, cnn_model)

        # Pad sequences
        padded_features, _ = create_mask([f.squeeze(0) for f in features])
        padded_features = padded_features.cuda()

        with torch.cuda.amp.autocast():  # Enable mixed precision
            outputs = model(padded_features)
            loss = criterion(outputs, labels)

        # Backward pass with scaling
        optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()

    print(f"Epoch [{epoch + 1}], Loss: {total_loss:.4f}")

    # Training loop...
    


  checkpoint = torch.load(file_path)


Checkpoint loaded from lstm/checkpoint_epoch2.pth, starting from epoch 10


  with torch.cuda.amp.autocast():  # Enable mixed precision


Epoch [1], Loss: 91.5762
Epoch [2], Loss: 90.1781
Epoch [3], Loss: 88.3611
Epoch [4], Loss: 85.4649
Epoch [5], Loss: 85.0240
Epoch [6], Loss: 82.4540
Epoch [7], Loss: 81.9375
Epoch [8], Loss: 80.9263
Epoch [9], Loss: 79.2760
Epoch [10], Loss: 79.9016
Epoch [11], Loss: 78.3686
Epoch [12], Loss: 77.8196
Epoch [13], Loss: 76.9582
Epoch [14], Loss: 76.4198
Epoch [15], Loss: 76.3869
Epoch [16], Loss: 75.3072
Epoch [17], Loss: 75.2737
Epoch [18], Loss: 74.5243
Epoch [19], Loss: 71.4722
Epoch [20], Loss: 73.8407
Epoch [21], Loss: 71.3456
Epoch [22], Loss: 73.0596
Epoch [23], Loss: 70.6635
Epoch [24], Loss: 70.3186
Epoch [25], Loss: 69.8441
Epoch [26], Loss: 69.3912
Epoch [27], Loss: 71.1430
Epoch [28], Loss: 69.9398
Epoch [29], Loss: 74.6160
Epoch [30], Loss: 71.5581


In [28]:
from sklearn.metrics import classification_report, accuracy_score
import torch

def evaluate_model(model, test_dataloader, cnn_model, device):
    """
    Evaluate the trained model on the test dataset.
    
    Args:
        model: Trained LSTM model.
        test_dataloader: DataLoader for the test dataset.
        cnn_model: Pre-trained CNN used for feature extraction.
        device: 'cuda' or 'cpu' for computation.
    """
    model.eval()  # Set the model to evaluation mode
    cnn_model.eval()  # Set the CNN to evaluation mode
    
    all_labels = []
    all_preds = []
    
    with torch.no_grad():  # Disable gradient calculation
        for video_frames, labels in test_dataloader:
            video_frames, labels = video_frames.to(device), labels.to(device)
            
            # Extract spatial features using the CNN
            features = extract_spatial_features(video_frames, cnn_model)
            
            # Pad sequences and create mask
            padded_features, mask = create_mask([f.squeeze(0) for f in features])
            padded_features = padded_features.to(device)
            
            # Forward pass
            outputs = model(padded_features)
            _, preds = torch.max(outputs, dim=1)  # Get class predictions
            
            # Collect labels and predictions
            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())
    
    # Calculate metrics
    accuracy = accuracy_score(all_labels, all_preds)
    report = classification_report(all_labels, all_preds, target_names=["Fighting", "Shoplifting", "Other"])
    
    print(f"\nTest Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(report)
    
    return accuracy, report

# Example: Evaluate the model
test_dataset = FrameDataset(root_dir="Test", transform=transform)

test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)  # Create test DataLoader
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
evaluate_model(model, test_dataloader, cnn_model, device)


Initializing dataset...
Dataset initialized. Found 49 videos.

Test Accuracy: 0.1837

Classification Report:
              precision    recall  f1-score   support

    Fighting       0.08      0.60      0.14         5
 Shoplifting       0.00      0.00      0.00        21
       Other       0.55      0.26      0.35        23

    accuracy                           0.18        49
   macro avg       0.21      0.29      0.16        49
weighted avg       0.26      0.18      0.18        49



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


(0.1836734693877551,
 '              precision    recall  f1-score   support\n\n    Fighting       0.08      0.60      0.14         5\n Shoplifting       0.00      0.00      0.00        21\n       Other       0.55      0.26      0.35        23\n\n    accuracy                           0.18        49\n   macro avg       0.21      0.29      0.16        49\nweighted avg       0.26      0.18      0.18        49\n')