# Implementation

In [5]:
# Quietly (-q) download the dataset archive and save it as ucf101_top5.tar.gz
# (presumably a five-class subset of UCF101, judging by the file name).
!wget -q https://git.io/JGc31 -O ucf101_top5.tar.gz
# Unpack the archive into the current working directory.
!tar xf ucf101_top5.tar.gz

In [25]:
import torch
from torch.utils.data import Dataset
import cv2
import numpy as np
import os

In [26]:
import torch
from torch.utils.data import Dataset
import cv2
import numpy as np
import os

class VideoDataset(Dataset):
    """Map-style dataset that turns video files into fixed-length frame clips.

    Each item is a ``(frames, label)`` pair. ``frames`` is a float32 tensor of
    shape ``(sequence_length, 3, img_size, img_size)`` in RGB order, scaled to
    ``[0, 1]`` (before any ``transform``). ``label`` is a ``torch.long`` tensor.
    """

    def __init__(self, video_paths, labels, sequence_length=20, img_size=224, transform=None):
        """
        Args:
            video_paths (list): List of file paths to video files.
            labels (list): List of integer labels corresponding to the videos.
            sequence_length (int): The number of frames to extract per video.
            img_size (int): The height and width to resize frames to.
            transform (callable, optional): Applied once to the whole clip, a
                float tensor of shape (Seq_Len, 3, H, W) with values in [0, 1];
                its return value becomes the ``frames`` output. Passing the
                whole clip (instead of frame by frame) keeps any random
                augmentation consistent across the frames of one video.
        """
        self.video_paths = video_paths
        self.labels = labels
        self.sequence_length = sequence_length
        self.img_size = img_size
        self.transform = transform

    def __len__(self):
        """Return the number of videos in the dataset."""
        return len(self.video_paths)

    def __getitem__(self, idx):
        """Load, sample and normalise the clip at ``idx``.

        Returns:
            tuple: ``(frames, label)`` as described in the class docstring.
        """
        video_path = self.video_paths[idx]
        label = self.labels[idx]

        # 1. Load the video as (Seq_Len, Height, Width, Channels).
        clip = self._load_video(video_path)

        # 2. Convert to a float tensor and move channels in front of the
        #    spatial dimensions: (Seq_Len, Channels, Height, Width).
        frames = torch.as_tensor(np.ascontiguousarray(clip), dtype=torch.float32)
        frames = frames.permute(0, 3, 1, 2)

        # 3. Scale pixel values to [0, 1]. `.contiguous()` materialises the
        #    permuted layout so downstream `.view(...)` calls are safe.
        frames = (frames / 255.0).contiguous()

        # 4. Apply the user transform (e.g. ImageNet mean/std normalisation).
        #    Previously this argument was accepted but silently ignored.
        if self.transform is not None:
            frames = self.transform(frames)

        return frames, torch.tensor(label, dtype=torch.long)

    def _load_video(self, path):
        """
        Reads a video and extracts a fixed number of frames using uniform sampling.

        Returns:
            np.ndarray: uint8 array of shape
            (sequence_length, img_size, img_size, 3) in RGB order. Frames that
            could not be read (short or unreadable video) are left as zeros.
        """
        cap = cv2.VideoCapture(path)
        frames = []
        try:
            try:
                total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            except Exception:
                # Best effort: an unusable frame count falls back to interval=1
                # (i.e. the first `sequence_length` frames) instead of failing.
                total_frames = 0

            # Interval for uniform sampling,
            # e.g. total=100, seq_len=20 -> interval=5 -> frames 0, 5, 10, ...
            interval = max(1, total_frames // self.sequence_length)

            for i in range(self.sequence_length):
                # Jump to the specific frame position
                cap.set(cv2.CAP_PROP_POS_FRAMES, i * interval)

                ret, frame = cap.read()
                if not ret:
                    break

                frame = cv2.resize(frame, (self.img_size, self.img_size))

                # BGR (OpenCV default) to RGB
                frames.append(frame[:, :, [2, 1, 0]])
        finally:
            # Always free the capture handle, even if resize/read raised.
            cap.release()

        # Start from an all-zero clip and copy in whatever was read; this pads
        # short videos and handles completely broken files in one step, and
        # keeps the array uint8 instead of upcasting to float64.
        clip = np.zeros(
            (self.sequence_length, self.img_size, self.img_size, 3), dtype=np.uint8
        )
        if frames:
            clip[:len(frames)] = np.stack(frames)

        return clip

In [27]:
import torch
import torch.nn as nn
from torchvision import models

class FeatureExtractor(nn.Module):
    """Frozen DenseNet121 backbone mapping video frames to per-frame feature vectors.

    Input:  (Batch_Size, Sequence_Length, 3, H, W)
    Output: (Batch_Size, Sequence_Length, 1024)

    The backbone is kept permanently in eval mode. Setting
    ``requires_grad=False`` alone does not stop BatchNorm layers from updating
    their running statistics in train mode, so ``train()`` is overridden.
    """

    def __init__(self, final_relu=True):
        """
        Args:
            final_relu (bool): Apply ReLU to the backbone output before
                pooling, as torchvision's ``DenseNet.forward`` does
                (``densenet.features`` ends with a BatchNorm layer). Pass
                ``False`` to reproduce features computed without it.
        """
        super().__init__()

        # 1. Load Pre-trained Model (DenseNet121)
        # 'weights="DEFAULT"' loads the best available ImageNet weights
        densenet = models.densenet121(weights='DEFAULT')

        # 2. Extract the "Body"
        # Output shape of this part is (Batch, 1024, 7, 7) for 224x224 input
        self.feature_extractor = densenet.features

        # 3. Freeze Weights: no gradients, and fixed BatchNorm statistics.
        for param in self.feature_extractor.parameters():
            param.requires_grad = False
        self.feature_extractor.eval()

        self.final_relu = final_relu

        # 4. Global Average Pooling
        # Converts each spatial map (7x7) into a single value (1x1)
        self.pooling = nn.AdaptiveAvgPool2d((1, 1))

    def train(self, mode=True):
        """Switch train/eval mode but always keep the frozen backbone in eval."""
        super().train(mode)
        self.feature_extractor.eval()
        return self

    def forward(self, x):
        """Extract one feature vector per frame.

        Args:
            x (Tensor): (Batch_Size, Sequence_Length, C, H, W) clip batch.

        Returns:
            Tensor: (Batch_Size, Sequence_Length, Features).
        """
        batch_size, seq_len, c, h, w = x.size()

        # 5. The "Time Distributed" Trick
        # Merge Batch and Time so every frame is treated as an independent
        # image. `reshape` (unlike `view`) also accepts non-contiguous input,
        # e.g. a clip that was permuted by the dataset.
        x = x.reshape(batch_size * seq_len, c, h, w)

        # 6. Pass through CNN -> (Batch * Seq_Len, 1024, 7, 7)
        out = self.feature_extractor(x)
        if self.final_relu:
            out = torch.relu(out)

        # 7. Pool and Flatten -> (Batch * Seq_Len, 1024)
        out = self.pooling(out).flatten(1)

        # 8. Restore Time Dimension -> (Batch_Size, Seq_Len, 1024)
        return out.reshape(batch_size, seq_len, out.size(-1))

To understand what the extractor function is doing from first principles, we must acknowledge the fundamental problem of computer vision: **The Semantic Gap.**

To a computer, an image of a cricket match is not "sport." It is just a massive grid of 150,528 separate numbers ($224 \times 224$ pixels $\times$ 3 color values each). If you move the camera 1 inch to the left, nearly every one of those 150,528 numbers changes. That is "noise."

The **Feature Extractor** is a machine designed to destroy that noise and preserve only the **meaning**.

Here is the step-by-step breakdown of what happens to your data inside that function.

### 1. The Input: "The Noisy Grid"
* **Data:** One frame of your video.
* **Shape:** $224 \times 224 \times 3$ (Height, Width, Color).
* **First Principle Status:** **High Dimensional, Low Semantic Value.**
    * It has huge detail (you can see every blade of grass).
    * It has zero understanding (it doesn't know "grass" is a thing).

### 2. The Process: "The Hierarchical Filter"
We pass this grid through **DenseNet121**, which is a stack of **Convolutional Filters**. Think of these as a series of sieves or scanners.



* **Layer 1 (The Edge Detectors):**
    * **What it does:** Scans for simple contrast changes. Vertical lines, horizontal lines, color blobs.
    * **Result:** It realizes "there is a vertical edge here (the wicket)" and "there is a horizontal edge here (the bat)."
* **Middle Layers (The Texture Detectors):**
    * **What it does:** Combines edges into shapes.
    * **Result:** It sees "circular patterns" (a head), "woven textures" (clothing), or "green repetitive noise" (grass).
* **Final Layers (The Part Detectors):**
    * **What it does:** Combines shapes into object parts.
    * **Result:** It identifies "a human leg," "a wooden plank," "a sphere."

**Crucial Point:** As the data goes deeper, the *spatial size* shrinks (from $224 \times 224$ to $7 \times 7$), but the *depth of understanding* grows (from 3 color channels to 1024 feature channels).

### 3. The Bottleneck: "Global Average Pooling"
This is the specific line in our code: `nn.AdaptiveAvgPool2d((1, 1))`. This is the most aggressive step in the entire pipeline.

* **The Input:** A $7 \times 7$ grid with 1024 channels. This still has *location* info (e.g., "The bat is in the top-left corner").
* **The Action:** For each of the 1024 channels, we take the **Average** over the entire $7 \times 7$ grid.
* **The Result:** A $1 \times 1$ grid with 1024 channels, i.e. a single vector of length 1024.
* **First Principle Logic:** **Invariance.**
    * If the batsman moves slightly to the left, the $7 \times 7$ grid changes.
    * But the *average* of the grid remains roughly the same.
    * We are trading **"Where is it?"** (Spatial location) for **"Is it there?"** (Semantic presence).

### 4. The Output: "The Semantic Summary"
* **Data:** A single vector of length 1024.
* **First Principle Status:** **Low Dimensional, High Semantic Value.**

Think of this 1024-length vector as a **Checklist of Concepts**.
* Index 0 might measure "Greenness/Grass-like".
* Index 1 might measure "Metallic texture".
* Index 2 might measure "Human face shape".
* Index 1023 might measure "Sky blue".

**Example: A "Cricket Shot" Frame**
* **Raw Pixels:** 150,528 mostly green and white numbers.
* **Extracted Vector:**
    * Index 0 (Grass): **0.9** (High)
    * Index 50 (Wood texture): **0.8** (High)
    * Index 200 (Water/Ocean): **0.01** (Low)
    * Index 300 (Human shape): **0.95** (High)

### Summary of Transformation

| Stage | Data Representation | What the Computer "Sees" |
| :--- | :--- | :--- |
| **Input** | $224 \times 224$ Pixels | "Pixel (10,10) is Green." |
| **Convolution** | $7 \times 7$ Feature Maps (1024 channels) | "There is a bat-like shape in the top left." |
| **Output** | **1024 Vector** | **"This image contains: Grass, Person, Bat."** |

**Why do we need this for the Transformer?**
The Transformer is a "Sequence Learner." It's like a grammar checker. It doesn't want to look at pixels; it wants to look at *events*.
By giving it these vectors, we are essentially feeding it a story:
* **Frame 1:** [Person, Grass, Bat]
* **Frame 2:** [Person, Grass, Bat]
* **Frame 3:** [Person, Grass, Bat, **Motion Blur**]
* **Frame 4:** [Person, Grass, Bat, **Impact**]

The Transformer reads this sequence of concepts and concludes: **"This is a Cricket Shot."**

In [44]:
import torch
import torch.nn as nn
import torch.optim as optim

# --- COMPONENT 1: Positional Embedding ---
class PositionalEmbedding(nn.Module):
    """Adds a learned position vector to every time step of a sequence.

    PyTorch counterpart of a Keras ``layers.Embedding`` positional layer,
    built on ``nn.Embedding``.
    """

    def __init__(self, sequence_length, output_dim):
        """
        Args:
            sequence_length (int): Maximum number of time steps supported.
            output_dim (int): Size of each position vector (must equal the
                feature size of the inputs it is added to).
        """
        super().__init__()
        # One learnable row per time step.
        self.position_embeddings = nn.Embedding(sequence_length, output_dim)
        self.sequence_length = sequence_length

        # Indices 0 .. sequence_length - 1. Registered as a buffer so they are
        # saved with the model and follow it across devices, but never trained.
        self.register_buffer('position_ids', torch.arange(sequence_length))

    def forward(self, inputs):
        """Return ``inputs`` (Batch, Seq_Len, Features) plus position vectors."""
        # Only as many positions as the incoming sequence actually has.
        num_steps = inputs.shape[1]
        step_vectors = self.position_embeddings(self.position_ids[:num_steps])

        # (Batch, Seq, Feat) + (Seq, Feat) broadcasts over the batch dimension.
        return inputs + step_vectors


# --- COMPONENT 2: Transformer Encoder Block ---
class TransformerEncoder(nn.Module):
    """Single post-norm Transformer encoder block.

    Self-attention followed by a two-layer GELU feed-forward network, each
    wrapped in a residual connection and LayerNorm. Input and output shape:
    (Batch, Seq, embed_dim).
    """

    def __init__(self, embed_dim, dense_dim, num_heads, dropout=0.3):
        """
        Args:
            embed_dim (int): Feature size of the inputs and outputs.
            dense_dim (int): Hidden size of the feed-forward network.
            num_heads (int): Number of attention heads.
            dropout (float): Dropout applied to the attention weights.
                Defaults to 0.3, the value that used to be hard-coded.
        """
        super().__init__()
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads

        # 1. Attention
        # batch_first=True makes it accept (Batch, Seq, Feature)
        self.attention = nn.MultiheadAttention(
            embed_dim, num_heads, dropout=dropout, batch_first=True
        )

        # 2. Feed Forward Network (Dense Project)
        # Keras: [Dense(dense_dim, gelu), Dense(embed_dim)]
        self.dense_proj = nn.Sequential(
            nn.Linear(embed_dim, dense_dim),
            nn.GELU(),
            nn.Linear(dense_dim, embed_dim)
        )

        # 3. Layer Norms
        self.layernorm_1 = nn.LayerNorm(embed_dim)
        self.layernorm_2 = nn.LayerNorm(embed_dim)

    def forward(self, inputs):
        """Apply the encoder block to ``inputs`` of shape (Batch, Seq, Features)."""
        # A. Attention Block
        # Self-attention: query, key and value are all `inputs`. The attention
        # weights are never used, so skip computing them (need_weights=False).
        attention_output, _ = self.attention(inputs, inputs, inputs, need_weights=False)

        # Add & Norm
        proj_input = self.layernorm_1(inputs + attention_output)

        # B. Feed Forward Block
        proj_output = self.dense_proj(proj_input)

        # Add & Norm
        return self.layernorm_2(proj_input + proj_output)


# --- COMPONENT 3: The Full Model Assembly ---
class VideoTransformerClassifier(nn.Module):
    """Classifies a sequence of per-frame feature vectors.

    Pipeline: positional embedding -> one Transformer encoder block ->
    max over time -> dropout -> linear head producing raw class logits.
    """

    def __init__(self, sequence_length, embed_dim, dense_dim, num_heads, num_classes):
        """
        Args:
            sequence_length (int): Maximum number of frames per sequence.
            embed_dim (int): Size of each frame feature vector.
            dense_dim (int): Hidden size of the encoder's feed-forward network.
            num_heads (int): Number of attention heads.
            num_classes (int): Number of output classes.
        """
        super().__init__()

        # Sub-modules, in the order they are applied in forward().
        self.pos_embedding = PositionalEmbedding(sequence_length, embed_dim)
        self.transformer_layer = TransformerEncoder(embed_dim, dense_dim, num_heads)
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        """Map ``x`` of shape (Batch, Seq, Embed_Dim) to (Batch, num_classes) logits."""
        encoded = self.transformer_layer(self.pos_embedding(x))

        # Equivalent of Keras GlobalMaxPooling1D: element-wise max across the
        # time dimension. `.max(dim=1)` returns (values, indices); keep values.
        pooled = encoded.max(dim=1).values

        # No Softmax here: CrossEntropyLoss applies it internally, so the
        # model returns raw logits.
        return self.fc(self.dropout(pooled))

In [46]:
# --- HYPERPARAMETERS (Matching the Keras Code) ---
MAX_SEQ_LENGTH = 20
NUM_FEATURES = 1024
DENSE_DIM = 4
NUM_HEADS = 1
CLASSES = 5 # Depending on your dataset
EPOCHS = 50 # Adjusted
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
SEED = 42

# Seed before the DataLoader and the model are created so that batch
# shuffling and weight initialisation are reproducible across runs.
torch.manual_seed(SEED)

# --- SETUP: Load the pre-computed Data ---
# Load tensors from Step 2.5.
# map_location="cpu" lets tensors that were saved from a GPU session load on a
# CPU-only kernel; each batch is moved to DEVICE inside the training loop.
train_features = torch.load("train_features.pt", map_location="cpu")
train_labels = torch.load("train_labels.pt", map_location="cpu")

# Create PyTorch Datasets
train_dataset = torch.utils.data.TensorDataset(train_features, train_labels)
# test_dataset = torch.utils.data.TensorDataset(test_features, test_labels)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
# test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False)

# --- INITIALIZE MODEL ---
model = VideoTransformerClassifier(
    sequence_length=MAX_SEQ_LENGTH,
    embed_dim=NUM_FEATURES,
    dense_dim=DENSE_DIM,
    num_heads=NUM_HEADS,
    num_classes=CLASSES
).to(DEVICE)

# Optimizer and Loss
optimizer = optim.Adam(model.parameters(), lr=1e-4) # Slightly lower LR often helps Transformers
criterion = nn.CrossEntropyLoss() # This includes Softmax

# --- THE TRAINING LOOP (The "fit" replacement) ---
def run_experiment_pytorch(val_loader=None, checkpoint_path="best_video_transformer.pth"):
    """Train the notebook-level ``model`` for ``EPOCHS`` epochs, printing progress.

    Relies on the module-level names ``model``, ``optimizer``, ``criterion``,
    ``train_loader``, ``DEVICE`` and ``EPOCHS``.

    Args:
        val_loader (iterable, optional): Yields ``(features, labels)`` batches
            used for validation after every epoch. When omitted, only the
            training loss and accuracy are reported and no checkpoint is saved.
        checkpoint_path (str): Where the ``state_dict`` of the model with the
            best validation accuracy is written (only used with ``val_loader``).

    Returns:
        None
    """
    best_acc = 0.0

    print(f"Starting training on {DEVICE}...")

    for epoch in range(EPOCHS):
        model.train() # Set to training mode (enables Dropout)
        running_loss = 0.0
        correct = 0
        total = 0

        for features, labels in train_loader:
            features, labels = features.to(DEVICE), labels.to(DEVICE)

            # 1. Zero Gradients
            optimizer.zero_grad()

            # 2. Forward Pass
            outputs = model(features)

            # 3. Calculate Loss
            loss = criterion(outputs, labels)

            # 4. Backward Pass
            loss.backward()

            # 5. Optimize
            optimizer.step()

            # Track stats
            running_loss += loss.item()
            predicted = outputs.detach().argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        # Guard against an empty loader so the summary never divides by zero.
        num_batches = len(train_loader)
        avg_loss = running_loss / num_batches if num_batches else 0.0
        train_acc = 100 * correct / total if total else 0.0

        report = (f"Epoch [{epoch+1}/{EPOCHS}] "
                  f"Loss: {avg_loss:.4f} | "
                  f"Train Acc: {train_acc:.2f}%")

        if val_loader is None:
            # Without validation data there is nothing to select a "best"
            # model on, so just report the training metrics.
            print(report)
            continue

        # --- VALIDATION LOOP ---
        model.eval() # Set to evaluation mode (disables Dropout)
        val_correct = 0
        val_total = 0
        with torch.no_grad():
            for features, labels in val_loader:
                features, labels = features.to(DEVICE), labels.to(DEVICE)
                outputs = model(features)
                predicted = outputs.argmax(dim=1)
                val_total += labels.size(0)
                val_correct += (predicted == labels).sum().item()

        val_acc = 100 * val_correct / val_total if val_total else 0.0

        print(f"{report} | "
              f"Val Acc: {val_acc:.2f}%")

        # Save Best Model (Checkpoint)
        if val_acc > best_acc:
            best_acc = val_acc
            torch.save(model.state_dict(), checkpoint_path)
            print("  -> Model Saved!")

# Run it! Starts training using the module-level model, optimizer,
# criterion and train_loader defined above in this cell.
run_experiment_pytorch()

Starting training on cpu...
