In [1]:
# PIP Install 
!pip install --upgrade transformers
!pip install sentencepiece ftfy regex tqdm opencv-python-headless datasets
!pip install git+https://github.com/openai/CLIP.git
!pip install torch torchvision torchaudio


Collecting transformers
  Downloading transformers-4.47.0-py3-none-any.whl.metadata (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.5/43.5 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.47.0-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m66.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m92.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing 

In [2]:

!kaggle datasets download -d vishnutheepb/msrvtt -p /content/msrvtt_videos --unzip


Dataset URL: https://www.kaggle.com/datasets/vishnutheepb/msrvtt
License(s): unknown
Downloading msrvtt.zip to /content/msrvtt_videos
100% 4.25G/4.26G [00:39<00:00, 167MB/s]
100% 4.26G/4.26G [00:39<00:00, 115MB/s]


In [3]:
import cv2
from PIL import Image
from tqdm import tqdm
import numpy as np
import os


video_dir = "/content/msrvtt_videos/TrainValVideo"

if not os.path.exists(video_dir):
    raise ValueError(f"Video directory {video_dir} does not exist.")

# Frame extraction
def extract_frames(video_path, num_frames=16, resize=(224, 224)):
    cap = cv2.VideoCapture(video_path)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    if total_frames < num_frames:
        raise ValueError(f"Video {video_path} has only {total_frames} frames, which is less than the required {num_frames} frames.")
    frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
    frames = []
    for idx in frame_indices:
        cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
        ret, frame = cap.read()
        if ret:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = Image.fromarray(frame).resize(resize)
            frames.append(frame)
    cap.release()
    if len(frames) != num_frames:
        raise ValueError(f"Expected {num_frames} frames, but got {len(frames)} from {video_path}")
    return frames


In [4]:
import torch
import clip
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize

device = "cuda" if torch.cuda.is_available() else "cpu"

#  CLIP model
clip_model, preprocess = clip.load("ViT-B/32", device=device)

clip_model = clip_model.float()

# Preprocessing
preprocess = Compose([
    Resize(224, interpolation=Image.BICUBIC),
    CenterCrop(224),
    ToTensor(),
    Normalize(mean=(0.48145466, 0.4578275, 0.40821073),
              std=(0.26862954, 0.26130258, 0.27577711)),
])

# Extract features 
def extract_video_features(video_path, num_frames=16):
    frames = extract_frames(video_path, num_frames=num_frames)
    frame_tensors = torch.stack([preprocess(frame) for frame in frames]).to(device)
    with torch.no_grad():
        image_features = clip_model.encode_image(frame_tensors)  
        video_features = image_features.mean(dim=0)  
    return video_features.cpu().float() 


100%|███████████████████████████████████████| 338M/338M [00:03<00:00, 96.4MiB/s]


In [5]:
import pandas as pd
from datasets import load_dataset

 MSR-VTT dataset
print("Loading dataset...")
dataset = load_dataset("AlexZigma/msr-vtt", split="train")

captions_df = pd.DataFrame(dataset)
captions_df = captions_df[['video_id', 'caption']]

video_dir = "/content/msrvtt_videos/TrainValVideo"

captions_df['video_path'] = captions_df['video_id'].apply(lambda x: os.path.join(video_dir, f"{x}.mp4"))

# Filter
captions_df = captions_df[captions_df['video_path'].apply(os.path.exists)].reset_index(drop=True)

print(f"Total samples: {len(captions_df)}")

train_size = int(0.8 * len(captions_df))
train_captions_df = captions_df.iloc[:train_size]
test_captions_df = captions_df.iloc[train_size:]
print(f"Training samples: {len(train_captions_df)}, Testing samples: {len(test_captions_df)}")

train_captions_df = train_captions_df.sample(n=100, random_state=42).reset_index(drop=True)
test_captions_df = test_captions_df.sample(n=50, random_state=42).reset_index(drop=True)
print(f"Training samples: {len(train_captions_df)}, Testing samples: {len(test_captions_df)}")


Loading dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/734 [00:00<?, ?B/s]

(…)-00000-of-00001-60e50ff5fbbd1bb5.parquet:   0%|          | 0.00/553k [00:00<?, ?B/s]

(…)-00000-of-00001-01bacdd7064306bc.parquet:   0%|          | 0.00/44.8k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6513 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/497 [00:00<?, ? examples/s]

Total samples: 6513
Training samples: 5210, Testing samples: 1303
Training samples: 100, Testing samples: 50


In [6]:
import torch
from torch import nn
from transformers import BartTokenizer, BartForConditionalGeneration

device = "cuda" if torch.cuda.is_available() else "cpu"

# BART tokenizer and model
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
bart_model = BartForConditionalGeneration.from_pretrained("facebook/bart-base").to(device)


bart_model = bart_model.float()

for param in bart_model.parameters():
    param.requires_grad = False

# Video Captioning Model
class VideoCaptioningModel(nn.Module):
    def __init__(self, clip_feature_dim, decoder_model, hidden_dim=768, seq_length=10):
        super(VideoCaptioningModel, self).__init__()
        self.decoder = decoder_model
        self.linear = nn.Linear(clip_feature_dim, hidden_dim * seq_length)
        self.relu = nn.ReLU()
        self.seq_length = seq_length
        self.hidden_dim = hidden_dim

    def forward(self, video_features, input_ids, attention_mask, labels=None):

      video_features = video_features.float()

    # Map video features to hidden states
      mapped_features = self.relu(self.linear(video_features))  

      encoder_hidden_states = mapped_features.view(-1, self.seq_length, self.hidden_dim)

      decoder_outputs = self.decoder(
          input_ids=input_ids,
          attention_mask=attention_mask,
          labels=labels
      )

      return decoder_outputs


# Initialization
clip_feature_dim = 512  
seq_length = 10  
model = VideoCaptioningModel(clip_feature_dim=clip_feature_dim, decoder_model=bart_model, seq_length=seq_length).to(device)

model = model.float()


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

In [7]:
max_length = 20  


In [8]:
from torch.utils.data import Dataset, DataLoader

class VideoCaptionDataset(Dataset):
    def __init__(self, captions_df, tokenizer, clip_model, device, num_frames=16, max_length=20):
        self.captions_df = captions_df
        self.tokenizer = tokenizer
        self.clip_model = clip_model
        self.device = device
        self.num_frames = num_frames
        self.max_length = max_length

    def __len__(self):
        return len(self.captions_df)

    def __getitem__(self, idx):
        video_path = self.captions_df.iloc[idx]['video_path']
        caption = self.captions_df.iloc[idx]['caption']

        # Extract video features
        video_features = extract_video_features(video_path, num_frames=self.num_frames) 

        # Tokenize caption
        tokens = self.tokenizer(
            caption,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        input_ids = tokens.input_ids.squeeze(0)
        attention_mask = tokens.attention_mask.squeeze(0)

        return {
            'video_features': video_features,
            'input_ids': input_ids,
            'attention_mask': attention_mask
        }
#
# Datasets
train_dataset = VideoCaptionDataset(train_captions_df, tokenizer, clip_model, device)
test_dataset = VideoCaptionDataset(test_captions_df, tokenizer, clip_model, device)

# Dataloaders
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)


In [9]:

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

from tqdm import tqdm

EPOCHS = 12

In [10]:
import torch

for param in bart_model.parameters():
    param.requires_grad = True


batch = next(iter(train_dataloader))
video_features = batch['video_features'].to(device).float()
video_features = torch.autograd.Variable(video_features, requires_grad=True)

# Forward pass
model.train()
outputs = model(
    video_features=video_features,
    input_ids=batch['input_ids'].to(device),
    attention_mask=batch['attention_mask'].to(device),
    labels=batch['input_ids'].to(device)
)

# Check loss gradients
loss = outputs.loss
print(f"Loss requires_grad: {loss.requires_grad}")
try:
    loss.backward()
    print("Backward pass succeeded.")
except Exception as e:
    print(f"Backward pass failed: {e}")


Loss requires_grad: True
Backward pass succeeded.


In [11]:
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        video_features = batch['video_features'].to(device).float()
        video_features = torch.autograd.Variable(video_features, requires_grad=True)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = input_ids.clone()

        optimizer.zero_grad()
        outputs = model(video_features=video_features, input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1} completed. Average Loss: {total_loss / len(train_dataloader):.4f}")


Epoch 1 completed. Average Loss: 4.7951
Epoch 2 completed. Average Loss: 1.8920
Epoch 3 completed. Average Loss: 0.5556
Epoch 4 completed. Average Loss: 0.0987
Epoch 5 completed. Average Loss: 0.0453
Epoch 6 completed. Average Loss: 0.0260
Epoch 7 completed. Average Loss: 0.0340
Epoch 8 completed. Average Loss: 0.0923
Epoch 9 completed. Average Loss: 0.0861
Epoch 10 completed. Average Loss: 0.0592
Epoch 11 completed. Average Loss: 0.0476
Epoch 12 completed. Average Loss: 0.0163


In [None]:
# Generate captions
def generate_caption(model, tokenizer, video_path, clip_model, device, num_frames=16, max_length=20):
    model.eval()
    with torch.no_grad():
        # Extract video features
        video_features = extract_video_features(video_path, num_frames=num_frames).to(device)  
        video_features = video_features.unsqueeze(0)  

        
        mapped_features = model.linear(video_features).float()  
        mapped_features = model.relu(mapped_features) 
        encoder_hidden_states = mapped_features.view(-1, model.seq_length, model.hidden_dim)  

        
        encoder_attention_mask = torch.ones(encoder_hidden_states.size()[:2], dtype=torch.long).to(device)

        # Initialization
        input_ids = torch.tensor([[tokenizer.bos_token_id]]).to(device)
        attention_mask = torch.ones_like(input_ids).to(device)

        generated_ids = model.decoder.generate(
          input_ids=input_ids,
          attention_mask=attention_mask,
          max_length=max_length,
          num_beams=5,  
          early_stopping=True
        )


        for _ in range(max_length):
          
            outputs = model(
                video_features=video_features,
                input_ids=generated_ids,
                attention_mask=attention_mask,
                labels=None  
            )

           
            logits = outputs.logits[:, -1, :]  

            # Prediction
            next_token_id = logits.argmax(dim=-1).unsqueeze(-1) 

            generated_ids = torch.cat([generated_ids, next_token_id], dim=-1)
            attention_mask = torch.cat([attention_mask, torch.ones_like(next_token_id)], dim=-1)

            if next_token_id.item() == tokenizer.eos_token_id:
                break

        # Decode the generated tokens
        caption = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    return caption


print("\n=== Example Inference on Test Set ===\n")
for idx in range(1):  
    video_path = test_captions_df.iloc[idx]['video_path']
    true_caption = test_captions_df.iloc[idx]['caption']
    generated_caption = generate_caption(model, tokenizer, video_path, clip_model, device)
    print(f"Video {idx+1}:")
    print(f"True Caption: {true_caption}")
    print(f"Generated Caption: {generated_caption}\n")
