In [None]:
# Install necessary libraries
!pip install --no-cache-dir kagglehub kaggle datasets transformers

# Import necessary libraries
import os
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
from torch.utils.data import Dataset, DataLoader
from transformers import ViTModel, GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from torch.cuda.amp import GradScaler, autocast
import cv2
from datasets import load_dataset
from torchvision.models import resnet50, ResNet50_Weights
from transformers import get_linear_schedule_with_warmup

# Set up and download MSR-VTT dataset
import kagglehub
path = kagglehub.dataset_download("vishnutheepb/msrvtt")
dest_path = "/content/msrvtt"

!mkdir -p $dest_path
!mv $path/* $dest_path
print("Dataset files path:", dest_path)

# ===========================
# Step 1: Load MSR-VTT Dataset
# ===========================
dataset = load_dataset("AlexZigma/msr-vtt", split="train")
print("Dataset structure:", dataset)
print("First example:", dataset[0])

# Save video captions to JSON file
captions_dict = {example["video_id"]: example["caption"] for example in dataset}
captions_path = "/content/msrvtt_captions.json"
with open(captions_path, "w") as f:
    json.dump(captions_dict, f)
print("Captions saved to:", captions_path)

# Set video directory
video_dir = "/content/msrvtt/TrainValVideo"
os.makedirs(video_dir, exist_ok=True)

# ===========================
# Step 2: Define DataLoader
# ===========================
class MSRVTTDataset(Dataset):
    def __init__(self, video_dir, captions_path, frame_size=(224, 224), frames_per_clip=16):
        self.video_dir = video_dir
        self.frame_size = frame_size
        self.frames_per_clip = frames_per_clip
        with open(captions_path, 'r') as f:
            self.captions_data = json.load(f)
        self.video_files = list(self.captions_data.keys())
        print(f"Total videos found in captions file: {len(self.video_files)}")

    def __len__(self):
        return len(self.video_files)

    # Updated to skip files not locally available
    def load_video_frames(self, video_path):
        if not os.path.exists(video_path):
            print(f"Skipping video file not found : {video_path}")
            return None
        cap = cv2.VideoCapture(video_path)
        frames = []
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret: break
            frame = cv2.resize(frame, self.frame_size)
            frame = torch.tensor(frame).permute(2, 0, 1) / 255.0
            frames.append(frame)
        cap.release()

        if len(frames) == 0:
          print(f"Warning: No frames read from video file {video_path}.")
          return None
        frames = frames[:self.frames_per_clip]
        return torch.stack(frames) if len(frames) > 0 else None

    def __getitem__(self, idx):
        while True:
            video_file = self.video_files[idx]
            video_path = os.path.join(self.video_dir, f"{video_file}.mp4")
            frames = self.load_video_frames(video_path)
            if frames is not None:
                caption = self.captions_data[video_file]
                return frames, caption
            else:
                print(f"Skipping index {idx} due to missing frames.")
                idx = (idx + 1) % len(self)

dataset = MSRVTTDataset(video_dir, captions_path)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

# ===========================
# Step 3: Model Definition
# ===========================
resnet50 = resnet50(weights=ResNet50_Weights.IMAGENET1K_V1)
resnet_feature_dim = resnet50.fc.in_features
resnet50.fc = nn.Identity()  # Remove final layer
resnet50 = resnet50.cuda()
vit = ViTModel.from_pretrained("google/vit-base-patch16-224").cuda()
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Set pad token to eos token
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2").cuda()

# combined model
class ConViTCaptioning(nn.Module):
    def __init__(self, resnet50, vit, gpt2_model, resnet_feature_dim, embed_dim=768):
        super(ConViTCaptioning, self).__init__()
        self.resnet = resnet50
        self.vit = vit
        self.gpt2 = gpt2_model
        self.proj_layer = nn.Linear(resnet_feature_dim + vit.config.hidden_size, embed_dim)
        self.dropout = nn.Dropout(0.3)

    def forward(self, frames):
        # Extract local features with ResNet
        local_features = self.resnet(frames)  # Process batch of frames
        local_features = local_features.view(frames.size(0), -1)  # Flatten if necessary

        # Extract global features with ViT
        vit_features = self.vit(pixel_values=frames).pooler_output  # Shape: (batch_size, vit_hidden_size)

        # Combine features and project to GPT-2 embedding size
        combined_features = torch.cat((local_features, vit_features), dim=1)  # Shape: (batch_size, resnet_feat_dim + vit_hidden_size)
        projected_features = self.proj_layer(combined_features)  # Shape: (batch_size, embed_dim)
        projected_features = self.dropout(projected_features)  # Apply dropout here

        return projected_features

model = ConViTCaptioning(resnet50, vit, gpt2_model, resnet_feature_dim).cuda()
optimizer = torch.optim.Adam(gpt2_model.parameters(), lr=1e-4)
scaler = GradScaler()

# ===========================
# Step 4: Reward Model Definition + Caption Generation
# ===========================
def reward_function(generated_caption, reference_caption):
    generated_tokens = set(generated_caption.lower().split())
    reference_tokens = set(reference_caption[0].lower().split())
    overlap = len(generated_tokens.intersection(reference_tokens))
    return overlap / len(reference_tokens)

def generate_caption(features, reference_caption, threshold=0.7, max_attempts=5):
    best_caption = ""
    best_reward = 0
    for attempt in range(max_attempts):
        input_ids = tokenizer("Video summary:", return_tensors="pt", padding="max_length", truncation=True, max_length=20).input_ids.cuda()
        attention_mask = (input_ids != tokenizer.pad_token_id).long().cuda()

        with torch.amp.autocast('cuda'):
            output_ids = gpt2_model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=100,
                num_return_sequences=1,
                no_repeat_ngram_size=3,
                top_k=50,
                top_p=0.9,
                temperature=0.6,
                num_beams=7,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
            )
        generated_caption = tokenizer.decode(output_ids[0], skip_special_tokens=True)

        # Calculate reward
        reward = reward_function(generated_caption, reference_caption)
        if reward > best_reward:
            best_caption = generated_caption
            best_reward = reward
        if reward >= threshold:
            return generated_caption, reward
    return best_caption, best_reward  # Return the best caption after attempts


# ===========================
# Step 5: Training Loop (Updated for Multi-Video Batch Processing and Cross-Entropy Loss)
# ===========================
num_training_steps = len(dataloader) * 3  # 3 epochs
num_warmup_steps = int(0.1 * num_training_steps)

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps)
epochs = 3
batch_size = 4
model = ConViTCaptioning(resnet50, vit, gpt2_model, resnet_feature_dim).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
scaler = torch.amp.GradScaler()

accumulation_steps = 64
for epoch in range(epochs):
    model.train()
    valid_batches = 0
    for batch_idx, (frames, captions) in enumerate(dataloader):
        if frames is None or captions is None:
            continue
        frames = frames.cuda()
        current_batch_size, num_frames, channels, height, width = frames.shape

        # Reshape frames for batch processing
        frames = frames.view(current_batch_size * num_frames, channels, height, width)

        optimizer.zero_grad()
        with torch.amp.autocast('cuda'):
            # Forward pass to get projected features
            projected_features = model(frames)
            projected_features = projected_features.view(current_batch_size, num_frames, -1)
            video_features = projected_features.mean(dim=1)  # Average over frames

            # Prepare input and labels for GPT-2
            inputs = tokenizer(captions, return_tensors="pt", padding=True, truncation=True).input_ids.cuda()
            labels = inputs.clone()

            # Generate output from GPT-2 based on projected features
            outputs = gpt2_model(inputs_embeds=video_features.unsqueeze(1).repeat(1, inputs.shape[1], 1), labels=labels)
            loss = outputs.loss / accumulation_steps

        scaler.scale(loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        if (batch_idx + 1) % accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()
            optimizer.zero_grad()

            print(f"Epoch [{epoch+1}/{epochs}], Batch [{batch_idx+1}/{len(dataloader)}], Loss: {loss.item()}")
    torch.save(model.state_dict(), f"convit_captioning_model_epoch_{epoch+1}.pth")
    print(f"Model saved for epoch {epoch+1}")
    print(f"Epoch {epoch+1} complete.")
# ===========================
# Step 6: Save Model
# ===========================
torch.save(model.state_dict(), "convit_captioning_model.pth")
print("Model saved.")

print("\nTesting Generated Captions Against Original Captions:\n")
model.eval()
test_count = 3
with torch.no_grad():
    for idx, (frames, captions) in enumerate(dataloader):
        if idx >= test_count:
            break
        frames = frames.cuda()
        batch_size, num_frames, channels, height, width = frames.shape
        frames = frames.view(batch_size * num_frames, channels, height, width)
        features = vit(pixel_values=frames)
        features = features.last_hidden_state
        features = features.view(batch_size, num_frames, -1)
        video_features = features.mean(dim=1)
        generated_caption, _ = generate_caption(video_features, captions)
        print(f"Video ID: {dataset.video_files[idx]}")
        print(f"Original Caption: {captions}")
        print(f"Generated Caption: {generated_caption}\n")


mv: cannot stat '/root/.cache/kagglehub/datasets/vishnutheepb/msrvtt/versions/1/*': No such file or directory
Dataset files path: /content/msrvtt
Dataset structure: Dataset({
    features: ['video_id', 'caption', 'sen_id', 'category', 'url', 'start time', 'end time', 'split', 'id', '__index_level_0__'],
    num_rows: 6513
})
First example: {'video_id': 'video0', 'caption': 'a car is shown', 'sen_id': 77300, 'category': 9, 'url': 'https://www.youtube.com/watch?v=9lZi22qLlEo', 'start time': 137.72, 'end time': 149.44, 'split': 'train', 'id': 0, '__index_level_0__': 0}
Captions saved to: /content/msrvtt_captions.json
Total videos found in captions file: 6513


Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = GradScaler()


Epoch [1/3], Batch [64/1629], Loss: 0.12935574352741241
Epoch [1/3], Batch [128/1629], Loss: 0.11462084203958511
Epoch [1/3], Batch [192/1629], Loss: 0.12598364055156708
Epoch [1/3], Batch [256/1629], Loss: 0.1266632080078125
Epoch [1/3], Batch [320/1629], Loss: 0.11484319716691971
Epoch [1/3], Batch [384/1629], Loss: 0.10541076958179474
Epoch [1/3], Batch [448/1629], Loss: 0.11478233337402344
Epoch [1/3], Batch [512/1629], Loss: 0.08833789825439453
Epoch [1/3], Batch [576/1629], Loss: 0.10030937194824219
Epoch [1/3], Batch [640/1629], Loss: 0.09641341865062714
Epoch [1/3], Batch [704/1629], Loss: 0.09109151363372803
Epoch [1/3], Batch [768/1629], Loss: 0.09230327606201172
Epoch [1/3], Batch [832/1629], Loss: 0.08915825933218002
Epoch [1/3], Batch [896/1629], Loss: 0.10894598066806793
Epoch [1/3], Batch [960/1629], Loss: 0.09387648850679398
Epoch [1/3], Batch [1024/1629], Loss: 0.0773005336523056
Epoch [1/3], Batch [1088/1629], Loss: 0.11694017797708511
Epoch [1/3], Batch [1152/1629], 

In [1]:
# torch.save(model.state_dict(), "convit_captioning_model.pth")
# print("Model saved.")

print("\nTesting Generated Captions Against Original Captions:\n")
model.eval()
test_count = 3
with torch.no_grad():
    for idx, (frames, captions) in enumerate(dataloader):
        if idx >= test_count:
            break
        frames = frames.cuda()
        batch_size, num_frames, channels, height, width = frames.shape
        frames = frames.view(batch_size * num_frames, channels, height, width)
        features = vit(pixel_values=frames)
        features = features.last_hidden_state
        features = features.view(batch_size, num_frames, -1)
        video_features = features.mean(dim=1)
        generated_caption, _ = generate_caption(video_features, captions)
        print(f"Video ID: {dataset.video_files[idx]}")
        print(f"Original Caption: {captions}")
        print(f"Generated Caption: {generated_caption}\n")


Testing Generated Captions Against Original Captions:



NameError: name 'model' is not defined