# Set the Working Directory

In [None]:
import os

# Capture the directory the kernel was started in; every later path is built
# from it. Re-running this cell after the later `%cd` cell would capture the
# Video-LLaMA directory instead, so run it once on a fresh kernel.
working_dir = os.getcwd()

# Guard clause: stop immediately if the captured path is not a directory.
if not os.path.isdir(working_dir):
    raise ValueError("Working directory does not exist")

print("Working directory is ready!")

# Set the Trace Directory

In [None]:
import yaml

# Load configuration file (read as UTF-8 explicitly so the result does not
# depend on the platform's default text encoding).
config_path = os.path.join(working_dir, "configuration.yaml")
with open(config_path, "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# An empty YAML document parses to None and a missing key would otherwise raise
# a bare TypeError/KeyError; fail with a message that names the file instead.
if not isinstance(config, dict) or "working_trace" not in config:
    raise KeyError(f"'working_trace' is not defined in {config_path}")

# Resolve trace directory: <working_dir>/data/<working_trace>
trace_dir = os.path.join(working_dir, "data", config["working_trace"])

if os.path.isdir(trace_dir):
    print(f"✅ Trace directory ready: {trace_dir}")
else:
    raise FileNotFoundError(f"❌ Trace directory not found: {trace_dir}\n")

In [None]:
from utils import read_file_ids

# Ask the project helper for the IDs found under the trace directory.
# NOTE(review): `overwrite=False` is passed straight through; its exact effect
# is defined in `utils.read_file_ids` — confirm there.
file_ids = read_file_ids(overwrite=False, trace_dir=trace_dir)

total_videos = len(file_ids)
print(f"📁 Total video files found: {total_videos}")

# Change to the `Video-LLaMA` Directory

In [None]:
# Make <working_dir>/models/Video-LLaMA the kernel's current directory.
# Later cells use relative `eval_configs/...` paths and import `video_llama`,
# which presumably rely on this directory being current — confirm.
%cd {working_dir}/models/Video-LLaMA

# Generate LLM-Generated Description

In [None]:
# Supported model sizes and the (relative) evaluation config each one uses.
model_size_to_cfg = {
    "7b": "eval_configs/video_llama_eval_withaudio_7b.yaml",
    "13b": "eval_configs/video_llama_eval_withaudio_13b.yaml"
}

# Keep prompting until the normalized answer matches one of the known sizes.
while True:
    selected_model_size = input("Select model size ('7b' or '13b'): ").strip().lower()
    if selected_model_size in model_size_to_cfg:
        break
    print("❌ Invalid input. Please enter '7b' or '13b'.")

# Look up the config path for the chosen size; `parse_args` reads this value
# as the default for `--cfg-path`.
resolved_cfg_path = model_size_to_cfg[selected_model_size]
print(f"✅ Using configuration: {resolved_cfg_path}")

In [None]:
import argparse
import gradio as gr
import numpy as np
import os
import random
import torch
import torch.backends.cudnn as cudnn

import decord
# Select decord's "torch" bridge before any `video_llama` module is imported.
# NOTE(review): presumably this makes decord return torch tensors — confirm
# against the decord documentation.
decord.bridge.set_bridge("torch")

from video_llama.common.config import Config
from video_llama.common.dist_utils import get_rank
from video_llama.common.registry import registry
from video_llama.conversation.conversation_video import (
    Chat, Conversation, default_conversation, SeparatorStyle, conv_llava_llama_2
)
from video_llama.datasets.builders import *
from video_llama.models import *
from video_llama.processors import *
from video_llama.runners import *
from video_llama.tasks import *

def parse_args():
    """Build the demo argument namespace.

    Registers `--cfg-path` (default: the notebook-level `resolved_cfg_path`),
    `--gpu-id` (int, default 0), `--model_type` (str, default "llama_2") and
    `--options` (one or more values).

    Returns:
        argparse.Namespace: the recognised options. Unrecognised command-line
        tokens are ignored because `parse_known_args` is used.
    """
    cli = argparse.ArgumentParser(description="Demo")

    # (flag, keyword arguments) pairs, registered in the original order.
    option_specs = (
        ("--cfg-path", {"default": resolved_cfg_path}),
        ("--gpu-id", {"type": int, "default": 0}),
        ("--model_type", {"type": str, "default": "llama_2"}),
        ("--options", {"nargs": "+"}),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)

    known, _unknown = cli.parse_known_args()
    return known

def upload_imgorvideo(chat, gr_video, chatbot, chat_state, audio_flag):
    """Register a video with `chat` and start a fresh conversation for it.

    Args:
        chat: object exposing `upload_video(video, conversation, img_list)`.
        gr_video: the video to upload (the caller in this notebook passes a path).
        chatbot: history list; `((gr_video,), None)` is appended to it in place.
        chat_state: accepted for signature compatibility; the incoming value is
            discarded and replaced by a new conversation.
        audio_flag: accepted but never read by this function.

    Returns:
        tuple: `(chatbot, conversation, img_list)` where `conversation` is a copy
        of `conv_llava_llama_2` with an empty `system` field and `img_list` is
        the list handed to `chat.upload_video`.
    """
    chatbot.append(((gr_video,), None))

    # Always start from a clean copy of the template with a blank system field.
    conversation = conv_llava_llama_2.copy()
    conversation.system = ""

    # NOTE(review): presumably `upload_video` fills this list in place — confirm;
    # its return value is ignored here.
    frames = []
    chat.upload_video(gr_video, conversation, frames)

    return chatbot, conversation, frames

def gradio_ask(chat, user_message, chatbot, chat_state):
    """Submit a user prompt to the conversation and record it in the history.

    Args:
        chat: object exposing `ask(message, conversation)`.
        user_message: the prompt text.
        chatbot: history list; `[user_message, None]` is appended in place.
        chat_state: the conversation object passed on to `chat.ask`.

    Returns:
        tuple: always the 2-tuple `(chatbot, chat_state)`.

    Raises:
        ValueError: if `user_message` is None, empty or whitespace-only.
            The blank branch used to return a 3-tuple holding a Gradio widget
            update; callers that unpack two values failed on it with an opaque
            "too many values to unpack". This notebook has no UI to update,
            so the problem is now reported directly and nothing is sent to
            `chat` or appended to the history.
    """
    if user_message is None or not user_message.strip():
        raise ValueError("Input should not be empty!")

    chat.ask(user_message, chat_state)
    chatbot.append([user_message, None])
    return chatbot, chat_state

def gradio_answer(chat, chatbot, chat_state, img_list, num_beams, temperature):
    """Ask `chat` for an answer and return the first element of its result.

    Args:
        chat: object exposing `answer(conv=..., img_list=..., ...)`.
        chatbot: accepted for signature compatibility; not read here.
        chat_state: conversation object, forwarded as `conv`.
        img_list: forwarded unchanged as `img_list`.
        num_beams: forwarded unchanged.
        temperature: forwarded unchanged.

    Returns:
        Element 0 of whatever `chat.answer` returns (the caller in this
        notebook writes it to a text file).
    """
    # Generation limits are fixed here: 300 new tokens, total length 2000.
    generation_kwargs = {
        "conv": chat_state,
        "img_list": img_list,
        "num_beams": num_beams,
        "temperature": temperature,
        "max_new_tokens": 300,
        "max_length": 2000,
    }
    outputs = chat.answer(**generation_kwargs)
    return outputs[0]

This step may take several minutes — hang tight! ⏲😴
- 📟 `video_llama_eval_withaudio_7b` requires approximately **24 GB** of GPU memory.
- 📟 `video_llama_eval_withaudio_13b` requires approximately **40 GB** of GPU memory.
- ⚠️ Please ensure your system has sufficient GPU memory and select the appropriate model accordingly.

In [None]:
# Build the argument namespace and the Video-LLaMA configuration from it.
args = parse_args()
cfg = Config(args)

# Point the model configuration at the selected GPU before the model is built.
model_cfg = cfg.model_cfg
model_cfg.device_8bit = args.gpu_id
device_str = f"cuda:{args.gpu_id}"

# Look up the architecture in the registry, build it, move it to the GPU and
# switch it to evaluation mode.
model_cls = registry.get_model_class(model_cfg.arch)
model = model_cls.from_config(model_cfg)
model = model.to(device_str)
model.eval()

# Build the visual processor from the `webvid` dataset's `train` processor entry.
vis_processor_cfg = cfg.datasets_cfg.webvid.vis_processor.train
vis_processor_cls = registry.get_processor_class(vis_processor_cfg.name)
vis_processor = vis_processor_cls.from_config(vis_processor_cfg)

# Chat wrapper bound to the model, the processor and the same device.
chat = Chat(model, vis_processor, device=device_str)

print("✅ Model and chat interface initialized successfully.")

In [None]:
import time

# Descriptions are stored one text file per video under
# <trace_dir>/features/llm_generated_description/.
feature_dir = os.path.join(trace_dir, "features", "llm_generated_description")
os.makedirs(feature_dir, exist_ok=True)

total_files = len(file_ids)

for idx, file_id in enumerate(file_ids, start=1):
    output_file = os.path.join(feature_dir, f"{file_id}.txt")

    # Resume support: a description that already exists is never regenerated.
    if os.path.exists(output_file):
        print(f"✅ Description already exists: {file_id}.txt")
        continue

    video_path = os.path.join(trace_dir, "videos", f"{file_id}.mp4")
    print(f"🛠️ [{idx}/{total_files}] Generating: {file_id}.txt")

    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments during a long generation.
    start_time = time.perf_counter()

    # Initialize a new chat session for every video
    chat = Chat(model, vis_processor, device=f"cuda:{args.gpu_id}")

    # Upload video and prepare conversation
    chatbot, chat_state, img_list = upload_imgorvideo(chat, video_path, [], None, audio_flag=True)

    # Submit the prompt
    chatbot, chat_state = gradio_ask(chat, "Describe this video", chatbot, chat_state)

    # Get the model's response
    response = gradio_answer(chat, chatbot, chat_state, img_list, num_beams=1, temperature=1)

    # Write to a sibling temp file and rename it into place. The resume check
    # above trusts file existence, so an interrupted in-place write would leave
    # a truncated description that is skipped on every later run.
    partial_file = f"{output_file}.tmp"
    with open(partial_file, "w", encoding="utf-8") as f:
        f.write(response)
    os.replace(partial_file, output_file)

    elapsed = time.perf_counter() - start_time

    print(f"📦 Saved description: {file_id}.txt (elapsed: {elapsed:.2f} seconds)")
    print(f"📝 Description {idx}/{total_files}: {response}")

In [None]:
import pickle
from tqdm.notebook import tqdm

# Gather every per-video description file into one {file_id: text} mapping
# and store it as <trace_dir>/descriptions.pickle.
description_dir = os.path.join(trace_dir, "features", "llm_generated_description")
output_path = os.path.join(trace_dir, "descriptions.pickle")

descriptions = {}

for file_id in tqdm(file_ids, desc="Loading descriptions"):
    description_file = os.path.join(description_dir, f"{file_id}.txt")

    # A text-mode read already returns `str`, so the former bytes-decoding
    # branch could never run and has been removed.
    with open(description_file, "r", encoding="utf-8") as f:
        text = f.read()

    descriptions[file_id] = text

# Save descriptions to pickle
with open(output_path, "wb") as f:
    pickle.dump(descriptions, f)

print(f"✅ Saved {len(descriptions)} descriptions to {output_path}")