In [1]:
import sys
sys.path.insert(0, "/Users/abali/Documents/github projects/semantic-video-retrieval")
from clip_embedding import embed_frame,embed_text
import os
import numpy as np
import faiss
import pickle



In [2]:


from extract_chunks_updated import extract_chunks
# ✅ Set paths
video_folder = "/Users/abali/Documents/github projects/semantic-video-retrieval/data/videos"
frames_output_folder = "/Users/abali/Documents/github projects/semantic-video-retrieval/data/frames"
# index_path = "/Users/abali/github projects/semantic-video-retrieval/embeddings/faiss_index"
# metadata_path = "/Users/abali/github projects/semantic-video-retrieval/embeddings/metadata"

# ✅ Extract chunks from videos
all_metadata = []
for fname in os.listdir(video_folder):
    if fname.endswith(".mp4"):
        video_path = os.path.join(video_folder, fname)
        print(f"📦 Processing {fname}...")
        chunks = extract_chunks(video_path, frames_output_folder, chunk_duration=15, fps=1)
        all_metadata.extend(chunks)



📦 Processing sample2.mp4...
📦 Processing sample1.mp4...


In [4]:


def get_chunk_embeddings(chunk_folder_root):
    """
    Given a directory of chunks (each containing frames), compute the average embedding
    for each chunk and return a list of {chunk_id, embedding}.
    """
    chunk_embeddings = []

    for chunk_name in sorted(os.listdir(chunk_folder_root)):
        chunk_path = os.path.join(chunk_folder_root, chunk_name)
        if not os.path.isdir(chunk_path):
            continue

        print(f"🔍 Processing {chunk_name}...")
        frame_embeddings = []

        for fname in sorted(os.listdir(chunk_path)):
            if fname.lower().endswith((".jpg", ".jpeg", ".png")):
                image_path = os.path.join(chunk_path, fname)
                emb = embed_frame(image_path)
                if emb is not None and not np.isnan(emb).any():
                    frame_embeddings.append(emb)
        
        if frame_embeddings:
            mean_emb = np.mean(frame_embeddings, axis=0).astype("float32")
            chunk_embeddings.append({
                "chunk_id": chunk_name,
                "embedding": mean_emb
            })
            print(f"✅ Embedded: {chunk_name} ({len(frame_embeddings)} frames)")
        else:
            print(f"⚠️ No valid frames found for {chunk_name}")

    return chunk_embeddings


In [5]:
chunk_embeddings=get_chunk_embeddings('/Users/abali/Documents/github projects/semantic-video-retrieval/data/chunks')

🔍 Processing sample1_chunk0...
✅ Embedded: sample1_chunk0 (5 frames)
🔍 Processing sample2_chunk0...
✅ Embedded: sample2_chunk0 (15 frames)


In [6]:
chunk_embeddings

[{'chunk_id': 'sample1_chunk0',
  'embedding': array([ 1.31965401e-02, -1.28017098e-01,  1.43216014e-01, -2.96000510e-01,
          2.50013292e-01, -3.38098709e-03,  3.97343993e-01,  5.85803270e-01,
         -1.30579039e-01,  4.07086670e-01,  3.59473556e-01, -1.30832046e-01,
         -9.47870612e-01, -5.57159074e-02, -8.37708861e-02,  1.23554496e-02,
         -9.94147956e-01,  2.12496474e-01,  1.18775807e-01, -3.38319386e-03,
          1.13974154e+00, -1.98749632e-01, -3.40611428e-01,  5.88242471e-01,
          6.88612908e-02,  6.78199083e-02,  2.36649252e-02, -1.32871181e-01,
          1.91118509e-01, -1.33325383e-01, -3.09974045e-01,  4.98383492e-02,
         -2.27982715e-01, -1.53725624e-01,  1.99300796e-01,  1.61316574e-01,
         -8.09692740e-02, -1.72532931e-01, -9.98654813e-02,  4.86957014e-01,
         -3.69622558e-01,  5.63748404e-02,  1.94802493e-01,  2.98628986e-01,
          4.90416959e-02, -8.19855392e-01,  3.73647422e-01,  1.59093574e-01,
         -3.76491472e-02,  1.15

In [7]:
import os
import pickle
import numpy as np
import faiss

def store_embeddings_to_faiss(embedding_data, index_path, metadata_path):
    if not embedding_data:
        print("⚠️ No embeddings to store.")
        return

    os.makedirs(os.path.dirname(index_path), exist_ok=True)
    os.makedirs(os.path.dirname(metadata_path), exist_ok=True)

    embeddings = [item["embedding"] for item in embedding_data]
    metadata = [{"chunk_id": item["chunk_id"]} for item in embedding_data]

    arr = np.vstack(embeddings).astype("float32")
    print(f"📐 FAISS index shape: {arr.shape}")

    index = faiss.IndexFlatL2(arr.shape[1])
    index.add(arr)

    faiss.write_index(index, index_path)
    with open(metadata_path, "wb") as f:
        pickle.dump(metadata, f)

    print(f"✅ Stored {len(arr)} vectors in FAISS and metadata")


In [8]:
store_embeddings_to_faiss(chunk_embeddings, '/Users/abali/github projects/semantic-video-retrieval/embeddings/faiss_index/video_chunks.index', '/Users/abali/github projects/semantic-video-retrieval/embeddings/metadata/chunk_metadata.pkl')  

📐 FAISS index shape: (2, 512)
✅ Stored 2 vectors in FAISS and metadata


In [9]:
def search_top_chunks(query, index_path, metadata_path, k=5):
    # Load index
    index = faiss.read_index(index_path)

    # Load metadata
    with open(metadata_path, "rb") as f:
        metadata = pickle.load(f)

    # Embed query
    query_embedding = embed_text(query).astype("float32").reshape(1, -1)

    # Search
    D, I = index.search(query_embedding, k)

    # Collect results
    results = []
    for i in I[0]:
        if i < len(metadata):
            results.append(metadata[i])
    
    return results

In [10]:
results = search_top_chunks(
    query="moving cars",
    index_path="/Users/abali/github projects/semantic-video-retrieval/embeddings/faiss_index/video_chunks.index",
    metadata_path="/Users/abali/github projects/semantic-video-retrieval/embeddings/metadata/chunk_metadata.pkl",k=1)

for r in results:
    print(f"🎬 Chunk: {r['chunk_id']}")


🎬 Chunk: sample1_chunk0


In [12]:
import os
import subprocess

# Loop through the search results and play the video
for r in results:
    # Extract the base video file name (e.g., sample1.mp4 from sample1_chunk0)
    video_name = r['chunk_id'].split('_')[0] + '.mp4'
    
    # Construct the full video path
    video_path = os.path.join("/Users/abali/Documents/github projects/semantic-video-retrieval/data/videos", video_name)

    print(f"🎬 Chunk: {r['chunk_id']}")

    # Ensure the video exists
    if os.path.exists(video_path):
        print(f"▶️ Opening video {video_name}...")

        # Use ffplay to open the entire video
        subprocess.Popen(["open", video_path])  # macOS-specific command to open video with default player
    else:
        print(f"⚠️ Video not found: {video_path}")


🎬 Chunk: sample1_chunk0
▶️ Opening video sample1.mp4...


In [27]:
import pickle

with open("/Users/abali/Documents/github projects/semantic-video-retrieval/embeddings/metadata/enriched_chunk_metadata.pkl", "rb") as f:
    metadata = pickle.load(f)

print(metadata[0])


{'chunk_id': 'sample1_chunk0', 'video_id': 'sample1', 'frame_paths': ['/Users/abali/Documents/github projects/semantic-video-retrieval/data/chunks/sample1_chunk0/frame_0.jpg', '/Users/abali/Documents/github projects/semantic-video-retrieval/data/chunks/sample1_chunk0/frame_1.jpg', '/Users/abali/Documents/github projects/semantic-video-retrieval/data/chunks/sample1_chunk0/frame_2.jpg', '/Users/abali/Documents/github projects/semantic-video-retrieval/data/chunks/sample1_chunk0/frame_3.jpg', '/Users/abali/Documents/github projects/semantic-video-retrieval/data/chunks/sample1_chunk0/frame_4.jpg'], 'num_frames': 5, 'start_frame': 0, 'end_frame': 4}


In [13]:
# Sample enhancement (you can adapt this based on your real folder structure)
from pathlib import Path

updated_metadata = []
for entry in metadata:
    chunk_id = entry['chunk_id']
    video_id = chunk_id.split('_')[0]
    chunk_folder = Path(f"/Users/abali/Documents/github projects/semantic-video-retrieval/data/chunks/{chunk_id}")
    frame_files = sorted(chunk_folder.glob("frame_*.jpg"))

    enriched = {
        "chunk_id": chunk_id,
        "video_id": video_id,
        "frame_paths": [str(p) for p in frame_files],
        "num_frames": len(frame_files),
        "start_frame": int(frame_files[0].stem.split('_')[-1]) if frame_files else None,
        "end_frame": int(frame_files[-1].stem.split('_')[-1]) if frame_files else None,
    }
    updated_metadata.append(enriched)

# Save this enriched metadata for graph construction
import pickle
with open("/Users/abali/Documents/github projects/semantic-video-retrieval/embeddings/metadata/enriched_chunk_metadata.pkl", "wb") as f:
    pickle.dump(updated_metadata, f)


In [1]:
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer

local_path = "/Users/abali/Documents/github projects/semantic-video-retrieval/local_models/vit-gpt2-image-captioning"

model = VisionEncoderDecoderModel.from_pretrained(local_path)
processor = ViTImageProcessor.from_pretrained(local_path)
tokenizer = AutoTokenizer.from_pretrained(local_path)


  from .autonotebook import tqdm as notebook_tqdm
Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "architectures": [
    "ViTModel"
  ],
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "pooler_act": "tanh",
  "pooler_output_size": 768,
  "qkv_bias": true,
  "torch_dtype": "float32",
  "transformers_version": "4.51.2"
}

Config of the decoder: <class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'> is overwritten by shared decoder config: GPT2Config {
  "activation_function": "gelu_new",
  "add_cross_attention": true,
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop":

## Adding Image Description to Enrich Metadata

In [4]:
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch

local_path = "/Users/abali/Documents/github projects/semantic-video-retrieval/local_models/blip-image-captioning-base"
device = "cuda" if torch.cuda.is_available() else "cpu"

processor = BlipProcessor.from_pretrained(local_path)
model = BlipForConditionalGeneration.from_pretrained(local_path).to(device)


## Adding Description,Entities/ Objects to metadata

In [9]:
import os
import pickle
import torch
from PIL import Image
from transformers import (
    VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer,
    BlipProcessor, BlipForConditionalGeneration
)
import spacy
from ultralytics import YOLO

# === CONFIG ===
USE_BLIP = True  # 🔁 Toggle this to False to use ViT-GPT2
device = "cuda" if torch.cuda.is_available() else "cpu"

# === Load Captioning Model ===
if USE_BLIP:
    blip_path = "/Users/abali/Documents/github projects/semantic-video-retrieval/local_models/blip-image-captioning-base"
    processor = BlipProcessor.from_pretrained(blip_path)
    caption_model = BlipForConditionalGeneration.from_pretrained(blip_path).to(device)
    tokenizer = processor.tokenizer  # BLIP uses same tokenizer object
else:
    vit_path = "/Users/abali/Documents/github projects/semantic-video-retrieval/local_models/vit-gpt2-image-captioning"
    caption_model = VisionEncoderDecoderModel.from_pretrained(vit_path).to(device)
    processor = ViTImageProcessor.from_pretrained(vit_path)
    tokenizer = AutoTokenizer.from_pretrained(vit_path)

# === Load NLP and Object Detection Models ===
nlp = spacy.load("en_core_web_sm")
yolo_model = YOLO("yolov8n.pt")  # Swap with yolov8m.pt if needed

# === Frame Processors ===
def generate_caption(image_path):
    try:
        image = Image.open(image_path).convert("RGB")
        pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(device)

        with torch.no_grad():
            output_ids = caption_model.generate(pixel_values, max_length=20, num_beams=4)

        caption = tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()
        return caption
    except Exception as e:
        print(f"❌ Caption error on {image_path}: {e}")
        return None

def extract_entities(texts):
    entities = set()
    for txt in texts:
        doc = nlp(txt)
        entities.update([ent.text.lower() for ent in doc.ents])
        entities.update([chunk.text.lower() for chunk in doc.noun_chunks])
    return list(entities)

def detect_objects(image_path):
    try:
        results = yolo_model(image_path)
        objects = set()
        for r in results:
            for box in r.boxes:
                cls_id = int(box.cls[0])
                label = r.names[cls_id]
                objects.add(label.lower())
        return list(objects)
    except Exception as e:
        print(f"❌ Detection error on {image_path}: {e}")
        return []

# === Load Chunk Metadata ===
with open("/Users/abali/Documents/github projects/semantic-video-retrieval/embeddings/metadata/enriched_chunk_metadata.pkl", "rb") as f:
    chunks = pickle.load(f)

# === Enrichment Loop ===
for chunk in chunks:
    frame_paths = chunk.get("frame_paths", [])[:2]
    captions = []
    all_objects = set()

    for frame_path in frame_paths:
        if not os.path.exists(frame_path):
            continue

        caption = generate_caption(frame_path)
        if caption:
            captions.append(caption)

        detected = detect_objects(frame_path)
        all_objects.update(detected)

    chunk["captions"] = captions
    chunk["caption_entities"] = extract_entities(captions)
    chunk["objects"] = list(all_objects)
    chunk["all_entities"] = list(set(chunk["caption_entities"]) | set(chunk["objects"]))

# === Save Output ===
with open("/Users/abali/Documents/github projects/semantic-video-retrieval/embeddings/metadata/enriched_with_all.pkl", "wb") as f:
    pickle.dump(chunks, f)

print("✅ Full enrichment complete (captions + entities + objects)!")



image 1/1 /Users/abali/Documents/github projects/semantic-video-retrieval/data/chunks/sample1_chunk0/frame_0.jpg: 384x640 2 cars, 1 truck, 61.4ms
Speed: 3.3ms preprocess, 61.4ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)

image 1/1 /Users/abali/Documents/github projects/semantic-video-retrieval/data/chunks/sample1_chunk0/frame_1.jpg: 384x640 2 cars, 1 truck, 64.0ms
Speed: 3.3ms preprocess, 64.0ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

image 1/1 /Users/abali/Documents/github projects/semantic-video-retrieval/data/chunks/sample2_chunk0/frame_0.jpg: 384x640 1 person, 1 dog, 1 bear, 66.7ms
Speed: 2.7ms preprocess, 66.7ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

image 1/1 /Users/abali/Documents/github projects/semantic-video-retrieval/data/chunks/sample2_chunk0/frame_1.jpg: 384x640 1 person, 1 dog, 1 bear, 55.6ms
Speed: 3.5ms preprocess, 55.6ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)
✅ Full e

In [10]:
for chunk in chunks[:2]:
    print(f"\n🧩 {chunk['chunk_id']}")
    print(f"🖼️  Captions: {chunk['captions']}")
    print(f"🔍 Objects: {chunk['objects']}")
    print(f"🧠 Caption Entities: {chunk['caption_entities']}")
    print(f"📦 All Entities (merged): {chunk['all_entities']}")



🧩 sample1_chunk0
🖼️  Captions: ['green grass on the ground', 'the trees are green']
🔍 Objects: ['truck', 'car']
🧠 Caption Entities: ['green grass', 'the trees', 'the ground']
📦 All Entities (merged): ['green grass', 'the trees', 'truck', 'the ground', 'car']

🧩 sample2_chunk0
🖼️  Captions: ['a person feeding a squirrel on the ground', 'a person feeding a squirrel on the ground']
🔍 Objects: ['dog', 'person', 'bear']
🧠 Caption Entities: ['a squirrel', 'the ground', 'a person']
📦 All Entities (merged): ['dog', 'bear', 'person', 'a squirrel', 'the ground', 'a person']


In [11]:
with open("/Users/abali/Documents/github projects/semantic-video-retrieval/embeddings/metadata/enriched_with_all.pkl", "rb") as f:
    chunks = pickle.load(f)

In [12]:
chunks

[{'chunk_id': 'sample1_chunk0',
  'video_id': 'sample1',
  'frame_paths': ['/Users/abali/Documents/github projects/semantic-video-retrieval/data/chunks/sample1_chunk0/frame_0.jpg',
   '/Users/abali/Documents/github projects/semantic-video-retrieval/data/chunks/sample1_chunk0/frame_1.jpg',
   '/Users/abali/Documents/github projects/semantic-video-retrieval/data/chunks/sample1_chunk0/frame_2.jpg',
   '/Users/abali/Documents/github projects/semantic-video-retrieval/data/chunks/sample1_chunk0/frame_3.jpg',
   '/Users/abali/Documents/github projects/semantic-video-retrieval/data/chunks/sample1_chunk0/frame_4.jpg'],
  'num_frames': 5,
  'start_frame': 0,
  'end_frame': 4,
  'captions': ['green grass on the ground', 'the trees are green'],
  'caption_entities': ['green grass', 'the trees', 'the ground'],
  'objects': ['truck', 'car'],
  'all_entities': ['green grass', 'the trees', 'truck', 'the ground', 'car']},
 {'chunk_id': 'sample2_chunk0',
  'video_id': 'sample2',
  'frame_paths': ['/User

## Graph Building

import pickle
import networkx as nx
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# === Load enriched chunks ===
with open("/Users/abali/Documents/github projects/semantic-video-retrieval/embeddings/metadata/enriched_with_all.pkl", "rb") as f:
    chunks = pickle.load(f)

# === Initialize graph and model ===
G = nx.Graph()
caption_model = SentenceTransformer('all-MiniLM-L6-v2')

# === Add nodes to graph ===
for chunk in chunks:
    G.add_node(chunk["chunk_id"], **chunk)  # All metadata goes in node

# === Helper to compute caption embedding ===
def embed_captions(captions):
    if not captions:
        return np.zeros((384,))
    caption_text = " ".join(captions)
    embedding = caption_model.encode(caption_text, normalize_embeddings=True)
    return embedding

# === Compute pairwise edges ===
alpha, beta = 0.6, 0.4
threshold = 0.3  # only add edges with weight > threshold

embeddings = {c["chunk_id"]: embed_captions(c["captions"]) for c in chunks}
entities = {c["chunk_id"]: set(c["all_entities"]) for c in chunks}

chunk_ids = [c["chunk_id"] for c in chunks]

for i in range(len(chunk_ids)):
    for j in range(i + 1, len(chunk_ids)):
        id1, id2 = chunk_ids[i], chunk_ids[j]
        
        emb_sim = cosine_similarity([embeddings[id1]], [embeddings[id2]])[0][0]
        
        ent1, ent2 = entities[id1], entities[id2]
        if ent1 or ent2:
            jaccard_sim = len(ent1 & ent2) / len(ent1 | ent2)
        else:
            jaccard_sim = 0.0
        
        final_weight = alpha * emb_sim + beta * jaccard_sim

        if final_weight > threshold:
            G.add_edge(id1, id2, weight=final_weight)

print(f"✅ Graph built: {len(G.nodes)} nodes, {len(G.edges)} edges")
