In [None]:
import os
import yt_dlp
import chromadb
from flask import Flask, request, jsonify
from flask_cors import CORS
from pathlib import Path
import hashlib
import json
import re
import subprocess
import cv2
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration, AutoTokenizer
from sentence_transformers import SentenceTransformer
import whisper


class VideoRAG:
    """Video retrieval pipeline: download, transcribe, index and query videos.

    For every URL in ``video_links`` the pipeline downloads the video with
    yt-dlp, extracts a 16 kHz mono WAV with ffmpeg, transcribes it with
    Whisper, chunks the transcript, embeds the chunks with a
    SentenceTransformer model and stores them in a ChromaDB collection.
    ``retrieve_answer`` then looks up the best-matching chunk for a query.
    """

    # Used when the container does not report a usable frame rate.
    # NOTE(review): 30 fps is an assumption, not read from the video.
    _FALLBACK_FPS = 30.0

    def __init__(self, video_links, chromadb_path="chromadb"):
        """Initialize directories, models, and database connection.

        Args:
            video_links: Iterable of video URLs to process.
            chromadb_path: Directory of the persistent ChromaDB store.
        """
        self.video_links = video_links

        # The current working directory stands in for the backend directory
        # because `__file__` is not available inside a Jupyter notebook.
        self.backend_dir = os.getcwd()

        # `data/` is expected to be a sibling of the backend directory.
        self.data_dir = os.path.abspath(os.path.join(self.backend_dir, "..", "data"))

        # One sub-directory per artifact type inside `data/`.
        self.video_dir = os.path.join(self.data_dir, "videos")
        self.audio_dir = os.path.join(self.data_dir, "audios")
        self.keyframes_dir = os.path.join(self.data_dir, "keyframes")
        self.transcripts_dir = os.path.join(self.data_dir, "transcripts")

        for directory in [self.video_dir, self.audio_dir, self.keyframes_dir, self.transcripts_dir]:
            os.makedirs(directory, exist_ok=True)

        # Models are loaded once here and reused by every method.
        self.sentence_model = SentenceTransformer("all-mpnet-base-v2")
        self.processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
        self.caption_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
        self.tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
        self.whisper_model = whisper.load_model("medium")

        # Persistent vector store holding the transcript chunks.
        self.client = chromadb.PersistentClient(path=chromadb_path)
        self.collection = self.client.get_or_create_collection(name="video_transcripts")

        # Print the resolved paths so a wrong working directory is easy to spot.
        print(f"✅ DATA_DIR: {self.data_dir}")
        print(f"✅ VIDEO_DIR: {self.video_dir}")
        print(f"✅ AUDIO_DIR: {self.audio_dir}")
        print(f"✅ KEYFRAMES_DIR: {self.keyframes_dir}")
        print(f"✅ TRANSCRIPTS_DIR: {self.transcripts_dir}")
        print("✅ VideoRAG initialized successfully.")

    def generate_video_id(self, video_url):
        """Return the MD5 hex digest of ``video_url``, used as a stable file/record ID."""
        return hashlib.md5(video_url.encode()).hexdigest()

    def check_video_exists(self, video_id):
        """Return True if the video file, its transcript and its first chunk all exist.

        Args:
            video_id: ID produced by ``generate_video_id``.
        """
        transcript_path = os.path.join(self.transcripts_dir, f"{video_id}.json")
        video_path = os.path.join(self.video_dir, f"{video_id}.mp4")

        # Cheap filesystem checks first; skip the database when they fail.
        if not (os.path.exists(transcript_path) and os.path.exists(video_path)):
            return False

        # Ask only for the first chunk's ID instead of loading the whole collection.
        existing = self.collection.get(ids=[f"{video_id}_0"])
        return bool(existing.get("ids"))

    def download_video(self, video_url, video_id):
        """Download the video unless its file is already present.

        Returns:
            Path of the video file inside ``self.video_dir``.
        """
        video_path = os.path.join(self.video_dir, f"{video_id}.mp4")

        if os.path.exists(video_path):
            print(f"✅ Video exists: {video_path}. Skipping download.")
            return video_path

        print(f"⬇️ Downloading video: {video_url}")
        ydl_opts = {"outtmpl": video_path, "format": "best", "noplaylist": True}
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([video_url])

        return video_path

    def extract_audio(self, video_id):
        """Extract the audio track of a downloaded video with ffmpeg.

        The output is 16-bit PCM, 16 kHz, mono. An existing WAV file is reused.

        Returns:
            Path of the WAV file inside ``self.audio_dir``.

        Raises:
            subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
        """
        video_path = os.path.join(self.video_dir, f"{video_id}.mp4")
        audio_path = os.path.join(self.audio_dir, f"{video_id}.wav")

        if os.path.exists(audio_path):
            return audio_path

        command = ["ffmpeg", "-i", video_path, "-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1", audio_path]
        subprocess.run(command, check=True)
        return audio_path

    def transcribe_audio(self, audio_path):
        """Transcribe audio using Whisper, reusing a cached transcript if available.

        Returns:
            Tuple ``(text, timestamps)`` where ``timestamps`` is a list of
            ``{"start", "end", "text"}`` dicts, one per Whisper segment.
        """
        # Swap only the extension; a plain str.replace would also rewrite a
        # ".wav" that happens to appear earlier in the file name.
        stem = os.path.splitext(os.path.basename(audio_path))[0]
        transcript_path = os.path.join(self.transcripts_dir, f"{stem}.json")

        print(f"Checking for existing transcript at: {transcript_path}")  # Debug log

        if os.path.exists(transcript_path):
            print(f"✅ Transcript exists, loading: {transcript_path}")
            with open(transcript_path, "r", encoding="utf-8") as file:
                data = json.load(file)
            return data["text"], data["timestamps"]

        print(f"{transcript_path} was not present.")
        print(f"🚀 Transcribing audio: {audio_path}")  # Only logs if transcription is happening
        result = self.whisper_model.transcribe(audio_path, word_timestamps=True)

        transcript_data = {
            "text": result["text"],
            "timestamps": [
                {"start": seg["start"], "end": seg["end"], "text": seg["text"]}
                for seg in result["segments"]
            ],
        }

        # Ensure directory exists before saving
        os.makedirs(self.transcripts_dir, exist_ok=True)

        with open(transcript_path, "w", encoding="utf-8") as file:
            json.dump(transcript_data, file, ensure_ascii=False, indent=4)

        print(f"✅ Transcript saved to: {transcript_path}")
        return result["text"], transcript_data["timestamps"]

    def extract_keyframes(self, video_id, interval=30):
        """Save one frame every ``interval`` seconds as a JPEG.

        Args:
            video_id: ID of a video already present in ``self.video_dir``.
            interval: Seconds between saved frames.

        Returns:
            List of paths of the written frame files (empty if no frame
            could be read).
        """
        video_path = os.path.join(self.video_dir, f"{video_id}.mp4")
        cap = cv2.VideoCapture(video_path)
        frames = []
        try:
            frame_rate = cap.get(cv2.CAP_PROP_FPS)
            # `not x > 0` is also true for NaN, which some containers report.
            if not frame_rate > 0:
                frame_rate = self._FALLBACK_FPS
            # Never let the step reach 0: `n % 0` raises ZeroDivisionError.
            step = max(1, int(frame_rate * interval))

            frame_count = 0
            success, frame = cap.read()
            while success:
                if frame_count % step == 0:
                    frame_file = os.path.join(self.keyframes_dir, f"{video_id}_frame_{frame_count}.jpg")
                    cv2.imwrite(frame_file, frame)
                    frames.append(frame_file)
                success, frame = cap.read()
                frame_count += 1
        finally:
            # Release the capture handle even if reading or writing fails.
            cap.release()
        return frames

    def generate_frame_description(self, frame_path):
        """Return a BLIP-generated caption for the image at ``frame_path``."""
        # The context manager closes the file; convert() returns an
        # independent in-memory copy that stays valid afterwards.
        with Image.open(frame_path) as image_file:
            raw_image = image_file.convert("RGB")
        inputs = self.processor(raw_image, return_tensors="pt")
        out = self.caption_model.generate(**inputs)
        return self.processor.decode(out[0], skip_special_tokens=True)

    @staticmethod
    def _as_list(vector):
        """Return ``vector`` as plain Python lists when it offers ``tolist()``."""
        return vector.tolist() if hasattr(vector, "tolist") else vector

    @staticmethod
    def _sentence_span(sentence, segments):
        """Return ``(start, end)`` of the segments whose text occurs in ``sentence``.

        ``segments`` is a list of ``(text, start, end)`` tuples. Either value
        is None when no segment text is contained in the sentence.
        """
        start = end = None
        for text, seg_start, seg_end in segments:
            if text in sentence:
                # `is None`, not truthiness: a start of 0.0 is a valid time.
                if start is None:
                    start = seg_start
                end = seg_end
        return start, end

    @staticmethod
    def _window_times(window):
        """Return the first known start and last known end of a sentence window."""
        start = next((entry[2] for entry in window if entry[2] is not None), None)
        end = next((entry[3] for entry in reversed(window) if entry[3] is not None), None)
        return {"start_time": start, "end_time": end}

    @staticmethod
    def _overlap_tail(window, word_budget):
        """Return the trailing sentences of ``window`` totalling at most ``word_budget`` words."""
        tail, words = [], 0
        for entry in reversed(window):
            if words + entry[1] > word_budget:
                break
            tail.append(entry)
            words += entry[1]
        tail.reverse()
        return tail

    def chunk_transcript(self, transcript, transcript_times, chunk_size=200, overlap=20):
        """Split a transcript into overlapping, sentence-aligned chunks.

        Sentences are accumulated until a chunk holds at least ``chunk_size``
        words. The next chunk then starts with the trailing sentences of the
        previous one, up to ``overlap`` words, so that context is shared
        between neighbours.

        Args:
            transcript: Full transcript text.
            transcript_times: List of ``{"start", "end", "text"}`` segment
                dicts (may be empty or None).
            chunk_size: Minimum number of words that closes a chunk.
            overlap: Maximum number of words carried over into the next chunk.

        Returns:
            Tuple ``(chunks, chunk_timestamps)``: a list of chunk strings and
            a parallel list of ``{"start_time", "end_time"}`` dicts. A time is
            None when no segment text could be matched to the chunk. Both
            lists are empty for an empty transcript.
        """
        sentences = [s for s in re.split(r"(?<=[.!?])\s+", transcript or "") if s.strip()]
        if not sentences:
            return [], []

        # Strip segment texts once; empty texts are dropped because "" is a
        # substring of every sentence and would match everywhere.
        segments = [
            (seg["text"].strip(), seg["start"], seg["end"])
            for seg in (transcript_times or [])
            if seg["text"].strip()
        ]

        # Keep the carried-over tail strictly smaller than a full chunk,
        # otherwise every following sentence would close another chunk.
        overlap_budget = max(0, min(overlap, chunk_size - 1))

        chunks, chunk_timestamps = [], []
        window = []  # entries: (sentence, word_count, start, end)
        window_words = 0
        has_new_content = False

        for sentence in sentences:
            start, end = self._sentence_span(sentence, segments)
            word_count = len(sentence.split())
            window.append((sentence, word_count, start, end))
            window_words += word_count
            has_new_content = True

            if window_words >= chunk_size:
                chunks.append(" ".join(entry[0] for entry in window))
                chunk_timestamps.append(self._window_times(window))
                window = self._overlap_tail(window, overlap_budget)
                window_words = sum(entry[1] for entry in window)
                has_new_content = False

        # Emit the remainder only if it holds more than carried-over overlap.
        if window and has_new_content:
            chunks.append(" ".join(entry[0] for entry in window))
            chunk_timestamps.append(self._window_times(window))

        return chunks, chunk_timestamps

    def store_metadata(self, video_id, video_url, transcript, transcript_times):
        """Chunk, embed and store a transcript in the ChromaDB collection.

        Each chunk is stored under the ID ``"{video_id}_{index}"`` with its
        video URL and, when known, its start and end time.
        """
        chunks, chunk_timestamps = self.chunk_transcript(transcript, transcript_times)
        if not chunks:
            print(f"⚠️ No transcript text to store for: {video_url}")
            return

        chunk_metadatas = []
        for times in chunk_timestamps:
            metadata = {"video_url": video_url}
            # Unknown times are omitted rather than stored as None;
            # retrieve_answer falls back to 'Unknown' for missing keys.
            metadata.update({key: value for key, value in times.items() if value is not None})
            chunk_metadatas.append(metadata)

        chunk_vectors = self._as_list(self.sentence_model.encode(chunks))
        chunk_ids = [f"{video_id}_{index}" for index in range(len(chunks))]
        # upsert so that re-processing a video overwrites its earlier chunks.
        self.collection.upsert(
            documents=chunks,
            metadatas=chunk_metadatas,
            embeddings=chunk_vectors,
            ids=chunk_ids,
        )

    def retrieve_answer(self, query, confidence_threshold=0.5):
        """Return the best-matching transcript excerpt for ``query``.

        Args:
            query: Natural-language question.
            confidence_threshold: Minimum confidence a hit needs to be returned.

        Returns:
            A formatted string with video URL, start time and excerpt of the
            most similar chunk, or a "no results" message when no hit reaches
            the threshold.
        """
        query_vector = self._as_list(self.sentence_model.encode(query))
        results = self.collection.query(query_embeddings=[query_vector], n_results=10)

        documents = results.get("documents") or []
        metadatas = results.get("metadatas") or []
        distances = results.get("distances") or []

        best_confidence, best_response = None, None
        for query_index, doc_list in enumerate(documents):
            metadata_list = (metadatas[query_index] if query_index < len(metadatas) else None) or []
            distance_list = (distances[query_index] if query_index < len(distances) else None) or []

            for hit_index, (doc, distance) in enumerate(zip(doc_list, distance_list)):
                # The collection returns distances (smaller = more similar),
                # so convert to a confidence before thresholding and ranking.
                # NOTE(review): 1 - d/2 equals cosine similarity assuming the
                # collection uses squared-L2 distance on unit-length
                # embeddings — confirm against the collection settings.
                confidence = 1.0 - distance / 2.0
                if confidence < confidence_threshold:
                    continue
                if best_confidence is not None and confidence <= best_confidence:
                    continue

                metadata = (metadata_list[hit_index] if hit_index < len(metadata_list) else None) or {}
                best_confidence = confidence
                best_response = f"🎬 Video URL: {metadata.get('video_url', 'Unknown')} ⏳ Start Time: {metadata.get('start_time', 'Unknown')}s\n📜 Excerpt: {doc}"

        if best_response is None:
            return "❌ No high-confidence results found."
        return best_response

    def process_videos(self):
        """Process every video in ``self.video_links`` that is not fully indexed yet."""
        for video_url in self.video_links:
            video_id = self.generate_video_id(video_url)
            if self.check_video_exists(video_id):
                print(f"✅ Skipping {video_url} (Already Processed)")
                continue
            print(f"🎬 Processing: {video_url}")
            self.download_video(video_url, video_id)
            audio_path = self.extract_audio(video_id)
            transcript, transcript_times = self.transcribe_audio(audio_path)
            keyframes = self.extract_keyframes(video_id)
            # NOTE(review): the captions are generated but not stored anywhere
            # yet; only the keyframe files on disk persist.
            keyframe_descriptions = [self.generate_frame_description(frame) for frame in keyframes]
            self.store_metadata(video_id, video_url, transcript, transcript_times)
            print(f"✅ Processed and stored metadata for: {video_url}")


# Flask application exposing the retrieval endpoint; CORS(app) is applied
# with its default settings.
app = Flask(__name__)
CORS(app)
# Videos to download, transcribe and index when this module is run/imported.
video_links = [
    "https://www.youtube.com/watch?v=Kf57KGwKa0w",
    "https://www.youtube.com/watch?v=ftDsSB3F5kg",
    "https://www.youtube.com/watch?v=kKFrbhZGNNI",
    "https://www.youtube.com/watch?v=6qUxwZcTXHY",
    "https://www.youtube.com/watch?v=MspNdsh0QcM"
]

# Constructing VideoRAG loads all models and opens the ChromaDB store;
# process_videos() then runs synchronously, before the server below starts.
video_rag = VideoRAG(video_links)
video_rag.process_videos()

@app.route("/query", methods=["POST"])
def query():
    """Answer a batch of queries.

    Expects a JSON object of the form ``{"queries": ["...", ...]}`` and
    returns ``{"responses": [...]}`` with one answer per query, in order.
    A missing ``queries`` key yields an empty response list. A body that is
    not a JSON object, or a ``queries`` value that is not a list, is rejected
    with HTTP 400 instead of failing with an unhandled exception.
    """
    payload = request.get_json()
    if not isinstance(payload, dict):
        return jsonify({"error": "Request body must be a JSON object."}), 400

    queries = payload.get("queries", [])
    # A bare string would otherwise be iterated character by character.
    if not isinstance(queries, list):
        return jsonify({"error": "'queries' must be a list of strings."}), 400

    return jsonify({"responses": [video_rag.retrieve_answer(q) for q in queries]})

# Start the Flask development server only when executed as a script.
# host="0.0.0.0" listens on all network interfaces, port 8084, debug off.
if __name__ == "__main__":
    app.run(host="0.0.0.0", port=8084, debug=False)


✅ DATA_DIR: c:\Users\aryan\Desktop\Mindflix_AI_Aryan_Jain_B22092\data
✅ VIDEO_DIR: c:\Users\aryan\Desktop\Mindflix_AI_Aryan_Jain_B22092\data\videos
✅ AUDIO_DIR: c:\Users\aryan\Desktop\Mindflix_AI_Aryan_Jain_B22092\data\audios
✅ KEYFRAMES_DIR: c:\Users\aryan\Desktop\Mindflix_AI_Aryan_Jain_B22092\data\keyframes
✅ TRANSCRIPTS_DIR: c:\Users\aryan\Desktop\Mindflix_AI_Aryan_Jain_B22092\data\transcripts
✅ VideoRAG initialized successfully.
✅ Skipping https://www.youtube.com/watch?v=Kf57KGwKa0w (Already Processed)
✅ Skipping https://www.youtube.com/watch?v=ftDsSB3F5kg (Already Processed)
✅ Skipping https://www.youtube.com/watch?v=kKFrbhZGNNI (Already Processed)
✅ Skipping https://www.youtube.com/watch?v=6qUxwZcTXHY (Already Processed)
🎬 Processing: https://www.youtube.com/watch?v=MspNdsh0QcM
✅ Video exists: c:\Users\aryan\Desktop\Mindflix_AI_Aryan_Jain_B22092\data\videos\2aab073be753413b4a9c63d8b3b25403.mp4. Skipping download.
Checking for existing transcript at: c:\Users\aryan\Desktop\Mindfli

