In [1]:
import os
import yt_dlp
import chromadb
from flask import Flask, request, jsonify
from flask_cors import CORS
from pathlib import Path
import hashlib
from transformers import BlipProcessor, BlipForConditionalGeneration, AutoTokenizer
from sentence_transformers import SentenceTransformer
import cv2
from PIL import Image
import subprocess
import re
import json


os.environ["TEMP"] = "D:/temp"
os.environ["TMP"] = "D:/temp"
os.environ["TRANSFORMERS_CACHE"] = "D:/whisper_cache"
os.environ["TORCH_HOME"] = "D:/whisper_cache"  # ✅ Forces PyTorch models to D drive
os.environ["HF_HOME"] = "D:/whisper_cache"  # ✅ Hugging Face cache location

import whisper

app = Flask(__name__)
CORS(app)

# # Directory setup (make sure these directories are on your D drive)
# VIDEO_DIR = "D:/rag/videos"
# AUDIO_DIR = "D:/rag/audios"
# KEYFRAMES_DIR = "D:/rag/keyframes"
# os.makedirs(VIDEO_DIR, exist_ok=True)
# os.makedirs(AUDIO_DIR, exist_ok=True)
# os.makedirs(KEYFRAMES_DIR, exist_ok=True)

# Get the current working directory (where the notebook is running)
CURRENT_DIR = os.getcwd()  # This will return the current directory where the notebook is located

# Set the data folder as a sibling to the current working directory
DATA_DIR = os.path.join(CURRENT_DIR, '..', 'data')  # ".." steps up one directory and looks for the "data" folder

# Define your directories under the 'data' folder
VIDEO_DIR = os.path.join(DATA_DIR, "videos")
AUDIO_DIR = os.path.join(DATA_DIR, "audios")
KEYFRAMES_DIR = os.path.join(DATA_DIR, "keyframes")
TRANSCRIPTS_DIR = os.path.join(DATA_DIR, "transcripts")
os.makedirs(VIDEO_DIR, exist_ok=True)
os.makedirs(AUDIO_DIR, exist_ok=True)
os.makedirs(KEYFRAMES_DIR, exist_ok=True)
os.makedirs(TRANSCRIPTS_DIR, exist_ok=True)

# Initialize Sentence-BERT for vectorization
sentence_model = SentenceTransformer('all-mpnet-base-v2')  # ✅ More robust retrieval model
print("Sentence-BERT model loaded.")

# Initialize ChromaDB
client = chromadb.PersistentClient(path="chromadb")
collection = client.get_or_create_collection(name="video_transcripts")
print("ChromaDB client initialized.")

# Initialize BLIP for frame descriptions
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
print("BLIP model loaded.")

# ✅ Load BERT Tokenizer for Sentence Splitting
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

def generate_video_id(video_url):
    """Generate a unique video ID using a hash of the URL."""
    return hashlib.md5(video_url.encode()).hexdigest()

def check_video_exists(video_id):
    """Check if the video transcript, metadata, and video file already exist in ChromaDB."""
    transcript_path = os.path.join(TRANSCRIPTS_DIR, f"{video_id}.txt")
    video_path = os.path.join(VIDEO_DIR, f"{video_id}.mp4")

    # ✅ Get all stored documents
    existing_data = collection.get()
    existing_ids = set(existing_data.get("ids", []))

    # ✅ Ensure all required components exist
    if f"{video_id}_0" in existing_ids and os.path.exists(transcript_path) and os.path.exists(video_path):
        print(f"✅ Video {video_id} already processed. Skipping...")
        return True  # ✅ Skip reprocessing

    return False  # ❌ Reprocess video

def download_video(video_url, video_id):
    """Download the full video only if it does not exist."""
    video_path = os.path.join(VIDEO_DIR, f"{video_id}.mp4")  # Use video_id for filename

    if os.path.exists(video_path):
        print(f"✅ Video already exists at {video_path}. Skipping download.")
        return video_path  # ✅ Skip re-downloading

    print(f"⬇️ Downloading video from: {video_url}")

    ydl_opts = {
        'outtmpl': video_path,  
        'format': 'best',
        'noplaylist': True,
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([video_url])

    print(f"✅ Video downloaded: {video_path}")
    return video_path

def sent_tokenize_regex(text):
    return re.split(r"(?<=[.!?])\s+", text)

def download_audio_from_video(video_id):
    """Extracts audio from the existing video file and converts it to .wav."""
    video_path = os.path.join(VIDEO_DIR, f"{video_id}.mp4")
    audio_path = os.path.join(AUDIO_DIR, f"{video_id}.wav")

    if os.path.exists(audio_path):
        print(f"✅ Audio already exists at {audio_path}. Skipping extraction.")
        return audio_path  # ✅ Skip re-extraction

    print(f"🎵 Extracting audio from: {video_path} -> {audio_path}")

    command = [
        "ffmpeg", "-i", video_path,  # ✅ Use corrected video file path
        "-vn",  # Disable video recording (extract only audio)
        "-acodec", "pcm_s16le",  # PCM audio codec for WAV format
        "-ar", "16000",  # Set audio sample rate to 16kHz
        "-ac", "1",  # Mono channel
        audio_path  # ✅ Save output to correct filename
    ]

    try:
        subprocess.run(command, check=True)
        print(f"✅ Audio extracted and saved as .wav: {audio_path}")
    except subprocess.CalledProcessError as e:
        print(f"❌ Error during audio extraction: {e}")
        return None  # ✅ Return None on failure

    return audio_path
def transcribe_audio(audio_path):
    """Transcribe audio using Whisper and store/reuse transcription with timestamps."""
    # Define the path to store the transcript
    transcript_path = os.path.join(TRANSCRIPTS_DIR, os.path.basename(audio_path).replace(".wav", ".json"))

    # Check if transcript already exists
    if os.path.exists(transcript_path):
        print(f"Loading existing transcript: {transcript_path}")
        # Load the existing transcript and transcript_times from the saved JSON file
        with open(transcript_path, "r", encoding="utf-8") as file:
            data = json.load(file)
            transcript = data["text"]
            transcript_times = data["timestamps"]
        return transcript, transcript_times  # Return both text and timestamps

    print(f"Transcribing audio: {audio_path}")
    result = whisper_model.transcribe(audio_path, word_timestamps=True)

    # Save the transcript and timestamps as a JSON file
    os.makedirs(TRANSCRIPTS_DIR, exist_ok=True)
    with open(transcript_path, "w", encoding="utf-8") as file:
        # Store both text and timestamps in the JSON file
        json.dump({
            "text": result["text"],
            "timestamps": [
                {"start": segment["start"], "end": segment["end"], "text": segment["text"]}
                for segment in result["segments"]
            ]
        }, file, ensure_ascii=False, indent=4)

    print(f"Transcript and timestamps saved to: {transcript_path}")
    return result["text"], [
        {"start": segment["start"], "end": segment["end"], "text": segment["text"]}
        for segment in result["segments"]
    ]

def extract_keyframes(video_path, interval=30):
    """Extract keyframes every `interval` seconds"""
    print(f"Extracting keyframes from video: {video_path}")
    cap = cv2.VideoCapture(video_path)
    frames = []
    frame_rate = cap.get(cv2.CAP_PROP_FPS)
    
    frame_count = 0
    success, frame = cap.read()
    while success:
        if frame_count % int(frame_rate * interval) == 0:
            frame_file = os.path.join(KEYFRAMES_DIR, f"frame_{frame_count}.jpg")
            cv2.imwrite(frame_file, frame)
            frames.append(frame_file)
        success, frame = cap.read()
        frame_count += 1
    cap.release()
    print(f"{len(frames)} keyframes extracted.")
    return frames

def generate_frame_description(frame_path):
    """Generate a description of the frame using BLIP"""
    print(f"Generating description for frame: {frame_path}")
    raw_image = Image.open(frame_path).convert("RGB")
    inputs = processor(raw_image, return_tensors="pt")
    out = model.generate(**inputs)
    description = processor.decode(out[0], skip_special_tokens=True)
    print(f"Description generated for frame: {frame_path}")
    return description

def sent_tokenize_regex(text):
    return re.split(r"(?<=[.!?])\s+", text)

# ✅ Improved Chunking using BERT Tokenization
def chunk_transcript(transcript, transcript_times, chunk_size=200, overlap=20):
    sentences = sent_tokenize_regex(transcript)  # ✅ Use Transformer-based tokenization
    chunks, chunk_timestamps = [], []
    
    temp_chunk, temp_words = [], 0
    start_time, end_time = None, None

    for sentence in sentences:
        words = sentence.split()
        temp_chunk.append(sentence)
        temp_words += len(words)

        if transcript_times:
            for seg in transcript_times:
                if seg["text"].strip() in sentence:
                    if not start_time:
                        start_time = seg["start"]
                    end_time = seg["end"]

        if temp_words >= chunk_size:
            chunks.append(" ".join(temp_chunk))
            chunk_timestamps.append({"start_time": start_time, "end_time": end_time})
            temp_chunk = temp_chunk[-overlap:]
            temp_words = sum(len(sent.split()) for sent in temp_chunk)
            start_time, end_time = None, None

    if temp_chunk:
        chunks.append(" ".join(temp_chunk))
        chunk_timestamps.append({"start_time": start_time, "end_time": end_time})

    return chunks, chunk_timestamps



def store_metadata(video_id, title, description, video_url, transcript, transcript_times, keyframe_descriptions):
    """Store video transcript metadata in ChromaDB as vectorized data."""
    print(f"🔍 Storing metadata for video: {title}")

    # ✅ Get existing metadata safely
    existing_metadata = collection.get(ids=[f"{video_id}_0"])
    
    if existing_metadata and "documents" in existing_metadata and existing_metadata["documents"]:
        print(f"✅ Metadata already exists for {title}. Checking metadata...")

        # ✅ Fix: Check if metadata exists properly
        existing_metadatas = existing_metadata.get("metadatas", None)
        if existing_metadatas:
            print("✅ Metadata exists! Skipping processing.")
            return
        else:
            print("⚠️ Warning: Metadata is missing! Re-storing metadata...")

    # ✅ Ensure transcript chunks exist before adding to ChromaDB
    chunks, chunk_timestamps = chunk_transcript(transcript, transcript_times)

    if not chunks:
        print("❌ Error: No transcript chunks available for storage.")
        return

    # ✅ Fix: Replace `None` with empty strings or default values
    chunk_metadatas = [
        {
            "video_url": video_url or "Unknown",  # ✅ Ensure `video_url` is not None
            "start_time": chunk_timestamps[i]["start_time"] if chunk_timestamps[i]["start_time"] is not None else 0,
            "end_time": chunk_timestamps[i]["end_time"] if chunk_timestamps[i]["end_time"] is not None else 0
        }
        for i in range(len(chunks))
    ]

    # ✅ Check if embeddings already exist before inserting
    existing_ids = set(collection.get().get("ids", []))  
    new_ids = [f"{video_id}_{i}" for i in range(len(chunks))]

    # ✅ Filter out IDs that already exist
    unique_chunks, unique_metadatas, unique_ids = [], [], []
    for i, chunk_id in enumerate(new_ids):
        if chunk_id not in existing_ids:
            unique_chunks.append(chunks[i])
            unique_metadatas.append(chunk_metadatas[i])
            unique_ids.append(chunk_id)

    if unique_chunks:
        print(f"✅ Adding {len(unique_chunks)} new transcript chunks to ChromaDB.")
        chunk_vectors = sentence_model.encode(unique_chunks)

        collection.add(
            documents=unique_chunks,
            metadatas=unique_metadatas,  # ✅ No `None` values now
            embeddings=chunk_vectors,
            ids=unique_ids,
        )
    else:
        print("✅ No new transcript chunks to add.")

    print(f"✅ Metadata successfully stored for {title}.")
    print(f"✅ Storing metadata for {video_id}. Metadata: {chunk_metadatas}")

def retrieve_answer(query, confidence_threshold=0.5):  # 🔽 Lowered threshold
    query_vector = sentence_model.encode(query)

    results = collection.query(
        query_embeddings=[query_vector],
        n_results=10  
    )

    print("🔍 Retrieved Documents:", results.get("documents", []))
    print("🔍 Retrieved Metadata:", results.get("metadatas", []))
    print("🔍 Retrieved Similarity Scores:", results.get("distances", []))

    if not results or not results.get("documents") or not results.get("metadatas"):
        return "❌ No relevant results found."

    response = []
    for doc_list, metadata_list, score_list in zip(results["documents"], results["metadatas"], results["distances"]):
        for doc, metadata, score in zip(doc_list, metadata_list or [{}], score_list):
            similarity = score  # ✅ Directly use ChromaDB similarity

            print(f"🔍 Checking Score: {similarity} for Document: {doc[:100]}...")

            if similarity < confidence_threshold:  # ✅ Allow lower threshold
                continue  

            video_url = metadata.get("video_url", "Unknown Video")
            start_time = metadata.get("start_time", "Unknown Time")
            response.append((similarity, f"🎬 Video URL: {video_url} ⏳ Start Time: {start_time}s\n📜 Excerpt: {doc}"))

    print("🔍 Filtered Responses:", response)

    return response[0][1] if response else "❌ No high-confidence results found."



def clear_chromadb():
    """Completely reset ChromaDB by deleting and recreating the collection."""
    global collection  # Ensure we're modifying the global variable

    try:
        # ✅ Delete the entire collection
        client.delete_collection(name="video_transcripts")
        print("✅ ChromaDB collection deleted.")

        # ✅ Recreate the collection (this resets the embedding dimension)
        collection = client.get_or_create_collection(name="video_transcripts")
        print("✅ New ChromaDB collection created with fresh settings.")

    except Exception as e:
        print(f"❌ Error while clearing ChromaDB: {e}")


@app.route("/query", methods=["POST"])
def query():
    data = request.get_json()
    queries = data.get("queries", [])  # Accepting a list of queries
    responses = []

    # Loop through each query and retrieve the answer
    for query in queries:
        response = retrieve_answer(query)
        responses.append(response)

    return jsonify({"responses": responses})  # Return list of responses


if __name__ == "__main__":
    firstiter=0
    video_links = [
        "https://www.youtube.com/watch?v=Kf57KGwKa0w",
        "https://www.youtube.com/watch?v=ftDsSB3F5kg",
        "https://www.youtube.com/watch?v=kKFrbhZGNNI",
        "https://www.youtube.com/watch?v=6qUxwZcTXHY",
        "https://www.youtube.com/watch?v=MspNdsh0QcM"
    ]
    # video_links = [
    #     "https://www.youtube.com/watch?v=ftDsSB3F5kg"
    # ]
    #clear_chromadb()
    for link in video_links:

        video_id = generate_video_id(link)  # Generate a unique ID for the video
    
        if check_video_exists(video_id):
            print(f"Skipping {link} (Already Processed)")
            continue  # Skip processing if transcript exists

        # ✅ Pass both `link` (video_url) and `video_id`
        video_path = download_video(link, video_id)
        print(f"Downloaded video at: {video_path}")

        # Extract and convert audio from the downloaded video
        audio_path = download_audio_from_video(video_id)
        if audio_path:
            print(f"Audio extracted and converted to .wav at: {audio_path}")
        else:
            print(f"Failed to extract audio for {link}. Skipping.")
            continue
        
        # Transcribe audio
        if(firstiter==0):
            whisper_model = whisper.load_model("medium") #Initialize Whisper model
            print("Whisper model loaded.")
            firstiter=1
        transcript, transcript_times = transcribe_audio(audio_path)
        
        # Extract keyframes from the video
        keyframes = extract_keyframes(video_path)
        
        # Generate descriptions for keyframes
        keyframe_descriptions = [generate_frame_description(frame) for frame in keyframes]
        
        # Store all metadata in ChromaDB
        store_metadata(video_id, link, "Description placeholder", link, transcript, transcript_times, keyframe_descriptions)
        
        print(f"Processed and stored metadata for: {link}")

    # Example User Queries
    # test_queries = [
    # "वीडियो बनाने के लिए एक अच्छी टीम क्यों जरूरी होती है?",
    # "अच्छे निर्देशन के लिए क्या चीजें जरूरी हैं?",
    # "वीडियो एडिटिंग में सबसे महत्वपूर्ण क्या होता है?"
    # ]

    # # Run the retriever on each query
    # for query in test_queries:
    #     print(f"🔍 **User Query:** {query}\n")
    #     print(retrieve_answer(query))
    #     print("=" * 80)
    app.run(host="0.0.0.0", port=8084, debug=False)

  from .autonotebook import tqdm as notebook_tqdm


Sentence-BERT model loaded.
ChromaDB client initialized.
BLIP model loaded.
✅ Video already exists at c:\Users\aryan\Desktop\Mindflix_AI_Aryan_Jain_B22092\backend\..\data\videos\482c5328669da57f350550cdf4c10c3d.mp4. Skipping download.
Downloaded video at: c:\Users\aryan\Desktop\Mindflix_AI_Aryan_Jain_B22092\backend\..\data\videos\482c5328669da57f350550cdf4c10c3d.mp4
✅ Audio already exists at c:\Users\aryan\Desktop\Mindflix_AI_Aryan_Jain_B22092\backend\..\data\audios\482c5328669da57f350550cdf4c10c3d.wav. Skipping extraction.
Audio extracted and converted to .wav at: c:\Users\aryan\Desktop\Mindflix_AI_Aryan_Jain_B22092\backend\..\data\audios\482c5328669da57f350550cdf4c10c3d.wav
Whisper model loaded.
Loading existing transcript: c:\Users\aryan\Desktop\Mindflix_AI_Aryan_Jain_B22092\backend\..\data\transcripts\482c5328669da57f350550cdf4c10c3d.json
Extracting keyframes from video: c:\Users\aryan\Desktop\Mindflix_AI_Aryan_Jain_B22092\backend\..\data\videos\482c5328669da57f350550cdf4c10c3d.mp4



KeyboardInterrupt: 