In [1]:
!pip install -q git+https://github.com/openai/whisper.git ffmpeg-python yt-dlp transformers sentencepiece scikit-learn weasyprint langid nltk


  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [2]:
import os
import re
import whisper
import langid
import nltk
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from weasyprint import HTML
from tqdm.auto import tqdm
from nltk.corpus import stopwords


In [3]:
import whisper
from transformers import pipeline, AutoTokenizer
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import os
import subprocess
from weasyprint import HTML

# Global models dictionary
_MODELS = {
    "whisper": None,
    "summarizer": None,
    "qa": None,
    "sentiment": None
}

def load_models():
    """Load all models once with progress tracking"""
    if not _MODELS["whisper"]:
        print("🔧 Loading Whisper speech-to-text model...")
        _MODELS["whisper"] = whisper.load_model("medium")

    if not _MODELS["summarizer"]:
        print("🔧 Loading BART Summarizer...")
        _MODELS["summarizer"] = pipeline("summarization",
                                      model="facebook/bart-large-cnn")

    if not _MODELS["qa"]:
        print("🔧 Loading FLAN-T5 for Q&A...")
        _MODELS["qa"] = pipeline("text2text-generation",
                             model="google/flan-t5-large")

    if not _MODELS["sentiment"]:
        print("🔧 Loading Sentiment Analyzer...")
        _MODELS["sentiment"] = pipeline("sentiment-analysis")



In [4]:
class MeetingProcessor:
    def __init__(self):
        load_models()
        nltk.download('stopwords', quiet=True)
        self.stopwords = set(stopwords.words('english')).union({
            'like', 'just', 'really', 'okay', 'um', 'uh', 'yeah'
        })
        self.tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
        self.enable_advanced = False  # Advanced features off by default

    def enable_advanced_features(self, enable=True):
        """Toggle advanced analysis features on/off"""
        self.enable_advanced = enable

    def youtube_to_text(self, url):
        """Convert YouTube video to cleaned transcript"""
        print("🎥 Processing YouTube video...")

        try:
            # Download audio using subprocess
            subprocess.run(
                ['yt-dlp', '-f', 'bestaudio', url, '-o', 'audio_temp.mp4', '--quiet'],
                check=True
            )
            subprocess.run(
                ['ffmpeg', '-y', '-i', 'audio_temp.mp4', '-ar', '16000',
                 '-ac', '1', '-c:a', 'pcm_s16le', 'audio_temp.wav', '-v', 'quiet'],
                check=True
            )

            # Transcribe
            result = _MODELS["whisper"].transcribe("audio_temp.wav")

            # Clean up temporary files
            if os.path.exists("audio_temp.mp4"):
                os.remove("audio_temp.mp4")
            if os.path.exists("audio_temp.wav"):
                os.remove("audio_temp.wav")

            # Merge segments into paragraphs
            transcript = []
            current_para = []
            for seg in result['segments']:
                if len(current_para) > 0 and seg['start'] - current_para[-1]['end'] > 2.0:
                    transcript.append({
                        'start': current_para[0]['start'],
                        'end': current_para[-1]['end'],
                        'text': ' '.join([s['text'] for s in current_para])
                    })
                    current_para = []
                current_para.append(seg)

            if current_para:
                transcript.append({
                    'start': current_para[0]['start'],
                    'end': current_para[-1]['end'],
                    'text': ' '.join([s['text'] for s in current_para])
                })

            return transcript

        except Exception as e:
            print(f"❌ Error processing video: {e}")
            if os.path.exists("audio_temp.mp4"):
                os.remove("audio_temp.mp4")
            if os.path.exists("audio_temp.wav"):
                os.remove("audio_temp.wav")
            return []

    def analyze_meeting(self, text, summary_level='detailed', style='bullet'):
        """Run analysis with optional advanced features"""
        print("\n🔍 Analyzing content...")

        base_results = {
            'summary': self._summarize(text, summary_level, style),
            'keywords': self._extract_keywords(text),
            'actions': self._extract_actions(text),
            'sentiment': self._analyze_sentiment(text),
        }

        if self.enable_advanced:
            print("🔧 Running advanced analysis...")
            base_results.update({
                'discussion_topics': self._extract_discussion_topics(text),
                'decisions': self._extract_decisions(text),
                'participant_sentiments': self._analyze_participant_sentiments(text),
                'quotes': self._extract_quotes(text)
            })

        return base_results

    # ===== ADVANCED ANALYSIS METHODS =====
    def _extract_discussion_topics(self, text):
        """Extract main discussion topics from text"""
        prompt = f"""Extract 3-5 main discussion topics from this meeting:
                    {text[:3000]}... [truncated]
                    Format as bullet points"""
        return _MODELS["qa"](prompt, max_length=200)[0]['generated_text']

    def _extract_decisions(self, text):
        """Extract decisions made from text"""
        prompt = f"""List key decisions made in this meeting:
                    {text[:3000]}... [truncated]
                    Format as bullet points"""
        return _MODELS["qa"](prompt, max_length=200)[0]['generated_text']

    def _analyze_participant_sentiments(self, text):
        """Analyze sentiment by participant"""
        prompt = f"""Analyze participant sentiments from:
                    {text[:3000]}... [truncated]
                    Format as: [Name]: [Sentiment] [Confidence%]"""
        analysis = _MODELS["qa"](prompt, max_length=400)[0]['generated_text']

        participants = {}
        for line in analysis.splitlines():
            if ':' in line and '%' in line:
                try:
                    name, rest = line.split(':', 1)
                    sentiment, confidence = rest.strip().rsplit(' ', 1)
                    participants[name.strip()] = (
                        sentiment.strip(),
                        float(confidence.strip('%'))/100
                    )
                except:
                    continue
        return participants

    def _extract_quotes(self, text):
        """Extract notable quotes from the meeting"""
        prompt = f"""Extract 3-5 notable quotes from this meeting:
                    {text[:3000]}... [truncated]
                    Format one quote per line"""
        quotes = _MODELS["qa"](prompt, max_length=300)[0]['generated_text']
        return [q.strip() for q in quotes.splitlines() if q.strip()]

    # ===== CORE ANALYSIS METHODS =====
    def _summarize(self, text, mode, style):
        """Generate formatted summary with dynamic length calculation"""
        chunks = self._chunk_text(text)
        summaries = []

        for chunk in tqdm(chunks, desc=f"Summarizing ({mode})"):
            input_length = len(self.tokenizer(chunk)['input_ids'])

            if mode == 'short':
                max_len = max(30, int(input_length * 0.3))
                min_len = max(10, int(input_length * 0.15))
            elif mode == 'medium':
                max_len = max(60, int(input_length * 0.5))
                min_len = max(30, int(input_length * 0.25))
            else:  # detailed
                max_len = max(100, int(input_length * 0.7))
                min_len = max(50, int(input_length * 0.4))

            summary = _MODELS["summarizer"](
                chunk,
                max_length=max_len,
                min_length=min_len,
                do_sample=False
            )[0]['summary_text']
            summaries.append(summary)

        # Final summary
        final = _MODELS["summarizer"](
            ' '.join(summaries),
            max_length=max(50, int(len(' '.join(summaries).split())*0.6)),
            min_length=max(25, int(len(' '.join(summaries).split())*0.3)),
            do_sample=False
        )[0]['summary_text']

        return self._format_summary(final, style)

    def _format_summary(self, text, style):
        """Convert to requested format with fixed numbering"""
        # First split into sentences
        sentences = [s.strip() for s in re.split(r'(?<=[.!?]) +', text) if s.strip()]

        # Clean each sentence by removing any existing numbering
        cleaned_sentences = []
        for s in sentences:
            # Remove any existing numbering patterns (1., 2. etc.)
            cleaned = re.sub(r'^\d+\.\s*', '', s)
            # Remove any bullet points or other markers
            cleaned = re.sub(r'^[•♦›➢✓]\s*', '', cleaned)
            cleaned_sentences.append(cleaned.strip())

        # Apply the requested formatting style
        if style == 'bullet':
            return '\n'.join(f'• {s}' for s in cleaned_sentences)
        elif style == 'numbered':
            return '\n'.join(f'{i+1}. {s}' for i, s in enumerate(cleaned_sentences))
        elif style == 'highlight':
            return '\n'.join(f'✨ {s} ✨' for s in cleaned_sentences)
        elif style == 'executive':
            return 'KEY TAKEAWAYS:\n' + '\n'.join(f'✓ {s}' for s in cleaned_sentences)
        return '\n'.join(cleaned_sentences)

    def _extract_actions(self, text):
        """Identify action items with owners"""
        prompt = f"""Extract action items from:
                    {text[:3000]}... [truncated]
                    Format as: - [Owner] [Action] [Deadline]"""
        return _MODELS["qa"](prompt, max_length=300)[0]['generated_text']

    def _extract_keywords(self, text, top_n=15):
        """Get important terms with TF-IDF"""
        vectorizer = TfidfVectorizer(stop_words=list(self.stopwords),
                                   ngram_range=(1, 2))
        X = vectorizer.fit_transform([text])
        return [vectorizer.get_feature_names_out()[i]
               for i in X.toarray()[0].argsort()[-top_n:][::-1]]

    def _analyze_sentiment(self, text):
        """Enhanced sentiment analysis"""
        result = _MODELS["sentiment"](text[:1000])[0]
        pos_score = result['score'] if result['label'] == 'POSITIVE' else 1 - result['score']
        return ('NEUTRAL', pos_score) if abs(pos_score - 0.5) < 0.15 else (result['label'], result['score'])

    def _chunk_text(self, text, chunk_size=400):
        """Split text preserving sentence boundaries"""
        sentences = re.split(r'(?<=[.!?]) +', text)
        chunks = []
        current_chunk = []
        current_length = 0

        for sent in sentences:
            words = sent.split()
            if current_length + len(words) > chunk_size and current_chunk:
                chunks.append(' '.join(current_chunk))
                current_chunk = []
                current_length = 0
            current_chunk.append(sent)
            current_length += len(words)

        if current_chunk:
            chunks.append(' '.join(current_chunk))
        return chunks



In [5]:
def generate_pdf_report(results, transcript="", output_file="meeting_report.pdf",
                      summary_level="detailed", style="bullet"):
    """
    Generate adaptive PDF report with:
    - Blue title styling
    - Fixed numbering issues
    - Dynamic sections based on available data
    """
    sentiment_color = {
        'POSITIVE': '#2e7d32',
        'NEGATIVE': '#c62828',
        'NEUTRAL': '#f9a825'
    }.get(results['sentiment'][0], '#777')

    # Format action items
    action_items = [line.strip() for line in results['actions'].splitlines()
                   if line.strip() and not line.startswith("[Owner]")]

    # Format summary content
    def format_content(text, content_style):
        lines = [re.sub(r'^\d+\.\s*', '', line.strip())  # Remove existing numbers
                for line in text.splitlines() if line.strip()]
        if content_style == 'numbered':
            return "<br>".join(f"{i+1}. {line}" for i, line in enumerate(lines))
        elif content_style == 'bullet':
            return "<br>".join(f"• {line}" for line in lines)
        elif content_style == 'highlight':
            return "<br>".join(f"✨ {line} ✨" for line in lines)
        elif content_style == 'executive':
            return "<br>".join(f"✓ {line}" for line in lines)
        return "<br>".join(lines)

    # Generate dynamic sections
    advanced_sections = ""
    if 'discussion_topics' in results:
        advanced_sections += f"""
        <h2>Discussion Topics</h2>
        <div class="section">
            {format_content(results['discussion_topics'], 'bullet')}
        </div>"""

    if 'decisions' in results:
        advanced_sections += f"""
        <h2>Key Decisions</h2>
        <div class="section">
            {format_content(results['decisions'], 'bullet')}
        </div>"""

    if 'participant_sentiments' in results:
        advanced_sections += f"""
        <h2>Participant Sentiments</h2>
        <div class="section">
            {format_participant_sentiments(results['participant_sentiments'])}
        </div>"""

    if 'quotes' in results and results['quotes']:
        advanced_sections += f"""
        <h2>Highlighted Quotes</h2>
        <div class="section">
            {"".join(f'<div class="quote">"{q}"</div>' for q in results['quotes'])}
        </div>"""

    html = f"""
    <html>
    <head>
        <meta charset="utf-8">
        <style>
            body {{
                font-family: 'Segoe UI', sans-serif;
                margin: 30px;
                color: #333;
                line-height: 1.6;
            }}
            h1 {{
                color: #1a73e8;
                border-bottom: 2px solid #1a73e8;
                padding-bottom: 10px;
            }}
            h2 {{
                color: #1a73e8;
                margin-top: 25px;
                border-left: 4px solid #1a73e8;
                padding-left: 10px;
            }}
            .section {{
                background: #f8f9fa;
                padding: 15px 20px;
                border-radius: 6px;
                margin-bottom: 20px;
            }}
            .badge {{
                background: {sentiment_color};
                color: white;
                padding: 3px 10px;
                border-radius: 12px;
                font-weight: bold;
                display: inline-block;
            }}
            .keywords span {{
                background: #e8f0fe;
                color: #1a73e8;
                padding: 4px 12px;
                border-radius: 16px;
                display: inline-block;
                margin: 4px;
                font-size: 0.9em;
            }}
            .transcript {{
                font-family: monospace;
                background: #f1f3f4;
                padding: 15px;
                border-radius: 5px;
                white-space: pre-wrap;
                line-height: 1.4;
            }}
            .quote {{
                font-style: italic;
                color: #555;
                border-left: 3px solid #6fa8dc;
                padding-left: 15px;
                margin: 10px 0;
            }}
        </style>
    </head>
    <body>
        <h1>🤖 Meeting Intelligence Report</h1>
        <p><strong>Overall Sentiment:</strong> <span class="badge">{results['sentiment'][0]} ({results['sentiment'][1]:.0%})</span></p>

        <h2>Meeting Summary</h2>
        <div class="section">
            {format_content(results['summary'], style)}
        </div>

        {advanced_sections}

        <h2>Action Items</h2>
        <div class="section">
            <ol>{"".join(f"<li>{item}</li>" for item in action_items)}</ol>
        </div>

        <h2>Key Terms</h2>
        <div class="section keywords">
            {" ".join(f"<span>{kw}</span>" for kw in results['keywords'])}
        </div>

        {f'<h2>Complete Transcript</h2><div class="transcript">{transcript}</div>' if transcript else ''}
    </body>
    </html>
    """
    HTML(string=html).write_pdf(output_file)
    print(f"✅ Report saved as: {output_file}")

def format_participant_sentiments(sentiments):
    """Format participant sentiment analysis"""
    if not sentiments:
        return "<p>No participant-level analysis available.</p>"

    items = []
    for name, (sentiment, score) in sentiments.items():
        color = {
            'POSITIVE': '#2e7d32',
            'NEGATIVE': '#c62828',
            'NEUTRAL': '#f9a825'
        }.get(sentiment, '#777')
        items.append(f"""
        <div style="margin-bottom: 8px;">
            <strong>{name}:</strong>
            <span style="background: {color}; color: white; padding: 2px 8px; border-radius: 12px; font-size: 0.9em;">
                {sentiment} ({score:.0%})
            </span>
        </div>
        """)
    return "".join(items)

def generate_transcript_pdf(transcript, output_file="meeting_transcript.pdf"):
    """Generate transcript-only PDF"""
    html = f"""
    <html>
    <head>
        <meta charset="utf-8">
        <style>
            body {{
                font-family: monospace;
                margin: 30px;
                color: #333;
                background: #fff;
                white-space: pre-wrap;
                line-height: 1.4;
            }}
            h1 {{
                color: #1a73e8;
                border-bottom: 2px solid #1a73e8;
                padding-bottom: 10px;
            }}
            .transcript {{
                background: #f1f3f4;
                padding: 15px;
                border-radius: 5px;
                margin-top: 20px;
            }}
        </style>
    </head>
    <body>
        <h1>📝 Complete Transcript</h1>
        <div class="transcript">{transcript}</div>
    </body>
    </html>
    """
    HTML(string=html).write_pdf(output_file)
    print(f"📄 Transcript saved as: {output_file}")



In [6]:
!pip install langchain langchain-community sentence-transformers faiss-cpu huggingface_hub



In [7]:
processor = MeetingProcessor()

🔧 Loading Whisper speech-to-text model...
🔧 Loading BART Summarizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Device set to use cuda:0


🔧 Loading FLAN-T5 for Q&A...


Device set to use cuda:0
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


🔧 Loading Sentiment Analyzer...


Device set to use cuda:0


In [8]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from langchain.chains import RetrievalQA
#Rag Pipeline
# Extract transcript text from MeetingProcessor.youtube_to_text
transcript_data = processor.youtube_to_text("https://www.youtube.com/watch?v=U5HvuKEjH6g")

# Join all segments into one long string
full_text = " ".join(seg["text"] for seg in transcript_data if "text" in seg)

# Use the full_text variable generated by the youtube_to_text function
documents = [type('obj', (object,), {'page_content': full_text, 'metadata': {'source': 'transcript'}})()]

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
docs = text_splitter.split_documents(documents)
print(f"📝 Document split into {len(docs)} chunks.")

embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
print(f"🔎 Loading embedding model: {embedding_model_name}")
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)
print("📦 Creating vector store...")
db = FAISS.from_documents(docs, embeddings)
print("✅ Vector store is ready.")

model_id = "google/flan-t5-base"
print(f"🤖 Loading language model: {model_id} (this may take a moment...)")
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    repetition_penalty=1.2,
    no_repeat_ngram_size=3,
    max_new_tokens=256,
    device=0 if torch.cuda.is_available() else -1
)
llm = HuggingFacePipeline(pipeline=pipe)
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=db.as_retriever(),
    return_source_documents=True
)
print("🚀 RAG system is fully initialized and ready to answer questions.")

try:
    print("\n--- Testing RAG System ---")
    queries = [
        "What is Jack Ma’s main advice about spending time wisely?",
        "What mistakes does Jack Ma recommend avoiding while you're young",
        "What does Jack Ma say about competition and failure?",
        "How does Jack Ma recommend balancing work and personal development?"
    ]
    for query in queries:
        result = qa_chain.invoke({"query": query})
        print(f"\nQuery: {query}")
        print("Answer:", result["result"])
except Exception as e:
    print(f"\n❌ An error occurred while querying the RAG chain: {e}")

🎥 Processing YouTube video...
📝 Document split into 19 chunks.
🔎 Loading embedding model: sentence-transformers/all-MiniLM-L6-v2


  embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)


📦 Creating vector store...
✅ Vector store is ready.
🤖 Loading language model: google/flan-t5-base (this may take a moment...)


Device set to use cuda:0
  llm = HuggingFacePipeline(pipeline=pipe)
Token indices sequence length is longer than the specified maximum sequence length for this model (565 > 512). Running this sequence through the model will result in indexing errors


🚀 RAG system is fully initialized and ready to answer questions.

--- Testing RAG System ---

Query: What is Jack Ma’s main advice about spending time wisely?
Answer: He says, if you don't get better, you will be stuck.

Query: What mistakes does Jack Ma recommend avoiding while you're young
Answer: No money, no connections, no good grades

Query: What does Jack Ma say about competition and failure?
Answer: Jack Ma says competition is not about giant leaps. It's about small steps taken every day.

Query: How does Jack Ma recommend balancing work and personal development?
Answer: Helpful


In [10]:
!pip install pyTelegramBotAPI openai moviepy #put it up with others when you finish




In [11]:
def handle_file_upload(message):
    transcript_paragraphs = processor.youtube_to_text(video_url)
    full_text = '\n\n'.join([f"[{para['start']:.1f}s] {para['text']}" for para in transcript_paragraphs])
    re

In [20]:
import telebot
import os
from moviepy.editor import VideoFileClip
from telebot import types
import subprocess
import uuid
from yt_dlp import YoutubeDL
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter
import torch

API_TOKEN = '7030233167:AAGDX6fAbSva8vPCa9t-5GmMvDAmA_0NuS8'
bot = telebot.TeleBot(API_TOKEN)
chat_id_to_transcript = {}
chat_rag_sessions = {}
# Step 1: Welcome and Options
@bot.message_handler(commands=['start'])
def send_welcome(message):
    chat_id = message.chat.id
    bot.reply_to(message, (
        "🎉 Welcome to the Meeting Analysis Bot!\n\n"
        "This bot will help you wrap up your meetings by providing transcription, summarization, and Q&A.\n\n"
        "Would you like to send a YouTube link or upload a file?\n\n"
        "Please reply with:\n"
        "`youtube` - to send a YouTube video link\n"
        "`file` - to upload a video file"
    ), parse_mode='Markdown')

    bot.register_next_step_handler(message, handle_input_choice)


def handle_input_choice(message):
    choice = message.text.strip().lower()

    if choice == 'youtube' or choice == 'Youtube':
        bot.register_next_step_handler(message, handle_youtube_option)
    elif choice == 'file' or choice == 'File':
        bot.register_next_step_handler(message, handle_file_upload_option)
    else:
        bot.send_message(message.chat.id, "❌ Invalid choice. Please reply with `youtube` or `file`.")
        bot.register_next_step_handler(message, handle_input_choice)


# Step 2: Handle "Upload Video" Option
@bot.message_handler(func=lambda msg: msg.text == "File")
def handle_file_upload_option(message):
    bot.send_message(message.chat.id, "📤 Great! Please upload the video file now.")

# Step 3: Handle video or document upload
@bot.message_handler(content_types=['video', 'document'])
def handle_media(message):
    try:
        file_id = message.video.file_id if message.content_type == 'video' else message.document.file_id
        file_info = bot.get_file(file_id)
        downloaded_file = bot.download_file(file_info.file_path)

        os.makedirs("temp", exist_ok=True)
        input_path = f"temp/{file_info.file_unique_id}.mp4"
        with open(input_path, 'wb') as f:
            f.write(downloaded_file)

        bot.reply_to(message, "🔄 Converting video to audio...")

        # Convert to WAV
        clip = VideoFileClip(input_path)
        audio_path = input_path.replace(".mp4", ".wav")
        clip.audio.write_audiofile(audio_path)
        clip.close()

        bot.reply_to(message, "🧠 Transcribing meeting audio please wait (this will take some minutes)...")

        # Transcribe with Whisper
        result = _MODELS['whisper'].transcribe(audio_path)
        full_text = '\n\n'.join([f"[{para['start']:.1f}s] {para['text']}" for para in result])
        transcript = result["text"]

        # Clean up
        os.remove(input_path)
        os.remove(audio_path)


        bot.reply_to(message, f"✅ Transcription complete!")
        chat_id_to_transcript[message.chat.id] = transcript  # Save transcript for later
        start_config(message)  # Start the interactive config steps


    except Exception as e:
        bot.reply_to(message, f"❌ An error occurred: {e}")



# Step 1: When user selects "Send YouTube Link"
@bot.message_handler(func=lambda msg: msg.text == "YouTube")
def handle_youtube_option(message):
    bot.send_message(message.chat.id, "🔗 Please send the YouTube video link now.")
    bot.register_next_step_handler(message, process_youtube_link)

# Step 2: Process the link
def process_youtube_link(message):
    url = message.text.strip()
    chat_id = message.chat.id

    if not url.startswith("http"):
        bot.send_message(chat_id, "❌ That doesn’t look like a valid link. Please try again.")
        return

    try:
        bot.send_message(chat_id, "🧠 Transcribing meeting audio please wait (this will take some minutes)...")

        transcript_paragraphs = processor.youtube_to_text(url)
        full_text = '\n\n'.join([f"[{para['start']:.1f}s] {para['text']}" for para in transcript_paragraphs])


        bot.send_message(chat_id, f"✅ Transcription complete!")
        chat_id_to_transcript[message.chat.id] = full_text  # Save transcript for later
        start_config(message)  # Start the interactive config steps


    except Exception as e:
        bot.send_message(chat_id, f"❌ An error occurred: {str(e)}")

user_config = {}

@bot.message_handler(func=lambda m: m.text.startswith("✅ Transcription complete"))
def start_config(message):
    chat_id = message.chat.id
    user_config[chat_id] = {"transcript":chat_id_to_transcript.get(chat_id, "")}  # Save transcript for later

    bot.send_message(chat_id, "Enable advanced analysis, ex: like summarization? (yes/no) ")
    bot.register_next_step_handler(message, handle_advanced_analysis)

def handle_advanced_analysis(message):
    chat_id = message.chat.id
    user_config[chat_id]["advanced"] = message.text.strip().lower() == "yes"

    bot.send_message(chat_id, "📝 Summary Preferences:\nLevel (short/medium/detailed)")
    bot.register_next_step_handler(message, handle_summary_level)

def handle_summary_level(message):
    chat_id = message.chat.id
    level = message.text.strip().lower()
    user_config[chat_id]["summary_level"] = level if level else "detailed"

    bot.send_message(chat_id, "Style (bullet/numbered/executive/highlight)")
    bot.register_next_step_handler(message, handle_style)

def handle_style(message):
    chat_id = message.chat.id
    style = message.text.strip().lower()
    user_config[chat_id]["style"] = style if style else "bullet"

    bot.send_message(chat_id, "Include full transcript? (yes/no)")
    bot.register_next_step_handler(message, handle_include_transcript)

def handle_include_transcript(message):
    chat_id = message.chat.id
    user_config[chat_id]["include_transcript"] = message.text.strip().lower() == "yes"

    bot.send_message(chat_id, "Output filename (without extension)")
    bot.register_next_step_handler(message, handle_filename)

def handle_filename(message):
    chat_id = message.chat.id
    filename = message.text.strip() or "meeting_report"
    user_config[chat_id]["output_name"] = filename

    if not user_config[chat_id]["include_transcript"]:
        bot.send_message(chat_id, "Save transcript as separate file? (yes/no) ")
        bot.register_next_step_handler(message, handle_separate_transcript)
    else:
        finalize_analysis(message)

def handle_separate_transcript(message):
    chat_id = message.chat.id
    user_config[chat_id]["separate_transcript"] = message.text.strip().lower() == "yes"
    finalize_analysis(message)

def finalize_analysis(message):
    chat_id = message.chat.id
    config = user_config[chat_id]

    # 🧠 Here you would call your processing pipeline:
    results = processor.analyze_meeting(
        config["transcript"],
        config["summary_level"],
        config["style"]
    )

    if config["include_transcript"]:
        bot.send_message(chat_id, "Generating your pdf...")
        output_file = f"{config['output_name']}.pdf"  # ✅ Define this

        generate_pdf_report(
            results,
            transcript=config["transcript"],
            output_file=f"{config['output_name']}_full.pdf",
            summary_level=config["summary_level"],
            style=config["style"]
        )
    else:
        bot.send_message(chat_id, "Generating your pdf...")
        output_file = f"{config['output_name']}.pdf"  # ✅ Define this

        generate_pdf_report(
            results,
            output_file=f"{config['output_name']}.pdf",
            summary_level=config["summary_level"],
            style=config["style"]
        )
        if config.get("separate_transcript"):
            generate_transcript_pdf(config["transcript"], f"{config['output_name']}_transcript.pdf")

    bot.send_message(chat_id, "✅ Analysis complete! Report is generated.")
    with open(output_file, "rb") as pdf_file:
      bot.send_document(chat_id, pdf_file)
    initialize_rag_pipeline(message)




def initialize_rag_pipeline(message):
    try:
        chat_id = message.chat.id
        bot.send_message(chat_id,"\n🧠 Initializing RAG pipeline...")

        # Step 1: Document creation and splitting
        chat_id = message.chat.id
        transcript_text = chat_id_to_transcript.get(chat_id, "")
        documents = [type('Doc', (), {'page_content': transcript_text, 'metadata': {'source': 'transcript'}})()]
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
        docs = text_splitter.split_documents(documents)
        print(f"📝 Document split into {len(docs)} chunks.")

        # Step 2: Embedding model and FAISS vector store
        embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
        print(f"🔎 Loading embedding model: {embedding_model_name}")
        embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)
        db = FAISS.from_documents(docs, embeddings)
        print("✅ Vector store is ready.")

        # Step 3: Load the LLM pipeline
        model_id = "google/flan-t5-base"
        bot.send_message(chat_id,f"🤖 Loading language model: {model_id}")
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_id).to("cuda" if torch.cuda.is_available() else "cpu")
        hf_pipe = pipeline(
            "text2text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=256,
            repetition_penalty=1.2,
            no_repeat_ngram_size=3,
            device=0 if torch.cuda.is_available() else -1
        )
        llm = HuggingFacePipeline(pipeline=hf_pipe)

        # Step 4: Create RAG chain
        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=db.as_retriever(),
            return_source_documents=True
        )
        bot.send_message(chat_id,"🚀 RAG system is fully initialized.")
        bot.send_message(chat_id,"Send ask_command if you have any questions regarding the meeting.")
        chat_rag_sessions[chat_id] = qa_chain
        return qa_chain

    except Exception as e:
        bot.send_message(chat_id,f"\n❌ An error occurred while initializing RAG: {e}")
        return None

@bot.message_handler(commands=['ask'])
def handle_question(message):
    chat_id = message.chat.id
    qa_chain = chat_rag_sessions.get(chat_id)

    if not qa_chain:
        bot.reply_to(message, "❗️Please analyze a meeting first before asking questions.")
        return

    bot.reply_to(message, "💬 Ask your question about the meeting:")
    bot.register_next_step_handler(message, process_user_question)


def process_user_question(message):
    chat_id = message.chat.id
    question = message.text
    qa_chain = chat_rag_sessions.get(chat_id)

    if not qa_chain:
        bot.reply_to(message, "⚠️ RAG session not initialized.")
        return

    try:
        result = qa_chain.invoke({"query": question})
        answer = result["result"]
        bot.send_message(chat_id, f"📌 Answer: {answer}")
        bot.send_message(chat_id,"Send ask_command if you have any questions regarding the meeting.")

    except Exception as e:
        bot.send_message(chat_id, f"❌ Error while answering: {str(e)}")






bot.polling()


🎥 Processing YouTube video...
