In [None]:
# @title Default title text
# Google Colab Flask Video Processor Setup
# This file is a notebook export: run each "CELL" section below in sequence.
# Lines starting with "!" are IPython shell escapes and only work inside a
# notebook runtime, not under a plain Python interpreter.

# ===== CELL 1: Install Dependencies (some pinned to specific versions) =====
# Web server + tunnelling.
!pip install flask flask-cors pyngrok
# Model loading and spreadsheet I/O.
!pip install transformers openpyxl pandas
# Bangla text normalizer, installed straight from its GitHub repository.
!pip install git+https://github.com/csebuetnlp/normalizer
# NOTE(review): the code further down imports `bnlp`; confirm this package
# actually provides that module (it is normally distributed as `bnlp_toolkit`).
!pip install banglanlptoolkit
!pip install moviepy
# Audio transcription stack, pinned to exact versions.
!pip install SpeechRecognition==3.10.0
!pip install pydub==0.25.1

# ffmpeg is installed as a system package rather than through pip.
!apt-get update
!apt-get install -y ffmpeg

# ===== CELL 2: Setup Ngrok (for public URL) =====
from pyngrok import ngrok
import getpass

# Prompt for the ngrok auth token with getpass so it is not echoed into the
# notebook output, then register it with pyngrok.
ngrok_token = getpass.getpass("Enter your ngrok auth token: ")
ngrok.set_auth_token(ngrok_token)

# ===== CELL 3: Fixed Flask Application =====
from flask import Flask, request, jsonify, send_file, render_template_string
from flask_cors import CORS
import os
import io
import json
import tempfile
import threading
import time
from datetime import datetime
import traceback
import zipfile
import re
import shutil

# Import libraries
import speech_recognition as sr
from tqdm.auto import tqdm
from pydub import AudioSegment
from pydub.utils import make_chunks
from moviepy.editor import VideoFileClip
import pandas as pd
import torch
import re
import gc
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from moviepy.editor import VideoFileClip
import math
from collections import Counter

# Try to import the BNLP toolkit (optional dependency).
#
# Whatever happens, all four names below end up bound, so later code can test
# `BasicTokenizer is None` / `BanglaStemmer is None` without risking a
# NameError when the toolkit is not installed.
try:
    from bnlp import BasicTokenizer
    try:
        from bnlp.stemmer import BanglaStemmer
    except ImportError:
        # The tokenizer is usable on its own; the stemmer is a bonus.
        BanglaStemmer = None
    TOKENIZER = BasicTokenizer()
    STEMMER = BanglaStemmer() if BanglaStemmer else None
except ImportError:
    BasicTokenizer = None
    BanglaStemmer = None
    TOKENIZER = None
    STEMMER = None




# Enable tqdm's pandas integration (progress bars for pandas operations).
tqdm.pandas()

# Mount Google Drive. The stopword file, the input document folder and the
# Excel output used further down all live under /content/drive/MyDrive.
# This import only resolves inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

def split_into_sentences(text):
    """Split Bangla text into sentences.

    Whitespace runs are collapsed to single spaces, the text is split on runs
    of sentence terminators (the Bangla danda U+0964, ``?`` and ``!``), and
    fragments of three characters or fewer are discarded.

    Args:
        text: The text to split. Anything that is not a non-empty ``str``
            yields an empty list.

    Returns:
        list[str]: The stripped sentences, in their original order.
    """
    if not text or not isinstance(text, str):
        return []

    # Collapse all whitespace (including newlines) to single spaces.
    collapsed = ' '.join(text.split())

    # The danda is written as an escape so the pattern cannot be corrupted by
    # the file being re-saved in the wrong encoding.
    fragments = (fragment.strip() for fragment in re.split(r'[\u0964?!]+', collapsed))

    # Keep only fragments longer than the minimum sentence length.
    return [fragment for fragment in fragments if len(fragment) > 3]

class EfficientCombinedTFIDF:
    """TF-IDF over pre-normalized unigrams and bigrams with PMI-based selection.

    Documents are added one at a time as already-tokenized, already-normalized
    lists. For every document the class keeps a ``Counter`` of unigrams and one
    of bigrams, plus corpus-wide document frequencies. When keywords are
    extracted, each bigram is judged by its within-document PMI and frequency:
    strongly associated bigrams replace their constituent unigrams, moderately
    associated ones are kept alongside them, and the rest are dropped.

    Bigrams are represented as two words joined by a single space.
    """

    def __init__(self):
        """Create an empty corpus."""
        self.document_unigram_counts = []  # One Counter of unigrams per document
        self.document_bigram_counts = []   # One Counter of bigrams per document
        self.document_names = []
        self.global_unigram_vocab = set()
        self.global_bigram_vocab = set()
        self.unigram_document_frequencies = {}
        self.bigram_document_frequencies = {}
        self.total_documents = 0
        self.original_forms = {}  # normalized term -> set of original surface forms

    def add_document(self, doc_name, normalized_unigrams_list, normalized_bigrams_list, original_forms_mapping):
        """Register one document's (already processed and normalized) terms.

        Args:
            doc_name: Label stored for the document and reported in results.
            normalized_unigrams_list: All unigram tokens of the document.
            normalized_bigrams_list: All bigrams, each as ``"word1 word2"``.
            original_forms_mapping: Mapping of normalized term to an iterable
                of the original surface forms seen in this document.
        """
        unigram_counts = Counter(normalized_unigrams_list)
        bigram_counts = Counter(normalized_bigrams_list)

        self.document_unigram_counts.append(unigram_counts)
        self.document_bigram_counts.append(bigram_counts)
        self.document_names.append(doc_name)

        # Iterating a Counter yields each distinct term once, which is exactly
        # what the vocabulary and the document frequencies need.
        self.global_unigram_vocab.update(unigram_counts)
        self.global_bigram_vocab.update(bigram_counts)

        for unigram in unigram_counts:
            self.unigram_document_frequencies[unigram] = self.unigram_document_frequencies.get(unigram, 0) + 1

        for bigram in bigram_counts:
            self.bigram_document_frequencies[bigram] = self.bigram_document_frequencies.get(bigram, 0) + 1

        self.total_documents += 1

        # Merge (rather than overwrite) the surface forms: a plain dict.update
        # would replace the forms collected from earlier documents with this
        # document's set and would also alias the caller's set objects.
        for normalized, originals in original_forms_mapping.items():
            self.original_forms.setdefault(normalized, set()).update(originals)

        print(f"Added document: {doc_name} with {len(normalized_unigrams_list)} unigrams, {len(normalized_bigrams_list)} bigrams")

    def calculate_pmi(self, bigram, doc_index):
        """Return the Pointwise Mutual Information of a bigram within one document.

        All three probabilities (bigram, first word, second word) are estimated
        as count / total unigram tokens of the document.

        Args:
            bigram: The bigram as ``"word1 word2"``.
            doc_index: Index of the document whose counts are used.

        Returns:
            The natural-log PMI, or 0 when the string holds no space or when
            the bigram or either word does not occur in the document.
        """
        if ' ' not in bigram:
            return 0  # Not a bigram

        word1, word2 = bigram.split(' ', 1)

        unigram_counts = self.document_unigram_counts[doc_index]
        bigram_counts = self.document_bigram_counts[doc_index]

        bigram_freq = bigram_counts.get(bigram, 0)
        word1_freq = unigram_counts.get(word1, 0)
        word2_freq = unigram_counts.get(word2, 0)

        # Bailing out here also guarantees a non-zero token total below.
        if bigram_freq == 0 or word1_freq == 0 or word2_freq == 0:
            return 0

        total_unigrams = sum(unigram_counts.values())

        p_bigram = bigram_freq / total_unigrams
        p_word1 = word1_freq / total_unigrams
        p_word2 = word2_freq / total_unigrams

        if p_word1 * p_word2 > 0:
            return math.log(p_bigram / (p_word1 * p_word2))
        return 0

    def decide_unigram_vs_bigram(self, bigram, doc_index, min_pmi=1.0, min_freq=3, moderate_pmi=0.5):
        """Decide whether a bigram, its unigrams, or both should be kept.

        Args:
            bigram: The bigram as ``"word1 word2"``.
            doc_index: Index of the document whose counts are used.
            min_pmi: PMI above which the bigram replaces its unigrams.
            min_freq: Minimum in-document bigram count to be considered at all.
            moderate_pmi: PMI above which (but not above ``min_pmi``) the
                bigram is kept alongside its unigrams.

        Returns:
            tuple[str, str]: ``(decision, reason)`` where ``decision`` is one
            of ``'keep_bigram'``, ``'keep_unigrams'`` or ``'keep_both'`` and
            ``reason`` is a human-readable explanation.
        """
        if ' ' not in bigram:
            return 'keep_unigrams', 'Not a valid bigram'

        bigram_freq = self.document_bigram_counts[doc_index].get(bigram, 0)

        # Rare bigrams give unreliable PMI estimates, so reject them up front.
        if bigram_freq < min_freq:
            return 'keep_unigrams', f'Bigram too rare (freq={bigram_freq})'

        pmi = self.calculate_pmi(bigram, doc_index)

        if pmi > min_pmi:
            # Strong association - keep bigram, remove constituent unigrams
            return 'keep_bigram', f'Strong association (PMI={pmi:.2f})'
        if pmi > moderate_pmi:
            # Moderate association - keep both
            return 'keep_both', f'Moderate association (PMI={pmi:.2f})'
        # Weak association - keep individual unigrams
        return 'keep_unigrams', f'Weak association (PMI={pmi:.2f})'

    def _describe_word(self, word):
        """Return ``(display_text, has_mapping)`` for a single normalized word.

        A word counts as mapped only when more than one original surface form
        was recorded for it; the display text then lists those forms sorted.
        """
        originals = self.original_forms.get(word)
        if originals is not None and len(originals) > 1:
            return f"{word}‚Üê{sorted(originals)}", True
        return word, False

    def _keyword_row(self, term, count, total_tokens, doc_freq, term_type,
                     decision, reason, pmi, original_forms):
        """Build one result row, computing TF, IDF and TF-IDF for ``term``.

        TF is count / total unigram tokens of the document (for bigrams too);
        IDF is ln(total documents / document frequency).
        """
        tf = count / total_tokens
        idf = math.log(self.total_documents / doc_freq)
        return {
            'Keyword': term,
            'TF-IDF_Score': round(tf * idf, 4),
            'TF': round(tf, 4),
            'IDF': round(idf, 4),
            'Count_in_Doc': count,
            'IDF_Count': doc_freq,
            'Term_Type': term_type,
            'Decision': decision,
            'Decision_Reason': reason,
            'PMI': pmi,
            'Original_Forms': original_forms
        }

    def calculate_tfidf_with_decisions(self, doc_index, top_k=None, percentage_based=True,
                                     base_percentage=2.0, min_keywords=5, max_keywords=50):
        """Rank a document's keywords by TF-IDF after the unigram/bigram decisions.

        Args:
            doc_index: Index of the document to analyse.
            top_k: Number of keywords to return. When ``None`` it is derived
                from the document length (see ``percentage_based``).
            percentage_based: With ``top_k=None``, use ``base_percentage`` of
                the token count clamped to ``[min_keywords, max_keywords]``;
                otherwise fall back to 20.
            base_percentage: Percentage of the token count used for ``top_k``.
            min_keywords: Lower clamp for the dynamic ``top_k``.
            max_keywords: Upper clamp for the dynamic ``top_k``.

        Returns:
            A ``pandas.DataFrame`` of the top keywords sorted by score (extra
            details are stored in ``DataFrame.attrs``), or ``None`` when
            ``doc_index`` is past the last document.
        """
        if doc_index >= len(self.document_unigram_counts):
            return None

        unigram_counts = self.document_unigram_counts[doc_index]
        bigram_counts = self.document_bigram_counts[doc_index]

        total_unigrams = sum(unigram_counts.values())

        # Calculate dynamic top_k if not provided
        if top_k is None and percentage_based:
            calculated_k = int(total_unigrams * (base_percentage / 100))
            top_k = max(min_keywords, min(calculated_k, max_keywords))
            print(f"Document length: {total_unigrams} tokens, Dynamic keywords: {top_k}")
        elif top_k is None:
            top_k = 20

        final_keywords = []
        eliminated_unigrams = set()  # Unigrams superseded by a strong bigram
        decision_log = []

        # First pass: judge every bigram; kept ones become keyword rows.
        for bigram, count in bigram_counts.items():
            decision, reason = self.decide_unigram_vs_bigram(bigram, doc_index)
            decision_log.append((bigram, decision, reason))

            if decision not in ('keep_bigram', 'keep_both'):
                continue

            described = [self._describe_word(word) for word in bigram.split()]
            if any(has_mapping for _, has_mapping in described):
                original_forms = " | ".join(text for text, _ in described)
            else:
                original_forms = bigram

            final_keywords.append(self._keyword_row(
                bigram, count, total_unigrams,
                self.bigram_document_frequencies[bigram],
                'Bigram', decision, reason,
                round(self.calculate_pmi(bigram, doc_index), 3),
                original_forms,
            ))

            # Only a strong bigram suppresses its constituent unigrams.
            if decision == 'keep_bigram':
                eliminated_unigrams.update(bigram.split(' ', 1))

        # Second pass: every unigram not superseded by a strong bigram.
        for unigram, count in unigram_counts.items():
            if unigram in eliminated_unigrams:
                continue

            original_forms, _ = self._describe_word(unigram)
            final_keywords.append(self._keyword_row(
                unigram, count, total_unigrams,
                self.unigram_document_frequencies[unigram],
                'Unigram', 'keep_unigrams',
                'No competing bigram or bigram rejected',
                0.0,
                original_forms,
            ))

        # Sort by TF-IDF score
        final_keywords.sort(key=lambda row: row['TF-IDF_Score'], reverse=True)

        # Relative importance is the score scaled by the best score.
        if final_keywords:
            max_tfidf = final_keywords[0]['TF-IDF_Score']
            for item in final_keywords:
                item['Relative_Importance'] = round(item['TF-IDF_Score'] / max_tfidf, 4) if max_tfidf > 0 else 0

        results_df = pd.DataFrame(final_keywords[:top_k])
        results_df.attrs['document_name'] = self.document_names[doc_index]
        results_df.attrs['document_index'] = doc_index
        results_df.attrs['total_keywords_available'] = len(final_keywords)
        results_df.attrs['document_length'] = total_unigrams
        results_df.attrs['eliminated_unigrams'] = len(eliminated_unigrams)
        results_df.attrs['decision_log'] = decision_log

        return results_df

    def analyze_all_documents(self, top_k=None, save_to_excel=True, output_path=None):
        """Extract keywords for every document and optionally export to Excel.

        Args:
            top_k: Forwarded to ``calculate_tfidf_with_decisions``.
            save_to_excel: When true (and there is data), write a workbook with
                a combined sheet, one sheet for each of the first ten
                documents, and a decision summary sheet.
            output_path: Workbook path; defaults to a fixed Google Drive path.

        Returns:
            dict: Document name -> keyword ``DataFrame`` for every document
            that produced at least one keyword.
        """
        print(f"Analyzing combined unigram-bigram keywords for all {self.total_documents} documents...")

        all_analyses = {}
        excel_data = []
        decision_stats = {'keep_bigram': 0, 'keep_unigrams': 0, 'keep_both': 0}

        for doc_idx in range(self.total_documents):
            keywords_df = self.calculate_tfidf_with_decisions(doc_idx, top_k=top_k)

            if keywords_df is not None and not keywords_df.empty:
                doc_name = keywords_df.attrs.get('document_name', f'Document_{doc_idx}')
                all_analyses[doc_name] = keywords_df

                # Collect decision statistics
                for _, decision, _ in keywords_df.attrs.get('decision_log', []):
                    decision_stats[decision] = decision_stats.get(decision, 0) + 1

                # Rows are already sorted by score, so the index gives the rank.
                for idx, row in keywords_df.iterrows():
                    excel_data.append({
                        'Document_Name': doc_name,
                        'Document_Index': doc_idx,
                        'Rank': idx + 1,
                        **row.to_dict()
                    })

            if (doc_idx + 1) % 25 == 0:
                print(f"Processed {doc_idx + 1}/{self.total_documents} documents...")

        print("‚úì Combined unigram-bigram analysis complete!")

        # Print decision statistics
        print(f"\nDecision Statistics:")
        print(f"  - Keep bigram (eliminate unigrams): {decision_stats.get('keep_bigram', 0)}")
        print(f"  - Keep unigrams (eliminate bigram): {decision_stats.get('keep_unigrams', 0)}")
        print(f"  - Keep both: {decision_stats.get('keep_both', 0)}")

        if save_to_excel and excel_data:
            if output_path is None:
                output_path = '/content/drive/MyDrive/tfidf/all_document_combined_keywords.xlsx'

            # Export is best-effort: a failed write is reported but must not
            # discard the analysis that has already been computed.
            try:
                with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
                    # Summary sheet
                    pd.DataFrame(excel_data).to_excel(writer, sheet_name='All_Documents_Combined', index=False)

                    # Individual document sheets (first 10 documents)
                    for i, keywords_df in enumerate(list(all_analyses.values())[:10]):
                        sheet_name = f'Doc_{i+1}'[:31]  # Excel sheet name limit
                        keywords_df.to_excel(writer, sheet_name=sheet_name, index=False)

                    # Decision summary sheet
                    decision_df = pd.DataFrame([
                        {'Decision_Type': k, 'Count': v} for k, v in decision_stats.items()
                    ])
                    decision_df.to_excel(writer, sheet_name='Decision_Summary', index=False)

                print(f"Results saved to: {output_path}")

            except Exception as e:
                print(f"Error saving to Excel: {e}")

        return all_analyses

    def get_vocabulary_stats(self):
        """Return corpus-wide counts: documents, vocabulary sizes, token totals and averages."""
        total_unigrams = sum(sum(doc_counts.values()) for doc_counts in self.document_unigram_counts)
        total_bigrams = sum(sum(doc_counts.values()) for doc_counts in self.document_bigram_counts)

        # Guard the averages against an empty corpus.
        avg_unigrams = total_unigrams / self.total_documents if self.total_documents > 0 else 0
        avg_bigrams = total_bigrams / self.total_documents if self.total_documents > 0 else 0

        return {
            'total_documents': self.total_documents,
            'unique_unigrams': len(self.global_unigram_vocab),
            'unique_bigrams': len(self.global_bigram_vocab),
            'total_unigrams': total_unigrams,
            'total_bigrams': total_bigrams,
            'avg_unigrams_per_doc': round(avg_unigrams, 1),
            'avg_bigrams_per_doc': round(avg_bigrams, 1),
            'terms_with_mappings': len(self.original_forms)
        }

class EnhancedBanglaCombinedKeywordExtractor:
    """Bangla keyword extractor combining unigrams and bigrams.

    The extractor reads ``.txt`` documents from a folder, tokenizes them
    sentence by sentence, normalizes tokens (stemmer if available, otherwise
    corpus-checked suffix stripping), removes stopwords, and feeds the result
    into an ``EfficientCombinedTFIDF`` instance that performs the scoring.

    All Bangla characters in patterns and in the suffix list are written as
    ``\\uXXXX`` escapes so they cannot be corrupted when this file is re-saved
    or exported with the wrong encoding.
    """

    # Sentence terminators: danda (U+0964), '!' and '?'.
    _SENTENCE_SPLIT_RE = re.compile(r'[\u0964!?]+')
    # Anything outside the Bengali Unicode block and whitespace.
    _NON_BANGLA_RE = re.compile(r'[^\u0980-\u09FF\s]')
    # Same, but sentence terminators are preserved.
    _NON_BANGLA_KEEP_PUNCT_RE = re.compile(r'[^\u0980-\u09FF\s\u0964!?]')
    # Tokens that consist solely of sentence terminators.
    _PUNCT_ONLY_RE = re.compile(r'^[\u0964!?]+$')
    _WHITESPACE_RE = re.compile(r'\s+')

    # Common Bangla suffixes (romanized in the trailing comments).
    _DEFAULT_SUFFIXES = (
        '\u0997\u09c1\u09b2\u09cb\u0995\u09c7',  # -guloke
        '\u0997\u09c1\u09b2\u09bf\u0995\u09c7',  # -gulike
        '\u0997\u09c1\u09b2\u09be\u0995\u09c7',  # -gulake
        '\u0997\u09c1\u09b2\u09cb',              # -gulo
        '\u0997\u09c1\u09b2\u09bf',              # -guli
        '\u0997\u09c1\u09b2\u09be',              # -gula
        '\u09c7\u09b0\u09be',                    # -era
        '\u09c7\u09a6\u09c7\u09b0',              # -eder
        '\u09a6\u09c7\u09b0',                    # -der
        '\u09b0\u09be',                          # -ra
        '\u098f\u09b0',                          # -er (independent vowel)
        '\u09c7\u09b0',                          # -er (vowel sign)
        '\u09b0',                                # -r
        '\u0995\u09c7',                          # -ke
        '\u09a4\u09c7',                          # -te
        '\u09af\u09bc\u09c7',                    # -ye (ya + nukta, decomposed)
        '\u09ac\u09be\u09a8',                    # -ban
        '\u09ae\u09be\u09a8',                    # -man
        '\u0996\u09be\u09a8\u09be',              # -khana
        '\u0996\u09be\u09a8\u09bf',              # -khani
        '\u099f\u09c1\u0995\u09c1',              # -tuku
        '\u099f\u09bf\u0995\u09c7',              # -tike
        '\u099f\u09be\u0995\u09c7',              # -take
        '\u099f\u09bf',                          # -ti
        '\u099f\u09be',                          # -ta
        '\u0993\u09af\u09bc\u09be\u09b2\u09be',  # -wala
        '\u0993\u09af\u09bc\u09be\u09b2\u09bf',  # -wali
        '\u09c7',                                # -e
    )

    def __init__(self, stopwords_file_path='/content/drive/MyDrive/tfidf/stop.xlsx'):
        """Initialize the Enhanced Bangla Combined (Unigram + Bigram) Keyword Extractor.

        Args:
            stopwords_file_path: Excel file whose first column lists the
                stopwords. Pass a falsy value to skip stopword loading.
        """
        # Look the optional BNLP classes up by name: when the toolkit failed
        # to import they may be unbound rather than None, and a direct
        # reference would raise NameError.
        tokenizer_cls = globals().get('BasicTokenizer')
        stemmer_cls = globals().get('BanglaStemmer')

        try:
            if tokenizer_cls is None:
                raise ImportError("BasicTokenizer not available")

            self.tokenizer = tokenizer_cls()

            if stemmer_cls is not None:
                self.stemmer = stemmer_cls()
                self.use_stemmer = True
                print("Tokenizer and Stemmer initialized successfully")
            else:
                self.stemmer = None
                self.use_stemmer = False
                print("Tokenizer initialized successfully (Stemmer not available - will use suffix removal)")

        except Exception as e:
            # Any failure here degrades to whitespace tokenization instead of
            # aborting: BNLP is an optional dependency.
            print(f"Error initializing BNLP components: {e}")
            self.tokenizer = None
            self.stemmer = None
            self.use_stemmer = False
            print("Using fallback tokenization (simple split)")

        # Longest suffixes first, so the most specific one is tried first.
        self.bangla_suffixes = sorted(self._DEFAULT_SUFFIXES, key=len, reverse=True)

        # Initialize stopwords
        self.stopwords = set()
        if stopwords_file_path:
            self._load_custom_stopwords(stopwords_file_path)

        # Initialize variables
        self.document_paths = []
        self.raw_documents = []
        self.vocabulary_mapping = {}

        # Initialize the efficient TF-IDF calculator
        self.tfidf_calculator = EfficientCombinedTFIDF()

    def _load_custom_stopwords(self, file_path):
        """Load stopwords from the first column of a header-less Excel file.

        With a tokenizer available each entry is tokenized and the resulting
        tokens become stopwords; otherwise entries are used verbatim. Errors
        are reported and leave the stopword set unchanged.
        """
        try:
            df = pd.read_excel(file_path, header=None)
            entries = df.iloc[:, 0].dropna().astype(str).str.strip().tolist()
            custom_stops = [entry for entry in entries if entry]

            if self.tokenizer is not None:
                # Tokenize so stopwords have the same shape as document tokens.
                for stop in custom_stops:
                    self.stopwords.update(self.tokenizer.tokenize(stop))
                print(f"Successfully loaded and tokenized {len(custom_stops)} stopwords")
            else:
                # If no tokenizer, keep stopwords as full words
                self.stopwords.update(custom_stops)
                print(f"Successfully loaded {len(custom_stops)} stopwords (no tokenization - tokenizer not available)")

        except FileNotFoundError:
            print(f"Error: Stopwords file not found at {file_path}")
        except Exception as e:
            print(f"Error loading custom stopwords: {e}")

    def smart_suffix_removal(self, word, corpus_vocabulary=None):
        """Normalize a word, accepting the root only if the corpus knows it.

        Args:
            word: The token to normalize.
            corpus_vocabulary: Collection of prominent corpus terms. ``None``
                disables the check, so any candidate root is accepted.

        Returns:
            tuple[str, bool]: ``(normalized_word, was_changed)``. Words shorter
            than three characters are returned untouched.
        """
        if len(word) < 3:
            return word, False

        # Try stemmer first if available
        if self.use_stemmer and self.stemmer is not None:
            try:
                stem = self.stemmer.stem(word)
                if stem and len(stem) > 1:
                    if corpus_vocabulary is None or stem in corpus_vocabulary:
                        return stem, True
                    return word, False  # Keep original if stem not prominent
            except Exception:
                # Deliberate best-effort: a stemmer failure falls through to
                # plain suffix stripping below.
                pass

        # Fallback to suffix removal. The first (longest) matching suffix
        # settles the outcome; `len(word) > len(suffix) + 1` already guarantees
        # a root of at least two characters.
        for suffix in self.bangla_suffixes:
            if word.endswith(suffix) and len(word) > len(suffix) + 1:
                root = word[:-len(suffix)]
                if corpus_vocabulary is None or root in corpus_vocabulary:
                    return root, True
                return word, False  # Keep original

        return word, False

    def create_unigrams_and_bigrams_from_raw(self, text):
        """Tokenize raw text into unigrams and within-sentence bigrams.

        Args:
            text: Raw document text.

        Returns:
            tuple[list[str], list[tuple[str, str]]]: All tokens of at least two
            characters, and all adjacent token pairs that share a sentence.
            Both lists are empty for empty or non-string input.
        """
        if not text or not isinstance(text, str):
            return [], []

        all_unigrams = []
        all_bigrams = []

        # Sentences must be split BEFORE non-Bangla characters are blanked
        # out: the danda lies outside the Bengali block and would otherwise be
        # erased, letting bigrams run across sentence boundaries.
        for sentence in self._SENTENCE_SPLIT_RE.split(text):
            sentence = self._NON_BANGLA_RE.sub(' ', sentence)
            sentence = self._WHITESPACE_RE.sub(' ', sentence).strip()

            if not sentence:
                continue

            # Tokenize the sentence; fall back to whitespace splitting when
            # the tokenizer is missing or fails.
            try:
                if self.tokenizer is not None:
                    tokens = self.tokenizer.tokenize(sentence)
                else:
                    tokens = sentence.split()
            except Exception:
                tokens = sentence.split()

            # Basic filtering only; stopwords are removed after normalization.
            kept = [token for token in tokens if token and len(token) >= 2]

            all_unigrams.extend(kept)
            all_bigrams.extend(zip(kept, kept[1:]))

        return all_unigrams, all_bigrams

    def build_corpus_vocabulary(self, min_frequency=3):
        """Collect the prominent terms across ``self.document_paths``.

        Args:
            min_frequency: Minimum corpus-wide count for a token to qualify.

        Returns:
            set[str]: Tokens occurring at least ``min_frequency`` times.
            Unreadable files are reported and skipped.
        """
        token_counts = Counter()

        for doc_path in self.document_paths:
            try:
                with open(doc_path, 'r', encoding='utf-8') as file:
                    content = file.read()

                # Basic text cleaning
                content = self._NON_BANGLA_KEEP_PUNCT_RE.sub(' ', content)
                content = self._WHITESPACE_RE.sub(' ', content).strip()

                if self.tokenizer is not None:
                    tokens = self.tokenizer.tokenize(content)
                else:
                    tokens = content.split()

                # Drop very short tokens and pure punctuation runs.
                token_counts.update(
                    token for token in tokens
                    if len(token) >= 2 and not self._PUNCT_ONLY_RE.match(token)
                )

            except Exception as e:
                print(f"Error processing {doc_path}: {e}")

        prominent_terms = {token for token, count in token_counts.items() if count >= min_frequency}

        print(f"Built corpus vocabulary: {len(prominent_terms)} prominent terms")
        return prominent_terms

    def preprocess_text_combined(self, text, corpus_vocabulary):
        """Turn raw text into normalized, stopword-filtered unigrams and bigrams.

        Args:
            text: Raw document text.
            corpus_vocabulary: Prominent corpus terms used to validate roots.

        Returns:
            tuple: ``(unigrams, bigrams, mapping)`` where ``bigrams`` are
            ``"word1 word2"`` strings and ``mapping`` maps each normalized
            term to the set of original forms it replaced. A bigram is dropped
            if either of its words is a stopword after normalization.
        """
        if not text or not isinstance(text, str):
            return [], [], {}

        # Step 1: Create unigrams and bigrams from raw text
        raw_unigrams, raw_bigrams = self.create_unigrams_and_bigrams_from_raw(text)

        if not raw_unigrams and not raw_bigrams:
            return [], [], {}

        mapping_info = {}
        normalized_cache = {}

        def normalize(word):
            # Every word shows up once as a unigram and up to twice inside
            # bigrams; the result depends only on the word, so compute it once.
            if word not in normalized_cache:
                normalized_cache[word] = self.smart_suffix_removal(word, corpus_vocabulary)
            return normalized_cache[word]

        def record_mapping(original, normalized, was_merged):
            if was_merged and normalized != original:
                mapping_info.setdefault(normalized, set()).add(original)

        # Step 2: Process unigrams (stopword check happens after normalization)
        processed_unigrams = []
        for word in raw_unigrams:
            norm_word, was_merged = normalize(word)
            if norm_word not in self.stopwords:
                processed_unigrams.append(norm_word)
                record_mapping(word, norm_word, was_merged)

        # Step 3: Process bigrams
        processed_bigrams = []
        for word1, word2 in raw_bigrams:
            norm_word1, was_merged1 = normalize(word1)
            norm_word2, was_merged2 = normalize(word2)

            if norm_word1 not in self.stopwords and norm_word2 not in self.stopwords:
                processed_bigrams.append(f"{norm_word1} {norm_word2}")
                record_mapping(word1, norm_word1, was_merged1)
                record_mapping(word2, norm_word2, was_merged2)

        return processed_unigrams, processed_bigrams, mapping_info

    def load_documents_from_folder(self, folder_path):
        """Load and preprocess every ``.txt`` file in a folder.

        Resets previously loaded paths, raw documents and mappings, builds the
        corpus vocabulary, then adds each non-empty document to the TF-IDF
        calculator. Files are processed in sorted name order so document
        indices are reproducible between runs.

        Args:
            folder_path: Directory containing the UTF-8 text files.
        """
        self.document_paths = []
        self.raw_documents = []
        self.vocabulary_mapping = {}

        if not os.path.exists(folder_path):
            print(f"Error: Folder {folder_path} does not exist")
            return

        # os.listdir returns entries in arbitrary order; sort for determinism.
        txt_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.txt'))

        if not txt_files:
            print(f"No .txt files found in {folder_path}")
            return

        print(f"Found {len(txt_files)} .txt files.")

        # First, build corpus vocabulary
        print("Building corpus vocabulary...")
        self.document_paths = [os.path.join(folder_path, f) for f in txt_files]
        corpus_vocabulary = self.build_corpus_vocabulary(min_frequency=2)

        # Now process documents with combined processing
        print("Processing documents with combined unigram-bigram extraction...")
        processed_count = 0

        for filename in txt_files:
            file_path = os.path.join(folder_path, filename)
            try:
                with open(file_path, 'r', encoding='utf-8') as file:
                    content = file.read()

                self.raw_documents.append(content)

                # Combined preprocessing
                processed_unigrams, processed_bigrams, doc_mappings = self.preprocess_text_combined(content, corpus_vocabulary)

                if processed_unigrams or processed_bigrams:
                    self.tfidf_calculator.add_document(filename, processed_unigrams, processed_bigrams, doc_mappings)

                    # Update global vocabulary mapping
                    for normalized, originals in doc_mappings.items():
                        self.vocabulary_mapping.setdefault(normalized, set()).update(originals)

                    processed_count += 1
                    if processed_count % 20 == 0:
                        print(f"Processed {processed_count}/{len(txt_files)} documents...")
                else:
                    print(f"‚ö† Skipped (empty after processing): {filename}")

            except Exception as e:
                # One unreadable document must not abort the whole batch.
                print(f"‚úó Error processing {filename}: {e}")

        print(f"\nSuccessfully processed {processed_count} documents")
        print(f"Vocabulary mappings created: {len(self.vocabulary_mapping)} terms")

        # Show some mapping examples
        if self.vocabulary_mapping:
            print("\nSample suffix mappings:")
            for normalized, originals in list(self.vocabulary_mapping.items())[:10]:
                if len(originals) > 1:
                    print(f"  {normalized} <- {list(originals)}")

    def get_keywords_for_document(self, doc_index, top_k=None):
        """Return the keyword table for one document (delegates to the calculator)."""
        return self.tfidf_calculator.calculate_tfidf_with_decisions(doc_index, top_k=top_k)

    def analyze_all_documents(self, top_k=None, save_to_excel=True, output_path=None):
        """Analyze all documents (delegates to the calculator, same arguments)."""
        return self.tfidf_calculator.analyze_all_documents(top_k=top_k, save_to_excel=save_to_excel, output_path=output_path)

    def get_statistics(self):
        """Return the calculator's vocabulary stats plus the suffix-mapping count."""
        stats = self.tfidf_calculator.get_vocabulary_stats()
        stats['terms_with_suffix_mappings'] = len(self.vocabulary_mapping)
        return stats

    def get_decision_analysis(self, doc_index):
        """Summarize the unigram-vs-bigram decisions taken for one document.

        Returns:
            dict with the document name, the number of eliminated unigrams,
            the total keyword count and, per bigram, the decision, its reason
            and the rounded PMI; ``None`` if the document does not exist.
        """
        keywords_df = self.get_keywords_for_document(doc_index)
        if keywords_df is None:
            return None

        decision_log = keywords_df.attrs.get('decision_log', [])

        analysis = {
            'document_name': keywords_df.attrs.get('document_name'),
            'eliminated_unigrams': keywords_df.attrs.get('eliminated_unigrams', 0),
            'total_keywords': keywords_df.attrs.get('total_keywords_available', 0),
            'decisions': {}
        }

        for bigram, decision, reason in decision_log:
            analysis['decisions'][bigram] = {
                'decision': decision,
                'reason': reason,
                'pmi': round(self.tfidf_calculator.calculate_pmi(bigram, doc_index), 3)
            }

        return analysis

# Enhanced main function for combined analysis
def enhanced_combined_main(
    stopwords_file_path='/content/drive/MyDrive/tfidf/temp.xlsx',
    folder_path='/content/drive/MyDrive/tfidf/Concatenated_Summary',
    output_path='/content/drive/MyDrive/tfidf/all_document_combined_keywords.xlsx',
):
    """Run the combined unigram-bigram keyword analysis end to end.

    Builds an ``EnhancedBanglaCombinedKeywordExtractor``, loads every document
    from ``folder_path``, analyses them (writing the Excel report to
    ``output_path``) and prints overall statistics, a sample of the PMI
    decisions for the first document and the largest suffix-merge groups.

    Args:
        stopwords_file_path: Passed to the extractor as its stop-word file.
        folder_path: Folder the documents are loaded from.
        output_path: Destination of the Excel report.

    The defaults are the Google Drive locations that were previously
    hard-coded, so calling the function without arguments behaves as before.

    Returns:
        The populated extractor, or ``None`` when no documents were processed.
    """
    banner = '=' * 60

    # Initialize the enhanced extractor
    print("Initializing Enhanced Bangla Combined Keyword Extractor...")
    extractor = EnhancedBanglaCombinedKeywordExtractor(
        stopwords_file_path=stopwords_file_path
    )

    # Load documents
    print(f"\nLoading documents from: {folder_path}")
    extractor.load_documents_from_folder(folder_path)

    if extractor.tfidf_calculator.total_documents == 0:
        print("No documents were processed. Exiting.")
        return

    # Analyze all documents and save to Excel
    print(f"\n{banner}")
    print("ANALYZING ALL DOCUMENTS - COMBINED UNIGRAM-BIGRAM WITH PMI DECISIONS")
    print(f"{banner}")

    # The return value is not needed here; the call is made for its Excel output.
    extractor.analyze_all_documents(
        save_to_excel=True,
        output_path=output_path
    )

    # Print overall statistics
    print(f"\n{banner}")
    print("OVERALL STATISTICS")
    print(f"{banner}")

    stats = extractor.get_statistics()
    print(f"üìä Total documents processed: {stats['total_documents']}")
    print(f"üìö Total unigram vocabulary size: {stats['unique_unigrams']}")
    print(f"üìö Total bigram vocabulary size: {stats['unique_bigrams']}")
    print(f"üîó Terms with suffix mappings: {stats['terms_with_suffix_mappings']}")
    print(f"üìù Average unigrams per document: {stats['avg_unigrams_per_doc']}")
    print(f"üìù Average bigrams per document: {stats['avg_bigrams_per_doc']}")
    print(f"üìà Total unigrams processed: {stats['total_unigrams']}")
    print(f"üìà Total bigrams processed: {stats['total_bigrams']}")

    # Show decision analysis for first document
    if extractor.tfidf_calculator.total_documents > 0:
        print("\nüîç DECISION ANALYSIS SAMPLE (First Document):")
        print("-" * 50)

        decision_analysis = extractor.get_decision_analysis(0)
        if decision_analysis:
            print(f"Document: {decision_analysis['document_name']}")
            print(f"Eliminated unigrams: {decision_analysis['eliminated_unigrams']}")
            print(f"Total keywords: {decision_analysis['total_keywords']}")

            print("\nTop 10 PMI Decisions:")
            # Rank by PMI magnitude so strongly negative scores surface as well.
            sorted_decisions = sorted(
                decision_analysis['decisions'].items(),
                key=lambda item: abs(item[1]['pmi']),
                reverse=True,
            )

            for rank, (bigram, info) in enumerate(sorted_decisions[:10], start=1):
                print(f"  {rank:2d}. {bigram:<25} ‚Üí {info['decision']:<15} (PMI: {info['pmi']:6.2f}) - {info['reason']}")

    # Show most frequently merged terms
    if extractor.vocabulary_mapping:
        print("\nüîÑ TOP SUFFIX MERGING EXAMPLES:")
        print("-" * 40)
        merged_terms = [(k, v) for k, v in extractor.vocabulary_mapping.items() if len(v) > 1]
        merged_terms.sort(key=lambda pair: len(pair[1]), reverse=True)

        for normalized, originals in merged_terms[:15]:
            print(f"  {normalized} ‚Üê {list(originals)}")

    print(f"\n{banner}")
    print("üéâ ENHANCED COMBINED UNIGRAM-BIGRAM ANALYSIS COMPLETE!")
    print("üìÅ Check the Excel file for detailed results with PMI-based decisions")
    print(f"{banner}")

    return extractor

class VideoProcessor:
    def __init__(self, device="cuda" if torch.cuda.is_available() else "cpu"):
        self.device = device
        self.model, self.tokenizer = None, None   # Summary model
        self.model2, self.tokenizer2 = None, None # Title model

        # Always initialize punct_agent
        self.punct_agent = None

        # Initialize punctuation agent
        try:
            from banglanlptoolkit import BanglaPunctuation
            self.punct_agent = BanglaPunctuation()
            print("‚úÖ BanglaPunctuation loaded")
        except ImportError:
            print("‚ö†Ô∏è BanglaPunctuation not available, using fallback")
        except Exception as e:
            print(f"‚ö†Ô∏è Unexpected error loading BanglaPunctuation: {e}")

    def _load_summary_model(self):
        if self.model is None:
            # Free title model if loaded
            if self.model2 is not None:
                del self.model2
                del self.tokenizer2
                self.model2, self.tokenizer2 = None, None
                torch.cuda.empty_cache()

            model_path = "/content/drive/My Drive/Thesis_Dataset/fine_tuned_bangla_t5"
            self.model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
            self.tokenizer = AutoTokenizer.from_pretrained(model_path)
            self.model.to(self.device)


    def _load_title_model(self):
        if self.model2 is None:
            # Free summary model if loaded
            if self.model is not None:
                del self.model
                del self.tokenizer
                self.model, self.tokenizer = None, None
                torch.cuda.empty_cache()

            self.model2 = AutoModelForSeq2SeqLM.from_pretrained(
                "/content/drive/My Drive/Thesis_Dataset/academic_title_model"
            )
            self.tokenizer2 = AutoTokenizer.from_pretrained(
                "/content/drive/My Drive/Thesis_Dataset/academic_title_model"
            )
            self.model2.to(self.device)



    def safe_normalize(self, text):
        """
        Normalize text, degrading gracefully when the optional normalizer is absent.

        Tries ``bnunicodenormalizer.normalize`` first. If that import fails,
        falsy or non-string input is returned unchanged and strings are
        NFC-normalized with runs of whitespace collapsed to single spaces.
        Any other exception returns the input as-is.
        """
        try:
            from bnunicodenormalizer import normalize
            return normalize(text)
        except ImportError:
            import unicodedata
            is_usable_string = bool(text) and isinstance(text, str)
            if not is_usable_string:
                return text
            composed = unicodedata.normalize('NFC', text)
            # split()/join already drops leading and trailing whitespace.
            return ' '.join(composed.split())
        except Exception:
            return text

    def extract_audio_from_video(self, video_path, audio_output_path):
        """
        Extract the audio track of a video file and save it as a 16-bit PCM WAV.

        Args:
            video_path: Path of the video to read.
            audio_output_path: Path of the WAV file to write. Its parent
                directory is created when the path has one.

        Returns:
            True when the WAV file exists after extraction.

        Raises:
            FileNotFoundError: ``video_path`` does not exist.
            Exception: The video has no audio track, the output file was not
                created, or the underlying extraction failed. Every error is
                printed and then re-raised.
        """
        video = None
        try:
            print(f"üé¨ Loading video: {video_path}")

            if not os.path.exists(video_path):
                raise FileNotFoundError(f"Video file not found: {video_path}")

            video = VideoFileClip(video_path)
            duration = video.duration
            print(f"‚è±Ô∏è Video duration: {duration/60:.1f} minutes")

            if video.audio is None:
                raise Exception("Video has no audio track!")

            # dirname is '' for a bare filename, and os.makedirs('') raises.
            output_dir = os.path.dirname(audio_output_path)
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)

            print(f"üéµ Extracting audio to: {audio_output_path}")
            video.audio.write_audiofile(
                audio_output_path,
                codec='pcm_s16le',
                verbose=False,
                logger=None
            )

            if not os.path.exists(audio_output_path):
                raise Exception("Audio extraction failed - output file not created")

            file_size_mb = os.path.getsize(audio_output_path) / (1024 * 1024)
            print(f"‚úÖ Audio extracted successfully! ({file_size_mb:.1f} MB)")
            return True

        except Exception as e:
            print(f"‚ùå Error extracting audio: {e}")
            raise
        finally:
            # Release the clip on the failure paths too, not only on success.
            if video is not None:
                video.close()

    def add_punctuation(self, raw_text):
        """Add punctuation to the raw text"""
        if self.punct_agent:
            try:
                raw_text = ' '.join(raw_text.split())
                return self.punct_agent.add_punctuation(raw_text)
            except:
                pass

        # Fallback punctuation logic
        text = raw_text.strip()
        if not text.endswith(('‡•§', '?', '!')):
            text += '‡•§'
        return text

    def format_timestamp(self, ms):
        """
        Convert milliseconds to a zero-padded ``MM:SS`` string.

        Args:
            ms: Duration in milliseconds, int or float. Fractions of a second
                are dropped.

        Returns:
            The timestamp text. Minutes are not wrapped into hours, so the
            minute field grows past two digits for long durations.
        """
        # int() keeps the ':02d' format valid when a float offset is passed in.
        minutes, seconds = divmod(int(ms // 1000), 60)
        return f"{minutes:02d}:{seconds:02d}"

    def smart_chunk_amplify(self, chunk, max_boost_db=8, target_level_db=-20):
        """
        Boost a quiet audio chunk and guard against clipping.

        The boost depends on the chunk's average level (``chunk.dBFS``):
        above -15 dB none, above -25 dB up to 3 dB, above -35 dB up to 6 dB,
        otherwise ``max_boost_db``. If the result peaks above -3 dBFS it is
        normalized with 3 dB of headroom.

        Args:
            chunk: Audio segment exposing ``dBFS``, ``max_dBFS``, ``+`` (gain
                in dB) and ``normalize(headroom=...)``.
            max_boost_db: Upper bound for the applied boost.
            target_level_db: Accepted but not used by this method.

        Returns:
            Tuple ``(processed_chunk, boost_db, original_avg_db,
            original_peak_db, was_normalized)``.
        """
        input_avg_db = chunk.dBFS
        input_peak_db = chunk.max_dBFS

        # (average level floor, boost cap); a cap of 0 means "already loud enough".
        tiers = ((-15, 0), (-25, 3), (-35, 6))
        boost_db = max_boost_db  # very quiet unless a tier below matches
        for floor_db, cap_db in tiers:
            if input_avg_db > floor_db:
                boost_db = min(cap_db, max_boost_db) if cap_db else 0
                break

        boosted = chunk + boost_db if boost_db > 0 else chunk

        # Leave 3dB headroom
        needs_normalizing = boosted.max_dBFS > -3
        if needs_normalizing:
            boosted = boosted.normalize(headroom=3.0)
        return boosted, boost_db, input_avg_db, input_peak_db, needs_normalizing


    def transcribe_audio(self, audio_path, chunk_length_ms=45000, max_tokens=512,
                         apply_per_chunk_amplification=True, max_boost_db=8):
        """
        Transcribe a WAV file and pack the text into token-bounded chunks.

        The work happens in four steps:
        1. The audio is cut into ``chunk_length_ms`` pieces; each piece is
           optionally amplified, sent to Google speech recognition (bn-BD) and
           punctuated. A failed piece becomes a bracketed placeholder such as
           "[Unrecognized Audio]".
        2. The chunk texts are split into sentences. A trailing sentence without
           a terminator is carried over into the next chunk. Each sentence gets
           a timestamp by dividing its time span evenly among the sentences.
        3. Sentences are packed into chunks of at most ``max_tokens`` tokens.
           Each chunk after the first is prefixed with the last sentence of the
           previous chunk when that still fits.
        4. The chunks are returned as a DataFrame.

        Args:
            audio_path: Path of the WAV file to transcribe.
            chunk_length_ms: Length of each audio piece in milliseconds.
            max_tokens: Token budget per output chunk.
            apply_per_chunk_amplification: Run ``smart_chunk_amplify`` on each
                piece before recognition.
            max_boost_db: Maximum boost handed to ``smart_chunk_amplify``.

        Returns:
            A DataFrame with one row per token-based chunk (columns
            ``Chunk_ID``, ``Start_Time``, ``End_Time``, ``Duration_MS``,
            ``Text``, ``Token_Count``, ...), or ``None`` when the audio file
            cannot be loaded.
        """
        # Initialize recognizer with optimized settings
        recognizer = sr.Recognizer()
        recognizer.energy_threshold = 300
        recognizer.dynamic_energy_threshold = True
        recognizer.pause_threshold = 0.5

        print(f"‚úÖ Optimized speech recognizer initialized")
        print(f"‚ö° Per-chunk amplification: {apply_per_chunk_amplification}")
        print(f"üåç Target language: Bengali (bn-BD)")
        print(f"üéØ Audio chunk size: {chunk_length_ms/1000:.1f} seconds")
        print(f"üìù Token-based final chunks: {max_tokens} tokens max")

        try:
            audio = AudioSegment.from_wav(audio_path)
            print(f"‚úÖ Audio loaded: {len(audio)/1000:.1f} seconds, Avg: {audio.dBFS:.1f} dB, Peak: {audio.max_dBFS:.1f} dB")
        except Exception as e:
            print(f"‚ùå Error loading audio file: {e}")
            return None

        chunks = make_chunks(audio, chunk_length_ms)
        all_chunks_text = []
        audio_chunks_timestamps = []
        failed_chunks = 0

        print(f"üìù Step 1: Transcribing audio in {len(chunks)} short chunks for sentence mapping...")

        # Step 1: Transcribe all audio chunks
        for i, chunk in enumerate(tqdm(chunks, desc="Transcribing audio chunks")):
            chunk_filename = f"temp_chunk_{i}.wav"

            start_time_ms = i * chunk_length_ms
            end_time_ms = min((i + 1) * chunk_length_ms, len(audio))
            audio_chunks_timestamps.append((start_time_ms, end_time_ms))

            try:
                # Apply smart per-chunk amplification
                if apply_per_chunk_amplification:
                    processed_chunk = self.smart_chunk_amplify(
                        chunk, max_boost_db=max_boost_db
                    )[0]
                else:
                    processed_chunk = chunk

                processed_chunk.export(chunk_filename, format="wav")

                with sr.AudioFile(chunk_filename) as source:
                    audio_data = recognizer.record(source)
                    try:
                        text = recognizer.recognize_google(audio_data, language="bn-BD")
                        chunk_text = self.add_punctuation(' '.join(text.split()))
                    except sr.UnknownValueError:
                        chunk_text = "[Unrecognized Audio]"
                        failed_chunks += 1
                    except sr.RequestError as e:
                        chunk_text = f"[Request Error: {e}]"
                        failed_chunks += 1

            except Exception as e:
                print(f"Error processing chunk {i}: {e}")
                chunk_text = f"[Processing Error: {e}]"
                failed_chunks += 1
            finally:
                # Best-effort cleanup; a failed removal must not disturb the
                # one-text-per-chunk alignment with the timestamp list.
                try:
                    os.remove(chunk_filename)
                except OSError:
                    pass

            # Exactly one entry per audio chunk, whatever happened above.
            all_chunks_text.append(chunk_text)

        if failed_chunks:
            print(f"{failed_chunks} of {len(chunks)} audio chunks could not be transcribed")

        # Step 2: Create sentence-level mapping with timestamps and carry-over handling
        print("üîÑ Step 2: Creating sentence-level timestamp mapping with carry-over handling...")

        sentence_data = []
        sentence_id = 1
        carry_over_text = ""  # For incomplete sentences across audio chunks
        carry_over_start_ms = 0
        last_chunk_idx = len(all_chunks_text) - 1

        for chunk_idx, (text, (start_ms, end_ms)) in enumerate(zip(all_chunks_text, audio_chunks_timestamps)):
            if text.startswith("["):
                # Error chunk - treat as one sentence, but handle carry-over first
                if carry_over_text.strip():
                    # Save carry-over as incomplete sentence before error chunk
                    sentence_data.append({
                        'sentence_id': sentence_id,
                        'text': carry_over_text.strip(),
                        'start_ms': carry_over_start_ms,
                        'end_ms': start_ms,  # End at start of error chunk
                        'chunk_duration_ms': start_ms - carry_over_start_ms,
                        'source_chunk_id': chunk_idx,
                        'is_error': False,
                        'is_complete': False
                    })
                    sentence_id += 1
                    carry_over_text = ""

                # Add error chunk
                sentence_data.append({
                    'sentence_id': sentence_id,
                    'text': text,
                    'start_ms': start_ms,
                    'end_ms': end_ms,
                    'chunk_duration_ms': end_ms - start_ms,
                    'source_chunk_id': chunk_idx + 1,
                    'is_error': True,
                    'is_complete': True
                })
                sentence_id += 1
                continue

            # Combine carry-over text with current chunk
            full_text = (carry_over_text + " " + text if carry_over_text else text).strip()

            sentences = split_into_sentences(full_text)
            if not sentences:
                # No sentences found, treat entire text as carry-over
                if not carry_over_text:
                    carry_over_start_ms = start_ms
                carry_over_text = full_text
                continue

            # The last sentence is complete when the text ends with a terminator.
            last_sentence_complete = bool(re.search(r'[‡•§?!]\s*$', full_text))

            if last_sentence_complete or chunk_idx == last_chunk_idx:
                complete_sentences = sentences
                new_carry_over = ""
            else:
                # Hold back the unterminated tail; with a single sentence this
                # leaves nothing complete for this chunk.
                complete_sentences = sentences[:-1]
                new_carry_over = sentences[-1]

            if complete_sentences:
                # The span starts where the carried-over text began, if any.
                actual_start_ms = carry_over_start_ms if carry_over_text else start_ms
                time_per_sentence = (end_ms - actual_start_ms) / len(complete_sentences)

                for sent_idx, sentence in enumerate(complete_sentences):
                    if sentence.strip():
                        sent_start_ms = actual_start_ms + (sent_idx * time_per_sentence)
                        sent_end_ms = actual_start_ms + ((sent_idx + 1) * time_per_sentence)

                        sentence_data.append({
                            'sentence_id': sentence_id,
                            'text': sentence.strip(),
                            'start_ms': int(sent_start_ms),
                            'end_ms': int(sent_end_ms),
                            'chunk_duration_ms': int(sent_end_ms - sent_start_ms),
                            'source_chunk_id': chunk_idx + 1,
                            'is_error': False,
                            'is_complete': True
                        })
                        sentence_id += 1

            carry_over_text = new_carry_over
            if carry_over_text:
                if complete_sentences:
                    carry_over_start_ms = int(actual_start_ms + (len(complete_sentences) * time_per_sentence))
                else:
                    carry_over_start_ms = start_ms

        if carry_over_text.strip():
            last_end_ms = audio_chunks_timestamps[-1][1] if audio_chunks_timestamps else carry_over_start_ms
            sentence_data.append({
                'sentence_id': sentence_id,
                'text': carry_over_text.strip(),
                'start_ms': carry_over_start_ms,
                'end_ms': last_end_ms,
                'chunk_duration_ms': last_end_ms - carry_over_start_ms,
                'source_chunk_id': len(all_chunks_text),
                'is_error': False,
                'is_complete': False
            })
        print(f"‚úÖ Created {len(sentence_data)} sentences with timestamps")

        # Step 3: Create token-based chunks with overlapping and carry-over handling
        print(f"üîÑ Step 3: Creating token-based chunks (max {max_tokens} tokens) with overlap and carry-over...")

        def count_tokens(text):
            """Token count per the summary tokenizer, else a ~4 chars/token estimate."""
            tokenizer = getattr(self, 'tokenizer', None)
            if tokenizer is None:
                # NOTE(review): the summary tokenizer is only set by
                # _load_summary_model, so this estimate is used whenever that
                # model has not been loaded yet - confirm this is intended.
                return len(text) // 4
            try:
                return len(tokenizer.encode(text))
            except Exception:
                return len(text) // 4

        def build_chunk(sentences, base_tokens, overlap_source):
            """Assemble one output chunk, prefixing the overlap sentence when it fits."""
            members = []
            has_overlap = False
            overlap_sentence_text = ""
            overlap_token_count = 0

            if overlap_source is not None:
                overlap_text = overlap_source['text']
                # Recorded even when the overlap turns out not to fit.
                overlap_token_count = count_tokens(overlap_text)
                if base_tokens + overlap_token_count <= max_tokens:
                    members.append({
                        'text': overlap_text,
                        'sentence_id': overlap_source['sentence_id'],
                        'start_ms': overlap_source['start_ms'],
                        'end_ms': overlap_source['end_ms']
                    })
                    has_overlap = True
                    overlap_sentence_text = overlap_text

            members.extend(sentences)
            chunk_text = '‡•§'.join([s['text'] for s in members]) + '‡•§'

            return {
                'text': chunk_text,
                'start_ms': members[0]['start_ms'],
                'end_ms': members[-1]['end_ms'],
                'token_count': count_tokens(chunk_text),
                'sentence_count': len(sentences),
                'first_sentence_id': sentences[0]['sentence_id'],
                'last_sentence_id': sentences[-1]['sentence_id'],
                'has_overlap': has_overlap,
                'overlap_sentence': overlap_sentence_text,
                'overlap_tokens': overlap_token_count,
                'base_tokens': base_tokens,
                'has_merged_sentences': any(s.get('merged_from_count', 0) > 1 for s in sentences)
            }

        token_based_chunks = []
        current_chunk_sentences = []
        current_chunk_tokens = 0
        previous_last_sentence = None  # last sentence of the most recently emitted chunk
        incomplete_sentence_buffer = []

        for sentence in sentence_data:
            is_complete = sentence.get('is_complete', True)

            # Incomplete fragments wait for the next complete (or error) sentence.
            if not is_complete and not sentence.get('is_error', False):
                incomplete_sentence_buffer.append(sentence)
                continue

            if incomplete_sentence_buffer:
                merged_text_parts = [s['text'] for s in incomplete_sentence_buffer] + [sentence['text']]
                sentence_to_process = {
                    'sentence_id': incomplete_sentence_buffer[0]['sentence_id'],
                    'text': ' '.join(merged_text_parts).strip(),
                    'start_ms': incomplete_sentence_buffer[0]['start_ms'],
                    'end_ms': sentence['end_ms'],
                    'is_complete': True,
                    'is_error': sentence.get('is_error', False),
                    'merged_from_count': len(incomplete_sentence_buffer) + 1
                }
                incomplete_sentence_buffer = []
            else:
                sentence_to_process = sentence

            sentence_tokens = count_tokens(sentence_to_process['text'])

            overlap_tokens = 0
            if previous_last_sentence is not None:
                overlap_tokens = count_tokens(previous_last_sentence['text'])

            total_tokens_needed = current_chunk_tokens + sentence_tokens + overlap_tokens
            oversized_alone = sentence_tokens + overlap_tokens > max_tokens

            if (total_tokens_needed > max_tokens and current_chunk_sentences) or oversized_alone:
                if current_chunk_sentences:
                    token_based_chunks.append(
                        build_chunk(current_chunk_sentences, current_chunk_tokens, previous_last_sentence)
                    )
                    previous_last_sentence = current_chunk_sentences[-1]

                current_chunk_sentences = [sentence_to_process]
                current_chunk_tokens = sentence_tokens

                if oversized_alone:
                    print(f"‚ö†Ô∏è Warning: Sentence {sentence_to_process.get('sentence_id', 'unknown')} with overlap exceeds max_tokens ({sentence_tokens + overlap_tokens} > {max_tokens})")
            else:
                current_chunk_sentences.append(sentence_to_process)
                current_chunk_tokens += sentence_tokens

        # Fragments left at the very end join the last chunk without a budget check.
        if incomplete_sentence_buffer:
            merged_text = ' '.join([s['text'] for s in incomplete_sentence_buffer]).strip()
            current_chunk_sentences.append({
                'sentence_id': incomplete_sentence_buffer[0]['sentence_id'],
                'text': merged_text,
                'start_ms': incomplete_sentence_buffer[0]['start_ms'],
                'end_ms': incomplete_sentence_buffer[-1]['end_ms'],
                'is_complete': False,
                'is_error': False,
                'merged_from_count': len(incomplete_sentence_buffer)
            })
            current_chunk_tokens += count_tokens(merged_text)

        if current_chunk_sentences:
            token_based_chunks.append(
                build_chunk(current_chunk_sentences, current_chunk_tokens, previous_last_sentence)
            )

        print(f"‚úÖ Created {len(token_based_chunks)} token-based chunks")

        # Step 4: Create DataFrames
        print("üìä Step 4: Preparing output data...")

        output_columns = [
            "Chunk_ID", "Start_Time", "End_Time", "Duration_MS", "Text",
            "Token_Count", "Sentence_Count", "First_Sentence_ID",
            "Last_Sentence_ID", "Has_Overlap", "Overlap_Sentence",
            "Overlap_Tokens", "Base_Tokens", "Has_Merged_Sentences",
        ]
        token_chunk_data = [
            {
                "Chunk_ID": i + 1,
                "Start_Time": self.format_timestamp(chunk["start_ms"]),
                "End_Time": self.format_timestamp(chunk["end_ms"]),
                "Duration_MS": chunk["end_ms"] - chunk["start_ms"],
                "Text": chunk["text"],
                "Token_Count": chunk["token_count"],
                "Sentence_Count": chunk["sentence_count"],
                "First_Sentence_ID": chunk["first_sentence_id"],
                "Last_Sentence_ID": chunk["last_sentence_id"],
                "Has_Overlap": chunk["has_overlap"],
                "Overlap_Sentence": chunk["overlap_sentence"],
                "Overlap_Tokens": chunk["overlap_tokens"],
                "Base_Tokens": chunk["base_tokens"],
                "Has_Merged_Sentences": chunk["has_merged_sentences"]
            }
            for i, chunk in enumerate(token_based_chunks)
        ]

        # Explicit columns keep the schema intact when nothing was transcribed.
        return pd.DataFrame(token_chunk_data, columns=output_columns)


    def generate_summary(self, text):
        try:
            self._load_summary_model()  # ensure summary model is loaded

            input_text = f"summarize: {text}"
            inputs = self.tokenizer(
                input_text,
                max_length=512,
                truncation=True,
                return_tensors="pt",
                padding="max_length"
            ).to(self.device)

            with torch.no_grad():
                summary_ids = self.model.generate(
                    inputs['input_ids'],
                    attention_mask=inputs['attention_mask'],
                    max_length=200,
                    min_length=100,
                    length_penalty=1.0,
                    num_beams=4,
                    no_repeat_ngram_size=3,
                    early_stopping=True,
                    pad_token_id=self.tokenizer.pad_token_id,
                    do_sample=False
                )

            summary = self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
            return summary if summary.strip() else text[:200] + "..."
        except Exception as e:
            print(f"Error generating summary: {e}")
            return text[:200] + "..." if len(text) > 200 else text

    def generate_title(self, summary):
        self._load_title_model()  # ensure title model is loaded

        # Handle long summaries
        tokens = self.tokenizer2.encode(summary, add_special_tokens=False)
        if len(tokens) > 1024:
            tokens = tokens[:1024]
            summary = self.tokenizer2.decode(tokens, skip_special_tokens=True)

        inputs = self.tokenizer2(summary, return_tensors="pt", padding=True, truncation=True, max_length=1024)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        with torch.no_grad():
            title_ids = self.model2.generate(inputs['input_ids'], num_beams=4, min_length=5, max_length=30)

        return self.tokenizer2.decode(title_ids[0], skip_special_tokens=True)

    def concatenate_pairs(self, df):
        """
        Merge consecutive rows of ``df`` two at a time.

        For each pair the texts are joined with a space, ``Start_Time`` comes
        from the first row and ``End_Time`` from the second; any other column
        (except ``Summary``) is copied from the first row. A trailing unpaired
        row is kept, with ``Text`` replaced by its ``Summary`` when present.

        In both cases ``Summary`` is used only when it is present and not
        NaN/None; otherwise ``Text`` is used.

        Args:
            df: DataFrame with ``Text`` and/or ``Summary`` columns, plus
                ``Start_Time`` / ``End_Time``.

        Returns:
            A new DataFrame whose ``Chunk_ID`` is ``combined_<n>`` for merged
            pairs and ``single_<n>`` for a trailing unpaired row.
        """
        def pick_text(row):
            """Prefer a usable Summary, fall back to Text."""
            summary = row.get('Summary')
            if summary is not None and pd.notna(summary):
                return summary
            return row.get('Text', '')

        handled_columns = {'Chunk_ID', 'Text', 'Start_Time', 'End_Time', 'Summary'}
        new_rows = []

        for i in range(0, len(df), 2):
            group_number = i // 2 + 1
            first = df.iloc[i]

            if i + 1 < len(df):
                # Pair exists - concatenate two rows
                second = df.iloc[i + 1]

                new_row = {
                    'Chunk_ID': f"combined_{group_number}",
                    'Text': f"{pick_text(first)} {pick_text(second)}",
                    'Start_Time': first.get('Start_Time', ''),
                    'End_Time': second.get('End_Time', '')
                }

                # Preserve any other columns from the first row
                for col in df.columns:
                    if col not in handled_columns:
                        new_row[col] = first.get(col, '')

                new_rows.append(new_row)
            else:
                # Odd row - keep as is
                row = first.copy()
                row['Chunk_ID'] = f"single_{group_number}"

                # Use Summary if available, otherwise Text
                if 'Summary' in row and pd.notna(row['Summary']):
                    row['Text'] = row['Summary']

                new_rows.append(row.to_dict())

        return pd.DataFrame(new_rows)

    # Function to count tokens
    def count_tokens(self, text, tokenizer):
        """Return the number of items ``tokenizer.encode`` yields for ``text``.

        The text is encoded with truncation and padding both disabled, so the
        count covers the whole input rather than a clipped or padded version.
        """
        token_ids = tokenizer.encode(text, padding=False, truncation=False)
        return len(token_ids)

    def word_count(self, text):
        """Count whitespace-separated words in ``text``.

        Missing values (``None``/NaN, as judged by ``pd.isna``) count as 0.
        Anything else is converted with ``str()`` before splitting.
        """
        return 0 if pd.isna(text) else len(str(text).split())

    def clean_text(self, text):
        """Replace bracketed ASCII tags in ``text`` with a single space.

        A tag is ``[`` followed by one or more ASCII letters, spaces or
        underscores and a closing ``]``. Missing values (``None``/NaN, as
        judged by ``pd.isna``) yield an empty string; other inputs are
        converted with ``str()`` first.
        """
        if not pd.isna(text):
            tag_pattern = r"\[[a-zA-Z _]+\]"
            return re.sub(tag_pattern, " ", str(text))
        return ""





    def process_complete_pipeline(self, video_file_path, temp_dir, max_chunks=50, job_id=None,top_k=20):
        try:
            # File paths
            audio_path = os.path.join(temp_dir, "extracted_audio.wav")
            transcription_path = os.path.join(temp_dir, "transcription.xlsx")
            summary_path = os.path.join(temp_dir, "summaries.xlsx")
            final_text_path = os.path.join(temp_dir, "final_summary.txt")

            def update_status(progress, message):
                """Helper function to update both local results and global status"""
                if job_id and job_id in processing_status:
                    processing_status[job_id].update({
                        'progress': progress,
                        'message': message,
                        'status': 'processing'
                    })

            # Step 1: Extract audio
            update_status(10, 'Extracting audio from video...')
            self.extract_audio_from_video(video_file_path, audio_path)

            # Step 2: Transcribe
            update_status(30, 'Transcribing audio...')
            transcription_df = self.transcribe_audio(audio_path)
            transcription_df["Text"] = transcription_df["Text"].apply(self.clean_text)

            # Save transcription
            transcription_df.to_excel(transcription_path, index=False)

            # Step 3: Generate summaries with recursive logic
            update_status(50, 'Processing summaries...')

            # Check if we need recursive summarization
            if len(transcription_df) > max_chunks:
                update_status(55, f'Input has {len(transcription_df)} chunks (>{max_chunks}). Starting recursive summarization...')
                print(f"‚ö° Input has {len(transcription_df)} rows (>{max_chunks}). Starting recursive summarization...")

                # Perform recursive summarization
                final_df, total_iterations = self.recursive_summarization_with_concatenation(
                    transcription_df, max_chunks, job_id
                )

                update_status(75, f'Completed recursive summarization in {total_iterations} iterations')
                print(f"üéØ Completed recursive summarization in {total_iterations} iterations")

            else:
                update_status(60, f'Input has ‚â§{max_chunks} chunks. Generating summaries directly...')
                print(f"üìÑ Input has ‚â§{max_chunks} rows. Generating summaries directly...")

                # Direct summarization for smaller datasets
                transcription_df['Final_Summary'] = transcription_df['Text'].apply(
                    lambda x: self.generate_summary(str(x)) if not str(x).startswith('[') else x
                )
                final_df = transcription_df
                total_iterations = 1

            # Step 4: Save final summaries
            update_status(80, 'Saving summary results...')

            # Ensure Final_Summary column exists
            if 'Final_Summary' not in final_df.columns:
                final_df['Final_Summary'] = final_df['Text'].apply(
                    lambda x: self.generate_summary(str(x)) if not str(x).startswith('[') else x
                )

            # Save summaries
            final_df.to_excel(summary_path, index=False)
            print(f"DataFrame shape: {final_df.shape}")
            print(f"Number of rows: {len(final_df)}")
            total_text_words = final_df["Text"].apply(self.word_count).sum()
            total_summary_words = final_df["Final_Summary"].apply(self.word_count).sum()
            compression_ratio = total_summary_words / total_text_words if total_text_words > 0 else 0
            # Step 5: Create final text with timestamps
            update_status(90, 'Creating final summary document...')

            tmp_text=""
            for index, row in final_df.iterrows():

                summary = row.get('Final_Summary', row.get('Text', ''))
                tmp_text += f"{summary}"



            TARGET_FOLDER = "/content/drive/MyDrive/tfidf/Concatenated_Summary"
            EXCEL_PATH = "/content/drive/MyDrive/tfidf/all_document_combined_keywords.xlsx"

            tmp_text=tmp_text.strip()
            title=self.generate_title(tmp_text)
            SOURCE_FILE_PATH="/content/drive/MyDrive/tfidf/new.txt"

            with open(SOURCE_FILE_PATH, 'w', encoding='utf-8') as file:
              file.write(tmp_text)  # Or tmp_text if that's what you're using

            print(f"File saved successfully to: {SOURCE_FILE_PATH}")

            source_filename = os.path.basename(SOURCE_FILE_PATH)
            target_file_path = os.path.join(TARGET_FOLDER, source_filename)

            try:
              shutil.copy2(SOURCE_FILE_PATH, target_file_path)
              print(f"‚úÖ File copied to: {target_file_path}")

            except Exception as e:
              print(f"‚ùå Error copying file: {e}")
              exit()

            try:
              extractor = enhanced_combined_main()  # This will process the TARGET_FOLDER
              if extractor is None:
                print("‚ùå enhanced_combined_main() failed")
                exit()
            except Exception as e:
              print(f"‚ùå Error in enhanced_combined_main(): {e}")
              exit()







            try:
              if not os.path.exists(EXCEL_PATH):
                print(f"‚ùå Excel file not found: {EXCEL_PATH}")
                exit()

              df = pd.read_excel(EXCEL_PATH)
              target_filename = source_filename  # This already has extension like doc1.txt
              matching_rows = df[df['Document_Name'] == target_filename]
              if matching_rows.empty:
                unique_docs = df['Document_Name'].unique()
                for i, doc in enumerate(unique_docs[:10]):
                  print(f"   {i+1}. {doc}")
                if len(unique_docs) > 10:
                  print(f"   ... and {len(unique_docs) - 10} more")
              else:
                matching_rows = matching_rows.sort_values('Rank')
                keywords = matching_rows['Keyword'].tolist()
                print("Extracted Keywords:")
                for keyword in keywords:
                  print(keyword)
                doc_name = os.path.splitext(target_filename)[0]
                output_filename = f"{doc_name}_combined_keywords.xlsx"
                output_path = os.path.join("/content/drive/MyDrive/tfidf/", output_filename)
                with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
                  matching_rows.to_excel(writer, sheet_name='Keywords', index=False)
                  decision_summary = matching_rows.groupby(['Term_Type', 'Decision']).size().reset_index(name='Count')
                  decision_summary.to_excel(writer, sheet_name='Decision_Summary', index=False)
                  pmi_analysis = matching_rows[matching_rows['PMI'] != 0].sort_values('PMI', ascending=False)
                  if not pmi_analysis.empty:
                    pmi_analysis.to_excel(writer, sheet_name='PMI_Analysis', index=False)
                print(f"üíæ Combined keywords saved to: {output_path}")
            except Exception as e:
              print(f"‚ùå Error extracting keywords: {e}")
            try:
              os.remove(target_file_path)
            except Exception as e:
              print(f"‚ö†Ô∏è  Warning: Could not remove file: {e}")

            keywords_text = ", ".join(keywords)
            final_text = f"Title: {title}\n\n"
            final_text += f"Important Keywords: {keywords_text}\n\n"
            final_text += f"Compression Ratio: {compression_ratio:.4f}\n\n"
            for index, row in final_df.iterrows():
                start_time = row.get('Start_Time', '')
                end_time = row.get('End_Time', '')
                summary = row.get('Final_Summary', row.get('Text', ''))
                chunk = f"[{start_time} - {end_time}]\n{summary}\n\n"
                print(f"Row {index}: {repr(chunk)}")

                final_text += chunk


            # 2. Define a dictionary of regex patterns and replacements
                regex_patterns = {
    r"‡¶ü‡ßÅ ‡¶¶‡¶ø (‡¶™‡¶æ‡¶ì‡ßü‡¶æ‡¶∞|‡¶™‡¶æ‡¶ì‡¶Ø‡¶º‡¶æ‡¶∞)|‡¶ü‡ßÅ ‡¶¶‡¶æ (‡¶™‡¶æ‡¶ì‡ßü‡¶æ‡¶∞|‡¶™‡¶æ‡¶ì‡¶Ø‡¶º‡¶æ‡¶∞)": "^",     # Replace "‡¶ü‡ßÅ ‡¶¶‡¶ø ‡¶™‡¶æ‡¶ì‡ßü‡¶æ‡¶∞" with "^"
    r"‡¶∏‡¶Æ‡¶æ‡¶® ‡¶∏‡¶Æ‡¶æ‡¶®|‡¶á‡¶ï‡ßÅ‡¶Ø‡¶º‡¶æ‡¶≤‡¶∏ ‡¶ü‡ßÅ|‡¶á‡¶ï‡ßÅ‡ßü‡¶æ‡¶≤‡¶∏ ‡¶ü‡ßÅ|‡¶á‡¶ï‡ßÅ‡¶Ø‡¶º‡¶æ‡¶≤ ‡¶ü‡ßÅ|‡¶á‡¶ï‡ßÅ‡ßü‡¶æ‡¶≤ ‡¶ü‡ßÅ": "=",

                 # Replace "‡¶¶‡¶∂" and "‡¶ü‡ßá‡¶®" with "10"
    r"‡¶è‡¶ó‡¶æ‡¶∞‡ßã|‡¶è‡¶≤‡ßá‡¶≠‡ßá‡¶®": "11",      # Replace "‡¶è‡¶ó‡¶æ‡¶∞‡ßã" and "‡¶è‡¶≤‡ßá‡¶≠‡ßá‡¶®" with "11"
    r"‡¶¨‡¶æ‡¶∞‡ßã|‡¶ü‡ßÅ‡¶Ø‡¶º‡ßá‡¶≤‡¶≠|‡¶ü‡ßÅ‡ßü‡ßá‡¶≤‡¶≠": "12",        # Replace "‡¶¨‡¶æ‡¶∞‡ßã" and "‡¶ü‡ßÅ‡ßü‡ßá‡¶≤‡¶≠" with "12"
    r"‡¶§‡ßá‡¶∞‡ßã|‡¶•‡¶æ‡¶∞‡ßç‡¶ü‡¶ø‡¶®": "13",      # Replace "‡¶§‡ßá‡¶∞‡ßã" and "‡¶•‡¶æ‡¶∞‡ßç‡¶ü‡¶ø‡¶®" with "13"
    r"‡¶ö‡ßã‡¶¶‡ßç‡¶¶‡ßã|‡¶´‡ßã‡¶∞‡ßç‡¶ü‡¶ø‡¶®": "14",    # Replace "‡¶ö‡ßã‡¶¶‡ßç‡¶¶‡ßã" and "‡¶´‡ßã‡¶∞‡ßç‡¶ü‡¶ø‡¶®" with "14"
    r"‡¶™‡¶®‡ßá‡¶∞‡ßã|‡¶´‡¶ø‡¶´‡¶ü‡¶ø‡¶®": "15",      # Replace "‡¶™‡¶®‡ßá‡¶∞‡ßã" and "‡¶´‡¶ø‡¶´‡¶ü‡¶ø‡¶®" with "15"
    r"‡¶∑‡ßã‡¶≤‡ßã|‡¶∏‡¶ø‡¶ï‡ßç‡¶∏‡¶ü‡¶ø‡¶®": "16",     # Replace "‡¶∑‡ßã‡¶≤‡ßã" and "‡¶∏‡¶ø‡¶ï‡ßç‡¶∏‡¶ü‡¶ø‡¶®" with "16"
    r"‡¶∏‡¶§‡ßá‡¶∞‡ßã|‡¶∏‡ßá‡¶≠‡ßá‡¶®‡ßç‡¶ü‡¶ø‡¶®": "17",   # Replace "‡¶∏‡¶§‡ßá‡¶∞‡ßã" and "‡¶∏‡ßá‡¶≠‡ßá‡¶®‡ßç‡¶ü‡¶ø‡¶®" with "17"
    r"‡¶Ü‡¶†‡¶æ‡¶∞‡ßã|‡¶è‡¶á‡¶ü‡¶ø‡¶®": "18",       # Replace "‡¶Ü‡¶†‡¶æ‡¶∞‡ßã" and "‡¶è‡¶á‡¶ü‡¶ø‡¶®" with "18"
    r"‡¶ä‡¶®‡¶ø‡¶∂|‡¶®‡¶æ‡¶á‡¶®‡¶ü‡¶ø‡¶®": "19",       # Replace "‡¶ä‡¶®‡¶ø‡¶∂" and "‡¶®‡¶ø‡¶®‡¶ü‡¶ø‡¶®" with "19"

    r"‡¶è‡¶ï‡ßÅ‡¶∂|(‡¶ü‡ßÅ‡¶Ø‡¶º‡ßá‡¶®‡ßç‡¶ü‡¶ø|‡¶ü‡ßÅ‡ßü‡ßá‡¶®‡ßç‡¶ü‡¶ø) (‡¶ì‡¶Ø‡¶º‡¶æ‡¶®|‡¶ì‡ßü‡¶æ‡¶®)": "21",# Replace "‡¶è‡¶ï‡ßÅ‡¶∂" and "‡¶ü‡ßÅ‡ßü‡ßá‡¶®‡ßç‡¶ü‡¶ø ‡¶ì‡ßü‡¶æ‡¶®" with "21"
    r"‡¶¨‡¶æ‡¶á‡¶∂|(‡¶ü‡ßÅ‡¶Ø‡¶º‡ßá‡¶®‡ßç‡¶ü‡¶ø|‡¶ü‡ßÅ‡ßü‡ßá‡¶®‡ßç‡¶ü‡¶ø) ‡¶ü‡ßÅ": "22",  # Replace "‡¶¨‡¶æ‡¶á‡¶∂" and "‡¶ü‡ßÅ‡ßü‡ßá‡¶®‡ßç‡¶ü‡¶ø ‡¶ü‡ßÅ" with "22"
    r"‡¶§‡ßá‡¶á‡¶∂|(‡¶ü‡ßÅ‡¶Ø‡¶º‡ßá‡¶®‡ßç‡¶ü‡¶ø|‡¶ü‡ßÅ‡ßü‡ßá‡¶®‡ßç‡¶ü‡¶ø) ‡¶•‡ßç‡¶∞‡¶ø": "23",# Replace "‡¶§‡ßá‡¶á‡¶∂" and "‡¶ü‡ßÅ‡ßü‡ßá‡¶®‡ßç‡¶ü‡¶ø ‡¶•‡ßç‡¶∞‡¶ø" with "23"
    r"‡¶ö‡¶¨‡ßç‡¶¨‡¶ø‡¶∂|(‡¶ü‡ßÅ‡¶Ø‡¶º‡ßá‡¶®‡ßç‡¶ü‡¶ø|‡¶ü‡ßÅ‡ßü‡ßá‡¶®‡ßç‡¶ü‡¶ø) ‡¶´‡ßã‡¶∞": "24",# Replace "‡¶ö‡¶¨‡ßç‡¶¨‡¶ø‡¶∂" and "‡¶ü‡ßÅ‡ßü‡ßá‡¶®‡ßç‡¶ü‡¶ø ‡¶´‡ßã‡¶∞" with "24"
    r"‡¶™‡¶Å‡¶ö‡¶ø‡¶∂|(‡¶ü‡ßÅ‡¶Ø‡¶º‡ßá‡¶®‡ßç‡¶ü‡¶ø|‡¶ü‡ßÅ‡ßü‡ßá‡¶®‡ßç‡¶ü‡¶ø) ‡¶´‡¶æ‡¶á‡¶≠": "25",# Replace "‡¶™‡¶Å‡¶á‡¶§‡ßç‡¶∞‡¶ø‡¶∂" and "‡¶•‡¶æ‡¶∞‡ßç‡¶ü‡¶ø ‡¶´‡¶æ‡¶á‡¶≠" with "25"
    r"‡¶õ‡¶æ‡¶¨‡ßç‡¶¨‡¶ø‡¶∂|(‡¶ü‡ßÅ‡¶Ø‡¶º‡ßá‡¶®‡ßç‡¶ü‡¶ø|‡¶ü‡ßÅ‡ßü‡ßá‡¶®‡ßç‡¶ü‡¶ø) ‡¶∏‡¶ø‡¶ï‡ßç‡¶∏": "26",# Replace "‡¶õ‡¶§‡ßç‡¶∞‡¶ø‡¶∂" and "‡¶•‡¶æ‡¶∞‡ßç‡¶ü‡¶ø ‡¶∏‡¶ø‡¶ï‡ßç‡¶∏" with "26"
    r"‡¶∏‡¶æ‡¶§‡¶æ‡¶∂|(‡¶ü‡ßÅ‡¶Ø‡¶º‡ßá‡¶®‡ßç‡¶ü‡¶ø|‡¶ü‡ßÅ‡ßü‡ßá‡¶®‡ßç‡¶ü‡¶ø) ‡¶∏‡ßá‡¶≠‡ßá‡¶®": "27",# Replace "‡¶∏‡¶æ‡¶§‡¶§‡ßç‡¶∞‡¶ø‡¶∂" and "‡¶•‡¶æ‡¶∞‡ßç‡¶ü‡¶ø ‡¶∏‡ßá‡¶≠‡ßá‡¶®" with "27"
    r"‡¶Ü‡¶ü‡¶æ‡¶∂|(‡¶ü‡ßÅ‡¶Ø‡¶º‡ßá‡¶®‡ßç‡¶ü‡¶ø|‡¶ü‡ßÅ‡ßü‡ßá‡¶®‡ßç‡¶ü‡¶ø) ‡¶è‡¶á‡¶ü": "28",# Replace "‡¶Ü‡¶ü‡¶§‡ßç‡¶∞‡¶ø‡¶∂" and "‡¶•‡¶æ‡¶∞‡ßç‡¶ü‡¶ø ‡¶è‡¶á‡¶ü" with "28"
    r"‡¶ä‡¶®‡¶§‡ßç‡¶∞‡¶ø‡¶∂|(‡¶ü‡ßÅ‡¶Ø‡¶º‡ßá‡¶®‡ßç‡¶ü‡¶ø|‡¶ü‡ßÅ‡ßü‡ßá‡¶®‡ßç‡¶ü‡¶ø) ‡¶®‡¶æ‡¶á‡¶®": "29",# Replace "‡¶ä‡¶®‡¶ö‡¶≤‡ßç‡¶≤‡¶ø‡¶∂" and "‡¶•‡¶æ‡¶∞‡ßç‡¶ü‡¶ø ‡¶®‡¶æ‡¶á‡¶®" with "29"

    r"‡¶è‡¶ï‡¶§‡ßç‡¶∞‡¶ø‡¶∂|‡¶•‡¶æ‡¶∞‡ßç‡¶ü‡¶ø (‡¶ì‡¶Ø‡¶º‡¶æ‡¶®|‡¶ì‡ßü‡¶æ‡¶®)": "31",# Replace "‡¶è‡¶ï‡¶§‡ßç‡¶∞‡¶ø‡¶∂" and "‡¶•‡¶æ‡¶∞‡ßç‡¶ü‡¶ø ‡¶ì‡ßü‡¶æ‡¶®" with "31"
    r"‡¶¨‡¶§‡ßç‡¶∞‡¶ø‡¶∂|‡¶•‡¶æ‡¶∞‡ßç‡¶ü‡¶ø ‡¶ü‡ßÅ": "32",   # Replace "‡¶¨‡¶§‡ßç‡¶∞‡¶ø‡¶∂" and "‡¶•‡¶æ‡¶∞‡ßç‡¶ü‡¶ø ‡¶ü‡ßÅ" with "32"
    r"‡¶§‡ßá‡¶§‡ßç‡¶∞‡¶ø‡¶∂|‡¶•‡¶æ‡¶∞‡ßç‡¶ü‡¶ø ‡¶•‡ßç‡¶∞‡¶ø": "33",# Replace "‡¶§‡ßá‡¶§‡ßç‡¶∞‡¶ø‡¶∂" and "‡¶•‡¶æ‡¶∞‡ßç‡¶ü‡¶ø ‡¶•‡ßç‡¶∞‡¶ø" with "33"
    r"‡¶ö‡ßå‡¶§‡ßç‡¶∞‡¶ø‡¶∂|‡¶•‡¶æ‡¶∞‡ßç‡¶ü‡¶ø ‡¶´‡ßã‡¶∞": "34",# Replace "‡¶ö‡¶§‡ßç‡¶∞‡¶ø‡¶∂" and "‡¶•‡¶æ‡¶∞‡ßç‡¶ü‡¶ø ‡¶´‡ßã‡¶∞" with "34"
    r"‡¶™‡¶Å‡¶á‡ßü‡¶§‡ßç‡¶∞‡¶ø‡¶∂|‡¶™‡¶Å‡¶á‡¶Ø‡¶º‡¶§‡ßç‡¶∞‡¶ø‡¶∂|‡¶•‡¶æ‡¶∞‡ßç‡¶ü‡¶ø ‡¶´‡¶æ‡¶á‡¶≠": "35",# Replace "‡¶™‡¶Å‡¶á‡¶§‡ßç‡¶∞‡¶ø‡¶∂" and "‡¶•‡¶æ‡¶∞‡ßç‡¶ü‡¶ø ‡¶´‡¶æ‡¶á‡¶≠" with "35"
    r"‡¶õ‡¶§‡ßç‡¶∞‡¶ø‡¶∂|‡¶•‡¶æ‡¶∞‡ßç‡¶ü‡¶ø ‡¶∏‡¶ø‡¶ï‡ßç‡¶∏": "36",# Replace "‡¶õ‡¶§‡ßç‡¶∞‡¶ø‡¶∂" and "‡¶•‡¶æ‡¶∞‡ßç‡¶ü‡¶ø ‡¶∏‡¶ø‡¶ï‡ßç‡¶∏" with "36"
    r"‡¶∏‡¶æ‡¶á‡¶§‡ßç‡¶∞‡¶ø‡¶∂|‡¶•‡¶æ‡¶∞‡ßç‡¶ü‡¶ø ‡¶∏‡ßá‡¶≠‡ßá‡¶®": "37",# Replace "‡¶∏‡¶æ‡¶§‡¶§‡ßç‡¶∞‡¶ø‡¶∂" and "‡¶•‡¶æ‡¶∞‡ßç‡¶ü‡¶ø ‡¶∏‡ßá‡¶≠‡ßá‡¶®" with "37"
    r"‡¶Ü‡¶ü‡¶§‡ßç‡¶∞‡¶ø‡¶∂|‡¶•‡¶æ‡¶∞‡ßç‡¶ü‡¶ø ‡¶è‡¶á‡¶ü": "38",# Replace "‡¶Ü‡¶ü‡¶§‡ßç‡¶∞‡¶ø‡¶∂" and "‡¶•‡¶æ‡¶∞‡ßç‡¶ü‡¶ø ‡¶è‡¶á‡¶ü" with "38"
    r"‡¶ä‡¶®‡¶ö‡¶≤‡ßç‡¶≤‡¶ø‡¶∂|‡¶•‡¶æ‡¶∞‡ßç‡¶ü‡¶ø ‡¶®‡¶æ‡¶á‡¶®": "39",# Replace "‡¶ä‡¶®‡¶™‡¶Å‡¶á‡¶§‡ßç‡¶∞‡¶ø‡¶∂" and "‡¶•‡¶æ‡¶∞‡ßç‡¶ü‡¶ø ‡¶®‡¶æ‡¶á‡¶®" with "39"

    r"‡¶è‡¶ï‡¶ö‡¶≤‡ßç‡¶≤‡¶ø‡¶∂|‡¶´‡¶∞‡¶ü‡¶ø (‡¶ì‡¶Ø‡¶º‡¶æ‡¶®|‡¶ì‡ßü‡¶æ‡¶®)": "41",# Replace "‡¶è‡¶ï‡¶§‡¶≤‡ßç‡¶≤‡¶ø‡¶∂" and "‡¶´‡¶∞‡¶ü‡¶ø ‡¶ì‡ßü‡¶æ‡¶®" with "41"
    r"‡¶¨‡¶ø‡ßü‡¶æ‡¶≤‡ßç‡¶≤‡¶ø‡¶∂|‡¶¨‡¶ø‡¶Ø‡¶º‡¶æ‡¶≤‡ßç‡¶≤‡¶ø‡¶∂|‡¶´‡¶∞‡¶ü‡¶ø ‡¶ü‡ßÅ": "42",# Replace "‡¶¨‡¶ø‡ßü‡¶æ‡¶≤‡ßç‡¶≤‡¶ø‡¶∂" and "‡¶´‡¶∞‡¶ü‡¶ø ‡¶ü‡ßÅ" with "42"
    r"‡¶§‡ßá‡¶§‡¶æ‡¶≤‡ßç‡¶≤‡¶ø‡¶∂|‡¶´‡¶∞‡¶ü‡¶ø ‡¶•‡ßç‡¶∞‡¶ø": "43",# Replace "‡¶§‡ßá‡¶§‡¶æ‡¶≤‡ßç‡¶≤‡¶ø‡¶∂" and "‡¶´‡¶∞‡¶ü‡¶ø ‡¶•‡ßç‡¶∞‡¶ø" with "43"
    r"‡¶ö‡ßÅ‡ßü‡¶æ‡¶≤‡ßç‡¶≤‡¶ø‡¶∂|‡¶ö‡ßÅ‡¶Ø‡¶º‡¶æ‡¶≤‡ßç‡¶≤‡¶ø‡¶∂|‡¶´‡¶∞‡¶ü‡¶ø ‡¶´‡ßã‡¶∞": "44",# Replace "‡¶ö‡ßÅ‡ßü‡¶æ‡¶≤‡ßç‡¶≤‡¶ø‡¶∂" and "‡¶´‡¶∞‡¶ü‡¶ø ‡¶´‡ßã‡¶∞" with "44"
    r"‡¶™‡¶Å‡ßü‡¶§‡¶æ‡¶≤‡ßç‡¶≤‡¶ø‡¶∂|‡¶™‡¶Å‡¶Ø‡¶º‡¶§‡¶æ‡¶≤‡ßç‡¶≤‡¶ø‡¶∂|‡¶´‡¶∞‡¶ü‡¶ø ‡¶´‡¶æ‡¶á‡¶≠": "45",# Replace "‡¶™‡¶Å‡¶á‡¶§‡ßç‡¶∞‡¶æ‡¶≤‡ßç‡¶≤‡¶ø‡¶∂" and "‡¶´‡¶∞‡¶ü‡¶ø ‡¶´‡¶æ‡¶á‡¶≠" with "45"
    r"‡¶õ‡ßá‡¶ö‡¶≤‡ßç‡¶≤‡¶ø‡¶∂|‡¶´‡¶∞‡¶ü‡¶ø ‡¶∏‡¶ø‡¶ï‡ßç‡¶∏": "46",     # Replace "‡¶õ‡ßá‡¶∑‡¶ü‡ßç‡¶ü‡¶ø" and "‡¶´‡¶ø‡¶´‡¶ü‡¶ø" with "46"
    r"‡¶∏‡¶æ‡¶§‡¶ö‡¶≤‡ßç‡¶≤‡¶ø‡¶∂|‡¶´‡¶∞‡¶ü‡¶ø ‡¶∏‡ßá‡¶≠‡ßá‡¶®": "47",# Replace "‡¶∏‡¶æ‡¶§‡¶∑‡¶ü‡ßç‡¶ü‡¶ø" and "‡¶´‡¶ø‡¶´‡¶ü‡¶ø ‡¶ì‡ßü‡¶æ‡¶®" with "47"
    r"‡¶Ü‡¶ü‡¶ö‡¶≤‡ßç‡¶≤‡¶ø‡¶∂|‡¶´‡¶∞‡¶ü‡¶ø ‡¶è‡¶á‡¶ü": "48",  # Replace "‡¶Ü‡¶ü‡¶∑‡¶ü‡ßç‡¶ü‡¶ø" and "‡¶´‡¶ø‡¶´‡¶ü‡¶ø ‡¶ü‡ßÅ" with "48"
    r"‡¶ä‡¶®‡¶™‡¶û‡ßç‡¶ö‡¶æ‡¶∂|‡¶´‡¶∞‡¶ü‡¶ø ‡¶®‡¶æ‡¶á‡¶®": "49",# Replace "‡¶ä‡¶®‡¶®‡¶¨‡ßç‡¶¨‡¶á" and "‡¶´‡¶ø‡¶´‡¶ü‡¶ø ‡¶•‡ßç‡¶∞‡¶ø" with "49"

    r"‡¶è‡¶ï‡¶æ‡¶®‡ßç‡¶®|‡¶´‡¶ø‡¶´‡¶ü‡¶ø (‡¶ì‡¶Ø‡¶º‡¶æ‡¶®|‡¶ì‡ßü‡¶æ‡¶®)": "51", # Replace "‡¶è‡¶ï‡¶æ‡¶®‡ßç‡¶®" and "‡¶´‡¶ø‡¶´‡¶ü‡¶ø ‡¶´‡¶æ‡¶á‡¶≠" with "51"
    r"‡¶¨‡¶æ‡ßü‡¶æ‡¶®‡ßç‡¶®|‡¶´‡¶ø‡¶´‡¶ü‡¶ø ‡¶ü‡ßÅ": "52",# Replace "‡¶¨‡¶æ‡ßü‡¶æ‡¶®‡ßç‡¶®" and "‡¶´‡¶ø‡¶´‡¶ü‡¶ø ‡¶∏‡¶ø‡¶ï‡ßç‡¶∏" with "52"
    r"‡¶§‡¶ø‡¶™‡ßç‡¶™‡¶æ‡¶®‡ßç‡¶®|‡¶´‡¶ø‡¶´‡¶ü‡¶ø ‡¶•‡ßç‡¶∞‡¶ø": "53",# Replace "‡¶§‡¶ø‡¶™‡ßç‡¶™‡¶æ‡¶®‡ßç‡¶®" and "‡¶´‡¶ø‡¶´‡¶ü‡¶ø ‡¶∏‡ßá‡¶≠‡ßá‡¶®" with "53"
    r"‡¶ö‡ßÅ‡ßü‡¶æ‡¶®‡ßç‡¶®|‡¶ö‡ßÅ‡¶Ø‡¶º‡¶æ‡¶®‡ßç‡¶®|‡¶´‡¶ø‡¶´‡¶ü‡¶ø ‡¶´‡ßã‡¶∞": "54",# Replace "‡¶ö‡ßÅ‡ßü‡¶æ‡¶®‡ßç‡¶®" and "‡¶´‡¶ø‡¶´‡¶ü‡¶ø ‡¶è‡¶á‡¶ü" with "54"
    r"‡¶™‡¶û‡ßç‡¶ö‡¶æ‡¶®‡ßç‡¶®|‡¶´‡¶ø‡¶´‡¶ü‡¶ø ‡¶´‡¶æ‡¶á‡¶≠": "55",# Replace "‡¶™‡¶Å‡¶á‡¶™‡¶Å‡¶ö‡¶æ‡¶®‡ßç‡¶®" and "‡¶´‡¶ø‡¶´‡¶ü‡¶ø ‡¶®‡¶æ‡¶á‡¶®" with "55"
    r"‡¶õ‡¶æ‡¶™‡ßç‡¶™‡¶æ‡¶®‡ßç‡¶®|‡¶´‡¶ø‡¶´‡¶ü‡¶ø ‡¶∏‡¶ø‡¶ï‡ßç‡¶∏": "56",
    r"‡¶∏‡¶æ‡¶§‡¶æ‡¶®‡ßç‡¶®|‡¶´‡¶ø‡¶´‡¶ü‡¶ø ‡¶∏‡ßá‡¶≠‡ßá‡¶®": "57",
    r"‡¶è‡¶ü‡¶æ‡¶®‡ßç‡¶®|‡¶´‡¶ø‡¶´‡¶ü‡¶ø ‡¶è‡¶á‡¶ü": "58",
    r"‡¶ä‡¶®‡¶∑‡¶æ‡¶ü|‡¶´‡¶ø‡¶´‡¶ü‡¶ø ‡¶®‡¶æ‡¶á‡¶®": "59",

    r"‡¶è‡¶ï‡¶∑‡¶ü‡ßç‡¶ü‡¶ø|‡¶∏‡¶ø‡¶ï‡ßç‡¶∏‡¶ü‡¶ø (‡¶ì‡¶Ø‡¶º‡¶æ‡¶®|‡¶ì‡ßü‡¶æ‡¶®)": "61",    # Replace "‡¶è‡¶ï‡¶∑‡¶ü‡ßç‡¶ü‡¶ø" and "‡¶∏‡ßá‡¶≠‡ßá‡¶®‡¶ü‡¶ø" with "61"
    r"‡¶¨‡¶æ‡¶∑‡¶ü‡ßç‡¶ü‡¶ø|‡¶∏‡¶ø‡¶ï‡ßç‡¶∏‡¶ü‡¶ø ‡¶ü‡ßÅ": "62",# Replace "‡¶¨‡¶æ‡¶∑‡¶ü‡ßç‡¶ü‡¶ø" and "‡¶∏‡ßá‡¶≠‡ßá‡¶®‡¶ü‡¶ø ‡¶ì‡ßü‡¶æ‡¶®" with "62"
    r"‡¶§‡ßá‡¶∑‡¶ü‡ßç‡¶ü‡¶ø|‡¶∏‡¶ø‡¶ï‡ßç‡¶∏‡¶ü‡¶ø ‡¶•‡ßç‡¶∞‡¶ø": "63",
    r"‡¶ö‡ßå‡¶∑‡¶ü‡ßç‡¶ü‡¶ø|‡¶∏‡¶ø‡¶ï‡ßç‡¶∏‡¶ü‡¶ø ‡¶´‡ßã‡¶∞": "64",
    r"‡¶™‡¶Å‡¶á‡¶∑‡¶ü‡ßç‡¶ü‡¶ø|‡¶∏‡¶ø‡¶ï‡ßç‡¶∏‡¶ü‡¶ø ‡¶´‡¶æ‡¶á‡¶≠": "65",
    r"‡¶õ‡ßá‡¶∑‡¶ü‡ßç‡¶ü‡¶ø|‡¶∏‡¶ø‡¶ï‡ßç‡¶∏‡¶ü‡¶ø ‡¶∏‡¶ø‡¶ï‡ßç‡¶∏": "66",
    r"‡¶∏‡¶æ‡¶§‡¶∑‡¶ü‡ßç‡¶ü‡¶ø|‡¶∏‡¶ø‡¶ï‡ßç‡¶∏‡¶ü‡¶ø ‡¶∏‡ßá‡¶≠‡ßá‡¶®": "67",
    r"‡¶Ü‡¶ü‡¶∑‡¶ü‡ßç‡¶ü‡¶ø|‡¶∏‡¶ø‡¶ï‡ßç‡¶∏‡¶ü‡¶ø ‡¶è‡¶á‡¶ü": "68",
    r"‡¶ä‡¶®‡¶∏‡¶§‡ßç‡¶§‡¶∞|‡¶∏‡¶ø‡¶ï‡ßç‡¶∏‡¶ü‡¶ø ‡¶®‡¶æ‡¶á‡¶®": "69",

    r"‡¶è‡¶ï‡¶æ‡¶§‡ßç‡¶§‡¶∞|‡¶∏‡ßá‡¶≠‡ßá‡¶®‡ßç‡¶ü‡¶ø (‡¶ì‡¶Ø‡¶º‡¶æ‡¶®|‡¶ì‡ßü‡¶æ‡¶®)": "71",
    r"‡¶¨‡¶æ‡¶π‡¶æ‡¶§‡ßç‡¶§‡¶∞|‡¶∏‡ßá‡¶≠‡ßá‡¶®‡ßç‡¶ü‡¶ø ‡¶ü‡ßÅ‡•§‡¶¨‡¶æ‡ßü‡¶æ‡¶§‡ßç‡¶§‡¶∞|‡¶¨‡¶ø‡¶Ø‡¶º‡¶æ‡¶§‡ßç‡¶§‡¶∞": "72",
    r"‡¶§‡¶ø‡ßü‡¶æ‡¶§‡ßç‡¶§‡¶∞|‡¶§‡¶ø‡¶Ø‡¶º‡¶æ‡¶§‡ßç‡¶§‡¶∞|‡¶∏‡ßá‡¶≠‡ßá‡¶®‡ßç‡¶ü‡¶ø ‡¶•‡ßç‡¶∞‡¶ø": "73",
    r"‡¶ö‡ßÅ‡ßü‡¶æ‡¶§‡ßç‡¶§‡¶∞|‡¶ö‡ßÅ‡¶Ø‡¶º‡¶æ‡¶§‡ßç‡¶§‡¶∞|‡¶∏‡ßá‡¶≠‡ßá‡¶®‡ßç‡¶ü‡¶ø ‡¶´‡ßã‡¶∞": "74",
    r"‡¶™‡¶Å‡¶ö‡¶æ‡¶§‡ßç‡¶§‡¶∞|‡¶∏‡ßá‡¶≠‡ßá‡¶®‡ßç‡¶ü‡¶ø ‡¶´‡¶æ‡¶á‡¶≠": "75",
    r"‡¶õ‡¶ø‡ßü‡¶æ‡¶§‡ßç‡¶§‡¶∞|‡¶õ‡¶ø‡¶Ø‡¶º‡¶æ‡¶§‡ßç‡¶§‡¶∞|‡¶∏‡ßá‡¶≠‡ßá‡¶®‡ßç‡¶ü‡¶ø ‡¶∏‡¶ø‡¶ï‡ßç‡¶∏": "76",
    r"‡¶∏‡¶æ‡¶§‡¶æ‡¶§‡ßç‡¶§‡¶∞|‡¶∏‡ßá‡¶≠‡ßá‡¶®‡ßç‡¶ü‡¶ø ‡¶∏‡ßá‡¶≠‡ßá‡¶®": "77",
    r"‡¶Ü‡¶ü‡¶æ‡¶§‡ßç‡¶§‡¶∞|‡¶∏‡ßá‡¶≠‡ßá‡¶®‡ßç‡¶ü‡¶ø ‡¶è‡¶á‡¶ü": "78",
    r"‡¶ä‡¶®‡¶Ü‡¶∂‡¶ø|‡¶∏‡ßá‡¶≠‡ßá‡¶®‡ßç‡¶ü‡¶ø ‡¶®‡¶æ‡¶á‡¶®": "79",

    r"‡¶è‡¶ï‡¶æ‡¶∂‡¶ø|‡¶è‡¶á‡¶ü‡¶ø (‡¶ì‡¶Ø‡¶º‡¶æ‡¶®|‡¶ì‡ßü‡¶æ‡¶®)": "81",
    r"‡¶¨‡¶ø‡¶∞‡¶æ‡¶∂‡¶ø|‡¶è‡¶á‡¶ü‡¶ø ‡¶ü‡ßÅ": "82",
    r"‡¶§‡¶ø‡¶∞‡¶æ‡¶∂‡¶ø|‡¶è‡¶á‡¶ü‡¶ø ‡¶•‡ßç‡¶∞‡¶ø": "83",
    r"‡¶ö‡ßÅ‡¶∞‡¶æ‡¶∂‡¶ø|‡¶è‡¶á‡¶ü‡¶ø ‡¶´‡ßã‡¶∞": "84",
    r"‡¶™‡¶Å‡¶ö‡¶æ‡¶∂‡¶ø|‡¶è‡¶á‡¶ü‡¶ø ‡¶´‡¶æ‡¶á‡¶≠": "85",
    r"‡¶õ‡¶ø‡ßü‡¶æ‡¶∂‡¶ø|‡¶õ‡¶ø‡¶Ø‡¶º‡¶æ‡¶∂‡¶ø|‡¶è‡¶á‡¶ü‡¶ø ‡¶∏‡¶ø‡¶ï‡ßç‡¶∏": "86",
    r"‡¶∏‡¶æ‡¶§‡¶æ‡¶∂‡¶ø|‡¶è‡¶á‡¶ü‡¶ø ‡¶∏‡ßá‡¶≠‡ßá‡¶®": "87",
    r"‡¶Ü‡¶ü‡¶æ‡¶∂‡¶ø|‡¶è‡¶á‡¶ü‡¶ø ‡¶è‡¶á‡¶ü": "88",
    r"‡¶ä‡¶®‡¶®‡¶¨‡ßç‡¶¨‡¶á|‡¶è‡¶á‡¶ü‡¶ø ‡¶®‡¶æ‡¶á‡¶®": "89",

    r"‡¶è‡¶ï‡¶æ‡¶®‡¶¨‡ßç‡¶¨‡¶á|‡¶®‡¶æ‡¶á‡¶®‡¶ü‡¶ø (‡¶ì‡¶Ø‡¶º‡¶æ‡¶®|‡¶ì‡ßü‡¶æ‡¶®)": "91",
    r"‡¶¨‡¶ø‡¶∞‡¶æ‡¶®‡¶¨‡ßç‡¶¨‡¶á|‡¶®‡¶æ‡¶á‡¶®‡¶ü‡¶ø ‡¶ü‡ßÅ": "92",
    r"‡¶§‡¶ø‡¶∞‡¶æ‡¶®‡¶¨‡ßç‡¶¨‡¶á|‡¶®‡¶æ‡¶á‡¶®‡¶ü‡¶ø ‡¶•‡ßç‡¶∞‡¶ø": "93",
    r"‡¶ö‡ßÅ‡¶∞‡¶æ‡¶®‡¶¨‡ßç‡¶¨‡¶á|‡¶®‡¶æ‡¶á‡¶®‡¶ü‡¶ø ‡¶´‡ßã‡¶∞": "94",
    r"‡¶™‡¶Å‡¶ö‡¶æ‡¶®‡¶¨‡ßç‡¶¨‡¶á|‡¶®‡¶æ‡¶á‡¶®‡¶ü‡¶ø ‡¶´‡¶æ‡¶á‡¶≠": "95",
    r"‡¶õ‡¶ø‡ßü‡¶æ‡¶®‡¶¨‡ßç‡¶¨‡¶á|‡¶õ‡¶ø‡¶Ø‡¶º‡¶æ‡¶®‡¶¨‡ßç‡¶¨‡¶á|‡¶®‡¶æ‡¶á‡¶®‡¶ü‡¶ø ‡¶∏‡¶ø‡¶ï‡ßç‡¶∏": "96",
    r"‡¶∏‡¶æ‡¶§‡¶æ‡¶®‡¶¨‡ßç‡¶¨‡¶á|‡¶®‡¶æ‡¶á‡¶®‡¶ü‡¶ø ‡¶∏‡ßá‡¶≠‡ßá‡¶®": "97",
    r"‡¶Ü‡¶ü‡¶æ‡¶®‡¶¨‡ßç‡¶¨‡¶á|‡¶®‡¶æ‡¶á‡¶®‡¶ü‡¶ø ‡¶è‡¶á‡¶ü": "98",
    r"‡¶®‡¶ø‡¶∞‡¶æ‡¶®‡¶¨‡ßç‡¶¨‡¶á|‡¶®‡¶æ‡¶á‡¶®‡¶ü‡¶ø ‡¶®‡¶æ‡¶á‡¶®": "99",

             # Replace "‡¶¨‡¶ø‡¶∂" and "‡¶ü‡ßÅ‡¶á‡¶®‡ßç‡¶ü‡¶ø" with "20"
    r"‡¶§‡ßç‡¶∞‡¶ø‡¶∂|‡¶•‡¶æ‡¶∞‡ßç‡¶ü‡¶ø": "30",      # Replace "‡¶§‡ßç‡¶∞‡¶ø‡¶∂" and "‡¶•‡¶æ‡¶∞‡ßç‡¶ü‡¶ø" with "30"
     r"‡¶ö‡¶≤‡ßç‡¶≤‡¶ø‡¶∂|‡¶´‡¶∞‡¶ü‡¶ø": "40",      # Replace "‡¶ö‡¶≤‡ßç‡¶≤‡¶ø‡¶∂" and "‡¶´‡¶∞‡¶ü‡¶ø" with "40"
     r"‡¶™‡¶û‡ßç‡¶ö‡¶æ‡¶∂|‡¶´‡¶ø‡¶´‡¶ü‡¶ø": "50",  # Replace "‡¶™‡¶û‡ßç‡¶ö‡¶æ‡¶∂" and "‡¶´‡¶ø‡¶´‡¶ü‡¶ø ‡¶´‡ßã‡¶∞" with "50"
     r"‡¶∑‡¶æ‡¶ü|‡¶∏‡¶ø‡¶ï‡ßç‡¶∏‡¶ü‡¶ø": "60",        # Replace "‡¶∑‡¶æ‡¶ü" and "‡¶∏‡¶ø‡¶ï‡ßç‡¶∏‡¶ü‡¶ø" with "60"
    r"‡¶∏‡¶§‡ßç‡¶§‡¶∞|‡¶∏‡ßá‡¶≠‡ßá‡¶®‡ßç‡¶ü‡¶ø": "70",
     r"‡¶®‡¶¨‡ßç‡¶¨‡¶á|‡¶®‡¶æ‡¶á‡¶®‡¶ü‡¶ø": "90",

     r"‡¶ì‡¶Ø‡¶º‡¶æ‡¶®|‡¶ì‡ßü‡¶æ‡¶®": "1" ,      # Replace "‡¶ì‡ßü‡¶æ‡¶®", "‡¶è‡¶ï", and "‡¶•‡ßç‡¶∞‡¶ø" with "1"
     r" ‡¶ü‡ßÅ‡•§": " 2‡•§",
     r" ‡¶ü‡ßÅ\)": " 2)",
     r"‡¶¶‡ßÅ‡¶á": "2",      # Replace "‡¶ü‡ßÅ", "‡¶¶‡ßÅ‡¶á", "two" with "2"
     r" ‡¶ü‡ßÅ ": " 2 ",
     r" ‡¶ü‡ßÅ,": " 2,",
     r"\(‡¶ü‡ßÅ ": "(2 ",

    r" ‡¶§‡¶ø‡¶® | ‡¶•‡ßç‡¶∞‡¶ø ": " 3 ",            # Replace "‡¶§‡¶ø‡¶®" and "‡¶•‡ßç‡¶∞‡¶ø" with "3"
    r" ‡¶ö‡¶æ‡¶∞ | ‡¶´‡ßã‡¶∞ ": " 4 ",             # Replace "‡¶ö‡¶æ‡¶∞" and "‡¶´‡ßã‡¶∞" with "4"
    r" ‡¶™‡¶æ‡¶Å‡¶ö | ‡¶´‡¶æ‡¶á‡¶≠ ": " 5 ",           # Replace "‡¶™‡¶æ‡¶Å‡¶ö" and "‡¶´‡¶æ‡¶á‡¶≠" with "5"
    r" ‡¶õ‡ßü| ‡¶õ‡¶Ø‡¶º | ‡¶∏‡¶ø‡¶ï‡ßç‡¶∏ ": " 6 ",            # Replace "‡¶õ‡ßü" and "‡¶∏‡¶ø‡¶ï‡ßç‡¶∏" with "6"
    r" ‡¶∏‡¶æ‡¶§ | ‡¶∏‡ßá‡¶≠‡ßá‡¶® ": " 7 ",           # Replace "‡¶∏‡¶æ‡¶§" and "‡¶∏‡ßá‡¶≠‡ßá‡¶®" with "7"
    r" ‡¶Ü‡¶ü | ‡¶è‡¶á‡¶ü ": " 8 ",              # Replace "‡¶Ü‡¶ü" and "‡¶è‡¶á‡¶ü" with "8"
    r" ‡¶®‡¶æ‡¶á‡¶® ": " 9 ",             # Replace "‡¶®‡ßü" and "‡¶®‡¶æ‡¶á‡¶®" with "9"

    r" ‡¶§‡¶ø‡¶®,| ‡¶•‡ßç‡¶∞‡¶ø,": " 3,",
    r" ‡¶§‡¶ø‡¶®‡•§| ‡¶•‡ßç‡¶∞‡¶ø‡•§": " 3‡•§",
    r" ‡¶§‡¶ø‡¶®\)| ‡¶•‡ßç‡¶∞‡¶ø\)": " 3)",
    r"^‡¶§‡¶ø‡¶® |^‡¶•‡ßç‡¶∞‡¶ø ": "3 ",
    r"\(‡¶§‡¶ø‡¶® |\(‡¶•‡ßç‡¶∞‡¶ø ": "(3 ",

    # 4
    r" ‡¶ö‡¶æ‡¶∞,| ‡¶´‡ßã‡¶∞,": " 4,",
    r" ‡¶ö‡¶æ‡¶∞‡•§| ‡¶´‡ßã‡¶∞‡•§": " 4‡•§",
    r" ‡¶ö‡¶æ‡¶∞\)| ‡¶´‡ßã‡¶∞\)": " 4)",
    r"^‡¶ö‡¶æ‡¶∞ |^‡¶´‡ßã‡¶∞ ": "4 ",
    r"\(‡¶ö‡¶æ‡¶∞ |\(‡¶´‡ßã‡¶∞ ": "(4 ",

    # 5
    r" ‡¶™‡¶æ‡¶Å‡¶ö,| ‡¶´‡¶æ‡¶á‡¶≠,": " 5,",
    r" ‡¶™‡¶æ‡¶Å‡¶ö‡•§| ‡¶´‡¶æ‡¶á‡¶≠‡•§": " 5‡•§",
    r" ‡¶™‡¶æ‡¶Å‡¶ö\)| ‡¶´‡¶æ‡¶á‡¶≠\)": " 5)",
    r"^‡¶™‡¶æ‡¶Å‡¶ö |^‡¶´‡¶æ‡¶á‡¶≠ ": "5 ",
    r"\(‡¶™‡¶æ‡¶Å‡¶ö |\(‡¶´‡¶æ‡¶á‡¶≠ ": "(5 ",

    # 6
    r" (‡¶õ‡ßü|‡¶õ‡¶Ø‡¶º),| ‡¶∏‡¶ø‡¶ï‡ßç‡¶∏,": " 6,",
    r" (‡¶õ‡ßü|‡¶õ‡¶Ø‡¶º)‡•§| ‡¶∏‡¶ø‡¶ï‡ßç‡¶∏‡•§": " 6‡•§",
    r" (‡¶õ‡ßü|‡¶õ‡¶Ø‡¶º)\)| ‡¶∏‡¶ø‡¶ï‡ßç‡¶∏\)": " 6)",
    r"^(‡¶õ‡ßü|‡¶õ‡¶Ø‡¶º) |^‡¶∏‡¶ø‡¶ï‡ßç‡¶∏ ": "6 ",
    r"\((‡¶õ‡ßü|‡¶õ‡¶Ø‡¶º) |\(‡¶∏‡¶ø‡¶ï‡ßç‡¶∏ ": "(6 ",

    # 7
    r" ‡¶∏‡¶æ‡¶§,| ‡¶∏‡ßá‡¶≠‡ßá‡¶®,": " 7,",
    r" ‡¶∏‡¶æ‡¶§‡•§| ‡¶∏‡ßá‡¶≠‡ßá‡¶®‡•§": " 7‡•§",
    r" ‡¶∏‡¶æ‡¶§\)| ‡¶∏‡ßá‡¶≠‡ßá‡¶®\)": " 7)",
    r"^‡¶∏‡¶æ‡¶§ |^‡¶∏‡ßá‡¶≠‡ßá‡¶® ": "7 ",
    r"\(‡¶∏‡¶æ‡¶§ |\(‡¶∏‡ßá‡¶≠‡ßá‡¶® ": "(7 ",

    # 8
    r" ‡¶Ü‡¶ü,| ‡¶è‡¶á‡¶ü,": " 8,",
    r" ‡¶Ü‡¶ü‡•§| ‡¶è‡¶á‡¶ü‡•§": " 8‡•§",
    r" ‡¶Ü‡¶ü\)| ‡¶è‡¶á‡¶ü\)": " 8)",
    r"^‡¶Ü‡¶ü |^‡¶è‡¶á‡¶ü ": "8 ",
    r"\(‡¶Ü‡¶ü |\(‡¶è‡¶á‡¶ü ": "(8 ",

    # 9
    r" ‡¶®‡¶æ‡¶á‡¶®,": " 9,",
    r" ‡¶®‡¶æ‡¶á‡¶®‡•§": " 9‡•§",
    r" ‡¶®‡¶Ø‡¶º\)| ‡¶®‡ßü\)| ‡¶®‡¶æ‡¶á‡¶®\)": " 9)",
    r"^‡¶®‡¶Ø‡¶º |^‡¶®‡ßü |^‡¶®‡¶æ‡¶á‡¶® ": "9 ",
    r"\(‡¶®‡¶Ø‡¶º |\(‡¶®‡ßü |\(‡¶®‡¶æ‡¶á‡¶® ": "(9 ",

    # 10
   r" ‡¶¶‡¶∂,| ‡¶ü‡ßá‡¶®,": " 10,",
   r" ‡¶¶‡¶∂‡•§| ‡¶ü‡ßá‡¶®‡•§": " 10‡•§",
   r" ‡¶¶‡¶∂\)| ‡¶ü‡ßá‡¶®\)": " 10)",
   r"^‡¶¶‡¶∂ |^‡¶ü‡ßá‡¶® ": "10 ",
   r" ‡¶¶‡¶∂ | ‡¶ü‡ßá‡¶® ": " 10 ",
   r"\(‡¶¶‡¶∂ |\(‡¶ü‡ßá‡¶® ": "(10 ",

  # 20
    r"‡¶ü‡ßÅ‡¶Ø‡¶º‡ßá‡¶®‡ßç‡¶ü‡¶ø|‡¶ü‡ßÅ‡ßü‡ßá‡¶®‡ßç‡¶ü‡¶ø": "20",
  r" ‡¶¨‡¶ø‡¶∂,| ‡¶ü‡ßÅ‡¶á‡¶®‡ßç‡¶ü‡¶ø,": " 20,",
  r" ‡¶¨‡¶ø‡¶∂‡•§| ‡¶ü‡ßÅ‡¶á‡¶®‡ßç‡¶ü‡¶ø‡•§": " 20‡•§",
  r" ‡¶¨‡¶ø‡¶∂\)| ‡¶ü‡ßÅ‡¶á‡¶®‡ßç‡¶ü‡¶ø\)": " 20)",
  r"^‡¶¨‡¶ø‡¶∂ |^‡¶ü‡ßÅ‡¶á‡¶®‡ßç‡¶ü‡¶ø ": "20 ",
  r" ‡¶¨‡¶ø‡¶∂ | ‡¶ü‡ßÅ‡¶á‡¶®‡ßç‡¶ü‡¶ø ": " 20 ",
  r"\(‡¶¨‡¶ø‡¶∂ |\(‡¶ü‡ßÅ‡¶á‡¶®‡ßç‡¶ü‡¶ø ": "(20 ",

    # 80
    r" ‡¶Ü‡¶∂‡¶ø,| ‡¶è‡¶á‡¶ü‡¶ø,": " 80,",
    r" ‡¶Ü‡¶∂‡¶ø‡•§| ‡¶è‡¶á‡¶ü‡¶ø‡•§": " 80‡•§",
    r" ‡¶Ü‡¶∂‡¶ø\)| ‡¶è‡¶á‡¶ü‡¶ø\)": " 80)",
    r"^‡¶Ü‡¶∂‡¶ø |^‡¶è‡¶á‡¶ü‡¶ø ": "80 ",
    r" ‡¶Ü‡¶∂‡¶ø | ‡¶è‡¶á‡¶ü‡¶ø ": " 80 ",
    r"\(‡¶Ü‡¶∂‡¶ø |\(‡¶è‡¶á‡¶ü‡¶ø ": "(80 ",

    r"‡¶ú‡¶ø‡¶∞‡ßã": "0",




    r"‡ß¶": "0",
    r"‡ßß": "1",
    r"‡ß®": "2",
    r"‡ß©": "3",
    r"‡ß™": "4",
    r"‡ß´": "5",
    r"‡ß¨": "6",
    r"‡ß≠": "7",
    r"‡ßÆ": "8",
    r"‡ßØ": "9",

    r" ‡¶è ": " a ",                 # Replace "‡¶è" with "a"
    r" ‡¶¨‡¶ø ": " b ",                # Replace "‡¶¨‡¶ø" with "b"
    r" ‡¶∏‡¶ø ": " c ",                # Replace "‡¶∏‡¶ø" with "c"
    r" ‡¶°‡¶ø ": " d ",                # Replace "‡¶°‡¶ø" with "d"
    r" ‡¶á ": " e ",                 # Replace "‡¶á" with "e"
    r" ‡¶è‡¶´ ": " f ",                # Replace "‡¶è‡¶´" with "f"
    r" ‡¶ú‡¶ø ": " g ",                # Replace "‡¶ú‡¶ø" with "g"
    r" ‡¶è‡¶á‡¶ö ": " h ",               # Replace "‡¶è‡¶á‡¶ö" with "h"
    r" ‡¶Ü‡¶á ": " i ",                # Replace "‡¶Ü‡¶á" with "i"
    r" ‡¶ú‡ßá ": " j ",                # Replace "‡¶ú‡ßá" with "j"
    r" ‡¶ï‡ßá ": " k ",                # Replace "‡¶ï‡ßá" with "k"
    r" ‡¶è‡¶≤ ": " l ",                # Replace "‡¶è‡¶≤" with "l"
    r" ‡¶è‡¶Æ ": " m ",                # Replace "‡¶è‡¶Æ" with "m"
    r" ‡¶è‡¶® ": " n ",                # Replace "‡¶è‡¶®" with "n"
   # r"‡¶ì": "o",                 # Replace "‡¶ì" with "o"
    r" ‡¶™‡¶ø ": " p ",                # Replace "‡¶™‡¶ø" with "p"
    r" ‡¶ï‡¶ø‡¶â ": " q ",               # Replace "‡¶ï‡¶ø‡¶â" with "q"
    #r"‡¶Ü‡¶∞": "r",                # Replace "‡¶Ü‡¶∞" with "r"
    r" ‡¶è‡¶∏ ": " s ",                # Replace "‡¶è‡¶∏" with "s"
    r" ‡¶ü‡¶ø ": " t ",                # Replace "‡¶ü‡¶ø" with "t"
    r" ‡¶á‡¶â ": " u ",                # Replace "‡¶á‡¶â" with "u"
    r" ‡¶≠‡¶ø ": " v ",                # Replace "‡¶≠‡¶ø" with "v"
    r" (‡¶°‡¶¨‡ßç‡¶≤‡¶ø‡¶â‡•§‡¶°‡¶æ‡¶¨‡¶≤‡¶ø‡¶â) ": " w ",            # Replace "‡¶°‡¶¨‡ßç‡¶≤‡¶ø‡¶â" with "w"
    r" ‡¶è‡¶ï‡ßç‡¶∏ ": " x ",              # Replace "‡¶è‡¶ï‡ßç‡¶∏" with "x"
    r" ‡¶ì‡¶Ø‡¶º‡¶æ‡¶á | ‡¶ì‡ßü‡¶æ‡¶á ": "y",             # Replace "‡¶ì‡ßü‡¶æ‡¶á" with "y"
    r" ‡¶ú‡ßá‡¶° ": " z ",               # Replace "‡¶ú‡ßá‡¶°" with "z"


    #r"^‡¶è ": "a ",                 # Replace "‡¶è" with "a"
    r"^‡¶¨‡¶ø ": "b ",                # Replace "‡¶¨‡¶ø" with "b"
    r"^‡¶∏‡¶ø ": "c ",                # Replace "‡¶∏‡¶ø" with "c"
    r"^‡¶°‡¶ø ": "d ",                # Replace "‡¶°‡¶ø" with "d"
    r"^‡¶á ": "e ",                 # Replace "‡¶á" with "e"
    r"^‡¶è‡¶´ ": "f ",                # Replace "‡¶è‡¶´" with "f"
    r"^‡¶ú‡¶ø ": "g ",                # Replace "‡¶ú‡¶ø" with "g"
    r"^‡¶è‡¶á‡¶ö ": "h ",               # Replace "‡¶è‡¶á‡¶ö" with "h"
    r"^‡¶Ü‡¶á ": "i ",                # Replace "‡¶Ü‡¶á" with "i"
    r"^‡¶ú‡ßá ": "j ",                # Replace "‡¶ú‡ßá" with "j"
    r"^‡¶ï‡ßá ": "k ",                # Replace "‡¶ï‡ßá" with "k"
    r"^‡¶è‡¶≤ ": "l ",                # Replace "‡¶è‡¶≤" with "l"
    r"^‡¶è‡¶Æ ": "m ",                # Replace "‡¶è‡¶Æ" with "m"
    r"^‡¶è‡¶® ": "n ",                # Replace "‡¶è‡¶®" with "n"
   # r"‡¶ì": "o",                 # Replace "‡¶ì" with "o"
    r"^‡¶™‡¶ø ": "p ",                # Replace "‡¶™‡¶ø" with "p"
    r"^‡¶ï‡¶ø‡¶â ": "q ",               # Replace "‡¶ï‡¶ø‡¶â" with "q"
    #r"‡¶Ü‡¶∞": "r",                # Replace "‡¶Ü‡¶∞" with "r"
    r"^‡¶è‡¶∏ ": "s ",                # Replace "‡¶è‡¶∏" with "s"
    r"^‡¶ü‡¶ø ": "t ",                # Replace "‡¶ü‡¶ø" with "t"
    r"^‡¶á‡¶â ": "u ",                # Replace "‡¶á‡¶â" with "u"
    r"^‡¶≠‡¶ø ": "v ",                # Replace "‡¶≠‡¶ø" with "v"
    r"^(‡¶°‡¶¨‡ßç‡¶≤‡¶ø‡¶â‡•§‡¶°‡¶æ‡¶¨‡¶≤‡¶ø‡¶â) ": "w ",            # Replace "‡¶°‡¶¨‡ßç‡¶≤‡¶ø‡¶â" with "w"
    r"^‡¶è‡¶ï‡ßç‡¶∏ ": "x ",              # Replace "‡¶è‡¶ï‡ßç‡¶∏" with "x"
    r"^‡¶ì‡¶Ø‡¶º‡¶æ‡¶á |^‡¶ì‡ßü‡¶æ‡¶á ": "y ",              # Replace "‡¶ì‡ßü‡¶æ‡¶á" with "y"
    r"^‡¶ú‡ßá‡¶° ": "z ",               # Replace "‡¶ú‡ßá‡¶°" with "z"

    r" ‡¶è,": " a,",                 # Replace "‡¶è" with "a"
    r" ‡¶¨‡¶ø,": " b,",                # Replace "‡¶¨‡¶ø" with "b"
    r" ‡¶∏‡¶ø,": " c,",                # Replace "‡¶∏‡¶ø" with "c"
    r" ‡¶°‡¶ø,": " d,",                # Replace "‡¶°‡¶ø" with "d"
    r" ‡¶á,": " e,",                 # Replace "‡¶á" with "e"
    r" ‡¶è‡¶´,": " f,",                # Replace "‡¶´‡¶ø" with "f"
    r" ‡¶ú‡¶ø,": " g,",                # Replace "‡¶ó‡¶ø" with "g"
    r" ‡¶è‡¶á‡¶ö,": " h,",               # Replace "‡¶è‡¶á‡¶ö" with "h"
    r" ‡¶Ü‡¶á,": " i,",                # Replace "‡¶Ü‡¶á" with "i"
    r" ‡¶ú‡ßá,": " j,",                # Replace "‡¶ú‡ßá" with "j"
    r" ‡¶ï‡ßá,": " k,",                # Replace "‡¶ï‡ßá" with "k"
    r" ‡¶è‡¶≤,": " l,",                # Replace "‡¶è‡¶≤" with "l"
    r" ‡¶è‡¶Æ,": " m,",                # Replace "‡¶è‡¶Æ" with "m"
    r" ‡¶è‡¶®,": " n,",                # Replace "‡¶è‡¶®" with "n"
    #r" ‡¶ì,": " o,",                 # Replace "‡¶ì" with "o"
    r" ‡¶™‡¶ø,": " p,",                # Replace "‡¶™‡¶ø" with "p"
    r" ‡¶ï‡¶ø‡¶â,": " q,",               # Replace "‡¶ï‡¶ø‡¶â" with "q"
    #r" ‡¶Ü‡¶∞,": " r,",                # Replace "‡¶Ü‡¶∞" with "r"
    r" ‡¶è‡¶∏,": " s,",                # Replace "‡¶è‡¶∏" with "s"
    r" ‡¶ü‡¶ø,": " t,",                # Replace "‡¶ü‡¶ø" with "t"
    r" ‡¶á‡¶â,": " u,",                # Replace "‡¶á‡¶â" with "u"
    r" ‡¶≠‡¶ø,": " v,",                # Replace "‡¶≠‡¶ø" with "v"
    r" (‡¶°‡¶¨‡ßç‡¶≤‡¶ø‡¶â‡•§‡¶°‡¶æ‡¶¨‡¶≤‡¶ø‡¶â),": " w,",            # Replace "‡¶°‡¶¨‡ßç‡¶≤‡¶ø‡¶â" with "w"
    r" ‡¶è‡¶ï‡ßç‡¶∏,": " x,",              # Replace "‡¶è‡¶ï‡ßç‡¶∏" with "x"
    r" ‡¶ì‡¶Ø‡¶º‡¶æ‡¶á,| ‡¶ì‡ßü‡¶æ‡¶á,": " y,",              # Replace "‡¶ì‡ßü‡¶æ‡¶á" with "y"
    r" ‡¶ú‡ßá‡¶°,": " z,",               # Replace "‡¶ú‡ßá‡¶°" with "z"

    r" ‡¶è‡•§": " a‡•§",                 # Replace "‡¶è" with "a"
    r" ‡¶¨‡¶ø‡•§": " b‡•§",                # Replace "‡¶¨‡¶ø" with "b"
    r" ‡¶∏‡¶ø‡•§": " c‡•§",                # Replace "‡¶∏‡¶ø" with "c"
    r" ‡¶°‡¶ø‡•§": " d‡•§",
    r" ‡¶á‡•§": " e‡•§",
    r" ‡¶è‡¶´‡•§": " f‡•§",
    r" ‡¶ú‡¶ø‡•§": " g‡•§",
    r" ‡¶è‡¶á‡¶ö‡•§": " h‡•§",
    r" ‡¶Ü‡¶á‡•§": " i‡•§",
    r" ‡¶ú‡ßá‡•§": " j‡•§",
    r" ‡¶ï‡ßá‡•§": " k‡•§",
    r" ‡¶è‡¶≤‡•§": " l‡•§",
    r" ‡¶è‡¶Æ‡•§": " m‡•§",
    r" ‡¶è‡¶®‡•§": " n‡•§",
    r" ‡¶ì‡•§": " o‡•§",
    r" ‡¶™‡¶ø‡•§": " p‡•§",
    r" ‡¶ï‡¶ø‡¶â‡•§": " q‡•§",
    r" ‡¶Ü‡¶∞‡•§": " r‡•§",
    r" ‡¶è‡¶∏‡•§": " s‡•§",
    r" ‡¶ü‡¶ø‡•§": " t‡•§",
    r" ‡¶á‡¶â‡•§": " u‡•§",
    r" ‡¶≠‡¶ø‡•§": " v‡•§",
    r" (‡¶°‡¶¨‡ßç‡¶≤‡¶ø‡¶â‡•§‡¶°‡¶æ‡¶¨‡¶≤‡¶ø‡¶â)‡•§": " w‡•§",
    r" ‡¶è‡¶ï‡ßç‡¶∏‡•§": " x‡•§",
    r" ‡¶ì‡¶Ø‡¶º‡¶æ‡¶á‡•§| ‡¶ì‡ßü‡¶æ‡¶á‡•§": " y‡•§",
    r" ‡¶ú‡ßá‡¶°‡•§": " z‡•§",

    r" ‡¶è\)": " a)",                 # Replace "‡¶è" with "a"
    r" ‡¶¨‡¶ø\)": " b)",                # Replace "‡¶¨‡¶ø" with "b"
    r" ‡¶∏‡¶ø\)": " c)",                # Replace "‡¶∏‡¶ø" with "c"
    r" ‡¶°‡¶ø\)": " d)",
    r" ‡¶á\)": " e)",
    r" ‡¶è‡¶´\)": " f)",
    r" ‡¶ú‡¶ø\)": " g)",
    r" ‡¶è‡¶á‡¶ö\)": " h)",
    r" ‡¶Ü‡¶á\)": " i)",
    r" ‡¶ú‡ßá\)": " j)",
    r" ‡¶ï‡ßá\)": " k)",
    r" ‡¶è‡¶≤\)": " l)",
    r" ‡¶è‡¶Æ\)": " m)",
    r" ‡¶è‡¶®\)": " n)",
    r" ‡¶ì\)": " o)",
    r" ‡¶™‡¶ø\)": " p)",
    r" ‡¶ï‡¶ø‡¶â\)": " q)",
    r" ‡¶Ü‡¶∞\)": " r)",
    r" ‡¶è‡¶∏\)": " s)",
    r" ‡¶ü‡¶ø\)": " t)",
    r" ‡¶á‡¶â\)": " u)",
    r" ‡¶≠‡¶ø\)": " v)",
    r" (‡¶°‡¶¨‡ßç‡¶≤‡¶ø‡¶â‡•§‡¶°‡¶æ‡¶¨‡¶≤‡¶ø‡¶â)\)": " w)",
    r" ‡¶è‡¶ï‡ßç‡¶∏\)": " x)",
    r" ‡¶ì‡¶Ø‡¶º‡¶æ‡¶á\)| ‡¶ì‡ßü‡¶æ‡¶á\)": " y)",
    r" ‡¶ú‡ßá‡¶°\)": " z)",


    r"\(‡¶è ": "\(a ",
    r"\(‡¶¨‡¶ø ": "\(b ",
r"\(‡¶∏‡¶ø ": "\(c ",
r"\(‡¶°‡¶ø ": "\(d ",
r"\(‡¶á ": "\(e ",
r"\(‡¶è‡¶´ ": "\(f ",
r"\(‡¶ú‡¶ø ": "\(g ",
r"\(‡¶è‡¶á‡¶ö ": "\(h ",
r"\(‡¶Ü‡¶á ": "\(i ",
r"\(‡¶ú‡ßá ": "\(j ",
r"\(‡¶ï‡ßá ": "\(k ",
r"\(‡¶è‡¶≤ ": "\(l ",
r"\(‡¶è‡¶Æ ": "\(m ",
r"\(‡¶è‡¶® ": "\(n ",
r"\(‡¶ì ": "\(o ",
r"\(‡¶™‡¶ø ": "\(p ",
r"\(‡¶ï‡¶ø‡¶â ": "\(q ",
r"\(‡¶Ü‡¶∞ ": "\(r ",
r"\(‡¶è‡¶∏ ": "\(s ",
r"\(‡¶ü‡¶ø ": "\(t ",
r"\(‡¶á‡¶â ": "\(u ",
r"\(‡¶≠‡¶ø ": "\(v ",
r"\((‡¶°‡¶¨‡ßç‡¶≤‡¶ø‡¶â‡•§‡¶°‡¶æ‡¶¨‡¶≤‡¶ø‡¶â) ": "\(w ",
r"\(‡¶è‡¶ï‡ßç‡¶∏ ": "\(x ",
r"\(‡¶ì‡¶Ø‡¶º‡¶æ‡¶á |\(‡¶ì‡ßü‡¶æ‡¶á ": "\(y ",
r"\(‡¶ú‡ßá‡¶° ": "\(z ",

    r"‡¶°‡¶ø‡¶è‡¶∏‡¶°‡¶ø": "dsd",
    r"‡¶¨‡¶ø‡¶∏‡¶ø‡¶°‡¶ø": "bcd",
    r"‡¶è‡¶∏‡¶ø‡¶°‡¶ø": "acd",
    r"‡¶è‡¶¨‡¶ø‡¶∏‡¶ø": "abc",
    r"‡¶è‡¶∏‡¶ü‡¶ø‡¶™‡¶ø": "STP",
    r"‡¶è‡¶á‡¶ö‡¶è‡¶∏‡¶∏‡¶ø": "HSC",
    r"‡¶è‡¶∏‡¶è‡¶∏‡¶∏‡¶ø": "SSC",
    r"‡¶™‡¶ø‡¶™‡¶ø‡¶è‡¶Æ": "ppm",
    r"‡¶™‡¶ø‡¶™‡¶ø‡¶¨‡¶ø": "ppb",
    r"‡¶è‡¶∏‡¶™‡¶ø": "sp",
    r"‡¶è‡¶¨‡¶ø": "ab",
    r"‡¶è‡¶°‡¶ø": "ad",
    r"‡¶è‡¶∏‡¶∏‡¶ø": "sc",
    r"‡¶è‡¶∏‡¶è": "sa",
    r"‡¶è‡¶∏‡¶ï‡¶ø‡¶â": "sq",
    r"‡¶è‡¶∏‡¶Ü‡¶∞": "sr",
    r"‡¶è‡¶∏‡¶ü‡¶ø": "st",
    r"‡¶è‡¶∏‡¶è‡¶Æ": "sm",
    r"‡¶è‡¶∏‡¶è‡¶®": "sn",
    r"‡¶è‡¶∏‡¶ì": "so",
    r"‡¶è‡¶∏‡¶°‡¶ø": "sd",
    r"‡¶è‡¶∏‡¶á": "se",
    r"‡¶è‡¶∏‡¶ø ": "ac ",
    r"‡¶¨‡¶ø‡¶∏‡¶ø": "bc",
    r"‡¶™‡¶ø‡¶ï‡¶ø‡¶â": "pq",
    r"‡¶™‡¶ø‡¶Ü‡¶∞": "pr",
    r"‡¶™‡¶ø‡¶è‡¶∏": "ps",
    r"‡¶™‡¶ø‡¶è‡¶Æ": "pm",
    r"‡¶™‡¶ø‡¶è‡¶á‡¶ö": "pH",
    r"‡¶™‡¶ø‡¶è": "pa",

    r"‡¶™‡¶ø‡¶¨‡¶ø": "pb",
    r"‡¶™‡¶ø‡¶ú‡ßá‡¶°": "pz",
    r"‡¶è‡¶ï‡ßç‡¶∏‡¶ì‡¶Ø‡¶º‡¶æ‡¶á|‡¶è‡¶ï‡ßç‡¶∏‡¶ì‡ßü‡¶æ‡¶á": "xy",
    r"‡¶è‡¶ï‡ßç‡¶∏‡¶ì": "xo",
    r"‡¶è‡¶ï‡ßç‡¶∏‡¶ú‡ßá‡¶°": "xz",
    r"‡¶ì‡¶Ø‡¶º‡¶æ‡¶á‡¶ú‡ßá‡¶°|‡¶ì‡ßü‡¶æ‡¶á‡¶ú‡ßá‡¶°": "yz",
    r"‡¶ì‡¶è": "oa",
    r"‡¶ì‡¶¨‡¶ø": "ob",
    r"‡¶ì‡¶∏‡¶ø": "oc",
    r"‡¶ì‡¶°‡¶ø": "od",
    r"‡¶ì‡¶™‡¶ø": "op",
    r"‡¶ì‡¶ï‡¶ø‡¶â": "oq",
    r"‡¶ì‡¶Ü‡¶∞": "or",
    r"‡¶ì‡¶è‡¶∏": "os",
    r"‡¶á‡¶â‡¶≠‡¶ø": "uv",
    r"‡¶Ü‡¶á‡¶Ü‡¶∞": "IR",
    r"‡¶á‡¶è": "ea",
    r"‡¶á‡¶¨‡¶ø": "eb",
    r"‡¶ï‡ßá‡¶™‡¶ø": "kp",
    r"‡¶ï‡ßá‡¶∏‡¶ø": "kc",
    r"‡¶è‡¶Æ‡¶∏‡¶ø‡¶ï‡¶ø‡¶â": "MCQ",
    #r"‡¶∏‡¶ø‡¶ï‡¶ø‡¶â": "CQ",
    r"‡¶°‡¶ø‡¶è‡¶®": "dn",
    r"‡¶°‡¶ø‡¶è‡¶∏": "ds",
    r"‡¶°‡¶ø‡¶ü‡¶ø": "dt",
    r"‡¶°‡¶ø‡¶è‡¶ï‡ßç‡¶∏": "dx",
    r"‡¶°‡¶ø‡¶ì‡¶Ø‡¶º‡¶æ‡¶á|‡¶°‡¶ø‡¶ì‡ßü‡¶æ‡¶á": "dy",
    r"‡¶°‡¶ø‡¶ú‡ßá‡¶°": "dz",
    #r"‡¶∏‡¶ø‡¶™‡¶ø": "Cp",
    #r"‡¶∏‡¶ø‡¶≠‡¶ø": "Cv",
    r"‡¶∏‡¶ø‡¶°‡¶ø": "cd",
    r"‡¶∏‡¶ø‡¶∏‡¶ø": "cc",
    r"‡¶°‡¶ø‡¶°‡¶ø": "dd",




    r"‡¶ï‡¶∏‡ßá‡¶ï|‡¶ï‡ßã‡¶∏‡ßá‡¶ï": "cosec",  # Replace "‡¶ï‡¶∏‡ßá‡¶ï" or "‡¶ï‡ßã‡¶∏‡ßá‡¶ï" with "cosec"
    r" ‡¶≤‡¶ó ": " log ",              # Replace "‡¶≤‡¶ó" with "log"
    r"‡¶∏‡¶æ‡¶á‡¶® ": "sin ",            # Replace "‡¶∏‡¶æ‡¶á‡¶®" with "sin"
    r" ‡¶ï‡¶∏ ‡•§ ‡¶ï‡¶ú ": " cos ",          # Replace "‡¶ï‡¶ú" or "‡¶ï‡¶∏" with "cos"
    r"‡¶∏‡ßá‡¶ï ": "sec ",            # Replace "‡ßç‡¶∏‡ßá‡¶ï" with "sec"

    r"‡¶ï‡¶ü ": "cot ",              # Replace "‡¶ï‡¶ü" with "cot"
    r"‡¶á‡¶®‡¶ü‡ßÅ ": "X ",             # Replace "‡¶á‡¶®‡¶ü‡ßÅ" with "X"


    r"‡¶Æ‡¶ø‡¶â(?=\W|$)": "Œº",               # "‡¶Æ‡¶ø‡¶â" ‚Üí "Œº"
    r"‡¶°‡ßá‡¶≤‡¶ü‡¶æ(?=\W|$)": "·∫ü",             # "‡¶°‡ßá‡¶≤‡¶ü‡¶æ" ‚Üí "·∫ü"
    r"‡¶≤‡ßç‡¶Ø‡¶æ‡¶Æ‡¶°‡¶æ(?=\W|$)": "Œª",           # "‡¶≤‡ßç‡¶Ø‡¶æ‡¶Æ‡¶°‡¶æ" ‚Üí "Œª"
    r"‡¶•‡¶ø‡¶ü‡¶æ|‡¶•‡ßá‡¶ü‡¶æ(?=\W|$)": "Œ∏",         # "‡¶•‡¶ø‡¶ü‡¶æ" or "‡¶•‡ßá‡¶ü‡¶æ" ‚Üí "Œ∏"

    #r"‡¶™‡¶æ‡¶á(?=\W|$)": "œÄ",
    r"‡¶∏‡¶ø‡¶ó‡¶Æ‡¶æ(?=\W|$)": "œÉ",      # Sigma
   # r" ‡¶∞‡ßã(?=\W|$)": " œÅ",         # Rho
    r"‡¶ì‡¶Æ‡ßá‡¶ó‡¶æ(?=\W|$)": "œâ",      # Omega
    r" ‡¶´‡¶æ‡¶á(?=\W|$)": " œÜ",        # Phi
    r"‡¶Ü‡¶≤‡¶´‡¶æ(?=\W|$)": "Œ±",        # ‡¶Ü‡¶≤‡¶´‡¶æ ‚Üí Œ±
    r"‡¶¨‡¶ø‡¶ü‡¶æ|‡¶¨‡ßá‡¶ü‡¶æ(?=\W|$)": "Œ≤",    # ‡¶¨‡¶ø‡¶ü‡¶æ/‡¶¨‡ßá‡¶ü‡¶æ ‚Üí Œ≤
    r"‡¶ó‡¶æ‡¶Æ‡¶æ(?=\W|$)": "Œ≥",         # ‡¶ó‡¶æ‡¶Æ‡¶æ ‚Üí Œ≥


    r"‡¶®‡ßç‡¶Ø‡¶æ‡¶®‡ßã‡¶Æ‡¶ø‡¶ü‡¶æ‡¶∞(?=\W|$)": "nm",
    r"‡¶Æ‡¶ø‡¶≤‡¶ø‡¶Æ‡¶ø‡¶ü‡¶æ‡¶∞(?=\W|$)": "mm",
    r"‡¶∏‡ßá‡¶®‡ßç‡¶ü‡¶ø‡¶Æ‡¶ø‡¶ü‡¶æ‡¶∞(?=\W|$)": "cm",
    r" ‡¶Æ‡¶ø‡¶ü‡¶æ‡¶∞(?=\W|$)": " m",
    r"‡¶ï‡¶ø‡¶≤‡ßã‡¶Æ‡¶ø‡¶ü‡¶æ‡¶∞(?=\W|$)": "km",

    r"‡¶∏‡ßá‡¶ï‡ßá‡¶®‡ßç‡¶°(?=\W|$)": "s",
    r"‡¶Æ‡¶ø‡¶≤‡¶ø‡¶∏‡ßá‡¶ï‡ßá‡¶®‡ßç‡¶°(?=\W|$)": "ms",
    r"‡¶®‡ßç‡¶Ø‡¶æ‡¶®‡ßã‡¶∏‡ßá‡¶ï‡ßá‡¶®‡ßç‡¶°(?=\W|$)": "ns",
    r"‡¶Æ‡¶æ‡¶á‡¶ï‡ßç‡¶∞‡ßã‡¶∏‡ßá‡¶ï‡ßá‡¶®‡ßç‡¶°(?=\W|$)": "¬µs",

    r"‡¶ï‡ßá‡¶ú‡¶ø(?=\W|$)": "kg",             # "‡¶ï‡ßá‡¶ú‡¶ø" ‚Üí "kg"
    r" ‡¶ó‡ßç‡¶∞‡¶æ‡¶Æ(?=\W|$)": " g",
    r"‡¶Æ‡¶ø‡¶≤‡¶ø‡¶ó‡ßç‡¶∞‡¶æ‡¶Æ(?=\W|$)": "mg",

     r"‡¶Æ‡¶æ‡¶á‡¶®‡¶æ‡¶∏(?=\W|$)": "-",
     r"‡¶™‡ßç‡¶≤‡¶æ‡¶∏(?=\W|$)": "+",
     r"‡¶ï‡¶Æ‡¶æ": ",",

    r"‡¶ï‡¶æ‡¶∞‡ßç‡¶¨‡¶® ‡¶°‡¶æ‡¶á ‡¶Ö‡¶ï‡ßç‡¶∏‡¶æ‡¶á‡¶°(?=\W|$)| ‡¶ï‡¶æ‡¶∞‡ßç‡¶¨‡¶® ‡¶°‡¶æ‡¶á‡¶Ö‡¶ï‡ßç‡¶∏‡¶æ‡¶á‡¶°(?=\W|$)": "CO‚ÇÇ",
    r"‡¶Ö‡¶ï‡ßç‡¶∏‡¶ø‡¶ú‡ßá‡¶® ‡¶ó‡ßç‡¶Ø‡¶æ‡¶∏(?=\W|$)": "O‚ÇÇ",
    r"‡¶π‡¶æ‡¶á‡¶°‡ßç‡¶∞‡ßã‡¶ú‡ßá‡¶® ‡¶ó‡ßç‡¶Ø‡¶æ‡¶∏(?=\W|$)": "H‚ÇÇ",
    r"‡¶®‡¶æ‡¶á‡¶ü‡ßç‡¶∞‡ßã‡¶ú‡ßá‡¶® ‡¶ó‡ßç‡¶Ø‡¶æ‡¶∏(?=\W|$)": "N‚ÇÇ",
    r"‡¶ì‡¶ú‡ßã‡¶®(?=\W|$)": "O‚ÇÉ",
    r"‡¶Ö‡ßç‡¶Ø‡¶æ‡¶Æ‡ßã‡¶®‡¶ø‡¶Ø‡¶º‡¶æ (?=\W|$)|‡¶Ö‡ßç‡¶Ø‡¶æ‡¶Æ‡ßã‡¶®‡¶ø‡ßü‡¶æ (?=\W|$)": "NH‚ÇÉ ",
    r"‡¶Æ‡¶ø‡¶•‡ßá‡¶®(?=\W|$)": "CH‚ÇÑ",
    r"‡¶á‡¶•‡ßá‡¶®(?=\W|$)": "C‚ÇÇH‚ÇÜ",
    r"‡¶á‡¶•‡¶ø‡¶®(?=\W|$)|(‡¶Ö‡ßç‡¶Ø‡¶æ‡¶∏‡¶ø‡¶ü‡¶ø‡¶≤‡¶ø‡¶®|‡¶Ö‡ßç‡¶Ø‡¶æ‡¶∏‡¶ø‡¶ü‡¶ø‡¶≤‡¶ø‡¶®)(?=\W|$)": "C‚ÇÇH‚ÇÇ",
    r"‡¶á‡¶•‡¶æ‡¶®‡¶≤(?=\W|$)": "C‚ÇÇH‚ÇÖOH",
    r"‡¶ó‡ßç‡¶≤‡ßÅ‡¶ï‡ßã‡¶ú(?=\W|$)": "C‚ÇÜH‚ÇÅ‚ÇÇO‚ÇÜ",
    r"‡¶∏‡¶æ‡¶≤‡¶´‡¶æ‡¶∞ ‡¶°‡¶æ‡¶á ‡¶Ö‡¶ï‡ßç‡¶∏‡¶æ‡¶á‡¶°(?=\W|$)": "SO‚ÇÇ",
    r"‡¶∏‡¶æ‡¶≤‡¶´‡¶æ‡¶∞ ‡¶ü‡ßç‡¶∞‡¶æ‡¶á ‡¶Ö‡¶ï‡ßç‡¶∏‡¶æ‡¶á‡¶°(?=\W|$)": "SO‚ÇÉ",
    r"‡¶π‡¶æ‡¶á‡¶°‡ßç‡¶∞‡ßã‡¶ï‡ßç‡¶≤‡ßã‡¶∞‡¶ø‡¶ï (‡¶Ö‡ßç‡¶Ø‡¶æ‡¶∏‡¶ø‡¶°|‡¶Ö‡ßç‡¶Ø‡¶æ‡¶∏‡¶ø‡¶°)(?=\W|$)|‡¶π‡¶æ‡¶á‡¶°‡ßç‡¶∞‡ßã‡¶ï‡ßç‡¶≤‡ßã‡¶∞‡¶ø‡¶ï ‡¶è‡¶∏‡¶ø‡¶°(?=\W|$)|‡¶è‡¶á‡¶ö‡¶∏‡¶ø‡¶è‡¶≤(?=\W|$)": "HCl",
    r"‡¶®‡¶æ‡¶á‡¶ü‡ßç‡¶∞‡¶ø‡¶ï (‡¶Ö‡ßç‡¶Ø‡¶æ‡¶∏‡¶ø‡¶°|‡¶Ö‡ßç‡¶Ø‡¶æ‡¶∏‡¶ø‡¶°)(?=\W|$)|‡¶®‡¶æ‡¶á‡¶ü‡ßç‡¶∞‡¶ø‡¶ï ‡¶è‡¶∏‡¶ø‡¶°(?=\W|$)": "HNO‚ÇÉ",
    r"‡¶∏‡¶æ‡¶≤‡¶´‡¶ø‡¶â‡¶∞‡¶ø‡¶ï (‡¶Ö‡ßç‡¶Ø‡¶æ‡¶∏‡¶ø‡¶°|‡¶Ö‡ßç‡¶Ø‡¶æ‡¶∏‡¶ø‡¶°)(?=\W|$)|‡¶∏‡¶æ‡¶≤‡¶´‡¶ø‡¶â‡¶∞‡¶ø‡¶ï ‡¶è‡¶∏‡¶ø‡¶°(?=\W|$)": "H‚ÇÇSO‚ÇÑ",
    r"(‡¶ï‡ßç‡¶Ø‡¶æ‡¶≤‡¶∏‡¶ø‡¶Ø‡¶º‡¶æ‡¶Æ|‡¶ï‡ßç‡¶Ø‡¶æ‡¶≤‡¶∏‡¶ø‡ßü‡¶æ‡¶Æ) ‡¶ï‡¶æ‡¶∞‡ßç‡¶¨‡¶®‡ßá‡¶ü(?=\W|$)": "CaCO‚ÇÉ",
    r"(‡¶∏‡ßã‡¶°‡¶ø‡¶Ø‡¶º‡¶æ‡¶Æ|‡¶∏‡ßã‡¶°‡¶ø‡ßü‡¶æ‡¶Æ) ‡¶ï‡ßç‡¶≤‡ßã‡¶∞‡¶æ‡¶á‡¶°(?=\W|$)": "NaCl",
    r"(‡¶∏‡ßã‡¶°‡¶ø‡¶Ø‡¶º‡¶æ‡¶Æ|‡¶∏‡ßã‡¶°‡¶ø‡ßü‡¶æ‡¶Æ) ‡¶¨‡¶æ‡¶á‡¶ï‡¶æ‡¶∞‡ßç‡¶¨‡ßã‡¶®‡ßá‡¶ü(?=\W|$)": "NaHCO‚ÇÉ",
    r"(‡¶∏‡ßã‡¶°‡¶ø‡¶Ø‡¶º‡¶æ‡¶Æ|‡¶∏‡ßã‡¶°‡¶ø‡ßü‡¶æ‡¶Æ) ‡¶ï‡¶æ‡¶∞‡ßç‡¶¨‡ßã‡¶®‡ßá‡¶ü(?=\W|$)": "Na‚ÇÇCO‚ÇÉ",
    r"(‡¶™‡¶ü‡¶æ‡¶∏‡¶ø‡¶Ø‡¶º‡¶æ‡¶Æ|‡¶™‡¶ü‡¶æ‡¶∏‡¶ø‡ßü‡¶æ‡¶Æ) ‡¶™‡¶æ‡¶∞‡¶Æ‡ßç‡¶Ø‡¶æ‡¶ô‡ßç‡¶ó‡¶æ‡¶®‡ßá‡¶ü(?=\W|$)": "KMnO‚ÇÑ",
    r"‡¶π‡¶æ‡¶á‡¶°‡ßç‡¶∞‡ßã‡¶ú‡ßá‡¶® ‡¶™‡¶æ‡¶∞ ‡¶Ö‡¶ï‡ßç‡¶∏‡¶æ‡¶á‡¶°(?=\W|$)": "H‚ÇÇO‚ÇÇ",

    r"‡¶π‡¶æ‡¶á‡¶°‡ßç‡¶∞‡ßã‡¶ú‡ßá‡¶®(?=\W|$)": "H",
    r"(‡¶π‡¶ø‡¶≤‡¶ø‡¶Ø‡¶º‡¶æ‡¶Æ|‡¶π‡¶ø‡¶≤‡¶ø‡ßü‡¶æ‡¶Æ)(?=\W|$)": "He",
    r"(‡¶≤‡¶ø‡¶•‡¶ø‡¶Ø‡¶º‡¶æ‡¶Æ|‡¶≤‡¶ø‡¶•‡¶ø‡ßü‡¶æ‡¶Æ)(?=\W|$)": "Li",
    r"(‡¶¨‡ßá‡¶∞‡¶ø‡¶≤‡¶ø‡¶Ø‡¶º‡¶æ‡¶Æ|‡¶¨‡ßá‡¶∞‡¶ø‡¶≤‡¶ø‡ßü‡¶æ‡¶Æ)(?=\W|$)": "Be",
    r"‡¶¨‡ßã‡¶∞‡¶®(?=\W|$)": "B",
    r"‡¶ï‡¶æ‡¶∞‡ßç‡¶¨‡¶® (?=\W|$)": "C ",
    r"‡¶®‡¶æ‡¶á‡¶ü‡ßç‡¶∞‡ßã‡¶ú‡ßá‡¶®(?=\W|$)": "N",
    r"‡¶Ö‡¶ï‡ßç‡¶∏‡¶ø‡¶ú‡ßá‡¶®(?=\W|$)": "O",
    r"‡¶´‡ßç‡¶≤‡ßã‡¶∞‡¶ø‡¶®(?=\W|$)": "F",
    r"(‡¶®‡¶ø‡ßü‡¶®|‡¶®‡¶ø‡¶Ø‡¶º‡¶®)(?=\W|$)": "Ne",
    r"(‡¶∏‡ßã‡¶°‡¶ø‡¶Ø‡¶º‡¶æ‡¶Æ|‡¶∏‡ßã‡¶°‡¶ø‡ßü‡¶æ‡¶Æ)(?=\W|$)": "Na",
    r"(‡¶Æ‡ßç‡¶Ø‡¶æ‡¶ó‡¶®‡ßá‡¶∏‡¶ø‡¶Ø‡¶º‡¶æ‡¶Æ|‡¶Æ‡ßç‡¶Ø‡¶æ‡¶ó‡¶®‡ßá‡¶∏‡¶ø‡ßü‡¶æ‡¶Æ)(?=\W|$)": "Mg",
    r"(‡¶Ö‡ßç‡¶Ø‡¶æ‡¶≤‡ßÅ‡¶Æ‡¶ø‡¶®‡¶ø‡¶Ø‡¶º‡¶æ‡¶Æ|‡¶Ö‡ßç‡¶Ø‡¶æ‡¶≤‡ßÅ‡¶Æ‡¶ø‡¶®‡¶ø‡ßü‡¶æ‡¶Æ)(?=\W|$)": "Al",
    r"‡¶∏‡¶ø‡¶≤‡¶ø‡¶ï‡¶®(?=\W|$)": "Si",
    r"‡¶´‡¶∏‡¶´‡¶∞‡¶æ‡¶∏(?=\W|$)": "P",
    r"‡¶∏‡¶æ‡¶≤‡¶´‡¶æ‡¶∞(?=\W|$)": "S",
    r"‡¶ï‡ßç‡¶≤‡ßã‡¶∞‡¶ø‡¶®(?=\W|$)": "Cl",
    r"‡¶Ü‡¶∞‡ßç‡¶ó‡¶®(?=\W|$)": "Ar",
    r"(‡¶™‡¶ü‡¶æ‡¶∏‡¶ø‡¶Ø‡¶º‡¶æ‡¶Æ|‡¶™‡¶ü‡¶æ‡¶∏‡¶ø‡ßü‡¶æ‡¶Æ)(?=\W|$)": "K",
    r"(‡¶ï‡ßç‡¶Ø‡¶æ‡¶≤‡¶∏‡¶ø‡¶Ø‡¶º‡¶æ‡¶Æ|‡¶ï‡ßç‡¶Ø‡¶æ‡¶≤‡¶∏‡¶ø‡ßü‡¶æ‡¶Æ)(?=\W|$)": "Ca",
    r"‡¶≤‡ßã‡¶π‡¶æ(?=\W|$)": "Fe",
    r"‡¶ï‡¶™‡¶æ‡¶∞|‡¶§‡¶æ‡¶Æ‡¶æ(?=\W|$)": "Cu",
    r"‡¶ú‡¶ø‡¶ô‡ßç‡¶ï(?=\W|$)": "Zn",
    r"‡¶∏‡¶ø‡¶≤‡¶≠‡¶æ‡¶∞|‡¶∞‡ßÇ‡¶™‡¶æ(?=\W|$)": "Ag",
    r"‡¶ó‡ßã‡¶≤‡ßç‡¶°|‡¶∏‡ßã‡¶®‡¶æ(?=\W|$)": "Au",
    r"‡¶Æ‡¶æ‡¶∞‡ßç‡¶ï‡¶æ‡¶∞‡¶ø|‡¶™‡¶æ‡¶∞‡¶¶(?=\W|$)": "Hg",
    r"‡¶≤‡ßá‡¶°|‡¶∏‡¶ø‡¶∏‡¶æ(?=\W|$)": "Pb",
    r" ‡¶ü‡¶ø‡¶®(?=\W|$)": " Sn",
    r"‡¶®‡¶ø‡¶ï‡ßá‡¶≤(?=\W|$)": "Ni",
    r"(‡¶ï‡ßç‡¶∞‡ßã‡¶Æ‡¶ø‡¶Ø‡¶º‡¶æ‡¶Æ|‡¶ï‡ßç‡¶∞‡ßã‡¶Æ‡¶ø‡ßü‡¶æ‡¶Æ)(?=\W|$)": "Cr",
    r"‡¶Æ‡ßç‡¶Ø‡¶æ‡¶ô‡ßç‡¶ó‡¶æ‡¶®‡¶ø‡¶ú(?=\W|$)": "Mn",
    r"‡¶ï‡ßã‡¶¨‡¶æ‡¶≤‡ßç‡¶ü(?=\W|$)": "Co",
    r"‡¶™‡ßç‡¶≤‡¶æ‡¶ü‡¶ø‡¶®‡¶æ‡¶Æ(?=\W|$)": "Pt",
    r"(‡¶á‡¶â‡¶∞‡ßá‡¶®‡¶ø‡¶Ø‡¶º‡¶æ‡¶Æ|‡¶á‡¶â‡¶∞‡ßá‡¶®‡¶ø‡ßü‡¶æ‡¶Æ)(?=\W|$)": "U",
    r"(‡¶Ü‡¶Ø‡¶º‡ßã‡¶°‡¶ø‡¶®|‡¶Ü‡ßü‡ßã‡¶°‡¶ø‡¶®)(?=\W|$)": "I",
    r"‡¶¨‡ßç‡¶∞‡ßã‡¶Æ‡¶ø‡¶®(?=\W|$)": "Br",

    r"‡¶´‡¶ø‡¶ú‡¶ø‡¶ï‡ßç‡¶∏(?=\W|$)": "Physics",
    r"(‡¶∞‡¶∏‡¶æ‡¶Ø‡¶º‡¶®|‡¶∞‡¶∏‡¶æ‡ßü‡¶®)(?=\W|$)|‡¶ï‡ßá‡¶Æ‡¶ø‡¶∏‡ßç‡¶ü‡ßç‡¶∞‡¶ø(?=\W|$)": "Chemistry",
    r"‡¶ú‡ßÄ‡¶¨‡¶¨‡¶ø‡¶ú‡ßç‡¶û‡¶æ‡¶®(?=\W|$)|(‡¶¨‡¶æ‡¶Ø‡¶º‡ßã‡¶≤‡¶ú‡¶ø|‡¶¨‡¶æ‡ßü‡ßã‡¶≤‡¶ú‡¶ø)(?=\W|$)": "Biology",
    r"‡¶â‡¶ö‡ßç‡¶ö‡¶§‡¶∞ ‡¶ó‡¶£‡¶ø‡¶§(?=\W|$)|(‡¶π‡¶æ‡¶Ø‡¶º‡¶æ‡¶∞|‡¶π‡¶æ‡ßü‡¶æ‡¶∞) ‡¶Æ‡ßç‡¶Ø‡¶æ‡¶•(?=\W|$)|(‡¶π‡¶æ‡¶Ø‡¶º‡¶æ‡¶∞|‡¶π‡¶æ‡ßü‡¶æ‡¶∞) ‡¶Æ‡ßç‡¶Ø‡¶æ‡¶•‡¶Æ‡ßá‡¶ü‡¶ø‡¶ï‡ßç‡¶∏(?=\W|$)": "Higher Mathematics",
    r"‡¶Æ‡ßç‡¶Ø‡¶æ‡¶•(?=\W|$)": "Math",
    r"‡¶á‡¶Ç‡¶∞‡ßá‡¶ú‡¶ø(?=\W|$)": "English",
    r"‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ(?=\W|$)": "Bangla",
    r"‡¶Ü‡¶á‡¶∏‡¶ø‡¶ü‡¶ø(?=\W|$)|‡¶§‡¶•‡ßç‡¶Ø ‡¶ì ‡¶Ø‡ßã‡¶ó‡¶æ‡¶Ø‡ßã‡¶ó ‡¶™‡ßç‡¶∞‡¶Ø‡ßÅ‡¶ï‡ßç‡¶§‡¶ø(?=\W|$)": "ICT",
    r"‡¶≠‡ßÇ‡¶ó‡ßã‡¶≤(?=\W|$)|‡¶ú‡¶ø‡¶ì‡¶ó‡ßç‡¶∞‡¶æ‡¶´‡¶ø(?=\W|$)": "Geography",
    r"‡¶∏‡¶æ‡¶á‡¶®‡ßç‡¶∏(?=\W|$)": "Science",
    r"‡¶ï‡¶Æ‡ßç‡¶™‡¶ø‡¶â‡¶ü‡¶æ‡¶∞(?=\W|$)": "Computer",
    r"‡¶è‡¶á ‡¶Ö‡¶Ç‡¶∂‡ßá ": ""
}


                for pattern, replacement in regex_patterns.items():
                  final_text = re.sub(pattern, replacement, final_text)
                  regex_pat={
    r"([a-zA-Z‚ÇÄ‚ÇÅ‚ÇÇ‚ÇÉ‚ÇÑ‚ÇÖ‚ÇÜ‚Çá‚Çà‚Çâ+-])‡ßá": r"\1‡¶è",
    r"([a-z0-9ŒªŒ∏œÄœÉœÅœâœÜŒ±Œ≤Œ≥]) (‡¶∏‡ßç‡¶ï‡ßü‡¶æ‡¶∞|‡¶∏‡ßç‡¶ï‡¶Ø‡¶º‡¶æ‡¶∞|‡¶∏‡ßç‡¶ï‡ßã‡ßü‡¶æ‡¶∞|‡¶∏‡ßç‡¶ï‡ßã‡¶Ø‡¶º‡¶æ‡¶∞)": r"\1¬≤",
    r"([a-z0-9ŒªŒ∏œÄœÉœÅœâœÜŒ±Œ≤Œ≥]) ‡¶ï‡¶ø‡¶â‡¶¨": r"\1¬≥",
    r"(‡¶∞‡ßÅ‡¶ü ‡¶ì‡¶≠‡¶æ‡¶∞‡•§‡¶∞‡ßÅ‡¶ü) ([a-z0-9ŒªŒ∏œÄœÉœÅœâœÜŒ±Œ≤Œ≥])": r"‚àö\2",
    r"([a-zŒªŒ∏œÄœÉœÅœâœÜŒ±Œ≤Œ≥]) ‡¶®‡¶ü": r"\1‚ÇÄ",
    r"‡¶ü‡¶æ‡¶® ([a-zŒªŒ∏œÄœÉœÅœâœÜŒ±Œ≤Œ≥])": r"tan \1",
    r"(‡¶ï‡¶∏|‡¶ï‡¶ú) ([a-zŒªŒ∏œÄœÉœÅœâœÜŒ±Œ≤Œ≥])": r"cos \2"



}

                for pattern, replacement in regex_pat.items():
                  final_text = re.sub(pattern, replacement, final_text)

            # Save final text
            with open(final_text_path, 'w', encoding='utf-8') as f:
              f.write(final_text)


            # Final update
            update_status(100, f'Processing completed successfully! ({total_iterations} summarization iterations)')


        # Return final results
            results = {
            'status': 'completed',
            'progress': 100,
            'message': f'Processing completed successfully! ({total_iterations} summarization iterations)',
            'files': {
                'audio': audio_path,
                'transcription': transcription_path,
                'summary_excel': summary_path,
                'final_text': final_text_path
            },
            'preview': final_text[:5000] + "..." if len(final_text) > 5000 else final_text,
            'iterations': total_iterations
        }

            return results

        except Exception as e:

            error_msg = f'Processing failed: {str(e)}'
            if job_id and job_id in processing_status:
              processing_status[job_id].update({
                  'status': 'error',
                 'message': error_msg,
                  'error': traceback.format_exc()
              })
            return {
            'status': 'error',
            'message': error_msg,
            'error': traceback.format_exc()
         }


    def recursive_summarization_with_concatenation(self, df, max_chunks=50):
        """
        Summarize the rows of ``df`` in rounds until at most ``max_chunks`` remain.

        Works on a copy of ``df``. While the working frame has more than
        ``max_chunks`` rows, each round:

        1. fills a ``Summary`` column by calling ``self.generate_summary`` on
           every ``Text`` value (converted with ``str``);
        2. if ``self.tokenizer`` exists and is truthy, fills ``Token(Text)`` and
           ``Token(Summary)`` via ``self.count_tokens`` and prints their means;
        3. writes the round's columns to ``iteration_<n>_results.xlsx`` in the
           parent directory of ``self.temp_dir`` (or ``/tmp`` when the instance
           has no ``temp_dir`` attribute);
        4. replaces the working frame with ``self.concatenate_pairs(frame)``
           (presumably this shrinks the row count -- confirm in that method).

        The loop is abandoned once ten rounds have run. Afterwards a
        ``Final_Summary`` column is computed from ``Text`` on whatever rows remain.

        Args:
            df: DataFrame with a ``Text`` column; ``Start_Time`` and ``End_Time``
                are also required whenever at least one round runs, because
                they are selected for the Excel export.
            max_chunks: Row-count threshold at or below which no further
                rounds are performed.

        Returns:
            Tuple ``(frame, iteration)`` where ``iteration`` is one more than
            the number of rounds that were executed.
        """
        def summarize(text):
            # Every cell is coerced to str before it reaches the model.
            return self.generate_summary(str(text))

        current_df = df.copy()
        iteration = 1

        while len(current_df) > max_chunks:
            print(f"\nüîÑ Iteration {iteration}: Processing {len(current_df)} rows...")

            # Summaries for the rows of this round
            print("üìÑ Generating summaries...")
            current_df['Summary'] = current_df['Text'].apply(summarize)

            # Token columns are only produced when a usable tokenizer is attached
            has_tokenizer = bool(hasattr(self, 'tokenizer') and self.tokenizer)
            save_columns = ['Text', 'Summary', 'Start_Time', 'End_Time']

            if has_tokenizer:
                print("üî¢ Calculating token counts...")
                for token_column, source_column in (
                    ('Token(Text)', 'Text'),
                    ('Token(Summary)', 'Summary'),
                ):
                    current_df[token_column] = current_df[source_column].apply(
                        self.count_tokens
                    )

                avg_text_tokens = current_df['Token(Text)'].mean()
                avg_summary_tokens = current_df['Token(Summary)'].mean()
                print(f"üìä Token Statistics - Iteration {iteration}:")
                print(f"   Average Text tokens: {avg_text_tokens:.1f}")
                print(f"   Average Summary tokens: {avg_summary_tokens:.1f}")

                save_columns += ['Token(Text)', 'Token(Summary)']

            # Per-round Excel snapshot
            if hasattr(self, 'temp_dir'):
                output_dir = os.path.dirname(self.temp_dir)
            else:
                output_dir = '/tmp'
            iteration_excel_path = os.path.join(
                output_dir, f"iteration_{iteration}_results.xlsx"
            )
            current_df[save_columns].copy().to_excel(iteration_excel_path, index=False)
            print(f"üíæ Iteration {iteration} results saved: {iteration_excel_path}")

            # Merge neighbouring rows for the next round
            print("üîó Concatenating pairs of rows...")
            current_df = self.concatenate_pairs(current_df)
            print(f"üî¢ After concatenation: {len(current_df)} rows")

            # Debug preview of the first few merged rows
            print(f"üìù Sample rows after concatenation:")
            for i, (_, row) in enumerate(current_df.head(3).iterrows()):
                print(f"   Row {i+1} (ID: {row.get('Chunk_ID', 'N/A')}):")
                print(f"     Start: {row.get('Start_Time', 'N/A')}, End: {row.get('End_Time', 'N/A')}")
                print(f"     Text: {str(row.get('Text', ''))[:100]}...")

            iteration += 1

            # Hard stop so a non-shrinking frame cannot loop forever
            if iteration > 10:
                print("‚ö†Ô∏è Maximum iterations reached. Breaking loop.")
                break

        # Last pass over whatever rows are left
        print(f"\n‚úÖ Final iteration: Processing {len(current_df)} rows...")
        current_df['Final_Summary'] = current_df['Text'].apply(summarize)

        return current_df, iteration
# NOTE: VideoProcessor is deliberately NOT constructed here. A construction
# used to sit at this point, but its result was unconditionally discarded by
# the `processor = None` assignment below and the model was then loaded a
# second time in the "CELL 4: Initialize and Run" section. Loading once there
# avoids paying the model start-up cost twice.

# Flask Application
app = Flask(__name__)
CORS(app)

# Global variables
# Shared VideoProcessor instance read by background_process(); stays None
# until the CELL 4 initialisation succeeds.
processor = None
# Maps job_id -> mutable status dict for that job (written by the upload
# route and the worker thread, read by the status/download routes).
processing_status = {}

# Simple HTML template for Colab
# Single-page UI served by index(): pick a video, POST it to /api/process,
# poll /api/status/<job_id> every 3 seconds, then offer downloads through
# /api/download/<job_id>/<type>.
# Status and error messages are written with `textContent` (not `innerHTML`)
# because they contain server- and exception-derived text that must never be
# interpreted as markup.
HTML_TEMPLATE = '''<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Video Processing & Summarization</title>
    <style>
        body { font-family: Arial, sans-serif; margin: 20px; background: #f5f5f5; }
        .container { max-width: 800px; margin: 0 auto; background: white; padding: 20px; border-radius: 10px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); }
        .upload-area { border: 2px dashed #4CAF50; padding: 30px; text-align: center; margin: 20px 0; border-radius: 10px; background: #f9fff9; }
        .upload-area:hover { background: #f0fff0; }
        .progress-bar { width: 100%; height: 25px; background: #f0f0f0; border-radius: 12px; overflow: hidden; margin: 10px 0; }
        .progress-fill { height: 100%; background: linear-gradient(90deg, #4CAF50, #45a049); width: 0%; transition: width 0.3s; }
        button { background: #007bff; color: white; border: none; padding: 12px 24px; border-radius: 6px; cursor: pointer; font-size: 16px; margin: 5px; }
        button:hover { background: #0056b3; }
        button:disabled { background: #ccc; cursor: not-allowed; }
        .download-btn { background: #28a745; }
        .download-btn:hover { background: #218838; }
        .error { color: #dc3545; padding: 15px; background: #f8d7da; border: 1px solid #f5c6cb; border-radius: 5px; margin: 10px 0; }
        .success { color: #155724; padding: 15px; background: #d4edda; border: 1px solid #c3e6cb; border-radius: 5px; margin: 10px 0; }
        .preview { max-height: 300px; overflow-y: auto; border: 1px solid #ddd; padding: 15px; background: #f8f9fa; border-radius: 5px; font-family: monospace; }
        .status-badge { display: inline-block; padding: 5px 10px; border-radius: 15px; font-size: 12px; font-weight: bold; }
        .status-processing { background: #fff3cd; color: #856404; }
        .status-completed { background: #d4edda; color: #155724; }
        .status-error { background: #f8d7da; color: #721c24; }
    </style>
</head>
<body>
    <div class="container">
        <h1>üé¨ Video Processing & Summarization</h1>
        <p>Upload a Bengali video file to get automated transcription and summarization</p>

        <div class="upload-area" id="uploadArea" onclick="document.getElementById('videoInput').click()">
            <input type="file" id="videoInput" accept="video/*" style="display: none;">
            <p>üìÅ Click to select a video file or drag and drop</p>
            <button type="button">Choose Video File</button>
        </div>

        <div id="fileInfo" style="display: none;">
            <h3>üìπ Selected File</h3>
            <p><strong>Name:</strong> <span id="fileName"></span></p>
            <p><strong>Size:</strong> <span id="fileSize"></span></p>
            <button id="processBtn" onclick="processVideo()">üöÄ Start Processing</button>
        </div>

        <div id="progressSection" style="display: none;">
            <h3>‚öôÔ∏è Processing Status</h3>
            <div class="progress-bar">
                <div class="progress-fill" id="progressFill"></div>
            </div>
            <p id="progressText">0% - Initializing...</p>
        </div>

        <div id="resultsSection" style="display: none;">
            <h3>üìä Processing Results</h3>
            <div id="downloadButtons" style="text-align: center; margin: 20px 0;">
                <button class="download-btn" onclick="downloadFile('audio')">üéµ Audio File</button>
                <button class="download-btn" onclick="downloadFile('transcription')">üìã Transcription</button>
                <button class="download-btn" onclick="downloadFile('summary_excel')">üìä Summary Excel</button>
                <button class="download-btn" onclick="downloadFile('final_text')">üìù Final Summary</button>
            </div>

            <h4>üîç Preview:</h4>
            <div id="summaryPreview" class="preview"></div>
        </div>

        <div id="messages"></div>
    </div>

    <script>
        let selectedFile = null;
        let currentJobId = null;

        document.getElementById('videoInput').addEventListener('change', function(e) {
            const file = e.target.files[0];
            if (file) {
                selectedFile = file;
                document.getElementById('fileName').textContent = file.name;
                document.getElementById('fileSize').textContent = (file.size / 1024 / 1024).toFixed(2) + ' MB';
                document.getElementById('fileInfo').style.display = 'block';
            }
        });

        async function processVideo() {
            if (!selectedFile) {
                showError('Please select a video file first');
                return;
            }

            const formData = new FormData();
            formData.append('video', selectedFile);

            document.getElementById('progressSection').style.display = 'block';
            const processBtn = document.getElementById('processBtn');
            processBtn.disabled = true;
            processBtn.textContent = '‚è≥ Processing...';

            try {
                const response = await fetch('/api/process', {
                    method: 'POST',
                    body: formData
                });

                const result = await response.json();

                if (result.status === 'success') {
                    currentJobId = result.job_id;
                    pollProgress(result.job_id);
                } else {
                    throw new Error(result.message);
                }

            } catch (error) {
                showError('Processing failed: ' + error.message);
                processBtn.disabled = false;
                processBtn.textContent = 'üöÄ Start Processing';
            }
        }

        async function pollProgress(jobId) {
            const interval = setInterval(async () => {
                try {
                    const response = await fetch(`/api/status/${jobId}`);
                    const status = await response.json();

                    updateProgress(status.progress || 0, status.message || 'Processing...');

                    if (status.status === 'completed') {
                        clearInterval(interval);
                        displayResults(status);
                        showSuccess('üéâ Video processing completed successfully!');
                        document.getElementById('resultsSection').style.display = 'block';
                        document.getElementById('processBtn').disabled = false;
                        document.getElementById('processBtn').textContent = 'üöÄ Start Processing';
                    } else if (status.status === 'error') {
                        clearInterval(interval);
                        showError('‚ùå Processing failed: ' + status.message);
                        document.getElementById('processBtn').disabled = false;
                        document.getElementById('processBtn').textContent = 'üöÄ Start Processing';
                    }
                } catch (error) {
                    console.error('Error polling status:', error);
                }
            }, 3000);
        }

        function updateProgress(progress, message) {
            document.getElementById('progressFill').style.width = progress + '%';
            document.getElementById('progressText').textContent = progress + '% - ' + message;
        }

        function displayResults(status) {
            if (status.preview) {
                document.getElementById('summaryPreview').textContent = status.preview;
            }
        }

        async function downloadFile(type) {
            if (!currentJobId) {
                showError('No processing job found');
                return;
            }

            try {
                const response = await fetch(`/api/download/${currentJobId}/${type}`);

                if (response.ok) {
                    const blob = await response.blob();
                    const url = window.URL.createObjectURL(blob);
                    const a = document.createElement('a');
                    a.href = url;
                    a.download = getFileName(type);
                    a.click();
                    window.URL.revokeObjectURL(url);
                } else {
                    showError('Download failed');
                }
            } catch (error) {
                showError('Download error: ' + error.message);
            }
        }

        function getFileName(type) {
            const names = {
                'audio': 'extracted_audio.wav',
                'transcription': 'transcription_chunks.xlsx',
                'summary_excel': 'summaries_with_timestamps.xlsx',
                'final_text': 'final_summary.txt'
            };
            return names[type] || 'download.txt';
        }

        function showError(message) {
            const div = document.createElement('div');
            div.className = 'error';
            div.textContent = '‚ùå ' + message;
            document.getElementById('messages').appendChild(div);
            setTimeout(() => div.remove(), 8000);
        }

        function showSuccess(message) {
            const div = document.createElement('div');
            div.className = 'success';
            div.textContent = '‚úÖ ' + message;
            document.getElementById('messages').appendChild(div);
            setTimeout(() => div.remove(), 8000);
        }
    </script>
</body>
</html>'''

@app.route('/')
def index():
    """Render the upload/progress page stored in HTML_TEMPLATE for the root URL."""
    page = render_template_string(HTML_TEMPLATE)
    return page

@app.route('/api/process', methods=['POST'])
def process_video():
    """Accept an uploaded video and start processing it on a background thread.

    Expects a multipart form upload with the file under the ``video`` field.
    The file is saved into a per-job directory ``/tmp/video_processing_<job_id>``,
    an entry for the job is created in ``processing_status``, and a daemon
    thread running ``background_process`` is started.

    Returns:
        JSON ``{'status': 'success', 'job_id': ..., 'message': ...}`` once the
        job has been started, or ``{'status': 'error', 'message': ...}`` when
        the upload is missing or empty or any exception is raised. No explicit
        HTTP status code is set on the error responses.
    """
    try:
        if 'video' not in request.files:
            return jsonify({'status': 'error', 'message': 'No video file provided'})

        video_file = request.files['video']
        # `not` also covers a missing (None) filename, not just the empty string
        if not video_file.filename:
            return jsonify({'status': 'error', 'message': 'No video file selected'})

        # The filename is supplied by the client and must not be trusted:
        # keep only its final path component so a name such as
        # "../../etc/cron.d/x" cannot escape the job directory. Backslashes are
        # normalised first so Windows-style paths are reduced the same way.
        safe_name = os.path.basename(video_file.filename.replace('\\', '/'))
        if safe_name in ('', '.', '..'):
            safe_name = 'uploaded_video'

        # Create unique job ID (millisecond timestamp)
        job_id = str(int(time.time() * 1000))

        # Create temporary directory for this job
        temp_dir = os.path.join('/tmp', f'video_processing_{job_id}')
        os.makedirs(temp_dir, exist_ok=True)

        # Save uploaded file under the sanitised name
        video_path = os.path.join(temp_dir, safe_name)
        video_file.save(video_path)

        # Initialize processing status
        processing_status[job_id] = {
            'status': 'queued',
            'progress': 0,
            'message': 'Processing queued...',
            'temp_dir': temp_dir
        }

        # Start processing in background thread; daemon so it never blocks
        # interpreter shutdown
        thread = threading.Thread(
            target=background_process,
            args=(job_id, video_path, temp_dir)
        )
        thread.daemon = True
        thread.start()

        return jsonify({
            'status': 'success',
            'job_id': job_id,
            'message': 'Processing started'
        })

    except Exception as e:
        return jsonify({
            'status': 'error',
            'message': str(e)
        })

def background_process(job_id, video_path, temp_dir):
    """Worker-thread entry point: run the pipeline for one job and record the outcome.

    Reads the module-level ``processor``. When it is ``None`` the job is marked
    as an error straight away. Otherwise the job is flagged as ``processing``,
    ``processor.process_complete_pipeline`` is invoked, and the dict it returns
    is merged into ``processing_status[job_id]``. Any exception raised along
    the way is stored on the job together with its traceback text.

    Args:
        job_id: Key of the job inside ``processing_status``.
        video_path: Path of the uploaded video file.
        temp_dir: Directory handed to the pipeline for this job.
    """
    try:
        if processor is None:
            processing_status[job_id].update(
                status='error',
                message='Model not initialized',
            )
            return

        # Mark the job as running before the long pipeline call
        processing_status[job_id].update(
            status='processing',
            progress=5,
            message='Starting video processing...',
        )

        outcome = processor.process_complete_pipeline(video_path, temp_dir, job_id=job_id)

        # Merge the pipeline's result dict into the job entry
        processing_status[job_id].update(outcome)

    except Exception as exc:
        processing_status[job_id].update(
            status='error',
            message=str(exc),
            error=traceback.format_exc(),
        )

@app.route('/api/status/<job_id>')
def get_status(job_id):
    """Return the stored status dict of a job as JSON.

    An unknown ``job_id`` yields a JSON body with ``status`` set to ``'error'``
    and the message ``'Job not found'``.
    """
    if job_id not in processing_status:
        return jsonify({'status': 'error', 'message': 'Job not found'})
    return jsonify(processing_status[job_id])

@app.route('/api/download/<job_id>/<file_type>')
def download_file(job_id, file_type):
    """Send one output file of a completed job as an attachment.

    Args:
        job_id: Key of the job inside ``processing_status``.
        file_type: Key into the job's ``files`` mapping.

    Returns:
        The file via ``send_file`` on success. Otherwise a JSON ``{'error': ...}``
        body with 404 (unknown job, unknown file type, or path missing on disk),
        400 (job not completed) or 500 (``send_file`` raised).
    """
    if job_id not in processing_status:
        return jsonify({'error': 'Job not found'}), 404

    job = processing_status[job_id]
    if job['status'] != 'completed':
        return jsonify({'error': 'Processing not completed'}), 400

    if file_type not in job.get('files', {}):
        return jsonify({'error': 'File not found'}), 404

    file_path = job['files'][file_type]
    if not os.path.exists(file_path):
        return jsonify({'error': f'File does not exist: {file_path}'}), 404

    # (mimetype, download name) for each known file type
    xlsx_mimetype = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
    presets = {
        'audio': ('audio/wav', 'extracted_audio.wav'),
        'transcription': (xlsx_mimetype, 'transcription_chunks.xlsx'),
        'summary_excel': (xlsx_mimetype, 'summaries_with_timestamps.xlsx'),
        'final_text': ('text/plain; charset=utf-8', 'final_summary.txt'),
    }
    # Unlisted types fall back to a generic binary download
    mimetype, download_name = presets.get(
        file_type,
        ('application/octet-stream', f'download_{file_type}'),
    )

    try:
        return send_file(
            file_path,
            as_attachment=True,
            download_name=download_name,
            mimetype=mimetype,
        )
    except Exception as exc:
        print(f"Error sending file {file_path}: {exc}")
        return jsonify({'error': f'Failed to send file: {str(exc)}'}), 500

# ===== CELL 4: Initialize and Run =====
# Initialize the processor
# Builds the shared VideoProcessor that background_process() reads through the
# module-level `processor` name. If construction raises, the error is printed
# and `processor` is left unassigned here, so it keeps the None set in the
# "Global variables" section and jobs will report 'Model not initialized'.
try:
    print("üîÑ Initializing model...")
    processor = VideoProcessor()  # Will use default model
    print("‚úÖ Model initialized successfully!")
except Exception as e:
    print(f"‚ùå Model initialization failed: {e}")
    print("Please check your model path or use a different model")

# Start the Flask app with ngrok
# Runs only when this module is executed as the main script.
if __name__ == '__main__':
    # Create ngrok tunnel
    # The port passed here (5000) matches the port given to app.run() below.
    public_url = ngrok.connect(5000)
    print(f"üåê Public URL: {public_url}")
    print(f"üîó Access your app at: {public_url}")

    # Run Flask app
    # Binds to all interfaces with the debugger disabled; this call is not
    # expected to return while the server is running.
    app.run(host='0.0.0.0', port=5000, debug=False)





















































  r"\(‡¶è ": "\(a ",

  r"\(‡¶¨‡¶ø ": "\(b ",

  r"\(‡¶∏‡¶ø ": "\(c ",

  r"\(‡¶°‡¶ø ": "\(d ",

  r"\(‡¶á ": "\(e ",

  r"\(‡¶è‡¶´ ": "\(f ",

  r"\(‡¶ú‡¶ø ": "\(g ",

  r"\(‡¶è‡¶á‡¶ö ": "\(h ",

  r"\(‡¶Ü‡¶á ": "\(i ",

  r"\(‡¶ú‡ßá ": "\(j ",

  r"\(‡¶ï‡ßá ": "\(k ",

  r"\(‡¶è‡¶≤ ": "\(l ",

  r"\(‡¶è‡¶Æ ": "\(m ",

  r"\(‡¶è‡¶® ": "\(n ",

  r"\(‡¶ì ": "\(o ",

  r"\(‡¶™‡¶ø ": "\(p ",

  r"\(‡¶ï‡¶ø‡¶â ": "\(q ",

  r"\(‡¶Ü‡¶∞ ": "\(r ",

  r"\(‡¶è‡¶∏ ": "\(s ",

  r"\(‡¶ü‡¶ø ": "\(t ",

  r"\(‡¶á‡¶â ": "\(u ",

  r"\(‡¶≠‡¶ø ": "\(v ",

  r"\((‡¶°‡¶¨‡ßç‡¶≤‡¶ø‡¶â‡•§‡¶°‡¶æ‡¶¨‡¶≤‡¶ø‡¶â) ": "\(w ",

  r"\(‡¶è‡¶ï‡ßç‡¶∏ ": "\(x ",

  r"\(‡¶ì‡¶Ø‡¶º‡¶æ‡¶á |\(‡¶ì‡ßü‡¶æ‡¶á ": "\(y ",

  r"\(‡¶ú‡ßá‡¶° ": "\(z ",



Collecting git+https://github.com/csebuetnlp/normalizer
  Cloning https://github.com/csebuetnlp/normalizer to /tmp/pip-req-build-uw0w2kg8
  Running command git clone --filter=blob:none --quiet https://github.com/csebuetnlp/normalizer /tmp/pip-req-build-uw0w2kg8
  Resolved https://github.com/csebuetnlp/normalizer to commit d405944dde5ceeacb7c2fd3245ae2a9dea5f35c9
  Preparing metadata (setup.py) ... [?25l[?25hdone
Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:2 https://cli.github.com/packages stable InRelease
Hit:3 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:4 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:9 http://secu

Device set to use cuda:0


‚úÖ BanglaPunctuation loaded
‚úÖ VideoProcessor initialized successfully!
üîÑ Initializing model...


Device set to use cuda:0


‚úÖ BanglaPunctuation loaded
‚úÖ Model initialized successfully!
üåê Public URL: NgrokTunnel: "https://a9db8460d4f8.ngrok-free.app" -> "http://localhost:5000"
üîó Access your app at: NgrokTunnel: "https://a9db8460d4f8.ngrok-free.app" -> "http://localhost:5000"
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://172.28.0.12:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [29/Aug/2025 11:51:51] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [29/Aug/2025 11:51:52] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [29/Aug/2025 11:55:07] "POST /api/process HTTP/1.1" 200 -


üé¨ Loading video: /tmp/video_processing_1756468506036/shama_1.mp4
‚è±Ô∏è Video duration: 163.0 minutes
üéµ Extracting audio to: /tmp/video_processing_1756468506036/extracted_audio.wav


INFO:werkzeug:127.0.0.1 - - [29/Aug/2025 11:55:11] "GET /api/status/1756468506036 HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [29/Aug/2025 11:55:14] "GET /api/status/1756468506036 HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [29/Aug/2025 11:55:17] "GET /api/status/1756468506036 HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [29/Aug/2025 11:55:20] "GET /api/status/1756468506036 HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [29/Aug/2025 11:55:23] "GET /api/status/1756468506036 HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [29/Aug/2025 11:55:26] "GET /api/status/1756468506036 HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [29/Aug/2025 11:55:29] "GET /api/status/1756468506036 HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [29/Aug/2025 11:55:32] "GET /api/status/1756468506036 HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [29/Aug/2025 11:55:35] "GET /api/status/1756468506036 HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [29/Aug/2025 11:55:38] "GET /api/status/1756468506036 HTTP/1.1" 200 -
INFO:werkzeug:127.0.

‚úÖ Audio extracted successfully! (1644.8 MB)
‚úÖ Optimized speech recognizer initialized
‚ö° Per-chunk amplification: True
üåç Target language: Bengali (bn-BD)
üéØ Audio chunk size: 45.0 seconds
üìù Token-based final chunks: 512 tokens max


INFO:werkzeug:127.0.0.1 - - [29/Aug/2025 11:56:36] "GET /api/status/1756468506036 HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [29/Aug/2025 11:56:42] "GET /api/status/1756468506036 HTTP/1.1" 200 -


‚úÖ Audio loaded: 9777.2 seconds, Avg: -29.4 dB, Peak: -0.1 dB


INFO:werkzeug:127.0.0.1 - - [29/Aug/2025 11:56:43] "GET /api/status/1756468506036 HTTP/1.1" 200 -


üìù Step 1: Transcribing audio in 218 short chunks for sentence mapping...


Transcribing audio chunks:   0%|          | 0/218 [00:00<?, ?it/s]

INFO:werkzeug:127.0.0.1 - - [29/Aug/2025 11:56:44] "GET /api/status/1756468506036 HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [29/Aug/2025 11:56:47] "GET /api/status/1756468506036 HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [29/Aug/2025 11:56:50] "GET /api/status/1756468506036 HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [29/Aug/2025 11:56:53] "GET /api/status/1756468506036 HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [29/Aug/2025 11:56:56] "GET /api/status/1756468506036 HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [29/Aug/2025 11:56:59] "GET /api/status/1756468506036 HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [29/Aug/2025 11:57:02] "GET /api/status/1756468506036 HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [29/Aug/2025 11:57:05] "GET /api/status/1756468506036 HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [29/Aug/2025 11:57:08] "GET /api/status/1756468506036 HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [29/Aug/2025 11:57:11] "GET /api/status/1756468506036 HTTP/1.1" 200 -
INFO:werkzeug:127.0.

üîÑ Step 2: Creating sentence-level timestamp mapping with carry-over handling...
‚úÖ Created 1107 sentences with timestamps
üîÑ Step 3: Creating token-based chunks (max 512 tokens) with overlap and carry-over...
‚úÖ Created 28 token-based chunks
üìä Step 4: Preparing output data...
üìÑ Input has ‚â§50 rows. Generating summaries directly...


INFO:werkzeug:127.0.0.1 - - [29/Aug/2025 12:34:17] "GET /api/status/1756468506036 HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [29/Aug/2025 12:35:17] "GET /api/status/1756468506036 HTTP/1.1" 200 -


DataFrame shape: (28, 15)
Number of rows: 28
File saved successfully to: /content/drive/MyDrive/tfidf/new.txt
‚úÖ File copied to: /content/drive/MyDrive/tfidf/Concatenated_Summary/new.txt
Initializing Enhanced Bangla Combined Keyword Extractor...
Error initializing BNLP components: name 'BasicTokenizer' is not defined
Using fallback tokenization (simple split)
Successfully loaded 970 stopwords (no tokenization - tokenizer not available)

Loading documents from: /content/drive/MyDrive/tfidf/Concatenated_Summary
Found 213 .txt files.
Building corpus vocabulary...
Built corpus vocabulary: 13814 prominent terms
Processing documents with combined unigram-bigram extraction...
Added document: sum2.txt with 4902 unigrams, 2319 bigrams
Added document: sum22.txt with 1861 unigrams, 1002 bigrams
Added document: sum45.txt with 1509 unigrams, 880 bigrams
Added document: sum51.txt with 2662 unigrams, 1650 bigrams
Added document: sum3.txt with 3819 unigrams, 2068 bigrams
Added document: sum9.txt with

INFO:werkzeug:127.0.0.1 - - [29/Aug/2025 12:36:17] "GET /api/status/1756468506036 HTTP/1.1" 200 -


Document length: 2006 tokens, Dynamic keywords: 40
Document length: 4042 tokens, Dynamic keywords: 50
Document length: 1580 tokens, Dynamic keywords: 31
Document length: 127 tokens, Dynamic keywords: 5
Document length: 1590 tokens, Dynamic keywords: 31
Document length: 1349 tokens, Dynamic keywords: 26
Document length: 163 tokens, Dynamic keywords: 5
Document length: 2367 tokens, Dynamic keywords: 47
Document length: 3105 tokens, Dynamic keywords: 50
Document length: 303 tokens, Dynamic keywords: 6
Document length: 1968 tokens, Dynamic keywords: 39
Document length: 1730 tokens, Dynamic keywords: 34
Document length: 180 tokens, Dynamic keywords: 5
Document length: 116 tokens, Dynamic keywords: 5
Document length: 3304 tokens, Dynamic keywords: 50
Document length: 1113 tokens, Dynamic keywords: 22
Document length: 1821 tokens, Dynamic keywords: 36
Document length: 2243 tokens, Dynamic keywords: 44
Document length: 3029 tokens, Dynamic keywords: 50
Document length: 1705 tokens, Dynamic key

INFO:werkzeug:127.0.0.1 - - [29/Aug/2025 12:37:16] "GET /api/status/1756468506036 HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [29/Aug/2025 12:37:17] "GET /api/status/1756468506036 HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [29/Aug/2025 12:37:23] "GET /api/download/1756468506036/final_text HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [29/Aug/2025 12:37:52] "GET /api/download/1756468506036/summary_excel HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [29/Aug/2025 12:39:57] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [29/Aug/2025 12:40:03] "GET / HTTP/1.1" 200 -
