In [None]:
!pip install datasets
!pip install evaluate
!pip install rouge_score
import evaluate
import os
import pandas as pd
import torch
from google.colab import drive
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

# Disable wandb logging
os.environ["WANDB_DISABLED"] = "true"

# Mount Google Drive
drive.mount('/content/drive')

# Load the dataset: the first two spreadsheet columns become the source text and
# its reference summary; rows containing a missing value are dropped.
df = pd.read_excel("/content/drive/My Drive/Thesis_Dataset/TenK.xlsx")  # Adjust path
df = df.rename(columns={df.columns[0]: "text", df.columns[1]: "summary"})
df = df.dropna()

# Split ONCE into 80% train / 10% validation / 10% test.
# The 20% hold-out is carved off first and then halved, so the three splits are
# disjoint. (Previously `val_df` was the whole hold-out frame and therefore
# contained every test row.)
train_df, temp_df = train_test_split(df, test_size=0.20, random_state=42)
val_df, test_dataset = train_test_split(temp_df, test_size=0.50, random_state=42)

# Keep the validation frame reachable under the name this cell used to define.
val_dataset = val_df



# Combine datasets into a DatasetDict for Hugging Face Trainer
#dataset_dict = DatasetDict({"train": train_dataset, "val": val_dataset, "test": test_dataset})



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Wrap the pandas splits as Hugging Face datasets and group them by split name
hf_splits = {
    "train": Dataset.from_pandas(train_df),
    "val": Dataset.from_pandas(val_df),
}
train_dataset, val_dataset = hf_splits["train"], hf_splits["val"]
dataset_dict = DatasetDict(hf_splits)



In [None]:
# Function to generate summaries
def generate_summary(text, model, tokenizer, device):
    """Summarize a single text with a seq2seq model using beam search.

    Args:
        text: Source text to summarize; it is truncated to 512 tokens.
        model: Model exposing ``to(device)`` and ``generate(...)``.
        tokenizer: Tokenizer used to encode ``text`` and to decode the
            generated ids.
        device: Device the model and the encoded inputs are moved to.

    Returns:
        The decoded summary with special tokens removed.
    """
    model.to(device)

    # A single text needs no padding, so only truncate. Padding every input to
    # 512 tokens made the encoder process pad positions for nothing.
    inputs = tokenizer(text, max_length=512, truncation=True, return_tensors="pt").to(device)

    # Generate summary
    summary_ids = model.generate(
        input_ids=inputs["input_ids"],
        # Hand over the tokenizer's mask explicitly instead of relying on
        # generate() to infer it from pad tokens.
        attention_mask=inputs["attention_mask"],
        max_length=200,
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=True,
    )

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Define device
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



In [None]:
from transformers import MBartForConditionalGeneration, AutoTokenizer,MBart50Tokenizer,MBart50TokenizerFast

In [None]:
# Export the stock mBART-50 tokenizer to Drive, so the saved copy keeps the
# language-code tokens of the original pretrained tokenizer
base_model = "facebook/mbart-large-50"
tokenizer = MBart50TokenizerFast.from_pretrained(base_model)
tokenizer.save_pretrained(
    save_directory="/content/drive/My Drive/Thesis_Dataset/fine_tuned_mBART_tokenizer_fixed"
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/531 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

('/content/drive/My Drive/Thesis_Dataset/fine_tuned_mBART_tokenizer_fixed/tokenizer_config.json',
 '/content/drive/My Drive/Thesis_Dataset/fine_tuned_mBART_tokenizer_fixed/special_tokens_map.json',
 '/content/drive/My Drive/Thesis_Dataset/fine_tuned_mBART_tokenizer_fixed/sentencepiece.bpe.model',
 '/content/drive/My Drive/Thesis_Dataset/fine_tuned_mBART_tokenizer_fixed/added_tokens.json',
 '/content/drive/My Drive/Thesis_Dataset/fine_tuned_mBART_tokenizer_fixed/tokenizer.json')

In [None]:
# Define saved paths
model_path = "/content/drive/My Drive/Thesis_Dataset/fine_tuned_mBART"
tokenizer_path = "/content/drive/My Drive/Thesis_Dataset/fine_tuned_mBART_tokenizer_fixed"

# Load tokenizer and model
tokenizer = MBart50TokenizerFast.from_pretrained(tokenizer_path)
model = MBartForConditionalGeneration.from_pretrained(model_path)

# Set source and target languages (Bengali in, Bengali out)
tokenizer.src_lang = "bn_XX"
tokenizer.tgt_lang = "bn_XX"
model.config.src_lang = "bn_XX"
model.config.tgt_lang = "bn_XX"

# Force the decoder to start with the Bengali language token.
bn_token_id = tokenizer.convert_tokens_to_ids("bn_XX")
model.config.forced_bos_token_id = bn_token_id
# generate() takes its defaults from `model.generation_config`; a value set only
# on `model.config` after loading can be ignored when the checkpoint ships its
# own generation config, so mirror it there (guarded for older transformers).
if getattr(model, "generation_config", None) is not None:
    model.generation_config.forced_bos_token_id = bn_token_id



In [None]:
from tqdm.auto import tqdm
# Register tqdm with pandas so that Series/DataFrame gain `progress_apply`
tqdm.pandas()

In [None]:
# Prefer the GPU when one is available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Summarize every text of the held-out test split (with a progress bar)
test_dataset["generated_summary"] = test_dataset["text"].progress_apply(
    lambda source_text: generate_summary(source_text, model, tokenizer, device)
)

# Persist the test split together with the generated summaries
test_dataset.to_excel(
    "/content/drive/My Drive/Thesis_Dataset/test_data_with_summaries_mBART_3epoch.xlsx",
    index=False,
)

# Show source text, reference summary and generated summary side by side
print(test_dataset.loc[:, ["text", "summary", "generated_summary"]])





  0%|          | 0/1001 [00:00<?, ?it/s]

                                                   text  \
2128  তাইলে যদি এই টোটাল দৈর্ঘ্যটা আমাদের এল হয় তাই...   
2486  আর ঘূর্ণন গতির ক্ষেত্রে স্বাধীনতার মাত্রা দুইট...   
4332  টু ইকুয়াল টু এ কস থিটা এক্স আর টু টাকেই মাত্র...   
252   এখানে যদি এমন হয় আমরা চার কেজি নিয়ে নেই তো এ...   
6221  এক্স আর ওয়াই এর সহগ ইন্টারচেঞ্জ হবে মাঝখানের ...   
...                                                 ...   
7159  এটা আমি ইকালি দিয়ে না লিখি তাহলে কনফিউজড হবা।...   
9625  তাহলে এ সেট যদি তুমি বের করতে চাও তাহলে শুধুমা...   
2957  ব্যাপারটা এমন না। অধাতু অধাতু হইলে সেখানে সমযো...   
2483  আর তাইলে গামা যদি তুমি ক্যালকুলেশন করো সিপি হচ...   
8266  তাইলে এই পাথটা মনে করো পজিটিভ চার্জে চার্জিত। ...   

                                                summary  \
2128  দোলকের উপর বাতাসের ক্রিয়া এবং তার উচ্চতার পরি...   
2486  এই অংশে গ্যাসের ডিগ্রি অফ ফ্রিডম এবং তার সাথে ...   
4332  এখানে মূলত ভেক্টরের এক্স অক্ষের অংশ এবং তার গু...   
252   এখানে, যদি আমরা চার কেজি নিয়ে নিই, তাহলে রিমেন...

Output generation

In [None]:
# Mount Google Drive so that the /content/drive/... paths used below resolve
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import torch
import pandas as pd
from tqdm.auto import tqdm
# Register tqdm with pandas so that Series/DataFrame gain `progress_apply`
tqdm.pandas()
from transformers import MBartForConditionalGeneration, AutoTokenizer,MBart50Tokenizer,MBart50TokenizerFast,AutoModelForSeq2SeqLM

In [None]:
# ✅ Your Custom Summary Function
def generate_summary(text, model, tokenizer, device):
    """Summarize a single text with a seq2seq model using beam search.

    Args:
        text: Source text to summarize; it is truncated to 512 tokens.
        model: Model exposing ``to(device)`` and ``generate(...)``.
        tokenizer: Tokenizer used to encode ``text`` and to decode the
            generated ids.
        device: Device the model and the encoded inputs are moved to.

    Returns:
        The decoded summary with special tokens removed.
    """
    model.to(device)

    # A single text needs no padding, so only truncate. Padding every input to
    # 512 tokens made the encoder process pad positions for nothing.
    inputs = tokenizer(text, max_length=512, truncation=True, return_tensors="pt").to(device)

    # Generate summary
    summary_ids = model.generate(
        input_ids=inputs["input_ids"],
        # Hand over the tokenizer's mask explicitly instead of relying on
        # generate() to infer it from pad tokens.
        attention_mask=inputs["attention_mask"],
        max_length=200,
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=True,
    )

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


In [None]:
# Save tokenizer using original pretrained tokenizer to preserve language codes
base_model = "facebook/mbart-large-50"
tokenizer = MBart50TokenizerFast.from_pretrained(base_model)
tokenizer.save_pretrained("/content/drive/My Drive/Thesis_Dataset/fine_tuned_mBART_tokenizer_fixed")

# Define saved paths
model_path = "/content/drive/My Drive/Thesis_Dataset/fine_tuned_mBART"
tokenizer_path = "/content/drive/My Drive/Thesis_Dataset/fine_tuned_mBART_tokenizer_fixed"

# Load tokenizer and model
tokenizer = MBart50TokenizerFast.from_pretrained(tokenizer_path)
model = MBartForConditionalGeneration.from_pretrained(model_path)

# Set source and target languages (Bengali in, Bengali out)
tokenizer.src_lang = "bn_XX"
tokenizer.tgt_lang = "bn_XX"
model.config.src_lang = "bn_XX"
model.config.tgt_lang = "bn_XX"

# Force the decoder to start with the Bengali language token.
bn_token_id = tokenizer.convert_tokens_to_ids("bn_XX")
model.config.forced_bos_token_id = bn_token_id
# generate() takes its defaults from `model.generation_config`; a value set only
# on `model.config` after loading can be ignored when the checkpoint ships its
# own generation config, so mirror it there (guarded for older transformers).
if getattr(model, "generation_config", None) is not None:
    model.generation_config.forced_bos_token_id = bn_token_id

# Set device
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load transcribed text from Excel
input_excel = "/content/drive/MyDrive/Thesis_Transcription/Text_Output/chunk_transcripts.xlsx"
df = pd.read_excel(input_excel)

# Apply your summary function to each row.
# Empty cells are read as NaN and str(NaN) is the literal "nan", which used to
# be fed to the model; such rows (and whitespace-only ones) now get no summary.
print("📄 Generating summaries for all transcribed chunks...")
df['Summary'] = df['Text'].progress_apply(
    lambda x: generate_summary(str(x), model, tokenizer, device)
    if pd.notna(x) and str(x).strip()
    else None
)

# Save to new Excel
output_excel_path = "/content/drive/MyDrive/Thesis_Transcription/Text_Output/text_with_summaries_mBART.xlsx"
df.to_excel(output_excel_path, index=False)

print(f"✅ Excel with summaries saved: {output_excel_path}")

# Save all summaries as one merged text (one summary per line; rows without a
# summary are skipped by dropna)
merged_summary_text = "\n".join(df['Summary'].dropna().astype(str).tolist())
summary_txt_path = "/content/drive/MyDrive/Thesis_Transcription/Text_Output/merged_summary_mBART_without_regex.txt"
with open(summary_txt_path, "w", encoding="utf-8") as f:
    f.write(merged_summary_text)

print(f"📝 Merged summary saved at: {summary_txt_path}")





The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/531 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

📄 Generating summaries for all transcribed chunks...


  0%|          | 0/47 [00:00<?, ?it/s]



✅ Excel with summaries saved: /content/drive/MyDrive/Thesis_Transcription/Text_Output/text_with_summaries_mBART.xlsx
📝 Merged summary saved at: /content/drive/MyDrive/Thesis_Transcription/Text_Output/merged_summary_mBART_without_regex.txt


In [11]:
# Mount Google Drive so that the /content/drive/... paths used below resolve
from google.colab import drive
drive.mount('/content/drive')

import re

# 1. Read the merged summary text back from Drive
summary_txt_path = "/content/drive/MyDrive/Thesis_Transcription/Text_Output/merged_summary_mBART_without_regex.txt"

with open(summary_txt_path, mode="r", encoding="utf-8") as summary_file:
    merged_summary_text = summary_file.read()

# 2. Define a dictionary of regex patterns and replacements
regex_patterns = {
    r"টু দি (পাওয়ার|পাওয়ার)|টু দা (পাওয়ার|পাওয়ার)": "^",     # Replace "টু দি পাওয়ার" with "^"
    r"সমান সমান|ইকুয়ালস টু|ইকুয়ালস টু|ইকুয়াল টু|ইকুয়াল টু": "=",

                 # Replace "দশ" and "টেন" with "10"
    r"এগারো|এলেভেন": "11",      # Replace "এগারো" and "এলেভেন" with "11"
    r"বারো|টুয়েলভ|টুয়েলভ": "12",        # Replace "বারো" and "টুয়েলভ" with "12"
    r"তেরো|থার্টিন": "13",      # Replace "তেরো" and "থার্টিন" with "13"
    r"চোদ্দো|ফোর্টিন": "14",    # Replace "চোদ্দো" and "ফোর্টিন" with "14"
    r"পনেরো|ফিফটিন": "15",      # Replace "পনেরো" and "ফিফটিন" with "15"
    r"ষোলো|সিক্সটিন": "16",     # Replace "ষোলো" and "সিক্সটিন" with "16"
    r"সতেরো|সেভেন্টিন": "17",   # Replace "সতেরো" and "সেভেন্টিন" with "17"
    r"আঠারো|এইটিন": "18",       # Replace "আঠারো" and "এইটিন" with "18"
    r"ঊনিশ|নাইনটিন": "19",       # Replace "ঊনিশ" and "নিনটিন" with "19"

    r"একুশ|(টুয়েন্টি|টুয়েন্টি) (ওয়ান|ওয়ান)": "21",# Replace "একুশ" and "টুয়েন্টি ওয়ান" with "21"
    r"বাইশ|(টুয়েন্টি|টুয়েন্টি) টু": "22",  # Replace "বাইশ" and "টুয়েন্টি টু" with "22"
    r"তেইশ|(টুয়েন্টি|টুয়েন্টি) থ্রি": "23",# Replace "তেইশ" and "টুয়েন্টি থ্রি" with "23"
    r"চব্বিশ|(টুয়েন্টি|টুয়েন্টি) ফোর": "24",# Replace "চব্বিশ" and "টুয়েন্টি ফোর" with "24"
    r"পঁচিশ|(টুয়েন্টি|টুয়েন্টি) ফাইভ": "25",# Replace "পঁইত্রিশ" and "থার্টি ফাইভ" with "25"
    r"ছাব্বিশ|(টুয়েন্টি|টুয়েন্টি) সিক্স": "26",# Replace "ছত্রিশ" and "থার্টি সিক্স" with "26"
    r"সাতাশ|(টুয়েন্টি|টুয়েন্টি) সেভেন": "27",# Replace "সাতত্রিশ" and "থার্টি সেভেন" with "27"
    r"আটাশ|(টুয়েন্টি|টুয়েন্টি) এইট": "28",# Replace "আটত্রিশ" and "থার্টি এইট" with "28"
    r"ঊনত্রিশ|(টুয়েন্টি|টুয়েন্টি) নাইন": "29",# Replace "ঊনচল্লিশ" and "থার্টি নাইন" with "29"

    r"একত্রিশ|থার্টি (ওয়ান|ওয়ান)": "31",# Replace "একত্রিশ" and "থার্টি ওয়ান" with "31"
    r"বত্রিশ|থার্টি টু": "32",   # Replace "বত্রিশ" and "থার্টি টু" with "32"
    r"তেত্রিশ|থার্টি থ্রি": "33",# Replace "তেত্রিশ" and "থার্টি থ্রি" with "33"
    r"চৌত্রিশ|থার্টি ফোর": "34",# Replace "চত্রিশ" and "থার্টি ফোর" with "34"
    r"পঁইয়ত্রিশ|পঁইয়ত্রিশ|থার্টি ফাইভ": "35",# Replace "পঁইত্রিশ" and "থার্টি ফাইভ" with "35"
    r"ছত্রিশ|থার্টি সিক্স": "36",# Replace "ছত্রিশ" and "থার্টি সিক্স" with "36"
    r"সাইত্রিশ|থার্টি সেভেন": "37",# Replace "সাতত্রিশ" and "থার্টি সেভেন" with "37"
    r"আটত্রিশ|থার্টি এইট": "38",# Replace "আটত্রিশ" and "থার্টি এইট" with "38"
    r"ঊনচল্লিশ|থার্টি নাইন": "39",# Replace "ঊনপঁইত্রিশ" and "থার্টি নাইন" with "39"

    r"একচল্লিশ|ফরটি (ওয়ান|ওয়ান)": "41",# Replace "একতল্লিশ" and "ফরটি ওয়ান" with "41"
    r"বিয়াল্লিশ|বিয়াল্লিশ|ফরটি টু": "42",# Replace "বিয়াল্লিশ" and "ফরটি টু" with "42"
    r"তেতাল্লিশ|ফরটি থ্রি": "43",# Replace "তেতাল্লিশ" and "ফরটি থ্রি" with "43"
    r"চুয়াল্লিশ|চুয়াল্লিশ|ফরটি ফোর": "44",# Replace "চুয়াল্লিশ" and "ফরটি ফোর" with "44"
    r"পঁয়তাল্লিশ|পঁয়তাল্লিশ|ফরটি ফাইভ": "45",# Replace "পঁইত্রাল্লিশ" and "ফরটি ফাইভ" with "45"
    r"ছেচল্লিশ|ফরটি সিক্স": "46",     # Replace "ছেষট্টি" and "ফিফটি" with "46"
    r"সাতচল্লিশ|ফরটি সেভেন": "47",# Replace "সাতষট্টি" and "ফিফটি ওয়ান" with "47"
    r"আটচল্লিশ|ফরটি এইট": "48",  # Replace "আটষট্টি" and "ফিফটি টু" with "48"
    r"ঊনপঞ্চাশ|ফরটি নাইন": "49",# Replace "ঊননব্বই" and "ফিফটি থ্রি" with "49"

    r"একান্ন|ফিফটি (ওয়ান|ওয়ান)": "51", # Replace "একান্ন" and "ফিফটি ফাইভ" with "51"
    r"বায়ান্ন|ফিফটি টু": "52",# Replace "বায়ান্ন" and "ফিফটি সিক্স" with "52"
    r"তিপ্পান্ন|ফিফটি থ্রি": "53",# Replace "তিপ্পান্ন" and "ফিফটি সেভেন" with "53"
    r"চুয়ান্ন|চুয়ান্ন|ফিফটি ফোর": "54",# Replace "চুয়ান্ন" and "ফিফটি এইট" with "54"
    r"পঞ্চান্ন|ফিফটি ফাইভ": "55",# Replace "পঁইপঁচান্ন" and "ফিফটি নাইন" with "55"
    r"ছাপ্পান্ন|ফিফটি সিক্স": "56",
    r"সাতান্ন|ফিফটি সেভেন": "57",
    r"এটান্ন|ফিফটি এইট": "58",
    r"ঊনষাট|ফিফটি নাইন": "59",

    r"একষট্টি|সিক্সটি (ওয়ান|ওয়ান)": "61",    # Replace "একষট্টি" and "সেভেনটি" with "61"
    r"বাষট্টি|সিক্সটি টু": "62",# Replace "বাষট্টি" and "সেভেনটি ওয়ান" with "62"
    r"তেষট্টি|সিক্সটি থ্রি": "63",
    r"চৌষট্টি|সিক্সটি ফোর": "64",
    r"পঁইষট্টি|সিক্সটি ফাইভ": "65",
    r"ছেষট্টি|সিক্সটি সিক্স": "66",
    r"সাতষট্টি|সিক্সটি সেভেন": "67",
    r"আটষট্টি|সিক্সটি এইট": "68",
    r"ঊনসত্তর|সিক্সটি নাইন": "69",

    r"একাত্তর|সেভেন্টি (ওয়ান|ওয়ান)": "71",
    r"বাহাত্তর|সেভেন্টি টু।বায়াত্তর|বিয়াত্তর": "72",
    r"তিয়াত্তর|তিয়াত্তর|সেভেন্টি থ্রি": "73",
    r"চুয়াত্তর|চুয়াত্তর|সেভেন্টি ফোর": "74",
    r"পঁচাত্তর|সেভেন্টি ফাইভ": "75",
    r"ছিয়াত্তর|ছিয়াত্তর|সেভেন্টি সিক্স": "76",
    r"সাতাত্তর|সেভেন্টি সেভেন": "77",
    r"আটাত্তর|সেভেন্টি এইট": "78",
    r"ঊনআশি|সেভেন্টি নাইন": "79",

    r"একাশি|এইটি (ওয়ান|ওয়ান)": "81",
    r"বিরাশি|এইটি টু": "82",
    r"তিরাশি|এইটি থ্রি": "83",
    r"চুরাশি|এইটি ফোর": "84",
    r"পঁচাশি|এইটি ফাইভ": "85",
    r"ছিয়াশি|ছিয়াশি|এইটি সিক্স": "86",
    r"সাতাশি|এইটি সেভেন": "87",
    r"আটাশি|এইটি এইট": "88",
    r"ঊননব্বই|এইটি নাইন": "89",

    r"একানব্বই|নাইনটি (ওয়ান|ওয়ান)": "91",
    r"বিরানব্বই|নাইনটি টু": "92",
    r"তিরানব্বই|নাইনটি থ্রি": "93",
    r"চুরানব্বই|নাইনটি ফোর": "94",
    r"পঁচানব্বই|নাইনটি ফাইভ": "95",
    r"ছিয়ানব্বই|ছিয়ানব্বই|নাইনটি সিক্স": "96",
    r"সাতানব্বই|নাইনটি সেভেন": "97",
    r"আটানব্বই|নাইনটি এইট": "98",
    r"নিরানব্বই|নাইনটি নাইন": "99",

             # Replace "বিশ" and "টুইন্টি" with "20"
    r"ত্রিশ|থার্টি": "30",      # Replace "ত্রিশ" and "থার্টি" with "30"
     r"চল্লিশ|ফরটি": "40",      # Replace "চল্লিশ" and "ফরটি" with "40"
     r"পঞ্চাশ|ফিফটি": "50",  # Replace "পঞ্চাশ" and "ফিফটি ফোর" with "50"
     r"ষাট|সিক্সটি": "60",        # Replace "ষাট" and "সিক্সটি" with "60"
    r"সত্তর|সেভেন্টি": "70",
     r"নব্বই|নাইনটি": "90",

     r"ওয়ান|ওয়ান": "1" ,      # Replace "ওয়ান", "এক", and "থ্রি" with "1"
     r" টু।": " 2।",
     r" টু\)": " 2)",
     r"দুই": "2",      # Replace "টু", "দুই", "two" with "2"
     r" টু ": " 2 ",
     r" টু,": " 2,",
     r"\(টু ": "(2 ",

    r" তিন | থ্রি ": " 3 ",            # Replace "তিন" and "থ্রি" with "3"
    r" চার | ফোর ": " 4 ",             # Replace "চার" and "ফোর" with "4"
    r" পাঁচ | ফাইভ ": " 5 ",           # Replace "পাঁচ" and "ফাইভ" with "5"
    r" ছয়| ছয় | সিক্স ": " 6 ",            # Replace "ছয়" and "সিক্স" with "6"
    r" সাত | সেভেন ": " 7 ",           # Replace "সাত" and "সেভেন" with "7"
    r" আট | এইট ": " 8 ",              # Replace "আট" and "এইট" with "8"
    r" নাইন ": " 9 ",             # Replace "নয়" and "নাইন" with "9"

    r" তিন,| থ্রি,": " 3,",
    r" তিন।| থ্রি।": " 3।",
    r" তিন\)| থ্রি\)": " 3)",
    r"^তিন |^থ্রি ": "3 ",
    r"\(তিন |\(থ্রি ": "(3 ",

    # 4
    r" চার,| ফোর,": " 4,",
    r" চার।| ফোর।": " 4।",
    r" চার\)| ফোর\)": " 4)",
    r"^চার |^ফোর ": "4 ",
    r"\(চার |\(ফোর ": "(4 ",

    # 5
    r" পাঁচ,| ফাইভ,": " 5,",
    r" পাঁচ।| ফাইভ।": " 5।",
    r" পাঁচ\)| ফাইভ\)": " 5)",
    r"^পাঁচ |^ফাইভ ": "5 ",
    r"\(পাঁচ |\(ফাইভ ": "(5 ",

    # 6
    r" (ছয়|ছয়),| সিক্স,": " 6,",
    r" (ছয়|ছয়)।| সিক্স।": " 6।",
    r" (ছয়|ছয়)\)| সিক্স\)": " 6)",
    r"^(ছয়|ছয়) |^সিক্স ": "6 ",
    r"\((ছয়|ছয়) |\(সিক্স ": "(6 ",

    # 7
    r" সাত,| সেভেন,": " 7,",
    r" সাত।| সেভেন।": " 7।",
    r" সাত\)| সেভেন\)": " 7)",
    r"^সাত |^সেভেন ": "7 ",
    r"\(সাত |\(সেভেন ": "(7 ",

    # 8
    r" আট,| এইট,": " 8,",
    r" আট।| এইট।": " 8।",
    r" আট\)| এইট\)": " 8)",
    r"^আট |^এইট ": "8 ",
    r"\(আট |\(এইট ": "(8 ",

    # 9
    r" নাইন,": " 9,",
    r" নাইন।": " 9।",
    r" নয়\)| নয়\)| নাইন\)": " 9)",
    r"^নয় |^নয় |^নাইন ": "9 ",
    r"\(নয় |\(নয় |\(নাইন ": "(9 ",

    # 10
   r" দশ,| টেন,": " 10,",
   r" দশ।| টেন।": " 10।",
   r" দশ\)| টেন\)": " 10)",
   r"^দশ |^টেন ": "10 ",
   r" দশ | টেন ": " 10 ",
   r"\(দশ |\(টেন ": "(10 ",

  # 20
    r"টুয়েন্টি|টুয়েন্টি": "20",
  r" বিশ,| টুইন্টি,": " 20,",
  r" বিশ।| টুইন্টি।": " 20।",
  r" বিশ\)| টুইন্টি\)": " 20)",
  r"^বিশ |^টুইন্টি ": "20 ",
  r" বিশ | টুইন্টি ": " 20 ",
  r"\(বিশ |\(টুইন্টি ": "(20 ",

    # 80
    r" আশি,| এইটি,": " 80,",
    r" আশি।| এইটি।": " 80।",
    r" আশি\)| এইটি\)": " 80)",
    r"^আশি |^এইটি ": "80 ",
    r" আশি | এইটি ": " 80 ",
    r"\(আশি |\(এইটি ": "(80 ",

    r"জিরো": "0",




    r"০": "0",
    r"১": "1",
    r"২": "2",
    r"৩": "3",
    r"৪": "4",
    r"৫": "5",
    r"৬": "6",
    r"৭": "7",
    r"৮": "8",
    r"৯": "9",

    r" এ ": " a ",                 # Replace "এ" with "a"
    r" বি ": " b ",                # Replace "বি" with "b"
    r" সি ": " c ",                # Replace "সি" with "c"
    r" ডি ": " d ",                # Replace "ডি" with "d"
    r" ই ": " e ",                 # Replace "ই" with "e"
    r" এফ ": " f ",                # Replace "এফ" with "f"
    r" জি ": " g ",                # Replace "জি" with "g"
    r" এইচ ": " h ",               # Replace "এইচ" with "h"
    r" আই ": " i ",                # Replace "আই" with "i"
    r" জে ": " j ",                # Replace "জে" with "j"
    r" কে ": " k ",                # Replace "কে" with "k"
    r" এল ": " l ",                # Replace "এল" with "l"
    r" এম ": " m ",                # Replace "এম" with "m"
    r" এন ": " n ",                # Replace "এন" with "n"
   # r"ও": "o",                 # Replace "ও" with "o"
    r" পি ": " p ",                # Replace "পি" with "p"
    r" কিউ ": " q ",               # Replace "কিউ" with "q"
    #r"আর": "r",                # Replace "আর" with "r"
    r" এস ": " s ",                # Replace "এস" with "s"
    r" টি ": " t ",                # Replace "টি" with "t"
    r" ইউ ": " u ",                # Replace "ইউ" with "u"
    r" ভি ": " v ",                # Replace "ভি" with "v"
    r" (ডব্লিউ।ডাবলিউ) ": " w ",            # Replace "ডব্লিউ" with "w"
    r" এক্স ": " x ",              # Replace "এক্স" with "x"
    r" ওয়াই | ওয়াই ": "y",             # Replace "ওয়াই" with "y"
    r" জেড ": " z ",               # Replace "জেড" with "z"


    #r"^এ ": "a ",                 # Replace "এ" with "a"
    r"^বি ": "b ",                # Replace "বি" with "b"
    r"^সি ": "c ",                # Replace "সি" with "c"
    r"^ডি ": "d ",                # Replace "ডি" with "d"
    r"^ই ": "e ",                 # Replace "ই" with "e"
    r"^এফ ": "f ",                # Replace "এফ" with "f"
    r"^জি ": "g ",                # Replace "জি" with "g"
    r"^এইচ ": "h ",               # Replace "এইচ" with "h"
    r"^আই ": "i ",                # Replace "আই" with "i"
    r"^জে ": "j ",                # Replace "জে" with "j"
    r"^কে ": "k ",                # Replace "কে" with "k"
    r"^এল ": "l ",                # Replace "এল" with "l"
    r"^এম ": "m ",                # Replace "এম" with "m"
    r"^এন ": "n ",                # Replace "এন" with "n"
   # r"ও": "o",                 # Replace "ও" with "o"
    r"^পি ": "p ",                # Replace "পি" with "p"
    r"^কিউ ": "q ",               # Replace "কিউ" with "q"
    #r"আর": "r",                # Replace "আর" with "r"
    r"^এস ": "s ",                # Replace "এস" with "s"
    r"^টি ": "t ",                # Replace "টি" with "t"
    r"^ইউ ": "u ",                # Replace "ইউ" with "u"
    r"^ভি ": "v ",                # Replace "ভি" with "v"
    r"^(ডব্লিউ।ডাবলিউ) ": "w ",            # Replace "ডব্লিউ" with "w"
    r"^এক্স ": "x ",              # Replace "এক্স" with "x"
    r"^ওয়াই |^ওয়াই ": "y ",              # Replace "ওয়াই" with "y"
    r"^জেড ": "z ",               # Replace "জেড" with "z"

    r" এ,": " a,",                 # Replace "এ" with "a"
    r" বি,": " b,",                # Replace "বি" with "b"
    r" সি,": " c,",                # Replace "সি" with "c"
    r" ডি,": " d,",                # Replace "ডি" with "d"
    r" ই,": " e,",                 # Replace "ই" with "e"
    r" এফ,": " f,",                # Replace "ফি" with "f"
    r" জি,": " g,",                # Replace "গি" with "g"
    r" এইচ,": " h,",               # Replace "এইচ" with "h"
    r" আই,": " i,",                # Replace "আই" with "i"
    r" জে,": " j,",                # Replace "জে" with "j"
    r" কে,": " k,",                # Replace "কে" with "k"
    r" এল,": " l,",                # Replace "এল" with "l"
    r" এম,": " m,",                # Replace "এম" with "m"
    r" এন,": " n,",                # Replace "এন" with "n"
    #r" ও,": " o,",                 # Replace "ও" with "o"
    r" পি,": " p,",                # Replace "পি" with "p"
    r" কিউ,": " q,",               # Replace "কিউ" with "q"
    #r" আর,": " r,",                # Replace "আর" with "r"
    r" এস,": " s,",                # Replace "এস" with "s"
    r" টি,": " t,",                # Replace "টি" with "t"
    r" ইউ,": " u,",                # Replace "ইউ" with "u"
    r" ভি,": " v,",                # Replace "ভি" with "v"
    r" (ডব্লিউ।ডাবলিউ),": " w,",            # Replace "ডব্লিউ" with "w"
    r" এক্স,": " x,",              # Replace "এক্স" with "x"
    r" ওয়াই,| ওয়াই,": " y,",              # Replace "ওয়াই" with "y"
    r" জেড,": " z,",               # Replace "জেড" with "z"

    r" এ।": " a।",                 # Replace "এ" with "a"
    r" বি।": " b।",                # Replace "বি" with "b"
    r" সি।": " c।",                # Replace "সি" with "c"
    r" ডি।": " d।",
    r" ই।": " e।",
    r" এফ।": " f।",
    r" জি।": " g।",
    r" এইচ।": " h।",
    r" আই।": " i।",
    r" জে।": " j।",
    r" কে।": " k।",
    r" এল।": " l।",
    r" এম।": " m।",
    r" এন।": " n।",
    r" ও।": " o।",
    r" পি।": " p।",
    r" কিউ।": " q।",
    r" আর।": " r।",
    r" এস।": " s।",
    r" টি।": " t।",
    r" ইউ।": " u।",
    r" ভি।": " v।",
    r" (ডব্লিউ।ডাবলিউ)।": " w।",
    r" এক্স।": " x।",
    r" ওয়াই।| ওয়াই।": " y।",
    r" জেড।": " z।",

    r" এ\)": " a)",                 # Replace "এ" with "a"
    r" বি\)": " b)",                # Replace "বি" with "b"
    r" সি\)": " c)",                # Replace "সি" with "c"
    r" ডি\)": " d)",
    r" ই\)": " e)",
    r" এফ\)": " f)",
    r" জি\)": " g)",
    r" এইচ\)": " h)",
    r" আই\)": " i)",
    r" জে\)": " j)",
    r" কে\)": " k)",
    r" এল\)": " l)",
    r" এম\)": " m)",
    r" এন\)": " n)",
    r" ও\)": " o)",
    r" পি\)": " p)",
    r" কিউ\)": " q)",
    r" আর\)": " r)",
    r" এস\)": " s)",
    r" টি\)": " t)",
    r" ইউ\)": " u)",
    r" ভি\)": " v)",
    r" (ডব্লিউ।ডাবলিউ)\)": " w)",
    r" এক্স\)": " x)",
    r" ওয়াই\)| ওয়াই\)": " y)",
    r" জেড\)": " z)",


    r"\(এ ": "\(a ",
    r"\(বি ": "\(b ",
r"\(সি ": "\(c ",
r"\(ডি ": "\(d ",
r"\(ই ": "\(e ",
r"\(এফ ": "\(f ",
r"\(জি ": "\(g ",
r"\(এইচ ": "\(h ",
r"\(আই ": "\(i ",
r"\(জে ": "\(j ",
r"\(কে ": "\(k ",
r"\(এল ": "\(l ",
r"\(এম ": "\(m ",
r"\(এন ": "\(n ",
r"\(ও ": "\(o ",
r"\(পি ": "\(p ",
r"\(কিউ ": "\(q ",
r"\(আর ": "\(r ",
r"\(এস ": "\(s ",
r"\(টি ": "\(t ",
r"\(ইউ ": "\(u ",
r"\(ভি ": "\(v ",
r"\((ডব্লিউ।ডাবলিউ) ": "\(w ",
r"\(এক্স ": "\(x ",
r"\(ওয়াই |\(ওয়াই ": "\(y ",
r"\(জেড ": "\(z ",

    r"বিসিডি": "bcd",
    r"এসিডি": "acd",
    r"এবিসি": "abc",
    r"এসটিপি": "STP",
    r"এইচএসসি": "HSC",
    r"এসএসসি": "SSC",
    r"পিপিএম": "ppm",
    r"পিপিবি": "ppb",
    r"এসপি": "sp",
    r"এবি": "ab",
    r"এডি": "ad",
    r"এসসি": "sc",
    r"এসএ": "sa",
    r"এসকিউ": "sq",
    r"এসআর": "sr",
    r"এসটি": "st",
    r"এসএম": "sm",
    r"এসএন": "sn",
    r"এসও": "so",
    r"এসডি": "sd",
    r"এসই": "se",
    r"এসি ": "ac ",
    r"বিসি": "bc",
    r"পিকিউ": "pq",
    r"পিআর": "pr",
    r"পিএস": "ps",
    r"পিএম": "pm",
    r"পিএইচ": "pH",
    r"পিএ": "pa",

    r"পিবি": "pb",
    r"পিজেড": "pz",
    r"এক্সওয়াই|এক্সওয়াই": "xy",
    r"এক্সও": "xo",
    r"এক্সজেড": "xz",
    r"ওয়াইজেড|ওয়াইজেড": "yz",
    r"ওএ": "oa",
    r"ওবি": "ob",
    r"ওসি": "oc",
    r"ওডি": "od",
    r"ওপি": "op",
    r"ওকিউ": "oq",
    r"ওআর": "or",
    r"ওএস": "os",
    r"ইউভি": "uv",
    r"আইআর": "IR",
    r"ইএ": "ea",
    r"ইবি": "eb",
    r"কেপি": "kp",
    r"কেসি": "kc",
    r"এমসিকিউ": "MCQ",
    r"সিকিউ": "CQ",
    r"ডিএন": "dn",
    r"ডিএস": "ds",
    r"ডিটি": "dt",
    r"ডিএক্স": "dx",
    r"ডিওয়াই|ডিওয়াই": "dy",
    r"ডিজেড": "dz",
    r"সিপি": "Cp",
    r"সিভি": "Cv",
    r"সিডি": "cd",
    r"সিসি": "cc",
    r"ডিডি": "dd",




    r"কসেক|কোসেক": "cosec",  # Replace "কসেক" or "কোসেক" with "cosec"
    r"লগ ": "log ",              # Replace "লগ" with "log"
    r"সাইন ": "sin ",            # Replace "সাইন" with "sin"
    r" কস । কজ ": " cos ",          # Replace "কজ" or "কস" with "cos"
    r"সেক ": "sec ",            # Replace "্সেক" with "sec"

    r"কট ": "cot ",              # Replace "কট" with "cot"
    r"ইনটু ": "X ",             # Replace "ইনটু" with "X"


    r"মিউ(?=\W|$)": "μ",               # "মিউ" → "μ"
    r"ডেলটা(?=\W|$)": "ẟ",             # "ডেলটা" → "ẟ"
    r"ল্যামডা(?=\W|$)": "λ",           # "ল্যামডা" → "λ"
    r"থিটা|থেটা(?=\W|$)": "θ",         # "থিটা" or "থেটা" → "θ"

    #r"পাই(?=\W|$)": "π",
    r"সিগমা(?=\W|$)": "σ",      # Sigma
    r" রো(?=\W|$)": " ρ",         # Rho
    r"ওমেগা(?=\W|$)": "ω",      # Omega
    r" ফাই(?=\W|$)": " φ",        # Phi
    r"আলফা(?=\W|$)": "α",        # আলফা → α
    r"বিটা|বেটা(?=\W|$)": "β",    # বিটা/বেটা → β
    r"গামা(?=\W|$)": "γ",         # গামা → γ


    r"ন্যানোমিটার(?=\W|$)": "nm",
    r"মিলিমিটার(?=\W|$)": "mm",
    r"সেন্টিমিটার(?=\W|$)": "cm",
    r" মিটার(?=\W|$)": " m",
    r"কিলোমিটার(?=\W|$)": "km",

    r"সেকেন্ড(?=\W|$)": "s",
    r"মিলিসেকেন্ড(?=\W|$)": "ms",
    r"ন্যানোসেকেন্ড(?=\W|$)": "ns",
    r"মাইক্রোসেকেন্ড(?=\W|$)": "µs",

    r"কেজি(?=\W|$)": "kg",             # "কেজি" → "kg"
    r" গ্রাম(?=\W|$)": " g",
    r"মিলিগ্রাম(?=\W|$)": "mg",

     r"মাইনাস(?=\W|$)": "-",
     r"প্লাস(?=\W|$)": "+",
     r"কমা": ",",

    r"কার্বন ডাই অক্সাইড(?=\W|$)| কার্বন ডাইঅক্সাইড(?=\W|$)": "CO₂",
    r"অক্সিজেন গ্যাস(?=\W|$)": "O₂",
    r"হাইড্রোজেন গ্যাস(?=\W|$)": "H₂",
    r"নাইট্রোজেন গ্যাস(?=\W|$)": "N₂",
    r"ওজোন(?=\W|$)": "O₃",
    r"অ্যামোনিয়া (?=\W|$)|অ্যামোনিয়া (?=\W|$)": "NH₃ ",
    r"মিথেন(?=\W|$)": "CH₄",
    r"ইথেন(?=\W|$)": "C₂H₆",
    r"ইথিন(?=\W|$)|(অ্যাসিটিলিন|অ্যাসিটিলিন)(?=\W|$)": "C₂H₂",
    r"ইথানল(?=\W|$)": "C₂H₅OH",
    r"গ্লুকোজ(?=\W|$)": "C₆H₁₂O₆",
    r"সালফার ডাই অক্সাইড(?=\W|$)": "SO₂",
    r"সালফার ট্রাই অক্সাইড(?=\W|$)": "SO₃",
    r"হাইড্রোক্লোরিক (অ্যাসিড|অ্যাসিড)(?=\W|$)|হাইড্রোক্লোরিক এসিড(?=\W|$)|এইচসিএল(?=\W|$)": "HCl",
    r"নাইট্রিক (অ্যাসিড|অ্যাসিড)(?=\W|$)|নাইট্রিক এসিড(?=\W|$)": "HNO₃",
    r"সালফিউরিক (অ্যাসিড|অ্যাসিড)(?=\W|$)|সালফিউরিক এসিড(?=\W|$)": "H₂SO₄",
    r"(ক্যালসিয়াম|ক্যালসিয়াম) কার্বনেট(?=\W|$)": "CaCO₃",
    r"(সোডিয়াম|সোডিয়াম) ক্লোরাইড(?=\W|$)": "NaCl",
    r"(সোডিয়াম|সোডিয়াম) বাইকার্বোনেট(?=\W|$)": "NaHCO₃",
    r"(সোডিয়াম|সোডিয়াম) কার্বোনেট(?=\W|$)": "Na₂CO₃",
    r"(পটাসিয়াম|পটাসিয়াম) পারম্যাঙ্গানেট(?=\W|$)": "KMnO₄",
    r"হাইড্রোজেন পার অক্সাইড(?=\W|$)": "H₂O₂",

    r"হাইড্রোজেন(?=\W|$)": "H",
    r"(হিলিয়াম|হিলিয়াম)(?=\W|$)": "He",
    r"(লিথিয়াম|লিথিয়াম)(?=\W|$)": "Li",
    r"(বেরিলিয়াম|বেরিলিয়াম)(?=\W|$)": "Be",
    r"বোরন(?=\W|$)": "B",
    r"কার্বন (?=\W|$)": "C ",
    r"নাইট্রোজেন(?=\W|$)": "N",
    r"অক্সিজেন(?=\W|$)": "O",
    r"ফ্লোরিন(?=\W|$)": "F",
    r"(নিয়ন|নিয়ন)(?=\W|$)": "Ne",
    r"(সোডিয়াম|সোডিয়াম)(?=\W|$)": "Na",
    r"(ম্যাগনেসিয়াম|ম্যাগনেসিয়াম)(?=\W|$)": "Mg",
    r"(অ্যালুমিনিয়াম|অ্যালুমিনিয়াম)(?=\W|$)": "Al",
    r"সিলিকন(?=\W|$)": "Si",
    r"ফসফরাস(?=\W|$)": "P",
    r"সালফার(?=\W|$)": "S",
    r"ক্লোরিন(?=\W|$)": "Cl",
    r"আর্গন(?=\W|$)": "Ar",
    r"(পটাসিয়াম|পটাসিয়াম)(?=\W|$)": "K",
    r"(ক্যালসিয়াম|ক্যালসিয়াম)(?=\W|$)": "Ca",
    r"লোহা(?=\W|$)": "Fe",
    r"কপার|তামা(?=\W|$)": "Cu",
    r"জিঙ্ক(?=\W|$)": "Zn",
    r"সিলভার|রূপা(?=\W|$)": "Ag",
    r"গোল্ড|সোনা(?=\W|$)": "Au",
    r"মার্কারি|পারদ(?=\W|$)": "Hg",
    r"লেড|সিসা(?=\W|$)": "Pb",
    r" টিন(?=\W|$)": " Sn",
    r"নিকেল(?=\W|$)": "Ni",
    r"(ক্রোমিয়াম|ক্রোমিয়াম)(?=\W|$)": "Cr",
    r"ম্যাঙ্গানিজ(?=\W|$)": "Mn",
    r"কোবাল্ট(?=\W|$)": "Co",
    r"প্লাটিনাম(?=\W|$)": "Pt",
    r"(ইউরেনিয়াম|ইউরেনিয়াম)(?=\W|$)": "U",
    r"(আয়োডিন|আয়োডিন)(?=\W|$)": "I",
    r"ব্রোমিন(?=\W|$)": "Br",

    r"ফিজিক্স(?=\W|$)": "Physics",
    r"(রসায়ন|রসায়ন)(?=\W|$)|কেমিস্ট্রি(?=\W|$)": "Chemistry",
    r"জীববিজ্ঞান(?=\W|$)|(বায়োলজি|বায়োলজি)(?=\W|$)": "Biology",
    r"উচ্চতর গণিত(?=\W|$)|(হায়ার|হায়ার) ম্যাথ(?=\W|$)|(হায়ার|হায়ার) ম্যাথমেটিক্স(?=\W|$)": "Higher Mathematics",
    r"ম্যাথ(?=\W|$)": "Math",
    r"ইংরেজি(?=\W|$)": "English",
    r"বাংলা(?=\W|$)": "Bangla",
    r"আইসিটি(?=\W|$)|তথ্য ও যোগাযোগ প্রযুক্তি(?=\W|$)": "ICT",
    r"ভূগোল(?=\W|$)|জিওগ্রাফি(?=\W|$)": "Geography",
    r"সাইন্স(?=\W|$)": "Science",
    r"কম্পিউটার(?=\W|$)": "Computer",
    r"এই অংশে ": ""
}

# 3. Apply all regex patterns in dict (insertion) order.
# The merged file holds one summary per line, so re.MULTILINE is needed for the
# "^..." patterns to match at the start of every line and not only at the very
# beginning of the text.
for pattern, replacement in regex_patterns.items():
    merged_summary_text = re.sub(pattern, replacement, merged_summary_text, flags=re.MULTILINE)

# Second pass: rules that depend on the Latin/Greek symbols produced by the
# first pass (powers, roots, subscripts and trig functions).
regex_pat = {
    r"([a-zA-Z₀₁₂₃₄₅₆₇₈₉+-])ে": r"\1এ",
    r"([a-z0-9λθπσρωφαβγ]) (স্কয়ার|স্কয়ার|স্কোয়ার|স্কোয়ার)": r"\1²",
    r"([a-z0-9λθπσρωφαβγ]) কিউব": r"\1³",
    # "|" is the alternation; the danda "।" typed here before made the group
    # match only the literal text "রুট ওভার।রুট".
    r"(রুট ওভার|রুট) ([a-z0-9λθπσρωφαβγ])": r"√\2",
    r"([a-zλθπσρωφαβγ]) নট": r"\1₀",
    r"টান ([a-zλθπσρωφαβγ])": r"tan \1",
    r"(কস|কজ) ([a-zλθπσρωφαβγ])": r"cos \2",
}

# Apply the second-pass patterns in order
for pattern, replacement in regex_pat.items():
    merged_summary_text = re.sub(pattern, replacement, merged_summary_text)

# 4. Write the normalized text to its own file on Drive
modified_summary_txt_path = "/content/drive/MyDrive/Thesis_Transcription/Text_Output/summary_mBART_regex.txt"

with open(modified_summary_txt_path, mode="w", encoding="utf-8") as out_file:
    out_file.write(merged_summary_text)

print(f"📝 Modified summary saved at: {modified_summary_txt_path}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
📝 Modified summary saved at: /content/drive/MyDrive/Thesis_Transcription/Text_Output/summary_mBART_regex.txt
