In [None]:
!pip install -q sentence-transformers torch tqdm

In [None]:
from transformers import pipeline

# Bangla NER. The checkpoint id names an XLM-RoBERTa-base model fine-tuned on
# WikiANN (not mBERT). With aggregation_strategy="simple" the cells below read
# "entity_group", "word" and "score" from every returned entity.
bn_ner = pipeline(
    "ner",
    model="Davlan/xlm-roberta-base-wikiann-ner",
    aggregation_strategy="simple"
)


# English NER. The checkpoint id names an XLM-RoBERTa-large model fine-tuned on
# CoNLL-2003 English (not BERT). The tokenizer is loaded from the same id.
en_ner = pipeline(
    "ner",
    model="xlm-roberta-large-finetuned-conll03-english",
    tokenizer="xlm-roberta-large-finetuned-conll03-english",
    aggregation_strategy="simple"
)


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/398 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cuda:0


config.json:   0%|          | 0.00/852 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-large-finetuned-conll03-english were not used when initializing XLMRobertaForTokenClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Device set to use cuda:0


In [None]:
from collections import defaultdict

def group_entities(entities):
    """Bucket aggregated NER predictions by their entity label.

    Args:
        entities: Iterable of dicts that each carry an ``"entity_group"``
            label and a ``"word"`` surface string.

    Returns:
        A plain ``dict`` mapping each label to the list of surface strings
        found for it, in input order. Labels whose only predictions were
        blank strings do not appear.
    """
    grouped = defaultdict(list)
    for ent in entities:
        text = ent["word"]
        # The aggregated pipeline occasionally emits an entity whose text is
        # empty or whitespace only; such entries carry no information.
        if not text or not text.strip():
            continue
        grouped[ent["entity_group"]].append(text)
    return dict(grouped)


In [None]:
import json
from tqdm import tqdm

def extract_ner(jsonl_path, language):
    """Run NER over the ``body`` field of every record in a JSONL corpus.

    Args:
        jsonl_path: Path to a UTF-8 file holding one JSON object per line.
        language: ``"bangla"`` selects ``bn_ner``; any other value selects
            ``en_ner``.

    Returns:
        dict mapping ``str(line_index)`` to the ``group_entities`` result for
        that line. Lines that are malformed, are not JSON objects, have no
        usable body, fail during inference, or yield no entities are omitted.

    Note:
        ``line_index`` is the zero-based position of the raw line in the file,
        so blank and malformed lines still consume an index. This differs from
        ``stream_jsonl_safe``, which numbers only the records it yields.
    """
    # Resolve the pipeline once, outside any try block, so a missing model
    # surfaces immediately instead of silently skipping every document.
    tagger = bn_ner if language == "bangla" else en_ner
    ner_results = {}
    failed = 0

    with open(jsonl_path, "r", encoding="utf-8") as f:
        for doc_id, line in enumerate(tqdm(f, desc=f"{language} NER")):
            try:
                doc = json.loads(line)
            except json.JSONDecodeError:
                continue
            if not isinstance(doc, dict):
                continue

            body = doc.get("body")
            text = body.strip() if isinstance(body, str) else ""
            if not text:
                continue

            try:
                entities = tagger(text)
            except Exception:
                # Best effort: very long or otherwise problematic texts are
                # skipped, but KeyboardInterrupt is no longer swallowed.
                failed += 1
                continue

            grouped = group_entities(entities)
            if grouped:
                ner_results[str(doc_id)] = grouped

    if failed:
        print(f"[WARN] {language} NER: skipped {failed} document(s) after inference errors")

    return ner_results


In [None]:
# ============================================================
# SANITY CHECK: NER OUTPUT VALIDATION (Bangla + English)
# ============================================================

def _print_ner_suite(title, sentences, tagger):
    """Print one banner-delimited section listing the entities found per sentence."""
    print("\n" + "=" * 70)
    print(title)
    print("=" * 70)

    for sentence in sentences:
        print(f"\nSentence: {sentence}")
        try:
            results = tagger(sentence)

            if results:
                print(f"Found {len(results)} entities:")
                for r in results:
                    print(
                        f"  - Entity: {r['word']:<25} "
                        f"Type: {r['entity_group']:<8} "
                        f"Confidence: {r['score']:.4f}"
                    )
            else:
                print("  No entities found.")

        except Exception as e:
            # A failing sentence is reported and the suite continues.
            print(f"  Error: {e}")


def sanity_check_ner():
    """Print the entities both NER pipelines find in a few fixed sentences.

    Runs ``bn_ner`` over six Bangla sentences and ``en_ner`` over six English
    sentences. Nothing is returned; all results go to stdout.
    """
    print("=" * 70)
    print("NAMED ENTITY RECOGNITION — SANITY CHECK (COLAB)")
    print("=" * 70)

    # -----------------------------
    # Test Sentences
    # -----------------------------
    bangla_sentences = [
        "শেখ হাসিনা বাংলাদেশের প্রধানমন্ত্রী ছিলেন।",
        "আমি ঢাকায় থাকি।",
        "রহিম সাহেব গ্রামীণ ব্যাংকে কাজ করেন।",
        "কাজী নজরুল ইসলাম আমাদের জাতীয় কবি।",
        "শাকিব আল হাসান ক্রিকেট খেলেন।",
        "বাংলাদেশ একটি সুন্দর দেশ।"
    ]

    english_sentences = [
        "Joe Biden is the president of USA.",
        "I live in New York City.",
        "Elon Musk is the CEO of Tesla and SpaceX.",
        "Google has its headquarters in Mountain View.",
        "Lionel Messi plays for Inter Miami.",
        "The United Nations was established in 1945."
    ]

    # The pipelines are wrapped in lambdas so the global lookup happens inside
    # the helper's try block, exactly where the original loops performed it.
    _print_ner_suite("BANGLA NER TESTS", bangla_sentences, lambda s: bn_ner(s))
    _print_ner_suite("ENGLISH NER TESTS", english_sentences, lambda s: en_ner(s))

    print("\n" + "=" * 70)
    print("NER SANITY CHECK COMPLETE")
    print("=" * 70)


# Run sanity check
sanity_check_ner()


NAMED ENTITY RECOGNITION — SANITY CHECK (COLAB)

BANGLA NER TESTS

Sentence: শেখ হাসিনা বাংলাদেশের প্রধানমন্ত্রী ছিলেন।
Found 2 entities:
  - Entity: শেখ হাসিনা                Type: PER      Confidence: 0.9987
  - Entity: বাংলাদেশের প্রধানমন্ত্রী  Type: ORG      Confidence: 0.6554

Sentence: আমি ঢাকায় থাকি।
Found 1 entities:
  - Entity: ঢাকায়                    Type: LOC      Confidence: 0.9992

Sentence: রহিম সাহেব গ্রামীণ ব্যাংকে কাজ করেন।
Found 3 entities:
  - Entity: রহিম সাহেব                Type: PER      Confidence: 0.9951
  - Entity:                           Type: ORG      Confidence: 0.9993
  - Entity: গ্রামীণ ব্যাংক            Type: ORG      Confidence: 0.9381

Sentence: কাজী নজরুল ইসলাম আমাদের জাতীয় কবি।
Found 1 entities:
  - Entity: কাজী নজরুল ইসলাম          Type: PER      Confidence: 0.9997

Sentence: শাকিব আল হাসান ক্রিকেট খেলেন।
Found 1 entities:
  - Entity: শাকিব আল হাসান            Type: PER      Confidence: 0.8925

Sentence: বাংলাদেশ একটি সুন্দর দেশ।
Found 1 entiti

In [None]:
# English NER: tag every record of the English corpus and persist the
# {line_index: {label: [words]}} mapping returned by extract_ner.
english_ner = extract_ner(
    jsonl_path="english_corpus.jsonl",
    language="english"
)

# ensure_ascii is left at its default here, so any non-ASCII character in an
# entity is written as a \uXXXX escape.
with open("english_named_entities.json", "w", encoding="utf-8") as f:
    json.dump(english_ner, f, indent=2)

# Counts only documents for which at least one entity was kept.
print("English NER documents:", len(english_ner))


# Bangla NER: same procedure for the Bangla corpus.
bangla_ner = extract_ner(
    jsonl_path="bangla_corpus.jsonl",
    language="bangla"
)

# ensure_ascii=False keeps the Bangla text readable in the output file.
with open("bangla_named_entities.json", "w", encoding="utf-8") as f:
    json.dump(bangla_ner, f, ensure_ascii=False, indent=2)

print("Bangla NER documents:", len(bangla_ner))


english NER: 3855it [06:26,  9.98it/s]


English NER documents: 3843


bangla NER: 5697it [04:06, 23.10it/s]


Bangla NER documents: 5665


In [None]:
# Prints an empty line; this cell has no other effect.
print()




# Query Processor

In [None]:
!pip -q install transformers sentence-transformers torch tqdm numpy scikit-learn


In [None]:
import json, numpy as np

def load_json(path):
    """Read the UTF-8 file at ``path`` and return its decoded JSON content."""
    with open(path, "r", encoding="utf-8") as handle:
        payload = json.load(handle)
    return payload

def stream_jsonl_safe(path):
    """
    Lazily yield the decoded records of a JSONL file, in file order.

    - whitespace-only lines are ignored silently
    - lines that are not valid JSON are reported with their 1-based line
      number and skipped
    """
    with open(path, "r", encoding="utf-8") as handle:
        lineno = 0
        for raw in handle:
            lineno += 1
            candidate = raw.strip()
            if candidate:
                try:
                    record = json.loads(candidate)
                except json.JSONDecodeError:
                    print(f"[WARN] Skipping malformed JSON in {path} at line {lineno}")
                else:
                    yield record

# Load doc_id lists (embedding order reference)
bn_doc_ids = load_json("bangla_doc_ids.json")
en_doc_ids = load_json("english_doc_ids.json")

# Load embeddings
bn_emb = np.load("bangla_embeddings.npy")   # shape: (N_bn, dim)
en_emb = np.load("english_embeddings.npy")  # shape: (N_en, dim)

print("BN embeddings:", bn_emb.shape, "EN embeddings:", en_emb.shape)

# Load corpora into dicts keyed by str(position among the records that parsed
# successfully). Malformed and blank lines do not consume a key.
bn_docs = {
    str(i): doc for i, doc in enumerate(stream_jsonl_safe("bangla_corpus.jsonl"))
}
en_docs = {
    str(i): doc for i, doc in enumerate(stream_jsonl_safe("english_corpus.jsonl"))
}

print("BN docs loaded:", len(bn_docs), "EN docs loaded:", len(en_docs))

# --- Alignment sanity check ---
assert len(bn_docs) >= bn_emb.shape[0], "Bangla docs < embeddings count!"
assert len(en_docs) >= en_emb.shape[0], "English docs < embeddings count!"


def _alignment_warnings(label, n_docs, n_ids, n_emb):
    """Return human-readable messages for every count that disagrees with the embedding rows."""
    problems = []
    if n_ids != n_emb:
        problems.append(f"{label}: {n_ids} doc ids but {n_emb} embedding rows")
    if n_docs != n_emb:
        problems.append(f"{label}: {n_docs} docs loaded but {n_emb} embedding rows")
    return problems


# ">=" alone lets an off-by-N corpus pass silently, so every count mismatch is
# reported and the OK message is printed only when all counts agree.
_alignment_issues = (
    _alignment_warnings("Bangla", len(bn_docs), len(bn_doc_ids), bn_emb.shape[0])
    + _alignment_warnings("English", len(en_docs), len(en_doc_ids), en_emb.shape[0])
)

if _alignment_issues:
    for _issue in _alignment_issues:
        print(f"[WARN] {_issue}")
    print("[WARN] Positional doc keys may not line up with embedding rows.")
else:
    print("✔ Corpus–embedding alignment looks OK.")


BN embeddings: (5694, 768) EN embeddings: (3855, 768)
[WARN] Skipping malformed JSON in bangla_corpus.jsonl at line 300
[WARN] Skipping malformed JSON in bangla_corpus.jsonl at line 383
BN docs loaded: 5695 EN docs loaded: 3855
✔ Corpus–embedding alignment looks OK.


In [None]:
from transformers import pipeline
from sentence_transformers import SentenceTransformer

# Multilingual NER used on the query side (QueryProcessor.extract_entities
# calls this `ner` pipeline).
# NOTE(review): the original comment claimed Bangla support and PER/ORG/LOC/MISC
# labels. As far as I recall, the model card for this checkpoint lists ten
# high-resource languages (Bangla not among them) and only PER/ORG/LOC labels —
# confirm against the model card before relying on Bangla entity quality.
ner = pipeline(
    "ner",
    model="Davlan/xlm-roberta-large-ner-hrl",
    aggregation_strategy="simple"
)

# LaBSE for query embedding (cross-lingual); used by embed_query.
labse = SentenceTransformer("sentence-transformers/LaBSE")

print("Models loaded.")


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Device set to use cuda:0


Models loaded.


In [None]:
!pip -q install deep-translator


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from deep_translator import GoogleTranslator

class SimpleTranslator:
    """Thin wrapper around ``GoogleTranslator`` that never raises to the caller."""

    def translate(self, text: str, src: str, tgt: str) -> str:
        """Translate ``text`` from ``src`` to ``tgt`` (codes such as 'en', 'bn').

        The input is returned unchanged when both language codes are equal,
        when the text is empty or whitespace only, or when the translation
        call raises; in the last case a warning is printed.
        """
        nothing_to_translate = src == tgt or not text or not text.strip()
        if nothing_to_translate:
            return text

        try:
            # deep-translator expects 'en', 'bn'
            engine = GoogleTranslator(source=src, target=tgt)
            return engine.translate(text)
        except Exception as e:
            print(f"[WARN] Translation failed ({src}->{tgt}): {e}")
            # fallback: hand the original text back
            return text

translator = SimpleTranslator()
print("Translator ready.")


Translator ready.


In [None]:
# Smoke test of the translator in both directions (en -> bn, then bn -> en).
# These calls go through GoogleTranslator; on failure the wrapper prints a
# warning and echoes the input.
print(translator.translate("A turbulent year for the premier seaport", "en", "bn"))
print(translator.translate("বিকালে প্রধান উপদেষ্টার সঙ্গে সাক্ষাৎ করবেন নাহিদ ইসলাম", "bn", "en"))


প্রিমিয়ার সমুদ্রবন্দরের জন্য একটি উত্তাল বছর
Nahid Islam will meet with the chief advisor in the afternoon


In [None]:
import re, unicodedata
from dataclasses import dataclass, field
from typing import List, Dict, Optional, Tuple

@dataclass
class ProcessedQuery:
    """Everything ``QueryProcessor.process`` derives from one raw query."""
    original: str                     # query exactly as passed in
    detected_language: str            # "bn" or "en", from detect_language
    normalized: str                   # tokens re-joined with single spaces
    tokens: List[str]                 # output of normalize
    translated: Optional[str] = None  # translation into the target language, if any
    translation_language: Optional[str] = None  # target language when `translated` is set
    expanded_terms: List[str] = field(default_factory=list)   # synonyms added by expand_query
    named_entities: List[Tuple[str, str]] = field(default_factory=list)  # (entity, translated entity)
    processing_steps: List[str] = field(default_factory=list)  # human-readable trace

class QueryProcessor:
    """Language detection, normalization, synonym expansion and translation for queries.

    Relies on two module-level objects defined in other cells: ``ner`` (entity
    extraction) and ``translator`` (text translation).
    """

    # Code point range treated as Bangla script by detect_language.
    BANGLA_RANGE = (0x0980, 0x09FF)

    EN_SYNS = {
        "election": ["vote", "voting", "poll"],
        "economy": ["economic", "financial", "market"],
        "health": ["medical", "hospital", "healthcare"],
        "climate": ["environment", "weather"],
        "cricket": ["match", "tournament"],
    }
    BN_SYNS = {
        "নির্বাচন": ["ভোট", "ব্যালট"],
        "অর্থনীতি": ["আর্থিক", "অর্থনৈতিক", "বাণিজ্য"],
        "স্বাস্থ্য": ["চিকিৎসা", "হাসপাতাল"],
        "আবহাওয়া": ["জলবায়ু", "বৃষ্টি"],
        "ক্রিকেট": ["ম্যাচ", "খেলা"],
    }

    def __init__(self, remove_stopwords=False, enable_expansion=True,
                 enable_translation=True, enable_ne_mapping=True):
        """Store the feature switches.

        ``remove_stopwords`` is only stored; no method of this class reads it.
        """
        self.remove_stopwords = remove_stopwords
        self.enable_expansion = enable_expansion
        self.enable_translation = enable_translation
        self.enable_ne_mapping = enable_ne_mapping

    def detect_language(self, text: str) -> str:
        """Return "bn" when more than 30% of the alphabetic characters are Bangla, else "en"."""
        low, high = self.BANGLA_RANGE
        bangla, alpha = 0, 0
        for ch in text:
            if ch.isalpha():
                alpha += 1
                if low <= ord(ch) <= high:
                    bangla += 1
        return "bn" if alpha and bangla / alpha > 0.3 else "en"

    def normalize(self, text: str, lang: str):
        """NFC-normalize, lowercase and tokenize ``text``.

        For "bn" only runs of Bangla-block characters are kept; for any other
        language only runs of ``[a-z0-9]``. Returns ``(joined, tokens)``.
        """
        text = unicodedata.normalize("NFC", text).lower().strip()
        text = " ".join(text.split())
        if lang == "bn":
            tokens = re.findall(r"[\u0980-\u09FF]+", text)
        else:
            tokens = re.findall(r"[a-z0-9]+", text)
        return " ".join(tokens), tokens

    def expand_query(self, tokens, lang):
        """Return synonyms of ``tokens`` that are not already present, without duplicates."""
        if not self.enable_expansion:
            return []
        syns = self.EN_SYNS if lang == "en" else self.BN_SYNS
        expanded = []
        for t in tokens:
            for s in syns.get(t, []):
                if s not in tokens and s not in expanded:
                    expanded.append(s)
        return expanded

    def extract_entities(self, text):
        """Return the distinct, non-blank entity strings that ``ner`` finds in ``text``.

        Order of first appearance is kept. Blank predictions and repeats are
        dropped so they are not sent to the translator.
        """
        seen = set()
        ents = []
        for e in ner(text):
            word = e["word"].strip()
            if word and word not in seen:
                seen.add(word)
                ents.append(word)
        return ents

    def map_named_entities(self, query, src, tgt):
        """Translate each entity of ``query``; keep pairs whose translation differs from the source."""
        if not self.enable_ne_mapping or src == tgt:
            return []
        mappings = []
        for ent in self.extract_entities(query):
            mapped = translator.translate(ent, src, tgt)
            if mapped and mapped.lower() != ent.lower():
                mappings.append((ent, mapped))
        return mappings

    def translate(self, text, src, tgt):
        """Return the translation of ``text``, or ``None`` when translation is disabled or src == tgt."""
        if not self.enable_translation or src == tgt:
            return None
        return translator.translate(text, src, tgt)

    def process(self, query, target_lang=None):
        """Run the full pipeline on ``query`` and return a ``ProcessedQuery``.

        Translation and entity mapping are attempted only when ``target_lang``
        is given and differs from the detected language.
        """
        steps = []
        src = self.detect_language(query)
        steps.append(f"Language detected: {src}")

        norm, tokens = self.normalize(query, src)
        steps.append(f"Normalized: '{norm}'")

        expanded = self.expand_query(tokens, src)
        if expanded:
            steps.append(f"Expanded with: {expanded}")

        translated, ne_map = None, []
        if target_lang and target_lang != src:
            translated = self.translate(query, src, target_lang)
            # Log the step only when a translation exists; otherwise the
            # trace would claim "Translated to X: 'None'".
            if translated is not None:
                steps.append(f"Translated to {target_lang}: '{translated}'")
            ne_map = self.map_named_entities(query, src, target_lang)
            if ne_map:
                steps.append(f"NE mappings: {ne_map}")

        return ProcessedQuery(
            original=query,
            detected_language=src,
            normalized=norm,
            tokens=tokens,
            translated=translated,
            translation_language=target_lang if translated else None,
            expanded_terms=expanded,
            named_entities=ne_map,
            processing_steps=steps,
        )

processor = QueryProcessor()
print("QueryProcessor ready.")


QueryProcessor ready.


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def embed_query(text):
    """Encode a single query string with LaBSE.

    The text is wrapped in a one-element list, encoded with
    ``normalize_embeddings=True`` and cast to float32.
    """
    encoded = labse.encode([text], normalize_embeddings=True)
    return encoded.astype(np.float32)

def search_embeddings(query_text, target_lang, topk=5):
    """Rank one corpus against a query by LaBSE cosine similarity.

    Args:
        query_text: Query string in any language; it is embedded as-is.
        target_lang: ``"bn"`` searches the Bangla corpus; any other value
            searches the English corpus.
        topk: Maximum number of hits to return.

    Returns:
        List of dicts with ``score``, ``doc_id``, ``title``, ``url`` and
        ``date``, best match first. Fields missing from the stored document,
        or a ``doc_id`` with no stored document, yield empty strings.
    """
    qv = embed_query(query_text)

    if target_lang == "bn":
        doc_mat, doc_ids, store = bn_emb, bn_doc_ids, bn_docs
    else:
        doc_mat, doc_ids, store = en_emb, en_doc_ids, en_docs

    # cosine_similarity L2-normalizes both operands itself, so the corpus
    # matrix is passed as stored instead of being re-normalized (and copied)
    # on every query.
    sims = cosine_similarity(qv, doc_mat)[0]
    top_idx = np.argsort(-sims)[:topk]

    results = []
    for i in top_idx:
        # Fall back to the row index when the id list is shorter than the matrix.
        did = str(doc_ids[i]) if i < len(doc_ids) else str(i)
        d = store.get(did, {})
        results.append({
            "score": float(sims[i]),
            "doc_id": did,
            "title": d.get("title", ""),
            "url": d.get("url", ""),
            "date": d.get("date", "")
        })
    return results


In [None]:
def demo(query, topk=5):
    """Print the query-processing trace and LaBSE retrieval results for ``query``.

    Shows the processing steps with and without a target language, the top
    hits in both corpora for the untranslated query, and, when a translation
    is available, the hits for the translated query in the other-language
    corpus.
    """
    banner = "=" * 90

    def show_steps(heading, processed):
        print(heading)
        for step in processed.processing_steps:
            print(" -", step)

    def show_hits(text, lang):
        for hit in search_embeddings(text, lang, topk):
            print(f"  [{hit['score']:.4f}] {hit['title'][:80]}")

    print(banner)
    print("QUERY:", query)
    print(banner)

    src = processor.detect_language(query)
    other = "en" if src != "en" else "bn"

    pq_src = processor.process(query)
    pq_other = processor.process(query, target_lang=other)

    show_steps("\n--- Processing (original) ---", pq_src)
    show_steps("\n--- Processing (translated / CLIR) ---", pq_other)

    print("\n--- LaBSE Retrieval (no translation) ---")
    print("\nEN corpus:")
    show_hits(query, "en")

    print("\nBN corpus:")
    show_hits(query, "bn")

    if pq_other.translated:
        print("\n--- Retrieval with translated query (baseline comparison) ---")
        show_hits(pq_other.translated, other)


In [None]:
# Embedding-retrieval demo for one English and two Bangla queries.
demo("coronavirus vaccine")
demo("বাংলাদেশ নির্বাচন ফলাফল")
demo("ঢাকা আবহাওয়া")


QUERY: coronavirus vaccine

--- Processing (original) ---
 - Language detected: en
 - Normalized: 'coronavirus vaccine'

--- Processing (translated / CLIR) ---
 - Language detected: en
 - Normalized: 'coronavirus vaccine'
 - Translated to bn: 'করোনাভাইরাস টিকা'

--- LaBSE Retrieval (no translation) ---

EN corpus:
  [0.2532] Man bitten by snakes over 200 times helps create breakthrough Antivenom
  [0.2418] Dengue must be treated as a year-round emergency
  [0.2160] Russia blames Ukraine for deadly New Year drone strike
  [0.2134] Russia says at least 20 killed in Ukrainian drone strike
  [0.2076] Parents in India devastated as children with thalassemia test HIV positive

BN corpus:
  [0.2882] এবার আন্দোলনকারীদের বিরুদ্ধে আন্দোলনে নেমেছেন চিকিৎসক-নার্সরা
  [0.2503] গ্রিনল্যান্ড নিয়ে ট্রাম্পের হুমকি দেয়া ‘পুরোপুরি ভুল’ কাজ : ব্রিটিশ প্রধানমন্ত্র
  [0.2202] ইউক্রেনে রাশিয়ার ক্ষেপণাস্ত্র হামলায় নিহত ৭
  [0.2110] সুইজার‍ল্যান্ডে নববর্ষ উদ্‌যাপনের সময় আগুন লাগল কীভাবে
  [0.2084] হিমায়িত পোল

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


In [None]:
# Prepare document texts
def _title_body_texts(docs):
    """Return ``(ids, texts)`` for every document whose title+body text is non-empty.

    A missing or ``None`` title/body counts as an empty string, so records
    with null fields no longer break the string concatenation.
    """
    ids, texts = [], []
    for doc_id, doc in docs.items():
        title = doc.get("title") or ""
        body = doc.get("body") or ""
        text = (title + " " + body).strip()
        if text:
            ids.append(doc_id)
            texts.append(text)
    return ids, texts


# Row k of each text list belongs to the doc id at position k of the id list.
bn_ids, bn_texts = _title_body_texts(bn_docs)
en_ids, en_texts = _title_body_texts(en_docs)

print("BN docs for TF-IDF:", len(bn_texts))
print("EN docs for TF-IDF:", len(en_texts))


BN docs for TF-IDF: 5695
EN docs for TF-IDF: 3855


In [None]:
# English TF-IDF: lowercased unigrams and bigrams, English stop words removed,
# vocabulary capped at 50,000 features.
tfidf_en = TfidfVectorizer(
    lowercase=True,
    max_features=50000,
    ngram_range=(1,2),
    stop_words="english"
)

# Rows of X_en follow the order of en_texts.
X_en = tfidf_en.fit_transform(en_texts)
print("English TF-IDF matrix:", X_en.shape)

# Bangla TF-IDF: tokens are runs of Bangla-block characters only, so Latin
# text and ASCII digits never enter the vocabulary. No stop-word list is used.
tfidf_bn = TfidfVectorizer(
    lowercase=False,      # Bangla has no case
    max_features=50000,
    ngram_range=(1,2),
    token_pattern=r"[\u0980-\u09FF]+"
)

# Rows of X_bn follow the order of bn_texts.
X_bn = tfidf_bn.fit_transform(bn_texts)
print("Bangla TF-IDF matrix:", X_bn.shape)


English TF-IDF matrix: (3855, 50000)
Bangla TF-IDF matrix: (5695, 50000)


In [None]:
def search_tfidf(query: str, lang: str, topk: int = 5):
    """Return the ``topk`` documents most similar to ``query`` under TF-IDF cosine similarity.

    ``lang == "en"`` uses the English vectorizer, matrix and documents; any
    other value uses the Bangla ones. Each hit is a dict with ``score``,
    ``doc_id``, ``title`` and ``url``, ordered best first.
    """
    if lang == "en":
        vec, X, ids, docs = tfidf_en, X_en, en_ids, en_docs
    else:
        vec, X, ids, docs = tfidf_bn, X_bn, bn_ids, bn_docs

    sims = cosine_similarity(vec.transform([query]), X)[0]
    ranked_rows = np.argsort(-sims)[:topk]

    hits = []
    for row in ranked_rows:
        did = ids[row]
        record = docs.get(did, {})
        hit = {
            "score": float(sims[row]),
            "doc_id": did,
            "title": record.get("title",""),
            "url": record.get("url",""),
        }
        hits.append(hit)
    return hits


In [None]:
def demo_tfidf(query, topk=5):
    """Print TF-IDF hits for ``query`` in its own language and, if translated, in the other one."""
    rule = "=" * 90
    print(rule)
    print("TF-IDF QUERY:", query)
    print(rule)

    src = processor.detect_language(query)
    other = "en" if src != "en" else "bn"

    pq = processor.process(query, target_lang=other)

    # (heading, query text, corpus language) for each result section.
    sections = [("\n--- TF-IDF (same language only) ---", query, src)]
    if pq.translated:
        sections.append(
            ("\n--- TF-IDF (translated query baseline) ---", pq.translated, other)
        )

    for heading, text, lang in sections:
        print(heading)
        for hit in search_tfidf(text, lang, topk):
            print(f"  [{hit['score']:.4f}] {hit['title'][:80]}")


In [None]:
# TF-IDF demo on the same three queries used for the embedding demo.
demo_tfidf("coronavirus vaccine")
demo_tfidf("বাংলাদেশ নির্বাচন ফলাফল")
demo_tfidf("ঢাকা আবহাওয়া")


TF-IDF QUERY: coronavirus vaccine

--- TF-IDF (same language only) ---
  [0.5412] Severe shortage of rabies vaccine in Kushtia
  [0.3703] Gopalganj faces rabies vaccine shortage
  [0.2257] Acute shortage of rabies vaccine in Ishwardi
  [0.0472] Khaleda Zia returns home after 10 days of treatment
  [0.0366] Patients with cold-related diseases increase

--- TF-IDF (translated query baseline) ---
  [0.2476] হজযাত্রীরা ৮০ কেন্দ্রে টিকা নেবেন, তারিখ জানানো হবে এসএমএসে
  [0.1960] জাতির সেফ এক্সিট হলে আমরা খুশি: উপদেষ্টা
  [0.1446] ময়মনসিংহে টাইফয়েড টিকা কর্মসূচির কর্মীদের মজুরি কম দেওয়ার অভিযোগ
  [0.0629] জন এফ কেনেডির ৩৫ বছর বয়সী নাতনি টাটিয়ানা শ্লসবার্গের মৃত্যু
  [0.0535] ময়মনসিংহে এনসিপির দুই নেতার ওপর হামলা, হাসপাতালে ভর্তি
TF-IDF QUERY: বাংলাদেশ নির্বাচন ফলাফল

--- TF-IDF (same language only) ---
  [0.2006] প্রাথমিকে সহকারী শিক্ষক নিয়োগের ফল প্রকাশ হতে পারে আজ
  [0.1634] এনসিপি ও বাসদ মার্কসবাদীকে নিবন্ধন দিয়ে প্রজ্ঞাপন জারি
  [0.1317] প্রাথমিকে শিক্ষক নিয়োগের ফলে উত্তীর্ণ ৬৯২৬৫, প্রা

In [None]:
!pip -q install rank_bm25

In [None]:
from rank_bm25 import BM25Okapi
import re


In [None]:
def tokenize_en(text):
    """Lowercase ``text`` and return its maximal runs of ``a-z`` / ``0-9`` characters."""
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r"[a-z0-9]+", lowered)]

def tokenize_bn(text):
    """Return the maximal runs of Bangla-block characters (U+0980–U+09FF) in ``text``.

    The text is NFC-normalized first, matching ``QueryProcessor.normalize``.
    Letters such as U+09DF have a second, canonically equivalent spelling
    (base letter + nukta); without normalization the two spellings produce
    different tokens and never match each other in the index.
    """
    import unicodedata  # local import keeps this cell runnable on its own

    return re.findall(r"[\u0980-\u09FF]+", unicodedata.normalize("NFC", text))


In [None]:
# Build BM25 corpora
def _tokenized_corpus(docs, tokenize):
    """Return ``(ids, token_lists)`` for every document whose title+body text is non-empty.

    A missing or ``None`` title/body counts as an empty string, so records
    with null fields no longer break the string concatenation.
    """
    ids, token_lists = [], []
    for doc_id, doc in docs.items():
        title = doc.get("title") or ""
        body = doc.get("body") or ""
        text = (title + " " + body).strip()
        if text:
            ids.append(doc_id)
            token_lists.append(tokenize(text))
    return ids, token_lists


# Position k of each id list names the document at row k of the matching
# corpus. These assignments rebind bn_ids / en_ids.
bn_ids, bn_corpus = _tokenized_corpus(bn_docs, tokenize_bn)
en_ids, en_corpus = _tokenized_corpus(en_docs, tokenize_en)

print("BM25 BN docs:", len(bn_corpus))
print("BM25 EN docs:", len(en_corpus))


BM25 BN docs: 5695
BM25 EN docs: 3855


In [None]:
# Build one BM25 index per language from the tokenized corpora. Row order
# follows bn_corpus / en_corpus, which were built in step with bn_ids / en_ids.
bm25_bn = BM25Okapi(bn_corpus)
bm25_en = BM25Okapi(en_corpus)

print("BM25 models ready.")


BM25 models ready.


In [None]:
import numpy as np

def search_bm25(query: str, lang: str, topk: int = 5):
    """Return the ``topk`` highest-scoring documents for ``query`` under BM25.

    ``lang == "en"`` uses the English tokenizer, index and documents; any
    other value uses the Bangla ones. Each hit is a dict with ``score``,
    ``doc_id``, ``title`` and ``url``, ordered best first.
    """
    use_english = lang == "en"
    tokens = tokenize_en(query) if use_english else tokenize_bn(query)
    bm25 = bm25_en if use_english else bm25_bn
    ids = en_ids if use_english else bn_ids
    docs = en_docs if use_english else bn_docs

    scores = bm25.get_scores(tokens)
    # Ascending argsort read backwards gives descending scores.
    best_rows = np.argsort(scores)[::-1][:topk]

    hits = []
    for row in best_rows:
        did = ids[row]
        record = docs.get(did, {})
        hits.append(dict(
            score=float(scores[row]),
            doc_id=did,
            title=record.get("title",""),
            url=record.get("url",""),
        ))
    return hits


In [None]:
def demo_bm25(query, topk=5):
    """Print BM25 hits for ``query`` in its own language and, if translated, in the other one."""

    def print_hits(text, lang):
        for hit in search_bm25(text, lang, topk):
            print(f"  [{hit['score']:.4f}] {hit['title'][:80]}")

    separator = "=" * 90
    print(separator)
    print("BM25 QUERY:", query)
    print(separator)

    src = processor.detect_language(query)
    other = "en" if src != "en" else "bn"

    pq = processor.process(query, target_lang=other)

    print("\n--- BM25 (same language only) ---")
    print_hits(query, src)

    translated = pq.translated
    if translated:
        print("\n--- BM25 (translated query baseline) ---")
        print_hits(translated, other)


In [None]:
# BM25 demo on the same three queries used for the embedding and TF-IDF demos.
demo_bm25("coronavirus vaccine")
demo_bm25("বাংলাদেশ নির্বাচন ফলাফল")
demo_bm25("ঢাকা আবহাওয়া")


BM25 QUERY: coronavirus vaccine

--- BM25 (same language only) ---
  [14.8442] Severe shortage of rabies vaccine in Kushtia
  [14.1212] Gopalganj faces rabies vaccine shortage
  [13.0819] Acute shortage of rabies vaccine in Ishwardi
  [7.8227] Patients with cold-related diseases increase
  [7.1834] Khaleda Zia returns home after 10 days of treatment

--- BM25 (translated query baseline) ---
  [13.1985] হজযাত্রীরা ৮০ কেন্দ্রে টিকা নেবেন, তারিখ জানানো হবে এসএমএসে
  [11.6340] জাতির সেফ এক্সিট হলে আমরা খুশি: উপদেষ্টা
  [10.9661] ময়মনসিংহে টাইফয়েড টিকা কর্মসূচির কর্মীদের মজুরি কম দেওয়ার অভিযোগ
  [9.1847] বিএনপির চেয়ারম্যান হিসেবে দায়িত্ব গ্রহণ করলেন তারেক রহমান
  [8.3428] জন এফ কেনেডির ৩৫ বছর বয়সী নাতনি টাটিয়ানা শ্লসবার্গের মৃত্যু
BM25 QUERY: বাংলাদেশ নির্বাচন ফলাফল

--- BM25 (same language only) ---
  [10.6201] ২৩ কেন্দ্রের ফলাফলে ভিপি পদে এগিয়ে রিয়াজুল, জিএস পদে আলীম
  [10.5960] ৩৩ কেন্দ্রের ফলাফলে ভিপি পদে রিয়াজুল, জিএস পদে আলীম এগিয়ে
  [10.2410] প্রাথমিকে সহকারী শিক্ষক নিয়োগের ফল প্রকা

In [None]:
import difflib


In [None]:
def char_ngrams(text, n=3):
    """Return the set of distinct length-``n`` substrings of the lowercased ``text``.

    The set is empty when the lowercased text is shorter than ``n``.
    """
    lowered = text.lower()
    window_count = len(lowered) - n + 1
    return {lowered[start:start + n] for start in range(window_count)}


In [None]:
def jaccard_ngram_similarity(a, b, n=3):
    """Jaccard overlap of the character ``n``-gram sets of ``a`` and ``b``.

    Returns 0.0 when either string yields no n-grams.
    """
    grams_a, grams_b = char_ngrams(a, n), char_ngrams(b, n)
    if not (grams_a and grams_b):
        return 0.0
    shared = grams_a.intersection(grams_b)
    combined = grams_a.union(grams_b)
    return len(shared) / len(combined)


In [None]:
def edit_similarity(a, b):
    """Case-insensitive ``difflib.SequenceMatcher`` ratio of ``a`` and ``b``."""
    left, right = a.lower(), b.lower()
    matcher = difflib.SequenceMatcher(None, left, right)
    return matcher.ratio()


In [None]:
def fuzzy_similarity(a, b):
    """Blend of edit similarity (weight 0.6) and character-trigram Jaccard (weight 0.4)."""
    edit_part = 0.6 * edit_similarity(a, b)
    ngram_part = 0.4 * jaccard_ngram_similarity(a, b)
    return edit_part + ngram_part


In [None]:
def search_fuzzy(query, lang, topk=5):
    """Rank documents by fuzzy similarity between ``query`` and their titles.

    ``lang == "en"`` searches the English documents; any other value searches
    the Bangla ones. Documents without a title, or scoring 0.2 or below, are
    ignored. Hits are dicts with ``score``, ``doc_id``, ``title`` and ``url``,
    ordered by descending ``(score, doc_id)``.
    """
    docs = en_docs if lang == "en" else bn_docs

    scored = []
    for did, doc in docs.items():
        title = doc.get("title", "")
        if title:
            similarity = fuzzy_similarity(query, title)
            if similarity > 0.2:  # threshold avoids garbage
                scored.append((similarity, did))

    ranked = sorted(scored, reverse=True)

    hits = []
    for similarity, did in ranked[:topk]:
        record = docs.get(did, {})
        hits.append(dict(
            score=similarity,
            doc_id=did,
            title=record.get("title",""),
            url=record.get("url",""),
        ))
    return hits


In [None]:
def demo_fuzzy(query, topk=5):
    """Print fuzzy title matches for ``query`` and, if translated, for its translation."""

    def print_matches(text, lang):
        for match in search_fuzzy(text, lang, topk):
            print(f"  [{match['score']:.3f}] {match['title'][:80]}")

    line = "=" * 90
    print(line)
    print("FUZZY QUERY:", query)
    print(line)

    src = processor.detect_language(query)
    other = "en" if src != "en" else "bn"

    print("\n--- Fuzzy match (same language) ---")
    print_matches(query, src)

    # The query is processed only after the same-language results are printed;
    # the translated text drives the cross-lingual fuzzy search.
    translated = processor.process(query, target_lang=other).translated
    if translated:
        print("\n--- Fuzzy match (translated query) ---")
        print_matches(translated, other)


In [None]:
# Fuzzy title-matching demo: English/Bangla query pairs plus one English-only query.
demo_fuzzy("Bangladesh election")
demo_fuzzy("বাংলাদেশ নির্বাচন")
demo_fuzzy("coronavirus vaccine")
demo_fuzzy("Dhaka weather")
demo_fuzzy("ঢাকা আবহাওয়া")


FUZZY QUERY: Bangladesh election

--- Fuzzy match (same language) ---
  [0.615] US welcomes Bangladesh election plan
  [0.525] Bangladesh U-19 sense win
  [0.493] Bangladesh welcomes China's stance on election
  [0.485] Bangladesh women lose to Pakistan
  [0.481] Bangladesh still believe in win

--- Fuzzy match (translated query) ---
  [0.438] নেপালে বাংলাদেশের দুই সিনেমা
  [0.433] ডানা মেলছে বাংলাদেশের নারী হকি
  [0.405] বাংলাদেশের প্রতি কৃতজ্ঞতা জানালো ইরান
  [0.381] বছর গেল নির্বাচনের অপেক্ষায়
  [0.365] ‘বাংলাদেশী’ সন্দেহে ভারতে আরো ৫ শ্রমিককে নির্যাতন
FUZZY QUERY: বাংলাদেশ নির্বাচন

--- Fuzzy match (same language) ---
  [0.385] বছর গেল নির্বাচনের অপেক্ষায়
  [0.368] পাবনা-১ ও ২ আসনে নির্বাচন স্থগিত
  [0.362] নেপালে বাংলাদেশের দুই সিনেমা
  [0.350] বাংলাদেশে নির্বাচনে বাম গণতান্ত্রিক শক্তিকে বিজয়ী করার আহ্বান
  [0.346] ডানা মেলছে বাংলাদেশের নারী হকি

--- Fuzzy match (translated query) ---
  [0.615] US welcomes Bangladesh election plan
  [0.525] Bangladesh U-19 sense win
  [0.493] Bang

In [None]:
def minmax_normalize(scores):
    """Rescale ``scores`` linearly so the minimum maps to 0.0 and the maximum to 1.0.

    An empty input is returned as-is. When all values are (almost) equal,
    i.e. the range is below 1e-9, every score becomes 0.0.
    """
    if not scores:
        return scores
    lowest, highest = min(scores), max(scores)
    if highest - lowest < 1e-9:
        return [0.0] * len(scores)
    return [(value - lowest) / (highest - lowest) for value in scores]


In [None]:
def collect_candidates(query, lang, topk=50):
    """Union of the doc ids returned by the embedding, BM25 and fuzzy retrievers.

    Args:
        query: Query text passed unchanged to each retriever.
        lang: Corpus language passed unchanged to each retriever.
        topk: Number of hits requested from each retriever.

    Returns:
        List of distinct doc ids in order of first appearance (embedding hits
        first, then BM25, then fuzzy). A dict is used instead of a set so the
        order, and therefore downstream tie-breaking, is the same on every run.
    """
    ordered = {}
    for retriever in (search_embeddings, search_bm25, search_fuzzy):
        for hit in retriever(query, lang, topk):
            ordered.setdefault(hit["doc_id"], None)
    return list(ordered)


In [None]:
# ===============================
# Embedding index lookup tables
# ===============================

# Map str(doc_id) -> position in the doc-id list. semantic_score uses that
# position as the row index into en_emb / bn_emb.
en_id_to_embidx = {str(did): i for i, did in enumerate(en_doc_ids)}
bn_id_to_embidx = {str(did): i for i, did in enumerate(bn_doc_ids)}

print("Embedding index maps ready.")
print("EN:", len(en_id_to_embidx), "BN:", len(bn_id_to_embidx))


Embedding index maps ready.
EN: 3855 BN: 5694


In [None]:
def semantic_score(query, doc_id, lang):
    """Cosine similarity between the LaBSE embedding of ``query`` and one stored document embedding.

    ``lang == "en"`` uses the English lookup table and matrix; any other
    value uses the Bangla ones. Returns 0.0 when ``doc_id`` has no entry in
    the lookup table.
    """
    if lang == "en":
        row_of, matrix = en_id_to_embidx, en_emb
    else:
        row_of, matrix = bn_id_to_embidx, bn_emb

    row = row_of.get(str(doc_id))
    if row is None:
        return 0.0

    # Slice with row:row+1 to keep the document vector two-dimensional.
    similarity = cosine_similarity(embed_query(query), matrix[row:row + 1])
    return float(similarity[0][0])


In [None]:
# Row lookup for the BM25 indexes: str(doc_id) -> position in en_corpus /
# bn_corpus. Nothing else in the visible notebook defines these two maps, so
# they are built here from en_ids / bn_ids (the ids collected in step with the
# BM25 corpora), keeping bm25_score usable on a fresh kernel.
en_id_to_bmidx = {str(did): i for i, did in enumerate(en_ids)}
bn_id_to_bmidx = {str(did): i for i, did in enumerate(bn_ids)}


def bm25_score(query, doc_id, lang):
    """BM25 score of a single document for ``query``.

    ``lang == "en"`` uses the English tokenizer and index; any other value
    uses the Bangla ones. Returns 0.0 when ``doc_id`` is not part of the BM25
    corpus. Every call scores the whole corpus and then picks one entry.
    """
    if lang == "en":
        idx = en_id_to_bmidx.get(str(doc_id))
        if idx is None:
            return 0.0
        return float(bm25_en.get_scores(tokenize_en(query))[idx])
    else:
        idx = bn_id_to_bmidx.get(str(doc_id))
        if idx is None:
            return 0.0
        return float(bm25_bn.get_scores(tokenize_bn(query))[idx])


In [None]:
def fuzzy_score(query, doc_id, lang):
    """Fuzzy similarity between ``query`` and the title of one document.

    ``lang == "en"`` looks the document up in the English store; any other
    value uses the Bangla store. Returns 0.0 when the document is unknown or
    has no title, mirroring how ``semantic_score`` and ``bm25_score`` treat
    unknown ids instead of raising ``KeyError``.
    """
    docs = en_docs if lang == "en" else bn_docs
    doc = docs.get(doc_id)
    if doc is None:
        # The stores are keyed by strings; accept non-string ids as well.
        doc = docs.get(str(doc_id))
    if not doc:
        return 0.0
    title = doc.get("title", "")
    if not title:
        return 0.0
    return fuzzy_similarity(query, title)


In [None]:
#  final_score =
#     α · semantic_similarity   (LaBSE)
#   + β · lexical_score         (BM25)
#   + γ · fuzzy_score           (char-ngram + edit)


In [None]:
def search_hybrid(query, lang, topk=5,
                  alpha=0.4, beta=0.35, gamma=0.25, n_candidates=50):
    """Rank documents by a weighted blend of semantic, BM25 and fuzzy scores.

    final = alpha * semantic + beta * bm25 + gamma * fuzzy, where each signal
    is min-max normalized over the candidate pool.

    Args:
        query: Query text.
        lang: ``"en"`` for the English corpus; any other value for Bangla.
        topk: Number of results to return.
        alpha, beta, gamma: Weights of the semantic, BM25 and fuzzy signals.
        n_candidates: Hits requested from each retriever when building the
            candidate pool (previously fixed at 50).

    Returns:
        Up to ``topk`` dicts with ``score``, ``doc_id``, ``title`` and
        ``url``, best first; an empty list when no candidate is found.
    """
    docs = en_docs if lang == "en" else bn_docs

    # The embedding retriever can emit ids that have no entry in the doc
    # store; those carry no title or url, so drop them here rather than
    # hitting a KeyError further down.
    candidates = [
        did for did in collect_candidates(query, lang, topk=n_candidates)
        if did in docs
    ]
    if not candidates:
        return []

    sem_scores, lex_scores, fuz_scores = [], [], []

    for did in candidates:
        sem_scores.append(semantic_score(query, did, lang))
        lex_scores.append(bm25_score(query, did, lang))
        fuz_scores.append(fuzzy_score(query, did, lang))

    # Normalize each signal so the weights act on comparable [0, 1] ranges.
    sem_n = minmax_normalize(sem_scores)
    lex_n = minmax_normalize(lex_scores)
    fuz_n = minmax_normalize(fuz_scores)

    final = []
    for i, did in enumerate(candidates):
        score = (
            alpha * sem_n[i] +
            beta  * lex_n[i] +
            gamma * fuz_n[i]
        )
        doc = docs[did]
        final.append({
            "score": score,
            "doc_id": did,
            "title": doc.get("title",""),
            "url": doc.get("url","")
        })

    final.sort(key=lambda x: x["score"], reverse=True)
    return final[:topk]


In [None]:
def demo_hybrid(query, topk=5):
    """Print hybrid-retrieval hits for ``query`` and, if translated, for its translation."""
    bar = "=" * 90
    print(bar)
    print("HYBRID QUERY:", query)
    print(bar)

    src = processor.detect_language(query)
    other = "en" if src != "en" else "bn"

    pq = processor.process(query, target_lang=other)

    # (heading, query text, corpus language) for each retrieval pass.
    passes = [("\n--- Hybrid retrieval (same language) ---", query, src)]
    if pq.translated:
        passes.append(
            ("\n--- Hybrid retrieval (translated query) ---", pq.translated, other)
        )

    for heading, text, lang in passes:
        print(heading)
        for hit in search_hybrid(text, lang, topk):
            print(f"  [{hit['score']:.4f}] {hit['title'][:80]}")


In [None]:
# Hybrid-retrieval demo for one English and two Bangla queries.
demo_hybrid("viral fever")
demo_hybrid("বাংলাদেশ নির্বাচন ফলাফল")
demo_hybrid("ঢাকা আবহাওয়া")


HYBRID QUERY: viral fever

--- Hybrid retrieval (same language) ---
  [0.8303] Patients with cold-related diseases increase
  [0.7840] Cold spell prevailing in Bagerhat
  [0.7028] Biting Cold 195 admitted in Khulna Shishu Hospital in 3 days
  [0.6308] Severe cold grips Chatmohar
  [0.6038] 35 more dengue patients hospitalized

--- Hybrid retrieval (translated query) ---
  [0.7468] ভাইরাল ‘বুক চিনচিন’ গানের গায়ক পাবেলের বিয়ে
  [0.6468] ছোটদের বাতরোগ হলে কী করবেন
  [0.6139] কীভাবে দিন কাটছে প্রবাসী ইরানিদের
  [0.6036] শাহরুখকে নিয়ে ভাইরাল স্ক্রিনশটটি ভুয়া, বললেন সেই তুর্কী অভিনেত্রী
  [0.6036] কনকনে ঠান্ডায় কুড়িগ্রামে জনজীবন স্থবির
HYBRID QUERY: বাংলাদেশ নির্বাচন ফলাফল

--- Hybrid retrieval (same language) ---
  [0.7761] যতই ষড়যন্ত্র হোক, ইনশাআল্লাহ দেশে নির্বাচন হবে : ফারুক
  [0.7326] প্রাথমিকে শিক্ষক নিয়োগের ফলে উত্তীর্ণ ৬৯২৬৫, প্রার্থীদের ৬ শর্ত, জেলাভিত্তিক ফল 
  [0.7104] প্রাথমিকে সহকারী শিক্ষক নিয়োগের ফল প্রকাশ হতে পারে আজ
  [0.6847] এবি পার্টির নির্বাচন পরিচালনা কমিটি গঠন
  [0.67