# Module D — Ranking, Scoring, & Evaluation

1. Translation Failures
2. Named Entity Mismatch
3. Semantic vs. Lexical Wins
4. Cross-Script Ambiguity
5. Code-Switching

In [1]:
import feedparser
import json
from tqdm import tqdm
import os
from bs4 import BeautifulSoup
import html
import re
import pickle
from rank_bm25 import BM25Okapi

## 4. BN <-> EN CLIR

### 4.1 Load Saved BM25 Indexes

In [2]:
def load_index(path):
    """Read the file at ``path`` in binary mode and return the unpickled object.

    NOTE(security): ``pickle.load`` can run arbitrary code embedded in the
    file, so only use this on index files that come from a trusted source.
    """
    with open(path, 'rb') as index_file:
        loaded = pickle.load(index_file)
    return loaded

# Load both serialized index bundles first, then unpack them into the
# module-level names that the search helpers read.
en_pack = load_index('bm25_en.pkl')
bn_pack = load_index('bm25_bn.pkl')

# English side: scorer, positional doc ids, and the per-document records.
bm25_en, doc_ids_en, docs_en = en_pack['bm25'], en_pack['doc_ids'], en_pack['docs']

# Bengali side: same three entries, same positional alignment.
bm25_bn, doc_ids_bn, docs_bn = bn_pack['bm25'], bn_pack['doc_ids'], bn_pack['docs']


def tokenize_en(text):
    """Split English text into lowercase alphanumeric tokens.

    Every character that is not ``a-z``, ``0-9`` or whitespace is deleted
    (not replaced by a space), so ``"134-hour"`` becomes the single token
    ``"134hour"``. Returns a list of tokens; empty input gives ``[]``.
    """
    lowered = text.lower()
    alnum_only = re.sub(r'[^a-z0-9\s]', '', lowered)
    single_spaced = re.sub(r'\s+', ' ', alnum_only)
    return single_spaced.strip().split()

def tokenize_bn(text):
    """Split Bengali text on whitespace and return the list of tokens.

    Runs of whitespace are collapsed and the ends trimmed before splitting;
    no characters are removed and no case folding is applied.
    """
    single_spaced = re.sub(r'\s+', ' ', text)
    trimmed = single_spaced.strip()
    return trimmed.split()

def _rank_top_k(bm25, doc_ids, docs, query_tokens, top_k):
    """Score all documents against ``query_tokens`` and package the best hits.

    Args:
        bm25: scorer exposing ``get_scores(tokens)``, returning one score per
            indexed document.
        doc_ids: sequence of document ids, positionally aligned with the scores.
        docs: sequence of dict-like records, positionally aligned with the scores.
        query_tokens: list of already-tokenized query terms.
        top_k: maximum number of hits to return; ``None`` returns every document.

    Returns:
        A list of dicts with the keys ``doc_id``, ``score``, ``title`` and
        ``url``, ordered by descending score. ``title``/``url`` default to
        ``""`` when the record lacks them.
    """
    # A negative bound would make the slice below return "all but the last k"
    # hits, which is never what a caller means; treat it like zero.
    if top_k is not None and top_k <= 0:
        return []

    scores = bm25.get_scores(query_tokens)
    best = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_k]
    return [
        {
            "doc_id": doc_ids[i],
            "score": float(scores[i]),
            "title": docs[i].get("title", ""),
            "url": docs[i].get("url", ""),
        }
        for i in best
    ]


def search_en(query, top_k=5):
    """Return the ``top_k`` best English-index hits for an English ``query``.

    The query is tokenized with ``tokenize_en`` and scored with the
    module-level ``bm25_en`` index. See ``_rank_top_k`` for the result shape.
    """
    return _rank_top_k(bm25_en, doc_ids_en, docs_en, tokenize_en(query), top_k)


def search_bn(query, top_k=5):
    """Return the ``top_k`` best Bengali-index hits for a Bengali ``query``.

    The query is tokenized with ``tokenize_bn`` and scored with the
    module-level ``bm25_bn`` index. See ``_rank_top_k`` for the result shape.
    """
    return _rank_top_k(bm25_bn, doc_ids_bn, docs_bn, tokenize_bn(query), top_k)

# Report how many document records each loaded index bundle contains.
print("Index loaded:" , len(docs_en), "English documents and", len(docs_bn), "Bengali documents.")


Index loaded: 6864 English documents and 10362 Bengali documents.


### 4.2 Language Detection (BN vs EN)

In [3]:
def is_bangla(text):
    """Return True if ``text`` contains at least one Bengali-script character.

    A character counts as Bengali when its code point lies in the Unicode
    Bengali block, U+0980 through U+09FF inclusive. Mixed-script text is
    therefore reported as Bangla as soon as one such character appears;
    empty text gives False.
    """
    return any(0x0980 <= ord(ch) <= 0x09FF for ch in text)

In [4]:
# Quick manual check of is_bangla on one Bengali and one English sentence.
# The notebook echoes only the value of the cell's last expression, so the
# result of the first call is computed but never displayed.
is_bangla("এই একটি বাংলা বাক্য।")
is_bangla("this is an english sentence.")

False

### 4.3 Load BN <-> EN translation models (MarianMT / OPUS-MT)

In [5]:
# Environment sanity check: print the installed torch / transformers versions
# and confirm that the Marian classes used by the next cell can be imported.
import torch
import transformers
print(torch.__version__)
print(transformers.__version__)

# Importing here surfaces a broken transformers install before any model load.
from transformers import MarianMTModel, MarianTokenizer
print("Marian import successful!")


  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(


2.5.1
4.36.2


  _torch_pytree._register_pytree_node(


Marian import successful!


In [6]:
from transformers import MarianMTModel, MarianTokenizer

# Checkpoint identifiers, one per translation direction.
BN_EN_NAME = "Helsinki-NLP/opus-mt-bn-en"
EN_BN_NAME = "shhossain/opus-mt-en-to-bn"

# Bengali -> English: tokenizer first, then the model, from the same checkpoint.
tok_bn_en, mod_bn_en = (
    MarianTokenizer.from_pretrained(BN_EN_NAME),
    MarianMTModel.from_pretrained(BN_EN_NAME),
)

# English -> Bengali: same loading order as above.
tok_en_bn, mod_en_bn = (
    MarianTokenizer.from_pretrained(EN_BN_NAME),
    MarianMTModel.from_pretrained(EN_BN_NAME),
)

  return torch.load(checkpoint_file, map_location=map_location)


In [7]:
def translate_bn_to_en(text):
    """Translate a single Bengali string to English and return the result.

    Uses the module-level ``tok_bn_en`` tokenizer and ``mod_bn_en`` model.
    The text is sent as a one-item batch with padding and truncation enabled,
    and generation is capped at 128 new tokens.
    """
    encoded = tok_bn_en([text], return_tensors="pt", padding=True, truncation=True)
    generated_ids = mod_bn_en.generate(**encoded, max_new_tokens=128)
    decoded = tok_bn_en.batch_decode(generated_ids, skip_special_tokens=True)
    # Only one text was submitted, so the batch holds exactly one translation.
    return decoded[0]

print("bn_to_en Translation models loaded.")


bn_to_en Translation models loaded.


In [8]:
def translate_en_to_bn(text):
    """Translate a single English string to Bengali and return the result.

    Uses the module-level ``tok_en_bn`` tokenizer and ``mod_en_bn`` model.
    The text is sent as a one-item batch with padding and truncation enabled,
    and generation is capped at 128 new tokens.
    """
    encoded = tok_en_bn([text], return_tensors="pt", padding=True, truncation=True)
    generated_ids = mod_en_bn.generate(**encoded, max_new_tokens=128)
    decoded = tok_en_bn.batch_decode(generated_ids, skip_special_tokens=True)
    # Only one text was submitted, so the batch holds exactly one translation.
    return decoded[0]

print("en_to_bn Translation models loaded.")

en_to_bn Translation models loaded.


In [9]:
print(translate_bn_to_en("বাংলাদেশ একটি সুন্দর দেশ।"))
print(translate_en_to_bn("Bangladesh is a beautiful country."))

Bangladesh is a beautiful country.
বাংলাদেশ একটি সুন্দর দেশ।


### 4.4 CLIR Search Function

In [10]:
def clir_search(query, top_k=5):
    """Search both indexes for ``query``, translating it for the other language.

    The query language is decided by ``is_bangla``. The original text is used
    against the index of its own language and its machine translation against
    the other index.

    Args:
        query: query text in Bengali or English.
        top_k: number of hits requested from each index.

    Returns:
        A ``(bn_payload, en_payload)`` tuple. Each payload is a dict with:
          - ``"queary_language"``: ``"bn"`` or ``"en"`` (legacy misspelled key,
            kept so existing readers keep working),
          - ``"query_language"``: same value under the correctly spelled key,
          - ``"translated_query"``: the translated query text,
          - ``"results_language"``: ``"bn"`` or ``"en"``,
          - ``"results"``: the hit list from ``search_bn`` / ``search_en``.
    """
    if is_bangla(query):
        query_lang = "bn"
        translated = translate_bn_to_en(query)
        bn_query, en_query = query, translated
    else:
        query_lang = "en"
        translated = translate_en_to_bn(query)
        bn_query, en_query = translated, query

    hits_by_lang = {
        "bn": search_bn(bn_query, top_k),
        "en": search_en(en_query, top_k),
    }

    def _payload(results_lang):
        # Both payloads share the query metadata; only the result side differs.
        return {
            "queary_language": query_lang,
            "query_language": query_lang,
            "translated_query": translated,
            "results_language": results_lang,
            "results": hits_by_lang[results_lang],
        }

    return _payload("bn"), _payload("en")

### 4.5 Test

In [11]:
# Demo: a Bengali query searched against both indexes.
query_key = "তারেক রহমান"
out_bn, out_en = clir_search(query_key, top_k=5)

# Hits from the Bengali index (the query is used as typed).
print("Print result in Bangla:")
for r in out_bn['results']:
    print(f"{r['score']} {r['title']} - {r['url']}")
    print("____________________________________________")

# Hits from the English index (the query is machine-translated first).
print("Result in English:")
for r in out_en['results']:
    print(f"{r['score']} {r['title']} - {r['url']}")
    print("____________________________________________")

Print result in Bangla:
11.747503272245343 তারেক রহমান গুলশানের বাসায় পৌঁছেছেন - https://www.prothomalo.com/bangladesh/ulencxxzco
____________________________________________
11.640954876557565 দেশে ফেরার পথে তারেক রহমানের ফেসবুক পোস্ট, কী লিখেছেন - https://www.prothomalo.com/politics/uu1rv166rk
____________________________________________
11.494181537600852 তারেক রহমানকে স্বাগত জানালেন জামায়াত আমির - https://www.prothomalo.com/politics/f59fy9fvcy
____________________________________________
11.489458356733403 আল্লাহ যাকে ইচ্ছা ক্ষমতা দেন, যার থেকে ইচ্ছা ক্ষমতা কেড়ে নেন: ফেসবুক পোস্টে তারেক রহমান - https://www.prothomalo.com/politics/2c9ghxvi2a
____________________________________________
11.459852150314788 তারেক রহমানের অনুরোধ রেখেছেন যুক্তরাজ্য বিএনপির নেতা-কর্মীরা - https://www.prothomalo.com/politics/jvh1d4x8oh
____________________________________________
Result in English:
15.862190251827919 nn online: md tarek rahman, member secretary of amjanatar dal, ended his 134-hour-long hun

In [12]:
# Demo: an English query searched against both indexes.
query_key = "khaleda zia"
out_bn, out_en = clir_search(query_key, top_k=5)

# Hits from the Bengali index (the query is machine-translated first).
print("Print result in Bangla:")
for r in out_bn['results']:
    print(f"{r['score']} {r['title']} - {r['url']}")
    print("____________________________________________")

# Hits from the English index (the query is used as typed).
print("Result in English:")
for r in out_en['results']:
    print(f"{r['score']} {r['title']} - {r['url']}")
    print("____________________________________________")

Print result in Bangla:
8.302197291416983 খালেদা জিয়া ‘খুব ক্রিটিক্যাল কন্ডিশনে’ - https://www.prothomalo.com/bangladesh/khaaledaa-jiyyaa-khub-krittikyaal-knddishne
____________________________________________
8.069578602100007 খালেদা জিয়া ও বাংলাদেশ একাকার হয়ে গেছে: কবি আবদুল হাই শিকদার - https://www.jugantor.com/national/1049989
____________________________________________
7.929153039588992 খালেদা জিয়া স্নেহের বন্ধনে আবদ্ধ রেখেছিলেন: জাতীয় পার্টির আনিসুল-হাওলাদার - https://www.prothomalo.com/politics/lpihqoland
____________________________________________
7.822809483260775 খালেদা জিয়ার মৃত্যুতে জাতিসংঘ মহাসচিবের শোক - https://www.jugantor.com/national/1049885
____________________________________________
7.697655867679527 খালেদা জিয়ার সাজা স্থগিতের মেয়াদ ছয় মাস বাড়ল, বিদেশ যেতে পারবেন না - https://www.prothomalo.com/politics/cu1fscyz6m
____________________________________________
Result in English:
14.728275746370663 nn online: bangladesh nationalist party (bnp) chairperson and former