# Module C — Retrieval Models

1. Model 1: Lexical Retrieval (BM25)
2. Model 2: Fuzzy/Transliteration Matching
3. Model 3: Semantic Matching
4. Model 4: Hybrid Ranking

In [1]:
import feedparser
import json
from tqdm import tqdm
import os
from bs4 import BeautifulSoup
import html
import re
import pickle
from rank_bm25 import BM25Okapi

## 3. Configuration

### 3.1 Path

In [2]:
# Input corpora (cleaned JSON document dumps). The defaults are the original
# machine-specific locations; set the CLIR_EN_PATH / CLIR_BN_PATH environment
# variables to run the notebook on another machine without editing this cell.
EN_PATH = os.environ.get(
    "CLIR_EN_PATH",
    r"E:\DM\Cross-Lingual-Information-Retrieval-System\data\document_en_clean.json",
)
BN_PATH = os.environ.get(
    "CLIR_BN_PATH",
    r"E:\DM\Cross-Lingual-Information-Retrieval-System\data\document_bn_clean.json",
)

# Output files for the pickled BM25 index bundles (relative to the working directory).
EN_INDEX_OUT = 'bm25_en.pkl'
BN_INDEX_OUT = 'bm25_bn.pkl'

### 3.2 Tokenizer

#### 3.2.1 English Tokenizer

In [3]:
def tokenize_en(text):
    """Split English text into lowercase alphanumeric tokens.

    The text is lowercased, then every character that is not ``a-z``, ``0-9``
    or whitespace is deleted (not replaced by a space), so ``"15-member"``
    becomes the single token ``"15member"``. The remainder is split on
    whitespace.

    Args:
        text: Input string.

    Returns:
        List of tokens; empty when nothing survives the filtering.
    """
    lowered = text.lower()
    alnum_only = re.sub(r'[^a-z0-9\s]', '', lowered)
    # split() with no separator already collapses whitespace runs and ignores
    # leading/trailing whitespace, so no separate normalisation pass is needed.
    return alnum_only.split()

#### 3.2.2 Bangla Tokenizer

In [4]:
def tokenize_bn(text):
    """Split Bangla text into whitespace-delimited tokens.

    No case folding or punctuation removal is applied: punctuation attached to
    a word stays part of that token.

    Args:
        text: Input string.

    Returns:
        List of tokens; empty for an empty or whitespace-only string.
    """
    # split() with no separator treats any run of whitespace as one delimiter
    # and drops leading/trailing whitespace.
    return text.split()

### 3.3 Build and Save English BM25 index

In [5]:
# Load the cleaned English corpus (a JSON list of document dicts).
with open(EN_PATH, 'r', encoding='utf-8') as f:
    docs_en = json.load(f)

# Tokenized documents and their ids, kept in the same order as docs_en so a
# BM25 score index maps straight back to the document.
# (`corpus_end` is the original variable name; kept so existing references still work.)
corpus_end = []
doc_ids_en = []

for doc in docs_en:
    # `or ""` also covers fields that are present but null in the JSON;
    # a plain .get(key, "") would return None there and break the concatenation.
    title = doc.get("title") or ""
    body = doc.get("body") or ""

    # Index title and body together as one searchable text.
    full_text = (title + " " + body).strip()

    corpus_end.append(tokenize_en(full_text))
    doc_ids_en.append(doc.get("doc_id", " "))

bm25_en = BM25Okapi(corpus_end)

# Persist the model with the ids and raw documents so search can run from the
# pickle alone.
with open(EN_INDEX_OUT, 'wb') as f:
    pickle.dump({"bm25": bm25_en, "doc_ids": doc_ids_en, "docs": docs_en}, f)

print("English docs indexing:", len(doc_ids_en))
print("BM25 English index saved to:", EN_INDEX_OUT)

English docs indexing: 6864
BM25 English index saved to: bm25_en.pkl


### 3.4 Build and Save Bangla BM25 index

In [6]:
# Load the cleaned Bangla corpus (a JSON list of document dicts).
with open(BN_PATH, 'r', encoding='utf-8') as f:
    docs_bn = json.load(f)

# Tokenized documents and their ids, kept in the same order as docs_bn.
# NOTE(review): this reuses the name `corpus_end` from the English indexing
# cell and overwrites it; the name is kept so existing references still work.
corpus_end = []
doc_ids_bn = []

for doc in docs_bn:
    # `or ""` also covers fields that are present but null in the JSON;
    # a plain .get(key, "") would return None there and break the concatenation.
    title = doc.get("title") or ""
    body = doc.get("body") or ""

    # Index title and body together as one searchable text.
    full_text = (title + " " + body).strip()

    corpus_end.append(tokenize_bn(full_text))
    doc_ids_bn.append(doc.get("doc_id", " "))

bm25_bn = BM25Okapi(corpus_end)

# Persist the model with the ids and raw documents so search can run from the
# pickle alone.
with open(BN_INDEX_OUT, 'wb') as f:
    pickle.dump({"bm25": bm25_bn, "doc_ids": doc_ids_bn, "docs": docs_bn}, f)

print("Bengali docs indexing:", len(doc_ids_bn))
print("BM25 Bengali index saved to:", BN_INDEX_OUT)

Bengali docs indexing: 10362
BM25 Bengali index saved to: bm25_bn.pkl


### 3.5 Load Indexes and Search Function

#### 3.5.1 Load Indexes

In [7]:
def load_index(path):
    """Unpickle and return the object stored in the file at ``path``.

    Args:
        path: Location of a pickle file.

    Returns:
        Whatever object was pickled into the file.

    Note:
        Unpickling can execute arbitrary code, so only load files from a
        trusted source.
    """
    with open(path, "rb") as index_file:
        bundle = pickle.load(index_file)
    return bundle

# Reload both index bundles from disk so the search functions work from the
# saved pickles.
en_pack = load_index(EN_INDEX_OUT)
bn_pack = load_index(BN_INDEX_OUT)

# Unpack each bundle into the module-level names read by search_en / search_bn.
bm25_en, doc_ids_en, docs_en = (en_pack[key] for key in ("bm25", "doc_ids", "docs"))
bm25_bn, doc_ids_bn, docs_bn = (bn_pack[key] for key in ("bm25", "doc_ids", "docs"))

#### 3.5.2 English Search Function

In [8]:
def search_en(query, top_k=5):
    """Rank the English corpus against ``query`` with BM25 and return the best hits.

    Args:
        query: Free-text English query; tokenized with ``tokenize_en``.
        top_k: Maximum number of hits to return. ``None`` returns every
            document; zero or a negative value returns an empty list.

    Returns:
        A list of dicts ordered by descending score, each with the keys
        ``doc_id``, ``score`` (float), ``title`` and ``url``.

    Note:
        Hits are not filtered by score, so the list is filled up to ``top_k``
        even when the scores are low.
    """
    if top_k is not None and top_k <= 0:
        # A negative slice bound would otherwise return "all but the last
        # |top_k|" documents instead of nothing.
        return []

    query_tokens = tokenize_en(query)
    scores = bm25_en.get_scores(query_tokens)

    # Document positions ordered best-first; sorted() is stable, so ties keep
    # corpus order.
    ranked = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_k]

    return [
        {
            "doc_id": doc_ids_en[i],
            "score": float(scores[i]),
            "title": docs_en[i].get("title", ""),
            "url": docs_en[i].get("url", ""),
        }
        for i in ranked
    ]


#### 3.5.3 Bangla Search Function

In [9]:
def search_bn(query, top_k=5):
    """Rank the Bangla corpus against ``query`` with BM25 and return the best hits.

    Args:
        query: Free-text Bangla query; tokenized with ``tokenize_bn``.
        top_k: Maximum number of hits to return. ``None`` returns every
            document; zero or a negative value returns an empty list.

    Returns:
        A list of dicts ordered by descending score, each with the keys
        ``doc_id``, ``score`` (float), ``title`` and ``url``.

    Note:
        Hits are not filtered by score, so the list is filled up to ``top_k``
        even when the scores are low.
    """
    if top_k is not None and top_k <= 0:
        # A negative slice bound would otherwise return "all but the last
        # |top_k|" documents instead of nothing.
        return []

    query_tokens = tokenize_bn(query)
    scores = bm25_bn.get_scores(query_tokens)

    # Document positions ordered best-first; sorted() is stable, so ties keep
    # corpus order.
    ranked = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_k]

    return [
        {
            "doc_id": doc_ids_bn[i],
            "score": float(scores[i]),
            "title": docs_bn[i].get("title", ""),
            "url": docs_bn[i].get("url", ""),
        }
        for i in ranked
    ]

### 3.6 Test

#### 3.6.1 Test English Query

In [10]:
# Smoke test: print score, id and title of the top English hits, with each
# hit's URL on the following line.
for hit in search_en("Bangladesh cricket", top_k=5):
    print(f'{hit["score"]} | {hit["doc_id"]} | {hit["title"]}')
    print(f'   {hit["url"]}')

7.461620067836383 | en_003021 | sports reporter : the bangladesh cricket board announced on tuesday the 15-member bangladesh ‘a’ cricket squad for the upcoming asian cricket council (acc) asia cup rising stars, which is scheduled to be held at doha, the capital city of qatar from november 14 to november 21. akbar ali will lead the 15-member bangladesh ‘a’ cricket squad for the upcoming cricket meet. bangladesh ‘a’ cricket team will take part in the hong kong sixes, which is scheduled to be held in hong kong from november 7 to november 9. akbar ali will lead bangladesh ‘a’ cricket team in the hong kong sixes. meanwhile, bangladesh’s cricket is passing a busy time as the national cricket league (ncl) four-day is going underway. zawad abrar, sm meherob hasan, leg spinner of bangladesh under-19 cricket team shadhin islam have been called in the bangladesh ‘a’ cricket squad. a number of eight teams will splitting into two groups will take part in the asia cup rising stars. earlier, the name

#### 3.6.2 Test Bangla Query

In [11]:
# Smoke test: print score, id and title of the top Bangla hits, with each
# hit's URL on the following line.
# Alternative query to try: "বাংলাদেশ ক্রিকেট" ("Bangladesh cricket").
for hit in search_bn("তারেক রহমান", top_k=5):
    print(f'{hit["score"]} | {hit["doc_id"]} | {hit["title"]}')
    print(f'   {hit["url"]}')

11.747503272245343 | bn_002480 | তারেক রহমান গুলশানের বাসায় পৌঁছেছেন
   https://www.prothomalo.com/bangladesh/ulencxxzco
11.640954876557565 | bn_001006 | দেশে ফেরার পথে তারেক রহমানের ফেসবুক পোস্ট, কী লিখেছেন
   https://www.prothomalo.com/politics/uu1rv166rk
11.494181537600852 | bn_000863 | তারেক রহমানকে স্বাগত জানালেন জামায়াত আমির
   https://www.prothomalo.com/politics/f59fy9fvcy
11.489458356733403 | bn_003094 | আল্লাহ যাকে ইচ্ছা ক্ষমতা দেন, যার থেকে ইচ্ছা ক্ষমতা কেড়ে নেন: ফেসবুক পোস্টে তারেক রহমান
   https://www.prothomalo.com/politics/2c9ghxvi2a
11.459852150314788 | bn_000930 | তারেক রহমানের অনুরোধ রেখেছেন যুক্তরাজ্য বিএনপির নেতা-কর্মীরা
   https://www.prothomalo.com/politics/jvh1d4x8oh
