# Module B — Query Processing & Cross-Lingual Handling

1. Language Detection
2. Normalization
3. Query Conversion/Translation 
4. Query Expansion 
5. Named-Entity Mapping

In [1]:
import feedparser
import json
from tqdm import tqdm
import os
from bs4 import BeautifulSoup
import html
import re

## 2. Data Preprocessing

### 2.1 English Filtering

In [2]:
# Read the English document collection from its JSON file into memory.
opening_path = r"E:\DM\Cross-Lingual-Information-Retrieval-System\data\document_en.json"
with open(opening_path, mode="r", encoding="utf-8") as src:
    docs_en = json.loads(src.read())

# Report how many entries were loaded.
print(f"Total documents loaded: {len(docs_en)}")

Total documents loaded: 7262


In [3]:
# Keep only the English documents whose stored "token_count" is at least 20.
docs_en_clean = [doc for doc in docs_en if doc["token_count"] >= 20]

print(f"Total documents after cleaning: {len(docs_en_clean)}")

# Write the filtered list out as indented JSON, keeping non-ASCII characters unescaped.
saving_path = r"E:\DM\Cross-Lingual-Information-Retrieval-System\data\document_en_clean.json"
with open(saving_path, mode="w", encoding="utf-8") as sink:
    json.dump(docs_en_clean, sink, ensure_ascii=False, indent=2)

Total documents after cleaning: 6864


### 2.2 Bangla Filtering

In [4]:
# Read the Bangla document collection from its JSON file into memory.
opening_path = r"E:\DM\Cross-Lingual-Information-Retrieval-System\data\document_bn.json"
with open(opening_path, mode="r", encoding="utf-8") as src:
    docs_bn = json.loads(src.read())

# Report how many entries were loaded.
print(f"Total documents loaded: {len(docs_bn)}")

Total documents loaded: 10738


In [5]:
# Keep only the documents from docs_bn whose stored "token_count" is at least 20.
docs_bn_clean = [doc for doc in docs_bn if doc["token_count"] >= 20]

# NOTE(review): the label says "Bilingual", but this list is built from docs_bn only
# and is saved to document_bn_clean.json — consider rewording the message.
print(f"Total Bilingual documents after cleaning: {len(docs_bn_clean)}")

# Write the filtered list out as indented JSON, keeping non-ASCII characters unescaped.
saving_path = r"E:\DM\Cross-Lingual-Information-Retrieval-System\data\document_bn_clean.json"
with open(saving_path, mode="w", encoding="utf-8") as sink:
    json.dump(docs_bn_clean, sink, ensure_ascii=False, indent=2)

Total Bilingual documents after cleaning: 10362


### 2.3 English Text Normalization

In [6]:
def norm(text):
    """Return `text` lowercased with whitespace collapsed.

    The string is lowercased, split on runs of whitespace, and re-joined with
    single spaces, so leading/trailing whitespace is dropped and every internal
    run of whitespace becomes exactly one space.
    """
    lowered = text.lower()
    tokens = lowered.split()
    return " ".join(tokens)

In [7]:
# Normalize the title and body of every cleaned English document in place.
for doc in docs_en_clean:
    # Each field is normalized from its OWN text. The title used to be assigned
    # norm(doc["body"]), which replaced every title with a copy of the body.
    # A missing or empty title becomes "" so the "title" key is always present.
    doc["title"] = norm(doc.get("title") or "")
    doc["body"] = norm(doc["body"])

# Overwrite the cleaned English file with the normalized documents
# (indented JSON, non-ASCII characters left unescaped).
saving_path = r"E:\DM\Cross-Lingual-Information-Retrieval-System\data\document_en_clean.json"
with open(saving_path, "w", encoding="utf-8") as f:
    json.dump(docs_en_clean, f, ensure_ascii=False, indent=2)

In [8]:
# Two spellings of the same sentence: one already lowercase and single-spaced,
# the other with mixed case and runs of spaces.
p = "friends are snakes but python is not"
q = "Friends   Are Snakes  But Python Is Not"

# Compare the pair after norm() and after plain lowercasing.
# lower() alone leaves the extra spaces in q untouched.
same_after_norm = norm(p) == norm(q)
same_after_lower = p.lower() == q.lower()

print(same_after_norm)
print(same_after_lower)

True
False
