<a href="https://colab.research.google.com/github/ahmedrana603/NLP-Language-Modeling-for-Urdu-News-Articles/blob/main/Language_Modeling_for_Urdu_News_Articles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Importing Libraries**

In [None]:
import requests
from bs4 import BeautifulSoup
import json
import time
import re

# **Base URL**

In [None]:
# Listing page whose paginated results (?page=N) are scanned for article links.
base_url = "https://www.bbc.com/urdu/topics/cjgn7n9zzq7t"

# Shared accumulators filled by the scraping cells below.
article_links = set()                 # unique article URLs
raw_articles, metadata_list = [], []  # (article_id, body) tuples / per-article metadata dicts

# **Collecting Article Links**

In [None]:

# Scraping limits, named so they can be tuned in one place.
MAX_LISTING_PAGES = 49    # listing pages 1..49 are visited
MAX_ARTICLES = 270        # number of article links to keep
LISTING_TIMEOUT = 30      # seconds; requests has no default timeout and could otherwise hang forever

# A dict keeps insertion order, so links are de-duplicated AND the final
# selection is reproducible (slicing a set picks an arbitrary subset).
# Seeding from `article_links` works whether it is still the initial set or
# already a list from a previous run of this cell.
discovered_links = dict.fromkeys(article_links)

for page in range(1, MAX_LISTING_PAGES + 1):
    url = f"{base_url}?page={page}"
    try:
        res = requests.get(url, timeout=LISTING_TIMEOUT)
        res.raise_for_status()
    except requests.RequestException as exc:
        # A failed listing page should not abort the whole crawl.
        print(f"Skipping listing page {page}: {exc}")
        continue

    soup = BeautifulSoup(res.text, "html.parser")

    for a in soup.select("h2 a[href*='/urdu/articles/']"):
        href = a["href"]
        # Listing pages may use site-relative links; make them absolute.
        if href.startswith("/"):
            href = "https://www.bbc.com" + href
        discovered_links.setdefault(href)

    if len(discovered_links) >= MAX_ARTICLES:
        break

article_links = list(discovered_links)[:MAX_ARTICLES]


# **Scraping Articles**

In [None]:
ARTICLE_TIMEOUT = 30  # seconds; requests has no default timeout and could otherwise hang forever


def _is_credit_line(text):
    """Return True for copyright / image-source lines that should not enter the corpus."""
    return text.startswith("©") or "،تصویر کا ذریعہ" in text


def _collect_paragraphs(containers, should_skip):
    """Return the stripped text of every <p> inside `containers` that `should_skip` does not reject."""
    texts = []
    for container in containers:
        for p in container.find_all("p"):
            text = p.get_text(strip=True)
            if not should_skip(text):
                texts.append(text)
    return texts


def _text_or_default(tag, default):
    """Return the stripped text of `tag`, or `default` when the tag was not found."""
    return tag.get_text(strip=True) if tag else default


# Start from empty containers so re-running this cell does not append
# duplicate articles to the lists created in the configuration cell.
raw_articles.clear()
metadata_list.clear()

for idx, link in enumerate(article_links, 1):
    try:
        res = requests.get(link, timeout=ARTICLE_TIMEOUT)
        res.raise_for_status()
    except requests.RequestException as exc:
        # Do not store an error page as if it were an article.
        print(f"Skipping article {idx} ({link}): {exc}")
        time.sleep(0.5)
        continue

    soup = BeautifulSoup(res.text, "html.parser")

    title = _text_or_default(soup.find("h1", class_="article-heading"), "No title found")
    date = _text_or_default(soup.find("time"), "No date found")
    author = _text_or_default(soup.find("span", class_="byline__name"), "BBC Urdu")
    category = _text_or_default(soup.find("a", class_="bbc-1f2hn8h e1hk9ate4"), "Unknown")

    # Body extraction tries three page layouts in order; the first one that
    # yields any paragraph wins.
    # 1) paragraphs inside the <article> element
    article_tag = soup.find("article")
    body_paragraphs = _collect_paragraphs(
        [article_tag] if article_tag is not None else [],
        _is_credit_line,
    )

    # 2) paragraphs inside divs whose class mentions "RichTextComponentWrapper"
    if not body_paragraphs:
        body_paragraphs = _collect_paragraphs(
            soup.find_all("div", class_=lambda x: x and "RichTextComponentWrapper" in x),
            _is_credit_line,
        )

    # 3) last resort: any paragraph longer than 5 characters inside a right-to-left div
    if not body_paragraphs:
        body_paragraphs = _collect_paragraphs(
            soup.find_all("div", {"dir": "rtl"}),
            lambda text: len(text) <= 5,
        )

    body = "\n".join(body_paragraphs).strip()

    raw_articles.append((idx, body))
    metadata_list.append({
        "article_id": idx,
        "title": title,
        "url": link,
        "category": category,
        "date": date,
        "author": author
    })

    # Pause between requests to avoid hammering the server.
    time.sleep(0.5)


# **Txt File**

In [None]:
# Dump every scraped body to raw.txt, each preceded by a "### Article N ###"
# marker line and followed by a blank line.
with open("raw.txt", "w", encoding="utf-8") as out:
    out.writelines(
        f"### Article {article_id} ###\n" + text + "\n\n"
        for article_id, text in raw_articles
    )


# **Metadata JSON file**

In [None]:
# Persist the per-article metadata; ensure_ascii=False keeps the Urdu text
# readable in the file instead of \uXXXX escapes.
with open("metadata.json", "w", encoding="utf-8") as meta_file:
    json.dump(
        metadata_list,
        meta_file,
        ensure_ascii=False,
        indent=2,
    )


# **Diacritics Removal**

In [21]:

def remove_diacritics(text):
    """
    Strip Urdu diacritical marks (aerab) from ``text``.

    Every character in the following Unicode ranges is deleted:
    U+064B to U+065F, U+0670, and U+06D6 to U+06ED.
    All other characters are returned unchanged.
    """
    diacritic_chars = re.compile(r'[\u064B-\u065F\u0670\u06D6-\u06ED]')
    return diacritic_chars.sub('', text)


# Stage input: the scraped corpus written earlier in the notebook.
with open("raw.txt", "r", encoding="utf-8", errors='ignore') as source_file:
    raw_content = source_file.read()

# Strip the diacritic marks, then persist the result for the next stage.
cleaned_content = remove_diacritics(raw_content)

with open("no_diacritics.txt", "w", encoding="utf-8") as target_file:
    target_file.write(cleaned_content)

print("Diacritics removed successfully.")

Diacritics removed successfully.


# **Noise Removal**

# **Removal of Non-Urdu Text**

In [25]:

def remove_urls(text):
    """Delete each ``http`` or ``www`` together with the non-whitespace characters that follow it."""
    return re.compile(r'http\S+|www\S+').sub('', text)

def remove_emojis(text):
    """Delete characters that fall in the emoji code-point ranges listed below."""
    emoji_ranges = (
        u"\U0001F600-\U0001F64F"   # emoticons
        u"\U0001F300-\U0001F5FF"   # miscellaneous symbols and pictographs
        u"\U0001F680-\U0001F6FF"   # transport and map symbols
        u"\U0001F1E0-\U0001F1FF"   # regional indicator symbols (flags)
    )
    return re.sub("[" + emoji_ranges + "]+", '', text, flags=re.UNICODE)

def remove_english(text):
    """Delete every run of ASCII letters (A-Z, a-z); digits and other characters are kept."""
    latin_run = re.compile(r'[A-Za-z]+')
    return latin_run.sub('', text)

def remove_navigation_text(text):
    """Delete the fixed set of site-navigation / subscription phrases from ``text``."""
    # Order matters: the longer phrases come before the shorter phrases they
    # contain, so a long phrase is removed whole rather than in pieces.
    boilerplate = (
        "مواد پر جائیں",
        "سبسکرائب کرنے کے لیے کلک کریں",
        "بی بی سی اردو کی خبروں اور فیچرز کو اپنے فون پر حاصل کریں",
        "اپنے فون پر حاصل کریں",
        "کلک کریں",
    )
    cleaned = text
    for snippet in boilerplate:
        cleaned = cleaned.replace(snippet, '')
    return cleaned

def remove_noise(text):
    """Run the noise filters in order: URLs, emojis, ASCII letters, navigation phrases."""
    cleaning_steps = (
        remove_urls,
        remove_emojis,
        remove_english,
        remove_navigation_text,
    )
    for step in cleaning_steps:
        text = step(text)
    return text


def remove_non_urdu(text):
    """
    Delete every character that is not one of: a code point in U+0600 to U+06FF,
    whitespace, the punctuation marks ۔ ؟ ! ،, or an ASCII digit 0-9.
    """
    disallowed = re.compile(r'[^\u0600-\u06FF\s۔؟!،0-9]')
    return disallowed.sub('', text)


# Stage input is the output of the diacritics-removal stage. The file name
# previously read here ("noise_removed.txt") is not produced by any cell of
# this notebook, so a fresh Restart & Run All failed at this point.
with open("no_diacritics.txt", "r", encoding="utf-8", errors='ignore') as f:
    content = f.read()

# Split on the article marker BEFORE cleaning. remove_noise() deletes Latin
# letters, which would erase the word "Article" from every marker and merge
# the whole corpus into a single article.
split_articles = content.split("### Article ")
filtered_articles = []

for part in split_articles:
    if not part.strip():
        continue

    lines = part.split("\n", 1)
    # After the split the header line looks like "12 ###". Strip the marker
    # characters so the rebuilt header is "### Article 12 ###" and not
    # "### Article 12 ### ###".
    header_num = lines[0].strip("# \t\r")
    header = f"### Article {header_num} ###"
    body = lines[1] if len(lines) > 1 else ""

    # Clean only the body: first URLs/emojis/Latin letters/navigation phrases,
    # then everything outside the allowed Urdu character set.
    body = remove_noise(body)
    body = remove_non_urdu(body)

    filtered_articles.append(header + "\n" + body.strip() + "\n\n")

with open("urdu_only_filtered.txt", "w", encoding="utf-8") as f:
    f.writelines(filtered_articles)

print("Noise removed and non-Urdu text filtered. Article headers preserved. File ready: urdu_only_filtered.txt")


✅ Noise removed and non-Urdu text filtered. Article headers preserved. File ready: urdu_only_filtered.txt


# **Sentence Segmentation**

In [26]:

with open("urdu_only_filtered.txt", "r", encoding="utf-8") as f:
    content = f.read()

split_articles = content.split("### Article ")
segmented_articles = []

for part in split_articles:
    if not part.strip():
        continue
    lines = part.split("\n", 1)
    # The header line still carries the closing marker (e.g. "12 ###"). Strip
    # every '#' and surrounding blank so the rebuilt header is exactly
    # "### Article 12 ###" instead of gaining an extra " ###" at each stage.
    header_num = lines[0].strip("# \t\r")
    header = f"### Article {header_num} ###"
    body = lines[1] if len(lines) > 1 else ""

    # One sentence per line: break after each Urdu full stop, question mark
    # or exclamation mark.
    body = re.sub(r'([۔؟!])', r'\1\n', body)

    # Collapse the blank lines created by the previous step.
    body = re.sub(r'\n+', '\n', body).strip()

    segmented_articles.append(header + "\n" + body + "\n\n")

with open("segmented.txt", "w", encoding="utf-8") as f:
    f.writelines(segmented_articles)

print("Sentence segmentation complete. File ready: segmented.txt")


✅ Sentence segmentation complete. File ready: segmented.txt


# **Whitespace and Formatting Normalization**

In [27]:

with open("segmented.txt", "r", encoding="utf-8") as f:
    content = f.read()

split_articles = content.split("### Article ")
normalized_articles = []

for part in split_articles:
    if not part.strip():
        continue

    lines = part.split("\n", 1)
    # The header line still carries the closing marker (e.g. "12 ###"). Strip
    # every '#' and surrounding blank so the rebuilt header is exactly
    # "### Article 12 ###" instead of gaining an extra " ###" at each stage.
    header_num = lines[0].strip("# \t\r")
    header = f"### Article {header_num} ###"
    body = lines[1] if len(lines) > 1 else ""

    # Collapse every run of horizontal whitespace (spaces, tabs, no-break
    # spaces, ...) into a single space; newlines are kept because they mark
    # sentence boundaries.
    body = re.sub(r'[^\S\n]+', ' ', body)
    # Trim each line, then drop the empty lines that trimming leaves behind.
    body = '\n'.join([line.strip() for line in body.split('\n')])
    body = re.sub(r'\n+', '\n', body).strip()

    normalized_articles.append(header + "\n" + body + "\n\n")

with open("normalized.txt", "w", encoding="utf-8") as f:
    f.writelines(normalized_articles)

print("Whitespace and formatting normalization complete. File ready: normalized.txt")


✅ Whitespace and formatting normalization complete. File ready: normalized.txt
