<a href="https://colab.research.google.com/github/ahmedrana603/NLP-Language-Modeling-for-Urdu-News-Articles/blob/main/Language_Modeling_for_Urdu_News_Articles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Importing Libraries**

In [54]:
import requests
from bs4 import BeautifulSoup
import json
import time


# **Base URL**

In [55]:
# Entry point of the crawl: the BBC Urdu topic listing that is paginated
# with a ?page=N query parameter by the link-collection cell.
base_url = "https://www.bbc.com/urdu/topics/cjgn7n9zzq7t"

# Shared accumulators filled by the cells below.
article_links = set()                 # unique article URLs
raw_articles, metadata_list = [], []  # (article_id, body) tuples / metadata dicts

# **Collecting Article Links**

In [56]:

# Walk the paginated topic listing and gather unique article URLs.
MAX_ARTICLES = 270    # stop as soon as this many distinct links are known
MAX_PAGES = 49        # listing pages 1..49 are probed at most
REQUEST_TIMEOUT = 15  # seconds; requests.get waits indefinitely without one

# Local accumulators keep this cell re-runnable: `article_links` is rebound
# to a list at the end, so calling .add() on it would fail on a second run.
seen_links = set()
ordered_links = []    # discovery order, so the selected links are reproducible

for page in range(1, MAX_PAGES + 1):
    url = f"{base_url}?page={page}"
    try:
        res = requests.get(url, timeout=REQUEST_TIMEOUT)
        res.raise_for_status()
    except requests.RequestException as exc:
        # Best effort: one unreachable listing page should not abort the crawl.
        print(f"Skipping listing page {page}: {exc}")
        continue

    soup = BeautifulSoup(res.text, "html.parser")

    for a in soup.select("h2 a[href*='/urdu/articles/']"):
        href = a["href"]
        # Site-relative links are turned into absolute URLs.
        if href.startswith("/"):
            href = "https://www.bbc.com" + href
        if href not in seen_links:
            seen_links.add(href)
            ordered_links.append(href)

    if len(ordered_links) >= MAX_ARTICLES:
        break

    time.sleep(0.5)  # brief pause between listing requests

# Slicing an ordered list (rather than an unordered set) keeps the chosen
# links, and therefore the article ids derived from them, stable across runs.
article_links = ordered_links[:MAX_ARTICLES]


# **Scraping Articles**

In [57]:
# Download every collected article and extract its metadata and body text.
REQUEST_TIMEOUT = 15  # seconds; requests.get waits indefinitely without one
CAPTION_MARKER = "،تصویر کا ذریعہ"  # substring found in image-source captions

# Start from empty lists so re-running this cell does not append a second
# copy of every article to the results.
raw_articles = []
metadata_list = []
failed_links = []  # URLs that could not be downloaded


def _text_or_default(tag, default):
    """Return the stripped text of ``tag``, or ``default`` when ``tag`` is None."""
    return tag.get_text(strip=True) if tag else default


def _filtered_paragraphs(container):
    """Return non-empty <p> texts under ``container``, minus copyright and caption lines."""
    kept = []
    for p in container.find_all("p"):
        text = p.get_text(strip=True)
        if not text or text.startswith("©") or CAPTION_MARKER in text:
            continue
        kept.append(text)
    return kept


def _extract_body_paragraphs(soup):
    """Return the article's paragraphs, trying three page layouts in order.

    1. <p> tags inside the <article> element.
    2. <p> tags inside <div>s whose class contains "RichTextComponentWrapper".
    3. <p> tags longer than 5 characters inside <div dir="rtl"> elements.

    A later layout is tried only when the earlier ones produced no text.
    """
    body_paragraphs = []

    article_tag = soup.find("article")
    if article_tag:
        body_paragraphs = _filtered_paragraphs(article_tag)

    if not body_paragraphs:
        for div in soup.find_all("div", class_=lambda x: x and "RichTextComponentWrapper" in x):
            body_paragraphs.extend(_filtered_paragraphs(div))

    if not body_paragraphs:
        for div in soup.find_all("div", {"dir": "rtl"}):
            for p in div.find_all("p"):
                text = p.get_text(strip=True)
                if len(text) > 5:
                    body_paragraphs.append(text)

    return body_paragraphs


for idx, link in enumerate(article_links, 1):
    try:
        res = requests.get(link, timeout=REQUEST_TIMEOUT)
        res.raise_for_status()
    except requests.RequestException as exc:
        # Best effort: skip this article instead of losing the whole crawl.
        # Its id is left unused so ids still match positions in article_links.
        print(f"Skipping article {idx} ({link}): {exc}")
        failed_links.append(link)
        time.sleep(0.5)
        continue

    # Parse the raw bytes so BeautifulSoup can pick up the charset declared in
    # the document; res.text may fall back to ISO-8859-1 when the HTTP header
    # carries no charset, which would garble Urdu text.
    soup = BeautifulSoup(res.content, "html.parser")

    title = _text_or_default(soup.find("h1", class_="article-heading"), "No title found")
    date = _text_or_default(soup.find("time"), "No date found")
    author = _text_or_default(soup.find("span", class_="byline__name"), "BBC Urdu")
    # NOTE(review): these look like generated CSS class names and may change
    # between site builds - confirm the selector still matches.
    category = _text_or_default(soup.find("a", class_="bbc-1f2hn8h e1hk9ate4"), "Unknown")

    body = "\n".join(_extract_body_paragraphs(soup)).strip()

    raw_articles.append((idx, body))
    metadata_list.append({
        "article_id": idx,
        "title": title,
        "url": link,
        "category": category,
        "date": date,
        "author": author
    })

    time.sleep(0.5)  # politeness delay between article requests

if failed_links:
    print(f"{len(failed_links)} of {len(article_links)} articles could not be downloaded.")


# **Txt File**

In [58]:
# Dump all scraped bodies into one UTF-8 text file. Each article is written
# as a "### Article <id> ###" header line, its body, and a blank line.
with open("raw.txt", "w", encoding="utf-8") as raw_file:
    for idx, body in raw_articles:
        raw_file.writelines([f"### Article {idx} ###\n", body + "\n\n"])


# **Metadata JSON file**

In [59]:
# Serialise the per-article metadata as indented JSON. ensure_ascii=False
# keeps Urdu text readable in the file instead of \uXXXX escapes.
metadata_json = json.dumps(metadata_list, ensure_ascii=False, indent=2)
with open("metadata.json", "w", encoding="utf-8") as metadata_file:
    metadata_file.write(metadata_json)
