In [57]:
import requests
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
import json
import pandas as pd
import sqlite3
import hashlib
import pickle
from sentence_transformers import SentenceTransformer

In [58]:
sitemap_root = "https://www.dhakatribune.com/sitemap.xml"
needed_article = 3000

In [59]:
response = requests.get(sitemap_root)
if response.status_code == 200:
    sitemap_content = response.text
    root = ET.fromstring(sitemap_content)
    
    # Define namespace
    ns = {'sm': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
    
    # Extract all sitemap locations
    sitemaps = [loc.text for loc in root.findall('.//sm:loc', ns)]
    
    print("Loaded sitemap index with the following sub-sitemaps:")
    for sitemap in sitemaps:
        print(sitemap)
else:
    print(f"Failed to load sitemap: {response.status_code}")

Loaded sitemap index with the following sub-sitemaps:
https://www.dhakatribune.com/news-sitemap.xml
https://www.dhakatribune.com/page.xml
https://www.dhakatribune.com/tag.xml
https://www.dhakatribune.com/2013-04-01.xml
https://www.dhakatribune.com/2013-05-01.xml
https://www.dhakatribune.com/2013-06-01.xml
https://www.dhakatribune.com/2013-07-01.xml
https://www.dhakatribune.com/2013-08-01.xml
https://www.dhakatribune.com/2013-09-01.xml
https://www.dhakatribune.com/2013-10-01.xml
https://www.dhakatribune.com/2013-11-01.xml
https://www.dhakatribune.com/2013-12-01.xml
https://www.dhakatribune.com/2014-01-01.xml
https://www.dhakatribune.com/2014-02-01.xml
https://www.dhakatribune.com/2014-03-01.xml
https://www.dhakatribune.com/2014-04-01.xml
https://www.dhakatribune.com/2014-05-01.xml
https://www.dhakatribune.com/2014-06-01.xml
https://www.dhakatribune.com/2014-07-01.xml
https://www.dhakatribune.com/2014-08-01.xml
https://www.dhakatribune.com/2014-09-01.xml
https://www.dhakatribune.com/2014

In [60]:
# Fetch and parse the first sub-sitemap (news articles)
if 'sitemaps' in locals() and sitemaps:
    sub_sitemap_url = sitemaps[-1]  # e.g., news-sitemap.xml
    print(f"Fetching sub-sitemap: {sub_sitemap_url}")
    
    response = requests.get(sub_sitemap_url)
    if response.status_code == 200:
        sub_content = response.text
        sub_root = ET.fromstring(sub_content)
        
        # Extract all article URLs
        article_urls = [url.text for url in sub_root.findall('.//sm:loc', ns)]
        
        print(f"Loaded {len(article_urls)} article URLs from {sub_sitemap_url}")
        # Preview first 5
        for url in article_urls[:5]:
            print(url)
    else:
        print(f"Failed to load sub-sitemap: {response.status_code}")
else:
    print("No sitemaps loaded yet. Run the previous cell first.")

Fetching sub-sitemap: https://www.dhakatribune.com/2025-12-01.xml
Loaded 1758 article URLs from https://www.dhakatribune.com/2025-12-01.xml
https://www.dhakatribune.com/business/banks/397715/former-secretary-mohammad-ayub-mia-made-chairman
https://www.dhakatribune.com/bangladesh/court/397716/judgment-in-plot-scam-case-against-hasina-rehana
https://www.dhakatribune.com/bangladesh/politics/397717/jatiya-party-to-contest-polls-alone
https://www.dhakatribune.com/bangladesh/court/397718/security-beefed-up-ahead-of-verdict-in-hasina-plot
https://www.dhakatribune.com/bangladesh/397719/1-174-tourists-sail-to-saint-martin-as-season


In [61]:
# collect wanted number of articles

if 'sitemaps' in locals() and sitemaps:
    article_urls = []
    
    # start from the most recent sub-sitemaps and skip news, page, tag 
    monthly_sitemaps = [s for s in sitemaps if s.endswith('.xml') and '-' in s and s.count('-') == 2]
    monthly_sitemaps.sort(reverse=True)  # Sort descending to get latest first
    
    for sub_sitemap_url in monthly_sitemaps:
        if len(article_urls) >= needed_article:
            break
        print(f"Fetching sub-sitemap: {sub_sitemap_url}")
        
        response = requests.get(sub_sitemap_url)
        if response.status_code == 200:
            sub_content = response.text
            sub_root = ET.fromstring(sub_content)
            
            # Extract article URLs
            urls = [url.text for url in sub_root.findall('.//sm:loc', ns)][::-1]  # Reverse to get latest first within the sitemap
            article_urls.extend(urls)
            print(f"Added {len(urls)} URLs. Total so far: {len(article_urls)}")
        else:
            print(f"Failed to load {sub_sitemap_url}: {response.status_code}")
    
    # Trim to exactly needed_article if exceeded
    if len(article_urls) > needed_article:
        article_urls = article_urls[:needed_article]
    
    print(f"Collected {len(article_urls)} latest article URLs.")
    # Preview first 5
    for url in article_urls[:5]:
        print(url)
else:
    print("No sitemaps loaded yet. Run the previous cells first.")

Fetching sub-sitemap: https://www.dhakatribune.com/2025-12-01.xml
Added 1758 URLs. Total so far: 1758
Fetching sub-sitemap: https://www.dhakatribune.com/2025-11-01.xml
Added 2383 URLs. Total so far: 4141
Collected 3000 latest article URLs.
https://www.dhakatribune.com/bangladesh/399501/police-arrest-suspect-with-firearm-explosives-in
https://www.dhakatribune.com/bangladesh/election/399496/registration-deadline-on-%E2%80%98postal-vote-bd%E2%80%99-extended
https://www.dhakatribune.com/bangladesh/politics/399495/we-will-not-step-back-from-reform-even-at-the
https://www.dhakatribune.com/bangladesh/dhaka/399494/dmp-issues-detailed-traffic-guidelines-ahead-of
https://www.dhakatribune.com/bangladesh/399493/young-man-killed-in-bomb-explosion-in-mogbazar


In [62]:
# Fetch and parse an article from article_urls
if 'article_urls' in locals() and article_urls:
    url = article_urls[0]  # Fetch the first (latest) article
    print(f"Fetching article: {url}")
    
    response = requests.get(url)
    if response.status_code == 200:
        from bs4 import BeautifulSoup
        import json
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Extract data from JSON-LD for consistency
        article_data = {}
        json_lds = soup.find_all('script', type='application/ld+json')
        for json_ld in json_lds:
            try:
                data = json.loads(json_ld.string)
                if data.get('@type') == 'NewsArticle':
                    article_data = data
                    break
            except:
                pass
        
        # Extract all params except content from JSON-LD
        title = article_data.get('headline', 'Title not found')
        name = article_data.get('name', 'Name not found')
        keywords = article_data.get('keywords', '')
        description = article_data.get('description', 'Description not found')
        image_url = article_data.get('image', {}).get('url', 'Image not found') if isinstance(article_data.get('image'), dict) else 'Image not found'
        author_name = article_data.get('author', {}).get('name', 'Author not found') if isinstance(article_data.get('author'), dict) else 'Author not found'
        author_url = article_data.get('author', {}).get('url', '') if isinstance(article_data.get('author'), dict) else ''
        pub_date = article_data.get('datePublished', 'Date not found')
        mod_date = article_data.get('dateModified', 'Modified date not found')
        publisher_name = article_data.get('publisher', {}).get('name', 'Publisher not found') if isinstance(article_data.get('publisher'), dict) else 'Publisher not found'
        main_entity = article_data.get('mainEntityOfPage', '')
        
        print(f"Title: {title}")
        print(f"Name: {name}")
        print(f"Keywords: {keywords}")
        print(f"Description: {description}")
        print(f"Image URL: {image_url}")
        print(f"Author: {author_name}")
        print(f"Author URL: {author_url}")
        print(f"Publication Date: {pub_date}")
        print(f"Modified Date: {mod_date}")
        print(f"Publisher: {publisher_name}")
        print(f"Main Entity: {main_entity}")
        
        # Extract article content from HTML
        content_div = soup.find('div', itemprop='articleBody')
        content = content_div.get_text().strip() if content_div else "Content not found"
        print(f"Content Preview: {content[:500]}...")
        
        # Optional: Save full HTML or content to a file
        # with open('article.html', 'w') as f:
        #     f.write(response.text)
        
    else:
        print(f"Failed to fetch article: {response.status_code}")
else:
    print("No article URLs available. Run the collection cell first.")

Fetching article: https://www.dhakatribune.com/bangladesh/399501/police-arrest-suspect-with-firearm-explosives-in
Title: Police arrest suspect with firearm, explosives in Osman Hadi killing case
Name: Police arrest suspect with firearm, explosives in Osman Hadi killing case
Keywords: 
Description: Police have arrested one Himon Rahman Shikdar, an Adabar Thana Jubo League activist and an associate of Alamgir, the motorcyclist allegedly involved in the killing of Inqilab Mancha spokesperson and Dhaka-8 prospective candidate Sharif Osman Hadi. The arrest was made on...
Image URL: https://ecdn.dhakatribune.net/contents/cache/images/800x450x1/uploads/media/2025/12/19/77c2f1b252adf8d0419bf8be5d22722d-6944eab3b6703.jpeg?jadewits_media_id=56417
Author: UNB
Author URL: http://www.dhakatribune.com/author/unb1
Publication Date: 2025-12-24T22:58:00+06:00
Modified Date: 2025-12-24T22:58:00+06:00
Publisher: Dhaka Tribune
Main Entity: https://www.dhakatribune.com/bangladesh/399501/police-arrest-suspe

In [63]:
# Bulk fetch and parse n articles

articles_to_save = 10 
articles_data = []

urls_to_fetch = article_urls[:articles_to_save]

for url in urls_to_fetch:
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Extract from JSON-LD
        json_lds = soup.find_all('script', type='application/ld+json')
        data = None
        for json_ld in json_lds:
            try:
                d = json.loads(json_ld.string)
                if d.get('@type') == 'NewsArticle':
                    data = d
                    break
            except:
                pass
        
        if data:
            title = data['headline']
            pub_date = data['datePublished']
            
            # Extract content
            content_div = soup.find('div', itemprop='articleBody')
            content = content_div.get_text().strip() if content_div else ''
            
            articles_data.append({
                'title': title,
                'pub_date': pub_date,
                'link': url,
                'content': content
            })

# Save to CSV
df = pd.DataFrame(articles_data)
df.to_csv('dhakatribune_10_articles.csv', index=False)
print(f"Saved {len(articles_data)} articles")


Saved 10 articles


In [64]:
# Load the model
model_name = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
model = SentenceTransformer(model_name)

print(f"Loaded model: {model_name}")

Loaded model: sentence-transformers/paraphrase-multilingual-mpnet-base-v2


In [65]:
# Connect to DB and create table
conn = sqlite3.connect('articles.db')
cursor = conn.cursor()

cursor.execute('''
CREATE TABLE IF NOT EXISTS articles (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    hash TEXT UNIQUE,
    publisher TEXT,
    title TEXT,
    summary TEXT,
    link TEXT,
    published TEXT,
    language TEXT,
    embedding BLOB
)
''')

print("Table dropped created with UNIQUE constraint on hash")

Table dropped created with UNIQUE constraint on hash


In [66]:
# Check articles data
print(f"Number of articles: {len(articles_data)}")
df_articles = pd.DataFrame(articles_data)
df_articles.head()

Number of articles: 10


Unnamed: 0,title,pub_date,link,content
0,"Police arrest suspect with firearm, explosives...",2025-12-24T22:58:00+06:00,https://www.dhakatribune.com/bangladesh/399501...,"Police have arrested one Himon Rahman Shikdar,..."
1,Registration deadline on ‘Postal Vote BD’ exte...,2025-12-24T21:32:22+06:00,https://www.dhakatribune.com/bangladesh/electi...,The Election Commission (EC) has extended the ...
2,"'We will not step back from reform, even at th...",2025-12-24T21:28:51+06:00,https://www.dhakatribune.com/bangladesh/politi...,"Hasnat Abdullah, chief coordinator of the NCP’..."
3,DMP issues detailed traffic guidelines ahead o...,2025-12-24T21:20:49+06:00,https://www.dhakatribune.com/bangladesh/dhaka/...,Dhaka Metropolitan Police (DMP) has issued det...
4,Youth killed as crude bomb thrown from flyover...,2025-12-24T20:40:27+06:00,https://www.dhakatribune.com/bangladesh/399493...,"A youth named Siam, 21, was killed in a crude ..."


In [67]:
# Process and insert data
for article in articles_data:
    title = article['title']
    pub_date = article['pub_date']
    content = article['content']
    link = article['link']
    
    # Generate hash
    hash_value = hashlib.sha256((title + pub_date).encode()).hexdigest()
    
    # Encode content
    embedding = model.encode(content)
    embedding_blob = pickle.dumps(embedding.tolist())
    
    # Insert
    cursor.execute('''
    INSERT INTO articles (hash, publisher, title, summary, link, published, language, embedding)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?)
    ''', (hash_value, 'DhakaTribune', title, content, link, pub_date, 'English', embedding_blob))

conn.commit()
conn.close()

print("Data inserted into articles.db")

IntegrityError: UNIQUE constraint failed: articles.hash

In [None]:
# Check the table by loading into pandas
conn = sqlite3.connect('articles.db')
df_db = pd.read_sql_query("SELECT id, hash, publisher, title, summary, link, published, language FROM articles", conn)
conn.close()

print(f"Loaded {len(df_db)} rows from articles.db")
df_db

Loaded 10 rows from articles.db


Unnamed: 0,id,hash,publisher,title,summary,link,published,language
0,1,1683043d93ea2aefa9f78794bfdf3a8cdea8a0bb09d938...,DhakaTribune,"Police arrest suspect with firearm, explosives...","Police have arrested one Himon Rahman Shikdar,...",https://www.dhakatribune.com/bangladesh/399501...,2025-12-24T22:58:00+06:00,English
1,2,87c31ebed9f581928e04c7943002c585c9177cce4f9cfb...,DhakaTribune,Registration deadline on ‘Postal Vote BD’ exte...,The Election Commission (EC) has extended the ...,https://www.dhakatribune.com/bangladesh/electi...,2025-12-24T21:32:22+06:00,English
2,3,d053b31e868a111ab00014ce71b38216b54b1daaf0032d...,DhakaTribune,"'We will not step back from reform, even at th...","Hasnat Abdullah, chief coordinator of the NCP’...",https://www.dhakatribune.com/bangladesh/politi...,2025-12-24T21:28:51+06:00,English
3,4,51941bdff89a863702128cae648cdec31c7d732fb653b3...,DhakaTribune,DMP issues detailed traffic guidelines ahead o...,Dhaka Metropolitan Police (DMP) has issued det...,https://www.dhakatribune.com/bangladesh/dhaka/...,2025-12-24T21:20:49+06:00,English
4,5,de314a1f3937a2c2949f0cafcbc28320a89c23b35b4634...,DhakaTribune,Youth killed as crude bomb thrown from flyover...,"A youth named Siam, 21, was killed in a crude ...",https://www.dhakatribune.com/bangladesh/399493...,2025-12-24T20:40:27+06:00,English
5,6,09819fe0cc49aa1915bea0bb48c691a8af5983978509e9...,DhakaTribune,Advisory Council approves Smoking and Tobacco ...,The Advisory Council on Wednesday approved the...,https://www.dhakatribune.com/bangladesh/399492...,2025-12-24T20:09:59+06:00,English
6,7,7d958e193f602555d19e3cb367b9e083c57e647e3fc2f5...,DhakaTribune,Kosovo: Will elections break the political gri...,"On December 28, for the second time in just te...",https://www.dhakatribune.com/world/399491/koso...,2025-12-24T20:05:21+06:00,English
7,8,1637877f53428a410dcb469789591dfcd02628013ddb1e...,DhakaTribune,What interest do China and Russia have in Vene...,The actions of the US fleet in the Caribbean a...,https://www.dhakatribune.com/world/europe/3994...,2025-12-24T19:55:55+06:00,English
8,9,1f977b245df045af304077e4be7654feee2586a26934fa...,DhakaTribune,"‘No internet shutdown, even for a minute’ as T...","The Advisory Council on Wednesday gave final, ...",https://www.dhakatribune.com/bangladesh/govern...,2025-12-24T19:53:55+06:00,English
9,10,f84c7a21e70b7067709d366b13f80c4118416aa55a54fe...,DhakaTribune,"At UN, Russia and China criticize US conduct t...",Russia and China on Tuesday criticized the Uni...,https://www.dhakatribune.com/world/europe/3994...,2025-12-24T19:42:49+06:00,English
