# 03 – Unstructured Text → Daily Topic/Keyword Features (DHM Notices & News)
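**Goal:** Crawl the `/notice` and `/news` listings of Nepal's Department of Hydrology and Meteorology (DHM, `dhm.gov.np`), fetch each article's detail page, extract its date, title, and body text, build daily keyword-count features (English + Nepali keywords),
and merge them with the master daily dataset (2019–2023).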

Steps:
1. Crawl listing pages (notice/news) with pagination.
2. Collect article links, fetch detail pages, extract date/title/body.
3. Filter to 2019–2023, dedupe, save `text_corpus.csv`.
4. Keyword features → `topics_daily.csv`.
5. Optional LDA topics.
6. Merge with `master_kaligandaki_daily_withrain.csv` → `master_with_topics.csv`.

In [None]:
import os, re, time
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import dateutil.parser as dparser

# --- Crawl configuration ---
# Site root: listing URLs are built from it and relative hrefs are resolved against it.
BASE = 'https://www.dhm.gov.np'
# Listing sections to crawl; also used to recognise article links (hrefs containing '/<section>/').
LISTING_ENDPOINTS = ['notice', 'news']
# Number of listing pages requested per section (page=0 .. page=MAX_PAGES_PER_SECTION-1).
MAX_PAGES_PER_SECTION = 10
# HTTP headers sent by get_html.
UA = {'User-Agent': 'Mozilla/5.0'}
# Passed as `timeout` to requests.get (seconds).
TIMEOUT = 30
# Pause between fetches, in seconds, applied inside get_html.
DELAY = 1.0

# --- Input / output files ---
TEXT_CORPUS_CSV = 'text_corpus.csv'  # parsed articles (one row per article)
TOPICS_DAILY_CSV = 'topics_daily.csv'  # keyword features aggregated per day
MASTER_PATH = 'master_kaligandaki_daily_withrain.csv'  # existing daily master table (input)
MASTER_WITH_TOPICS = 'master_with_topics.csv'  # master table with the keyword features merged in
# Inclusive date window applied to article dates when building the corpus.
START_DATE = pd.Timestamp('2019-01-01')
END_DATE = pd.Timestamp('2023-12-31')

In [None]:
def get_html(url):
    """Fetch *url* and return the response body as text, or '' on failure.

    Best-effort by design: network errors and non-200 responses are reported
    with ``print`` and turned into an empty string so a long crawl keeps going.

    Args:
        url: Absolute URL to request.

    Returns:
        The decoded page text, or '' if the request failed, the status was
        not 200, or the body was empty.
    """
    try:
        r = requests.get(url, headers=UA, timeout=TIMEOUT)
        if r.status_code != 200:
            # Previously dropped silently, which made missing pages invisible.
            print('Error fetching', url, 'HTTP', r.status_code)
            return ''
        # When the Content-Type header carries no charset, requests falls back
        # to ISO-8859-1 for text/* responses, which garbles Devanagari text.
        # Let the response sniff its own encoding in that case.
        if 'charset' not in r.headers.get('Content-Type', '').lower():
            r.encoding = r.apparent_encoding
        return r.text or ''
    except Exception as e:
        # Boundary of the crawl: log and carry on rather than abort the run.
        print('Error fetching', url, e)
        return ''
    finally:
        # Throttle every request, including failed ones, so a run of error
        # pages does not hit the server back-to-back.
        time.sleep(DELAY)

def abs_url(href):
    """Resolve *href* against the site root; return None for an empty/missing href."""
    if not href:
        return None
    return urljoin(BASE+'/', href)

def safe_date(txt):
    """Best-effort parse of a date embedded in free text.

    Args:
        txt: Text that may contain a date; falsy values are treated as "no date".

    Returns:
        A ``datetime.date`` if the fuzzy, day-first parse succeeds, else None.
    """
    if not txt:
        return None
    try:
        # NOTE(review): with fuzzy=True dateutil may build a date from partial
        # tokens (filling missing fields from its default) -- confirm that the
        # resulting dates are trustworthy for long inputs.
        return dparser.parse(txt, fuzzy=True, dayfirst=True).date()
    except Exception:
        # Any parse failure means "no date". `Exception` rather than a bare
        # `except` so Ctrl-C / SystemExit still interrupt a long crawl.
        return None

In [None]:
# Crawl listings: walk every listing page of every section and collect the
# absolute URLs of anchors whose href contains '/<section>/'.
article_urls = set()
link_markers = [f'/{s}/' for s in LISTING_ENDPOINTS]
for section in LISTING_ENDPOINTS:
    for p in range(MAX_PAGES_PER_SECTION):
        html = get_html(f"{BASE}/{section}?page={p}")
        if html:
            soup = BeautifulSoup(html, 'html.parser')
            hrefs = (a.get('href', '') for a in soup.find_all('a'))
            article_urls.update(
                abs_url(h) for h in hrefs if any(m in h for m in link_markers)
            )
            print(section, 'page', p, '->', len(article_urls), 'urls so far')

# Keep only URLs that actually carry a host component.
article_urls = set(filter(lambda u: urlparse(u).netloc, article_urls))
print('Total collected URLs:', len(article_urls))

In [None]:
# CSS selectors that parse_article tries, in list order, to locate the publication date.
DATE_HINT_SEL = ['time','.date','.post-date','.posted-on','.views-field-created','.field--name-created']
# CSS selectors that parse_article tries, in list order, to locate the article body.
BODY_SEL = ['.field--name-body','article','.content','.entry-content','.node__content']

def parse_article(url):
    """Download one article page and extract its date, title and body text.

    Args:
        url: Absolute URL of the article detail page.

    Returns:
        None if the page could not be fetched; otherwise a dict with keys
        ``date`` (pandas Timestamp, or NaT when no date was found),
        ``source`` (always 'DHM'), ``title``, ``text`` and ``url``.
    """
    html = get_html(url)
    if not html:
        return None
    soup = BeautifulSoup(html, 'html.parser')

    # Title: the first h1/h2/h3 found in the document.
    heading = soup.find(['h1', 'h2', 'h3'])
    title = heading.get_text(' ', strip=True) if heading else None

    # Date: first date-hint selector whose `datetime` attribute or text parses,
    # then the title, then (last resort) the text of the whole page.
    dt = None
    for sel in DATE_HINT_SEL:
        node = soup.select_one(sel)
        if node:
            dt = safe_date(node.get('datetime') or node.get_text(' ', strip=True))
            if dt:
                break
    if not dt and title:
        dt = safe_date(title)
    if not dt:
        # NOTE(review): fuzzy-parsing the entire page is a weak signal; confirm
        # that dates obtained this way are acceptable.
        dt = safe_date(soup.get_text(' ', strip=True))

    # Body: first selector yielding more than 40 characters. A shorter match
    # is discarded rather than kept, so that the whole-page fallback below
    # really applies when no selector produced a usable body.
    body = None
    for sel in BODY_SEL:
        node = soup.select_one(sel)
        if node:
            candidate = node.get_text(' ', strip=True)
            if len(candidate) > 40:
                body = candidate
                break
    if not body:
        body = soup.get_text(' ', strip=True)

    return {'date': pd.to_datetime(dt, errors='coerce'), 'source': 'DHM', 'title': title, 'text': body, 'url': url}

In [None]:
# Parse every collected article and keep those with a usable date.
records = []
for n_done, url in enumerate(sorted(article_urls), start=1):
    rec = parse_article(url)
    if rec and pd.notna(rec['date']):
        records.append(rec)
    if n_done % 25 == 0:
        print('Parsed', n_done)

# Explicit columns so that an empty crawl still yields a well-formed frame
# (and a CSV with a header) instead of a column-less DataFrame.
corpus_df = pd.DataFrame(records, columns=['date', 'source', 'title', 'text', 'url'])
if not corpus_df.empty:
    corpus_df['date'] = pd.to_datetime(corpus_df['date']).dt.normalize()
    # Inclusive window [START_DATE, END_DATE].
    mask = (corpus_df['date'] >= START_DATE) & (corpus_df['date'] <= END_DATE)
    # URLs are already unique (article_urls is a set), so including 'url' in
    # the key made de-duplication a no-op. Key on the content instead so the
    # same article reachable under two URLs is counted once.
    corpus_df = corpus_df.loc[mask].drop_duplicates(subset=['date', 'title', 'text'])

corpus_df.to_csv(TEXT_CORPUS_CSV, index=False)
print('Saved corpus', TEXT_CORPUS_CSV, 'rows=', len(corpus_df))
corpus_df.head()

In [None]:
# Topic -> list of trigger terms (English and Nepali). Matching in kw_counts is
# plain case-insensitive substring counting, so a term is also counted when it
# occurs inside a longer word (e.g. 'import' inside 'important').
KEYWORDS = {
    'maintenance': [
        'maintenance', 'overhaul', 'shutdown', 'servicing', 'repair',
        'सम्भार', 'मर्मत', 'बन्द',
    ],
    'outage': [
        'outage', 'blackout', 'interruption', 'load shedding', 'trip', 'fault',
        'विद्युत अवरोध', 'लोडसेडिङ', 'बत्ती बन्द',
    ],
    'flood': [
        'flood', 'high flow', 'inundation', 'alert', 'warning', 'watch',
        'बाढी', 'पहिरो', 'सूचना', 'चेतावनी',
    ],
    'policy': [
        'policy', 'tariff', 'import', 'export', 'regulation', 'curtail',
        'नीति', 'दर', 'आयात', 'निर्यात', 'विनियमन',
    ],
    'weather': [
        'heavy rain', 'thunder', 'storm', 'monsoon', 'precipitation',
        'हावाहुरी', 'मुसलधारे', 'मेघगर्जन', 'मौसम',
    ],
}


def kw_counts(text):
    """Count keyword occurrences per topic in *text*.

    Args:
        text: Text to scan; None or '' is treated as empty.

    Returns:
        Dict mapping every topic in KEYWORDS to the total number of
        (non-overlapping, case-insensitive) substring hits of its terms.
    """
    haystack = text.lower() if text else ''
    return {
        topic: sum(haystack.count(term.lower()) for term in terms)
        for topic, terms in KEYWORDS.items()
    }

# Per-article keyword counts, then one row per day.
topic_cols = list(KEYWORDS)
kk = []
for _, r in corpus_df.iterrows():
    # Only string fields take part; a missing title/text (None or NaN) must
    # not break the concatenation.
    parts = [v for v in (r.get('title'), r.get('text')) if isinstance(v, str)]
    counts = kw_counts(' '.join(parts))
    counts['date'] = r['date']
    kk.append(counts)

# Explicit columns so an empty corpus still produces a frame with a 'date'
# column (needed by the later merge) and all topic columns.
daily_kw = pd.DataFrame(kk, columns=['date', *topic_cols])
daily_kw['date'] = pd.to_datetime(daily_kw['date']).dt.normalize()
daily_kw[topic_cols] = daily_kw[topic_cols].astype(int)
if not daily_kw.empty:
    daily_kw = daily_kw.groupby('date', as_index=False)[topic_cols].sum()
# Flags are derived AFTER aggregation so they stay 0/1 per day; summing
# per-article flags turned them into article counts.
for k in topic_cols:
    daily_kw[f'{k}_flag'] = (daily_kw[k] > 0).astype(int)

daily_kw.to_csv(TOPICS_DAILY_CSV, index=False)
print('Saved', TOPICS_DAILY_CSV, 'rows=', len(daily_kw))
daily_kw.head()

In [None]:
# Left-join the daily keyword features onto the master table; days without any
# article get 0 in every column that the join added.
master = pd.read_csv(MASTER_PATH, parse_dates=['date'])
out = master.merge(daily_kw, how='left', on='date')
added_cols = [c for c in out.columns if c not in master.columns and c != 'date']
for c in added_cols:
    out[c] = out[c].fillna(0)
out.to_csv(MASTER_WITH_TOPICS, index=False)
print('Saved', MASTER_WITH_TOPICS, 'rows=', len(out), 'cols=', len(out.columns))
out.head()