# 03 – Unstructured Text → Daily Topic/Keyword Features

**Goal:** Collect and transform NEA notices & DHM bulletins/forecasts into daily signals for the BN/GP models.

**Outputs:**
- `text_corpus.csv` – parsed items (only those with a recognisable date) with `date, source, title, text, url`
- `topics_daily.csv` – daily features from keywords (and optional LDA topics)
- `master_with_topics.csv` – join with your master/feature table by date

**Notes:**
- Requires internet for live scraping. If offline, place saved HTML into `text_raw/` then run parsing cells.
- Start with **keyword flags** (maintenance/outage/flood/policy/weather). Optional: LDA if `scikit-learn` is available.


In [None]:
# ==== 0. Imports & Config ====
import os, re, json, time
from pathlib import Path
import pandas as pd

# Folder holding the downloaded (or manually saved) HTML pages.
RAW_DIR = Path("text_raw")
RAW_DIR.mkdir(exist_ok=True)

# Listing pages to fetch, one list per source.
NEA_NOTICE_URLS = [
    "https://www.nea.org.np/notice",
    "https://www.nea.org.np/notifications",
]
DHM_URLS = [
    "https://www.dhm.gov.np/bulletins",
    "https://www.dhm.gov.np/meteorology-forecast/4",
]

# Input/output file names used by the later cells.
TEXT_CORPUS_CSV = "text_corpus.csv"
TOPICS_DAILY_CSV = "topics_daily.csv"
MASTER_PATH = "master_kaligandaki_daily_withrain.csv"  # or features_daily.csv
MASTER_WITH_TOPICS = "master_with_topics.csv"


## 1) Fetch pages (live) or use saved HTML
If offline, skip fetching and put HTML files into `text_raw/`.

In [None]:
# ==== 1. (Optional) Fetch ====
# Live fetching is only possible when `requests` can be imported; any import
# failure switches the notebook to offline mode instead of stopping it.
try:
    import requests
except Exception:
    LIVE = False
else:
    LIVE = True

def fetch_and_save(url, outname):
    """Download ``url`` and store the response text as ``RAW_DIR/outname``.

    Any exception raised along the way (request failure, HTTP error status,
    write failure) is reported with ``print`` rather than propagated, so one
    failing page does not stop the caller.
    """
    try:
        response = requests.get(url, timeout=20)
        # Turn HTTP error statuses into exceptions so error pages are not saved.
        response.raise_for_status()
        destination = RAW_DIR / outname
        destination.write_text(response.text, encoding='utf-8')
        print('Saved', outname)
    except Exception as err:
        print('Fetch failed for', url, err)

# Fetch every configured page; file names are `<prefix>_<index>.html`, which
# is what the parsing cell globs for.
if not LIVE:
    print('Offline mode: ensure RAW_DIR contains saved HTML files.')
else:
    for prefix, urls in (('nea', NEA_NOTICE_URLS), ('dhm', DHM_URLS)):
        for idx, page_url in enumerate(urls):
            fetch_and_save(page_url, f'{prefix}_{idx}.html')


## 2) Parse HTML → corpus
Flexible parsing with BeautifulSoup. Adjust selectors if sites change.

In [None]:
# ==== 2. Parse HTML into a corpus ====
from bs4 import BeautifulSoup
from datetime import datetime

# Date-like substrings: "12 Jan 2024" / "12 January 2024", "2024-01-12", "12/01/2024".
_DATE_PATTERN = re.compile(
    r'(\d{1,2}\s+[A-Za-z]{3,9}\s+\d{4}|\d{4}-\d{2}-\d{2}|\d{2}/\d{2}/\d{4})'
)
# Formats tried, in order, on each date-like substring.
_DATE_FORMATS = ('%d %b %Y', '%d %B %Y', '%Y-%m-%d', '%d/%m/%Y')


def _first_parsable_date(text):
    """Return the first date-like substring of ``text`` that parses, else ``None``."""
    for match in _DATE_PATTERN.finditer(text):
        raw = match.group(0)
        for fmt in _DATE_FORMATS:
            try:
                return datetime.strptime(raw, fmt).date()
            except ValueError:
                # Wrong format for this candidate; try the next one.
                continue
    return None


def parse_items_from_html(html_text, source):
    """Extract notice-like items from one HTML page.

    Every element matching the card selector whose visible text is at least
    40 characters long becomes one item.

    Args:
        html_text: HTML document as a string.
        source: Label stored in each item's ``source`` field.

    Returns:
        A list of dicts with keys ``date``, ``source``, ``title``, ``text``
        and ``url``. ``date`` is a ``datetime.date`` or ``None`` when no
        date-like substring of the text could be parsed. ``title`` is the
        first h1/h2/h3 text, falling back to the first 80 characters of the
        item text. ``url`` is the raw ``href`` of the first link (not
        resolved against any base URL) or ``None``.
    """
    soup = BeautifulSoup(html_text, 'html.parser')
    items = []
    # NOTE(review): the selector can match both a container and elements
    # nested inside it, in which case the nested text is emitted more than
    # once -- confirm against the real pages and tighten the selector if so.
    for card in soup.select('article, .card, li, .list-item, .post, .notice, .news-item'):
        text = card.get_text(' ', strip=True)
        if len(text) < 40:
            # Too short to be a real notice (menu entries, buttons, ...).
            continue
        heading = card.find(['h1', 'h2', 'h3'])
        title = heading.get_text(' ', strip=True) if heading else text[:80]
        link = card.find('a')
        url = link['href'] if link and link.has_attr('href') else None
        items.append({
            # Scan all date candidates: the first regex hit may be a false
            # positive such as "12 units 2024" that no format can parse.
            'date': _first_parsable_date(text),
            'source': source,
            'title': title,
            'text': text,
            'url': url,
        })
    return items

# Parse every saved page; the file-name prefix decides the source label.
corpus = []
for pattern, label in (('nea_*.html', 'NEA'), ('dhm_*.html', 'DHM')):
    for p in RAW_DIR.glob(pattern):
        corpus += parse_items_from_html(p.read_text(encoding='utf-8', errors='ignore'), source=label)

# Explicit columns keep the frame well-formed even when nothing was parsed;
# without them an empty corpus has no 'date' column and the next line fails.
corpus_df = pd.DataFrame(corpus, columns=['date', 'source', 'title', 'text', 'url'])
corpus_df['date'] = pd.to_datetime(corpus_df['date'], errors='coerce')
# Items without a usable date cannot be assigned to a day, so drop them.
corpus_df = corpus_df.dropna(subset=['date']).sort_values('date')
if corpus_df.empty:
    print('Warning: no dated items parsed; check the files in RAW_DIR and the selectors.')
corpus_df.to_csv(TEXT_CORPUS_CSV, index=False)
print('Saved:', TEXT_CORPUS_CSV, '| rows=', len(corpus_df))
corpus_df.head()


## 3) Keyword features (fast & robust)
Compute daily counts and binary flags for important keywords.

In [None]:
# ==== 3. Keywords → daily features ====
# Feature name -> substrings whose occurrences are counted for that feature.
# Entries are lower-case because the text is lower-cased before counting.
KEYWORDS = {
    "maintenance": ["maintenance", "overhaul", "shutdown", "servicing", "repair"],
    "outage": ["outage", "blackout", "interruption", "load shedding", "trip", "fault"],
    "flood": ["flood", "high flow", "inundation", "alert", "warning", "watch"],
    "policy": ["policy", "tariff", "import", "export", "regulation", "curtail"],
    "weather": ["heavy rain", "thunder", "storm", "monsoon", "precipitation"],
}

def keyword_counts(text: str, keywords: dict):
    """Count case-insensitive keyword occurrences per category.

    Matching is plain substring counting (``str.count``), so a keyword such
    as ``'import'`` is also counted inside ``'important'``.

    Args:
        text: Text to scan. ``None`` or any non-string value (for example a
            NaN from pandas) is treated as empty text.
        keywords: Mapping of category name to an iterable of keyword strings.

    Returns:
        A dict with the same keys as ``keywords``; each value is the total
        number of non-overlapping occurrences of that category's keywords.
    """
    haystack = text.lower() if isinstance(text, str) else ''
    totals = {}
    for category, words in keywords.items():
        # Lower-case the keywords too, otherwise a mixed-case keyword could
        # never match the lower-cased text. Empty keywords are skipped
        # because ``str.count('')`` returns ``len(text) + 1``.
        totals[category] = sum(haystack.count(w.lower()) for w in words if w)
    return totals

# Count keyword hits per corpus item, then aggregate to one row per day.
keyword_cols = list(KEYWORDS)
item_counts = []
for _, row in corpus_df.iterrows():
    counts = keyword_counts((row['title'] or '') + ' ' + (row['text'] or ''), KEYWORDS)
    counts['date'] = row['date'].date()
    item_counts.append(counts)

# Explicit columns keep the frame well-formed when the corpus is empty;
# without them there is no 'date' column to convert or group by.
kdf = pd.DataFrame(item_counts, columns=keyword_cols + ['date'])
kdf[keyword_cols] = kdf[keyword_cols].astype(int)
kdf['date'] = pd.to_datetime(kdf['date'])
daily_kw = kdf.groupby('date', as_index=False).sum()
# Binary flag per category: 1 when the category was mentioned at all that day.
for k in keyword_cols:
    daily_kw[f'{k}_flag'] = (daily_kw[k] > 0).astype(int)

daily_kw.to_csv(TOPICS_DAILY_CSV, index=False)
print('Saved:', TOPICS_DAILY_CSV, '| rows=', len(daily_kw))
daily_kw.head()


## 4) (Optional) LDA topics
If `scikit-learn` is installed and the corpus has more than 10 documents, fit LDA to generate topic proportions per day; otherwise this step is skipped.

In [None]:
# ==== 4. Optional: LDA ====
# scikit-learn is optional; any import failure simply disables the LDA step.
try:
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.decomposition import LatentDirichletAllocation
    SK_OK = True
except Exception:
    SK_OK = False

if SK_OK and len(corpus_df) > 10:
    # One document per corpus item: title followed by body text.
    docs = (corpus_df['title'].fillna('') + ' ' + corpus_df['text'].fillna('')).tolist()
    vec = CountVectorizer(max_features=5000, stop_words='english')
    X = vec.fit_transform(docs)
    lda = LatentDirichletAllocation(n_components=6, max_iter=20, random_state=42)
    Theta = lda.fit_transform(X)
    topics_df = pd.DataFrame(Theta, columns=[f'topic_{i}' for i in range(Theta.shape[1])])
    # `.values` assigns by position: corpus_df keeps its pre-sort index while
    # topics_df has a fresh one, so index alignment would scramble the dates.
    topics_df['date'] = corpus_df['date'].dt.normalize().values
    # Group by the column label so 'date' is only the key. Grouping by a
    # separately derived Series named 'date' can leave 'date' in the data as
    # well, and reset_index() then fails because the name already exists.
    daily_topics = topics_df.groupby('date', as_index=False).mean()
else:
    daily_topics = pd.DataFrame()

daily_topics.head() if not daily_topics.empty else print('LDA skipped (no sklearn or too few docs).')


## 5) Merge with master
Join keyword/topic features to your master table by `date`.

In [None]:
# ==== 5. Merge ====
# Left-join the daily text features onto the master table so every master
# row is kept; days without any text item get NaN in the feature columns.
master = pd.read_csv(MASTER_PATH, parse_dates=['date'])
daily_features = pd.read_csv(TOPICS_DAILY_CSV, parse_dates=['date'])
# No-op when the column is already datetime; guards against the join key
# being read back as plain objects, which would make the merge fail.
daily_features['date'] = pd.to_datetime(daily_features['date'])
out = master.merge(daily_features, on='date', how='left')
if not daily_topics.empty:
    out = out.merge(daily_topics, on='date', how='left')
# Use the configured output name instead of repeating the literal.
out.to_csv(MASTER_WITH_TOPICS, index=False)
print('Saved:', MASTER_WITH_TOPICS, '| rows=', len(out))
out.head()
