# 03_text_topics_DHM_PDF_OCR
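Notebook that crawls DHM (Department of Hydrology and Meteorology, Nepal) notice PDFs, extracts their text (with an OCR fallback for scanned files), builds daily keyword-count features, and merges them into the master daily dataset.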

### Cell 1 — Setup & configuration

In [None]:

import os, re, time, shutil
from pathlib import Path
from urllib.parse import urljoin
import pandas as pd
import requests
from bs4 import BeautifulSoup
import dateutil.parser as dparser

# --- Where the notices live -------------------------------------------------
BASE = "https://www.dhm.gov.np"
NOTICE_ROOT = f"{BASE}/notice"

# --- HTTP behaviour ---------------------------------------------------------
# UA is sent as the request headers; TIMEOUT is the per-request timeout in
# seconds; REQUEST_DELAY_SEC is the pause between consecutive requests.
UA = {"User-Agent": "Mozilla/5.0"}
TIMEOUT = 30
REQUEST_DELAY_SEC = 0.8

# --- Date window kept in the corpus (both ends inclusive) --------------------
START_DATE = pd.Timestamp("2019-01-01")
END_DATE = pd.Timestamp("2023-12-31")

# --- Working directories (created on first run) ------------------------------
RAW_DIR = Path("text_raw")
RAW_DIR.mkdir(exist_ok=True)
PDF_DIR = RAW_DIR / "pdfs"
PDF_DIR.mkdir(exist_ok=True)

# --- Input / output files ----------------------------------------------------
TEXT_CORPUS_CSV = "text_corpus.csv"
TOPICS_DAILY_CSV = "topics_daily.csv"
MASTER_PATH = "master_kaligandaki_daily_withrain.csv"
MASTER_WITH_TOPICS = "master_with_topics.csv"

print("Dirs:", RAW_DIR, PDF_DIR)
print("Outputs:", TEXT_CORPUS_CSV, TOPICS_DAILY_CSV, MASTER_WITH_TOPICS)


### Cell 2 — Helpers

In [None]:

def get_html(url: str, sleep: float = REQUEST_DELAY_SEC):
    """Fetch *url* and return the response body, or "" after three failed tries.

    A response is accepted only when the status is 200 and the body is longer
    than 200 characters. On success the function pauses for *sleep* seconds
    before returning; after every unsuccessful attempt it pauses for one second.
    """
    for attempt_no in (1, 2, 3):
        try:
            resp = requests.get(url, headers=UA, timeout=TIMEOUT)
            if resp.status_code != 200 or len(resp.text) <= 200:
                print(f"[warn] status={resp.status_code} len={len(resp.text)} url={url}")
            else:
                # Politeness delay before the caller issues its next request.
                time.sleep(sleep)
                return resp.text
        except Exception as e:
            print(f"[retry {attempt_no}/3] GET failed: {url} -> {e}")
        time.sleep(1.0)
    return ""

def abs_url(href: str):
    """Resolve *href* against the site root; return None for an empty href."""
    if not href:
        return None
    return urljoin(BASE + "/", href)

def safe_date_from_text(txt: str):
    """Return the calendar date mentioned in *txt*, or None.

    The text is parsed fuzzily and day-first with dateutil. None is returned
    when *txt* is empty, when parsing fails, or when the text does not spell
    out a complete date (year, month and day).

    Args:
        txt: Free text such as a notice title or body.

    Returns:
        A ``datetime.date`` or None.
    """
    if not txt:
        return None

    from datetime import datetime  # local import: only this helper needs it

    # dateutil fills every component that is missing from the text with the
    # matching field of `default`, which is *today* unless overridden. A title
    # such as "Bulletin 3" would therefore come back as the 3rd of the current
    # month. Parsing with two defaults that differ in year, month and day
    # exposes this: a complete date yields the same result twice, a partial
    # one does not. Days stay <= 28 so they are valid in every month.
    probe_a = datetime(2000, 1, 1)
    probe_b = datetime(2001, 2, 2)
    try:
        first = dparser.parse(txt, fuzzy=True, dayfirst=True, default=probe_a).date()
        second = dparser.parse(txt, fuzzy=True, dayfirst=True, default=probe_b).date()
    except Exception:
        # Best effort: any parser failure simply means "no usable date here".
        return None
    return first if first == second else None

def notice_listing_urls(max_pages=30):
    """Yield listing-page URLs for pages 0 .. max_pages-1.

    Each page number is produced twice: once without and once with a slash
    before the query string.
    """
    for page in range(max_pages):
        for slash in ("", "/"):
            yield f"{NOTICE_ROOT}{slash}?page={page}"


### Cell 3 — Crawl listings and collect PDF links

In [None]:

# Walk every candidate listing page and gather the absolute URL of each
# anchor whose href mentions "pdf" (case-insensitive).
pdf_links = set()

for listing_url in notice_listing_urls(max_pages=30):
    page_html = get_html(listing_url)
    if page_html:
        anchors = BeautifulSoup(page_html, "html.parser").find_all("a")
        for anchor in anchors:
            target = (anchor.get("href") or "").strip()
            # An href ending in ".pdf" necessarily contains "pdf", so a single
            # substring test covers both cases (and rejects empty hrefs).
            if "pdf" not in target.lower():
                continue
            resolved = abs_url(target)
            if resolved and resolved.startswith("http"):
                pdf_links.add(resolved)
        print("[list] collected so far:", len(pdf_links))

# Freeze the set into a sorted list so later cells iterate in a stable order.
pdf_links = sorted(pdf_links)
print("Total PDF links found:", len(pdf_links))


### Cell 4 — Download PDFs

In [None]:

def filename_from_url(u: str):
    """Derive a safe local ``*.pdf`` file name from a download URL.

    The last path segment is used, with any query string or fragment removed
    and every run of characters outside ``[A-Za-z0-9_.-]`` replaced by ``_``.
    The result always ends in a lowercase ``.pdf``: links are collected on
    "pdf" appearing anywhere in the href, and files are later picked up with
    ``glob("*.pdf")``, so a name without that exact suffix would be downloaded
    but never processed.

    Args:
        u: Absolute URL of the document.

    Returns:
        A file name (no directory part); ``"notice.pdf"`` when the URL has no
        last path segment.
    """
    # Drop the fragment and query first so a "/" inside them cannot be
    # mistaken for a path separator.
    path_part = u.partition("#")[0].partition("?")[0]
    name = path_part.rsplit("/", 1)[-1]
    name = re.sub(r"[^A-Za-z0-9_.-]+", "_", name or "notice.pdf")

    stem, dot, ext = name.rpartition(".")
    if dot and ext.lower() == "pdf":
        # Normalise ".PDF" / ".Pdf" to ".pdf".
        return f"{stem}.pdf"
    return f"{name}.pdf"

# Download every collected link into PDF_DIR. Files already on disk (larger
# than 500 bytes) are reused without a request; responses of 500 bytes or less
# or with a non-200 status are skipped.
downloaded = []
total_links = len(pdf_links)

for idx, link in enumerate(pdf_links, 1):
    target = PDF_DIR / filename_from_url(link)

    already_there = target.exists() and target.stat().st_size > 500
    if already_there:
        downloaded.append(target)
        continue

    try:
        resp = requests.get(link, headers=UA, timeout=TIMEOUT)
        payload = resp.content
        size = len(payload) if payload else 0
        if resp.status_code == 200 and size > 500:
            target.write_bytes(payload)
            downloaded.append(target)
            print(f"[pdf] {idx}/{total_links} saved ->", target.name)
        else:
            print(f"[skip] {link} (status={resp.status_code}, size={size})")
    except Exception as e:
        print(f"[error] {link} ->", e)

    # Pause only after an actual request (cached files `continue` above).
    time.sleep(REQUEST_DELAY_SEC)

print("Downloaded PDFs:", len(downloaded))


### Cell 5 — PDF text extraction

In [None]:

# Optional text-layer backends used by read_pdf_text().
# Each library is probed independently so that PyPDF2 remains available as a
# fallback when pdfminer is installed but fails on a particular file.
try:
    from pdfminer.high_level import extract_text as pdf_extract_text
    USE_PDFMINER = True
except ImportError:
    pdf_extract_text = None
    USE_PDFMINER = False

try:
    import PyPDF2
except ImportError:
    # None marks "not installed"; read_pdf_text() checks for it.
    PyPDF2 = None

def read_pdf_text(path: Path) -> str:
    """Return the text layer of the PDF at *path*, or "" if nothing works.

    pdfminer is tried first when USE_PDFMINER is set; its result is returned
    as-is (even when empty). PyPDF2 is used only if pdfminer is disabled or
    raised, and only if the PyPDF2 module is available. Failures are printed,
    never raised.
    """
    if USE_PDFMINER:
        try:
            extracted = pdf_extract_text(str(path))
            return extracted or ""
        except Exception as e:
            print("[pdfminer-fail]", path.name, e)

    # PyPDF2 may be undefined or None depending on how the imports went.
    if globals().get("PyPDF2") is None:
        return ""

    try:
        chunks = []
        with open(path, "rb") as f:
            for page in PyPDF2.PdfReader(f).pages:
                # A single unreadable page contributes an empty string
                # instead of aborting the whole document.
                try:
                    piece = page.extract_text() or ""
                except Exception:
                    piece = ""
                chunks.append(piece)
        return "\n".join(chunks)
    except Exception as e:
        print("[PyPDF2-fail]", path.name, e)
    return ""


### Cell 6 — OCR-enabled extractor

In [None]:

from pdf2image import convert_from_path
import pytesseract

# Optional overrides for the external OCR tools; both are left unset here.
# POPPLER_PATH is passed to convert_from_path() as `poppler_path` inside
# extract_text_any().
POPPLER_PATH = None
# When TESSERACT_CMD is set to a non-empty value it replaces the command
# that pytesseract invokes.
TESSERACT_CMD = None
if TESSERACT_CMD:
    pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD

def _pdfminer_text(pdf_path) -> str:
    """Return pdfminer's text for *pdf_path*, or "" if it is missing or fails."""
    try:
        from pdfminer.high_level import extract_text as _pdfminer_extract
        return _pdfminer_extract(str(pdf_path)) or ""
    except Exception:
        # Deliberate best effort: the caller moves on to the next extractor.
        return ""


def _pypdf2_text(pdf_path) -> str:
    """Return PyPDF2's page texts joined by newlines, or "" if it is missing or fails."""
    try:
        import PyPDF2
        parts = []
        with open(pdf_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            for page in reader.pages:
                # One unreadable page must not discard the rest of the file.
                try:
                    parts.append(page.extract_text() or "")
                except Exception:
                    parts.append("")
        return "\n".join(parts)
    except Exception:
        # Deliberate best effort: the caller falls through to OCR.
        return ""


def extract_text_any(pdf_path: Path, max_pages_ocr=5, *, min_chars=40, dpi=200, ocr_lang=None) -> str:
    """Extract text from a PDF, falling back to OCR for scanned documents.

    Extractors are tried in order: pdfminer, PyPDF2, then OCR (pdf2image +
    pytesseract). The first text-layer result with at least *min_chars*
    non-blank characters is returned; otherwise the OCR output is returned
    whatever its length.

    Args:
        pdf_path: Path of the PDF file.
        max_pages_ocr: Last page (1-based) rendered for OCR; pages beyond it
            are ignored by the OCR step.
        min_chars: Minimum stripped length for a text-layer result to be
            accepted without OCR.
        dpi: Rendering resolution for the page images handed to OCR.
        ocr_lang: Value passed to pytesseract as ``lang``. ``None`` keeps
            pytesseract's own default; pass a specific code or a
            ``+``-joined combination to OCR other scripts (the matching
            language data presumably has to be installed -- verify locally).

    Returns:
        The extracted text, or "" when every method fails. OCR failures are
        printed, not raised.
    """
    for extractor in (_pdfminer_text, _pypdf2_text):
        txt = extractor(pdf_path)
        if len(txt.strip()) >= min_chars:
            return txt

    # No usable text layer: render the first pages and OCR them.
    try:
        images = convert_from_path(
            str(pdf_path),
            dpi=dpi,
            first_page=1,
            last_page=max_pages_ocr,
            poppler_path=POPPLER_PATH,
        )
        return "\n".join(pytesseract.image_to_string(im, lang=ocr_lang) for im in images)
    except Exception as e:
        print("[OCR fail]", pdf_path.name, e)
        return ""


### Cell 7 — Build text_corpus.csv

In [None]:

def guess_date_from_name(s: str):
    """Pull a date out of a file name, returning ``pd.NaT`` when none is found.

    Two layouts are recognised, with ``-``, ``_``, ``/`` or ``.`` between the
    parts: year-month-day (tried first) and day-month-year. The year must
    start with "20". Digits that match a layout but do not form a real
    calendar date also give ``pd.NaT``.
    """
    year_first = re.search(r"(20\d{2})[-_/\.](\d{1,2})[-_/\.](\d{1,2})", s)
    if year_first is not None:
        year, month, day = map(int, year_first.groups())
    else:
        day_first = re.search(r"(\d{1,2})[-_/\.](\d{1,2})[-_/\.](20\d{2})", s)
        if day_first is None:
            return pd.NaT
        day, month, year = map(int, day_first.groups())

    # errors="coerce" turns impossible dates (e.g. month 13) into NaT.
    return pd.to_datetime(f"{year:04d}-{month:02d}-{day:02d}", errors="coerce")

# Build one corpus row per PDF that yields at least 40 characters of text.
# The column list is fixed up front so that the CSV always has a header,
# even when no PDF produced a usable row.
CORPUS_COLUMNS = ["date", "source", "title", "text", "url"]

records = []
pdf_paths = sorted(PDF_DIR.glob("*.pdf"))
print("PDFs found:", len(pdf_paths))

for path in pdf_paths:
    txt = extract_text_any(path)
    if not txt or len(txt.strip()) < 40:
        continue
    lines = [l.strip() for l in txt.splitlines() if l.strip()]
    # Title = first non-blank line, capped at 160 characters.
    title = (lines[0] if lines else path.stem)[:160]
    # Date preference: title, then full text, then the file name.
    dt = safe_date_from_text(title) or safe_date_from_text(txt)
    if dt:
        dt_ts = pd.to_datetime(dt, errors="coerce")
    else:
        dt_ts = guess_date_from_name(path.stem)
    records.append({
        "date": dt_ts,
        "source": "DHM",
        "title": title,
        "text": txt,
        # Local path of the downloaded file, not the original web address.
        "url": path.as_posix(),
    })

corpus_df = pd.DataFrame(records, columns=CORPUS_COLUMNS)

# Keep dated rows inside the project window, one row per (date, title, url).
# Every step below is a no-op on an empty frame, so no emptiness guard is needed.
corpus_df = corpus_df.dropna(subset=["date"])
corpus_df["date"] = pd.to_datetime(corpus_df["date"]).dt.normalize()
mask = (corpus_df["date"] >= START_DATE) & (corpus_df["date"] <= END_DATE)
corpus_df = corpus_df.loc[mask]
corpus_df = corpus_df.drop_duplicates(subset=["date", "title", "url"]).sort_values("date")

corpus_df.to_csv(TEXT_CORPUS_CSV, index=False)
print("Saved:", TEXT_CORPUS_CSV, "| rows =", len(corpus_df))
corpus_df.head(10)


### Cell 8 — Keyword features

In [None]:

# Topic -> keywords (English and Nepali) whose occurrences are counted per document.
KEYWORDS = {
    "maintenance": ["maintenance","overhaul","shutdown","servicing","repair","सम्भार","मर्मत","बन्द"],
    "outage":      ["outage","blackout","interruption","load shedding","trip","fault","विद्युत अवरोध","लोडसेडिङ","बत्ती बन्द"],
    "flood":       ["flood","high flow","inundation","alert","warning","watch","बाढी","पहिरो","सूचना","चेतावनी"],
    "policy":      ["policy","tariff","import","export","regulation","curtail","नीति","दर","आयात","निर्यात","विनियमन"],
    "weather":     ["heavy rain","thunder","storm","monsoon","precipitation","हावाहुरी","मुसलधारे","मेघगर्जन","मौसम"],
}

def kw_counts(text: str):
    """Count keyword occurrences in *text*, per topic.

    Matching is case-insensitive substring counting (non-overlapping), so a
    keyword also matches inside longer words (e.g. "trip" inside "strip").
    Runs of whitespace are collapsed to one space in both the text and the
    keywords first, so multi-word keywords such as "heavy rain" still match
    when PDF extraction split them across lines or inserted extra spaces.

    Args:
        text: Document text; ``None`` or "" yields all-zero counts.

    Returns:
        A dict with one integer entry per key of ``KEYWORDS``.
    """
    normalized = " ".join((text or "").lower().split())
    return {
        topic: sum(normalized.count(" ".join(w.lower().split())) for w in words)
        for topic, words in KEYWORDS.items()
    }

# Aggregate keyword counts to one row per day and derive 0/1 topic flags.
topic_cols = list(KEYWORDS)
flag_cols = [f"{k}_flag" for k in topic_cols]

if 'corpus_df' in globals() and len(corpus_df) > 0:
    kk = []
    for _, r in corpus_df.iterrows():
        counts = kw_counts((r.get("title") or "") + " " + (r.get("text") or ""))
        counts["date"] = pd.Timestamp(r["date"]).normalize()
        kk.append(counts)
    daily_kw = pd.DataFrame(kk)
    daily_kw["date"] = pd.to_datetime(daily_kw["date"])
    # Sum the raw counts per day first ...
    daily_kw = daily_kw.groupby("date", as_index=False)[topic_cols].sum()
    # ... and only then derive the flags from the daily totals. Flagging per
    # document and summing afterwards would turn each "flag" into the number
    # of documents mentioning the topic that day instead of a 0/1 indicator.
    for k in topic_cols:
        daily_kw[f"{k}_flag"] = (daily_kw[k] > 0).astype(int)
else:
    # Empty table with the same columns and dtypes as the populated one.
    daily_kw = pd.DataFrame(columns=["date"] + topic_cols + flag_cols)
    daily_kw["date"] = pd.to_datetime(daily_kw["date"])
    daily_kw = daily_kw.astype({c: "int64" for c in topic_cols + flag_cols})

daily_kw.to_csv(TOPICS_DAILY_CSV, index=False)
print("Saved:", TOPICS_DAILY_CSV, "| rows =", len(daily_kw))
daily_kw.head(10)


### Cell 9 — Merge with master

In [None]:

# Attach the daily topic features to the master table (one row per master row).
master = pd.read_csv(MASTER_PATH, parse_dates=["date"])

# - validate="many_to_one" makes pandas raise if daily_kw ever carries
#   duplicate dates, which would otherwise silently duplicate master rows.
# - suffixes=("", "_topic") keeps master's own column names untouched if a
#   topic column happens to share a name with one of them.
out = master.merge(
    daily_kw,
    on="date",
    how="left",
    validate="many_to_one",
    suffixes=("", "_topic"),
)

# Only the columns contributed by daily_kw are filled; master's own missing
# values are left as they are. Days without any notice get 0, and the
# counts/flags are cast back to integers (the left join makes them floats).
new_cols = [c for c in out.columns if c not in master.columns]
if new_cols:
    out[new_cols] = out[new_cols].fillna(0).astype(int)

out.to_csv(MASTER_WITH_TOPICS, index=False)
print("Saved:", MASTER_WITH_TOPICS, "| rows =", len(out), "| cols =", len(out.columns))
out.head(10)
