In [102]:
# === fetchers.py ===
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import requests
import time
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import fitz  # PyMuPDF
import streamlit as st
import openai
import json
import pandas as pd
from datetime import datetime, timedelta
import re

# Define 3-month cutoff
cutoff_date = datetime.today() - timedelta(days=90)

def extract_date_from_dpiit_url(pdf_url):
    """
    Attempts to extract a date from a DPIIT PDF URL such as:
    - https://dpiit.gov.in/sites/default/files/QCO_LaboratoryGlassware_24January2024.pdf
    - https://dpiit.gov.in/sites/default/files/notification_Amendment_23November2012%20%206_0.pdf
    """
    from datetime import datetime
    import re

    # Extract the filename
    filename = pdf_url.split('/')[-1]
    
    # Remove extension and decode URL
    filename = re.sub(r'\.pdf$', '', filename, flags=re.IGNORECASE)
    filename = re.sub(r'%20', ' ', filename)

    # Try to find a pattern like 24January2024 or 23November2012
    match = re.search(r'(\d{1,2})\s*([A-Za-z]+)\s*(\d{4})', filename)
    if match:
        day, month_str, year = match.groups()
        try:
            date_obj = datetime.strptime(f"{day} {month_str} {year}", "%d %B %Y")
            return date_obj
        except ValueError:
            try:
                date_obj = datetime.strptime(f"{day} {month_str} {year}", "%d %b %Y")
                return date_obj
            except ValueError:
                return None
    return None

def parse_date(date_str):
    try:
        return datetime.strptime(date_str.strip(), "%d/%m/%Y")
    except:
        try:
            return datetime.strptime(date_str.strip(), "%d-%m-%Y")
        except:
            return None
            
def parse_date_string(date_str):
    # Try parsing formats like "May 16, 2025"
    for fmt in ("%B %d, %Y", "%b %d, %Y", "%d %B %Y", "%d-%m-%Y"):
        try:
            return datetime.strptime(date_str.strip(), fmt)
        except:
            continue
    return None

def clean_commerce_date(date_str):
    # Remove ordinal suffixes (st, nd, rd, th)
    cleaned = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', date_str, flags=re.IGNORECASE)
    # Replace dots with spaces
    cleaned = cleaned.replace('.', ' ')
    # Normalize whitespace
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    return cleaned.title()  # Ensures consistent capitalization for parsing

# 1. DPIIT
def scrape_dpiit():
    url = "https://dpiit.gov.in/policies-rules-and-acts/notifications"
    base_url = "https://dpiit.gov.in"
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    results = []

    for table in soup.find_all("table"):
        for row in table.find_all("tr"):
            link_tag = row.find("a", href=True)
            if link_tag and link_tag['href'].endswith(".pdf"):
                pdf_url = urljoin(base_url, link_tag["href"])
                doc_date = extract_date_from_dpiit_url(pdf_url)

                if doc_date and doc_date >= cutoff_date:
                    title = link_tag.text.strip()
                    results.append({
                        "source": "DPIIT",
                        "title": title,
                        "url": pdf_url,
                        "date": doc_date.strftime("%Y-%m-%d")
                    })

    return results

# 2. Power Ministry
def scrape_powermin():
    base_url = "https://powermin.gov.in"
    url = "https://powermin.gov.in/circular"
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

    results = []
    for table in soup.find_all("table"):
        for row in table.find_all("tr"):
            cols = row.find_all("td")
            if len(cols) >= 5:
                subject = cols[1].get_text(strip=True)
                date_text = cols[2].get_text(strip=True)
                link_tag = cols[4].find("a", href=True)
                doc_date = parse_date(date_text)
                if doc_date and doc_date >= cutoff_date and link_tag:
                    pdf_url = urljoin(base_url, link_tag['href'])
                    results.append({"source": "Power Ministry", "title": subject, "url": pdf_url, "date": doc_date.strftime("%Y-%m-%d")})
    return results

# 3. RBI
def scrape_rbi():
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--window-size=1920,1080")

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)
    driver.get("https://website.rbi.org.in/web/rbi/notifications?delta=100")

    # Scroll to load all content
    last_height = driver.execute_script("return document.body.scrollHeight")
    for _ in range(5):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    soup = BeautifulSoup(driver.page_source, "html.parser")
    driver.quit()

    results = []
    base_url = "https://website.rbi.org.in"
    items = soup.find_all("div", class_="notification-row-each-inner")
    print(f"🔎 Notifications found: {len(items)}")

    for block in items:
        a_tag = block.find("a", class_="mtm_list_item_heading")
        title = a_tag.get_text(strip=True) if a_tag else "Untitled"

        date_tag = block.find("div", class_="notification-date")
        date_str = date_tag.get_text(strip=True) if date_tag else ""
        doc_date = parse_date_string(date_str)

        pdf_tag = block.find("a", class_="matomo_download download_link", href=True)
        pdf_url = urljoin(base_url, pdf_tag["href"]) if pdf_tag else None

        if doc_date and doc_date >= cutoff_date and pdf_url:
            results.append((
                "RBI (New)",  # string label, not dictionary key
                title,
                pdf_url,
                doc_date.strftime("%Y-%m-%d")
            ))

    print(f"✅ Final RBI Notifications with PDFs: {len(results)}")
    return results

# 4. Commerce
def scrape_commerce():
    url = "https://commerce.gov.in/acts-and-schemes/"
    res = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
    soup = BeautifulSoup(res.content, "html.parser")
    cards = soup.select(".whats-new-wrapper")
    results = []

    for card in cards:
        heading = card.select_one("h3")
        meta = card.select_one("p")
        link = card.select_one("a.innr-btn")
        if not heading or not meta or not link:
            continue

        title = heading.get_text(strip=True)
        raw_date = meta.get_text(strip=True).split("|")[0].strip()

        # Clean and normalize date string
        normalized_date = clean_commerce_date(raw_date)

        # Try multiple formats
        doc_date = None
        for fmt in ("%d %B %Y", "%d %b %Y"):
            try:
                doc_date = datetime.strptime(normalized_date, fmt)
                break
            except ValueError:
                continue

        if not doc_date:
            continue

        pdf_url = link.get("href")
        if not pdf_url or ".pdf" not in pdf_url.lower():
            continue
        if doc_date < cutoff_date:
            continue

        results.append({
            "source": "Commerce",
            "title": title,
            "url": pdf_url,
            "date": doc_date.strftime("%Y-%m-%d")
        })

    return results

In [103]:
# # === Run & Test ===
# from datetime import datetime, timedelta
# all_results = []
# for scraper in [scrape_dpiit, scrape_powermin, scrape_rbi, scrape_commerce]:
#     try:
#         all_results += scraper()
#     except Exception as e:
#         print(f"❌ Error in {scraper.__name__}: {e}")

# df = pd.DataFrame(all_results)
# print(f"\n✅ Total documents fetched: {len(df)}")

# if not df.empty:
#     print("\n📊 Document counts by ministry:")
#     print(df['source'].value_counts())

#     print("\n🔍 Sample entries:")
#     for source in df['source'].unique():
#         sample = df[df['source'] == source].head(1)
#         print(f"\n--- {source} ---")
#         print(sample[['date', 'title', 'url']].to_string(index=False))

In [104]:
# Combine results
all_results = []

# Scrape each source
dpiit = scrape_dpiit()
print(f"\n✅ DPIIT: {len(dpiit)} results")
all_results.extend(dpiit)

powermin = scrape_powermin()
print(f"\n✅ Power Ministry: {len(powermin)} results")
all_results.extend(powermin)

rbi = scrape_rbi()
# Note: RBI result is a list of tuples, we convert to dict for consistency
rbi_dicts = [{"source": src, "title": title, "url": url, "date": date} for (src, title, url, date) in rbi]
print(f"\n✅ RBI: {len(rbi_dicts)} results")
all_results.extend(rbi_dicts)

commerce = scrape_commerce()
print(f"\n✅ Commerce: {len(commerce)} results")
all_results.extend(commerce)

# Create DataFrame
df = pd.DataFrame(all_results)

# Show DataFrame shape and column names
print(f"\n🧾 Final DataFrame shape: {df.shape}")
print(f"📋 Columns: {df.columns.tolist()}")

# Check for missing values
print("\n🔍 Missing values per column:")
print(df.isnull().sum())

# Show counts by source
if 'source' in df.columns:
    print("\n📊 Document counts by ministry:")
    print(df['source'].value_counts())

# Show sample entries
print("\n🔍 Sample entries:")
print(df.head(5))


✅ DPIIT: 4 results

✅ Power Ministry: 1 results
🔎 Notifications found: 100
✅ Final RBI Notifications with PDFs: 39

✅ RBI: 39 results

✅ Commerce: 0 results

🧾 Final DataFrame shape: (44, 4)
📋 Columns: ['source', 'title', 'url', 'date']

🔍 Missing values per column:
source    0
title     0
url       0
date      0
dtype: int64

📊 Document counts by ministry:
source
RBI (New)         39
DPIIT              4
Power Ministry     1
Name: count, dtype: int64

🔍 Sample entries:
           source                                           title  \
0           DPIIT                                       (1.67 MB)   
1           DPIIT                                      (832.3 KB)   
2           DPIIT                                       (1.96 MB)   
3           DPIIT                                       (1.84 MB)   
4  Power Ministry  Sexual Harassment Internal Complaint Committee   

                                                 url        date  
0  https://dpiit.gov.in/sites/default/file