## Scraping

In [211]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import requests
import time
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import fitz  # PyMuPDF
import streamlit as st
import openai
import json
import pandas as pd

In [94]:
from datetime import datetime, timedelta

# Define 3-month cutoff
cutoff_date = datetime.today() - timedelta(days=90)

In [259]:
def parse_date_string(date_str):
    # Try parsing formats like "May 16, 2025"
    for fmt in ("%B %d, %Y", "%d %B %Y", "%d-%m-%Y"):
        try:
            return datetime.strptime(date_str.strip(), fmt)
        except:
            continue
    return None

In [313]:
import re

def clean_commerce_date(date_str):
    # Remove 'st', 'nd', 'rd', 'th', dots, extra spaces
    cleaned = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', date_str, flags=re.IGNORECASE)
    cleaned = cleaned.replace('.', ' ').strip()
    cleaned = re.sub(r'\s+', ' ', cleaned)  # collapse multiple spaces
    return cleaned.title()

In [279]:
def parse_mca_date(date_str):
    try:
        return datetime.strptime(date_str.strip(), "%d/%m/%Y")
    except Exception:
        return None

In [253]:
def extract_date_from_text(text):
    patterns = [
        r'\d{1,2}\s*(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s*\d{4}',
        r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2},\s+\d{4}',
        r'\d{2}-\d{2}-\d{4}',
    ]
    for pattern in patterns:
        matches = re.findall(pattern, text, re.IGNORECASE)
        for m in re.findall(pattern, text, re.IGNORECASE):
            for fmt in ("%d %b %Y", "%d %B %Y", "%b %d, %Y", "%B %d, %Y", "%d-%m-%Y"):
                try:
                    return datetime.strptime(m.strip(), fmt)
                except:
                    continue
    return None

In [95]:
# Define the base URL and the notifications page URL
base_url = "https://dpiit.gov.in"
notifications_url = "https://dpiit.gov.in/policies-rules-and-acts/notifications"

# Fetch the HTML content of the notifications page
response = requests.get(notifications_url)
soup = BeautifulSoup(response.content, "html.parser")

# Extract PDF titles and links from the tables
pdf_links = []
for table in soup.find_all("table"):
    for row in table.find_all("tr"):
        link_tag = row.find("a", href=True)
        if link_tag and link_tag['href'].endswith('.pdf'):
            title = link_tag.text.strip()
            pdf_url = urljoin(base_url, link_tag['href'])
            pdf_links.append((title, pdf_url))

In [96]:
# Function to download and extract text from a PDF URL
def extract_text_from_pdf_url(pdf_url):
    response = requests.get(pdf_url)
    with open("dpiit_sample.pdf", "wb") as f:
        f.write(response.content)
    doc = fitz.open("dpiit_sample.pdf")
    return "\n".join(page.get_text() for page in doc)

# Process the first PDF as a sample
if pdf_links:
    title, pdf_url = pdf_links[0]
    print(f"\n🔗 Downloading: {title}\n📄 URL: {pdf_url}")
    text = extract_text_from_pdf_url(pdf_url)
    print(f"\n📁 First 1000 characters:\n{text[:1000]}")
else:
    print("❌ No PDFs found.")


🔗 Downloading: (1.67 MB)
📄 URL: https://dpiit.gov.in/sites/default/files/notification_StartUps_13May2025.pdf

📁 First 1000 characters:
3024 GI/2025 
(1) 
 
रजिस्ट्री सं. डी.एल.- 33004/99 
REGD. No. D. L.-33004/99 
 
 
 
xxxGIDHxxx 
xxxGIDExxx 
असाधारण  
EXTRAORDINARY 
भाग II—खण् ड 3—उप-खण् ड (ii)  
PART II—Section 3—Sub-section (ii) 
प्राजधकार से प्रकाजित 
PUBLISHED BY AUTHORITY 
 
वाजणज्य और उद्योग मंत्रालय 
(उद्योग संवधन और आंतररक व्यापार जवभाग) 
(स्ट्टाटधअप इंजडया अनुभाग) 
अजधसूचना 
नई दिल्ली, 8 मई, 2025 
 
का.आ. 2046(अ).— केंद्र सरकार ने उद्योग संवधन और आंतररक व्यापार जवभाग (डीपीआईआईटी), वाजणज्य और 
उद्योग मंत्रालय द्वारा िारी रािपत्र अजधसूचना में यथा पररभाजित स्ट्टाटधअप्स होने के नाते पात्र ऋण प्राप्तकर्त्ाधओं को 
जवर्त्पोजित करने के जलए सिस्ट्य संस्ट्थानों (एमआई) द्वारा ऋण प्रिान करने के जलए क्रेजडट गारंटी िेने के उद्देश्य से ‘स्ट्टाटधअप्स 
के जलए ऋण गारंटी स्ट्कीम (सीिीएसएस)’ को अनुमोदित दकया है।   
 
यह अजधसूचना स्ट्टाटधअप्स के जलए ऋण गारंटी स्ट्कीम (सीिीएसएस) के संबंध में पूव

In [97]:
# DPIIT Notifications Page
base_url = "https://dpiit.gov.in"
url = "https://dpiit.gov.in/policies-rules-and-acts/notifications"

# Step 1: Fetch and parse the page
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

# Step 2: Extract PDF links from tables
pdf_links = []
for table in soup.find_all("table"):
    for row in table.find_all("tr"):
        a_tag = row.find("a", href=True)
        if a_tag and a_tag['href'].endswith('.pdf'):
            title = a_tag.get_text(strip=True)
            pdf_url = urljoin(base_url, a_tag['href'])
            pdf_links.append((title, pdf_url))

# Step 3: Show how many PDFs were found
print(f"\n📄 Total PDFs found: {len(pdf_links)}\n")

# Step 4: Print all PDF titles and links
for i, (title, link) in enumerate(pdf_links, 1):
    print(f"{i}. {title}\n   🔗 {link}\n")

# Step 5: Extract text from the first PDF (optional)
def extract_text_from_pdf_url(pdf_url):
    response = requests.get(pdf_url)
    with open("dpiit_sample.pdf", "wb") as f:
        f.write(response.content)
    doc = fitz.open("dpiit_sample.pdf")
    return "\n".join(page.get_text() for page in doc)

# Preview the first PDF
if pdf_links:
    title, pdf_url = pdf_links[0]
    print(f"\n🔍 Previewing first PDF: {title}")
    text = extract_text_from_pdf_url(pdf_url)
    print(f"\n🧾 First 1000 characters of text:\n{text[:1000]}")


📄 Total PDFs found: 358

1. (1.67 MB)
   🔗 https://dpiit.gov.in/sites/default/files/notification_StartUps_13May2025.pdf

2. (1.28 MB)
   🔗 https://dpiit.gov.in/sites/default/files/notification_CGSS_20March2023.pdf

3. (426.27 KB)
   🔗 https://dpiit.gov.in/sites/default/files/notification_Definition_StartupIndia_06July2021.pdf

4. (1.05 MB)
   🔗 https://dpiit.gov.in/sites/default/files/notification-Startup-29January2021.pdf

5. (1.77 MB)
   🔗 https://dpiit.gov.in/sites/default/files/Startup_Notification11April2018_0.pdf

6. (832.3 KB)
   🔗 https://dpiit.gov.in/sites/default/files/QCO_Hinges_26March2025.pdf

7. (1.96 MB)
   🔗 https://dpiit.gov.in/sites/default/files/QCO_CopperProduct_27February2025.pdf

8. (1.73 MB)
   🔗 https://dpiit.gov.in/sites/default/files/QCO_FlashLight_29January2025.pdf

9. (1.73 MB)
   🔗 https://dpiit.gov.in/sites/default/files/QCO_ElectricFenceEnergizer_20November2024.pdf

10. (1.72 MB)
   🔗 https://dpiit.gov.in/sites/default/files/QCO_ElectricalAppliances_07No

## Summarizing

In [99]:
def date_guard_agent(title, text, cutoff_date):
    prompt = f"""
You're an AI assistant that reads Indian government notification documents.

1. Extract the date mentioned in the text (if any).
2. If the notification is dated earlier than {cutoff_date.strftime('%d %B %Y')}, just return: SKIP
3. Otherwise, summarize it in 3–7 bullet points.

Title: {title}

Text:
{text}
"""
    response = openai.ChatCompletion.create(
        model="gpt-4.1-nano",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.2
    )
    return response['choices'][0]['message']['content']

In [100]:
import re

def extract_english_text(text):
    blocks = text.split("\n\n")
    english_blocks = [
        b for b in blocks 
        if len(re.findall(r'[\u0900-\u097F]', b)) < 0.25 * len(b)  # keep mostly non-Hindi
    ]
    return "\n\n".join(english_blocks)

In [101]:
openai.api_key = "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"

def summarize_text(text, max_points=7):
    prompt = f"""
You are a policy analyst. Given the following government guideline, extract a structured summary:
- Focus only on the English part.
- CONDITION: If the document is released more than 90 days before than the current date, give no summary.
- Summarize the document in {max_points} or more clear bullet points.
- Focus on regulatory decisions, compliance requirements, incentives, deadlines, and any sectoral/company impact.

TEXT:
{text}
"""
    response = openai.ChatCompletion.create(
        model="gpt-4.1-nano",
        messages=[
            {"role": "system", "content": "You summarize Indian government policy documents."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.3
    )
    return response['choices'][0]['message']['content']

In [102]:
summarize_text(text)

"- The government has approved the 'Credit Guarantee Scheme for Startups (CGSS)' to provide credit guarantees to loans extended by Member Institutions (MIs) to eligible startups recognized by DPIIT, aiming to facilitate collateral-free debt funding.\n- The scheme is effective from the date of DPIIT notification and applies to loans sanctioned on or after this date, covering both transaction-based and umbrella-based guarantee products.\n- The primary objective is to offer guarantee coverage up to a specified limit (maximum Rs. 20 crore per borrower) to mitigate credit risk for lenders, encouraging more lending to startups.\n- Eligible lenders include scheduled commercial banks, RBI-registered NBFCs with a minimum net worth of Rs. 100 crore and a BBB rating or above, and SEBI-registered Alternative Investment Funds (AIFs).\n- Member Institutions must execute agreements or undertakings with the Trustee to qualify for guarantee coverage and are responsible for prudent credit evaluation, mo

In [103]:
for title, pdf_url in pdf_links:
    print(f"\n📄 Processing: {title}")
    text = extract_text_from_pdf_url(pdf_url)
    english = extract_english_text(text)
    
    result = date_guard_agent(title, english, cutoff_date)
    if result.strip().upper().startswith("SKIP"):
        print("⏭️ Skipped (older than 3 months)\n")
        continue
    
    print(f"✅ Summary:\n{result}")


📄 Processing: (1.67 MB)
✅ Summary:
The date mentioned in the notification is 8th May, 2025.

Since the notification is dated on or after 16 February 2025, I will provide a summary:

- The Government of India has approved the 'Credit Guarantee Scheme for Startups (CGSS)' to provide credit guarantees to loans extended by Member Institutions (MIs) to eligible Startups recognized by DPIIT.
- The scheme aims to facilitate collateral-free debt funding for Startups, replacing the earlier scheme from October 2022.
- The scheme covers various instruments such as venture debt, working capital, subordinated debt, debentures, and convertible debt, with a maximum guarantee limit of Rs. 20 crore per borrower.
- Guarantee fees are structured based on the type of guarantee (transaction-based or umbrella-based), with special rates for units from North East, women entrepreneurs, and champion sectors.
- The scheme establishes responsibilities for Member Institutions, including due diligence, monitoring,

In [335]:
# ---------------------------------------
# 1. DPIIT — https://dpiit.gov.in/...
# ---------------------------------------
def scrape_dpiit():
    url = "https://dpiit.gov.in/policies-rules-and-acts/notifications"
    base_url = "https://dpiit.gov.in"
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    results = []
    for table in soup.find_all("table"):
        for row in table.find_all("tr"):
            link_tag = row.find("a", href=True)
            if link_tag and link_tag['href'].endswith(".pdf"):
                title = link_tag.text.strip()
                pdf_url = urljoin(base_url, link_tag["href"])
                results.append(("DPIIT", title, pdf_url, ""))  # No reliable date in HTML
    return results

# ---------------------------------------
# 2. Power Ministry
# ---------------------------------------
def scrape_powermin():
    base_url = "https://powermin.gov.in"
    url = "https://powermin.gov.in/circular"
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

    results = []
    for table in soup.find_all("table"):
        for row in table.find_all("tr"):
            cols = row.find_all("td")
            if len(cols) >= 5:
                subject = cols[1].get_text(strip=True)
                date_text = cols[2].get_text(strip=True)
                link_tag = cols[4].find("a", href=True)
                doc_date = parse_date(date_text)
                if doc_date and doc_date >= cutoff_date and link_tag:
                    pdf_url = urljoin(base_url, link_tag['href'])
                    results.append(("Ministry of Power (Circular)", subject, pdf_url, doc_date.strftime("%Y-%m-%d")))
    return results

# ---------------------------------------
# 3. RBI
# ---------------------------------------
def scrape_rbi():
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--window-size=1920,1080")

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)
    driver.get("https://website.rbi.org.in/web/rbi/notifications?delta=100")

    # Scroll to load content
    last_height = driver.execute_script("return document.body.scrollHeight")
    for _ in range(5):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    soup = BeautifulSoup(driver.page_source, "html.parser")
    driver.quit()

    results = []
    base_url = "https://website.rbi.org.in"

    items = soup.find_all("div", class_="notification-row-each-inner")
    # print(f"🔎 Notifications found: {len(items)}")

    for block in items:
        # Title
        a_tag = block.find("a", class_="mtm_list_item_heading")
        title = a_tag.get_text(strip=True) if a_tag else "Untitled"

        # Date
        date_tag = block.find("div", class_="notification-date")
        date_str = date_tag.get_text(strip=True) if date_tag else ""
        doc_date = parse_date_string(date_str)

        # PDF link
        pdf_tag = block.find("a", class_="matomo_download download_link", href=True)
        pdf_url = urljoin(base_url, pdf_tag["href"]) if pdf_tag else None

        if doc_date and doc_date >= cutoff_date and pdf_url:
            # print("🔹", title)
            # print("📅", doc_date.strftime("%Y-%m-%d"))
            # print("🔗", pdf_url)
            # print("-" * 50)
            results.append((
                "RBI (New)",
                title,
                pdf_url,
                doc_date.strftime("%Y-%m-%d")
            ))

    # print(f"✅ Final RBI Notifications with PDFs: {len(results)}")
    return results
# ---------------------------------------
# 4. MCA
# ---------------------------------------
# import undetected_chromedriver as uc

# def scrape_mca():
#     options = uc.ChromeOptions()
#     options.add_argument("--headless")
#     options.add_argument("--no-sandbox")
#     options.add_argument("--disable-gpu")
#     options.add_argument("--window-size=1920,1080")

#     driver = uc.Chrome(options=options)
#     driver.get("https://www.mca.gov.in/content/mca/global/en/acts-rules/ebooks/notifications.html")

#     service = Service(ChromeDriverManager().install())
#     # driver = webdriver.Chrome(service=service, options=options)

#     # url = "https://www.mca.gov.in/content/mca/global/en/acts-rules/ebooks/notifications.html"
#     # driver.get(url)

#     # Wait for the table to load (increase if needed)
#     time.sleep(5)

#     soup = BeautifulSoup(driver.page_source, "html.parser")
#     driver.quit()
#     print(soup.prettify()[:2000])
#     rows = soup.select("#notificationCircularTable tbody tr")
#     print(f"🔎 MCA Notifications found: {len(rows)}")

#     results = []

#     for row in rows:
#         cols = row.find_all("td")
#         if len(cols) != 3:
#             continue

#         a_tag = cols[0].find("a", class_="dmslink")
#         if not a_tag:
#             continue

#         title = a_tag.get_text(strip=True)
#         doc_id = a_tag.get("val")
#         date_str = cols[2].get_text(strip=True)
#         doc_date = parse_mca_date(date_str)

#         if not doc_id or not doc_date or doc_date < cutoff_date:
#             continue

#         pdf_url = f"{base_pdf_url}?docid={doc_id}&docCategory=Notifications&type=open"

#         print("🔹", title)
#         print("📅", doc_date.strftime("%Y-%m-%d"))
#         print("🔗", pdf_url)
#         print("-" * 50)

#         results.append((
#             "MCA",
#             title,
#             pdf_url,
#             doc_date.strftime("%Y-%m-%d")
#         ))

#     print(f"✅ Final MCA Notifications with PDFs: {len(results)}")
#     return results
# ---------------------------------------
# 5. Ministry of Commerce
# ---------------------------------------
def scrape_commerce():
    url = "https://commerce.gov.in/acts-and-schemes/"
    cutoff_date = datetime.today() - timedelta(days=90)

    res = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
    soup = BeautifulSoup(res.content, "html.parser")

    cards = soup.select(".whats-new-wrapper")
    print(f"🔎 Commerce notifications found: {len(cards)}")

    results = []

    for card in cards:
        heading = card.select_one("h3")
        meta = card.select_one("p")
        link = card.select_one("a.innr-btn")

        if not heading or not meta or not link:
            continue

        title = heading.get_text(strip=True)
        p_text = meta.get_text(strip=True)

        # Extract date from the beginning of <p>
        raw_date = p_text.split("|")[0].strip()
        normalized_date = clean_commerce_date(raw_date)
        
        try:
            doc_date = datetime.strptime(normalized_date, "%d %B %Y")
        except Exception:
            print(f"❌ Still couldn't parse: '{raw_date}' → '{normalized_date}'")
            continue
        pdf_url = link.get("href")

        if not doc_date:
            print(f"❌ Could not parse date from: {date_part}")
            continue

        if not pdf_url or ".pdf" not in pdf_url.lower():
            print(f"❌ Skipping non-PDF link: {pdf_url}")
            continue

        if doc_date < cutoff_date:
            continue  # old

        print("🔹", title)
        print("📅", doc_date.strftime("%Y-%m-%d"))
        print("🔗", pdf_url)
        print("-" * 50)

        results.append((
            "Commerce",
            title,
            pdf_url,
            doc_date.strftime("%Y-%m-%d")
        ))

    print(f"✅ Final Commerce notifications with PDFs: {len(results)}")
    return results

In [337]:
# ---------------------------------------
# Combine All Sources
# ---------------------------------------
all_results = []
for scraper in [scrape_dpiit, scrape_powermin, scrape_rbi, scrape_commerce]:
    try:
        all_results += scraper()
    except Exception as e:
        print(f"❌ Error in {scraper.__name__}: {e}")

🔎 Commerce notifications found: 206
❌ Skipping non-PDF link: void(0)
❌ Still couldn't parse: '15 Febuary 2007' → '15 Febuary 2007'
❌ Skipping non-PDF link: None
❌ Skipping non-PDF link: None
✅ Final Commerce notifications with PDFs: 0


In [339]:
import collections

df = pd.DataFrame(all_results, columns=["Source", "Title", "PDF URL", "Date"])

# Print summary count per source
source_counts = collections.Counter(df["Source"])
print("\n📊 Document counts by ministry:\n")
for source, count in source_counts.items():
    print(f"• {source}: {count} documents")

# Print a few samples from each source
print("\n🔍 Sample entries:")
for source in source_counts:
    sample = df[df["Source"] == source].head(1)
    print(f"\n--- {source} ---")
    print(sample[["Date", "Title", "PDF URL"]].to_string(index=False))


📊 Document counts by ministry:

• DPIIT: 358 documents
• Ministry of Power (Circular): 2 documents
• RBI (New): 4 documents

🔍 Sample entries:

--- DPIIT ---
Date     Title                                                                      PDF URL
     (1.67 MB) https://dpiit.gov.in/sites/default/files/notification_StartUps_13May2025.pdf

--- Ministry of Power (Circular) ---
      Date                                          Title                                                                                                 PDF URL
2025-02-19 Sexual Harassment Internal Complaint Committee https://powermin.gov.in/sites/default/files/Sexual_Harassment_Internal_Complaint_Committee_18022025.pdf

--- RBI (New) ---
      Date                                                                                                                                                                                                                                                                          