In [None]:
import os
import time
from datetime import datetime

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Setup Chrome options
chrome_options = Options()
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
# Chrome parses --window-size as "width,height"; the previous "1920x1080"
# spelling is not a valid value for this switch.
chrome_options.add_argument("--window-size=1920,1080")
# Keep Chrome / chromedriver console noise out of the scraper's own output.
chrome_options.add_argument("--log-level=3")
chrome_options.add_argument("--start-maximized")
chrome_options.add_experimental_option("excludeSwitches", ["enable-logging"])

# Initialize WebDriver
# NOTE: the chromedriver path is machine-specific; update it before running elsewhere.
service = Service("C:\\Users\\yujit\\OneDrive\\Desktop\\chromedriver-win64\\chromedriver.exe")  # Update with the correct path
# Module-level browser session shared by the scraping code below.
driver = webdriver.Chrome(service=service, options=chrome_options)

# Base URL (the page number is appended to it)
BASE_URL = "https://www.data.gov.in/catalogs?page="

# Maximum pages to scrape
MAX_PAGES = 1556


def load_start_page(resume_path, default=1):
    """Return the page number at which scraping should (re)start.

    The resume file holds the number of the last page whose data was
    already written to disk, so scraping continues with the page *after*
    it. Restarting on the recorded page itself would scrape it twice and
    let the save step rewrite a workbook that is already complete.

    Args:
        resume_path: Path of the text file containing the last saved page.
        default: Page to start from when there is no usable resume file.

    Returns:
        ``last saved page + 1``, or ``default`` if the file is missing,
        empty, or does not contain an integer.
    """
    try:
        with open(resume_path, "r") as f:
            last_saved_page = int(f.read().strip())
    except (FileNotFoundError, ValueError):
        # No previous run, or an empty/corrupt marker: start from scratch
        # instead of crashing before any scraping happens.
        return default
    return last_saved_page + 1


# Resume from last scraped page
resume_file = "last_scraped_page.txt"
start_page = load_start_page(resume_file)

# Create "gov_datasets" folder if it does not exist
save_folder = "gov_datasets"
os.makedirs(save_folder, exist_ok=True)  # No error if the folder is already there

# Normalise a scraped date string
def format_date(date_text):
    """Return *date_text* re-rendered as zero-padded DD-MM-YYYY.

    The text is parsed as day-month-year and formatted back with the same
    pattern. Text that does not parse as such a date is returned unchanged.
    """
    day_month_year = "%d-%m-%Y"
    try:
        parsed = datetime.strptime(date_text, day_month_year)
    except ValueError:
        # Unexpected format: hand the original text back untouched.
        return date_text
    return parsed.strftime(day_month_year)

# Helper: read one optional field relative to a dataset header element
def _optional_text(element, xpath, default="N/A"):
    """Return the stripped text of the node matching *xpath*, or *default*.

    Only a missing node is treated as "field not available"; any other
    error propagates to the caller.
    """
    try:
        return element.find_element(By.XPATH, xpath).text.strip()
    except NoSuchElementException:
        return default


# Function to scrape a page
def scrape_page(page_number):
    """Scrape one catalog listing page and return its dataset rows.

    Loads ``BASE_URL`` + *page_number* in the shared module-level ``driver``,
    waits for the ``grid-header`` elements, and reads the name, ministry,
    published date, updated date and view count for each of them.

    Args:
        page_number: Number appended to ``BASE_URL`` to build the page URL.

    Returns:
        A list of ``[dataset_name, ministry, published_on, updated_on, views]``
        lists. Optional fields that are not found are ``"N/A"``. The list is
        empty when no ``grid-header`` element appears within the wait timeout.
    """
    url = f"{BASE_URL}{page_number}"
    print(f"🔄 Scraping Page {page_number}...")

    driver.get(url)
    time.sleep(3)  # Allow page to load

    # Wait for dataset elements. Only a timeout means "nothing on this page";
    # a bare except here would also swallow Ctrl-C and real driver failures.
    try:
        datasets = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "grid-header"))
        )
    except TimeoutException:
        print(f"⚠️ No datasets found on page {page_number}. Skipping...")
        return []

    page_data = []

    for dataset in datasets:
        try:
            # Dataset name is mandatory: if it cannot be read the whole
            # entry is skipped by the handler below.
            dataset_name = dataset.find_element(By.TAG_NAME, "a").text.strip()

            ministry = _optional_text(
                dataset,
                "./following-sibling::div[@class='grid-desc']/p[@class='card-text'][1]",
            )

            # format_date returns text it cannot parse unchanged, so the
            # "N/A" placeholder passes through it untouched.
            published_on = format_date(
                _optional_text(dataset, "./following::div[@class='grid-meta-val'][1]")
            )
            updated_on = format_date(
                _optional_text(dataset, "./following::div[@class='grid-meta-val'][2]")
            )

            views = _optional_text(dataset, "./following::div[@class='grid-meta-val'][3]")

            page_data.append([dataset_name, ministry, published_on, updated_on, views])
            print(f"✅ Scraped: {dataset_name}")

        except Exception as e:
            # Best effort: one broken entry must not abort the whole page.
            print(f"⚠️ Skipping dataset due to error: {e}")

    return page_data

# Main scraping loop
SAVE_EVERY = 4  # A workbook is written whenever the page number is a multiple of this

all_data = []
# First page whose rows are currently buffered in all_data. Tracked explicitly
# because a batch is shorter than SAVE_EVERY pages after a resume or at the
# very end, and naming the file "page-3" would then point at (and overwrite)
# the wrong workbook.
batch_start = start_page

try:
    for page in range(start_page, MAX_PAGES + 1):
        page_data = scrape_page(page)

        if page_data:
            all_data.extend(page_data)

        # Save progress every SAVE_EVERY pages and on the final page
        if page % SAVE_EVERY == 0 or page == MAX_PAGES:
            df = pd.DataFrame(all_data, columns=["Dataset Name", "Ministry", "Published On", "Updated On", "Views"])

            # Save the file inside the "gov_datasets" folder
            file_name = os.path.join(save_folder, f"datasets_page_{batch_start}_to_{page}.xlsx")
            df.to_excel(file_name, index=False)
            print(f"📁 Saved: {file_name}")

            all_data = []  # Clear data list after saving
            batch_start = page + 1

            # Record the last page that is safely on disk, for resuming
            with open(resume_file, "w") as f:
                f.write(str(page))
finally:
    # Close WebDriver even if scraping is interrupted or fails, so no
    # browser / chromedriver process is left running.
    driver.quit()

print("✅ Scraping Completed.")


🔄 Scraping Page 1...
✅ Scraped: Scheme for Promotion of Culture of Science (SPOCS)
✅ Scraped: Senior/Young Artist Scheme
✅ Scraped: Financial Assistance for Development of Buddhist/Tibetan Arts and Culture
✅ Scraped: Festival of India
✅ Scraped: Grants-in-aid to Indo-Foreign Friendship Cultural Societies
✅ Scraped: Repertory Grant Scheme
✅ Scraped: Financial Assistance for Preservation and Development of Cultural Heritage of the Himalayas
✅ Scraped: General Election to Lok Sabha 2024 - Statistical Reports Data
🔄 Scraping Page 2...
✅ Scraped: Junior Fellowship Scheme
✅ Scraped: Financial Assistance for Tagore Cultural Complexes (TCC)
✅ Scraped: Cultural Function and Production Grant (CFPG)
✅ Scraped: Senior Fellowship Scheme Data
✅ Scraped: Museum Grant Scheme
✅ Scraped: Utilisation of MPLAD Scheme funds and detail of works-16th lok sabha MPs
✅ Scraped: Utilisation of MPLAD Scheme funds and detail of works-Since inception of scheme
✅ Scraped: Utilisation of MPLAD Scheme funds and detail