In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import os
import time

# Setup Chrome options
chrome_options = Options()
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
# Chrome expects the window size as "width,height" (comma-separated);
# the previous "1920x1080" form is not a valid value for this switch.
chrome_options.add_argument("--window-size=1920,1080")
chrome_options.add_argument("--log-level=3")
chrome_options.add_argument("--start-maximized")
# Present a regular desktop Chrome user agent to the site.
chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
chrome_options.add_experimental_option("excludeSwitches", ["enable-logging"])

# Initialize WebDriver
# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
chromedriver_path = "C:\\Users\\yujit\\OneDrive\\Desktop\\chromedriver-win64\\chromedriver.exe"
service = Service(chromedriver_path)
driver = webdriver.Chrome(service=service, options=chrome_options)

# Create 'mospi' folder if it doesn't exist
output_folder = "mospi"
# exist_ok=True makes this a single call with no check-then-create race.
os.makedirs(output_folder, exist_ok=True)

# Define base URL ("{}" is filled with the page index)
base_url = "https://www.mospi.gov.in/download-reports?page={}"

# Function to get already scraped pages
def get_last_scraped_page(folder=None):
    """Return the page index at which scraping should resume.

    Looks for files whose name starts with ``mospi_page_`` and reads the
    page number from the part between the last underscore and the first
    following dot (e.g. ``mospi_page_44.xlsx`` -> 44).

    Args:
        folder: Directory to inspect. Defaults to the module-level
            ``output_folder`` when omitted.

    Returns:
        The highest page number found plus one, or 0 when no batch file
        with a numeric page suffix exists.
    """
    if folder is None:
        folder = output_folder

    page_numbers = []
    for name in os.listdir(folder):
        if not name.startswith("mospi_page_"):
            continue
        suffix = name.split("_")[-1].split(".")[0]
        try:
            page_numbers.append(int(suffix))
        except ValueError:
            # Prefix matches but the suffix is not a page number; skip it
            # rather than aborting the whole resume logic.
            continue

    # Compare numerically: a lexicographic sort of the file names would
    # rank "mospi_page_9" after "mospi_page_44" and resume from page 10.
    if page_numbers:
        return max(page_numbers) + 1  # Resume from the next page
    return 0

# Resume from the last scraped page
start_page = get_last_scraped_page()

# Scrape in batches of 5 pages at a time
batch_size = 5
current_page = start_page

# try/finally guarantees the browser and chromedriver process are shut down
# even if a page load, row extraction, or Excel write raises (or the run is
# interrupted); previously driver.quit() was only reached on a clean exit.
try:
    while True:
        # Rows collected across all pages of the current batch.
        data_list = []

        for _ in range(batch_size):
            driver.get(base_url.format(current_page))
            time.sleep(2)  # Allow time for the page to load

            # Extract table rows
            rows = driver.find_elements(By.XPATH, "//tbody/tr")

            for row in rows:
                try:
                    sl_no = row.find_element(By.CLASS_NAME, "views-field-counter").text.strip()
                    title_element = row.find_element(By.CLASS_NAME, "views-field-title").find_element(By.TAG_NAME, "a")
                    report_title = title_element.text.strip()
                    report_link = title_element.get_attribute("href")
                    report_number = row.find_element(By.CLASS_NAME, "views-field-field-report-number").text.strip()
                    round_number = row.find_element(By.CLASS_NAME, "views-field-field-round").text.strip()

                    data_list.append([sl_no, report_title, report_link, report_number, round_number])
                except Exception as e:
                    # Best effort: a row missing one of the expected cells is
                    # reported and skipped so the rest of the page is kept.
                    print(f"Error extracting data on page {current_page}: {e}")

            print(f"Scraped page {current_page}")
            current_page += 1

        # A whole batch without a single extracted row is treated as the
        # end of the listing.
        if not data_list:
            print("No more data to scrape. Exiting...")
            break

        # Save to Excel; the file is named after the last page of the batch,
        # which is what get_last_scraped_page() reads back when resuming.
        df = pd.DataFrame(data_list, columns=["Sl.No", "Subject", "Report Link", "Report Number", "Round"])
        file_name = os.path.join(output_folder, f"mospi_page_{current_page - 1}.xlsx")
        df.to_excel(file_name, index=False)
        print(f"Saved {file_name}")
finally:
    driver.quit()


Scraped page 0
Scraped page 1
Scraped page 2
Scraped page 3
Scraped page 4
Saved mospi\mospi_page_4.xlsx
Scraped page 5
Scraped page 6
Scraped page 7
Scraped page 8
Scraped page 9
Saved mospi\mospi_page_9.xlsx
Scraped page 10
Scraped page 11
Scraped page 12
Scraped page 13
Scraped page 14
Saved mospi\mospi_page_14.xlsx
Scraped page 15
Scraped page 16
Scraped page 17
Scraped page 18
Scraped page 19
Saved mospi\mospi_page_19.xlsx
Scraped page 20
Scraped page 21
Scraped page 22
Scraped page 23
Scraped page 24
Saved mospi\mospi_page_24.xlsx
Scraped page 25
Scraped page 26
Scraped page 27
Scraped page 28
Scraped page 29
Saved mospi\mospi_page_29.xlsx
Scraped page 30
Scraped page 31
Scraped page 32
Scraped page 33
Scraped page 34
Saved mospi\mospi_page_34.xlsx
Scraped page 35
Scraped page 36
Scraped page 37
Scraped page 38
Scraped page 39
Saved mospi\mospi_page_39.xlsx
Scraped page 40
Scraped page 41
Scraped page 42
Scraped page 43
Scraped page 44
Saved mospi\mospi_page_44.xlsx
Scraped page 