In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import time
import pandas as pd
import os
import random

# Configure Chrome. The switches are applied in the order listed below.
chrome_options = Options()
for _chrome_flag in (
    "--headless",  # no visible browser window
    "--disable-gpu",
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "--window-size=1920x1080",
    "--log-level=3",
    "--start-maximized",
    # Present the session as desktop Chrome 120 on Windows 10.
    '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
):
    chrome_options.add_argument(_chrome_flag)
# NOTE(review): presumably here to silence chromedriver console logging - confirm.
chrome_options.add_experimental_option("excludeSwitches", ["enable-logging"])

# Start the browser session through a locally installed chromedriver binary.
# NOTE(review): the path is specific to one Windows user profile; the script
# will not start on another machine until it is adjusted.
chromedriver_path = "C:\\Users\\yujit\\OneDrive\\Desktop\\chromedriver-win64\\chromedriver.exe"
service = Service(chromedriver_path)
driver = webdriver.Chrome(options=chrome_options, service=service)

# Listing URL for the organisation; the page number is appended to it.
BASE_URL = "https://eprocure.gov.in/cppp/searchbyorg/Department%20of%20Posts?page="

# Output folder for the Excel batches, and the file holding the last
# checkpointed page number.
SAVE_FOLDER = "tender_list"
PROGRESS_FILE = "end_scraped_page.txt"

# Create the output folder if it is missing. exist_ok=True avoids the
# check-then-create race, and raises immediately if a regular file is
# squatting on the name instead of failing later when a batch is written.
os.makedirs(SAVE_FOLDER, exist_ok=True)

def get_final_scraped_page():
    """Return the page number recorded in PROGRESS_FILE.

    Returns:
        The integer stored in the progress file, or 1 when the file does
        not exist.

    Raises:
        ValueError: if the file exists but its content is not an integer
            (for example, an empty file).
    """
    if not os.path.exists(PROGRESS_FILE):
        return 1

    with open(PROGRESS_FILE, "r") as progress:
        stored = progress.read()
    return int(stored.strip())

def save_progress(page_number):
    """Record page_number in PROGRESS_FILE as the latest checkpoint.

    The value is written to a sibling temporary file and then moved over
    PROGRESS_FILE with os.replace. Opening PROGRESS_FILE directly in "w"
    mode truncates it first, so an interruption between truncate and
    write would leave an empty checkpoint that cannot be parsed on the
    next run.

    Args:
        page_number: Page number to store; written as str(page_number).
    """
    tmp_path = f"{PROGRESS_FILE}.tmp"
    with open(tmp_path, "w") as f:
        f.write(str(page_number))
    # The temp file sits next to the target, so the swap stays on one
    # filesystem and replaces the old checkpoint in a single step.
    os.replace(tmp_path, PROGRESS_FILE)

def scrape_page(page_num):
    """Load one listing page and return its tender rows.

    Opens BASE_URL with page_num appended, pauses for a random 2-5 seconds,
    then waits up to 10 seconds for table rows matching a fixed inline-style
    XPath. For every matched row with at least six <td> cells, the stripped
    text of the first six cells is collected.

    Args:
        page_num: Page number appended to BASE_URL.

    Returns:
        A list of six-item lists, in the order: Sl.no, Published Date,
        Bid Submission Closing Date, Tender Opening Date, Title/Tender ID,
        Organization Name. On timeout a message is printed and the rows
        gathered so far (normally none) are returned.
    """
    driver.get(f"{BASE_URL}{page_num}")
    time.sleep(random.uniform(2, 5))  # Random delay to avoid detection

    # NOTE(review): this matches the exact inline style string, so rows styled
    # differently (e.g. another background colour) would not be picked up -
    # confirm against the page markup.
    row_locator = (
        By.XPATH,
        "//tr[@style='border-bottom: 1px solid #ffffff;background-color: #FAFAFA;']",
    )

    tenders = []
    try:
        matched_rows = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located(row_locator)
        )
        for table_row in matched_rows:
            cells = table_row.find_elements(By.TAG_NAME, "td")
            if len(cells) < 6:
                continue  # not a full data row
            tenders.append([cell.text.strip() for cell in cells[:6]])
    except TimeoutException:
        print(f"Timeout on page {page_num}")
    except NoSuchElementException:
        print(f"Elements not found on page {page_num}")

    return tenders

# Main scraping loop
TENDER_COLUMNS = [
    "Sl.no",
    "Published Date",
    "Bid Submission Closing Date",
    "Tender Opening Date",
    "Title/Tender ID",
    "Organization Name",
]
PAGES_PER_BATCH = 10  # one Excel file and one checkpoint every 10 pages
END_PAGE = 4172       # exclusive range bound: the last page requested is 4171


def _save_batch(rows, batch_number, file_name):
    """Write rows to SAVE_FOLDER/Batch_<batch_number>/<file_name> as Excel."""
    batch_folder = os.path.join(SAVE_FOLDER, f"Batch_{batch_number}")
    os.makedirs(batch_folder, exist_ok=True)
    frame = pd.DataFrame(rows, columns=TENDER_COLUMNS)
    frame.to_excel(os.path.join(batch_folder, file_name), index=False)


# The progress file holds the last page whose batch was fully written, so a
# resumed run must continue with the page after it. Restarting on the stored
# page would re-scrape only that page and overwrite the finished batch file
# with a single page of rows.
if os.path.exists(PROGRESS_FILE):
    start_page = get_final_scraped_page() + 1
else:
    start_page = 1

data = []
batch = (start_page - 1) // PAGES_PER_BATCH + 1  # Start batch from correct number

try:
    for page in range(start_page, END_PAGE):
        print(f"Scraping page {page}...")
        # NOTE(review): a page that times out yields no rows but still counts
        # as done; consider retrying or logging such pages.
        data.extend(scrape_page(page))

        if page % PAGES_PER_BATCH == 0:  # batch boundary: flush and checkpoint
            first_page = page - PAGES_PER_BATCH + 1
            _save_batch(data, batch, f"tenders_{first_page}_to_{page}.xlsx")
            print(f"Saved batch {batch} (Pages {first_page} to {page})")
            data = []
            batch += 1
            save_progress(page)

    # Save remaining data if any (pages after the last full batch)
    if data:
        _save_batch(data, batch, "tenders_final.xlsx")
        print(f"Saved final batch {batch}")
finally:
    # Always shut the browser down, even on an error or Ctrl-C, so Chrome and
    # chromedriver processes are not left running.
    driver.quit()
