In [5]:
import time
import random
import pandas as pd
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException,
    NoSuchElementException,
    WebDriverException
)

# Build the Chrome webdriver used by the scraper (visible window unless headless=True)
def setup_driver(driver_path=None, headless=False):
    """Create a configured Chrome webdriver.

    Args:
        driver_path: Path to the chromedriver executable. When omitted, the
            original hard-coded Windows path is used if it exists on this
            machine; otherwise no path is passed and Selenium is left to
            locate a driver itself.
        headless: When True, pass ``--headless=new`` so Chrome runs without a
            visible window. Defaults to False, which matches the previous
            behavior (headless mode was never actually enabled).

    Returns:
        The ``webdriver.Chrome`` instance. The caller is responsible for
        calling ``quit()`` on it.
    """
    default_driver_path = r'C:\Users\yujit\OneDrive\Desktop\chromedriver-win64\chromedriver.exe'

    options = Options()
    if headless:
        options.add_argument('--headless=new')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--window-size=1920,1080')
    options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')

    if driver_path is None and os.path.exists(default_driver_path):
        driver_path = default_driver_path

    # Without an explicit path, Service() falls back to Selenium's own driver
    # lookup instead of failing on a machine-specific location.
    service = Service(driver_path) if driver_path else Service()
    driver = webdriver.Chrome(service=service, options=options)

    return driver

# Random waiting function to mimic human browsing behavior
def random_wait():
    """Pause for a random 2-5 second interval, announcing the delay first."""
    delay = random.uniform(2, 5)
    message = f"Waiting for {delay:.2f} seconds..."
    print(message)
    time.sleep(delay)
def _extract_university_row(row):
    """Convert one table row element into a dict of university fields.

    Column order in the table is: serial no, type, name, address, zip,
    state, status (td[1] .. td[7]).

    Raises:
        ValueError: if the row has fewer than seven direct ``td`` children.
        WebDriverException: if Selenium fails while reading the row
            (for example, the row went stale).
    """
    # One lookup for all cells instead of a separate find_element per column.
    cells = row.find_elements(By.XPATH, "./td")
    if len(cells) < 7:
        raise ValueError(f"expected at least 7 cells, found {len(cells)}")

    serial_no, uni_type, name, address, zipcode, state, status = (
        cell.text.strip() for cell in cells[:7]
    )
    return {
        "Serial No": serial_no,
        "Type": uni_type,
        "University Name": name,
        "Address": address,
        "Zip": zipcode,
        "State": state,
        "Status": status,
    }


def scrape_ugc_page(driver, page_number):
    """Scrape the university table currently shown for ``page_number``.

    For page 1 the UGC listing URL is opened and the "View All" button is
    clicked first. For any other page number the function reads whatever
    table the driver is currently displaying; the caller is expected to have
    navigated there already.

    Args:
        driver: Selenium webdriver used to load and query the page.
        page_number: 1-based page index, used for navigation on page 1 and
            for log messages.

    Returns:
        A list of dicts with the keys "Serial No", "Type", "University Name",
        "Address", "Zip", "State" and "Status". An empty list is returned
        when the "View All" click fails, the table does not appear within
        the timeout, or no rows are found. Rows that cannot be parsed are
        logged and skipped.
    """
    url = "https://www.ugc.gov.in/universitydetails/university?type=ddmCMsxJZgXH2S/m0uMOKQ=="
    row_xpath = "//table[@id='tbl']//tbody/tr"

    if page_number == 1:
        print(f"Opening URL: {url}")
        driver.get(url)

        try:
            # Click the "View All" button
            print("Clicking 'View All' button...")
            view_all_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.ID, "btnall"))
            )
            view_all_button.click()
            time.sleep(3)  # Wait for data to load
        except WebDriverException as e:
            # TimeoutException and click failures are WebDriverException subclasses.
            print("Error clicking 'View All' button:", e)
            return []

    print(f"Scraping data from page {page_number}...")

    try:
        # Wait for table data to load
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.XPATH, row_xpath))
        )
    except TimeoutException:
        print(f"Timeout error on page {page_number}. Skipping page.")
        return []

    # Get all table rows
    rows = driver.find_elements(By.XPATH, row_xpath)
    print(f"Found {len(rows)} universities on page {page_number}.")

    if not rows:
        return []

    data = []
    for index, row in enumerate(rows):
        print(f"Extracting data for university {index + 1} of {len(rows)}...")

        try:
            record = _extract_university_row(row)
        except (WebDriverException, ValueError) as e:
            # Best effort: a malformed or stale row must not abort the page.
            print(f"Error extracting details for university {index + 1}: {e}")
            continue

        data.append(record)
        print(f"Scraped: {record['Serial No']} | {record['University Name']} | {record['State']}")

    return data

# Function to save results to separate Excel files for each page
def save_to_excel(data, page_number):
    """Write one page of scraped records to its own Excel workbook.

    Args:
        data: List of row dicts for the page. A falsy value means there is
            nothing to write and no file is created.
        page_number: Page index used in the output filename and log messages.
    """
    if data:
        filename = f"ugc_universities_page_{page_number}.xlsx"
        frame = pd.DataFrame(data)
        frame.to_excel(filename, index=False)
        print(f"Data for page {page_number} saved to {filename}")
    else:
        print(f"No data found for page {page_number}, skipping file creation.")

def scrape_multiple_pages():
    """Scrape every page of the UGC university table, saving one file per page.

    Starts a webdriver, then repeatedly scrapes the current page, saves it
    with ``save_to_excel`` and clicks the "Next" button. The loop stops when
    a page yields no data, the "Next" button is marked disabled, or the
    button cannot be found or clicked. The browser is always closed on the
    way out, including when an exception propagates.
    """
    driver = setup_driver()
    page_number = 1

    try:
        while True:
            print(f"Processing Page {page_number}")

            page_data = scrape_ugc_page(driver, page_number)

            if not page_data:
                break  # Stop if no universities are found

            save_to_excel(page_data, page_number)

            try:
                # Check if "Next" button is available
                next_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.ID, "tbl_next"))
                )

                # get_attribute may return None when the element has no class
                # attribute; treat that the same as "not disabled".
                button_classes = next_button.get_attribute("class") or ""

                # If the button is disabled, stop pagination
                if "disabled" in button_classes:
                    print("Pagination ended. No more pages.")
                    break

                # Click "Next" button to go to the next page
                print("Clicking 'Next' button to load next page...")
                next_button.click()
                page_number += 1
                random_wait()
            except WebDriverException as e:
                # Covers the wait timing out (no "Next" button) and click failures.
                print(f"Pagination ended: {e}")
                break
    finally:
        # Always release the browser process, even if scraping or saving raised.
        driver.quit()

    print("Scraping completed.")

# Run the full scrape only when this file is executed directly, not on import.
if __name__ == "__main__":
    scrape_multiple_pages()


Processing Page 1
Opening URL: https://www.ugc.gov.in/universitydetails/university?type=ddmCMsxJZgXH2S/m0uMOKQ==
Clicking 'View All' button...
Scraping data from page 1...
Found 25 universities on page 1.
Extracting data for university 1 of 25...
Scraped: 1 | "Kaushalya" the Skill University | Gujarat
Extracting data for university 2 of 25...
Scraped: 2 | A.K.S. University | Madhya Pradesh
Extracting data for university 3 of 25...
Scraped: 3 | A.P.G. (Alakh Prakash Goyal) Shimla University | Himachal Pradesh
Extracting data for university 4 of 25...
Scraped: 4 | A.P.J. Abdul Kalam Technological University | Kerala
Extracting data for university 5 of 25...
Scraped: 5 | AAFT University of Media and Arts | Chhattisgarh
Extracting data for university 6 of 25...
Scraped: 6 | Abhilashi University | Himachal Pradesh
Extracting data for university 7 of 25...
Scraped: 7 | Abhyuday University | Madhya Pradesh
Extracting data for university 8 of 25...
Scraped: 8 | Academy of Maritime Education an