In [6]:
import time
import random
import pandas as pd
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException

def setup_driver(driver_path='C:\\Users\\yujit\\OneDrive\\Desktop\\chromedriver-win64\\chromedriver.exe'):
    """Create and return a configured Chrome WebDriver.

    Args:
        driver_path: Filesystem path to the chromedriver executable. Defaults
            to the location originally hard-coded in this script.

    Returns:
        A ``webdriver.Chrome`` instance started with the options set below.
    """
    # Imported locally so this function is self-contained: the file's
    # top-level imports do not bring the Service class into scope.
    from selenium.webdriver.chrome.service import Service

    options = Options()
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--window-size=1920,1080')
    options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')

    # Use a distinct local name; the original `service = service(...)`
    # referenced the local before assignment and raised UnboundLocalError.
    chrome_service = Service(driver_path)
    driver = webdriver.Chrome(service=chrome_service, options=options)
    return driver

def random_wait(min_seconds=1, max_seconds=2):
    """Sleep for a random duration and print how long the pause is.

    Args:
        min_seconds: Lower bound of the delay, in seconds (default 1).
        max_seconds: Upper bound of the delay, in seconds (default 2).

    Returns:
        The number of seconds that was slept.

    Raises:
        ValueError: If either bound is negative.
    """
    # Validate up front so nothing is printed for a delay that cannot happen
    # (time.sleep itself rejects negative values).
    if min_seconds < 0 or max_seconds < 0:
        raise ValueError("wait bounds must be non-negative")

    wait_time = random.uniform(min_seconds, max_seconds)
    print(f"Waiting for {wait_time:.2f} seconds...")
    time.sleep(wait_time)
    return wait_time

def save_to_excel(data, file_path):
    """Append rows to the Excel workbook at ``file_path``.

    ``data`` is turned into a DataFrame with the six fixed college columns.
    If the workbook already exists, its rows are read back and the new rows
    are placed after them; the combined table is then written out (without
    the index) and a confirmation line is printed.
    """
    column_names = ["CD Rank", "College Name", "Course Fee", "Average Package", "User Review", "Ranking"]
    new_rows = pd.DataFrame(data, columns=column_names)

    combined = new_rows
    if os.path.exists(file_path):
        # Keep what is already on disk and put the fresh rows after it.
        previous_rows = pd.read_excel(file_path)
        combined = pd.concat([previous_rows, new_rows], ignore_index=True)

    combined.to_excel(file_path, index=False)
    print(f"Data saved to {file_path}")

def _extract_college_row(college, name):
    """Build one output row from the elements around a college heading.

    Args:
        college: The heading WebElement for the college.
        name: The already-extracted college name.

    Returns:
        ``[cd_rank, name, course_fee, avg_package, user_review, ranking]``.

    Raises:
        NoSuchElementException: If any of the related elements is missing.
    """
    cd_rank = college.find_element(By.XPATH, "./preceding::td[contains(@class, 'font-weight-medium text-lg position-relative')][1]").text.strip()
    course_fee = college.find_element(By.XPATH, "./following::span[contains(@class, 'text-lg text-green d-block font-weight-bold mb-1')][1]").text.strip()
    avg_package = college.find_element(By.XPATH, "./following::span[contains(@class, 'text-green d-block mb-1')][1]").text.strip()
    user_review = college.find_element(By.XPATH, "./following::span[contains(@class, 'text-lg text-primary d-block font-weight-medium mb-1')][1]").text.strip()
    ranking = college.find_element(By.XPATH, "./following::span[contains(@class, 'jsx-2794970405')][1]").text.strip()
    return [cd_rank, name, course_fee, avg_package, user_review, ranking]


def scrape_collegedunia(url="https://collegedunia.com/btech-colleges"):
    """Scrape college listings from ``url`` into an Excel workbook.

    Opens the page in Chrome, repeatedly collects the college entries that
    are currently in the DOM, appends the ones not yet seen (by name) to
    ``college dunia/college_data.xlsx``, and scrolls to the bottom to load
    more. Stops when the document height no longer grows after a scroll.

    Args:
        url: Listing page to scrape.

    Returns:
        None. Results are written to the Excel file as a side effect.
    """
    driver = setup_driver()
    # try/finally guarantees the browser is closed even when an unexpected
    # exception escapes the scraping loop; the original only reached
    # driver.quit() on normal exit or TimeoutException.
    try:
        driver.get(url)
        random_wait()

        os.makedirs("college dunia", exist_ok=True)
        file_path = os.path.join("college dunia", "college_data.xlsx")

        # Make sure a workbook with the header row exists before appending.
        if not os.path.exists(file_path):
            pd.DataFrame(columns=["CD Rank", "College Name", "Course Fee", "Average Package", "User Review", "Ranking"]).to_excel(file_path, index=False)

        scraped_colleges = set()
        last_height = 0

        try:
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))

            while True:
                college_data = []
                colleges = driver.find_elements(By.XPATH, "//h3[contains(@class, 'font-weight-medium text-lg mb-0')]")

                for college in colleges:
                    try:
                        name = college.text.strip()
                        # Each pass re-reads the whole DOM, so skip entries
                        # that were already saved in an earlier pass.
                        if name in scraped_colleges:
                            continue

                        college_data.append(_extract_college_row(college, name))
                        scraped_colleges.add(name)
                    except NoSuchElementException:
                        # Entry is missing one of the expected fields; it is
                        # not marked as scraped, so a later pass may retry it.
                        continue

                if college_data:
                    save_to_excel(college_data, file_path)

                # An unchanged page height since the previous pass means the
                # last scroll loaded no new content.
                new_height = driver.execute_script("return document.documentElement.scrollHeight")
                if new_height == last_height:
                    print("No more new content to load. Stopping scrape.")
                    break
                last_height = new_height

                driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
                random_wait()

        except TimeoutException:
            print("Timed out waiting for page elements to load")
    finally:
        driver.quit()

# Example usage: run the scraper against the B.Tech colleges listing page.
# This starts Chrome and writes results to "college dunia/college_data.xlsx".
scrape_collegedunia("https://collegedunia.com/btech-colleges")


Waiting for 1.32 seconds...
Data saved to college dunia\college_data.xlsx
Waiting for 1.21 seconds...
Data saved to college dunia\college_data.xlsx
Waiting for 1.21 seconds...
Data saved to college dunia\college_data.xlsx
Waiting for 1.83 seconds...
Data saved to college dunia\college_data.xlsx
Waiting for 1.64 seconds...
Data saved to college dunia\college_data.xlsx
Waiting for 1.42 seconds...
Data saved to college dunia\college_data.xlsx
Waiting for 1.10 seconds...
Data saved to college dunia\college_data.xlsx
Waiting for 1.06 seconds...
Data saved to college dunia\college_data.xlsx
Waiting for 1.80 seconds...
Data saved to college dunia\college_data.xlsx
Waiting for 1.51 seconds...
Data saved to college dunia\college_data.xlsx
Waiting for 1.70 seconds...
Data saved to college dunia\college_data.xlsx
Waiting for 1.09 seconds...
Data saved to college dunia\college_data.xlsx
Waiting for 1.17 seconds...
Data saved to college dunia\college_data.xlsx
Waiting for 1.28 seconds...
Data saved