In [8]:
import sys
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [9]:
# Redbus operator landing pages to scrape, one per state transport corporation.
# Each URL is the shared booking root + the operator slug + an optional
# tracking suffix (the KAAC page is listed without one).
REDBUS_BOOKING_ROOT = "https://www.redbus.in/online-booking/"
RTC_TILE_SUFFIX = "/?utm_source=rtchometile"

state_links = [
    f"{REDBUS_BOOKING_ROOT}{slug}{suffix}"
    for slug, suffix in (
        ("apsrtc", RTC_TILE_SUFFIX),
        ("ksrtc-kerala", RTC_TILE_SUFFIX),
        ("tsrtc", RTC_TILE_SUFFIX),
        ("ktcl", RTC_TILE_SUFFIX),
        ("rsrtc", RTC_TILE_SUFFIX),
        ("south-bengal-state-transport-corporation-sbstc", RTC_TILE_SUFFIX),
        ("hrtc", RTC_TILE_SUFFIX),
        ("kaac-transport", ""),
        ("uttar-pradesh-state-road-transport-corporation-upsrtc", RTC_TILE_SUFFIX),
        ("wbtc-ctc", RTC_TILE_SUFFIX),
    )
]

In [10]:
# Module-level accumulators filled by scrape_data(); the three lists are kept
# index-aligned (one entry per scraped route).
states, routes, links = [], [], []

# XPath selectors used while scraping an operator page.
xpath_statename = "//h1[@class='D120_search_h1 D120_default' and @id='toc_id_2']"
xpath_pagination_table = '//div[@class="DC_117_paginationTable"]'
xpath_routes = '//a[@class="route"]'

# Destination CSV for the scraped routes table.
temp_path = 'G:/PROJECT - REDBUS/01_bus_routes.csv'

# Start Chrome; report the failure and stop the cell if it cannot be launched.
try:
    driver = webdriver.Chrome()
except Exception as err:
    print(f"Error initializing WebDriver: {err}")
    sys.exit(1)

# Shared explicit wait bound to the driver (10 second timeout).
wait = WebDriverWait(driver, 10)

In [11]:
def rename_state(text):
    """Translate an operator page heading into a short state label.

    Parameters
    ----------
    text : str
        Heading text read from the operator's page.

    Returns
    -------
    str
        The label mapped to ``text`` when it is a known heading (exact,
        case-sensitive match); otherwise ``text`` itself, unchanged.
    """
    heading_to_state = {
        "APSRTC": "Andhra Pradesh",
        "Kerala RTC Online Ticket Booking": "Kerala",
        "TSRTC Online Bus Ticket Booking": "Telangana",
        "Kadamba Transport Corporation Limited (KTCL)": "Kadamba",
        "RSRTC": "Rajasthan",
        "South Bengal State Transport Corporation (SBSTC)": "South Bengal",
        "HRTC": "Himachal",
        "KAAC TRANSPORT": "Assam",
        "UPSRTC": "Uttar Pradesh",
        "WBTC (CTC)": "West Bengal",
    }
    # Unknown headings pass through untouched so no scraped row loses its label.
    if text in heading_to_state:
        return heading_to_state[text]
    return text

In [12]:
def _scrape_state(link):
    """Return ``(state, route_text, route_href)`` rows for one operator page.

    Opens ``link`` with the module-level ``driver``, maps the page heading to a
    state label via ``rename_state`` and visits every pagination tab,
    collecting the ``text`` and ``href`` attributes of each route anchor.
    Any error other than a missing pagination container propagates to the
    caller.
    """
    driver.get(link)
    driver.execute_script("window.scrollBy(0,1500)")  # Scroll down by 1500 pixels
    time.sleep(1)

    # The number of child <div>s of the pagination container is the page count.
    try:
        page_elements = wait.until(
            EC.presence_of_all_elements_located((By.XPATH, f"{xpath_pagination_table}/div"))
        )
        total_pages = len(page_elements)
    except TimeoutException:
        # No pagination container appeared within the wait timeout: treat the
        # listing as a single page instead of dropping the whole operator.
        total_pages = 1

    # Operator heading -> short state label.
    heading = wait.until(EC.presence_of_element_located((By.XPATH, xpath_statename)))
    state = rename_state(heading.text)

    rows = []
    for page in range(1, total_pages + 1):
        anchors = wait.until(EC.presence_of_all_elements_located((By.XPATH, xpath_routes)))
        for anchor in anchors:
            rows.append((state, anchor.get_attribute('text'), anchor.get_attribute('href')))

        if page < total_pages:
            # Click the tab whose label is the next page number.
            next_page_xpath = f'//div[@class="DC_117_pageTabs " and text()="{page + 1}"]'
            driver.find_element(By.XPATH, next_page_xpath).click()
            # NOTE(review): the fixed 0.2 s pause assumes the next page's routes
            # are rendered by then - confirm; a slower render could re-read the
            # previous page's anchors.
            time.sleep(0.2)
    return rows


def scrape_data(state_links):
    """Scrape route names and links from every operator page in ``state_links``.

    Rows are appended to the module-level ``states``, ``routes`` and ``links``
    lists, which stay index-aligned. A failure on one page is printed together
    with its URL and the remaining pages are still scraped; rows from the
    failed page are discarded, so the lists never hold a partially scraped
    state.

    Parameters
    ----------
    state_links : iterable of str
        Operator landing-page URLs to visit.

    Returns
    -------
    None

    Notes
    -----
    Uses the module-level ``driver`` and ``wait`` objects and always calls
    ``driver.quit()`` before returning, so the WebDriver has to be created
    again before this function can be called a second time.
    """
    try:
        # The window size is the same for every page, so maximise it once.
        driver.maximize_window()
        for link in state_links:
            try:
                rows = _scrape_state(link)
            except Exception as e:
                # Keep going: one broken page should not cost the other states.
                print(f"Error scraping {link}: {e}")
                continue

            # Commit a state's rows only after the whole page set succeeded.
            states.extend(row[0] for row in rows)
            routes.extend(row[1] for row in rows)
            links.extend(row[2] for row in rows)
    except Exception as e:
        print(e)
    finally:
        driver.quit()

In [13]:
# Run the scraper over every operator page. Results accumulate in the
# module-level states / routes / links lists. scrape_data() quits the driver
# when it finishes, so re-run the WebDriver setup cell before calling it again.
scrape_data(state_links)

In [14]:
# Assemble the three scraped columns into one table, then persist it as CSV
# without the DataFrame index.
route_columns = {
    'state_names': states,
    'routes': routes,
    'links': links,
}
DATA = pd.DataFrame(route_columns)
DATA.to_csv(temp_path, index=False)

In [15]:
# Display the scraped table (pandas abbreviates long frames in the rendered output).
DATA

Unnamed: 0,state_names,routes,links
0,Andhra Pradesh,Hyderabad to Vijayawada,https://www.redbus.in/bus-tickets/hyderabad-to...
1,Andhra Pradesh,Vijayawada to Hyderabad,https://www.redbus.in/bus-tickets/vijayawada-t...
2,Andhra Pradesh,Hyderabad to Ongole,https://www.redbus.in/bus-tickets/hyderabad-to...
3,Andhra Pradesh,Kakinada to Visakhapatnam,https://www.redbus.in/bus-tickets/kakinada-to-...
4,Andhra Pradesh,Bangalore to Tirupati,https://www.redbus.in/bus-tickets/bangalore-to...
...,...,...,...
311,West Bengal,Habra to Nandakumar (west bengal),https://www.redbus.in/bus-tickets/habra-to-nan...
312,West Bengal,Habra to Kolaghat,https://www.redbus.in/bus-tickets/habra-to-kol...
313,West Bengal,Kolkata to Mayapur ISKCON,https://www.redbus.in/bus-tickets/kolkata-to-m...
314,West Bengal,Habra to Heria,https://www.redbus.in/bus-tickets/habra-to-heria
