In [1]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException,TimeoutException
from selenium.webdriver.support import expected_conditions as EC
import time

In [2]:
# Route catalogue that drives the scrape; main_function() reads the
# 'state_names', 'routes' and 'links' columns from every row.
df = pd.read_csv(filepath_or_buffer='01_bus_routes.csv')

In [3]:
# Accumulator shared by the whole run: main_function() extends it with the
# rows returned for each route page.
all_bus_details = []

# XPath of the per-bus container elements that main_function() waits for and
# hands to scrape_current_page().
xpath_bus_details = '//div[@class="clearfix row-one"]'

# One Chrome session, reused by every helper defined below.
driver = webdriver.Chrome()

# Explicit waits bound to that session:
#   wait  - up to 10 seconds (used when looking for the "View Buses" buttons)
#   wait_ - up to 5 seconds  (used for the "no buses" banner and the bus list)
wait = WebDriverWait(driver, timeout=10)
wait_ = WebDriverWait(driver, timeout=5)

In [4]:
# Helper: expand every "View Buses" section on the current page
def view_buses():
    """Click each "View Buses" button found on the page, bottom-most first.

    Uses the module-level ``wait`` to locate the buttons. If none show up
    before the wait times out, the function returns without doing anything.
    A button whose click raises is skipped so the remaining ones still get
    clicked (best-effort).
    """
    locator = (By.XPATH, '//div[@class="button" and text()="View Buses"]')

    try:
        buttons = wait.until(EC.presence_of_all_elements_located(locator))
    except (NoSuchElementException, TimeoutException):
        return

    if not buttons:
        return

    # Work from the bottom of the page upwards so the buttons still to be
    # clicked stay in sight.
    for button in reversed(buttons):
        try:
            time.sleep(2)
            button.click()
            # 2 second pause so the dynamic content can load
            time.sleep(2)
        except Exception:
            continue

In [5]:
# Helper: scroll to the bottom of the page until no more content loads
def scroll_to_bottom(pause=1, max_scrolls=None):
    """Scroll the current page down repeatedly until its height stops growing.

    After every scroll the function sleeps for ``pause`` seconds to give
    lazily loaded content a chance to appear, then compares the document
    height with the height seen before the scroll. It stops as soon as the
    height is unchanged.

    Args:
        pause: Seconds to wait after each scroll. Defaults to 1, the value
            that used to be hard-coded.
        max_scrolls: Optional upper bound on the number of scrolls, as a
            safety net for pages that keep growing. ``None`` (the default)
            means no limit, which is the original behaviour.

    Uses the module-level ``driver``. Returns nothing.
    """
    height_script = "return document.body.scrollHeight"

    scrolls_done = 0
    previous_height = driver.execute_script(height_script)

    while max_scrolls is None or scrolls_done < max_scrolls:
        driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
        scrolls_done += 1

        # Wait for the page to load new content, if any.
        time.sleep(pause)

        new_height = driver.execute_script(height_script)
        if new_height == previous_height:
            break

        # Carry the height forward instead of re-querying the browser at the
        # top of the next iteration.
        previous_height = new_height


In [6]:
# Scrapes the bus cards of the page that is currently loaded.

# (xpath relative to one bus card, value used when nothing matches), listed in
# the order the fields appear in each output row.
_BUS_FIELD_XPATHS = (
    ('.//div[@class="travels lh-24 f-bold d-color"]', "NA"),            # bus name
    ('.//div[@class="bus-type f-12 m-top-16 l-color evBus"]', "NA"),    # bus type
    ('.//div[@class="dp-time f-19 d-color f-bold"]', "NA"),             # departing time
    ('.//div[@class="dur l-color lh-24"]', "NA"),                       # duration
    ('.//div[@class="bp-time f-19 d-color disp-Inline"]', "NA"),        # reaching time
    ('.//span[contains(@class, "f-19 f-bold") or contains(@class, "f-bold f-19")]', "NA"),  # price
    # An unrated bus is recorded as '0'. The earlier extra lookup for the
    # "rating_badge" span produced '0' whether or not the badge existed, so it
    # only cost a browser round trip and has been dropped.
    ('.//div[@class="rating-sec lh-24"]//span', "0"),                   # star rating
    ('.//div[contains(@class, "seat-left") and (contains(@class, "m-top-16") or contains(@class, "m-top-30"))]', "NA"),  # seats available
)


def _text_or_default(element, xpath, default="NA"):
    """Return the text of the first node under ``element`` matching ``xpath``, or ``default`` if none exists."""
    try:
        return element.find_element(By.XPATH, xpath).text
    except NoSuchElementException:
        return default


def scrape_current_page(state, route, link, elements_data):
    """Turn the bus card elements of one results page into plain rows.

    Args:
        state: State name, copied into every row.
        route: Route name, copied into every row.
        link: Page URL, copied into every row.
        elements_data: Iterable of bus card elements; each must offer
            ``find_element(By.XPATH, ...)``.

    Returns:
        A list with one 11-item list per card, in this order: state, route,
        link, bus name, bus type, departing time, duration, reaching time,
        price, star rating, seats available. A field that is not present on
        a card becomes "NA", except the star rating, which becomes '0'.
        An empty ``elements_data`` gives an empty list.
    """
    bus_details = []
    for element in elements_data:
        scraped_fields = [
            _text_or_default(element, xpath, default)
            for xpath, default in _BUS_FIELD_XPATHS
        ]
        bus_details.append([state, route, link] + scraped_fields)

    return bus_details


In [7]:
def check_oops_message():
    """Report whether the loaded page shows the "Oops! No buses found." banner.

    Waits on the module-level ``wait_`` for a matching element.

    Returns:
        True if the banner appears before ``wait_`` times out, False
        otherwise.

    Note:
        When the banner is absent the call blocks for the full ``wait_``
        timeout before returning False, so every page that does list buses
        pays that delay.
    """
    # A div counts as the banner when either
    #   * its class contains 'oops-wrapper' and it has a descendant h3 with
    #     the message, or
    #   * its class is exactly 'oops-wrapper new_oops_wrapper' and the message
    #     is its own text.
    # The h3 test must be relative ('.//h3'): inside a predicate a bare '//h3'
    # is evaluated from the document root, so it would be satisfied by an h3
    # anywhere on the page rather than one inside the candidate div.
    check_xpath = (
        "//div[(contains(@class, 'oops-wrapper') and .//h3[text()='Oops! No buses found.']) or "
        "(@class='oops-wrapper new_oops_wrapper' and text()='Oops! No buses found.')]"
    )
    try:
        wait_.until(EC.presence_of_element_located((By.XPATH, check_xpath)))
        return True
    except (NoSuchElementException, TimeoutException):
        return False

In [8]:

def main_function():
    """Scrape every route listed in ``df`` and collect the rows in ``all_bus_details``.

    For each row of the module-level ``df`` the route page is opened, pages
    showing the "no buses" banner are skipped, all "View Buses" sections are
    expanded, the page is scrolled to the bottom, and the bus cards found are
    scraped and appended to the module-level ``all_bus_details``.

    A failure on one route is printed and the loop moves on to the next
    route. The browser is always closed when the function exits, including
    when the run is interrupted. Because of that, ``driver`` has to be
    created again before this function can be called a second time.
    """
    try:
        # The window size does not depend on the route, so set it once rather
        # than on every page.
        driver.maximize_window()

        for _, row in df.iterrows():
            state = row['state_names']
            route = row['routes']
            link = row['links']

            try:
                driver.get(link)

                if check_oops_message():
                    continue

                view_buses()  # expand every "View Buses" section
                scroll_to_bottom()  # load lazily rendered results

                # until() keeps polling while the condition is falsy, so a
                # page without bus cards surfaces as TimeoutException, never
                # as an empty list. Map it to "no buses" so it is reported as
                # such instead of as a generic error.
                try:
                    elements_data = wait_.until(
                        EC.presence_of_all_elements_located((By.XPATH, xpath_bus_details))
                    )
                except TimeoutException:
                    elements_data = []

                if elements_data:
                    current_page_data = scrape_current_page(state, route, link, elements_data)
                    all_bus_details.extend(current_page_data)
                else:
                    print(f"No buses available for state: {state}, route: {route}, link: {link}")
            except Exception as e:
                print(f"Error on line: state: {state}, route: {route}, link: {link}")
                print(f"Error_details: {e}")
    finally:
        # Release the browser even if the loop is interrupted part-way.
        driver.quit()

In [9]:
# Run the whole scrape; the rows end up in all_bus_details.
main_function()

# One row per bus. The column order follows the rows built by
# scrape_current_page().
final_data = pd.DataFrame(
    data=all_bus_details,
    columns=[
        'state',
        'route',
        'link',
        'busname',
        'bustype',
        'departing_time',
        'duration',
        'reaching_time',
        'price',
        'star_rating',
        'seats_available',
    ],
)

# NOTE(review): absolute, machine-specific output location, while the input
# CSV is read from a relative path - consider making this configurable.
final_data.to_csv('G:/PROJECT - REDBUS/02_big_data.csv', index=False)

In [10]:
# Show the scraped table as the cell output.
final_data

Unnamed: 0,state,route,link,busname,bustype,departing_time,duration,reaching_time,price,star_rating,seats_available
0,Andhra Pradesh,Hyderabad to Vijayawada,https://www.redbus.in/bus-tickets/hyderabad-to...,APSRTC - 4916,"SUPER LUXURY (NON-AC, 2 + 2 PUSH BACK)",18:45,07h 30m,02:15,469,3.4,23 Seats available
1,Andhra Pradesh,Hyderabad to Vijayawada,https://www.redbus.in/bus-tickets/hyderabad-to...,APSRTC - 2680,VENNELA (A.C. SLEEPER),19:00,07h 00m,02:00,839,4.2,14 Seats available
2,Andhra Pradesh,Hyderabad to Vijayawada,https://www.redbus.in/bus-tickets/hyderabad-to...,APSRTC - 2798,INDRA(A.C. Seater),19:15,07h 15m,02:30,567,3.3,33 Seats available
3,Andhra Pradesh,Hyderabad to Vijayawada,https://www.redbus.in/bus-tickets/hyderabad-to...,APSRTC - 4972,"SUPER LUXURY (NON-AC, 2 + 2 PUSH BACK)",19:20,08h 00m,03:20,469,4.2,18 Seats available
4,Andhra Pradesh,Hyderabad to Vijayawada,https://www.redbus.in/bus-tickets/hyderabad-to...,APSRTC - 2563,INDRA(A.C. Seater),19:30,05h 55m,01:25,528,3.3,29 Seats available
...,...,...,...,...,...,...,...,...,...,...,...
8852,West Bengal,Kolkata to Digha,https://www.redbus.in/bus-tickets/kolkata-to-d...,Anadi Travels (Disha Gold),A/C Seater Push Back (2+3),23:00,05h 40m,04:40,500,1.0,60 Seats available
8853,West Bengal,Kolkata to Durgapur (West Bengal),https://www.redbus.in/bus-tickets/kolkata-to-d...,Royal Cruiser,Scania Multi-Axle AC Semi Sleeper (2+2),19:30,03h 45m,23:15,463,3.4,25 Seats available
8854,West Bengal,Kolkata to Asansol (West Bengal),https://www.redbus.in/bus-tickets/kolkata-to-a...,Royal Cruiser,Scania Multi-Axle AC Semi Sleeper (2+2),19:30,04h 45m,00:15,509,3.8,14 Seats available
8855,West Bengal,Kolkata to Asansol (West Bengal),https://www.redbus.in/bus-tickets/kolkata-to-a...,Beauty Travels,NON A/C Seater / Sleeper (2+2),20:20,04h 10m,00:30,380,2.9,36 Seats available
