## DATA SCRAPING WITH SELENIUM AND PYTHON

In [2]:
# Importing required modules

import time
import numpy as np 
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import ActionChains
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException

### 1. ANDHRA

#### ROUTE_NAMES AND ROUTE_LINKS

In [8]:
# opens browser
driver_AP = webdriver.Chrome()
time.sleep(2)

# loads webpage
driver_AP.get("https://www.redbus.in/online-booking/apsrtc/?utm_source=rtchometile")
driver_AP.maximize_window()

# waits for the webpage to load it's contents
wait = WebDriverWait(driver_AP, 10)

def Andhra_routes_and_links(path):
    Andhra_routes = []
    Andhra_links = []

    # route links
    for i in range(1, 6):
        paths = driver_AP.find_elements(By.XPATH, path)

        for link in paths:
            l = link.get_attribute("href")
            Andhra_links.append(l)

         # route names
        for route in paths:
            Andhra_routes.append(route.text)

        try:
            # Waits for the required element to be present
            pagination_container = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@class="DC_117_paginationTable"]')))
            # Finds the element
            next_pg_btn = pagination_container.find_element(By.XPATH, f'//div[@class="DC_117_pageTabs " and text()={i+1}]')
            
            actions = ActionChains(driver_AP)
            actions.move_to_element(next_pg_btn).perform()
            time.sleep(2)

            # Clicks the element
            next_pg_btn.click()
            
        except NoSuchElementException:
            print('No more pages to paginate')
            break

    return Andhra_routes, Andhra_links

Andhra_routes, Andhra_links = Andhra_routes_and_links("//a[@class='route']")

No more pages to paginate


In [9]:
# Convert lists into df
df_AP = pd.DataFrame({"Route_name":Andhra_routes, "Route_link":Andhra_links})

# Convert df to csv file 
df_AP.to_csv('df_AP.csv', index = False)

# Read the csv file
df_1 = pd.read_csv('df_AP.csv')
df_1

Unnamed: 0,Route_name,Route_link
0,Hyderabad to Vijayawada,https://www.redbus.in/bus-tickets/hyderabad-to...
1,Vijayawada to Hyderabad,https://www.redbus.in/bus-tickets/vijayawada-t...
2,Hyderabad to Ongole,https://www.redbus.in/bus-tickets/hyderabad-to...
3,Kakinada to Visakhapatnam,https://www.redbus.in/bus-tickets/kakinada-to-...
4,Bangalore to Tirupati,https://www.redbus.in/bus-tickets/bangalore-to...
5,Bangalore to Kadapa,https://www.redbus.in/bus-tickets/bangalore-to...
6,Ongole to Hyderabad,https://www.redbus.in/bus-tickets/ongole-to-hy...
7,Kadapa to Bangalore,https://www.redbus.in/bus-tickets/kadapa-to-ba...
8,Chittoor (Andhra Pradesh) to Bangalore,https://www.redbus.in/bus-tickets/chittoor-and...
9,Visakhapatnam to Kakinada,https://www.redbus.in/bus-tickets/visakhapatna...


#### BUS_DETAILS

In [10]:
# retrive the bus details
driver_AP = webdriver.Chrome()
Bus_names_AP = []
Bus_types_AP = []
Start_Time_AP = []
End_Time_AP = []
Ratings_AP = []
Total_Duration_AP = []
Prices_AP = []
Seats_Available_AP = []
Route_names = []
Route_links = []

for i,r in df_1.iterrows():
    link=r["Route_link"]
    routes=r["Route_name"]

    # Loop through each link
    driver_AP.get(link)
    driver_AP.maximize_window()
    time.sleep(2)  

    # Click on elements to reveal bus details
    elements = driver_AP.find_elements(By.XPATH, f"//a[contains(@href, '{link}')]")
    for element in elements:
        element.click()
        time.sleep(2)

    # click elements to views bus
    try:
        clicks = driver_AP.find_element(By.XPATH, "//div[@class='button']")
        clicks = WebDriverWait(driver_AP, 5).until(EC.element_to_be_clickable((By.XPATH, "//div[@class='button']")))
        clicks.click()
        time.sleep(2)

        scrolling = True
        while scrolling:
            old_page_source = driver_AP.page_source

            # Use ActionChains to go to the end of the page
            ActionChains(driver_AP).send_keys(Keys.END).perform()

            time.sleep(5)  # Adjust sleep time as needed

            new_page_source = driver_AP.page_source

            if new_page_source == old_page_source:
                scrolling = False

    except NoSuchElementException: 

        scrolling = True
        while scrolling:
            old_page_source = driver_AP.page_source

            # Use ActionChains to go to the end of the page
            ActionChains(driver_AP).send_keys(Keys.END).perform()

            time.sleep(5)  # Adjust sleep time as needed

            new_page_source = driver_AP.page_source

            if new_page_source == old_page_source:
                scrolling = False

    # Extract bus details
    bus_name = driver_AP.find_elements(By.XPATH, "//div[@class='travels lh-24 f-bold d-color']")
    bus_type = driver_AP.find_elements(By.XPATH, "//div[@class='bus-type f-12 m-top-16 l-color evBus']")
    start_time = driver_AP.find_elements(By.XPATH, "//*[@class='dp-time f-19 d-color f-bold']")
    end_time = driver_AP.find_elements(By.XPATH, "//*[@class='bp-time f-19 d-color disp-Inline']")
    total_duration = driver_AP.find_elements(By.XPATH, "//*[@class='dur l-color lh-24']")
    rating = driver_AP.find_elements(By.XPATH,"//div[@class='clearfix row-one']/div[@class='column-six p-right-10 w-10 fl']")
    price = driver_AP.find_elements(By.XPATH, '//*[@class="fare d-block"]')
    seats = driver_AP.find_elements(By.XPATH, "//div[contains(@class, 'seat-left')]")

    # Append data to respective lists
    for bus in bus_name:
        Bus_names_AP.append(bus.text)
        Route_links.append(link)
        Route_names.append(routes)

    for bus_type_elem in bus_type:
        Bus_types_AP.append(bus_type_elem.text)

    for start_time_elem in start_time:
        Start_Time_AP.append(start_time_elem.text)

    for end_time_elem in end_time:
        End_Time_AP.append(end_time_elem.text)

    for total_duration_elem in total_duration:
        Total_Duration_AP.append(total_duration_elem.text)

    for ratings in rating:
        Ratings_AP.append(ratings.text)

    for price_elem in price:
        Prices_AP.append(price_elem.text)

    for seats_elem in seats:
        Seats_Available_AP.append(seats_elem.text)

print("Successfully Completed")

Successfully Completed


In [11]:
# convert lists to a dictionary
data_1 = {
    "Bus_name": Bus_names_AP,
    "Bus_type": Bus_types_AP,
    "Start_time": Start_Time_AP,
    "End_time": End_Time_AP,
    "Total_duration": Total_Duration_AP,
    "Price": Prices_AP,
    "Seats_Available": Seats_Available_AP,
    "Ratings": Ratings_AP,
    "Route_link": Route_links,
    "Route_name": Route_names,
}

# convert dictionary to dataframe
df_buses_1 = pd.DataFrame(data_1)

# convert dataframe to csv and read
df_buses_1.to_csv("df_buses_1.csv",index=False)
df_buses_1 = pd.read_csv("df_buses_1.csv")
df_buses_1

Unnamed: 0,Bus_name,Bus_type,Start_time,End_time,Total_duration,Price,Seats_Available,Ratings,Route_link,Route_name
0,Sri KVR Travels,A/C Sleeper (2+1),23:35,05:15,05h 40m,INR 560,13 Seats available,4.6\n295,https://www.redbus.in/bus-tickets/hyderabad-to...,Hyderabad to Vijayawada
1,FRESHBUS,Electric A/C Seater (2+2),23:10,05:35,06h 25m,450,28 Seats available,4.5\n504,https://www.redbus.in/bus-tickets/hyderabad-to...,Hyderabad to Vijayawada
2,IntrCity SmartBus,A/C Seater / Sleeper (2+1),23:50,05:35,05h 45m,INR 433,20 Seats available,4.4\n949,https://www.redbus.in/bus-tickets/hyderabad-to...,Hyderabad to Vijayawada
3,AdSri Durga Malleswari Travels,A/C Sleeper (2+1),23:10,04:35,05h 25m,665,17 Seats available,4.0\n111,https://www.redbus.in/bus-tickets/hyderabad-to...,Hyderabad to Vijayawada
4,IntrCity SmartBus,Scania AC Multi Axle Sleeper (2+1),23:59,06:30,06h 31m,INR 595,16 Seats available,4.5\n465,https://www.redbus.in/bus-tickets/hyderabad-to...,Hyderabad to Vijayawada
...,...,...,...,...,...,...,...,...,...,...
3639,True Bus,Volvo Multi Axle B9R A/C Sleeper (2+1),17:00,23:59,06h 59m,INR 995,2 Seats available,3.3,https://www.redbus.in/bus-tickets/hyderabad-to...,Hyderabad to Nandyal
3640,True Bus,Volvo Multi Axle B9R A/C Sleeper (2+1),22:30,04:30,06h 00m,INR 1349,27 Seats available,3.3,https://www.redbus.in/bus-tickets/hyderabad-to...,Hyderabad to Nandyal
3641,Muneer Travels,Non A/C Seater / Sleeper (2+1),22:15,03:45,05h 30m,INR 500,33 Seats available,3.0,https://www.redbus.in/bus-tickets/hyderabad-to...,Hyderabad to Nandyal
3642,DHANUNJAYA TRAVELS,A/C Sleeper (2+1),21:00,03:00,06h 00m,INR 799,21 Seats available,2.2,https://www.redbus.in/bus-tickets/hyderabad-to...,Hyderabad to Nandyal


### 2. KERALA

#### ROUTE_NAMES AND ROUTE_LINKS

In [17]:
# opens browser
driver_K = webdriver.Chrome()
time.sleep(2)

# loads webpage
driver_K.get("https://www.redbus.in/online-booking/ksrtc-kerala/?utm_source=rtchometile")
driver_K.maximize_window()

# waits for the webpage to load it's contents
wait = WebDriverWait(driver_K, 10)

def Kerala_routes_and_links(path):
    Kerala_routes = []
    Kerala_links = []

    # route links
    for i in range(1, 3):
        paths = driver_K.find_elements(By.XPATH, path)

        for link in paths:
            l = link.get_attribute("href")
            Kerala_links.append(l)

         # route names
        for route in paths:
            Kerala_routes.append(route.text)

        try:
            # Waits for the required element to be present
            pagination_container = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@class="DC_117_paginationTable"]')))
            # Finds the element
            next_pg_btn = pagination_container.find_element(By.XPATH, f'//div[@class="DC_117_pageTabs " and text()={i+1}]')
            
            actions = ActionChains(driver_K)
            actions.move_to_element(next_pg_btn).perform()
            time.sleep(2)

            # Clicks the element
            next_pg_btn.click()
            
        except NoSuchElementException:
            print('No more pages to paginate')
            break

    return Kerala_routes, Kerala_links

Kerala_routes, Kerala_links = Kerala_routes_and_links("//a[@class='route']")

No more pages to paginate


In [18]:
# Convert lists into df
df_K = pd.DataFrame({"Route_name":Kerala_routes, "Route_link":Kerala_links})

# Convert df to csv file 
df_K.to_csv('df_K.csv', index = False)

# Read the csv file
df_2 = pd.read_csv('df_K.csv')
df_2

Unnamed: 0,Route_name,Route_link
0,Bangalore to Kozhikode,https://www.redbus.in/bus-tickets/bangalore-to...
1,Kozhikode to Bangalore,https://www.redbus.in/bus-tickets/kozhikode-to...
2,Kozhikode to Ernakulam,https://www.redbus.in/bus-tickets/kozhikode-to...
3,Ernakulam to Kozhikode,https://www.redbus.in/bus-tickets/ernakulam-to...
4,Bangalore to Kannur,https://www.redbus.in/bus-tickets/bangalore-to...
5,Kozhikode to Mysore,https://www.redbus.in/bus-tickets/kozhikode-to...
6,Kannur to Bangalore,https://www.redbus.in/bus-tickets/kannur-to-ba...
7,Kozhikode to Thiruvananthapuram,https://www.redbus.in/bus-tickets/kozhikode-to...
8,Mysore to Kozhikode,https://www.redbus.in/bus-tickets/mysore-to-ko...
9,Bangalore to Kalpetta (kerala),https://www.redbus.in/bus-tickets/bangalore-to...


#### BUS_DETAILS

In [21]:
# retrive the bus details
driver_K = webdriver.Chrome()
Bus_names_K = []
Bus_types_K = []
Start_Time_K = []
End_Time_K = []
Ratings_K = []
Total_Duration_K = []
Prices_K = []
Seats_Available_K = []
Route_names = []
Route_links = []

for i,r in df_2.iterrows():
    link=r["Route_link"]
    routes=r["Route_name"]

    # Loop through each link
    driver_K.get(link)
    driver_K.maximize_window()
    time.sleep(2)  

    # ClicK on elements to reveal bus details
    elements = driver_K.find_elements(By.XPATH, f"//a[contains(@href, '{link}')]")
    for element in elements:
        element.click()
        time.sleep(2)

    # clicK elements to views bus
    try:
        clicks = driver_K.find_element(By.XPATH, "//div[@class='button']")
        clicks = WebDriverWait(driver_K, 5).until(EC.element_to_be_clickable((By.XPATH, "//div[@class='button']")))
        clicks.click()
        time.sleep(2)

        scrolling = True
        while scrolling:
            old_page_source = driver_K.page_source

            # Use ActionChains to go to the end of the page
            ActionChains(driver_K).send_keys(Keys.END).perform()

            time.sleep(5)  

            new_page_source = driver_K.page_source

            if new_page_source == old_page_source:
                scrolling = False

    except NoSuchElementException:  

        scrolling = True
        while scrolling:
            old_page_source = driver_K.page_source

            # Use ActionChains to go to the end of the page
            ActionChains(driver_K).send_keys(Keys.END).perform()

            time.sleep(5)  

            new_page_source = driver_K.page_source

            if new_page_source == old_page_source:
                scrolling = False

    # Extract bus details
    bus_name = driver_K.find_elements(By.XPATH, "//div[@class='travels lh-24 f-bold d-color']")
    bus_type = driver_K.find_elements(By.XPATH, "//div[@class='bus-type f-12 m-top-16 l-color evBus']")
    start_time = driver_K.find_elements(By.XPATH, "//*[@class='dp-time f-19 d-color f-bold']")
    end_time = driver_K.find_elements(By.XPATH, "//*[@class='bp-time f-19 d-color disp-Inline']")
    total_duration = driver_K.find_elements(By.XPATH, "//*[@class='dur l-color lh-24']")
    rating = driver_K.find_elements(By.XPATH,"//div[@class='clearfix row-one']/div[@class='column-six p-right-10 w-10 fl']")
    price = driver_K.find_elements(By.XPATH, '//*[@class="fare d-block"]')
    seats = driver_K.find_elements(By.XPATH, "//div[contains(@class, 'seat-left')]")

    # Append data to respective lists
    for bus in bus_name:
        Bus_names_K.append(bus.text)
        Route_links.append(link)
        Route_names.append(routes)

    for bus_type_elem in bus_type:
        Bus_types_K.append(bus_type_elem.text)

    for start_time_elem in start_time:
        Start_Time_K.append(start_time_elem.text)

    for end_time_elem in end_time:
        End_Time_K.append(end_time_elem.text)

    for total_duration_elem in total_duration:
        Total_Duration_K.append(total_duration_elem.text)

    for ratings in rating:
        Ratings_K.append(ratings.text)

    for price_elem in price:
        Prices_K.append(price_elem.text)

    for seats_elem in seats:
        Seats_Available_K.append(seats_elem.text)

print("Successfully Completed")

Successfully Completed


In [22]:
# convert lists to a dictionary
data_2 = {
    'Bus_name': Bus_names_K,
    'Bus_type': Bus_types_K,
    'Start_time': Start_Time_K,
    'End_time': End_Time_K,
    'Total_duration': Total_Duration_K,
    'Price': Prices_K,
    "Seats_Available":Seats_Available_K,
    "Ratings":Ratings_K,
    'Route_link': Route_links,
    'Route_name': Route_names
}

# convert dictionary to dataframe
df_buses_2 = pd.DataFrame(data_2)

# convert dataframe to csv and read
df_buses_2.to_csv("df_buses_2.csv",index=False)
df_buses_2 = pd.read_csv("df_buses_2.csv")
df_buses_2

Unnamed: 0,Bus_name,Bus_type,Start_time,End_time,Total_duration,Price,Seats_Available,Ratings,Route_link,Route_name
0,KSRTC (Kerala) - 51,Swift Deluxe Non AC Air Bus (2+2),19:04,04:24,09h 20m,INR 640,10 Seats available,4.2\n44,https://www.redbus.in/bus-tickets/bangalore-to...,Bangalore to Kozhikode
1,KSRTC (Kerala) - 789,Super Fast Non AC Seater (2+3),20:00,04:25,08h 25m,INR 473,2 Seats available,3.4\n70,https://www.redbus.in/bus-tickets/bangalore-to...,Bangalore to Kozhikode
2,KSRTC (Kerala) - 1585,Super Express Non AC Seater Air Bus (2+2),20:31,05:36,09h 05m,INR 640,11 Seats available,3.9\n53,https://www.redbus.in/bus-tickets/bangalore-to...,Bangalore to Kozhikode
3,KSRTC (Kerala) - 1586,Super Express Non AC Seater Air Bus (2+2),21:31,06:36,09h 05m,INR 640,6 Seats available,3.6\n33,https://www.redbus.in/bus-tickets/bangalore-to...,Bangalore to Kozhikode
4,KSRTC (Kerala) - 2133,Swift Deluxe Non AC Air Bus (2+2),22:12,07:36,09h 24m,INR 640,5 Seats available,4.1\n18,https://www.redbus.in/bus-tickets/bangalore-to...,Bangalore to Kozhikode
...,...,...,...,...,...,...,...,...,...,...
313,EMERALD TRAVELS,Bharat Benz A/C Semi Sleeper (2+2),23:00,04:30,05h 30m,INR 1300,14 Seats available,3.2\n11,https://www.redbus.in/bus-tickets/kottayam-to-...,Kottayam to Kozhikode
314,A1 Travels,A/C Seater / Sleeper (2+1),23:59,07:00,07h 01m,INR 800,1 Seat available,2.2\n17,https://www.redbus.in/bus-tickets/kottayam-to-...,Kottayam to Kozhikode
315,Madhavi Travels,Volvo Multi-Axle I-Shift B11R Semi Sleeper (2+2),23:59,04:30,04h 31m,INR 1499,23 Seats available,3.9,https://www.redbus.in/bus-tickets/kottayam-to-...,Kottayam to Kozhikode
316,Holy Maria Travels,A/C Semi Sleeper (2+2),18:20,02:10,07h 50m,INR 900,15 Seats available,3.6,https://www.redbus.in/bus-tickets/kottayam-to-...,Kottayam to Kozhikode


### 3. GOA

#### ROUTE_NAMES AND ROUTE_LINKS

In [23]:
# opens browser
driver_G = webdriver.Chrome()
time.sleep(2)

# loads webpage
driver_G.get("https://www.redbus.in/online-booking/ktcl/?utm_source=rtchometile")
driver_G.maximize_window()

# waits for the webpage to load it's contents
wait = WebDriverWait(driver_G, 10)

def Goa_routes_and_links(path):
    Goa_routes = []
    Goa_links = []

    # route links
    for i in range(1, 5):
        paths = driver_G.find_elements(By.XPATH, path)

        for link in paths:
            l = link.get_attribute("href")
            Goa_links.append(l)

         # route names
        for route in paths:
            Goa_routes.append(route.text)

        try:
            # Waits for the required element to be present
            pagination_container = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@class="DC_117_paginationTable"]')))
            # Finds the element
            next_pg_btn = pagination_container.find_element(By.XPATH, f'//div[@class="DC_117_pageTabs " and text()={i+1}]')
            
            actions = ActionChains(driver_G)
            actions.move_to_element(next_pg_btn).perform()
            time.sleep(2) 

            # Clicks the element
            next_pg_btn.click()
            
        except NoSuchElementException:
            print('No more pages to paginate')
            break

    return Goa_routes, Goa_links

Goa_routes, Goa_links = Goa_routes_and_links("//a[@class='route']")

No more pages to paginate


In [24]:
# Convert lists into df
df_G = pd.DataFrame({"Route_name":Goa_routes, "Route_link":Goa_links})

# Convert df to csv file 
df_G.to_csv('df_G.csv', index = False)

# Read the csv file
df_3 = pd.read_csv('df_G.csv')
df_3

Unnamed: 0,Route_name,Route_link
0,Pune to Goa,https://www.redbus.in/bus-tickets/pune-to-goa
1,Goa to Pune,https://www.redbus.in/bus-tickets/goa-to-pune
2,Mumbai to Goa,https://www.redbus.in/bus-tickets/mumbai-to-goa
3,Bangalore to Goa,https://www.redbus.in/bus-tickets/bangalore-to...
4,Goa to Bangalore,https://www.redbus.in/bus-tickets/goa-to-banga...
5,Goa to Mumbai,https://www.redbus.in/bus-tickets/goa-to-mumbai
6,Pandharpur to Goa,https://www.redbus.in/bus-tickets/pandharpur-t...
7,Goa to Pandharpur,https://www.redbus.in/bus-tickets/goa-to-pandh...
8,Solapur to Goa,https://www.redbus.in/bus-tickets/solapur-to-goa
9,Calangute (goa) to Goa Airport,https://www.redbus.in/bus-tickets/calangute-go...


#### BUS_DETAILS

In [25]:
# retrive the bus details
driver_G = webdriver.Chrome()
Bus_names_G = []
Bus_types_G = []
Start_Time_G = []
End_Time_G = []
Ratings_G = []
Total_Duration_G = []
Prices_G = []
Seats_Available_G = []
Route_names = []
Route_links = []

for i,r in df_3.iterrows():
    link=r["Route_link"]
    routes=r["Route_name"]

    # Loop through each link
    driver_G.get(link)
    driver_G.maximize_window()
    time.sleep(2)  

    # Click on elements to reveal bus details
    elements = driver_G.find_elements(By.XPATH, f"//a[contains(@href, '{link}')]")
    for element in elements:
        element.click()
        time.sleep(2)

    try:
        clicks = driver_G.find_element(By.XPATH, "//div[@class='button']")
        clicks = WebDriverWait(driver_G, 5).until(EC.element_to_be_clickable((By.XPATH, "//div[@class='button']")))
        clicks.click()
        time.sleep(2)

        scrolling = True
        while scrolling:
            old_page_source = driver_G.page_source

            # Use ActionChains to go to the end of the page
            ActionChains(driver_G).send_keys(Keys.END).perform()

            time.sleep(5)  

            new_page_source = driver_G.page_source

            if new_page_source == old_page_source:
                scrolling = False

    except NoSuchElementException:

        scrolling = True
        while scrolling:
            old_page_source = driver_G.page_source

            # Use ActionChains to go to the end of the page
            ActionChains(driver_G).send_keys(Keys.END).perform()

            time.sleep(5)  

            new_page_source = driver_G.page_source

            if new_page_source == old_page_source:
                scrolling = False       

    # Extract bus details
    bus_name = driver_G.find_elements(By.XPATH, "//div[@class='travels lh-24 f-bold d-color']")
    bus_type = driver_G.find_elements(By.XPATH, "//div[@class='bus-type f-12 m-top-16 l-color evBus']")
    start_time = driver_G.find_elements(By.XPATH, "//*[@class='dp-time f-19 d-color f-bold']")
    end_time = driver_G.find_elements(By.XPATH, "//*[@class='bp-time f-19 d-color disp-Inline']")
    total_duration = driver_G.find_elements(By.XPATH, "//*[@class='dur l-color lh-24']")
    rating = driver_G.find_elements(By.XPATH,"//div[@class='clearfix row-one']/div[@class='column-six p-right-10 w-10 fl']")
    price = driver_G.find_elements(By.XPATH, '//*[@class="fare d-block"]')
    seats = driver_G.find_elements(By.XPATH, "//div[contains(@class, 'seat-left')]")

    # Append data to respective lists
    for bus in bus_name:
        Bus_names_G.append(bus.text)
        Route_links.append(link)
        Route_names.append(routes)

    for bus_type_elem in bus_type:
        Bus_types_G.append(bus_type_elem.text)

    for start_time_elem in start_time:
        Start_Time_G.append(start_time_elem.text)

    for end_time_elem in end_time:
        End_Time_G.append(end_time_elem.text)

    for total_duration_elem in total_duration:
        Total_Duration_G.append(total_duration_elem.text)

    for ratings in rating:
        Ratings_G.append(ratings.text)

    for price_elem in price:
        Prices_G.append(price_elem.text)

    for seats_elem in seats:
        Seats_Available_G.append(seats_elem.text)

print("Successfully Completed")

Successfully Completed


In [26]:
# convert lists to a dictionary
data_3 = {
    'Bus_name': Bus_names_G,
    'Bus_type': Bus_types_G,
    'Start_time': Start_Time_G,
    'End_time': End_Time_G,
    'Total_duration': Total_Duration_G,
    'Price': Prices_G,
    "Seats_Available":Seats_Available_G,
    "Ratings":Ratings_G,
    'Route_link': Route_links,
    'Route_name': Route_names
}

# convert dictionary to dataframe
df_buses_3 = pd.DataFrame(data_3)

# convert dataframe to csv and read
df_buses_3.to_csv("df_buses_3.csv",index=False)
df_buses_3 = pd.read_csv("df_buses_3.csv")
df_buses_3

Unnamed: 0,Bus_name,Bus_type,Start_time,End_time,Total_duration,Price,Seats_Available,Ratings,Route_link,Route_name
0,Kadamba Transport Corporation Limited (KTCL) -...,A/C Sleeper (2+1),19:15,05:50,10h 35m,INR 1000,5 Seats available,3.7\n336,https://www.redbus.in/bus-tickets/pune-to-goa,Pune to Goa
1,Ashray Travels,Non A/C Seater / Sleeper (2+1),21:15,07:45,10h 30m,INR 349,26 Seats available,4.2\n340,https://www.redbus.in/bus-tickets/pune-to-goa,Pune to Goa
2,IntrCity SmartBus,Bharat Benz A/C Seater /Sleeper (2+1),21:00,08:05,11h 05m,INR 399,18 Seats available,4.3\n404,https://www.redbus.in/bus-tickets/pune-to-goa,Pune to Goa
3,AdIntrCity SmartBus,Bharat Benz A/C Sleeper (2+1),21:30,08:45,11h 15m,INR 599,6 Seats available,4.2\n433,https://www.redbus.in/bus-tickets/pune-to-goa,Pune to Goa
4,Gogte Anand Travels,Volvo Multi-Axle I-Shift A/C Sleeper (2+1),21:15,07:55,10h 40m,INR 699,23 Seats available,4.0\n106,https://www.redbus.in/bus-tickets/pune-to-goa,Pune to Goa
...,...,...,...,...,...,...,...,...,...,...
426,Kadamba Transport Corporation Limited (KTCL) -...,A/C Seater (2+3),17:30,18:00,00h 30m,INR 150,47 Seats available,3.7,https://www.redbus.in/bus-tickets/mopa-airport...,Mopa Airport to Margao
427,Kadamba Transport Corporation Limited (KTCL) -...,A/C Seater (2+1),20:00,20:30,00h 30m,INR 150,15 Seats available,3.7,https://www.redbus.in/bus-tickets/mopa-airport...,Mopa Airport to Margao
428,Kadamba Transport Corporation Limited (KTCL) -...,A/C Seater (2+3),21:00,23:00,02h 00m,INR 400,47 Seats available,4.9,https://www.redbus.in/bus-tickets/mopa-airport...,Mopa Airport to Margao
429,Kadamba Transport Corporation Limited (KTCL) -...,A/C Seater (2+3),21:00,21:55,00h 55m,INR 200,47 Seats available,3.7,https://www.redbus.in/bus-tickets/mopa-airport...,Mopa Airport to Margao


### 4. RAJASTHAN

#### ROUTE_NAMES AND ROUTE_LINKS

In [3]:
# opens browser
driver_R = webdriver.Chrome()
time.sleep(2)

# loads webpage
driver_R.get("https://www.redbus.in/online-booking/rsrtc/?utm_source=rtchometile")
driver_R.maximize_window()

# waits for the webpage to load it's contents
wait = WebDriverWait(driver_R, 10)

def Rajasthan_routes_and_links(path):
    Rajasthan_routes = []
    Rajasthan_links = []

    # route links
    for i in range(1, 3):
        paths = driver_R.find_elements(By.XPATH, path)

        for link in paths:
            l = link.get_attribute("href")
            Rajasthan_links.append(l)

         # route names
        for route in paths:
            Rajasthan_routes.append(route.text)

        try:
            # Waits for the required element to be present
            pagination_container = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@class="DC_117_paginationTable"]')))
            # Finds the element
            next_pg_btn = pagination_container.find_element(By.XPATH, f'//div[@class="DC_117_pageTabs " and text()={i+1}]')
            
            actions = ActionChains(driver_R)
            actions.move_to_element(next_pg_btn).perform()
            time.sleep(2)

            # Clicks the element
            next_pg_btn.click()
            
        except NoSuchElementException:
            print('No more pages to paginate')
            break

    return Rajasthan_routes, Rajasthan_links

Rajasthan_routes, Rajasthan_links = Rajasthan_routes_and_links("//a[@class='route']") 

No more pages to paginate


In [4]:
# Convert lists into df
df_R = pd.DataFrame({"Route_name":Rajasthan_routes, "Route_link":Rajasthan_links})

# Convert df to csv file 
df_R.to_csv('df_R.csv', index = False)

# Read the csv file
df_4 = pd.read_csv('df_R.csv')
df_4

Unnamed: 0,Route_name,Route_link
0,Udaipur to Jodhpur,https://www.redbus.in/bus-tickets/udaipur-to-j...
1,Jodhpur to Ajmer,https://www.redbus.in/bus-tickets/jodhpur-to-a...
2,Beawar (Rajasthan) to Jaipur (Rajasthan),https://www.redbus.in/bus-tickets/beawer-to-ja...
3,Sikar to Jaipur (Rajasthan),https://www.redbus.in/bus-tickets/sikar-to-jaipur
4,Jaipur (Rajasthan) to Jodhpur,https://www.redbus.in/bus-tickets/jaipur-to-jo...
5,Aligarh (uttar pradesh) to Jaipur (Rajasthan),https://www.redbus.in/bus-tickets/aligarh-utta...
6,Jaipur (Rajasthan) to Aligarh (uttar pradesh),https://www.redbus.in/bus-tickets/jaipur-to-al...
7,Jodhpur to Beawar (Rajasthan),https://www.redbus.in/bus-tickets/jodhpur-to-b...
8,Jaipur (Rajasthan) to Pilani,https://www.redbus.in/bus-tickets/jaipur-to-pi...
9,Kishangarh to Jaipur (Rajasthan),https://www.redbus.in/bus-tickets/kishangarh-t...


#### BUS_DETAILS

In [6]:
# retrive the bus details
driver_R = webdriver.Chrome()
Bus_names_R = []
Bus_types_R = []
Start_Time_R = []
End_Time_R = []
Ratings_R = []
Total_Duration_R = []
Prices_R = []
Seats_Available_R = []
Route_names = []
Route_links = []

for i,r in df_4.iterrows():
    link=r["Route_link"]
    routes=r["Route_name"]

    # Loop through each link
    driver_R.get(link)
    driver_R.maximize_window()
    time.sleep(2)  

    # Click on elements to reveal bus details
    elements = driver_R.find_elements(By.XPATH, f"//a[contains(@href, '{link}')]")
    for element in elements:
        element.click()
        time.sleep(2)

    try:   
        clicks = driver_R.find_element(By.XPATH, "//div[@class='button']")
        clicks = WebDriverWait(driver_R, 5).until(EC.element_to_be_clickable((By.XPATH, "//div[@class='button']")))
        clicks.click()
        time.sleep(2)

        scrolling = True
        while scrolling:
            old_page_source = driver_R.page_source

            # Use ActionChains to go to the end of the page
            ActionChains(driver_R).send_keys(Keys.END).perform()

            time.sleep(5)  

            new_page_source = driver_R.page_source

            if new_page_source == old_page_source:
                scrolling = False

    except NoSuchElementException:

        scrolling = True
        while scrolling:
            old_page_source = driver_R.page_source

            # Use ActionChains to go to the end of the page
            ActionChains(driver_R).send_keys(Keys.END).perform()

            time.sleep(5)  

            new_page_source = driver_R.page_source

            if new_page_source == old_page_source:
                scrolling = False            

    # Extract bus details
    bus_name = driver_R.find_elements(By.XPATH, "//div[@class='travels lh-24 f-bold d-color']")
    bus_type = driver_R.find_elements(By.XPATH, "//div[@class='bus-type f-12 m-top-16 l-color evBus']")
    start_time = driver_R.find_elements(By.XPATH, "//*[@class='dp-time f-19 d-color f-bold']")
    end_time = driver_R.find_elements(By.XPATH, "//*[@class='bp-time f-19 d-color disp-Inline']")
    total_duration = driver_R.find_elements(By.XPATH, "//*[@class='dur l-color lh-24']")
    rating = driver_R.find_elements(By.XPATH,"//div[@class='clearfix row-one']/div[@class='column-six p-right-10 w-10 fl']")
    price = driver_R.find_elements(By.XPATH, '//*[@class="fare d-block"]')
    seats = driver_R.find_elements(By.XPATH, "//div[contains(@class, 'seat-left')]")

    # Append data to respective lists
    for bus in bus_name:
        Bus_names_R.append(bus.text)
        Route_links.append(link)
        Route_names.append(routes)

    for bus_type_elem in bus_type:
        Bus_types_R.append(bus_type_elem.text)

    for start_time_elem in start_time:
        Start_Time_R.append(start_time_elem.text)

    for end_time_elem in end_time:
        End_Time_R.append(end_time_elem.text)

    for total_duration_elem in total_duration:
        Total_Duration_R.append(total_duration_elem.text)

    for ratings in rating:
        Ratings_R.append(ratings.text)

    for price_elem in price:
        Prices_R.append(price_elem.text)

    for seats_elem in seats:
        Seats_Available_R.append(seats_elem.text)

print("Successfully Completed")

Successfully Completed


In [7]:
# convert lists to a dictionary
data_4 = {
    'Bus_name': Bus_names_R,
    'Bus_type': Bus_types_R,
    'Start_time': Start_Time_R,
    'End_time': End_Time_R,
    'Total_duration': Total_Duration_R,
    'Price': Prices_R,
    "Seats_Available":Seats_Available_R,
    "Ratings":Ratings_R,
    'Route_link': Route_links,
    'Route_name': Route_names
}

# convert dictionary to dataframe
df_buses_4 = pd.DataFrame(data_4)

# convert dataframe to csv and read
df_buses_4.to_csv("df_buses_4.csv",index=False)
df_buses_4 = pd.read_csv("df_buses_4.csv")
df_buses_4

Unnamed: 0,Bus_name,Bus_type,Start_time,End_time,Total_duration,Price,Seats_Available,Ratings,Route_link,Route_name
0,Jain travels regd,A/C Seater / Sleeper (2+1),22:15,04:30,06h 15m,INR 460,21 Seats available,4.1\n251,https://www.redbus.in/bus-tickets/udaipur-to-j...,Udaipur to Jodhpur
1,Raj Travel,A/C Sleeper (2+1),22:30,05:30,07h 00m,INR 560,12 Seats available,4.6\n70,https://www.redbus.in/bus-tickets/udaipur-to-j...,Udaipur to Jodhpur
2,Jakhar Travels,A/C Seater / Sleeper (2+1),22:00,04:30,06h 30m,INR 400,17 Seats available,3.9\n259,https://www.redbus.in/bus-tickets/udaipur-to-j...,Udaipur to Jodhpur
3,Jain travels regd,NON AC Seater / Sleeper 2+1,22:00,04:00,06h 00m,INR 350,25 Seats available,3.6\n83,https://www.redbus.in/bus-tickets/udaipur-to-j...,Udaipur to Jodhpur
4,Kalpana Travels,NON AC Seater / Sleeper 2+1,22:00,04:54,06h 54m,INR 300,19 Seats available,3.6\n101,https://www.redbus.in/bus-tickets/udaipur-to-j...,Udaipur to Jodhpur
...,...,...,...,...,...,...,...,...,...,...
519,Geetanjali Travels,NON AC Seater / Sleeper 2+1,23:00,04:30,05h 30m,INR 300,40 Seats available,2.2,https://www.redbus.in/bus-tickets/jaipur-to-ko...,Jaipur (Rajasthan) to Kota(Rajasthan)
520,A One Travel Agency,Non A/C Seater / Sleeper (2+1),21:00,02:30,05h 30m,INR 350,27 Seats available,1.6,https://www.redbus.in/bus-tickets/jaipur-to-ko...,Jaipur (Rajasthan) to Kota(Rajasthan)
521,Ganesh Travels,A/C Sleeper (2+1),20:40,01:30,04h 50m,INR 1049,2 Seats available,2.9,https://www.redbus.in/bus-tickets/jaipur-to-ko...,Jaipur (Rajasthan) to Kota(Rajasthan)
522,Intercity Travels Indore,A/C Sleeper (2+1),20:45,00:30,03h 45m,INR 990,2 Seats available,2.0\n7,https://www.redbus.in/bus-tickets/jaipur-to-ko...,Jaipur (Rajasthan) to Kota(Rajasthan)


### 5. SOUTH BENGAL

#### ROUTE_NAMES AND ROUTE_LINKS

In [19]:
# opens browser
driver_SB = webdriver.Chrome()
time.sleep(2)

# loads webpage
driver_SB.get("https://www.redbus.in/online-booking/south-bengal-state-transport-corporation-sbstc/?utm_source=rtchometile")
driver_SB.maximize_window()

# waits for the webpage to load it's contents
wait = WebDriverWait(driver_SB, 10)

def South_Bengal_routes_and_links(path):
    South_Bengal_routes = []
    South_Bengal_links = []

    # route links
    for i in range(1, 6):
        paths = driver_SB.find_elements(By.XPATH, path)

        for link in paths:
            l = link.get_attribute("href")
            South_Bengal_links.append(l)

         # route names
        for route in paths:
            South_Bengal_routes.append(route.text)

        try:
            # Waits for the required element to be present
            pagination_container = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@class="DC_117_paginationTable"]')))
            # Finds the element
            next_pg_btn = pagination_container.find_element(By.XPATH, f'//div[@class="DC_117_pageTabs " and text()={i+1}]')
            
            actions = ActionChains(driver_SB)
            actions.move_to_element(next_pg_btn).perform()
            time.sleep(2)

            # Clicks the element
            next_pg_btn.click()
            
        except NoSuchElementException:
            print('No more pages to paginate')
            break

    return South_Bengal_routes, South_Bengal_links

South_Bengal_routes, South_Bengal_links = South_Bengal_routes_and_links("//a[@class='route']") 

No more pages to paginate


In [20]:
# Convert lists into df
df_SB = pd.DataFrame({"Route_name":South_Bengal_routes, "Route_link":South_Bengal_links})

# Convert df to csv file 
df_SB.to_csv('df_SB.csv', index = False)

# Read the csv file
df_5 = pd.read_csv('df_SB.csv')
df_5

Unnamed: 0,Route_name,Route_link
0,Durgapur to Calcutta,https://www.redbus.in/bus-tickets/durgapur-to-...
1,Kolkata to Burdwan,https://www.redbus.in/bus-tickets/kolkata-to-b...
2,Haldia to Calcutta,https://www.redbus.in/bus-tickets/haldia-to-ko...
3,Kolkata to Haldia,https://www.redbus.in/bus-tickets/kolkata-to-h...
4,Kolkata to Durgapur (West Bengal),https://www.redbus.in/bus-tickets/kolkata-to-d...
5,Kolkata to Arambagh (West Bengal),https://www.redbus.in/bus-tickets/kolkata-to-a...
6,Midnapore to Kolkata,https://www.redbus.in/bus-tickets/midnapore-to...
7,Kolkata to Digha,https://www.redbus.in/bus-tickets/kolkata-to-d...
8,Digha to Calcutta,https://www.redbus.in/bus-tickets/digha-to-kol...
9,Kolkata to Bankura,https://www.redbus.in/bus-tickets/kolkata-to-b...


#### BUS_DETAILS

In [23]:
# retrive the bus details
driver_SB = webdriver.Chrome()
Bus_names_SB = []
Bus_types_SB = []
Start_Time_SB = []
End_Time_SB = []
Ratings_SB = []
Total_Duration_SB = []
Prices_SB = []
Seats_Available_SB = []
Route_names = []
Route_links = []

for i,r in df_5.iterrows():
    link=r["Route_link"]
    routes=r["Route_name"]

    # Loop through each link
    driver_SB.get(link)
    driver_SB.maximize_window()
    time.sleep(2)  

    # Click on elements to reveal bus details
    elements = driver_SB.find_elements(By.XPATH, f"//a[contains(@href, '{link}')]")
    for element in elements:
        element.click()
        time.sleep(2)

    try:
        clicks = driver_SB.find_element(By.XPATH, "//div[@class='button']")
        clicks = WebDriverWait(driver_SB, 5).until(EC.element_to_be_clickable((By.XPATH, "//div[@class='button']")))
        clicks.click()
        time.sleep(2)

        scrolling = True
        while scrolling:
            old_page_source = driver_SB.page_source

            # Use ActionChains to go to the end of the page
            ActionChains(driver_SB).send_keys(Keys.END).perform()

            time.sleep(5)  

            new_page_source = driver_SB.page_source

            if new_page_source == old_page_source:
                scrolling = False

    except NoSuchElementException:
        scrolling = True
        while scrolling:
            old_page_source = driver_SB.page_source

            # Use ActionChains to go to the end of the page
            ActionChains(driver_SB).send_keys(Keys.END).perform()

            time.sleep(5)  

            new_page_source = driver_SB.page_source

            if new_page_source == old_page_source:
                scrolling = False            

    # Extract bus details
    bus_name = driver_SB.find_elements(By.XPATH, "//div[@class='travels lh-24 f-bold d-color']")
    bus_type = driver_SB.find_elements(By.XPATH, "//div[@class='bus-type f-12 m-top-16 l-color evBus']")
    start_time = driver_SB.find_elements(By.XPATH, "//*[@class='dp-time f-19 d-color f-bold']")
    end_time = driver_SB.find_elements(By.XPATH, "//*[@class='bp-time f-19 d-color disp-Inline']")
    total_duration = driver_SB.find_elements(By.XPATH, "//*[@class='dur l-color lh-24']")
    rating = driver_SB.find_elements(By.XPATH,"//div[@class='clearfix row-one']/div[@class='column-six p-right-10 w-10 fl']")
    price = driver_SB.find_elements(By.XPATH, '//*[@class="fare d-block"]')
    seats = driver_SB.find_elements(By.XPATH, "//div[contains(@class, 'seat-left')]")

    # Append data to respective lists
    for bus in bus_name:
        Bus_names_SB.append(bus.text)
        Route_links.append(link)
        Route_names.append(routes)

    for bus_type_elem in bus_type:
        Bus_types_SB.append(bus_type_elem.text)

    for start_time_elem in start_time:
        Start_Time_SB.append(start_time_elem.text)

    for end_time_elem in end_time:
        End_Time_SB.append(end_time_elem.text)

    for total_duration_elem in total_duration:
        Total_Duration_SB.append(total_duration_elem.text)

    for ratings in rating:
        Ratings_SB.append(ratings.text)

    for price_elem in price:
        Prices_SB.append(price_elem.text)

    for seats_elem in seats:
        Seats_Available_SB.append(seats_elem.text)

print("Successfully Completed")

Successfully Completed


In [24]:
# convert lists to a dictionary
data_5 = {
    'Bus_name': Bus_names_SB,
    'Bus_type': Bus_types_SB,
    'Start_time': Start_Time_SB,
    'End_time': End_Time_SB,
    'Total_duration': Total_Duration_SB,
    'Price': Prices_SB,
    "Seats_Available":Seats_Available_SB,
    "Ratings":Ratings_SB,
    'Route_link': Route_links,
    'Route_name': Route_names
}

# convert dictionary to dataframe
df_buses_5 = pd.DataFrame(data_5)

# convert dataframe to csv and read
df_buses_5.to_csv("df_buses_5.csv",index=False)
df_buses_5 = pd.read_csv("df_buses_5.csv")
df_buses_5

Unnamed: 0,Bus_name,Bus_type,Start_time,End_time,Total_duration,Price,Seats_Available,Ratings,Route_link,Route_name
0,SBSTC-DURGAPUR - KOLKATA - 05:00 (DGP DEPOT) -...,Non AC Seater (2+3),05:00,08:30,03h 30m,INR 150,4 Seats available,4.2\n155,https://www.redbus.in/bus-tickets/durgapur-to-...,Durgapur to Calcutta
1,SBSTC-DURGAPUR - KOLKATA - 05:30 (DGP DEPOT) -...,Non AC Seater (2+3),05:30,09:00,03h 30m,INR 150,24 Seats available,3.6\n104,https://www.redbus.in/bus-tickets/durgapur-to-...,Durgapur to Calcutta
2,SBSTC-DURGAPUR - KOLKATA - L/S - 06:00 (DGP DE...,Non AC Seater (2+3),06:00,09:30,03h 30m,INR 150,30 Seats available,3.0\n98,https://www.redbus.in/bus-tickets/durgapur-to-...,Durgapur to Calcutta
3,SBSTC-DURGAPUR - KOLKATA - 06:30 (DGP DEPOT) -...,Non AC Seater (2+3),06:30,10:00,03h 30m,INR 150,34 Seats available,3.9\n78,https://www.redbus.in/bus-tickets/durgapur-to-...,Durgapur to Calcutta
4,SBSTC-ASANSOL - KOLKATA - 05:35 (ASANSOL DEPOT...,Non AC Seater (2+3),06:45,10:15,03h 30m,INR 150,46 Seats available,3.7\n36,https://www.redbus.in/bus-tickets/durgapur-to-...,Durgapur to Calcutta
...,...,...,...,...,...,...,...,...,...,...
1660,Snemita Parisheba,AC Seater (2+3),05:50,10:20,04h 30m,304,20 Seats available,1.0,https://www.redbus.in/bus-tickets/barasat-west...,Barasat (West Bengal) to Digha
1661,SANTOSH BUS SERVICE,A/C Seater (2+3),05:55,10:20,04h 25m,INR 320,27 Seats available,New,https://www.redbus.in/bus-tickets/barasat-west...,Barasat (West Bengal) to Digha
1662,Satya Paribahan,A/C Executive (2+3),06:00,10:00,04h 00m,INR 320,61 Seats available,1.7\n4,https://www.redbus.in/bus-tickets/barasat-west...,Barasat (West Bengal) to Digha
1663,Aradhana Travels,A/C Seater / Sleeper (2+2),02:30,08:00,05h 30m,555,72 Seats available,3.4,https://www.redbus.in/bus-tickets/barasat-west...,Barasat (West Bengal) to Digha


### 6. HIMACHAL

#### ROUTE_NAMES AND ROUTE_LINKS

In [26]:
# opens browser
driver_H = webdriver.Chrome()
time.sleep(2)

# loads webpage
driver_H.get("https://www.redbus.in/online-booking/hrtc/?utm_source=rtchometile")
driver_H.maximize_window()

# waits for the webpage to load it's contents
wait = WebDriverWait(driver_H, 10)

def Himachal_routes_and_links(path):
    Himachal_routes = []
    Himachal_links = []

    # route links
    for i in range(1, 5):
        paths = driver_H.find_elements(By.XPATH, path)

        for link in paths:
            l = link.get_attribute("href")
            Himachal_links.append(l)

         # route names
        for route in paths:
            Himachal_routes.append(route.text)

        try:
            # Waits for the required element to be present
            pagination_container = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@class="DC_117_paginationTable"]')))
            # Finds the element
            next_pg_btn = pagination_container.find_element(By.XPATH, f'//div[@class="DC_117_pageTabs " and text()={i+1}]')
            
            actions = ActionChains(driver_H)
            actions.move_to_element(next_pg_btn).perform()
            time.sleep(2)

            # Clicks the element
            next_pg_btn.click()
            
        except NoSuchElementException:
            print('No more pages to paginate')
            break

    return Himachal_routes, Himachal_links

Himachal_routes, Himachal_links = Himachal_routes_and_links("//a[@class='route']") 

No more pages to paginate


In [27]:
# Convert lists into df
df_H = pd.DataFrame({"Route_name":Himachal_routes, "Route_link":Himachal_links})

# Convert df to csv file 
df_H.to_csv('df_H.csv', index = False)

# Read the csv file
df_6 = pd.read_csv('df_H.csv')
df_6

Unnamed: 0,Route_name,Route_link
0,Delhi to Shimla,https://www.redbus.in/bus-tickets/delhi-to-shimla
1,Chandigarh to Hamirpur (Himachal Pradesh),https://www.redbus.in/bus-tickets/chandigarh-t...
2,Hamirpur (Himachal Pradesh) to Chandigarh,https://www.redbus.in/bus-tickets/hamirpur-him...
3,Shimla to Delhi,https://www.redbus.in/bus-tickets/shimla-to-delhi
4,Delhi to Chandigarh,https://www.redbus.in/bus-tickets/delhi-to-cha...
5,Hamirpur (Himachal Pradesh) to Delhi,https://www.redbus.in/bus-tickets/hamirpur-him...
6,Chamba (Himachal Pradesh) to Chandigarh,https://www.redbus.in/bus-tickets/chamba-himac...
7,Delhi to Hamirpur (Himachal Pradesh),https://www.redbus.in/bus-tickets/delhi-to-ham...
8,Chandigarh to Dharamshala (Himachal Pradesh),https://www.redbus.in/bus-tickets/chandigarh-t...
9,Delhi to Chamba (Himachal Pradesh),https://www.redbus.in/bus-tickets/delhi-to-cha...


#### BUS_DETAILS

In [29]:
# retrive the bus details
driver_H = webdriver.Chrome()
Bus_names_H = []
Bus_types_H = []
Start_Time_H = []
End_Time_H = []
Ratings_H = []
Total_Duration_H = []
Prices_H = []
Seats_Available_H = []
Route_names = []
Route_links = []

for i,r in df_6.iterrows():
    link=r["Route_link"]
    routes=r["Route_name"]

    # Loop through each link
    driver_H.get(link)
    driver_H.maximize_window()
    time.sleep(2)  

    # Click on elements to reveal bus details
    elements = driver_H.find_elements(By.XPATH, f"//a[contains(@href, '{link}')]")
    for element in elements:
        element.click()
        time.sleep(2)

    try:   
        clicks = driver_H.find_element(By.XPATH, "//div[@class='button']")
        clicks = WebDriverWait(driver_H, 5).until(EC.element_to_be_clickable((By.XPATH, "//div[@class='button']")))
        clicks.click()      
        time.sleep(2)

        scrolling = True
        while scrolling:
            old_page_source = driver_H.page_source

            # Use ActionChains to go to the end of the page
            ActionChains(driver_H).send_keys(Keys.END).perform()

            time.sleep(5)  

            new_page_source = driver_H.page_source

            if new_page_source == old_page_source:
                scrolling = False

    except NoSuchElementException:
        scrolling = True
        while scrolling:
            old_page_source = driver_H.page_source

            # Use ActionChains to go to the end of the page
            ActionChains(driver_H).send_keys(Keys.END).perform()

            time.sleep(5)  

            new_page_source = driver_H.page_source

            if new_page_source == old_page_source:
                scrolling = False            

    # Extract bus details
    bus_name = driver_H.find_elements(By.XPATH, "//div[@class='travels lh-24 f-bold d-color']")
    bus_type = driver_H.find_elements(By.XPATH, "//div[@class='bus-type f-12 m-top-16 l-color evBus']")
    start_time = driver_H.find_elements(By.XPATH, "//*[@class='dp-time f-19 d-color f-bold']")
    end_time = driver_H.find_elements(By.XPATH, "//*[@class='bp-time f-19 d-color disp-Inline']")
    total_duration = driver_H.find_elements(By.XPATH, "//*[@class='dur l-color lh-24']")
    rating = driver_H.find_elements(By.XPATH,"//div[@class='clearfix row-one']/div[@class='column-six p-right-10 w-10 fl']")   
    price = driver_H.find_elements(By.XPATH, '//*[@class="fare d-block"]')
    seats = driver_H.find_elements(By.XPATH, "//div[contains(@class, 'seat-left')]")

    # Append data to respective lists
    for bus in bus_name:
        Bus_names_H.append(bus.text)
        Route_links.append(link)
        Route_names.append(routes)

    for bus_type_elem in bus_type:
        Bus_types_H.append(bus_type_elem.text)

    for start_time_elem in start_time:
        Start_Time_H.append(start_time_elem.text)

    for end_time_elem in end_time:
        End_Time_H.append(end_time_elem.text)

    for total_duration_elem in total_duration:
        Total_Duration_H.append(total_duration_elem.text)

    for ratings in rating:
        Ratings_H.append(ratings.text)

    for price_elem in price:
        Prices_H.append(price_elem.text)

    for seats_elem in seats:
        Seats_Available_H.append(seats_elem.text)

print("Successfully Completed")

Successfully Completed


In [30]:
# convert lists to a dictionary
data_6 = {
    'Bus_name': Bus_names_H,
    'Bus_type': Bus_types_H,
    'Start_time': Start_Time_H,
    'End_time': End_Time_H,
    'Total_duration': Total_Duration_H,
    'Price': Prices_H,
    "Seats_Available":Seats_Available_H,
    "Ratings":Ratings_H,
    'Route_link': Route_links,
    'Route_name': Route_names
}

# convert dictionary to dataframe
df_buses_6 = pd.DataFrame(data_6)

# convert dataframe to csv and read
df_buses_6.to_csv("df_buses_6.csv",index=False)
df_buses_6 = pd.read_csv("df_buses_6.csv")
df_buses_6

Unnamed: 0,Bus_name,Bus_type,Start_time,End_time,Total_duration,Price,Seats_Available,Ratings,Route_link,Route_name
0,HRTC - 69,Himsuta AC Seater Volvo/Scania 2+2,00:40,09:30,08h 50m,INR 912,26 Seats available,4.7\n60,https://www.redbus.in/bus-tickets/delhi-to-shimla,Delhi to Shimla
1,HRTC - 6,Himsuta AC Seater Volvo/Scania 2+2,06:45,16:10,09h 25m,INR 912,30 Seats available,4.4\n58,https://www.redbus.in/bus-tickets/delhi-to-shimla,Delhi to Shimla
2,HRTC - 592,A/C Executive (2+3),08:05,18:10,10h 05m,INR 632,42 Seats available,2.7\n13,https://www.redbus.in/bus-tickets/delhi-to-shimla,Delhi to Shimla
3,HRTC - 129,Ordinary 3+2 Non AC Seater,08:50,18:40,09h 50m,INR 512,37 Seats available,2.6\n4,https://www.redbus.in/bus-tickets/delhi-to-shimla,Delhi to Shimla
4,HRTC - 7,Himsuta AC Seater Volvo/Scania 2+2,09:25,18:30,09h 05m,INR 912,28 Seats available,4.6\n80,https://www.redbus.in/bus-tickets/delhi-to-shimla,Delhi to Shimla
...,...,...,...,...,...,...,...,...,...,...
1243,Heera Himachal Holidays,Volvo A/C Semi Sleeper (2+2),19:30,10:00,14h 30m,INR 849,39 Seats available,2.9,https://www.redbus.in/bus-tickets/manali-to-delhi,Manali to Delhi
1244,Yatrabybus.com,Bharat Benz A/C Semi Sleeper (2+2),18:00,04:00,10h 00m,INR 1199,49 Seats available,2.2,https://www.redbus.in/bus-tickets/manali-to-delhi,Manali to Delhi
1245,Royal Travels Hills,Volvo A/C Semi Sleeper (2+2),18:00,06:00,12h 00m,INR 1099,41 Seats available,1.9,https://www.redbus.in/bus-tickets/manali-to-delhi,Manali to Delhi
1246,Travel hub,Volvo A/C Semi Sleeper (2+2),19:30,07:30,12h 00m,INR 1099,41 Seats available,1.4,https://www.redbus.in/bus-tickets/manali-to-delhi,Manali to Delhi


### 7. ASSAM

#### ROUTE_NAMES AND ROUTE_LINKS

In [31]:
# opens browser
driver_AS = webdriver.Chrome()
time.sleep(2)

# loads webpage
driver_AS.get("https://www.redbus.in/online-booking/astc/?utm_source=rtchometile")
driver_AS.maximize_window()

# waits for the webpage to load it's contents
wait = WebDriverWait(driver_AS, 10)

def Assam_routes_and_links(path):
    Assam_routes = []
    Assam_links = []

    # route links
    for i in range(1, 6):
        paths = driver_AS.find_elements(By.XPATH, path)

        for link in paths:
            l = link.get_attribute("href")
            Assam_links.append(l)

         # route names
        for route in paths:
            Assam_routes.append(route.text)

        try:
            # Waits for the required element to be present
            pagination_container = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@class="DC_117_paginationTable"]')))
            # Finds the element
            next_pg_btn = pagination_container.find_element(By.XPATH, f'//div[@class="DC_117_pageTabs " and text()={i+1}]')
            
            actions = ActionChains(driver_AS)
            actions.move_to_element(next_pg_btn).perform()
            time.sleep(2)

            # Clicks the element
            next_pg_btn.click()
            
        except NoSuchElementException:
            print('No more pages to paginate')
            break

    return Assam_routes, Assam_links

Assam_routes, Assam_links = Assam_routes_and_links("//a[@class='route']") 

No more pages to paginate


In [32]:
# Convert lists into df
df_AS = pd.DataFrame({"Route_name":Assam_routes, "Route_link":Assam_links})

# Convert df to csv file 
df_AS.to_csv('df_AS.csv', index = False)

# Read the csv file
df_7 = pd.read_csv('df_AS.csv')
df_7

Unnamed: 0,Route_name,Route_link
0,Tezpur to Guwahati,https://www.redbus.in/bus-tickets/tezpur-to-gu...
1,Guwahati to Tezpur,https://www.redbus.in/bus-tickets/guwahati-to-...
2,Guwahati to Nagaon (Assam),https://www.redbus.in/bus-tickets/guwahati-to-...
3,Nagaon (Assam) to Guwahati,https://www.redbus.in/bus-tickets/nagaon-to-gu...
4,Goalpara to Guwahati,https://www.redbus.in/bus-tickets/goalpara-to-...
5,Jorhat to North Lakhimpur,https://www.redbus.in/bus-tickets/jorhat-to-no...
6,Dhubri to Guwahati,https://www.redbus.in/bus-tickets/dhubri-to-gu...
7,Jorhat to Dibrugarh,https://www.redbus.in/bus-tickets/jorhat-to-di...
8,North Lakhimpur to Jorhat,https://www.redbus.in/bus-tickets/north-lakhim...
9,North Lakhimpur to Sibsagar,https://www.redbus.in/bus-tickets/north-lakhim...


#### BUS_DETAILS

In [33]:
# retrive the bus details
driver_AS = webdriver.Chrome()
Bus_names_AS = []
Bus_types_AS = []
Start_Time_AS = []
End_Time_AS = []
Ratings_AS = []
Total_Duration_AS = []
Prices_AS = []
Seats_Available_AS = []
Route_names = []
Route_links = []

for i,r in df_7.iterrows():
    link=r["Route_link"]
    routes=r["Route_name"]

    # Loop through each link
    driver_AS.get(link)
    driver_AS.maximize_window()
    time.sleep(2)  

    # Click on elements to reveal bus details
    elements = driver_AS.find_elements(By.XPATH, f"//a[contains(@href, '{link}')]")
    for element in elements:
        element.click()
        time.sleep(2)
    try:
        clicks = driver_AS.find_element(By.XPATH, "//div[@class='button']")
        clicks = WebDriverWait(driver_AS, 5).until(EC.element_to_be_clickable((By.XPATH, "//div[@class='button']")))
        clicks.click()
        time.sleep(2)

        scrolling = True
        while scrolling:
            old_page_source = driver_AS.page_source

            # Use ActionChains to go to the end of the page
            ActionChains(driver_AS).send_keys(Keys.END).perform()

            time.sleep(5)  

            new_page_source = driver_AS.page_source

            if new_page_source == old_page_source:
                scrolling = False

    except NoSuchElementException:

        scrolling = True
        while scrolling:
            old_page_source = driver_AS.page_source

            # Use ActionChains to go to the end of the page
            ActionChains(driver_AS).send_keys(Keys.END).perform()

            time.sleep(5)  

            new_page_source = driver_AS.page_source

            if new_page_source == old_page_source:
                scrolling = False      

    # Extract bus details
    bus_name = driver_AS.find_elements(By.XPATH, "//div[@class='travels lh-24 f-bold d-color']")
    bus_type = driver_AS.find_elements(By.XPATH, "//div[@class='bus-type f-12 m-top-16 l-color evBus']")
    start_time = driver_AS.find_elements(By.XPATH, "//*[@class='dp-time f-19 d-color f-bold']")
    end_time = driver_AS.find_elements(By.XPATH, "//*[@class='bp-time f-19 d-color disp-Inline']")
    total_duration = driver_AS.find_elements(By.XPATH, "//*[@class='dur l-color lh-24']")
    rating = driver_AS.find_elements(By.XPATH,"//div[@class='clearfix row-one']/div[@class='column-six p-right-10 w-10 fl']")
    price = driver_AS.find_elements(By.XPATH, '//*[@class="fare d-block"]')
    seats = driver_AS.find_elements(By.XPATH, "//div[contains(@class, 'seat-left')]")

    # Append data to respective lists
    for bus in bus_name:
        Bus_names_AS.append(bus.text)
        Route_links.append(link)
        Route_names.append(routes)

    for bus_type_elem in bus_type:
        Bus_types_AS.append(bus_type_elem.text)

    for start_time_elem in start_time:
        Start_Time_AS.append(start_time_elem.text)

    for end_time_elem in end_time:
        End_Time_AS.append(end_time_elem.text)

    for total_duration_elem in total_duration:
        Total_Duration_AS.append(total_duration_elem.text)

    for ratings in rating:
        Ratings_AS.append(ratings.text)

    for price_elem in price:
        Prices_AS.append(price_elem.text)

    for seats_elem in seats:
        Seats_Available_AS.append(seats_elem.text)

print("Successfully Completed")

Successfully Completed


In [34]:
# convert lists to a dictionary
data_7 = {
    'Bus_name': Bus_names_AS,
    'Bus_type': Bus_types_AS,
    'Start_time': Start_Time_AS,
    'End_time': End_Time_AS,
    'Total_duration': Total_Duration_AS,
    'Price': Prices_AS,
    "Seats_Available":Seats_Available_AS,
    "Ratings":Ratings_AS,
    'Route_link': Route_links,
    'Route_name': Route_names
}

# convert dictionary to dataframe
df_buses_7 = pd.DataFrame(data_7)

# convert dataframe to csv and read
df_buses_7.to_csv("df_buses_7.csv",index=False)
df_buses_7 = pd.read_csv("df_buses_7.csv")
df_buses_7

Unnamed: 0,Bus_name,Bus_type,Start_time,End_time,Total_duration,Price,Seats_Available,Ratings,Route_link,Route_name
0,NEELKANTH TRAVELS,A/C Seater Push Back (2+2),05:15,08:30,03h 15m,INR 300,13 Seats available,4.4\n141,https://www.redbus.in/bus-tickets/tezpur-to-gu...,Tezpur to Guwahati
1,MAA Anada (UNDER ASTC),AC Seater (2+2),05:20,08:30,03h 10m,INR 304.2,30 Seats available,3.3\n44,https://www.redbus.in/bus-tickets/tezpur-to-gu...,Tezpur to Guwahati
2,Kanchan Travels,AC Seater (2+2),05:30,08:45,03h 15m,INR 349,26 Seats available,4.3\n48,https://www.redbus.in/bus-tickets/tezpur-to-gu...,Tezpur to Guwahati
3,Mahi Travels(Under ASTC),A/C Seater (2+2),05:40,08:55,03h 15m,349,11 Seats available,4.5\n368,https://www.redbus.in/bus-tickets/tezpur-to-gu...,Tezpur to Guwahati
4,WARISPIYA TRAVELS,AC Seater (2+2),05:45,09:00,03h 15m,INR 360,24 Seats available,4.2\n273,https://www.redbus.in/bus-tickets/tezpur-to-gu...,Tezpur to Guwahati
...,...,...,...,...,...,...,...,...,...,...
494,ZAMZAM TRAVELS,NON A/C Seater (2+1),20:15,03:30,07h 15m,INR 500,35 Seats available,3.9,https://www.redbus.in/bus-tickets/guwahati-to-...,Guwahati to Golaghat
495,ZAMZAM TRAVELS,A/C Seater (2+1),13:05,19:45,06h 40m,INR 550,27 Seats available,3.9,https://www.redbus.in/bus-tickets/guwahati-to-...,Guwahati to Golaghat
496,Assam State Transport Corporation (ASTC) - 142178,Bharat Benz A/C Seater (2+2),06:45,12:00,05h 15m,INR 339,3 Seats available,3.2\n15,https://www.redbus.in/bus-tickets/jorhat-to-go...,Jorhat to Gogamukh
497,Air Bus Travels,NON AC Seater / Sleeper 2+1,19:40,05:00,09h 20m,INR 600,43 Seats available,2.9\n44,https://www.redbus.in/bus-tickets/dibrugarh-to...,Dibrugarh to Biswanath Charali


### 8. WEST BENGAL

#### ROUTE_NAMES AND ROUTE_LINKS

In [35]:
# opens browser
driver_WB = webdriver.Chrome()
time.sleep(2)

# loads webpage
driver_WB.get("https://www.redbus.in/online-booking/wbtc-ctc/?utm_source=rtchometile")
driver_WB.maximize_window()

# waits for the webpage to load it's contents
wait = WebDriverWait(driver_WB, 10)

def West_Bengal_routes_and_links(path):
    West_Bengal_routes = []
    West_Bengal_links = []

    # route links
    for i in range(1, 5):
        paths = driver_WB.find_elements(By.XPATH, path)

        for link in paths:
            l = link.get_attribute("href")
            West_Bengal_links.append(l)

         # route names
        for route in paths:
            West_Bengal_routes.append(route.text)

        try:
            # Waits for the required element to be present
            pagination_container = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@class="DC_117_paginationTable"]')))
            # Finds the element
            next_pg_btn = pagination_container.find_element(By.XPATH, f'//div[@class="DC_117_pageTabs " and text()={i+1}]')
            
            actions = ActionChains(driver_WB)
            actions.move_to_element(next_pg_btn).perform()
            time.sleep(2)

            # Clicks the element
            next_pg_btn.click()
            
        except NoSuchElementException:
            print('No more pages to paginate')
            break

    return West_Bengal_routes, West_Bengal_links

West_Bengal_routes, West_Bengal_links = West_Bengal_routes_and_links("//a[@class='route']") 

No more pages to paginate


In [36]:
# Convert lists into df
df_WB = pd.DataFrame({"Route_name":West_Bengal_routes, "Route_link":West_Bengal_links})

# Convert df to csv file 
df_WB.to_csv('df_WB.csv', index = False)

# Read the csv file
df_8 = pd.read_csv('df_WB.csv')
df_8

Unnamed: 0,Route_name,Route_link
0,Digha to Barasat (West Bengal),https://www.redbus.in/bus-tickets/digha-to-bar...
1,Durgapur to Calcutta,https://www.redbus.in/bus-tickets/durgapur-to-...
2,Digha to Calcutta,https://www.redbus.in/bus-tickets/digha-to-kol...
3,Kolkata to Digha,https://www.redbus.in/bus-tickets/kolkata-to-d...
4,Barasat (West Bengal) to Digha,https://www.redbus.in/bus-tickets/barasat-west...
5,Kolkata to Suri,https://www.redbus.in/bus-tickets/kolkata-to-suri
6,Barasat (West Bengal) to Midnapore,https://www.redbus.in/bus-tickets/barasat-west...
7,Midnapore to Kolkata,https://www.redbus.in/bus-tickets/midnapore-to...
8,Barasat (West Bengal) to Kolaghat,https://www.redbus.in/bus-tickets/barasat-west...
9,Barasat (West Bengal) to Contai (Kanthi),https://www.redbus.in/bus-tickets/barasat-west...


#### BUS_DETAILS

In [44]:
# retrive the bus details
driver_WB = webdriver.Chrome()
Bus_names_WB = []
Bus_types_WB = []
Start_Time_WB = []
End_Time_WB = []
Ratings_WB = []
Total_Duration_WB = []
Prices_WB = []
Seats_Available_WB = []
Route_names = []
Route_links = []

for i, r in df_8.iterrows():
    link = r["Route_link"]
    routes = r["Route_name"]

    # Loop through each link
    driver_WB.get(link)
    driver_WB.maximize_window()
    time.sleep(2)

    # Click on elements to reveal bus details
    elements = driver_WB.find_elements(By.XPATH, f"//a[contains(@href, '{link}')]")
    for element in elements:
        element.click()
        time.sleep(2)

    try:
        clicks = driver_WB.find_element(By.XPATH, "//div[@class='button']")
        clicks = WebDriverWait(driver_WB, 5).until(EC.element_to_be_clickable((By.XPATH, "//div[@class='button']")))
        clicks.click()
        time.sleep(2)

        scrolling = True
        while scrolling:
            old_page_source = driver_WB.page_source

            # Use ActionChains to go to the end of the page
            ActionChains(driver_WB).send_keys(Keys.END).perform()

            time.sleep(5)

            new_page_source = driver_WB.page_source

            if new_page_source == old_page_source:
                scrolling = False

    except NoSuchElementException:
        scrolling = True
        while scrolling:
            old_page_source = driver_WB.page_source

            # Use ActionChains to go to the end of the page
            ActionChains(driver_WB).send_keys(Keys.END).perform()

            time.sleep(5)

            new_page_source = driver_WB.page_source

            if new_page_source == old_page_source:
                scrolling = False

    # Extract bus details
    bus_name = driver_WB.find_elements(By.XPATH, "//div[@class='travels lh-24 f-bold d-color']")
    bus_type = driver_WB.find_elements(By.XPATH, "//div[@class='bus-type f-12 m-top-16 l-color evBus']")
    start_time = driver_WB.find_elements(By.XPATH, "//*[@class='dp-time f-19 d-color f-bold']")
    end_time = driver_WB.find_elements(By.XPATH, "//*[@class='bp-time f-19 d-color disp-Inline']")
    total_duration = driver_WB.find_elements(By.XPATH, "//*[@class='dur l-color lh-24']")
    rating = driver_WB.find_elements(By.XPATH, "//div[@class='clearfix row-one']/div[@class='column-six p-right-10 w-10 fl']")
    price = driver_WB.find_elements(By.XPATH, '//*[@class="fare d-block"]')
    seats = driver_WB.find_elements(By.XPATH, "//div[contains(@class, 'seat-left')]")

    # Append data to respective lists
    for bus in bus_name:
        Bus_names_WB.append(bus.text)
        Route_links.append(link)
        Route_names.append(routes)

    for bus_type_elem in bus_type:
        Bus_types_WB.append(bus_type_elem.text)

    for start_time_elem in start_time:
        Start_Time_WB.append(start_time_elem.text)

    for end_time_elem in end_time:
        End_Time_WB.append(end_time_elem.text)

    for total_duration_elem in total_duration:
        Total_Duration_WB.append(total_duration_elem.text)

    for ratings in rating:
        Ratings_WB.append(ratings.text)

    for price_elem in price:
        Prices_WB.append(price_elem.text)

    for seats_elem in seats:
        Seats_Available_WB.append(seats_elem.text)

print("Successfully Completed")

Successfully Completed


In [45]:
# convert lists to a dictionary
data_8 = {
    'Bus_name': Bus_names_WB,
    'Bus_type': Bus_types_WB,
    'Start_time': Start_Time_WB,
    'End_time': End_Time_WB,
    'Total_duration': Total_Duration_WB,
    'Price': Prices_WB,
    "Seats_Available":Seats_Available_WB,
    "Ratings":Ratings_WB,
    'Route_link': Route_links,
    'Route_name': Route_names,
}

# convert dictionary to dataframe
df_buses_8 = pd.DataFrame(data_8)

# convert dataframe to csv and read
df_buses_8.to_csv("df_buses_8.csv",index=False)
df_buses_8 = pd.read_csv("df_buses_8.csv")
df_buses_8

Unnamed: 0,Bus_name,Bus_type,Start_time,End_time,Total_duration,Price,Seats_Available,Ratings,Route_link,Route_name
0,WBTC (CTC) HABRA-DIGHA via Esplanade - 176|04:00,Non AC Seater (2+3),04:00,09:50,05h 50m,INR 165,24 Seats available,2.1\n7,https://www.redbus.in/bus-tickets/digha-to-bar...,Digha to Barasat (West Bengal)
1,WBTC (CTC) BARASAT-DIGHA via Bally - 38|05:40,Non AC Seater (2+3),05:40,10:10,04h 30m,INR 161,26 Seats available,3.2\n100,https://www.redbus.in/bus-tickets/digha-to-bar...,Digha to Barasat (West Bengal)
2,WBTC (CTC) BADURIA - DIGHA via Bally - 155|09:00,Non AC Seater (2+3),09:00,12:25,03h 25m,INR 161,33 Seats available,2.7\n17,https://www.redbus.in/bus-tickets/digha-to-bar...,Digha to Barasat (West Bengal)
3,WBTC (CTC) HABRA-DIGHA via Bally - 26|10:00,Non AC Seater (2+3),10:00,14:45,04h 45m,INR 161,25 Seats available,3.4\n47,https://www.redbus.in/bus-tickets/digha-to-bar...,Digha to Barasat (West Bengal)
4,"WBTC (CTC) BARASAT-DIGHA via Esplanade, Karuna...",Non AC Seater (2+3),10:15,15:35,05h 20m,INR 165,41 Seats available,3.0\n4,https://www.redbus.in/bus-tickets/digha-to-bar...,Digha to Barasat (West Bengal)
...,...,...,...,...,...,...,...,...,...,...
1089,WBTC (CTC) HABRA-DIGHA via Bally - 26|12:45,Non AC Seater (2+3),12:45,16:40,03h 55m,INR 130,47 Seats available,4.2,https://www.redbus.in/bus-tickets/habra-to-heria,Habra to Heria
1090,WBTC (CTC) Habra-Midnapur - 194|14:45,Non AC Seater (2+3),14:45,17:40,02h 55m,INR 111,51 Seats available,4.0,https://www.redbus.in/bus-tickets/midnapore-to...,Midnapore to Kolkata Airport
1091,WBTC (CTC) Habra-Midnapur - 194|15:40,Non AC Seater (2+3),15:40,18:40,03h 00m,INR 111,39 Seats available,3.8\n22,https://www.redbus.in/bus-tickets/midnapore-to...,Midnapore to Kolkata Airport
1092,WBTC (CTC) Habra-Midnapur - 194|17:00,Non AC Seater (2+3),17:00,19:55,02h 55m,INR 111,51 Seats available,5.0\n4,https://www.redbus.in/bus-tickets/midnapore-to...,Midnapore to Kolkata Airport


### 9. CHANDIGARH

#### ROUTE_NAMES AND ROUTE_LINKS

In [46]:
# opens browser
driver_C = webdriver.Chrome()
time.sleep(2)

# loads webpage
driver_C.get("https://www.redbus.in/online-booking/chandigarh-transport-undertaking-ctu")
driver_C.maximize_window()

# waits for the webpage to load it's contents
wait = WebDriverWait(driver_C, 10)

def Chandigarh_routes_and_links(path):
    Chandigarh_routes = []
    Chandigarh_links = []

    # route links
    for i in range(1, 6):
        paths = driver_C.find_elements(By.XPATH, path)

        for link in paths:
            l = link.get_attribute("href")
            Chandigarh_links.append(l)

         # route names
        for route in paths:
            Chandigarh_routes.append(route.text)

        try:
            # Waits for the required element to be present
            pagination_container = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@class="DC_117_paginationTable"]')))
            # Finds the element
            next_pg_btn = pagination_container.find_element(By.XPATH, f'//div[@class="DC_117_pageTabs " and text()={i+1}]')
            
            actions = ActionChains(driver_C)
            actions.move_to_element(next_pg_btn).perform()
            time.sleep(2)

            # Clicks the element
            next_pg_btn.click()
            
        except NoSuchElementException:
            print('No more pages to paginate')
            break

    return Chandigarh_routes, Chandigarh_links

Chandigarh_routes, Chandigarh_links = Chandigarh_routes_and_links("//a[@class='route']") 

No more pages to paginate


In [47]:
# Convert lists into df
df_C = pd.DataFrame({"Route_name":Chandigarh_routes, "Route_link":Chandigarh_links})

# Convert df to csv file 
df_C.to_csv('df_C.csv', index = False)

# Read the csv file
df_9 = pd.read_csv('df_C.csv')
df_9

Unnamed: 0,Route_name,Route_link
0,Yamuna Nagar to Chandigarh,https://www.redbus.in/bus-tickets/yamuna-nagar...
1,Chandigarh to Delhi,https://www.redbus.in/bus-tickets/chandigarh-t...
2,Delhi to Chandigarh,https://www.redbus.in/bus-tickets/delhi-to-cha...
3,Ludhiana to Chandigarh,https://www.redbus.in/bus-tickets/ludhiana-to-...
4,Chandigarh to Yamuna Nagar,https://www.redbus.in/bus-tickets/chandigarh-t...
5,Chandigarh to Ludhiana,https://www.redbus.in/bus-tickets/chandigarh-t...
6,Hamirpur (Himachal Pradesh) to Chandigarh,https://www.redbus.in/bus-tickets/hamirpur-him...
7,Chandigarh to Vrindavan,https://www.redbus.in/bus-tickets/chandigarh-t...
8,Chandigarh to Hamirpur (Himachal Pradesh),https://www.redbus.in/bus-tickets/chandigarh-t...
9,Chandigarh to Pathankot,https://www.redbus.in/bus-tickets/chandigarh-t...


#### BUS_DETAILS

In [48]:
# retrive the bus details
driver_C = webdriver.Chrome()
Bus_names_C = []
Bus_types_C = []
Start_Time_C = []
End_Time_C = []
Ratings_C = []
Total_Duration_C = []
Prices_C = []
Seats_Available_C = []
Route_names = []
Route_links = []

for i, r in df_9.iterrows():
    link = r["Route_link"]
    routes = r["Route_name"]

    # Loop through each link
    driver_C.get(link)
    driver_C.maximize_window()
    time.sleep(2)

    # Click on elements to reveal bus details
    elements = driver_C.find_elements(By.XPATH, f"//a[contains(@href, '{link}')]")
    for element in elements:
        element.click()
        time.sleep(2)

    try:
        clicks = driver_C.find_element(By.XPATH, "//div[@class='button']")
        clicks = WebDriverWait(driver_C, 5).until(EC.element_to_be_clickable((By.XPATH, "//div[@class='button']")))
        clicks.click()
        time.sleep(2)

        scrolling = True
        while scrolling:
            old_page_source = driver_C.page_source

            # Use ActionChains to go to the end of the page
            ActionChains(driver_C).send_keys(Keys.END).perform()

            time.sleep(5)

            new_page_source = driver_C.page_source

            if new_page_source == old_page_source:
                scrolling = False

    except NoSuchElementException:
        scrolling = True
        while scrolling:
            old_page_source = driver_C.page_source

            # Use ActionChains to go to the end of the page
            ActionChains(driver_C).send_keys(Keys.END).perform()

            time.sleep(5)

            new_page_source = driver_C.page_source

            if new_page_source == old_page_source:
                scrolling = False

    # Extract bus details
    bus_name = driver_C.find_elements(By.XPATH, "//div[@class='travels lh-24 f-bold d-color']")
    bus_type = driver_C.find_elements(By.XPATH, "//div[@class='bus-type f-12 m-top-16 l-color evBus']")
    start_time = driver_C.find_elements(By.XPATH, "//*[@class='dp-time f-19 d-color f-bold']")
    end_time = driver_C.find_elements(By.XPATH, "//*[@class='bp-time f-19 d-color disp-Inline']")
    total_duration = driver_C.find_elements(By.XPATH, "//*[@class='dur l-color lh-24']")
    rating = driver_C.find_elements(By.XPATH, "//div[@class='clearfix row-one']/div[@class='column-six p-right-10 w-10 fl']")
    price = driver_C.find_elements(By.XPATH, '//*[@class="fare d-block"]')
    seats = driver_C.find_elements(By.XPATH, "//div[contains(@class, 'seat-left')]")

    # Append data to respective lists
    for bus in bus_name:
        Bus_names_C.append(bus.text)
        Route_links.append(link)
        Route_names.append(routes)

    for bus_type_elem in bus_type:
        Bus_types_C.append(bus_type_elem.text)

    for start_time_elem in start_time:
        Start_Time_C.append(start_time_elem.text)

    for end_time_elem in end_time:
        End_Time_C.append(end_time_elem.text)

    for total_duration_elem in total_duration:
        Total_Duration_C.append(total_duration_elem.text)

    for ratings in rating:
        Ratings_C.append(ratings.text)

    for price_elem in price:
        Prices_C.append(price_elem.text)

    for seats_elem in seats:
        Seats_Available_C.append(seats_elem.text)

print("Successfully Completed")

Successfully Completed


In [49]:
# convert lists to a dictionary
data_9 = {
    'Bus_name': Bus_names_C,
    'Bus_type': Bus_types_C,
    'Start_time': Start_Time_C,
    'End_time': End_Time_C,
    'Total_duration': Total_Duration_C,
    'Price': Prices_C,
    "Seats_Available":Seats_Available_C,
    "Ratings":Ratings_C,
    'Route_link': Route_links,
    'Route_name': Route_names
}

# convert dictionary to dataframe
df_buses_9 = pd.DataFrame(data_9)

# convert dataframe to csv and read
df_buses_9.to_csv("df_buses_9.csv",index=False)
df_buses_9 = pd.read_csv("df_buses_9.csv")
df_buses_9

Unnamed: 0,Bus_name,Bus_type,Start_time,End_time,Total_duration,Price,Seats_Available,Ratings,Route_link,Route_name
0,Chandigarh Transport Undertaking (CTU) - 165750,HVAC Seater (2+3),12:20,15:05,02h 45m,INR 195.6,44 Seats available,4.0\n40,https://www.redbus.in/bus-tickets/yamuna-nagar...,Yamuna Nagar to Chandigarh
1,Chandigarh Transport Undertaking (CTU) - 165752,HVAC Seater (2+3),14:20,17:05,02h 45m,INR 195.6,46 Seats available,4.0\n55,https://www.redbus.in/bus-tickets/yamuna-nagar...,Yamuna Nagar to Chandigarh
2,Zimindara Travels,AC Sleeper (2+1),20:30,00:30,04h 00m,INR 699,36 Seats available,4.1,https://www.redbus.in/bus-tickets/yamuna-nagar...,Yamuna Nagar to Chandigarh
3,SHRI KRISHNA TRAVELS (JAI SHREE GANESH YATRA CO.),VE A/C Seater / Sleeper (2+1),20:40,23:30,02h 50m,INR 599,28 Seats available,3.7,https://www.redbus.in/bus-tickets/yamuna-nagar...,Yamuna Nagar to Chandigarh
4,SHRI KRISHNA TRAVELS (JAI SHREE GANESH YATRA CO.),VE A/C Seater / Sleeper (2+1),00:30,02:30,02h 00m,INR 699,20 Seats available,3.7,https://www.redbus.in/bus-tickets/yamuna-nagar...,Yamuna Nagar to Chandigarh
...,...,...,...,...,...,...,...,...,...,...
1448,Jujhar Travels,Volvo A/C Semi Sleeper (2+2),14:13,18:18,04h 05m,INR 650,40 Seats available,3.8\n35,https://www.redbus.in/bus-tickets/chandigarh-t...,Chandigarh to Firozpur
1449,Jujhar Travels,Volvo A/C Semi Sleeper (2+2),15:25,19:30,04h 05m,INR 650,39 Seats available,3.6\n22,https://www.redbus.in/bus-tickets/chandigarh-t...,Chandigarh to Firozpur
1450,Jujhar Travels,Volvo A/C Semi Sleeper (2+2),17:25,21:20,03h 55m,INR 650,45 Seats available,4.3\n16,https://www.redbus.in/bus-tickets/chandigarh-t...,Chandigarh to Firozpur
1451,Orbit Aviation Pvt. Ltd.,A/C Seater (2+2),17:55,21:55,04h 00m,INR 666,45 Seats available,3.1\n9,https://www.redbus.in/bus-tickets/chandigarh-t...,Chandigarh to Firozpur


### 10. UTTAR PRADESH

#### ROUTE_NAMES AND ROUTE_LINKS

In [51]:
# opens browser
driver_UP = webdriver.Chrome()
time.sleep(2)

# loads webpage
driver_UP.get("https://www.redbus.in/online-booking/uttar-pradesh-state-road-transport-corporation-upsrtc/?utm_source=rtchometile")
driver_UP.maximize_window()

# waits for the webpage to load it's contents
wait = WebDriverWait(driver_UP, 10)

def Uttar_Pradesh_routes_and_links(path):
    Uttar_Pradesh_routes = []
    Uttar_Pradesh_links = []

    # route links
    for i in range(1, 6):
        paths = driver_UP.find_elements(By.XPATH, path)

        for link in paths:
            l = link.get_attribute("href")
            Uttar_Pradesh_links.append(l)

         # route names
        for route in paths:
            Uttar_Pradesh_routes.append(route.text)

        try:
            # Waits for the required element to be present
            pagination_container = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@class="DC_117_paginationTable"]')))
            # Finds the element
            next_pg_btn = pagination_container.find_element(By.XPATH, f'//div[@class="DC_117_pageTabs " and text()={i+1}]')
            
            actions = ActionChains(driver_UP)
            actions.move_to_element(next_pg_btn).perform()
            time.sleep(2)

            # Clicks the element
            next_pg_btn.click()
            
        except NoSuchElementException:
            print('No more pages to paginate')
            break

    return Uttar_Pradesh_routes, Uttar_Pradesh_links

Uttar_Pradesh_routes, Uttar_Pradesh_links = Uttar_Pradesh_routes_and_links("//a[@class='route']")

No more pages to paginate


In [52]:
# Convert lists into df
df_UP = pd.DataFrame({"Route_name":Uttar_Pradesh_routes, "Route_link":Uttar_Pradesh_links})

# Convert df to csv file 
df_UP.to_csv('df_UP.csv', index = False)

# Read the csv file
df_10 = pd.read_csv('df_UP.csv')
df_10

Unnamed: 0,Route_name,Route_link
0,Delhi to Bareilly,https://www.redbus.in/bus-tickets/delhi-to-bar...
1,Bareilly to Delhi,https://www.redbus.in/bus-tickets/bareilly-to-...
2,Aligarh (uttar pradesh) to Delhi,https://www.redbus.in/bus-tickets/aligarh-utta...
3,Delhi to Aligarh (uttar pradesh),https://www.redbus.in/bus-tickets/delhi-to-ali...
4,Lucknow to Allahabad,https://www.redbus.in/bus-tickets/lucknow-to-a...
5,Lucknow to Delhi,https://www.redbus.in/bus-tickets/lucknow-to-d...
6,Delhi to Farrukhabad (Uttar Pradesh),https://www.redbus.in/bus-tickets/delhi-to-far...
7,Farrukhabad (Uttar Pradesh) to Delhi,https://www.redbus.in/bus-tickets/farrukhabad-...
8,Badaun to Delhi,https://www.redbus.in/bus-tickets/badaun-to-delhi
9,Allahabad to Lucknow,https://www.redbus.in/bus-tickets/allahabad-to...


BUS DETAILS

In [53]:
# retrive the bus details
driver_UP = webdriver.Chrome()
Bus_names_UP = []
Bus_types_UP = []
Start_Time_UP = []
End_Time_UP = []
Ratings_UP = []
Total_Duration_UP = []
Prices_UP = []
Seats_Available_UP = []
Route_names = []
Route_links = []

for i, r in df_10.iterrows():
    link = r["Route_link"]
    routes = r["Route_name"]

    # Loop through each link
    driver_UP.get(link)
    driver_UP.maximize_window()
    time.sleep(2)

    # Click on elements to reveal bus details
    elements = driver_UP.find_elements(By.XPATH, f"//a[contains(@href, '{link}')]")
    for element in elements:
        element.click()
        time.sleep(2)

    try:
        clicks = driver_UP.find_element(By.XPATH, "//div[@class='button']")
        clicks = WebDriverWait(driver_UP, 5).until(EC.element_to_be_clickable((By.XPATH, "//div[@class='button']")))
        clicks.click()
        time.sleep(2)

        scrolling = True
        while scrolling:
            old_page_source = driver_UP.page_source

            # Use ActionChains to go to the end of the page
            ActionChains(driver_UP).send_keys(Keys.END).perform()

            time.sleep(5)

            new_page_source = driver_UP.page_source

            if new_page_source == old_page_source:
                scrolling = False

    except NoSuchElementException:
        scrolling = True
        while scrolling:
            old_page_source = driver_UP.page_source

            # Use ActionChains to go to the end of the page
            ActionChains(driver_UP).send_keys(Keys.END).perform()

            time.sleep(5)

            new_page_source = driver_UP.page_source

            if new_page_source == old_page_source:
                scrolling = False

    # Extract bus details
    bus_name = driver_UP.find_elements(By.XPATH, "//div[@class='travels lh-24 f-bold d-color']")
    bus_type = driver_UP.find_elements(By.XPATH, "//div[@class='bus-type f-12 m-top-16 l-color evBus']")
    start_time = driver_UP.find_elements(By.XPATH, "//*[@class='dp-time f-19 d-color f-bold']")
    end_time = driver_UP.find_elements(By.XPATH, "//*[@class='bp-time f-19 d-color disp-Inline']")
    total_duration = driver_UP.find_elements(By.XPATH, "//*[@class='dur l-color lh-24']")
    rating = driver_UP.find_elements(By.XPATH, "//div[@class='clearfix row-one']/div[@class='column-six p-right-10 w-10 fl']")
    price = driver_UP.find_elements(By.XPATH, '//*[@class="fare d-block"]')
    seats = driver_UP.find_elements(By.XPATH, "//div[contains(@class, 'seat-left')]")

    # Append data to respective lists
    for bus in bus_name:
        Bus_names_UP.append(bus.text)
        Route_links.append(link)
        Route_names.append(routes)

    for bus_type_elem in bus_type:
        Bus_types_UP.append(bus_type_elem.text)

    for start_time_elem in start_time:
        Start_Time_UP.append(start_time_elem.text)

    for end_time_elem in end_time:
        End_Time_UP.append(end_time_elem.text)

    for total_duration_elem in total_duration:
        Total_Duration_UP.append(total_duration_elem.text)

    for ratings in rating:
        Ratings_UP.append(ratings.text)

    for price_elem in price:
        Prices_UP.append(price_elem.text)

    for seats_elem in seats:
        Seats_Available_UP.append(seats_elem.text)

print("Successfully Completed")

Successfully Completed


In [54]:
# convert lists to a dictionary
data_10 = {
    'Bus_name': Bus_names_UP,
    'Bus_type': Bus_types_UP,
    'Start_time': Start_Time_UP,
    'End_time': End_Time_UP,
    'Total_duration': Total_Duration_UP,
    'Price': Prices_UP,
    "Seats_Available":Seats_Available_UP,
    "Ratings":Ratings_UP,
    'Route_link': Route_links,
    'Route_name': Route_names
}

# convert dictionary to dataframe
df_buses_10 = pd.DataFrame(data_10)

# convert dataframe to csv and read
df_buses_10.to_csv("df_buses_10.csv",index=False)
df_buses_10 = pd.read_csv("df_buses_10.csv")
df_buses_10

Unnamed: 0,Bus_name,Bus_type,Start_time,End_time,Total_duration,Price,Seats_Available,Ratings,Route_link,Route_name
0,UPSRTC - RKD0134,Janrath AC Seater 2+3,00:31,05:31,05h 00m,INR 520,46 Seats available,1.9\n17,https://www.redbus.in/bus-tickets/delhi-to-bar...,Delhi to Bareilly
1,UPSRTC - STD0161,Ordinary Non AC Seater 2+3,06:00,13:16,07h 16m,INR 448,52 Seats available,3.3,https://www.redbus.in/bus-tickets/delhi-to-bar...,Delhi to Bareilly
2,UPSRTC - GRH0229,Ordinary Non AC Seater 2+3,06:00,12:30,06h 30m,INR 418,52 Seats available,3.3,https://www.redbus.in/bus-tickets/delhi-to-bar...,Delhi to Bareilly
3,UPSRTC - RKD0129,Janrath AC Seater 2+2,06:01,11:01,05h 00m,INR 598,35 Seats available,3.1\n17,https://www.redbus.in/bus-tickets/delhi-to-bar...,Delhi to Bareilly
4,UPSRTC - GRH0230,Ordinary Non AC Seater 2+3,06:30,13:00,06h 30m,INR 418,52 Seats available,3.3,https://www.redbus.in/bus-tickets/delhi-to-bar...,Delhi to Bareilly
...,...,...,...,...,...,...,...,...,...,...
2573,Betrwanti Travels,A/C Sleeper (2+1),07:50,09:50,02h 00m,INR 800,20 Seats available,2.7,https://www.redbus.in/bus-tickets/kanpur-to-lu...,Kanpur (Uttar Pradesh) to Lucknow
2574,Betrwanti Travels,A/C Sleeper (2+1),08:30,10:45,02h 15m,INR 1200,36 Seats available,2.7,https://www.redbus.in/bus-tickets/kanpur-to-lu...,Kanpur (Uttar Pradesh) to Lucknow
2575,MISHRA BANDHU BUS SERVICE,A/C Seater / Sleeper (2+1),18:30,21:30,03h 00m,INR 1500,33 Seats available,2.2\n5,https://www.redbus.in/bus-tickets/kanpur-to-lu...,Kanpur (Uttar Pradesh) to Lucknow
2576,KANAHAIYA TOUR & TRAVELS,A/C Seater (2+3),19:00,23:00,04h 00m,INR 3000,36 Seats available,,https://www.redbus.in/bus-tickets/kanpur-to-lu...,Kanpur (Uttar Pradesh) to Lucknow


# DATA CLEANING

In [5]:
# csv to dataframe
df_buses_1=pd.read_csv("df_buses_1.csv")
df_buses_2=pd.read_csv("df_buses_2.csv")
df_buses_3=pd.read_csv("df_buses_3.csv")
df_buses_4=pd.read_csv("df_buses_4.csv")
df_buses_5=pd.read_csv("df_buses_5.csv")
df_buses_6=pd.read_csv("df_buses_6.csv")
df_buses_7=pd.read_csv("df_buses_7.csv")
df_buses_8=pd.read_csv("df_buses_8.csv")
df_buses_9=pd.read_csv("df_buses_9.csv")
df_buses_10=pd.read_csv("df_buses_10.csv")

Final_df=pd.concat([df_buses_1,df_buses_2,df_buses_3,df_buses_4,df_buses_5,df_buses_6,
                    df_buses_7,df_buses_8,df_buses_9,df_buses_10],ignore_index=True)
Final_df

Unnamed: 0,Bus_name,Bus_type,Start_time,End_time,Total_duration,Price,Seats_Available,Ratings,Route_link,Route_name
0,Sri KVR Travels,A/C Sleeper (2+1),23:35,05:15,05h 40m,INR 560,13 Seats available,4.6\n295,https://www.redbus.in/bus-tickets/hyderabad-to...,Hyderabad to Vijayawada
1,FRESHBUS,Electric A/C Seater (2+2),23:10,05:35,06h 25m,450,28 Seats available,4.5\n504,https://www.redbus.in/bus-tickets/hyderabad-to...,Hyderabad to Vijayawada
2,IntrCity SmartBus,A/C Seater / Sleeper (2+1),23:50,05:35,05h 45m,INR 433,20 Seats available,4.4\n949,https://www.redbus.in/bus-tickets/hyderabad-to...,Hyderabad to Vijayawada
3,AdSri Durga Malleswari Travels,A/C Sleeper (2+1),23:10,04:35,05h 25m,665,17 Seats available,4.0\n111,https://www.redbus.in/bus-tickets/hyderabad-to...,Hyderabad to Vijayawada
4,IntrCity SmartBus,Scania AC Multi Axle Sleeper (2+1),23:59,06:30,06h 31m,INR 595,16 Seats available,4.5\n465,https://www.redbus.in/bus-tickets/hyderabad-to...,Hyderabad to Vijayawada
...,...,...,...,...,...,...,...,...,...,...
13449,Betrwanti Travels,A/C Sleeper (2+1),07:50,09:50,02h 00m,INR 800,20 Seats available,2.7,https://www.redbus.in/bus-tickets/kanpur-to-lu...,Kanpur (Uttar Pradesh) to Lucknow
13450,Betrwanti Travels,A/C Sleeper (2+1),08:30,10:45,02h 15m,INR 1200,36 Seats available,2.7,https://www.redbus.in/bus-tickets/kanpur-to-lu...,Kanpur (Uttar Pradesh) to Lucknow
13451,MISHRA BANDHU BUS SERVICE,A/C Seater / Sleeper (2+1),18:30,21:30,03h 00m,INR 1500,33 Seats available,2.2\n5,https://www.redbus.in/bus-tickets/kanpur-to-lu...,Kanpur (Uttar Pradesh) to Lucknow
13452,KANAHAIYA TOUR & TRAVELS,A/C Seater (2+3),19:00,23:00,04h 00m,INR 3000,36 Seats available,,https://www.redbus.in/bus-tickets/kanpur-to-lu...,Kanpur (Uttar Pradesh) to Lucknow


In [6]:
# data about the data
Final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13454 entries, 0 to 13453
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Bus_name         13454 non-null  object
 1   Bus_type         13454 non-null  object
 2   Start_time       13454 non-null  object
 3   End_time         13454 non-null  object
 4   Total_duration   13454 non-null  object
 5   Price            13454 non-null  object
 6   Seats_Available  13454 non-null  object
 7   Ratings          13454 non-null  object
 8   Route_link       13454 non-null  object
 9   Route_name       13454 non-null  object
dtypes: object(10)
memory usage: 1.0+ MB


In [7]:
#convert prices to numeric
Final_df["Price"]=Final_df["Price"].str.replace("INR","")
Final_df["Price"]=Final_df["Price"].astype(float)
Final_df["Price"].fillna(0) 

0         560.0
1         450.0
2         433.0
3         665.0
4         595.0
          ...  
13449     800.0
13450    1200.0
13451    1500.0
13452    3000.0
13453     999.0
Name: Price, Length: 13454, dtype: float64

In [8]:
#convert Ratings to numeric
Final_df["Ratings"]=Final_df["Ratings"].str.replace("New","")
Final_df["Ratings"]=Final_df["Ratings"].str.strip()
Final_df["Ratings"]=Final_df["Ratings"].str.split().str[0]
Final_df["Ratings"] = pd.to_numeric(Final_df["Ratings"], errors='coerce')
Final_df["Ratings"].fillna(0,inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  Final_df["Ratings"].fillna(0,inplace=True)


In [15]:
Final_df['Seats_Available'] = Final_df['Seats_Available'].str.extract(r'(\d+)').astype(int)
Final_df['Seats_Available']



0        13
1        28
2        20
3        17
4        16
         ..
13449    20
13450    36
13451    33
13452    36
13453    19
Name: Seats_Available, Length: 13450, dtype: int32

In [17]:
Final_df['Total_duration'] = Final_df['Total_duration'].str.replace('h', ':').str.replace('m', ':00').str.replace(' ', '')
Final_df['Total_duration']

0        05:40:00
1        06:25:00
2        05:45:00
3        05:25:00
4        06:31:00
           ...   
13449    02:00:00
13450    02:15:00
13451    03:00:00
13452    04:00:00
13453    02:30:00
Name: Total_duration, Length: 13450, dtype: object

In [18]:
# info after the data type change
Final_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13450 entries, 0 to 13453
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Bus_name         13450 non-null  object 
 1   Bus_type         13450 non-null  object 
 2   Start_time       13450 non-null  object 
 3   End_time         13450 non-null  object 
 4   Total_duration   13450 non-null  object 
 5   Price            13450 non-null  float64
 6   Seats_Available  13450 non-null  int32  
 7   Ratings          13450 non-null  float64
 8   Route_link       13450 non-null  object 
 9   Route_name       13450 non-null  object 
dtypes: float64(2), int32(1), object(7)
memory usage: 1.1+ MB


In [19]:
Final_df = Final_df[Final_df["Price"] <= 7000]

In [20]:
# replacing the nan value
Final_df = Final_df.replace({np.nan: None})

In [21]:
# change dataframe to csv
Final_df.to_csv("Final_df.csv",index=False)