In [None]:

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
import mysql.connector as sconn
from mysql.connector import Error

# ------------------------------------------------ Selenium Web Scraping ------------------------------------------------ #

def open_url(url, headless=False):
    """Start a Chrome WebDriver session and load *url*.

    Args:
        url: Address to open once the browser has started.
        headless: When True, pass ``--headless`` to Chrome so no window is
            shown. Defaults to False (visible browser), matching the
            previous behavior.

    Returns:
        The WebDriver instance, or ``None`` if the browser could not be
        started. If the browser started but loading *url* failed, the
        driver is still returned so the caller can quit it.
    """
    # Bind the name up front: if webdriver.Chrome() raises, the function must
    # still be able to return instead of failing with UnboundLocalError.
    driver = None
    try:
        chrome_options = Options()
        if headless:
            chrome_options.add_argument('--headless')
        driver = webdriver.Chrome(options=chrome_options)
        driver.get(url)
        print("URL opened successfully")
    except Exception as e:
        print("Error occurred when initializing Webdriver:", e)
    return driver

def maximize_window(driver):
    """Ask the driver to maximize its browser window.

    Any exception raised by the driver is printed rather than propagated.
    """
    try:
        driver.maximize_window()
    except Exception as err:
        print("Error occurred when maximizing the window:", err)
    else:
        print("Window maximized")

def scrolling(driver):
    """Send a single PAGE_DOWN key press to the page body, then wait a second.

    Any exception raised along the way is printed rather than propagated.
    """
    try:
        page_body = driver.find_element(By.TAG_NAME, "body")
        page_body.send_keys(Keys.PAGE_DOWN)
        time.sleep(1)
    except Exception as err:
        print("Error occurred when scrolling the page:", err)
    else:
        print("Page scrolled")

def press_viewall(driver):
    """Follow the second 'View All' link found on the current page.

    The link's href is read and opened with ``driver.get`` instead of
    clicking the element. Failures are printed, and the same driver object
    is returned in every case.
    """
    view_all_xpath = "//a[@class='OfferSection__ViewAllText-sc-16xojcc-1 eVcjqm']"
    try:
        anchors = driver.find_elements(By.XPATH, view_all_xpath)
        # Index 1: the second matching anchor is the one that is followed.
        target_url = anchors[1].get_attribute('href')
        time.sleep(2)
        driver.get(target_url)  # Navigate to rtc-directory
    except Exception as err:
        print("Error occurred when selecting 'View All' button:", err)
    else:
        print("View all button pressed")
    return driver

def fetch_state_names(driver):
    """Collect the text and href of every state anchor on the current page.

    Returns:
        A ``(links, names)`` tuple of parallel lists. If an error occurs
        part-way through, whatever was gathered so far is returned and the
        error is printed.
    """
    names = []
    links = []
    try:
        anchors = driver.find_elements(By.XPATH, "//div[@class='D113_ul_rtc']/ul/li/a")
        for anchor in anchors:
            names.append(anchor.text)
            links.append(anchor.get_attribute('href'))
        print(f"State names fetched: {len(links)}")
    except Exception as err:
        print("Error when fetching state names:", err)
    return links, names

def _append_routes_on_page(driver, state, route_name_link, route_link, route_num):
    """Append every route anchor on the currently loaded page to the result lists."""
    for anchor in driver.find_elements(By.XPATH, "//div[@class='route_link']/div/a"):
        # Read both attributes before appending so the three lists can never
        # get out of step if an attribute lookup fails.
        href = anchor.get_attribute('href')
        title = anchor.get_attribute('title')
        route_no = len(route_num) + 1  # route numbers are 1-based and sequential
        route_link.append(href)
        route_name_link.append((route_no, state, title, href))
        route_num.append(route_no)


def _open_results_page(driver, wait, page):
    """Click the pagination tab labelled *page* and wait until it is the active tab."""
    pagination_container = wait.until(EC.presence_of_element_located((By.XPATH, "//div[@id='root']/div/div[4]/div[12]")))
    page_tab = pagination_container.find_element(By.XPATH, f'//div[contains(@class,"DC_117_pageTabs") and text()="{page}"]')
    driver.execute_script("arguments[0].scrollIntoView();", page_tab)
    page_tab.click()
    wait.until(EC.text_to_be_present_in_element((By.XPATH, "//div[@class='DC_117_pageTabs DC_117_pageActive']"), str(page)))
    time.sleep(1)


def route_name_ref(driver, state_links, state_name, max_states=11):
    """Fetch route names and links for each state page.

    Each state link is opened in turn. States whose page shows no pagination
    tabs are skipped and do not count towards *max_states*. For the others,
    the routes on the initially loaded page are read first, then every
    further pagination tab (2..N) is clicked and read. Each page is read
    exactly once.

    Args:
        driver: Selenium WebDriver used for navigation.
        state_links: URLs of the state pages.
        state_name: State names, parallel to *state_links*.
        max_states: Maximum number of paginated states to scrape; ``None``
            removes the limit. Defaults to 11.

    Returns:
        A tuple ``(route_name_link, route_link, route_num)`` where
        ``route_name_link`` holds ``(route_no, state, title, href)`` tuples,
        ``route_link`` the hrefs and ``route_num`` the 1-based route numbers.
        All three lists are parallel.
    """
    route_name_link = []
    route_link = []
    route_num = []
    states_scraped = 0
    try:
        for link, state in zip(state_links, state_name):
            if max_states is not None and states_scraped >= max_states:
                break
            driver.get(link)
            time.sleep(2)

            page_tabs = driver.find_elements(By.XPATH, "//div[@class='DC_117_paginationTable']/div")
            print(f'{link} - page count - {len(page_tabs)}')
            if not page_tabs:
                continue
            states_scraped += 1
            wait = WebDriverWait(driver, 10)

            # Page 1 is already displayed after driver.get(); read it as-is.
            try:
                _append_routes_on_page(driver, state, route_name_link, route_link, route_num)
            except Exception as e:
                print(f"Error occurred when reading page 1 of {link}:", e)

            # Start at tab 2: clicking tab 1 again would re-read the page
            # above and duplicate its routes.
            for page in range(2, len(page_tabs) + 1):
                try:
                    _open_results_page(driver, wait, page)
                    _append_routes_on_page(driver, state, route_name_link, route_link, route_num)
                except Exception as e:
                    # Best effort: report the page that failed and move on.
                    print(f"Error occurred when reading page {page} of {link}:", e)
        print(f"Route names and links fetched: {len(route_name_link)}, Length of route link: {len(route_link)}")
        print("Total route_no", route_num)
    except Exception as e:
        print("Error occurred when scraping bus route and link:", e)
    return route_name_link, route_link, route_num

def _expand_bus_groups(driver):
    """Click every ``div.button`` element on the page, last one first.

    NOTE(review): the original code named the awaited element
    "goverment_buses", so these buttons presumably expand grouped
    government-operator listings - confirm against the live page.
    """
    wait = WebDriverWait(driver, 10)
    try:
        wait.until(EC.visibility_of_element_located((By.XPATH, "//div[@class='button']")))
        buttons = driver.find_elements(By.XPATH, "//div[@class='button']")
        for button in reversed(buttons):
            button.click()
            time.sleep(3)
    except Exception as e:
        # Best effort: a page without such buttons ends up here via the wait
        # timing out; report it instead of hiding it.
        print("Could not expand grouped bus listings:", e)


def _scroll_until_stable(driver):
    """Press END repeatedly until the page source stops changing."""
    previous_source = ""
    while True:
        driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.END)
        time.sleep(1)
        current_source = driver.page_source
        if current_source == previous_source:
            break
        previous_source = current_source
        time.sleep(1)  # give newly requested rows time to load before the next END


def _parse_bus_row(bus_row, route_no):
    """Build one bus tuple from a result row; raises if any field is missing."""
    bus_name = bus_row.find_element(By.CSS_SELECTOR, "div.travels.lh-24.f-bold.d-color").text
    bus_type = bus_row.find_element(By.CSS_SELECTOR, "div.bus-type.f-12.m-top-16.l-color.evBus").text
    departing_time = bus_row.find_element(By.CSS_SELECTOR, "div.dp-time.f-19.d-color.f-bold").text
    duration = bus_row.find_element(By.CSS_SELECTOR, "div.dur.l-color.lh-24").text
    reaching_time = bus_row.find_element(By.CSS_SELECTOR, "div.bp-time.f-19.d-color.disp-Inline").text
    star_rating = bus_row.find_element(By.CSS_SELECTOR, "div.rating-sec span").text
    price = bus_row.find_element(By.CSS_SELECTOR, "span.f-19.f-bold").text
    total_seat_availability = bus_row.find_element(By.CSS_SELECTOR, ".seat-left").text
    # Keep only the first whitespace-separated token of the seat text.
    seat_availability = total_seat_availability.split()[0]
    return (route_no, bus_name, bus_type, departing_time, duration, reaching_time, star_rating, price, seat_availability)


def fetch_bus_datas(driver, route_link, no_route):
    """Fetch bus details from each route page.

    Args:
        driver: Selenium WebDriver used for navigation.
        route_link: URLs of the route pages to visit.
        no_route: Route numbers, parallel to *route_link*. Each scraped bus
            is tagged with the number belonging to its route.

    Returns:
        A list of tuples ``(route_no, bus_name, bus_type, departing_time,
        duration, reaching_time, star_rating, price, seat_availability)``.
        Rows missing any of these fields are skipped; the number skipped is
        printed per route.
    """
    bus_datas = []
    if len(route_link) != len(no_route):
        # zip() below stops at the shorter list; say so rather than failing
        # with IndexError after the pages have already been scraped.
        print(f"Warning: {len(route_link)} route links but {len(no_route)} route numbers; extra entries are ignored")

    for bus_no, (link, route_no) in enumerate(zip(route_link, no_route)):
        driver.get(link)
        print(f"Fetching data from route: {link}")
        time.sleep(1)

        _expand_bus_groups(driver)
        _scroll_until_stable(driver)

        # Look the rows up once, after scrolling has finished.
        total_bus = driver.find_elements(By.XPATH, "//div[@class='clearfix row-one']")
        skipped = 0
        for bus_row in total_bus:
            try:
                bus_datas.append(_parse_bus_row(bus_row, route_no))
            except Exception:
                skipped += 1
        if skipped:
            print(f"Skipped {skipped} bus rows with missing fields")

        print(f"Bus no: {bus_no} == Route no: {route_no}")
        print(f"Total bus data entries fetched: {len(bus_datas)}")
    return bus_datas

def quit_driver(driver):
    """Quit the WebDriver, reporting (not raising) any failure.

    This runs right before the scraped data is processed and saved, so a
    failure here (for example a session that is already gone, or a ``None``
    driver) is printed rather than allowed to abort the script.
    """
    try:
        driver.quit()
    except Exception as e:
        print("Error occurred when quitting the WebDriver:", e)

# Main Execution for Web Scraping
url = "https://www.redbus.in/"

driver = open_url(url)
try:
    maximize_window(driver)
    scrolling(driver)
    driver = press_viewall(driver)

    link_states, state_name = fetch_state_names(driver)
    name_link_state, route_ref, route_number = route_name_ref(driver, link_states, state_name)
    bus_details = fetch_bus_datas(driver, route_ref, route_number)
finally:
    # Always release the browser, even when a scraping step raises;
    # otherwise the Chrome process is left running.
    quit_driver(driver)

# ------------------------------------------------- Data Processing ------------------------------------------------- #

# Build the bus table from the scraped tuples
bus_data = pd.DataFrame(
    bus_details,
    columns=[
        'bus_no',
        'bus_name',
        'bus_type',
        'departing_time',
        'duration',
        'reaching_time',
        'star_rating',
        'price',
        'seat_availability',
    ],
)

# Coerce ratings to numbers; anything that cannot be parsed becomes 0
numeric_rating = pd.to_numeric(bus_data['star_rating'], errors='coerce')
bus_data['star_rating'] = numeric_rating.fillna(0)

# Build the route table from the scraped tuples
normal_route_data = pd.DataFrame(
    name_link_state,
    columns=['route_no', 'state_name', 'route_name', 'route_ref'],
)
print("Length of route_data:", len(normal_route_data))

# Keep a handle on the bus_no column (no check is performed here)
unique_bus_no = bus_data['bus_no']

# Write both tables to CSV without the index column: buses first, then routes
bus_data.to_csv('bus_data.csv', index=False)
normal_route_data.to_csv('route_data.csv', index=False)