# In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service as EdgeService  # Import EdgeService
from bs4 import BeautifulSoup
import pandas as pd  # for storing data in a structured way (optional)

def extract_station_data_from_page(driver):
    """Extract station rows from the table on the driver's current page.

    Parses ``driver.page_source`` with BeautifulSoup, finds the table whose
    CSS class is ``stn-dir-list-tbl`` and turns each data row into a dict
    keyed by the header cell texts of the first row.

    Args:
        driver: Object exposing the current page's HTML as ``page_source``
            (a Selenium WebDriver in this script).

    Returns:
        A list with one dict per row that contains ``<td>`` cells. The list
        is empty when the table is missing or contains no rows.
    """
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    table = soup.find('table', class_='stn-dir-list-tbl')

    if table is None:
        print("Table not found on this page.")
        return []

    rows = table.find_all('tr')
    if not rows:
        # A table without any <tr> has neither headers nor data; indexing
        # rows[0] would raise IndexError.
        return []

    # The first row is treated as the header row; its <th> texts become keys.
    headers = [cell.text.strip() for cell in rows[0].find_all('th')]

    station_data = []
    for row in rows[1:]:
        cells = row.find_all('td')
        if not cells:
            continue  # Skip rows without data cells.

        row_data = {}
        for i, cell in enumerate(cells):
            # Rows may carry more cells than there are headers; fall back to
            # a positional key instead of failing on headers[i].
            key = headers[i] if i < len(headers) else f"column_{i + 1}"

            # The second column holds the station name, usually wrapped in a
            # link; prefer the link text when the link is present.
            link = cell.find('a') if i == 1 else None
            source = link if link is not None else cell
            row_data[key] = source.text.strip()

        station_data.append(row_data)

    return station_data

def extract_all_pages_data(driver, letter):
    """Collect station data from every page by clicking the "Next →" link.

    Starting from the page the driver is currently showing, extracts the
    table rows, then clicks the link whose text is "Next →" and repeats until
    a page yields no rows, the link carries a ``disabled`` class, or the link
    cannot be found or clicked.

    Args:
        driver: Selenium WebDriver already positioned on the first page.
        letter: Letter being processed; used only in progress messages.

    Returns:
        A list of row dicts accumulated over all visited pages.
    """
    # Imported locally so this function does not depend on imports made
    # elsewhere in the file.
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    all_data = []
    page_num = 1  # Start at page 1

    while True:
        print(f"Extracting data from page {page_num} for letter '{letter}'")
        page_data = extract_station_data_from_page(driver)
        if not page_data:
            print(f"No data found on page {page_num} for letter '{letter}'. Stopping.")
            break

        all_data.extend(page_data)

        # Best-effort pagination: any failure to find or click the link is
        # treated as having reached the last page.
        try:
            next_button = driver.find_element(By.LINK_TEXT, "Next →")

            # get_attribute may return None when the element has no class
            # attribute; without the fallback the `in` test raises TypeError
            # and pagination stops after the first page.
            css_classes = next_button.get_attribute('class') or ''
            if 'disabled' in css_classes:
                print("Next button is disabled, no more pages")
                break

            next_button.click()

            # implicitly_wait() only configures element-lookup timeouts and
            # does not pause, so wait explicitly until the clicked link is
            # detached, which signals that the page content was replaced.
            try:
                WebDriverWait(driver, 5).until(EC.staleness_of(next_button))
            except TimeoutException:
                # The link was not replaced within 5 seconds; the page may
                # have updated in place, so carry on with what is loaded.
                pass

            page_num += 1
        except Exception as e:
            print(f"Next button not found or could not be clicked. Assuming last page reached. {e}")
            break

    return all_data


# def extract_station_data_for_letter(url, letter):
#     """Extracts all station data for a given letter, handling multiple pages."""

#     driver = None  # Initialize driver outside the try block
#     try:
#         # 1. Set up Selenium Edge WebDriver
#         driver = webdriver.Edge()
#         driver.get(url)

#         # 2. Click on the specified letter
#         letter_element = driver.find_element(By.XPATH, f'//a[@class="StationName" and @data-station-name="{letter}"]')
#         letter_element.click()
#         driver.implicitly_wait(5)

#         # 3. Extract data from all pages
#         all_data = extract_all_pages_data(driver, letter)  # Modified function call
#         return all_data

#     except Exception as e:
#         print(f"An error occurred: {e}")
#         return []

#     finally:
#         # 5. Close the WebDriver
#         if driver:  # Check if the driver was initialized
#             driver.quit()
# Revised version: request each page directly by putting the letter and page number in the URL.
def extract_station_data_for_letter(url, letter):
    """Extract all station data for a given letter, one page URL at a time.

    Opens an Edge WebDriver and requests ``{url}?name={letter}&page={n}`` for
    n = 1, 2, ... until a page yields no rows or repeats the previous page.
    The driver is always quit before returning.

    Args:
        url: Base URL of the station listing, without a query string.
        letter: Value passed as the ``name`` query parameter.

    Returns:
        A list of row dicts from all pages read. If an error occurs part-way
        through, the error is printed and the rows collected so far are
        returned (an empty list if nothing was collected).
    """
    driver = None  # Initialized before the try block so `finally` can test it
    all_data = []
    try:
        # 1. Set up Selenium Edge WebDriver
        driver = webdriver.Edge()
        # The implicit wait is a session-wide setting, so set it once rather
        # than on every loop iteration.
        driver.implicitly_wait(5)

        # 2. Extract data from all pages
        previous_page = None
        page_num = 1
        while True:
            # Construct the URL with the letter and page number
            page_url = f"{url}?name={letter}&page={page_num}"
            print(f"Accessing URL: {page_url}")
            driver.get(page_url)

            # Extract data from the current page
            page_data = extract_station_data_from_page(driver)
            if not page_data:
                print(f"No data found on page {page_num} for letter '{letter}'. Stopping.")
                break

            # Guard against sites that serve the same page again for
            # out-of-range page numbers, which would loop forever.
            if page_data == previous_page:
                print(f"Page {page_num} for letter '{letter}' repeats the previous page. Stopping.")
                break

            all_data.extend(page_data)
            previous_page = page_data
            page_num += 1

    except Exception as e:
        # Top-level boundary for this letter: report the error and keep the
        # pages already collected instead of discarding them.
        print(f"An error occurred: {e}")

    finally:
        # 3. Close the WebDriver
        if driver is not None:
            driver.quit()

    return all_data
# Example usage: scrape the listing for every letter and write one combined CSV.
url = "https://www.railyatri.in/stations"  # Base listing URL; change to target another page
letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"  # Letters to scrape, in order

all_data = {}  # letter -> list of row dicts

for letter in letters:
    data = extract_station_data_for_letter(url, letter)

    if not data:
        # Keep an (empty) entry so every letter is represented.
        print(f"No data found for letter '{letter}'.")
        all_data[letter] = []
        continue

    print(f"Found {len(data)} stations starting with {letter}")
    all_data[letter] = data

# Build one DataFrame per letter.
all_dfs = {key: pd.DataFrame(rows) for key, rows in all_data.items()}

# Stack the per-letter frames into a single frame with a fresh index.
combined_df = pd.concat(list(all_dfs.values()), ignore_index=True)

# Persist the combined result.
combined_df.to_csv("all_stations.csv", index=False)
print("Data saved to all_stations.csv")



# In [None]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException

# --- Configuration ---

PAGE_URL = "https://www.ixigo.com/hotels"
DESTINATION_CITY = "Lucknow"

# --- WebDriver Setup ---

# NOTE(review): the driver is not quit anywhere in this cell, so the browser
# stays open after the script finishes — confirm whether that is intended
# before adding a driver.quit().
driver = webdriver.Edge()
driver.maximize_window()


try:
    print(f"Navigating to {PAGE_URL}...")
    driver.get(PAGE_URL)

    # Shared explicit wait: up to 10 seconds per condition.
    wait = WebDriverWait(driver, 10)

    # --- 1. Handle Destination Input ---
    print("Locating and updating Destination input...")

    destination_input_locator = (By.XPATH, "//p[text()='Destination']/following-sibling::input")

    # The input can be re-rendered while it is being used, so retry on
    # StaleElementReferenceException, locating the element afresh each time.
    last_stale_error = None
    for attempt in range(3):  # Try up to 3 times
        try:
            destination_input = wait.until(EC.element_to_be_clickable(destination_input_locator))

            # Click the field first so it has focus before typing.
            destination_input.click()
            print("Clicked on the Destination input field.")

            destination_input.clear()
            destination_input.send_keys(DESTINATION_CITY)
            print(f"Entered '{DESTINATION_CITY}' into Destination input.")
            break  # If successful, exit the loop
        except StaleElementReferenceException as stale_error:
            last_stale_error = stale_error
            print(f"StaleElementReferenceException occurred (attempt {attempt + 1}). Retrying...")
            time.sleep(1)  # Wait a bit before retrying
    else:  # The loop finished without a 'break': every attempt went stale
        # A bare `raise` here has no active exception to re-raise, so raise
        # an explicit error chained to the last stale-element failure.
        raise RuntimeError(
            "Failed to locate and update Destination input after multiple attempts."
        ) from last_stale_error

    # --- 2. Select Destination From Dropdown ---
    try:
        # The suggestion item is a div acting as a button whose data-testid
        # equals the city name.
        destination_option_locator = (By.XPATH, f"//div[@data-testid='{DESTINATION_CITY}' and @role='button']")
        destination_option = wait.until(EC.element_to_be_clickable(destination_option_locator))
        destination_option.click()
    except TimeoutException:
        print(f"Error: Timed out waiting for the '{DESTINATION_CITY}' dropdown item to be clickable.")
        raise
    except Exception as e:
        print(f"Error occurred while selecting '{DESTINATION_CITY}' from dropdown: {e}")
        raise

    # --- 3. Handle Search Button ---
    print("Locating Search button...")
    search_button_locator = (By.CSS_SELECTOR, 'button[data-testid="search-hotels"]')
    search_button = wait.until(EC.element_to_be_clickable(search_button_locator))
    print("Clicking Search button...")
    search_button.click()
    print("Search initiated successfully!")

    # Give the results page time to load before reading the URL.
    time.sleep(5)

    new_url = driver.current_url
    print(f"New URL: {new_url}")

except TimeoutException:
    print("Error: Timed out waiting for one or more elements to load or become interactive.")

except Exception as e:
    print(f"An error occurred: {e}")