### Step 1: Importing required libraries

In [1]:
import os # to create the output folders when they do not exist yet
import time # to control waiting times during execution

import pandas as pd # to handle and analyze data
from bs4 import BeautifulSoup # to pull the content of HTML file into a format that can be parsed
from selenium import webdriver # to interact with the web browser
from selenium.common.exceptions import TimeoutException # raised by WebDriverWait when a condition is not met in time
from selenium.webdriver.chrome.options import Options # to configure browser options
from selenium.webdriver.chrome.service import Service # to point Selenium at the local chromedriver executable
from selenium.webdriver.common.by import By # to specify element-finding strategies
from selenium.webdriver.common.keys import Keys # to mimic physical keyboard actions
from selenium.webdriver.support import expected_conditions as EC # to specify condition while waiting
from selenium.webdriver.support.ui import WebDriverWait # to make Selenium wait until certain conditions (like element visibility) are met

### Step 2: Setting up the webpage using chrome driver for further operations

In [20]:
# Start a Chrome session driven by the locally installed chromedriver binary
driver_path = r"C:\Users\Dell\Desktop\chromedriver129\chromedriver.exe" # specify local path for chrome driver
chrome_options = Options() # Configure Chrome options
# Selenium 4 receives the driver binary through a Service object; the old
# `executable_path` keyword of webdriver.Chrome was deprecated in 4.0 and removed in 4.10
driver = webdriver.Chrome(service=Service(executable_path=driver_path), options=chrome_options)

# Loading the webpage (no stray whitespace in the address)
url = "https://enquiry.indianrail.gov.in/mntes/"
driver.get(url) # Opens the specified url

# Find 'More Info' in the webpage and click it.
# Waiting (up to 30 seconds) for the element to be clickable, not merely present in the DOM,
# avoids clicking an element that is still hidden or disabled.
link1 = WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="sidebar"]/form/ul/li[6]')))
link1.click()
time.sleep(5) # Wait for 5 seconds before continuing execution

# Find 'Average Delay' in the webpage and click it (same 30 second clickable wait)
link2 = WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="pageSubmenu2"]/li[3]')))
link2.click()

### Step 3: Looping through the list of train numbers to scrape raw data

In [None]:
# Load the csv file containing train numbers
Train_df = pd.read_csv('Train number.csv', dtype={'Train No': 'str'}) # dtype={'Train No': 'str'} ensures that train numbers starting with 0 do not lose the starting digit
Train_list = Train_df['Train No'].to_list() # extract train numbers into a list

# DataFrame.to_csv does not create missing folders, so make sure the target folder exists
os.makedirs('Final Raw Data', exist_ok=True)

# Loop through the train list, scrape data for each train number and save it as raw data
for train_no in Train_list:
    try:
        # find the input field for train numbers (wait up to 30 seconds), clear the existing input and type the new train number
        input_field = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, '//*[@id="trainNo"]')))
        input_field.send_keys(Keys.CONTROL + "a") # mimic keyboard action (ctrl + a) to select all content in the input field
        input_field.send_keys(Keys.BACKSPACE) # mimic backspace key to clear input field
        input_field.send_keys(f'{train_no}') # send new train number
        time.sleep(2) # wait for 2 seconds before continuing execution

        # find 'Go' in the webpage and click it; wait until it is actually clickable, not just present in the DOM
        link3 = WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="content"]/form[2]/table/tbody/tr/td[3]/span/input')))
        link3.click()
        time.sleep(5) # wait for 5 seconds before continuing execution

        # parse the HTML of the page currently loaded in the browser
        soup = BeautifulSoup(driver.page_source, "html.parser")

        # collect the text of every <td> cell, one list per <tr> row (rows without <td> cells give an empty list)
        table_data = [
            [cell.get_text(strip=True) for cell in row.find_all("td")]
            for row in soup.select("tr")
        ]

        # build the dataframe for this train; pandas raises ValueError when the widest
        # scraped row does not have exactly five cells, which the handler below treats as "no delay data"
        data = pd.DataFrame(table_data, columns=['sr_no', 'station','station_code','avg_arr_delay','avg_dep_delay'])

        # save the raw data for this train in the folder "Final Raw Data" as "<train number>.csv"
        data.to_csv(f'Final Raw Data/{train_no}.csv', index=False)

    except ValueError: # handling 'ValueError' due to absence of data for some train numbers
        print(f'No delay data available for Train No. {train_no}') # printing the train numbers that do not have delay data available
    except TimeoutException: # a WebDriverWait above expired; stop here so the next cell can retry the missing trains
        print('Timed out: Continue to next code cell')
        break

### Step 4: Cleaning raw data and re-scraping missing data

Creating list of train numbers for which
1. Data could not be scraped due to network issues
2. No data is available

Run the code below till network_error_scrap_again is empty.

In [None]:
# Initialize lists to store train numbers with issues
network_error_scrap_again = []  # train numbers whose raw file exists but holds no usable data
no_delay_data = []  # train numbers for which delay data is not available

MAX_RETRY_PASSES = 3  # upper bound on re-scrape passes so a persistent failure cannot loop forever

# DataFrame.to_csv does not create missing folders, so make sure both folders exist
os.makedirs('Final Raw Data', exist_ok=True)
os.makedirs('Cleaned Data', exist_ok=True)


def clean_raw_file(train_no):
    """Clean the raw CSV of one train and save it as 'Cleaned Data/<train_no>_cleaned.csv'.

    Raises:
        FileNotFoundError: the raw CSV for this train does not exist.
        KeyError: the raw CSV has no row with index 2 (the row the run days and train type are read from).
    """
    data = pd.read_csv(f'Final Raw Data/{train_no}.csv')
    data['train_no'] = train_no  # keep the train number next to every station row

    # Row 2 carries "<label>:<value>" texts; the value parts are the days of run and the train type
    data['days'] = data['sr_no'][2].split(':')[1].strip()
    data['type'] = data['station'][2].split(':')[1].strip()

    # Drop the first 4 rows and the last row, replace 'On Time' with '00:00', fill NaN with 0
    data = data.drop(index=data.index[:4]).iloc[:-1].reset_index(drop=True)
    data[['avg_arr_delay', 'avg_dep_delay']] = data[['avg_arr_delay', 'avg_dep_delay']].replace('On Time', '00:00')
    data = data.fillna(0)

    # The index is written on purpose: the merge step later drops the first two columns by position
    data.to_csv(f'Cleaned Data/{train_no}_cleaned.csv')


def scrape_raw_file(train_no):
    """Query the site for one train and save the scraped table rows as 'Final Raw Data/<train_no>.csv'.

    Raises:
        ValueError: the widest scraped row does not have exactly five cells, so pandas
            rejects the column list (treated by the caller as "no delay data").
    """
    # Wait for the input field, clear it with ctrl+a / backspace, and type the train number
    input_field = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, '//*[@id="trainNo"]')))
    input_field.send_keys(Keys.CONTROL + "a")
    input_field.send_keys(Keys.BACKSPACE)
    input_field.send_keys(f'{train_no}')
    time.sleep(2)

    # Click the button to submit the form
    link3 = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, '//*[@id="content"]/form[2]/table/tbody/tr/td[3]/span/input')))
    link3.click()
    time.sleep(5)

    # Parse the page source and collect the text of every <td> cell, one list per <tr> row
    soup = BeautifulSoup(driver.page_source, "html.parser")
    table_data = [
        [cell.get_text(strip=True) for cell in row.find_all("td")]
        for row in soup.select("tr")
    ]

    data = pd.DataFrame(table_data, columns=["sr_no", "station", 'station_code', 'avg_arr_delay', 'avg_dep_delay'])
    data.to_csv(f'Final Raw Data/{train_no}.csv', index=False)


# First pass: loop through list of train numbers, clean the data, and save as CSV
for i in Train_list:
    try:
        clean_raw_file(i)
    except FileNotFoundError:  # handle cases having no delay data and thus no raw data file
        no_delay_data.append(i)
        print(f'No delay data available for Train No. {i}')
    except KeyError:  # handle cases having blank raw data csv file
        network_error_scrap_again.append(i)
        print(f'Data not scrapped for Train No. {i}')

# Retry scraping for network errors, for at most MAX_RETRY_PASSES passes
retry_pass = 0
while network_error_scrap_again and retry_pass < MAX_RETRY_PASSES:
    retry_pass += 1
    # Iterate over a snapshot: removing from a list while looping over it skips entries
    for i in list(network_error_scrap_again):
        try:
            scrape_raw_file(i)
            # Clean right away so the cleaned file exists for the merge step;
            # if the page was blank again this raises and the train stays queued
            clean_raw_file(i)

            # Remove the processed train number from the list if successful
            network_error_scrap_again.remove(i)
            print(f'Successfully processed Train No. {i}')

        except ValueError:
            print(f'No delay data available for Train No. {i}')
            network_error_scrap_again.remove(i)
            no_delay_data.append(i)  # record it so later steps do not look for its cleaned file
        except Exception as e:
            print(f'An error occurred while processing Train No. {i}: {e}')

print("Train numbers not scrapped due to network error:", network_error_scrap_again)
print("No delay data train numbers:", no_delay_data)

### Step 5: Preparing data for generating recommendation chart

In [12]:
# Remove train numbers with missing delay data.
# Filtering in list order (instead of taking a set difference) keeps the merge order
# reproducible between kernel sessions; dict.fromkeys drops duplicate train numbers,
# as the set did, while preserving their first-seen order.
_trains_without_data = set(no_delay_data)
Train_list = [train_no for train_no in dict.fromkeys(Train_list) if train_no not in _trains_without_data]

# Load and merge CSV files (train_no is read as text so leading zeros are kept)
dataframes = [pd.read_csv(f'Cleaned Data/{i}_cleaned.csv', dtype={'train_no': 'str'}) for i in Train_list]
# .iloc[:, 2:] discards the first two columns of the cleaned files
# (the saved index and sr_no, given how the cleaning step writes them)
merged_df = pd.concat(dataframes, ignore_index=True).iloc[:, 2:]

# Convert delay time from HH:MM to minutes
def convert_time(time_str):
    """Convert a delay written as 'HH:MM' into a total number of minutes.

    Args:
        time_str: delay text such as '01:30'.

    Returns:
        Hours * 60 + minutes as an int, or None when the value is not a string
        or does not consist of exactly two colon-separated integers.
    """
    try:
        # split on ':' and convert both parts to int; anything but two integer parts raises
        hours, minutes = map(int, time_str.split(':'))
    except (AttributeError, IndexError, ValueError):
        return None
    return hours * 60 + minutes

# Fill missing values with 0, then express the arrival delay in minutes.
# Entries convert_time cannot parse come back as None and are zeroed by the second fillna.
filled_df = merged_df.fillna(0)
arrival_minutes = filled_df['avg_arr_delay'].apply(convert_time)
data = filled_df.assign(avg_arr_delay=arrival_minutes).fillna(0)

# Discard the columns sitting at positions 1 and 3
unused_columns = data.columns[[1, 3]]
data = data.drop(columns=unused_columns)

# Save processed data to CSV
data.to_csv('data_for_input_command.csv', index=False)

### Step 6: Generating recommendations based on the choice of destination and day of journey

In [None]:
# Get destination input (surrounding whitespace removed, like the day input below) and load the prepared data
destination = input('Enter destination (city name or station name (when city and station names are different: )').strip().upper()
data = pd.read_csv('data_for_input_command.csv', dtype={'train_no': 'str'})

# Keep the stations whose name contains the typed text.
# regex=False matches the text literally, so characters such as '(' or '.' in the input
# cannot raise a regex error or match unintended stations; na=False treats NaN as "no match".
station_matches = data['station'].str.contains(destination, case=False, na=False, regex=False)
output = data[station_matches].sort_values(by='avg_arr_delay')[['train_no', 'station', 'avg_arr_delay', 'days', 'type']]

# Ask for day input
day_input = input('Enter the day of the week: ').strip().lower()

# Use only the first three characters for matching
day_input_short = day_input[:3]

# Match input day with the days of run (a train running "daily" matches any day);
# entering "daily" skips the day filter altogether
if day_input != "daily":
    runs_on_day = output['days'].str.contains(day_input_short, case=False, na=False, regex=False)
    runs_daily = output['days'].str.contains("daily", case=False, na=False)  # na=False keeps the mask purely boolean
    output = output[runs_on_day | runs_daily]

# Rename columns and reset index to start from 1, excluding the 'days' column
output = output[['train_no', 'station', 'avg_arr_delay', 'type']].rename(columns={
    'train_no': 'Train Number',
    'station': 'Station',
    'avg_arr_delay': 'Delay Time (in Minutes)', 
    'type': 'Type'
})
output.reset_index(drop=True, inplace=True)
output.index += 1  # Start index from 1

print(f"Recommendation chart for Journey to {destination} on {day_input.capitalize()}")
output