In [5]:
import pandas as pd

# Load the railway dataset from the Excel workbook into a DataFrame.
# NOTE(review): the file name carries both ".csv" and ".xlsx" suffixes; it is
# read as an Excel workbook here - confirm that is the actual format.
df = pd.read_excel("Railway_info.csv.xlsx")

# Display the first 10 rows
print("First 10 rows of the dataset:")
print(df.head(10))

# Check data structure and missing values
print("\nData types and missing values:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print("\nMissing value counts:")
print(df.isnull().sum())


First 10 rows of the dataset:
   Train_No    Train_Name           Source_Station_Name  \
0       107  SWV-MAO-VLNK               SAWANTWADI ROAD   
1       108  VLNK-MAO-SWV                   MADGOAN JN.   
2       128  MAO-KOP SPEC                   MADGOAN JN.   
3       290  PALACE ON WH             DELHI-SAFDAR JANG   
4       401  BSB BHARATDA                    AURANGABAD   
5       421  LKO-SVDK FTR                   LUCKNOW JN.   
6       422  SVDK-LKO FTR  SHRI MATA VAISHNO DEVI KATRA   
7       477  FTR TRAIN NO                         SIRSA   
8       502  RJPB-UMB FTR        RAJENDRANAGAR TERMINAL   
9       504  PNBE-BTI FTR                     PATNA JN.   

             Destination_Station_Name       days  
0                         MADGOAN JN.   Saturday  
1                     SAWANTWADI ROAD     Friday  
2  CHHATRAPATI SHAHU MAHARAJ TERMINUS     Friday  
3                   DELHI-SAFDAR JANG  Wednesday  
4                        VARANASI JN.   Saturday  
5        SHRI 

In [6]:
# Row count of the frame, reported below as the number of trains
num_trains = len(df)

# Count of distinct station names: source column first, then destination
unique_sources, unique_destinations = (
    df[column].nunique()
    for column in ('Source_Station_Name', 'Destination_Station_Name')
)

# Most frequent station name per column. mode() can return several values
# when there is a tie; index 0 keeps the first one.
most_common_source, most_common_destination = (
    df[column].mode()[0]
    for column in ('Source_Station_Name', 'Destination_Station_Name')
)

# Emit the five summary lines, one per line
print(
    f"Total trains: {num_trains}",
    f"Unique source stations: {unique_sources}",
    f"Unique destination stations: {unique_destinations}",
    f"Most common source station: {most_common_source}",
    f"Most common destination station: {most_common_destination}",
    sep="\n",
)


Total trains: 11113
Unique source stations: 921
Unique destination stations: 924
Most common source station: CST-MUMBAI
Most common destination station: CST-MUMBAI


In [13]:
# Check missing values again before cleaning
print("Missing values per column:")
print(df.isnull().sum())

# Step 1: drop rows that lack a source or destination station; `df` itself
# is left untouched and the result is kept under a new name.
df_cleaned = df.dropna(subset=['Source_Station_Name', 'Destination_Station_Name'])

# Step 2: fill the remaining text columns with a placeholder. The fill is
# restricted to the text columns on purpose: a blanket fillna("Unknown") would
# also write the string into the numeric Train_No column and turn it into a
# mixed-type column. A missing Train_No therefore stays NaN and remains
# visible to isnull() checks.
df_cleaned = df_cleaned.fillna({'Train_Name': "Unknown", 'days': "Unknown"})


Missing values per column:
Train_No                    0
Train_Name                  0
Source_Station_Name         0
Destination_Station_Name    0
days                        0
dtype: int64


In [14]:
# Upper-case both station-name columns of df_cleaned, one column at a time,
# assigning the result back onto the same frame.
for station_col in ('Source_Station_Name', 'Destination_Station_Name'):
    df_cleaned[station_col] = df_cleaned[station_col].str.upper()

# Show the first 10 rows of the two station columns after the change
print("\nSample cleaned station names:")
print(df_cleaned[['Source_Station_Name', 'Destination_Station_Name']].head(10))



Sample cleaned station names:
            Source_Station_Name            Destination_Station_Name
0               SAWANTWADI ROAD                         MADGOAN JN.
1                   MADGOAN JN.                     SAWANTWADI ROAD
2                   MADGOAN JN.  CHHATRAPATI SHAHU MAHARAJ TERMINUS
3             DELHI-SAFDAR JANG                   DELHI-SAFDAR JANG
4                    AURANGABAD                        VARANASI JN.
5                   LUCKNOW JN.        SHRI MATA VAISHNO DEVI KATRA
6  SHRI MATA VAISHNO DEVI KATRA                         LUCKNOW JN.
7                         SIRSA                               SIRSA
8        RAJENDRANAGAR TERMINAL                     AMBALA CANTT JN
9                     PATNA JN.                         BATHINDA JN
