### **1. Library Installation**

In [139]:
# One-time dependency installation. Uncomment and run these lines on a fresh
# environment. They are real comments rather than a string literal so the
# cell no longer echoes a meaningless string as its output.
# %pip install folium
# %pip install requests
# %pip install pandas

'\n%pip install folium\n%pip install requests\n%pip install pandas\n'

### **2. Library Importing**

In [140]:
import time

import folium
import pandas as pd
import requests

### **3. Data Preprocessing**

**3.1 General Exploring and Cleaning**

In [141]:
# Reading Metro Network '.csv' file into a pandas dataframe.
# The relative path uses forward slashes so it resolves on Windows, macOS and
# Linux alike (a backslash is only a path separator on Windows).
data = pd.read_csv("data/Delhi-Metro-Network.csv")

In [142]:
# Previewing the first rows of the raw dataframe (head() defaults to five rows)
data.head()

Unnamed: 0,Station ID,Station Name,Distance from Start (km),Line,Opening Date,Station Layout,Latitude,Longitude
0,1,Jhil Mil,10.3,Red line,2008-04-06,Elevated,28.67579,77.31239
1,2,Welcome [Conn: Red],46.8,Pink line,2018-10-31,Elevated,28.6718,77.27756
2,3,DLF Phase 3,10.0,Rapid Metro,2013-11-14,Elevated,28.4936,77.0935
3,4,Okhla NSIC,23.8,Magenta line,2017-12-25,Elevated,28.554483,77.264849
4,5,Dwarka Mor,10.2,Blue line,2005-12-30,Elevated,28.61932,77.03326


In [143]:
# Checking data dimensionality: `shape` is a (number of rows, number of columns) tuple
data.shape

(285, 8)

In [144]:
# Counting missing values per column (isna() is the same check as isnull())
null_counts = data.isna().sum()
print(null_counts)

Station ID                  0
Station Name                0
Distance from Start (km)    0
Line                        0
Opening Date                0
Station Layout              0
Latitude                    0
Longitude                   0
dtype: int64


In [145]:
# Exploring the columns of the dataframe (displayed as a pandas Index of column labels)
data.columns

Index(['Station ID', 'Station Name', 'Distance from Start (km)', 'Line',
       'Opening Date', 'Station Layout', 'Latitude', 'Longitude'],
      dtype='object')

In [146]:
# Dropping 'Station ID', because there is no need to manually index rows.
# errors='ignore' keeps this cell safe to re-run: once the column is gone a
# second execution is a no-op instead of raising KeyError.
data = data.drop(columns='Station ID', errors='ignore')

In [147]:
# Printing the frequency of every category in the categorical attributes
# 'Line' and 'Station Layout', each followed by a blank separator line.
columns = ['Line', 'Station Layout']
for column in columns:
    print(f"Value counts for {column}:", data[column].value_counts(), "", sep="\n")

Value counts for Line:
Line
Blue line            49
Pink line            38
Yellow line          37
Voilet line          34
Red line             29
Magenta line         25
Aqua line            21
Green line           21
Rapid Metro          11
Blue line branch      8
Orange line           6
Gray line             3
Green line branch     3
Name: count, dtype: int64

Value counts for Station Layout:
Station Layout
Elevated       214
Underground     68
At-Grade         3
Name: count, dtype: int64



In [148]:
# Normalising the 'Line' categories in two steps. The order matters: the branch
# names must be merged BEFORE the ' line' suffix is stripped, otherwise
# 'Green line branch' has already become 'Green branch' and the mapping below
# never matches.
data['Line'] = (
    data['Line']
    # Removing branch to keep data consistency ('Green line branch' & 'Green line' surely represent one category)
    .replace({
        'Green line branch': 'Green line',
        'Blue line branch': 'Blue line'
    })
    # Replacing 'line' to remove data redundancy (plain substring match, not a regex)
    .str.replace(' line', '', regex=False)
)
# NOTE(review): the source data spells one category 'Voilet'; it is left untouched
# here because later cells may key on that exact spelling - confirm before renaming.

In [149]:
# Renaming the 'At-Grade' layout category to the more self-explanatory 'Ground'
data['Station Layout'] = data['Station Layout'].replace('At-Grade', 'Ground')

**3.2 Cleaning Column 'Station Name'** 

There are duplicate combinations of 'Latitude' and 'Longitude' that carry different 'Station Name' values. I treated these as inconsistent data, since two different stations cannot share one reference point, so I used the OpenStreetMap API to get more information about each coordinate.

In [151]:
# Function for making an API call to Open Street Map and returning information retrieved from the web
# This function is time consuming for larger datasets since it makes one API call per invocation
def get_street_data(latitude, longitude):
    """Reverse-geocode a coordinate pair with the OpenStreetMap Nominatim API.

    Parameters
    ----------
    latitude, longitude : float
        Coordinates sent as the ``lat`` / ``lon`` query parameters.

    Returns
    -------
    dict
        The parsed JSON body of the response (callers read ``display_name``).

    Raises
    ------
    requests.HTTPError
        If the service answers with a 4xx/5xx status (e.g. when throttled).
    requests.RequestException
        On connection problems or when the request exceeds the timeout.
    """
    response = requests.get(
        "https://nominatim.openstreetmap.org/reverse",
        # Letting requests build the query string handles encoding for us.
        params={
            "format": "json",
            "lat": latitude,
            "lon": longitude,
            "zoom": 18,
            "addressdetails": 1,
        },
        # Nominatim's usage policy asks clients to identify themselves;
        # generic library user agents may be rejected.
        headers={"User-Agent": "delhi-metro-network-notebook"},
        # Without a timeout a stalled connection would hang the notebook forever.
        timeout=10,
    )
    # Fail loudly on HTTP errors instead of trying to parse an error page as JSON.
    response.raise_for_status()
    return response.json()

In [152]:
# Smoke-testing the reverse-geocoding helper on a single hard-coded coordinate
latitude, longitude = 28.4089049, 76.9155232

street_data = get_street_data(latitude, longitude)
station_name = street_data.get('display_name')
# Leading newline reproduces the blank line printed before the result
print(f"\nDisplay name: {station_name}")

URL for testing Open Street Map API https://nominatim.openstreetmap.org/reverse?format=json&lat=28.4089049&lon=76.9155232&zoom=18&addressdetails=1

Display name: Mewka, Sector 92, Gurgaon, Gurugram District, Haryana, 122505, India


In [153]:
# Building the helper feature 'Latitude_Longitude' ("<latitude> <longitude>" as text)
# so that repeated coordinate pairs can be detected on a single column
data['Latitude_Longitude'] = data['Latitude'].astype(str).str.cat(
    data['Longitude'].astype(str), sep=' '
)

# Keeping every row whose coordinate pair occurs more than once (all occurrences, not just the repeats)
duplicate_combinations = data[data['Latitude_Longitude'].duplicated(keep=False)]
duplicate_combinations_count = duplicate_combinations['Latitude_Longitude'].count()

# Reporting only when at least one duplicated pair exists
if len(duplicate_combinations) > 0:
    print(f"{duplicate_combinations_count} duplicates found.\n")
    print("Duplicate combinations of latitude and longitude:\n")
    print(duplicate_combinations.loc[:, ['Station Name', 'Line', 'Latitude_Longitude']])

52 duplicates found.

Duplicate combinations of latitude and longitude:

                                       Station Name          Line  \
1                               Welcome [Conn: Red]          Pink   
2                                       DLF Phase 3   Rapid Metro   
10               Central Secretariat [Conn: Violet]        Yellow   
18                                 Belvedere Towers   Rapid Metro   
28                            Delta 1 Greater Noida          Aqua   
31                     Kashmere Gate [Conn: Yellow]        Voilet   
36   Dwarka Sector 21(First station) [Conn: Orange]          Blue   
39                                    Old Faridabad        Voilet   
54                                Inderlok Conn:Red         Green   
57                                 Noida Sector 148          Aqua   
62                                  Noida Sector 50          Aqua   
66              Kashmere Gate [Conn: Violet,Yellow]           Red   
71                            

In [None]:
# Function for fetching new station name
def fetch_station_name(row):
    """Return the 'display_name' OpenStreetMap reports for a row's coordinates.

    Parameters
    ----------
    row : mapping with 'Latitude' and 'Longitude' entries (e.g. a dataframe row).

    Returns
    -------
    The 'display_name' value of the API response, or None when the response
    has no such key.
    """
    street_data = get_street_data(row['Latitude'], row['Longitude'])
    return street_data.get('display_name', None)

# Several stations share the same coordinates, so each distinct pair is looked
# up only once and the result is reused for every row with that pair.
unique_coordinates = data[['Latitude', 'Longitude']].drop_duplicates()
station_names_by_coordinate = {}
for _, coordinate in unique_coordinates.iterrows():
    key = (coordinate['Latitude'], coordinate['Longitude'])
    station_names_by_coordinate[key] = fetch_station_name(coordinate)
    # Nominatim's usage policy allows at most one request per second.
    time.sleep(1)

data['New Station Name'] = [
    station_names_by_coordinate[key]
    for key in zip(data['Latitude'], data['Longitude'])
]

In [157]:
# Keeping only the text before the first comma of the display name
data['New Station Name'] = data['New Station Name'].str.split(',', n=1).str.get(0)

In [158]:
# Adding feature 'Status' to flag each row as 'Unique' or 'Duplicate' based on its coordinates (currently disabled)
#data['Status'] = 'Unique'
#data.loc[data[data.loc[:, ['Latitude', 'Longitude']].duplicated(keep=False)].index, 'Status'] = 'Duplicate'

In [159]:
# Saving the processed dataframe into new folder destined for cleaned and processed data.
# Forward slashes keep the relative path valid on Windows, macOS and Linux;
# the 'data_preprocessed' folder must already exist.
data.to_csv('data_preprocessed/Delhi-Metro-Network.csv')