## 4. Exploratory Data Analysis and Visualization

#### 4.1 Introduction to this notebook

#### 4.2 Data Loading and Setup

In [9]:
!pip install pandoc
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # plotting pandas data 
import seaborn as sns # improving visual aesthetics and offers wide range of plots
import plotly.express as px # interactive mapping (used for Network Analysis and Line Analysis)
import plotly.graph_objects as go
import folium # interactive mapping (used for Network Analysis and Line Analysis)
from folium.plugins import MarkerCluster
from IPython.display import display, Markdown
import warnings
warnings.filterwarnings('ignore')

# Add path to the Static Data directory
path_to_dir = "../data/processed_data/"

# Processed data
routes = pd.read_csv(path_to_dir + 'routes.csv')
stations = pd.read_csv(path_to_dir + 'stations.csv')
shapes = pd.read_csv(path_to_dir + 'shapes.csv')
stop_times = pd.read_csv(path_to_dir + 'stop_times.csv')
trips = pd.read_csv(path_to_dir + 'trips.csv')

Collecting pandoc
  Downloading pandoc-2.4.tar.gz (34 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting plumbum (from pandoc)
  Downloading plumbum-1.9.0-py3-none-any.whl.metadata (10 kB)
Collecting ply (from pandoc)
  Downloading ply-3.11-py2.py3-none-any.whl.metadata (844 bytes)
Downloading plumbum-1.9.0-py3-none-any.whl (127 kB)
Downloading ply-3.11-py2.py3-none-any.whl (49 kB)
Building wheels for collected packages: pandoc
  Building wheel for pandoc (pyproject.toml): started
  Building wheel for pandoc (pyproject.toml): finished with status 'done'
  Created wheel for pandoc: filename=pandoc-2.4-py3-none-any.whl size=34898 sha256=97426b5067de9dd20d2fafaebe45cabc4fd73443204f28379202f26c63a6a3a


[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


#### 4.3 Initial Data Cleaning and Exploration

##### 4.3.1 Missing Values

In [3]:
datasets = [routes, stations, shapes, stop_times, trips]
for df, name in zip(datasets, ['Routes', 'Stations', 'Shapes', 'Stop Times', 'Trips']):
    print(f"Missing values in {name}:\n{df.isnull().sum()}\n")

Missing values in Routes:
route_id               0
metro_line             0
route_name             0
route_start            0
route_end              0
route_type             0
stations               0
interchangeStations    0
shape_id               0
dtype: int64

Missing values in Stations:
station_id      0
station_name    0
metro_line      0
routes          0
latitude        0
longitude       0
station_type    0
dtype: int64

Missing values in Shapes:
shape_id               0
route_id               0
metro_line             0
shape_pt_sequence      0
shape_pt_lat           0
shape_pt_lon           0
shape_dist_traveled    0
dtype: int64

Missing values in Stop Times:
trip_id                0
station_id             0
station_sequence       0
arrival_time           0
departure_time         0
shape_dist_traveled    0
dtype: int64

Missing values in Trips:
route_id        0
service_type    0
trip_id         0
shape_id        0
dtype: int64



No missing values since we have already dealt with it in the data cleaning and processing stage.

##### 4.3.2 Data Types

In [4]:
for df, name in zip(datasets, ['Routes', 'Stations', 'Shapes', 'Stop Times', 'Trips']):
    print(f"Data types in {name}:\n{df.dtypes}\n")

Data types in Routes:
route_id                int64
metro_line             object
route_name             object
route_start            object
route_end              object
route_type             object
stations               object
interchangeStations    object
shape_id               object
dtype: object

Data types in Stations:
station_id        int64
station_name     object
metro_line       object
routes           object
latitude        float64
longitude       float64
station_type     object
dtype: object

Data types in Shapes:
shape_id                object
route_id                 int64
metro_line              object
shape_pt_sequence        int64
shape_pt_lat           float64
shape_pt_lon           float64
shape_dist_traveled    float64
dtype: object

Data types in Stop Times:
trip_id                  int64
station_id               int64
station_sequence         int64
arrival_time            object
departure_time          object
shape_dist_traveled    float64
dtype: object

Data 

##### 4.3.3 Datetime Formatting

In [5]:
stop_times['arrival_time'] = pd.to_datetime(stop_times['arrival_time'], format='%H:%M:%S', errors='coerce')
stop_times['departure_time'] = pd.to_datetime(stop_times['departure_time'], format='%H:%M:%S', errors='coerce')

##### 4.3.4 Basic Statistics

In [6]:
for df, name in zip(datasets, ['Routes', 'Stations', 'Shapes', 'Stop Times', 'Trips']):
    print(f"Basic statistics for {name}:\n{df.describe()}\n")

Basic statistics for Routes:
        route_id
count  36.000000
mean   17.500000
std    10.535654
min     0.000000
25%     8.750000
50%    17.500000
75%    26.250000
max    35.000000

Basic statistics for Stations:
       station_id    latitude   longitude
count  286.000000  286.000000  286.000000
mean   145.132867   28.598539   77.215460
std    122.565972    0.079988    0.118197
min      1.000000   28.339899   76.919128
25%     61.250000   28.550648   77.127554
50%    121.500000   28.614899   77.214692
75%    190.750000   28.667693   77.296618
max    520.000000   28.742872   77.544075

Basic statistics for Shapes:
          route_id  shape_pt_sequence  shape_pt_lat  shape_pt_lon  \
count  6643.000000        6643.000000   6643.000000   6643.000000   
mean     16.332530         124.625922     28.602008     77.210184   
std      10.203636         100.105057      0.072177      0.111904   
min       0.000000           1.000000     28.339899     76.919144   
25%       6.000000          47.00

#### 4.4 Geographical Analysis

##### 4.4.1 Plotting stations

In [7]:
metro_color_codes = {
    'Red Line': "#FF4040",
    'Yellow Line': "#FFDF00",
    'Blue Line': "#4169E1",
    'Green Line': "#20B2AA",
    'Violet Line': "#553592",
    'Pink Line': "#FC8EAC",
    'Magenta Line': "#CC338B",
    'Gray Line': "#838996",
    'Orange/airport Line': "#FF8C00",
    'Rapid Line': "cadetblue",
    'Aqua Line': "aqua"
}

In [8]:
# Create a blank map
fig = go.Figure()

# Plot each route
for _, route in routes.iterrows():
    # Get the shape points for the route
    shape_data = shapes[shapes['route_id'] == route['route_id']]
    
    # Add route lines to the figure
    fig.add_trace(go.Scattermapbox(
        mode='lines',
        lon=shape_data['shape_pt_lon'],
        lat=shape_data['shape_pt_lat'],
        line=dict(width=3, color=metro_color_codes.get(route['metro_line'], 'gray')),
        name=route['metro_line'],
        hoverinfo='text',
        hovertext=f"Route: {route['route_name']}"
    ))

# Plot the stations
fig.add_trace(go.Scattermapbox(
    mode='markers+text',
    lon=stations['longitude'],
    lat=stations['latitude'],
    marker=dict(size=11, color=stations['color'], symbol='circle'),
    text=stations['station_name'],
    textposition='top center',
    hoverinfo='text',
    hovertext=stations['station_name'] + '<br>Metro Line: ' + stations['metro_line']
))

# Update layout
fig.update_layout(
    mapbox=dict(
        style='carto-positron',
        center=dict(lat=28.6139, lon=77.2090),
        zoom=11
    ),
    title="Delhi Metro Network",
    title_x=0.5,
    margin={"r":0,"t":50,"l":0,"b":0},
    showlegend=False,
)

# Display the map
fig.show()

KeyError: 'color'

In [None]:
# Create a base map centered around Delhi
map_delhi = folium.Map(location=[28.6139, 77.2090], tiles="cartodbpositron", zoom_start=11)

# Define custom icons for stations (optional, you can use a URL to an icon if you have it)
custom_icon_url = './img/metro_pin.png'  # Replace with your icon's URL
custom_icon = folium.CustomIcon(custom_icon_url, icon_size=(25, 25))

# Plot the routes
for _, route in routes.iterrows():
    # Get the shape points for the route
    shape_data = shapes[shapes['route_id'] == route['route_id']]
    
    # Create a list of coordinates (latitude, longitude) for the polyline
    route_coords = list(zip(shape_data['shape_pt_lat'], shape_data['shape_pt_lon']))
    
    # Add the route as a polyline on the map
    folium.PolyLine(
        route_coords,
        color=metro_color_codes.get(route['metro_line'], 'gray'),  # Get color for metro line
        weight=3,  # Line thickness
        tooltip=f"Route: {route['route_name']} ({route['metro_line']})",  # Hover information
    ).add_to(map_delhi)

# Add stations with hover info and custom marker (use MarkerCluster to cluster nearby stations)
marker_cluster = MarkerCluster().add_to(map_delhi)

for _, station in stations.iterrows():
    # Use a custom icon if available, otherwise use a default folium marker
    color = 'black' if station['station_type'] in ['Interchange', 'Terminal/Interchange'] else metro_color_codes.get(station['metro_line'], 'blue')
    icon = 'info' if station['station_type'] in ['Interchange', 'Terminal/Interchange'] else 'train'
    
    folium.Marker(
        location=[station['latitude'], station['longitude']],
        popup=f"Station: <b>{station['station_name']}</b><br>Metro Line: {station['metro_line']}",
        icon=folium.Icon(color="white", icon_color=color, icon=icon, prefix='fa'),
        tooltip=f"<b>{station['station_name']}</b><br>Metro Line: {station['metro_line']}"
    ).add_to(map_delhi)

# Display the map
map_delhi