In [66]:
from math import sin, cos, sqrt, atan2, radians
from pathlib import Path

import pandas as pd
from tqdm import tqdm

In [95]:
# Load the three raw inputs: bus stops, MRT station coordinates, and MRT station codes.
bus_stop_data = pd.read_csv('data/bus_stops.csv')

# NOTE(review): drop_duplicates() compares every column. If this CSV carries a
# unique row-number column (the preview shows an 'Unnamed: 0' column), no rows
# would ever be considered duplicates -- TODO confirm the dedup does what is intended.
mrt_latlong_raw = pd.read_csv('data/mrt_stations_with_geo_data.csv')
mrt_latlong_data = mrt_latlong_raw.drop_duplicates()

mrt_station_codes = pd.read_excel('data/Train Station Codes and CHinese Names.xls')

In [96]:
# Preview the first rows of the bus stop table.
bus_stop_data.head()

Unnamed: 0,BusStopCode,RoadName,Description,Latitude,Longitude
0,1012,Victoria St,Hotel Grand Pacific,1.296848,103.852536
1,1013,Victoria St,St. Joseph's Ch,1.29771,103.853225
2,1019,Victoria St,Bras Basah Cplx,1.29699,103.853022
3,1029,Nth Bridge Rd,Opp Natl Lib,1.296673,103.854414
4,1039,Nth Bridge Rd,Bugis Cube,1.298208,103.855491


In [97]:
# Preview the first rows of the MRT station coordinates table.
mrt_latlong_data.head()

Unnamed: 0,station_name,latitude,longitude,postal,x,y,street
0,Jurong East,1.331864,103.740927,608513,17717.794105,34896.479188,JURONG EAST BUS INTERCHANGE
1,Bukit Batok,1.349719,103.750969,650631,18835.371845,36870.793752,BUKIT BATOK BUS INTERCHANGE
2,Bukit Gombak,1.367755,103.757472,667988,19559.140114,38865.049165,BUKIT GOMBAK CAMP
3,Choa Chu Kang,1.385393,103.744067,689811,18067.42694,40815.409193,CHOA CHU KANG BUS INTERCHANGE
4,Yew Tee,1.397194,103.745923,NIL,18274.010378,42120.341121,HEART OF YEW TEE


In [98]:
# Preview the first rows of the MRT station codes table.
mrt_station_codes.head()

Unnamed: 0,stn_code,mrt_station_english,mrt_station_chinese,mrt_line_english,mrt_line_chinese
0,NS1,Jurong East,裕廊东,North-South Line,南北线
1,NS2,Bukit Batok,武吉巴督,North-South Line,南北线
2,NS3,Bukit Gombak,武吉甘柏,North-South Line,南北线
3,NS4,Choa Chu Kang,蔡厝港,North-South Line,南北线
4,NS5,Yew Tee,油池,North-South Line,南北线


In [99]:
# The MRT line name (mrt_line_english) lives in the station-codes table, so the two
# tables have to be joined on the station name. Clean both name columns the same way
# (trim whitespace, Title Case) so that the join keys match.
for frame, name_col in (
    (mrt_latlong_data, "station_name"),
    (mrt_station_codes, "mrt_station_english"),
):
    frame[name_col] = frame[name_col].str.strip().str.title()

for frame in (mrt_station_codes, mrt_latlong_data):
    print(frame.shape)

# Stations listed once per line show up with a count above 1
# (value_counts sorts by count, descending, by default).
mrt_station_codes["mrt_station_english"].value_counts()



(211, 5)
(180, 7)


mrt_station_english
Outram Park       3
Marina Bay        3
Dhoby Ghaut       3
Bugis             2
Raffles Place     2
                 ..
Buangkok          1
Bras Basah        1
Esplanade         1
Nicoll Highway    1
Bayshore          1
Name: count, Length: 179, dtype: int64

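Note the difference in the number of rows between the two dataframes (211 station-code rows vs. 180 coordinate rows). This is because an MRT station that sits on several MRT lines (e.g. Dhoby Ghaut) has one row per line in the station-codes table.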

In [100]:
# Left join: keep every coordinate row and attach the station-code rows whose
# English station name matches.
mrt_station_data = mrt_latlong_data.merge(
    mrt_station_codes,
    how='left',
    left_on='station_name',
    right_on='mrt_station_english',
)

In [101]:
def calculate_distance(lat1, lon1, lat2, lon2):
    """Return the great-circle (haversine) distance in metres between two points.

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in decimal degrees.

    Returns
    -------
    float
        Distance in metres, computed on a sphere of radius 6373 km.
    """
    # approximate radius of earth in km
    R = 6373.0

    lat1 = radians(lat1)
    lon1 = radians(lon1)
    lat2 = radians(lat2)
    lon2 = radians(lon2)

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) raise ValueError.
    # Clamping leaves every in-range value untouched.
    a = min(1.0, max(0.0, a))
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    # km -> m
    distance = R * c * 1000

    return distance


In [102]:
# Normalise the MRT line names of mrt_station_data:
#   1. fold "Circle Line Extension" into "Circle Line",
#   2. drop the "Changi Airport Branch Line" rows,
#   3. replace the full line names with their short codes.

mrt_station_data.loc[mrt_station_data["mrt_line_english"] == "Circle Line Extension","mrt_line_english"] = "Circle Line"

# .copy() gives an independent frame, so the column assignment below does not write
# into a slice of the merged frame (avoids pandas' SettingWithCopyWarning).
mrt_station_data = mrt_station_data[mrt_station_data["mrt_line_english"] != "Changi Airport Branch Line"].copy()

line_names = mrt_station_data["mrt_line_english"].str.strip()
# Values absent from the mapping are kept as they are instead of becoming NaN.
# This makes the cell safe to re-run (already-converted codes such as 'NSL' survive)
# and keeps any unexpected line name visible in the output below.
mrt_station_data["mrt_line_english"] = line_names.map({'North-South Line':'NSL',
                                                       'East-West Line':'EWL',
                                                       'Bukit Panjang LRT':'BPLRT',
                                                       'Thomson-East Coast Line':'TEL',
                                                       'Circle Line':'CCL',
                                                       'Downtown Line':'DTL',
                                                       'North East Line':'NEL',
                                                       'Sengkang LRT':'SGLRT',
                                                       'Punggol LRT':'PGLRT'
                                                       }
                                                      ).fillna(line_names)

# Rows without a line (NaN can appear for stations the left join could not match)
# are skipped here: NaN cannot be ordered against strings when sorting.
mrt_lines = mrt_station_data["mrt_line_english"].dropna().unique()
mrt_lines.sort()
mrt_lines
# 9 MRT Lines

array(['BPLRT', 'CCL', 'DTL', 'EWL', 'NEL', 'NSL', 'PGLRT', 'SGLRT',
       'TEL'], dtype=object)

In [103]:
# `rows` collects one record per bus stop; it is later joined side by side with the
# original bus stops dataframe.
# Each record has: [BusStopCode, BPLRT_distance, BPLRT_station_name, CCL_distance, CCL_station_name, ...]

# The stations of each line do not depend on the bus stop, so extract them once
# up front instead of re-filtering the dataframe for every (bus stop, line) pair.
stations_by_line = {}
for mrt_line in mrt_lines:
    line_stations = mrt_station_data[mrt_station_data["mrt_line_english"] == mrt_line]
    stations_by_line[mrt_line] = list(zip(line_stations["station_name"],
                                          line_stations["latitude"],
                                          line_stations["longitude"]))

rows = []

# total= lets tqdm render a real progress bar with an ETA (iterrows() has no len()).
for _, row in tqdm(bus_stop_data.iterrows(), total=len(bus_stop_data)):
    stop_lat = row["Latitude"]
    stop_lon = row["Longitude"]
    row_data = [row["BusStopCode"]]
    for mrt_line in mrt_lines:

        # Sentinel values, kept if the line has no stations at all.
        min_distance = 1000000000
        nearest_station = ""
        for station_name, station_lat, station_lon in stations_by_line[mrt_line]:
            dist = calculate_distance(stop_lat, stop_lon, station_lat, station_lon)
            # Strict '<' keeps the first station encountered on ties.
            if dist < min_distance:
                min_distance = dist
                nearest_station = station_name

        row_data.extend([min_distance,nearest_station])
    rows.append(row_data)


5137it [00:35, 144.94it/s]


In [104]:
# Convert the collected records into a dataframe, then attach them to bus_stop_data
# by BusStopCode.

# One (distance, station name) column pair per MRT line, in the order the records were built.
column_names = [(f"{mrt_line}_distance",f"{mrt_line}_station_name") for mrt_line in mrt_lines]

flat_columns = ["BusStopCode"]
for distance_col, station_col in column_names:
    flat_columns += [distance_col, station_col]

rows = pd.DataFrame(rows, columns=flat_columns)

# validate="1:1" makes the merge fail if BusStopCode is duplicated on either side.
bus_stop_data_updated = bus_stop_data.merge(rows, on="BusStopCode", how="left", validate="1:1")

In [111]:
# Write the enriched bus stop table to disk (without the dataframe index).
output_path = Path("processed_data/bus_stops_with_nearest_mrt_data.csv")
# to_csv does not create missing directories, so make sure the output folder
# exists (e.g. on a fresh checkout) before writing.
output_path.parent.mkdir(parents=True, exist_ok=True)
bus_stop_data_updated.to_csv(output_path,index=False)