In [1]:
import pandas as pd
import numpy as np

# Definitions
* **Bus line**:

    Defined as: (bus_service_number,direction) 
    
    There are 2 directions for each bus service, 1 and 2.

* **Distance between a Bus Stop and an MRT station**:

    Defined using Euclidean Distance

    This is a proxy for walking distance.

# Approach
We will split the work into multiple notebooks for faster computation and easier debugging.

* get_nearest_mrt_to_bus_stops.ipynb -> ETL to get bus_stops_with_nearest_mrt_data.csv

    The new csv will have at least the following columns: 
     - Bus Stop Code
     - MRT Station Name
     - CCL_station_name
     - CCL_distance
     - NSL_station_name
     - NSL_distance
     ... (we will have 2 columns for each MRT line)
    
    Primary Key is Bus Stop Code.

* Bus Algorithm.ipynb -> dictionary of scores



In [2]:
# Load the bus route table and preview its first rows
bus_routes_df = pd.read_csv(filepath_or_buffer='data/bus_routes.csv')
bus_routes_df.head(5)

Unnamed: 0,ServiceNo,Operator,Direction,StopSequence,BusStopCode,Distance,WD_FirstBus,WD_LastBus,SAT_FirstBus,SAT_LastBus,SUN_FirstBus,SUN_LastBus
0,10,SBST,1,1,75009,0.0,500,2300,500,2300,500,2300
1,10,SBST,1,2,76059,0.6,502,2302,502,2302,502,2302
2,10,SBST,1,3,76069,1.1,504,2304,504,2304,503,2304
3,10,SBST,1,4,96289,2.3,508,2308,508,2309,507,2308
4,10,SBST,1,5,96109,2.7,509,2310,509,2311,508,2309


In [3]:
# Load the per-bus-stop table of nearest stations (one distance/name pair per MRT line)
bus_stops_with_nearest_mrt_df = pd.read_csv(
    filepath_or_buffer='processed_data/bus_stops_with_nearest_mrt_data.csv'
)
bus_stops_with_nearest_mrt_df.head(5)

Unnamed: 0,BusStopCode,RoadName,Description,Latitude,Longitude,BPLRT_distance,BPLRT_station_name,CCL_distance,CCL_station_name,DTL_distance,...,NEL_distance,NEL_station_name,NSL_distance,NSL_station_name,PGLRT_distance,PGLRT_station_name,SGLRT_distance,SGLRT_station_name,TEL_distance,TEL_station_name
0,1012,Victoria St,Hotel Grand Pacific,1.296848,103.852536,12627.645635,Pending,593.41522,Bras Basah,360.474492,...,830.710778,Dhoby Ghaut,371.033933,Somerset,1871.146185,Nibong,10381.52018,Fernvale,1917.33853,Gardens By The Bay
1,1013,Victoria St,St. Joseph's Ch,1.29771,103.853225,12616.136712,Pending,471.411686,Bras Basah,239.135214,...,877.652738,Dhoby Ghaut,368.800944,Somerset,1903.57113,Nibong,10272.490723,Fernvale,1935.477848,Gardens By The Bay
2,1019,Victoria St,Bras Basah Cplx,1.29699,103.853022,12655.453666,Pending,545.294931,Bras Basah,320.881807,...,876.763197,Dhoby Ghaut,397.758557,Somerset,1851.135676,Nibong,10355.453514,Fernvale,1891.161808,Gardens By The Bay
3,1029,Nth Bridge Rd,Opp Natl Lib,1.296673,103.854414,12791.429559,Pending,490.588813,Bras Basah,326.541656,...,1035.017656,Dhoby Ghaut,459.503618,City Hall,1734.383323,Nibong,10361.025686,Fernvale,1761.810539,Gardens By The Bay
4,1039,Nth Bridge Rd,Bugis Cube,1.298208,103.855491,12761.714232,Pending,288.150419,Bras Basah,208.321318,...,1117.203665,Dhoby Ghaut,585.584287,Somerset,1823.333386,Nibong,10171.916411,Fernvale,1825.80406,Gardens By The Bay


In [4]:
# Each bus stop row carries one "<LINE>_distance" column per MRT/LRT line.
# The helper below picks the smallest of them; the result becomes a new column.
def fn_get_distance(bus_stop_row:pd.Series) -> float:
    """Return the smallest of the nine per-line distance values in a bus stop row.

    Args:
        bus_stop_row: A row holding the BPLRT, CCL, DTL, EWL, NEL, NSL, PGLRT,
            SGLRT and TEL ``*_distance`` entries.

    Returns:
        The minimum of those nine values.

    Raises:
        KeyError: If any of the nine distance entries is missing from the row.
    """
    distance_columns = (
        'BPLRT_distance',
        'CCL_distance',
        'DTL_distance',
        'EWL_distance',
        'NEL_distance',
        'NSL_distance',
        'PGLRT_distance',
        'SGLRT_distance',
        'TEL_distance',
    )
    # Read every value first, in the fixed column order, then take the minimum
    line_distances = [bus_stop_row[column] for column in distance_columns]
    return min(line_distances)

# Row-wise: store the smallest per-line distance of each bus stop in a new column
bus_stops_with_nearest_mrt_df['nearest_mrt_distance'] = bus_stops_with_nearest_mrt_df.apply(
    fn_get_distance,
    axis='columns',
)

def fn_get_nearest_mrt(bus_stop_row:pd.Series) -> str:
    """Return the label of the distance column that equals the row's nearest distance.

    The columns are checked in a fixed order (BPLRT, CCL, DTL, EWL, NEL, NSL,
    PGLRT, SGLRT, TEL) and the first one whose value equals
    ``bus_stop_row['nearest_mrt_distance']`` wins, so ties go to the earlier line.

    Note that the returned value is the column label itself (for example
    ``'EWL_distance'``), not the name of a station.

    Args:
        bus_stop_row: A row holding ``nearest_mrt_distance`` and the per-line
            ``*_distance`` entries.

    Returns:
        The matching column label, or ``None`` when no column matches.
    """
    candidate_columns = (
        'BPLRT_distance',
        'CCL_distance',
        'DTL_distance',
        'EWL_distance',
        'NEL_distance',
        'NSL_distance',
        'PGLRT_distance',
        'SGLRT_distance',
        'TEL_distance',
    )
    for column in candidate_columns:
        if bus_stop_row['nearest_mrt_distance'] == bus_stop_row[column]:
            return column
    return None
# Row-wise: record which distance column holds the minimum.
# The stored value is the column label (e.g. 'EWL_distance'), as returned by fn_get_nearest_mrt.
bus_stops_with_nearest_mrt_df['nearest_mrt_station_name'] = bus_stops_with_nearest_mrt_df.apply(
    fn_get_nearest_mrt,
    axis='columns',
)

In [5]:
# Convert the frame to a dict keyed by BusStopCode for fast lookup:
#   {bus_stop_code: {column_name: value, ...}, ...}
# set_index() is used without inplace=True so the source frame keeps its
# 'BusStopCode' column and this cell can be re-run without raising a KeyError.
bus_stops_with_nearest_mrt_dict = (
    bus_stops_with_nearest_mrt_df
    .set_index('BusStopCode')
    .to_dict(orient='index')
)
# E.g.
bus_stops_with_nearest_mrt_dict[17091]

{'RoadName': 'Clementi Rd',
 'Description': 'Aft Clementi Ave 1',
 'Latitude': 1.30904805161426,
 'Longitude': 103.77157783850733,
 'BPLRT_distance': 7462.235844557487,
 'BPLRT_station_name': 'Pending',
 'CCL_distance': 2042.3107185462184,
 'CCL_station_name': 'One-North',
 'DTL_distance': 2711.8760874703285,
 'DTL_station_name': 'King Albert Park',
 'EWL_distance': 1345.036173889697,
 'EWL_station_name': 'Dover',
 'NEL_distance': 7413.860801613406,
 'NEL_station_name': 'Harbourfront',
 'NSL_distance': 4249.411619989203,
 'NSL_station_name': 'Jurong East',
 'PGLRT_distance': 3159.473265091157,
 'PGLRT_station_name': 'Oasis',
 'SGLRT_distance': 14155.159113654605,
 'SGLRT_station_name': 'Fernvale',
 'TEL_distance': 5274.605123069488,
 'TEL_station_name': 'Napier',
 'nearest_mrt_distance': 1345.036173889697,
 'nearest_mrt_station_name': 'EWL_distance'}

In [6]:
# Score every bus line against every MRT/LRT line.
#   busline_scores: key = (bus_service_number, direction)
#                   value = dict of MRT line -> score, ordered by ascending score
# A score is the median, over all stops of the bus line, of the stop's
# "<mrt_line>_distance" value, truncated to an int. The pseudo-line 'nearest_mrt'
# uses the 'nearest_mrt_distance' value of each stop.
busline_scores = {}

for _, row in bus_routes_df[["ServiceNo","Direction"]].drop_duplicates().iterrows(): # 1 iteration per bus line
    service_number = row['ServiceNo']
    direction = row['Direction']

    # The stops served by a bus line do not depend on the MRT line, so select them
    # (and fetch their lookup records) once per bus line instead of once per MRT line.
    bus_stops = bus_routes_df.loc[(bus_routes_df["ServiceNo"] == service_number)
                                  & (bus_routes_df["Direction"] == direction),
                                  "BusStopCode"]
    stop_records = [bus_stops_with_nearest_mrt_dict[bus_stop] for bus_stop in bus_stops]

    mrt_line_scores = {}
    for mrt_line in ['BPLRT', 'CCL', 'DTL', 'EWL', 'NEL', 'NSL', 'PGLRT', 'SGLRT','TEL','nearest_mrt']:
        # Distance from each stop to the nearest station on this MRT line
        distances_to_mrt = [record[f"{mrt_line}_distance"] for record in stop_records]

        # int() truncates the median toward zero; it does not round to the nearest integer
        mrt_line_scores[mrt_line] = int(np.median(distances_to_mrt))

    # Order the MRT lines by ascending score
    mrt_line_scores = dict(sorted(mrt_line_scores.items(), key=lambda item: item[1]))
    busline_scores[(service_number,direction)] = mrt_line_scores

In [7]:
# Order the bus lines by their best (lowest) score across the MRT lines
busline_scores = dict(
    sorted(
        busline_scores.items(),
        key=lambda entry: min(entry[1].values()),
    )
)

# Print the first 50 bus lines
for busline, mrt_scores in list(busline_scores.items())[:50]:
    print(busline, mrt_scores)

('384', 1) {'PGLRT': 190, 'nearest_mrt': 190, 'NEL': 712, 'SGLRT': 1601, 'CCL': 6732, 'NSL': 7143, 'DTL': 7176, 'EWL': 7785, 'TEL': 7907, 'BPLRT': 14368}
('163A', 1) {'SGLRT': 193, 'nearest_mrt': 193, 'NEL': 902, 'PGLRT': 2050, 'CCL': 4212, 'NSL': 5473, 'DTL': 5933, 'TEL': 6179, 'EWL': 6487, 'BPLRT': 13257}
('374', 1) {'SGLRT': 210, 'nearest_mrt': 210, 'NEL': 1063, 'PGLRT': 1484, 'CCL': 4787, 'NSL': 5546, 'TEL': 6263, 'DTL': 6689, 'EWL': 7368, 'BPLRT': 12866}
('991B', 1) {'BPLRT': 217, 'nearest_mrt': 217, 'NSL': 680, 'DTL': 1805, 'EWL': 4460, 'TEL': 5623, 'CCL': 9318, 'PGLRT': 11443, 'SGLRT': 13835, 'NEL': 13975}
('973A', 1) {'BPLRT': 220, 'nearest_mrt': 220, 'DTL': 807, 'NSL': 1795, 'EWL': 5391, 'TEL': 5826, 'CCL': 8101, 'PGLRT': 10464, 'SGLRT': 11553, 'NEL': 11834}
('114A', 1) {'NEL': 248, 'nearest_mrt': 248, 'SGLRT': 800, 'PGLRT': 2980, 'CCL': 3248, 'NSL': 4458, 'TEL': 4940, 'DTL': 5433, 'EWL': 5938, 'BPLRT': 12105}
('976', 2) {'BPLRT': 262, 'nearest_mrt': 262, 'DTL': 1231, 'NSL': 1

In [None]:
# Save the scores as a flat CSV with one row per bus line.
# from_dict(orient='index') turns the (service, direction) tuple keys into a
# two-level row index. Naming those levels and resetting the index yields the
# 'BusService' and 'Direction' columns directly, so the file is written once
# instead of being written, read back and renamed via the auto-generated
# 'Unnamed: N' headers.
busline_score_csv = (
    pd.DataFrame.from_dict(busline_scores, orient='index')
    .rename_axis(index=['BusService', 'Direction'])
    .rename(columns={'nearest_mrt': 'nearest_mrt_lrt'})
    .reset_index()
)

busline_score_csv.to_csv('processed_data/busline_score.csv', index = False)
busline_score_csv.head()

Unnamed: 0,BusService,Direction,PGLRT,nearest_mrt_lrt,NEL,SGLRT,CCL,NSL,DTL,EWL,TEL,BPLRT
0,384,1,190,190,712,1601,6732,7143,7176,7785,7907,14368
1,163A,1,2050,193,902,193,4212,5473,5933,6487,6179,13257
2,374,1,1484,210,1063,210,4787,5546,6689,7368,6263,12866
3,991B,1,11443,217,13975,13835,9318,680,1805,4460,5623,217
4,973A,1,10464,220,11834,11553,8101,1795,807,5391,5826,220


In [None]:
# Sanity check for the TEL line: re-read the saved file and show the rows with the lowest TEL score
(
    pd.read_csv('processed_data/busline_score.csv')
    .sort_values('TEL')
    .head(5)
)

Unnamed: 0,BusService,Direction,PGLRT,nearest_mrt_lrt,NEL,SGLRT,CCL,NSL,DTL,EWL,TEL,BPLRT
112,902,1,12960,407,12300,11031,12223,1561,7554,12004,407,5785
124,196e,2,1344,417,5368,8927,2324,6210,2377,1765,443,17143
181,911,1,12882,456,12201,10940,12220,1449,7436,11749,456,5841
63,196e,1,1208,345,5763,8901,2730,6563,2442,1818,471,17505
110,269,1,5889,406,4269,3782,3018,692,6222,7471,512,7559
...,...,...,...,...,...,...,...,...,...,...,...,...
691,182,1,18024,1461,21921,27307,17242,12028,14604,1461,18299,13423
689,182M,1,18024,1448,21888,27323,17241,12044,14621,1448,18316,13441
634,248,1,17869,1141,21669,27100,17097,11732,14384,1141,18459,13355
579,247,1,17092,911,20922,26654,16304,11405,13954,911,18512,13127
