# Taxi Compass
by Alejandro Seif

# **Day 1: Data Collection**

## LTA Taxi Availability data API

* Import statement

In [26]:
import requests
import pandas as pd
import time
from datetime import datetime
import numpy as np

### Sequential Taxi Data API Interaction

This Information updates every 50-60 seconds, so this service can be called at most 1x per minute

In [3]:
gov = 'https://api.data.gov.sg/v1'
taxi_url = '/transport/taxi-availability'

url = gov + taxi_url
response = requests.get(url).json()

response

{'type': 'FeatureCollection',
 'crs': {'type': 'link',
  'properties': {'href': 'http://spatialreference.org/ref/epsg/4326/ogcwkt/',
   'type': 'ogcwkt'}},
 'features': [{'type': 'Feature',
   'geometry': {'type': 'MultiPoint',
    'coordinates': [[103.6208, 1.27543],
     [103.629028816667, 1.29692938333333],
     [103.63047, 1.30362],
     [103.64044, 1.32894],
     [103.64229, 1.3313],
     [103.64394, 1.32579],
     [103.64865, 1.32125],
     [103.64888, 1.33658],
     [103.65387, 1.32779],
     [103.65936, 1.32],
     [103.66102, 1.31064],
     [103.66284, 1.31551],
     [103.66703, 1.29552],
     [103.67245, 1.27815],
     [103.68104, 1.33],
     [103.685260883333, 1.34188793333333],
     [103.68657, 1.34],
     [103.68709, 1.34],
     [103.68712, 1.34325],
     [103.688713166667, 1.34112716666667],
     [103.688912833333, 1.34283283333333],
     [103.68945, 1.34515],
     [103.69029, 1.33968],
     [103.69039, 1.3424],
     [103.690400183333, 1.34169023333333],
     [103.69138, 

In [4]:
timestamp_str = response['features'][0]['properties']['timestamp']
timestamp = datetime.strptime(timestamp_str, '%Y-%m-%dT%H:%M:%S+08:00')
print(timestamp)

2021-12-21 20:29:39


In [5]:
taxi_coordinates = response['features'][0]['geometry']['coordinates']
taxi_coordinates_df= pd.DataFrame.from_dict(taxi_coordinates)
taxi_coordinates_df= taxi_coordinates_df.rename(columns = {0:'lon',1:'lat'})
taxi_coordinates_df['timestamp']=timestamp
taxi_coordinates_df.astype({'lat':'float32','lon':'float32'})
taxi_coordinates_df

Unnamed: 0,lon,lat,timestamp
0,103.620800,1.275430,2021-12-21 20:29:39
1,103.629029,1.296929,2021-12-21 20:29:39
2,103.630470,1.303620,2021-12-21 20:29:39
3,103.640440,1.328940,2021-12-21 20:29:39
4,103.642290,1.331300,2021-12-21 20:29:39
...,...,...,...
2892,103.990760,1.360780,2021-12-21 20:29:39
2893,103.990820,1.359610,2021-12-21 20:29:39
2894,103.998017,1.389153,2021-12-21 20:29:39
2895,104.001980,1.374470,2021-12-21 20:29:39


Now we are going to be setting up a function to retrieve and append so that it can be called

In [6]:
def taxi_coordinates_append(df):
    '''
    Pass an existing dataframe to append more data
    '''
    gov = 'https://api.data.gov.sg/v1'
    taxi_url = '/transport/taxi-availability'

    url = gov + taxi_url
    response = requests.get(url).json()
    
    timestamp_str = response['features'][0]['properties']['timestamp']
    timestamp = datetime.strptime(timestamp_str, '%Y-%m-%dT%H:%M:%S+08:00')
    
    taxi_coordinates = response['features'][0]['geometry']['coordinates']
    temp_taxi_coordinates_df= pd.DataFrame.from_dict(taxi_coordinates)
    temp_taxi_coordinates_df= temp_taxi_coordinates_df.rename(columns = {0:'lon',1:'lat'})
    temp_taxi_coordinates_df['timestamp']=timestamp
    temp_taxi_coordinates_df.astype({'lat':'float32','lon':'float32'})
    return pd.concat([df,temp_taxi_coordinates_df])
    

In [7]:
taxi_coordinates_append(taxi_coordinates_df)

Unnamed: 0,lon,lat,timestamp
0,103.620800,1.275430,2021-12-21 20:29:39
1,103.629029,1.296929,2021-12-21 20:29:39
2,103.630470,1.303620,2021-12-21 20:29:39
3,103.640440,1.328940,2021-12-21 20:29:39
4,103.642290,1.331300,2021-12-21 20:29:39
...,...,...,...
2892,103.990760,1.360780,2021-12-21 20:29:39
2893,103.990820,1.359610,2021-12-21 20:29:39
2894,103.998017,1.389153,2021-12-21 20:29:39
2895,104.001980,1.374470,2021-12-21 20:29:39


### Weather data API Interaction

This service updates once every 5 minutes

In [8]:
gov = 'https://api.data.gov.sg/v1'
weather_api = '/environment/rainfall'

url = gov+weather_api
response = requests.get(url).json()

response

{'metadata': {'stations': [{'id': 'S77',
    'device_id': 'S77',
    'name': 'Alexandra Road',
    'location': {'latitude': 1.2937, 'longitude': 103.8125}},
   {'id': 'S109',
    'device_id': 'S109',
    'name': 'Ang Mo Kio Avenue 5',
    'location': {'latitude': 1.3764, 'longitude': 103.8492}},
   {'id': 'S90',
    'device_id': 'S90',
    'name': 'Bukit Timah Road',
    'location': {'latitude': 1.3191, 'longitude': 103.8191}},
   {'id': 'S114',
    'device_id': 'S114',
    'name': 'Choa Chu Kang Avenue 4',
    'location': {'latitude': 1.38, 'longitude': 103.73}},
   {'id': 'S50',
    'device_id': 'S50',
    'name': 'Clementi Road',
    'location': {'latitude': 1.3337, 'longitude': 103.7768}},
   {'id': 'S107',
    'device_id': 'S107',
    'name': 'East Coast Parkway',
    'location': {'latitude': 1.3135, 'longitude': 103.9625}},
   {'id': 'S215',
    'device_id': 'S215',
    'name': 'GEYLANG EAST CENTRAL',
    'location': {'latitude': 1.32785, 'longitude': 103.88899}},
   {'id': 'S118

In [9]:
response['items'][0]['timestamp']

'2021-12-21T20:20:00+08:00'

In [10]:
response['metadata']['stations'][0]

{'id': 'S77',
 'device_id': 'S77',
 'name': 'Alexandra Road',
 'location': {'latitude': 1.2937, 'longitude': 103.8125}}

In [11]:
response['items'][0]['readings'][0]

{'station_id': 'S77', 'value': 0}

In [12]:
{'id':response['metadata']['stations'][0]['id'], 
 'lat':response['metadata']['stations'][0]['location']['latitude'],
'lon':response['metadata']['stations'][0]['location']['longitude'],
'value':response['items'][0]['readings'][0]['value'],
'timestamp':response['items'][0]['timestamp']}

{'id': 'S77',
 'lat': 1.2937,
 'lon': 103.8125,
 'value': 0,
 'timestamp': '2021-12-21T20:20:00+08:00'}

In [13]:
weather_list = []
for index,value in enumerate(response['items'][0]['readings']):
    weather_list.append({'id':response['metadata']['stations'][index]['id'], 
 'lat':response['metadata']['stations'][index]['location']['latitude'],
'lon':response['metadata']['stations'][index]['location']['longitude'],
'value':response['items'][0]['readings'][index]['value'],
'timestamp':response['items'][0]['timestamp']})

In [14]:
pd.DataFrame(weather_list)

Unnamed: 0,id,lat,lon,value,timestamp
0,S77,1.29370,103.81250,0,2021-12-21T20:20:00+08:00
1,S109,1.37640,103.84920,0,2021-12-21T20:20:00+08:00
2,S90,1.31910,103.81910,0,2021-12-21T20:20:00+08:00
3,S114,1.38000,103.73000,0,2021-12-21T20:20:00+08:00
4,S50,1.33370,103.77680,0,2021-12-21T20:20:00+08:00
...,...,...,...,...,...
62,S69,1.37000,103.80500,0,2021-12-21T20:20:00+08:00
63,S08,1.37010,103.82710,0,2021-12-21T20:20:00+08:00
64,S116,1.28100,103.75400,0,2021-12-21T20:20:00+08:00
65,S104,1.44387,103.78538,0,2021-12-21T20:20:00+08:00


In [15]:
def parse_rainfall_data():
    '''
    This function calls the rainfall API
    and parses it into a dataframe.
    Available columns are id, lat, lon, value and timestamp
    
    value represents rainfall, where 0 is no rain.
    '''
    gov = 'https://api.data.gov.sg/v1'
    weather_api = '/environment/rainfall'

    url = gov+weather_api
    response = requests.get(url).json()

    timestamp_str = response['items'][0]['timestamp']
    timestamp = datetime.strptime(timestamp_str, '%Y-%m-%dT%H:%M:%S+08:00')
    weather_list = []
    for index,value in enumerate(response['items'][0]['readings']):
        weather_list.append({'id':response['metadata']['stations'][index]['id'], 
     'lat':response['metadata']['stations'][index]['location']['latitude'],
    'lon':response['metadata']['stations'][index]['location']['longitude'],
    'value':response['items'][0]['readings'][index]['value'],
    'timestamp': timestamp})
        
    return pd.DataFrame(weather_list)

parse_rainfall_data()

Unnamed: 0,id,lat,lon,value,timestamp
0,S77,1.29370,103.81250,0,2021-12-21 20:20:00
1,S109,1.37640,103.84920,0,2021-12-21 20:20:00
2,S90,1.31910,103.81910,0,2021-12-21 20:20:00
3,S114,1.38000,103.73000,0,2021-12-21 20:20:00
4,S50,1.33370,103.77680,0,2021-12-21 20:20:00
...,...,...,...,...,...
62,S69,1.37000,103.80500,0,2021-12-21 20:20:00
63,S08,1.37010,103.82710,0,2021-12-21 20:20:00
64,S116,1.28100,103.75400,0,2021-12-21 20:20:00
65,S104,1.44387,103.78538,0,2021-12-21 20:20:00


## Ranked distance between lat lon points - pandas apply

Given 2 sets of (lat,lon) this function is to calculate the 
haversine distance between a point and a set of fixed points 
then find the nearby pairs, ranked by distance

In [16]:
from math import radians, sin, cos, asin, sqrt

def haversine_distance(lon1, lat1, lon2, lat2):
    """
    Compute distance between two pairs of coordinates (lon1, lat1, lon2, lat2)
    See - (https://en.wikipedia.org/wiki/Haversine_formula)
    Distance is measured in kilometers when r = 6371
    r = 6371  Radius of earth in kilometers. Use 3956 for miles. Determines return value units.
    Lats and Longs are converted to radians first then computed used haversine
    """
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    radius = 6371
    return 2 * radius * asin(sqrt(a))

df = parse_rainfall_data()
df

Unnamed: 0,id,lat,lon,value,timestamp
0,S77,1.29370,103.81250,0,2021-12-21 20:20:00
1,S109,1.37640,103.84920,0,2021-12-21 20:20:00
2,S90,1.31910,103.81910,0,2021-12-21 20:20:00
3,S114,1.38000,103.73000,0,2021-12-21 20:20:00
4,S50,1.33370,103.77680,0,2021-12-21 20:20:00
...,...,...,...,...,...
62,S69,1.37000,103.80500,0,2021-12-21 20:20:00
63,S08,1.37010,103.82710,0,2021-12-21 20:20:00
64,S116,1.28100,103.75400,0,2021-12-21 20:20:00
65,S104,1.44387,103.78538,0,2021-12-21 20:20:00


In [17]:
test_lat, test_lon = 1.41720, 103.74855

df['distance']=df.apply(lambda row: haversine_distance(row.lon, row.lat, test_lon, test_lat)
                        , axis=1)
df.sort_values(by='distance')

Unnamed: 0,id,lat,lon,value,timestamp,distance
66,S100,1.41720,103.74855,0,2021-12-21 20:20:00,0.000000
34,S211,1.42918,103.75711,0,2021-12-21 20:20:00,1.637055
12,S66,1.43870,103.73630,0,2021-12-21 20:20:00,2.751303
33,S210,1.44003,103.76904,0,2021-12-21 20:20:00,3.410601
3,S114,1.38000,103.73000,0,2021-12-21 20:20:00,4.621935
...,...,...,...,...,...,...
30,S207,1.32485,103.95836,0,2021-12-21 20:20:00,25.483672
5,S107,1.31350,103.96250,0,2021-12-21 20:20:00,26.431268
61,S24,1.36780,103.98260,0,2021-12-21 20:20:00,26.591031
46,S224,1.34392,103.98409,0,2021-12-21 20:20:00,27.421859


In [110]:
def point_to_set_ranked_distance(test_lat, test_lon, df, cutoff=10):
    '''
    Provide a test latitude and longitude to compare with 
    a dataframe df, which has columns 'lat' and 'lon'
    '''
    df = df[['lat','lon']]
    df['distance']=df.apply(lambda row: haversine_distance(row.lon, row.lat, test_lon, test_lat)
                        , axis=1)
    return df.sort_values(by='distance').iloc[:cutoff]

In [113]:
df = df.drop(columns=['distance'])
point_to_set_ranked_distance(1.3137, 103.99, df)

Unnamed: 0,lat,lon,distance
31,1.3136,104.00317,1.464095
5,1.3135,103.9625,3.057138
46,1.34392,103.98409,3.423933
30,1.32485,103.95836,3.729395
35,1.31835,103.93574,6.053966
54,1.3437,103.9444,6.068271
61,1.3678,103.9826,6.07163
21,1.3662,103.9528,7.15402
16,1.30648,103.9104,8.885147
22,1.30703,103.89067,11.066984


So far this is OK-fast for comparing 1 point with a set of fixed points (e.g. a taxi with all taxi-stands).
Doing this for all taxis (and all taxi stands) might be time consuming (1-2 minutes?)

## Day 3


### Taxi Stop Data

This data is avaiable as a GeoJSON in:
https://data.gov.sg/dataset/lta-taxi-stop?resource_id=113b362f-ffc7-4e06-adb1-1c06e8b8ae90



In [20]:
import json

with open('../raw_data/lta-taxi-stop-geojson.geojson') as geofile:
    taxi_stands_json = json.load(geofile)

taxi_stands_json

{'type': 'FeatureCollection',
 'crs': {'type': 'name',
  'properties': {'name': 'urn:ogc:def:crs:OGC:1.3:CRS84'}},
 'features': [{'type': 'Feature',
   'properties': {'Name': 'kml_1',
    'Description': '<center><table><tr><th colspan=\'2\' align=\'center\'><em>Attributes</em></th></tr><tr bgcolor="#E3E3F3"> <th>TYPE_CD</th> <td>TSTAND</td> </tr><tr bgcolor=""> <th>TYPE_CD_DE</th> <td>TAXI STAND</td> </tr><tr bgcolor="#E3E3F3"> <th>INC_CRC</th> <td>58C6A265B6301AC8</td> </tr><tr bgcolor=""> <th>FMEL_UPD_D</th> <td>20200722155614</td> </tr></table></center>'},
   'geometry': {'type': 'Point',
    'coordinates': [103.844358266128, 1.28126053659933, 0.0]}},
  {'type': 'Feature',
   'properties': {'Name': 'kml_2',
    'Description': '<center><table><tr><th colspan=\'2\' align=\'center\'><em>Attributes</em></th></tr><tr bgcolor="#E3E3F3"> <th>TYPE_CD</th> <td>TSTAND</td> </tr><tr bgcolor=""> <th>TYPE_CD_DE</th> <td>TAXI STAND</td> </tr><tr bgcolor="#E3E3F3"> <th>INC_CRC</th> <td>F771A37F645

I will now parse the contents so as to keep the Name and Coordinates of taxi stands

In [22]:
taxi_stands_dict = []

for ts in taxi_stands_json['features']:
    taxi_stands_dict.append( {'ts_id':ts['properties']['Name'],'lat':ts['geometry']['coordinates'][1] ,
                                                'lon':ts['geometry']['coordinates'][0]})
ts_df = pd.DataFrame(taxi_stands_dict)
ts_df.head()

Unnamed: 0,ts_id,lat,lon
0,kml_1,1.281261,103.844358
1,kml_2,1.281409,103.847446
2,kml_3,1.310559,103.84751
3,kml_4,1.282391,103.84527
4,kml_5,1.282479,103.846251


### Ranked distance between lat lon points - numpy broadcasting

Given 2 sets of (lat,lon) this function is to calculate the 
haversine distance between a point and a set of fixed points 
then find the nearby pairs, ranked by distance

In [65]:
def find_nearest_taxi_stand(taxi_lat=1.281261,taxi_lon=103.846358):
    '''
    Given all the static positions of the nearby taxi stands
    we can get the distance with all of them, and return the nearest 10 taxi stands.
    
    We can assume that a taxi is IN the taxi stand when the distance is < 100 m = 0.1 km
    
    Python wise we'll be using broadcasting method, so we do the difference of all taxi stands (ts)
    with a given taxi_lat and taxi_lon in one shot, not in a for loop.
    '''

    taxi_lat_rad = np.deg2rad(taxi_lat)
    taxi_lon_rad = np.deg2rad(taxi_lon)

    ts_lat = np.array(ts_df['lat'].tolist())
    ts_lat_rad = np.deg2rad(ts_lat)

    ts_lon = np.array(ts_df['lon'].tolist())
    ts_lon_rad = np.deg2rad(ts_lon)

    dlat = ts_lat_rad - taxi_lat_rad
    dlon = ts_lon_rad - taxi_lon_rad

    d = np.sin(dlat/2)**2 + np.cos(ts_lat_rad)*np.cos(taxi_lat_rad) * np.sin(dlon/2)**2
    distance =  2 * 6371 * np.arcsin(np.sqrt(d))

    df = ts_df.copy()
    df['distance']=distance
    df.sort_values(by='distance', inplace=True)
    return df.iloc[:10]

taxi_lat = 1.2732
taxi_long = 103.9435

find_nearest_taxi_stand(taxi_lat, taxi_lon)

Unnamed: 0,ts_id,lat,lon,distance
203,kml_204,1.273704,103.844904,0.171066
201,kml_202,1.274667,103.847052,0.180481
202,kml_203,1.275701,103.846129,0.279302
206,kml_207,1.273448,103.843802,0.28553
205,kml_206,1.275052,103.843317,0.395832
204,kml_205,1.27418,103.842683,0.422783
247,kml_248,1.275872,103.843248,0.455838
188,kml_189,1.277374,103.847483,0.480658
189,kml_190,1.277435,103.848901,0.549259
194,kml_195,1.278369,103.845087,0.591854


### Count taxis in taxi_stands

Here we'll use the available taxi information to find all the taxis that are already near (<100m) taxistands.
I will have a counter for each timestamp, so that we can return a timestamped dataframe with the taxi count for each taxi stand. I expect most to be zero.


In [116]:
taxi_url = "https://api.data.gov.sg/v1/transport/taxi-availability"
r = requests.get(taxi_url)
coordinates = r.json()["features"][0]["geometry"]["coordinates"]
timestamp_str = r.json()['features'][0]['properties']['timestamp']
timestamp = datetime.strptime(timestamp_str, '%Y-%m-%dT%H:%M:%S+08:00')

In [None]:
ts_counter = dict(zip(ts_df['ts_id'].tolist(),[0 for _ in ts_df['ts_id'].tolist()]))

for taxi_coordinates in coordinates:
    lon,lat = taxi_coordinates
    d_df = find_nearest_taxi_stand(lat,lon)
    d_df = d_df[d_df['distance']<0.1]
    for ts in d_df['ts_id'].tolist():
        ts_counter[ts] += 1

In [117]:
tmp_taxi_stand_counter = pd.DataFrame.from_dict(ts_counter, orient='index')
tmp_taxi_stand_counter.reset_index(inplace=True)
tmp_taxi_stand_counter['timestamp'] = timestamp
tmp_taxi_stand_counter = tmp_taxi_stand_counter.rename(columns={0:'taxi_count', 'index':'ts_id'})
# tmp_taxi_stand_counter
ts_df.merge(tmp_taxi_stand_counter)

Unnamed: 0,ts_id,lat,lon,taxi_count,timestamp
0,kml_1,1.281261,103.844358,0,2021-12-21 21:55:43
1,kml_2,1.281409,103.847446,0,2021-12-21 21:55:43
2,kml_3,1.310559,103.847510,0,2021-12-21 21:55:43
3,kml_4,1.282391,103.845270,0,2021-12-21 21:55:43
4,kml_5,1.282479,103.846251,0,2021-12-21 21:55:43
...,...,...,...,...,...
345,kml_346,1.305252,103.851618,0,2021-12-21 21:55:43
346,kml_347,1.312201,103.837586,1,2021-12-21 21:55:43
347,kml_348,1.276219,103.820254,0,2021-12-21 21:55:43
348,kml_349,1.341340,103.961546,1,2021-12-21 21:55:43


In [115]:
r.json()['features'][0]['properties']['timestamp']

'2021-12-21T21:28:41+08:00'

In [118]:
def get_taxi_stands():
    taxi_stands_dict = []
    for ts in taxi_stands_json['features']:
        taxi_stands_dict.append( {'ts_id':ts['properties']['Name'],'lat':ts['geometry']['coordinates'][1] ,
                                                    'lon':ts['geometry']['coordinates'][0]})
    ts_df = pd.DataFrame(taxi_stands_dict)
    return ts_df

def count_taxis_in_ts():
    taxi_url = "https://api.data.gov.sg/v1/transport/taxi-availability"
    r = requests.get(taxi_url)
    coordinates = r.json()["features"][0]["geometry"]["coordinates"]
    timestamp_str = r.json()['features'][0]['properties']['timestamp']
    timestamp = datetime.strptime(timestamp_str, '%Y-%m-%dT%H:%M:%S+08:00')
    
    ts_counter = dict(zip(ts_df['ts_id'].tolist(),[0 for _ in ts_df['ts_id'].tolist()]))

    for taxi_coordinates in coordinates:
        lon,lat = taxi_coordinates
        d_df = find_nearest_taxi_stand(lat,lon)
        d_df = d_df[d_df['distance']<0.1]
        for ts in d_df['ts_id'].tolist():
            ts_counter[ts] += 1
    
    tmp_taxi_stand_counter = pd.DataFrame.from_dict(ts_counter, orient='index')
    tmp_taxi_stand_counter.reset_index(inplace=True)
    tmp_taxi_stand_counter['timestamp'] = timestamp
    tmp_taxi_stand_counter = tmp_taxi_stand_counter.rename(columns={0:'taxi_count', 'index':'ts_id'})
    # tmp_taxi_stand_counter
    return ts_df.merge(tmp_taxi_stand_counter)

ts_df = get_taxi_stands()
ts_taxi_df = count_taxis_in_ts()
ts_taxi_df

Unnamed: 0,ts_id,lat,lon,taxi_count,timestamp
0,kml_1,1.281261,103.844358,2,2021-12-23 10:36:37
1,kml_2,1.281409,103.847446,1,2021-12-23 10:36:37
2,kml_3,1.310559,103.847510,17,2021-12-23 10:36:37
3,kml_4,1.282391,103.845270,4,2021-12-23 10:36:37
4,kml_5,1.282479,103.846251,4,2021-12-23 10:36:37
...,...,...,...,...,...
345,kml_346,1.305252,103.851618,1,2021-12-23 10:36:37
346,kml_347,1.312201,103.837586,1,2021-12-23 10:36:37
347,kml_348,1.276219,103.820254,3,2021-12-23 10:36:37
348,kml_349,1.341340,103.961546,0,2021-12-23 10:36:37
