# Taxi Compass
by Alejandro Seif

# **Day 1: Data Collection**

## LTA Taxi Availability data API

* Import statement

In [19]:
import requests
import pandas as pd
import time
from datetime import datetime

### Sequential Taxi Data API Interaction

This information updates every 50-60 seconds, so the service should be called at most once per minute.

In [30]:
# Base URL of the data.gov.sg v1 API and the taxi-availability endpoint path.
gov = 'https://api.data.gov.sg/v1'
taxi_url = '/transport/taxi-availability'

url = gov + taxi_url

# requests applies no timeout by default, so an unresponsive server would
# hang the kernel. Fail fast instead, and surface HTTP error statuses
# before trying to parse the body as JSON.
http_response = requests.get(url, timeout=30)
http_response.raise_for_status()
response = http_response.json()

response

{'type': 'FeatureCollection',
 'crs': {'type': 'link',
  'properties': {'href': 'http://spatialreference.org/ref/epsg/4326/ogcwkt/',
   'type': 'ogcwkt'}},
 'features': [{'type': 'Feature',
   'geometry': {'type': 'MultiPoint',
    'coordinates': [[103.619061166667, 1.27906536666667],
     [103.62537, 1.27539],
     [103.62537, 1.2758],
     [103.63092, 1.32002],
     [103.64278, 1.32967],
     [103.65751, 1.31602],
     [103.665565833333, 1.30568033333333],
     [103.67227, 1.33002],
     [103.6782, 1.33033],
     [103.68365, 1.34837],
     [103.688734833333, 1.34106733333333],
     [103.69187, 1.34723],
     [103.69235, 1.34293],
     [103.69255, 1.34682],
     [103.69305405, 1.34205151666667],
     [103.69314, 1.31112],
     [103.694180766667, 1.34476625],
     [103.694837216667, 1.34708483333333],
     [103.69604995, 1.34574571666667],
     [103.696065, 1.34568116666667],
     [103.69641, 1.33583],
     [103.69642, 1.33],
     [103.69719, 1.34],
     [103.69811, 1.33],
     [103.69

In [31]:
# The snapshot time lives in the properties of the first feature.
feature_properties = response['features'][0]['properties']
timestamp_str = feature_properties['timestamp']

# The '+08:00' suffix is matched as literal text, so the parsed value is a
# naive datetime (no tzinfo attached).
timestamp = datetime.strptime(timestamp_str, '%Y-%m-%dT%H:%M:%S+08:00')
print(timestamp)

2021-12-16 19:47:18


In [47]:
# Each coordinate entry is a two-element list; position 0 is used as 'lon'
# and position 1 as 'lat'.
taxi_coordinates = response['features'][0]['geometry']['coordinates']
taxi_coordinates_df = pd.DataFrame(taxi_coordinates, columns=['lon', 'lat'])
taxi_coordinates_df['timestamp'] = timestamp

# astype returns a new frame rather than converting in place; assign it back,
# otherwise the float32 downcast is silently lost.
taxi_coordinates_df = taxi_coordinates_df.astype({'lat': 'float32', 'lon': 'float32'})
taxi_coordinates_df

Unnamed: 0,lon,lat,timestamp
0,103.619061,1.279065,2021-12-16 19:47:18
1,103.625370,1.275390,2021-12-16 19:47:18
2,103.625370,1.275800,2021-12-16 19:47:18
3,103.630920,1.320020,2021-12-16 19:47:18
4,103.642780,1.329670,2021-12-16 19:47:18
...,...,...,...
2525,103.988805,1.361094,2021-12-16 19:47:18
2526,103.988860,1.360980,2021-12-16 19:47:18
2527,103.988890,1.361210,2021-12-16 19:47:18
2528,103.989050,1.360000,2021-12-16 19:47:18


Next, we wrap the steps above in a function that retrieves a fresh snapshot and appends it to an existing DataFrame, so that it can be called repeatedly.

In [46]:
def taxi_coordinates_append(df):
    '''
    Fetch the current taxi-availability snapshot and append it to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Existing frame to extend. It is not modified.

    Returns
    -------
    pandas.DataFrame
        ``df`` followed by one row per coordinate pair of the new snapshot.
        The new rows have columns 'lon' and 'lat' (built as float32) and
        'timestamp' (a naive datetime). Row labels are not reset, so index
        values may repeat in the result.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    requests.Timeout
        If the API does not answer within the timeout.
    '''
    gov = 'https://api.data.gov.sg/v1'
    taxi_url = '/transport/taxi-availability'

    url = gov + taxi_url
    # requests applies no timeout by default; fail fast rather than hang,
    # and surface HTTP errors before parsing the body as JSON.
    http_response = requests.get(url, timeout=30)
    http_response.raise_for_status()
    response = http_response.json()

    feature = response['features'][0]
    timestamp_str = feature['properties']['timestamp']
    # '+08:00' is matched as literal text, so the result carries no tzinfo.
    timestamp = datetime.strptime(timestamp_str, '%Y-%m-%dT%H:%M:%S+08:00')

    # Position 0 of each coordinate entry is used as 'lon', position 1 as 'lat'.
    taxi_coordinates = feature['geometry']['coordinates']
    temp_taxi_coordinates_df = pd.DataFrame(taxi_coordinates, columns=['lon', 'lat'])
    temp_taxi_coordinates_df['timestamp'] = timestamp
    # astype returns a new frame; without the assignment the downcast to
    # float32 never takes effect.
    temp_taxi_coordinates_df = temp_taxi_coordinates_df.astype(
        {'lat': 'float32', 'lon': 'float32'}
    )
    return pd.concat([df, temp_taxi_coordinates_df])
    

In [49]:
# Fetch a new snapshot and display it appended to taxi_coordinates_df.
# The returned frame is only displayed here, not assigned to a variable.
taxi_coordinates_append(taxi_coordinates_df)

Unnamed: 0,lon,lat,timestamp
0,103.619061,1.279065,2021-12-16 19:47:18
1,103.625370,1.275390,2021-12-16 19:47:18
2,103.625370,1.275800,2021-12-16 19:47:18
3,103.630920,1.320020,2021-12-16 19:47:18
4,103.642780,1.329670,2021-12-16 19:47:18
...,...,...,...
3081,103.988460,1.360600,2021-12-16 20:13:49
3082,103.988900,1.360000,2021-12-16 20:13:49
3083,103.990070,1.360900,2021-12-16 20:13:49
3084,103.998130,1.382430,2021-12-16 20:13:49


### Weather data API Interaction

This service updates once every 5 minutes

In [60]:
# Base URL of the data.gov.sg v1 API and the rainfall endpoint path.
gov = 'https://api.data.gov.sg/v1'
weather_api = '/environment/rainfall'

url = gov + weather_api

# requests applies no timeout by default, so an unresponsive server would
# hang the kernel. Fail fast instead, and surface HTTP error statuses
# before trying to parse the body as JSON.
http_response = requests.get(url, timeout=30)
http_response.raise_for_status()
# Note: this rebinds `response`, which earlier cells used for the taxi payload.
response = http_response.json()

response

{'metadata': {'stations': [{'id': 'S77',
    'device_id': 'S77',
    'name': 'Alexandra Road',
    'location': {'latitude': 1.2937, 'longitude': 103.8125}},
   {'id': 'S109',
    'device_id': 'S109',
    'name': 'Ang Mo Kio Avenue 5',
    'location': {'latitude': 1.3764, 'longitude': 103.8492}},
   {'id': 'S90',
    'device_id': 'S90',
    'name': 'Bukit Timah Road',
    'location': {'latitude': 1.3191, 'longitude': 103.8191}},
   {'id': 'S114',
    'device_id': 'S114',
    'name': 'Choa Chu Kang Avenue 4',
    'location': {'latitude': 1.38, 'longitude': 103.73}},
   {'id': 'S50',
    'device_id': 'S50',
    'name': 'Clementi Road',
    'location': {'latitude': 1.3337, 'longitude': 103.7768}},
   {'id': 'S107',
    'device_id': 'S107',
    'name': 'East Coast Parkway',
    'location': {'latitude': 1.3135, 'longitude': 103.9625}},
   {'id': 'S215',
    'device_id': 'S215',
    'name': 'GEYLANG EAST CENTRAL',
    'location': {'latitude': 1.32785, 'longitude': 103.88899}},
   {'id': 'S118

In [61]:
# Timestamp string attached to the first entry of 'items'.
response['items'][0]['timestamp']

'2021-12-16T20:30:00+08:00'

In [76]:
# Metadata record (id, name, location) of the first listed station.
response['metadata']['stations'][0]

{'id': 'S77',
 'device_id': 'S77',
 'name': 'Alexandra Road',
 'location': {'latitude': 1.2937, 'longitude': 103.8125}}

In [82]:
# First reading of the first 'items' entry: a station_id and its value.
response['items'][0]['readings'][0]

{'station_id': 'S77', 'value': 0}

In [83]:
# Preview of a single flattened record: the first station's metadata
# combined with the first reading and the snapshot timestamp.
first_station = response['metadata']['stations'][0]
first_item = response['items'][0]

{
    'id': first_station['id'],
    'lat': first_station['location']['latitude'],
    'lon': first_station['location']['longitude'],
    'value': first_item['readings'][0]['value'],
    'timestamp': first_item['timestamp'],
}

{'id': 'S77',
 'lat': 1.2937,
 'lon': 103.8125,
 'value': 0,
 'timestamp': '2021-12-16T20:30:00+08:00'}

In [85]:
# Index the station metadata by id so each reading is paired with its own
# station, not with whichever station happens to share its list position.
stations_by_id = {
    station['id']: station for station in response['metadata']['stations']
}
latest_item = response['items'][0]

weather_list = []
for reading in latest_item['readings']:
    station = stations_by_id.get(reading['station_id'])
    # A reading without matching metadata keeps its value but gets no location.
    location = station['location'] if station is not None else {}
    weather_list.append({
        'id': reading['station_id'],
        'lat': location.get('latitude'),
        'lon': location.get('longitude'),
        'value': reading['value'],
        'timestamp': latest_item['timestamp'],
    })

In [87]:
# Display the flattened records as a table (one row per entry of weather_list).
pd.DataFrame(weather_list)

Unnamed: 0,id,lat,lon,value,timestamp
0,S77,1.29370,103.81250,0,2021-12-16T20:30:00+08:00
1,S109,1.37640,103.84920,0,2021-12-16T20:30:00+08:00
2,S90,1.31910,103.81910,0,2021-12-16T20:30:00+08:00
3,S114,1.38000,103.73000,0,2021-12-16T20:30:00+08:00
4,S50,1.33370,103.77680,0,2021-12-16T20:30:00+08:00
...,...,...,...,...,...
62,S69,1.37000,103.80500,0,2021-12-16T20:30:00+08:00
63,S08,1.37010,103.82710,0,2021-12-16T20:30:00+08:00
64,S116,1.28100,103.75400,0,2021-12-16T20:30:00+08:00
65,S104,1.44387,103.78538,0,2021-12-16T20:30:00+08:00


In [90]:
def parse_rainfall_data():
    '''
    Call the rainfall API and parse the response into a dataframe.

    Returns
    -------
    pandas.DataFrame
        One row per reading of the first 'items' entry, with columns
        id, lat, lon, value and timestamp. value represents rainfall,
        where 0 is no rain. timestamp is a naive datetime. lat and lon
        are None for a reading whose station has no metadata entry.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    requests.Timeout
        If the API does not answer within the timeout.
    '''
    gov = 'https://api.data.gov.sg/v1'
    weather_api = '/environment/rainfall'

    url = gov + weather_api
    # requests applies no timeout by default; fail fast rather than hang,
    # and surface HTTP errors before parsing the body as JSON.
    http_response = requests.get(url, timeout=30)
    http_response.raise_for_status()
    response = http_response.json()

    latest_item = response['items'][0]
    # '+08:00' is matched as literal text, so the result carries no tzinfo.
    timestamp = datetime.strptime(latest_item['timestamp'], '%Y-%m-%dT%H:%M:%S+08:00')

    # Pair readings with stations through station_id instead of list position,
    # so a missing or reordered entry cannot shift locations onto the wrong
    # station.
    stations_by_id = {
        station['id']: station for station in response['metadata']['stations']
    }

    weather_list = []
    for reading in latest_item['readings']:
        station = stations_by_id.get(reading['station_id'])
        location = station['location'] if station is not None else {}
        weather_list.append({
            'id': reading['station_id'],
            'lat': location.get('latitude'),
            'lon': location.get('longitude'),
            'value': reading['value'],
            'timestamp': timestamp,
        })

    # Naming the columns keeps the schema stable even when there are no readings.
    return pd.DataFrame(weather_list, columns=['id', 'lat', 'lon', 'value', 'timestamp'])

# Smoke-test the function: performs a live API call and displays the result.
parse_rainfall_data()

Unnamed: 0,id,lat,lon,value,timestamp
0,S77,1.29370,103.81250,0,2021-12-17 19:05:00
1,S109,1.37640,103.84920,0,2021-12-17 19:05:00
2,S90,1.31910,103.81910,0,2021-12-17 19:05:00
3,S114,1.38000,103.73000,0,2021-12-17 19:05:00
4,S50,1.33370,103.77680,0,2021-12-17 19:05:00
...,...,...,...,...,...
62,S69,1.37000,103.80500,0,2021-12-17 19:05:00
63,S08,1.37010,103.82710,0,2021-12-17 19:05:00
64,S116,1.28100,103.75400,0,2021-12-17 19:05:00
65,S104,1.44387,103.78538,0,2021-12-17 19:05:00


## Ranked distance between lat lon points

Given a single (lat, lon) point and a set of fixed (lat, lon) points,
the functions below compute the haversine distance from the point to each fixed point
and then return the nearest fixed points, ranked by distance.

In [93]:
from math import radians, sin, cos, asin, sqrt

def haversine_distance(lon1, lat1, lon2, lat2):
    """
    Great-circle distance between the points (lon1, lat1) and (lon2, lat2).

    All four arguments are angles in degrees. The distance is computed on a
    sphere of radius 6371 (Earth's mean radius in km), so the result is in
    kilometres.
    See - (https://en.wikipedia.org/wiki/Haversine_formula)
    """
    sphere_radius = 6371
    lam1, phi1, lam2, phi2 = (radians(angle) for angle in (lon1, lat1, lon2, lat2))

    # Haversine term: squared half-chord length between the two points.
    delta_lam = lam2 - lam1
    delta_phi = phi2 - phi1
    hav = sin(delta_phi / 2) ** 2 + cos(phi1) * cos(phi2) * sin(delta_lam / 2) ** 2

    return 2 * sphere_radius * asin(sqrt(hav))

# Fetch a rainfall snapshot; its station rows serve as the set of fixed
# points for the distance experiments in the following cells.
df = parse_rainfall_data()
df

Unnamed: 0,id,lat,lon,value,timestamp
0,S77,1.29370,103.81250,0,2021-12-17 19:20:00
1,S109,1.37640,103.84920,0,2021-12-17 19:20:00
2,S90,1.31910,103.81910,0,2021-12-17 19:20:00
3,S114,1.38000,103.73000,0,2021-12-17 19:20:00
4,S50,1.33370,103.77680,0,2021-12-17 19:20:00
...,...,...,...,...,...
62,S69,1.37000,103.80500,0,2021-12-17 19:20:00
63,S08,1.37010,103.82710,0,2021-12-17 19:20:00
64,S116,1.28100,103.75400,0,2021-12-17 19:20:00
65,S104,1.44387,103.78538,0,2021-12-17 19:20:00


In [109]:
# Reference point that every row of df is measured against.
test_lat = 1.41720
test_lon = 103.74855


def _distance_to_test_point(row):
    """Haversine distance from one row of df to (test_lat, test_lon)."""
    return haversine_distance(row.lon, row.lat, test_lon, test_lat)


# Adds a 'distance' column to df in place, then displays the rows nearest first.
df['distance'] = df.apply(_distance_to_test_point, axis=1)
df.sort_values(by='distance')

Unnamed: 0,lat,lon,distance
66,1.41720,103.74855,0.000000
34,1.42918,103.75711,1.637055
12,1.43870,103.73630,2.751303
33,1.44003,103.76904,3.410601
3,1.38000,103.73000,4.621935
...,...,...,...
30,1.32485,103.95836,25.483672
5,1.31350,103.96250,26.431268
61,1.36780,103.98260,26.591031
46,1.34392,103.98409,27.421859


In [110]:
def point_to_set_ranked_distance(test_lat, test_lon, df, cutoff=10):
    '''
    Rank the rows of ``df`` by haversine distance to a test point.

    Parameters
    ----------
    test_lat, test_lon : float
        Latitude and longitude of the test point, in degrees.
    df : pandas.DataFrame
        Frame with columns 'lat' and 'lon'. It is not modified.
    cutoff : int, default 10
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Columns 'lat', 'lon' and 'distance' for the ``cutoff`` closest rows,
        sorted by increasing distance and keeping their original index labels.
    '''
    # Work on an explicit copy: adding a column to the bare df[['lat', 'lon']]
    # selection triggers pandas' SettingWithCopyWarning.
    ranked = df[['lat', 'lon']].copy()

    # Iterating the two columns directly also copes with an empty frame,
    # where a row-wise apply can hand back a frame instead of a Series.
    distances = [
        haversine_distance(lon, lat, test_lon, test_lat)
        for lon, lat in zip(ranked['lon'], ranked['lat'])
    ]
    ranked['distance'] = pd.Series(distances, index=ranked.index, dtype='float64')

    return ranked.sort_values(by='distance').iloc[:cutoff]

In [113]:
# Remove the 'distance' column that the exploration cell added to df.
# errors='ignore' keeps this cell re-runnable: without it, a second run
# raises KeyError because the column is already gone.
df = df.drop(columns=['distance'], errors='ignore')
point_to_set_ranked_distance(1.3137, 103.99, df)

Unnamed: 0,lat,lon,distance
31,1.3136,104.00317,1.464095
5,1.3135,103.9625,3.057138
46,1.34392,103.98409,3.423933
30,1.32485,103.95836,3.729395
35,1.31835,103.93574,6.053966
54,1.3437,103.9444,6.068271
61,1.3678,103.9826,6.07163
21,1.3662,103.9528,7.15402
16,1.30648,103.9104,8.885147
22,1.30703,103.89067,11.066984


So far this is fast enough for comparing one point with a set of fixed points (e.g. one taxi against all taxi stands).
Doing this for every taxi against every taxi stand might be time-consuming (1-2 minutes?).

## Day 3


### Taxi Stop Data

This data is available as a GeoJSON file at:
https://data.gov.sg/dataset/lta-taxi-stop?resource_id=113b362f-ffc7-4e06-adb1-1c06e8b8ae90



In [8]:
import json

# GeoJSON is UTF-8 by specification. Naming the encoding keeps the load
# independent of the platform's default locale encoding.
with open('../raw_data/lta-taxi-stop-geojson.geojson', encoding='utf-8') as geofile:
    taxi_stands_json = json.load(geofile)

taxi_stands_json

{'type': 'FeatureCollection',
 'crs': {'type': 'name',
  'properties': {'name': 'urn:ogc:def:crs:OGC:1.3:CRS84'}},
 'features': [{'type': 'Feature',
   'properties': {'Name': 'kml_1',
    'Description': '<center><table><tr><th colspan=\'2\' align=\'center\'><em>Attributes</em></th></tr><tr bgcolor="#E3E3F3"> <th>TYPE_CD</th> <td>TSTAND</td> </tr><tr bgcolor=""> <th>TYPE_CD_DE</th> <td>TAXI STAND</td> </tr><tr bgcolor="#E3E3F3"> <th>INC_CRC</th> <td>58C6A265B6301AC8</td> </tr><tr bgcolor=""> <th>FMEL_UPD_D</th> <td>20200722155614</td> </tr></table></center>'},
   'geometry': {'type': 'Point',
    'coordinates': [103.844358266128, 1.28126053659933, 0.0]}},
  {'type': 'Feature',
   'properties': {'Name': 'kml_2',
    'Description': '<center><table><tr><th colspan=\'2\' align=\'center\'><em>Attributes</em></th></tr><tr bgcolor="#E3E3F3"> <th>TYPE_CD</th> <td>TSTAND</td> </tr><tr bgcolor=""> <th>TYPE_CD_DE</th> <td>TAXI STAND</td> </tr><tr bgcolor="#E3E3F3"> <th>INC_CRC</th> <td>F771A37F645

I will now parse the contents so as to keep the Name and Coordinates of taxi stands

In [20]:
# Keep only each stand's name and position. The coordinates are stored
# longitude first, so index 1 is the latitude and index 0 the longitude.
# (Despite its name, taxi_stands_dict is a list of dicts.)
taxi_stands_dict = [
    {
        'ts_id': feature['properties']['Name'],
        'lat': feature['geometry']['coordinates'][1],
        'lon': feature['geometry']['coordinates'][0],
    }
    for feature in taxi_stands_json['features']
]
pd.DataFrame(taxi_stands_dict)

Unnamed: 0,ts_id,lat,lon
0,kml_1,1.281261,103.844358
1,kml_2,1.281409,103.847446
2,kml_3,1.310559,103.847510
3,kml_4,1.282391,103.845270
4,kml_5,1.282479,103.846251
...,...,...,...
345,kml_346,1.305252,103.851618
346,kml_347,1.312201,103.837586
347,kml_348,1.276219,103.820254
348,kml_349,1.341340,103.961546
