In [1]:
import pandas as pd
import numpy as np
import requests
import math
import datetime

from google.cloud import bigquery, storage

from sklearn.model_selection import train_test_split

# Function get_time_distance and get_nearest_taxi_stand

In [2]:
# only for 3 days, need to generate again
ONEMAP_API_TOKEN = "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJzdWIiOjgyNjgsInVzZXJfaWQiOjgyNjgsImVtYWlsIjoianByYW1vbm8wMUBnbWFpbC5jb20iLCJmb3JldmVyIjpmYWxzZSwiaXNzIjoiaHR0cDpcL1wvb20yLmRmZS5vbmVtYXAuc2dcL2FwaVwvdjJcL3VzZXJcL3Nlc3Npb24iLCJpYXQiOjE2Mzk3MzA4OTAsImV4cCI6MTY0MDE2Mjg5MCwibmJmIjoxNjM5NzMwODkwLCJqdGkiOiJiMmNiZjk2ZTE0NmNmMjZjYzdmMGMyMTIwYWUyYWM2NCJ9.X3kCImlS9jyAz2d0G_I2QK9Z5P0QTRnDoLoWcrtBLcA"
LAT_1KM = 5/110
LON_1KM = 5/111.320*math.cos(1.3)

In [9]:
def get_time_distance(start_lat, start_lon, end_lat, end_lon):
    start = str(start_lat)+','+str(start_lon)
    end = str(end_lat)+','+str(end_lon)
    url = "https://developers.onemap.sg/privateapi/routingsvc/route"
    params = {"start" : start,
             "end": end,
             "routeType" : "drive",
             "token": ONEMAP_API_TOKEN}
    r = requests.get(url=url, params=params)
    total_distance = round(r.json()["route_summary"]["total_distance"]/1000,2) # in km
    total_time = round(r.json()["route_summary"]["total_time"]/60,2) # in mins
    return (total_time, total_distance)

In [30]:
def get_planning_area_loc (lat, lon):
    get_pln_loc_url = "https://developers.onemap.sg/privateapi/popapi/getPlanningarea"
    params = {'token':ONEMAP_API_TOKEN, 'lat': lat, 'lng': lon}
    response = requests.get(get_pln_loc_url, params = params)
    if response.status_code != 200:
        raise ValueError
    pln_area_loc = response.json()
    return pln_area_loc[0]['pln_area_n']
get_planning_area_loc(1.319728,103.8421)

'NOVENA'

In [10]:
get_time_distance(1.319728,103.8421,1.326762,103.8559)

(7.47, 3.06)

In [27]:
start_lat = 1.319728
start_lon = 103.8421

In [48]:
end = {"lat" : [(1.326762 + i*0.0091) for i in range(5)],
       "lon" : [(103.8559 + i*0.0024) for i in range(5)]}
end_df = pd.DataFrame(end)
end_df

Unnamed: 0,lat,lon
0,1.326762,103.8559
1,1.335862,103.8583
2,1.344962,103.8607
3,1.354062,103.8631
4,1.363162,103.8655


In [25]:
def get_nearest_taxi_stand(taxi_lat, taxi_lon, stand_df, top=5):
    stand_df["total_distance"] = stand_df.apply(lambda x : get_time_distance(start_lat, start_lon, 
                                                                             x["lat"], x["lon"])[1], axis = 1)
    stand_df = stand_df.sort_values(by="total_distance", ascending=False).reset_index(drop=True)
    return stand_df.iloc[:top]

In [49]:
get_nearest_taxi_stand(start_lat, start_lon, end_df, top=3)

Unnamed: 0,lat,lon,total_distance
0,1.363162,103.8655,8.53
1,1.354062,103.8631,7.38
2,1.344962,103.8607,5.0


# ETL for GCP

## parse taxi availability data

In [2]:
taxi_url = "https://api.data.gov.sg/v1/transport/taxi-availability"
r = requests.get(taxi_url)
coordinates = r.json()["features"][0]["geometry"]["coordinates"]
timestamp = r.json()["features"][0]["properties"]["timestamp"]
taxi_available = pd.DataFrame(np.array(coordinates), columns=["lon","lat"])
taxi_available["update_time"] = str(datetime.datetime.strptime(timestamp,"%Y-%m-%dT%H:%M:%S+08:00"))
len(taxi_available), len(taxi_available.dropna())

KeyError: 'features'

In [52]:
time = datetime.datetime.now() + datetime.timedelta(hours=8)
rows = 0
dataset = []
while True:
    uri = f'http://datamall2.mytransport.sg/ltaodataservice/Taxi-Availability?$skip={rows}'
    headers = { 'AccountKey' : 'BehS/IpVR0KOFQ+BgFqM5g==',
    'accept' : 'application/json'} #this is by default
    # requests.get(url=uri, headers=headers).json()
    if len(requests.get(url=uri, headers=headers).json()["value"]) == 0:
        break
    dataset = dataset + requests.get(url=uri, headers=headers).json()["value"]
    rows += 500
taxi_available = pd.DataFrame(dataset).rename(columns={"Longitude": "lon", "Latitude": "lat"})
taxi_available["update_time"] = time.strftime("%Y-%m-%d %H:%M:%S")
taxi_available.head()

Unnamed: 0,lon,lat,update_time
0,103.84585,1.29118,2021-12-29 03:52:15
1,103.76713,1.31342,2021-12-29 03:52:15
2,103.80222,1.43747,2021-12-29 03:52:15
3,103.81067,1.27713,2021-12-29 03:52:15
4,103.81448,1.32908,2021-12-29 03:52:15


In [51]:
time.strftime("%Y-%m-%d %H:%M:%S")

'2021-12-28 19:37:02'

In [47]:
taxi_available.columns

Index(['lon', 'lat', 'update_time'], dtype='object')

## parse rainfall data

In [24]:
def parse_rainfall_data(request):
    '''
    This function calls the rainfall API
    and parses it into a dataframe.
    Available columns are id, lat, lon, value and timestamp
    
    value represents rainfall, where 0 is no rain.
    '''
    gov = 'https://api.data.gov.sg/v1'
    weather_api = '/environment/rainfall'

    url = gov+weather_api
    response = requests.get(url).json()

    timestamp_str = response['items'][0]['timestamp']
    timestamp = str(datetime.datetime.strptime(timestamp_str, '%Y-%m-%dT%H:%M:%S+08:00'))
    weather_list = []
    for index,value in enumerate(response['items'][0]['readings']):
        weather_list.append(
            {'station_id':response['metadata']['stations'][index]['id'], 
             'station_lat':response['metadata']['stations'][index]['location']['latitude'],
             'station_lon':response['metadata']['stations'][index]['location']['longitude'],
             'rainfall':float(response['items'][0]['readings'][index]['value']),
             'update_time': timestamp})
        
    weather_df = pd.DataFrame(weather_list)
    
    client = bigquery.Client(project='taxi-compass-lewagon')
    table_id = 'api_dataset.h_weather_rainfall'
    
    job = client.load_table_from_dataframe(
        weather_df, table_id
    )

    job.result()  # Wait for the job to complete.

    table = client.get_table(table_id)  # Make an API request.
    print(
        "Loaded {} rows and {} columns to {}".format(
            table.num_rows, len(table.schema), table_id
        )
    )
    return ("Done!", 200)

# Data Collection - Public Transport Operating API

Fields required:
1) Timestamp  
2) All station codes  
3) lat/lon  
4) Operating hours  
5) Operating or not (Boolean)  
6) Disrupted or not (Boolean)  

Pull from Train Service Alerts API

In [2]:
url = "http://datamall2.mytransport.sg/ltaodataservice/TrainServiceAlerts"
DATA_MALL_API_ACC = "BehS/IpVR0KOFQ+BgFqM5g=="

headers = {"AccountKey" : DATA_MALL_API_ACC}

r = requests.get(url = url, headers=headers)
r.json()

{'odata.metadata': 'http://datamall2.mytransport.sg/ltaodataservice/$metadata#TrainServicesAlerts',
 'value': {'Status': 1, 'AffectedSegments': [], 'Message': []}}

In [116]:
status = r.json()["value"]["Status"]
if status != 1:
    stations_list = []
    for d in r.json()["value"]["AffectedSegments"]:
        stations_list += d["Stations"].split(",")
    station_df = pd.DataFrame(stations_list, columns=["stn_id"])
    station_df["non_disruption_bool"] = 0
    station_df

In [3]:
BUCKET_NAME = 'static-file-storage'
BUCKET_MRT_STN_LIST_PATH = 'mrtsg.csv'

# Add Client() here
storage_client = storage.Client()
path = f"gs://{BUCKET_NAME}/{BUCKET_MRT_STN_LIST_PATH}"
bucket = storage_client.bucket(BUCKET_NAME)
blob = bucket.blob(BUCKET_MRT_STN_LIST_PATH)

with blob.open('r') as mrt_csv:
    # with open(public_path) as geofile:
    '''
    Take geojson file provided by LTA where the taxi stands coordinates are provided
    '''
    mrt_list_df = pd.read_csv(mrt_csv).fillna(1)
    print('loaded mrt_list file successfully')

mrt_list_df.head()

loaded mrt_list file successfully


Unnamed: 0,stn_id,stn_name,stn_lat,stn_lon,stn_first_train,stn_last_train,in_operation_bool,non_disruption_bool,final_status,update_time
0,BP1,CHOA CHU KANG LRT STATION,1.38484,103.74458,5:13:00,23:30:00,1.0,1.0,1.0,1.0
1,BP10,FAJAR LRT STATION,1.38452,103.77083,5:04:00,23:43:00,1.0,1.0,1.0,1.0
2,BP11,SEGAR LRT STATION,1.38777,103.76962,5:03:00,23:45:00,1.0,1.0,1.0,1.0
3,BP12,JELAPANG LRT STATION,1.3867,103.76452,5:01:00,23:47:00,1.0,1.0,1.0,1.0
4,BP13,SENJA LRT STATION,1.38269,103.76239,5:00:00,23:48:00,1.0,1.0,1.0,1.0


In [4]:
mrt_list_df = mrt_list_df.set_index('stn_id')
status = r.json()["value"]["Status"]
if status != 1:
    stations_list = []
    for d in r.json()["value"]["AffectedSegments"]:
        stations_list += d["Stations"].split(",")
    station_df = pd.DataFrame(stations_list, columns=["stn_id"])
    station_df["non_disruption_bool"] = 0
    station_df = station_df.set_index('stn_id')
    mrt_list_df.update(station_df)

mrt_list_df.reset_index(inplace=True)
mrt_list_df

Unnamed: 0,stn_id,stn_name,stn_lat,stn_lon,stn_first_train,stn_last_train,in_operation_bool,non_disruption_bool,final_status,update_time
0,BP1,CHOA CHU KANG LRT STATION,1.38484,103.74458,5:13:00,23:30:00,1.0,1.0,1.0,1.0
1,BP10,FAJAR LRT STATION,1.38452,103.77083,5:04:00,23:43:00,1.0,1.0,1.0,1.0
2,BP11,SEGAR LRT STATION,1.38777,103.76962,5:03:00,23:45:00,1.0,1.0,1.0,1.0
3,BP12,JELAPANG LRT STATION,1.38670,103.76452,5:01:00,23:47:00,1.0,1.0,1.0,1.0
4,BP13,SENJA LRT STATION,1.38269,103.76239,5:00:00,23:48:00,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...
188,TE5,LENTOR MRT STATION,1.38551,103.83574,5:49:00,23:53:00,1.0,1.0,1.0,1.0
189,TE6,MAYFLOWER MRT STATION,1.37146,103.83657,5:52:00,23:50:00,1.0,1.0,1.0,1.0
190,TE7,BRIGHT HILL MRT STATION,1.36367,103.83363,5:54:00,23:48:00,1.0,1.0,1.0,1.0
191,TE8,UPPER THOMSON MRT STATION,1.35442,103.83290,5:56:00,23:47:00,1.0,1.0,1.0,1.0


In [5]:
def get_first_time(update_time, stn_first_time):
    today_date = update_time.date()
    first_time = str(today_date) + ' ' + str(stn_first_time)
    return datetime.datetime.strptime(first_time, '%Y-%m-%d %H:%M:%S')

In [6]:
def get_last_time(update_time, stn_last_train):
    tmr_date = update_time.date()  + datetime.timedelta(days = 1)
    today_date = update_time.date()
    if datetime.datetime.strptime(stn_last_train, '%H:%M:%S').time() < datetime.time(2,0):
        last_time = str(tmr_date) + ' ' + str(stn_last_train)
    else:
        last_time = str(today_date) + ' ' + str(stn_last_train)
    return datetime.datetime.strptime(last_time, '%Y-%m-%d %H:%M:%S')

In [11]:
def check_operation_bool(update_time, start_train, last_train):
    if (start_train <= update_time) and (last_train >= update_time):
        return 1.0
    return 0.0

In [8]:
update_time = (datetime.datetime.now() + datetime.timedelta(hours=8))
mrt_list_df.loc[:,"stn_first_train_dt"] = mrt_list_df.apply(lambda x : get_last_time(update_time, x["stn_first_train"]), axis=1)
mrt_list_df.loc[:,"stn_last_train_dt"] = mrt_list_df.apply(lambda x : get_last_time(update_time, x["stn_last_train"]), axis=1)
mrt_list_df.loc[:,"in_operation_bool"] = mrt_list_df.apply(lambda x : check_operation_bool(update_time, x["stn_first_train_dt"], 
                                                                                     x["stn_last_train_dt"]), axis=1)
mrt_list_df["final_status"] = mrt_list_df["in_operation_bool"] * mrt_list_df["non_disruption_bool"]
mrt_list_df["update_time"] = update_time.strftime("%Y-%m-%d %H:%M:%S")
mrt_list_df = mrt_list_df.drop(columns=["stn_first_train_dt","stn_last_train_dt"])
mrt_list_df

Unnamed: 0,stn_id,stn_name,stn_lat,stn_lon,stn_first_train,stn_last_train,in_operation_bool,non_disruption_bool,final_status,update_time
0,BP1,CHOA CHU KANG LRT STATION,1.38484,103.74458,5:13:00,23:30:00,1.0,1.0,1.0,2022-01-03 19:37:02
1,BP10,FAJAR LRT STATION,1.38452,103.77083,5:04:00,23:43:00,1.0,1.0,1.0,2022-01-03 19:37:02
2,BP11,SEGAR LRT STATION,1.38777,103.76962,5:03:00,23:45:00,1.0,1.0,1.0,2022-01-03 19:37:02
3,BP12,JELAPANG LRT STATION,1.38670,103.76452,5:01:00,23:47:00,1.0,1.0,1.0,2022-01-03 19:37:02
4,BP13,SENJA LRT STATION,1.38269,103.76239,5:00:00,23:48:00,1.0,1.0,1.0,2022-01-03 19:37:02
...,...,...,...,...,...,...,...,...,...,...
188,TE5,LENTOR MRT STATION,1.38551,103.83574,5:49:00,23:53:00,1.0,1.0,1.0,2022-01-03 19:37:02
189,TE6,MAYFLOWER MRT STATION,1.37146,103.83657,5:52:00,23:50:00,1.0,1.0,1.0,2022-01-03 19:37:02
190,TE7,BRIGHT HILL MRT STATION,1.36367,103.83363,5:54:00,23:48:00,1.0,1.0,1.0,2022-01-03 19:37:02
191,TE8,UPPER THOMSON MRT STATION,1.35442,103.83290,5:56:00,23:47:00,1.0,1.0,1.0,2022-01-03 19:37:02


In [9]:
mrt_list_df.dtypes

stn_id                  object
stn_name                object
stn_lat                float64
stn_lon                float64
stn_first_train         object
stn_last_train          object
in_operation_bool      float64
non_disruption_bool    float64
final_status           float64
update_time             object
dtype: object

# Get connection between Taxi Stand to Weather Stn

In [5]:
from math import radians, sin, cos, asin, sqrt

def haversine_distance(lon1, lat1, lon2, lat2):
    """
    Compute distance between two pairs of coordinates (lon1, lat1, lon2, lat2)
    See - (https://en.wikipedia.org/wiki/Haversine_formula)
    Distance is measured in kilometers when r = 6371
    r = 6371  Radius of earth in kilometers. Use 3956 for miles. Determines return value units.
    Lats and Longs are converted to radians first then computed used haversine
    """
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    radius = 6371
    return 2 * radius * asin(sqrt(a))

In [2]:
bqclient = bigquery.Client()

# Download query results.
query_string = """
select distinct taxi_st_id, taxi_st_lat, taxi_st_lon from `taxi-compass-lewagon.api_dataset.c_taxi_stand`
order by taxi_st_id asc
"""

taxi_st_df = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(
        # Optionally, explicitly request to use the BigQuery Storage API. As of
        # google-cloud-bigquery version 1.26.0 and above, the BigQuery Storage
        # API is used by default.
        create_bqstorage_client=True,
    )
)
taxi_st_df.head()

Unnamed: 0,taxi_st_id,taxi_st_lat,taxi_st_lon
0,kml_1,1.281261,103.844358
1,kml_10,1.278939,103.847973
2,kml_100,1.331126,103.925469
3,kml_101,1.312172,103.938959
4,kml_102,1.313754,103.837341


In [6]:
bqclient = bigquery.Client()

# Download query results.
query_string = """
select distinct a.station_id, a.station_lat, a.station_lon from `taxi-compass-lewagon.api_dataset.h_weather_rainfall` a
order by station_id asc
"""

weather_df = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(
        # Optionally, explicitly request to use the BigQuery Storage API. As of
        # google-cloud-bigquery version 1.26.0 and above, the BigQuery Storage
        # API is used by default.
        create_bqstorage_client=True,
    )
)
weather_df.head()

Unnamed: 0,station_id,station_lat,station_lon
0,S08,1.3701,103.8271
1,S100,1.4172,103.74855
2,S104,1.44387,103.78538
3,S106,1.4168,103.9673
4,S107,1.3135,103.9625


In [7]:
combined_df = pd.merge(taxi_st_df, weather_df, how="cross")
combined_df["distance"] = combined_df.apply(lambda x : haversine_distance(x["station_lon"], x["station_lat"], 
                                                                          x["taxi_st_lon"], x["taxi_st_lat"]), axis=1)
combined_df

Unnamed: 0,taxi_st_id,taxi_st_lat,taxi_st_lon,station_id,station_lat,station_lon,distance
0,kml_1,1.281261,103.844358,S08,1.37010,103.82710,10.063072
1,kml_1,1.281261,103.844358,S100,1.41720,103.74855,18.491040
2,kml_1,1.281261,103.844358,S104,1.44387,103.78538,19.233285
3,kml_1,1.281261,103.844358,S106,1.41680,103.96730,20.345096
4,kml_1,1.281261,103.844358,S107,1.31350,103.96250,13.613864
...,...,...,...,...,...,...,...
23795,kml_99,1.405508,103.902206,S88,1.34270,103.84820,9.209624
23796,kml_99,1.405508,103.902206,S89,1.31985,103.66162,28.389843
23797,kml_99,1.405508,103.902206,S90,1.31910,103.81910,13.329057
23798,kml_99,1.405508,103.902206,S900,1.41284,103.86922,3.756326


In [8]:
combined_df = combined_df.sort_values('distance', ascending=True).drop_duplicates(["taxi_st_id"])[["taxi_st_id","station_id"]]
combined_df.reset_index(inplace=True, drop=True)
combined_df

Unnamed: 0,taxi_st_id,station_id
0,kml_115,S40
1,kml_66,S79
2,kml_132,S118
3,kml_131,S118
4,kml_323,S89
...,...,...
345,kml_206,S118
346,kml_205,S222
347,kml_204,S118
348,kml_207,S222


In [9]:
combined_df = combined_df.rename(columns={"station_id" : "weather_stn_id"})
combined_df

Unnamed: 0,taxi_st_id,weather_stn_id
0,kml_115,S40
1,kml_66,S79
2,kml_132,S118
3,kml_131,S118
4,kml_323,S89
...,...,...
345,kml_206,S118
346,kml_205,S222
347,kml_204,S118
348,kml_207,S222


# Deep Learning Modelling

In [2]:
bqclient = bigquery.Client()

# Download query results.
query_string = """
select x.taxi_st_id,  substr(x.taxi_st_id,5) taxi_st_num,  x.taxi_count, x.taxi_update_time, x.weather_stn_id, c.rainfall, c.weather_update_time, x.mrt_stn_id, e.mrt_final_status, e.mrt_update_time
from (
select a.taxi_st_id, a.taxi_count, a.taxi_update_time, b.weather_stn_id, d.mrt_stn_id
from (
SELECT ts_id as taxi_st_id, taxi_count, cast(timestamp_trunc(timestamp, minute) as datetime) as taxi_update_time
FROM `taxi-compass-lewagon.api_dataset.h_taxi_stand_taxi_count`
WHERE timestamp > TIMESTAMP_SUB(CURRENT_TIMESTAMP() , INTERVAL 150 hour)
) a
left join
(
select weather_stn_id, taxi_st_id from `taxi-compass-lewagon.api_dataset.c_taxi_stand_weather_stn`
) b on a.taxi_st_id = b.taxi_st_id
left join 
(
select taxi_st_id, mrt_stn as mrt_stn_id from `taxi-compass-lewagon.api_dataset.c_mrt_stn_taxi_stand`
where mrt_stn is not null
) d on a.taxi_st_id = d.taxi_st_id
)x
left join 
(
select station_id as weather_stn_id, rainfall, datetime_trunc(datetime (update_time), minute) as weather_update_time
from `taxi-compass-lewagon.api_dataset.h_weather_rainfall`
where datetime(update_time) > datetime_SUB(CURRENT_DATETIME() , INTERVAL 150 hour)
) c on x.weather_stn_id = c.weather_stn_id and x.taxi_update_time = c.weather_update_time
left join
(
select stn_id as mrt_stn_id, final_status as mrt_final_status, datetime_trunc(datetime (update_time), minute) as mrt_update_time 
from `taxi-compass-lewagon.api_dataset.h_mrt_status_availability`
where datetime(update_time) > datetime_SUB(CURRENT_DATETIME() , INTERVAL 150 hour)
) e on x.taxi_update_time = e.mrt_update_time and x.mrt_stn_id = e.mrt_stn_id
"""

taxi_df = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(
        # Optionally, explicitly request to use the BigQuery Storage API. As of
        # google-cloud-bigquery version 1.26.0 and above, the BigQuery Storage
        # API is used by default.
        create_bqstorage_client=True,
    )
)
taxi_df.head()

Unnamed: 0,taxi_st_id,taxi_st_num,taxi_count,taxi_update_time,weather_stn_id,rainfall,weather_update_time,mrt_stn_id,mrt_final_status,mrt_update_time
0,kml_4,4,0,2022-01-11 18:45:00,S118,0.0,2022-01-11 18:45:00,,,NaT
1,kml_5,5,0,2022-01-11 18:45:00,S118,0.0,2022-01-11 18:45:00,,,NaT
2,kml_9,9,0,2022-01-11 18:45:00,S220,0.0,2022-01-11 18:45:00,NE15,1.0,2022-01-11 18:45:00
3,kml_11,11,0,2022-01-11 18:45:00,S209,0.0,2022-01-11 18:45:00,NS14,1.0,2022-01-11 18:45:00
4,kml_12,12,0,2022-01-11 18:45:00,S211,0.0,2022-01-11 18:45:00,,,NaT


In [3]:
df = taxi_df[["taxi_st_num","taxi_update_time","taxi_count","rainfall","mrt_final_status"]].copy()
df["taxi_st_num"] = df["taxi_st_num"].astype('int64')
df = df.sort_values(by=["taxi_st_num", "taxi_update_time"],ascending=True).reset_index(drop=True)
df[["rainfall"]] = df[["rainfall"]].fillna(df.groupby(['taxi_st_num'])[["rainfall"]].ffill())
df[["mrt_final_status","rainfall"]] = df[["mrt_final_status","rainfall"]].fillna(value=0)
df

Unnamed: 0,taxi_st_num,taxi_update_time,taxi_count,rainfall,mrt_final_status
0,1,2022-01-05 05:07:00,2,0.0,0.0
1,1,2022-01-05 05:08:00,3,0.0,0.0
2,1,2022-01-05 05:09:00,1,0.0,0.0
3,1,2022-01-05 05:10:00,1,0.0,0.0
4,1,2022-01-05 05:11:00,1,0.0,0.0
...,...,...,...,...,...
3627578,350,2022-01-11 19:02:00,1,0.0,1.0
3627579,350,2022-01-11 19:03:00,1,0.0,1.0
3627580,350,2022-01-11 19:04:00,2,0.0,1.0
3627581,350,2022-01-11 19:05:00,1,0.0,1.0


In [4]:
def get_weekday(time):
    time = time.weekday()
    if time == 5 or time == 6:
        return 1
    return 0

df = df.groupby(["taxi_st_num","taxi_update_time","taxi_count","rainfall"]).agg('min').reset_index()
df = df.set_index(["taxi_st_num","taxi_update_time"])
df = df.groupby(level=0).apply(lambda x: x.reset_index(level=0, drop=True).asfreq("60S")).reset_index()
df[["taxi_count","rainfall","mrt_final_status"]] = df[["taxi_count","rainfall","mrt_final_status"]].fillna(df.groupby(['taxi_st_num'])[["taxi_count","rainfall","mrt_final_status"]].ffill())
df["hour"] = df["taxi_update_time"].dt.hour
df["minute"] = df["taxi_update_time"].dt.minute
df['hr_sin'] = np.sin(df["hour"]*(2.*np.pi/24))
df['hr_cos'] = np.cos(df["hour"]*(2.*np.pi/24))
df['min_sin'] = np.sin(df["minute"]*(2.*np.pi/60))
df['min_cos'] = np.cos(df["minute"]*(2.*np.pi/60))
df["taxi_update_time"] = df["taxi_update_time"].dt.tz_localize("Asia/Singapore")
df["weekend_bool"] = df.apply(lambda x : get_weekday(x["taxi_update_time"]), axis=1)
df

Unnamed: 0,taxi_st_num,taxi_update_time,taxi_count,rainfall,mrt_final_status,hour,minute,hr_sin,hr_cos,min_sin,min_cos,weekend_bool
0,1,2022-01-05 05:07:00+08:00,2.0,0.0,0.0,5,7,0.965926,0.258819,0.669131,0.743145,0
1,1,2022-01-05 05:08:00+08:00,3.0,0.0,0.0,5,8,0.965926,0.258819,0.743145,0.669131,0
2,1,2022-01-05 05:09:00+08:00,1.0,0.0,0.0,5,9,0.965926,0.258819,0.809017,0.587785,0
3,1,2022-01-05 05:10:00+08:00,1.0,0.0,0.0,5,10,0.965926,0.258819,0.866025,0.500000,0
4,1,2022-01-05 05:11:00+08:00,1.0,0.0,0.0,5,11,0.965926,0.258819,0.913545,0.406737,0
...,...,...,...,...,...,...,...,...,...,...,...,...
3317995,350,2022-01-11 19:02:00+08:00,1.0,0.0,1.0,19,2,-0.965926,0.258819,0.207912,0.978148,0
3317996,350,2022-01-11 19:03:00+08:00,1.0,0.0,1.0,19,3,-0.965926,0.258819,0.309017,0.951057,0
3317997,350,2022-01-11 19:04:00+08:00,2.0,0.0,1.0,19,4,-0.965926,0.258819,0.406737,0.913545,0
3317998,350,2022-01-11 19:05:00+08:00,1.0,0.0,1.0,19,5,-0.965926,0.258819,0.500000,0.866025,0


In [5]:
_ = np.array([])
len(_) == 0

X_train_mas = np.array([])
X_test_mas = np.array([])
y_train_mas = np.array([])
y_test_mas = np.array([])

In [6]:
bins = [0, 1, 2, 3, 4, 6, 9, 10000]
labels = [0, 1, 2, 3, 4, 5, 6]
for i in range(350):
    print(i+1,len(df.loc[df["taxi_st_num"] == i+1]), "started...")
    X = df.loc[df["taxi_st_num"] == i+1][["taxi_count", "rainfall","mrt_final_status",
                                          "weekend_bool","hr_sin","hr_cos","min_sin","min_cos"]].copy()
    X["y"] = X["taxi_count"].shift(-15)
    for day in range(1,15):
        X[f"taxi_count_+{15-day}"] = X["taxi_count"].shift(day-15)
    X = X.dropna()
    y = pd.cut(X["y"], bins=bins, right=False, labels=labels).to_numpy()
    X = X.drop(columns = ["y"]).to_numpy()

    X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, train_size=0.8)
    X_train = X_train.reshape(1, X_train.shape[0], X_train.shape[1])
    X_test = X_test.reshape(1, X_test.shape[0], X_test.shape[1])
    y_train = y_train.reshape(1, y_train.shape[0])
    y_test = y_test.reshape(1, y_test.shape[0])

    if len(X_train_mas) == 0:
        X_train_mas, X_test_mas, y_train_mas, y_test_mas = X_train, X_test, y_train, y_test
    else:
        X_train_mas = np.vstack((X_train_mas, X_train))
        X_test_mas = np.vstack((X_test_mas, X_test))
        y_train_mas = np.vstack((y_train_mas, y_train))
        y_test_mas = np.vstack((y_test_mas, y_test))
print(X_train_mas.shape, X_test_mas.shape, y_train_mas.shape, y_test_mas.shape)

1 9480 started...
2 9480 started...
3 9480 started...
4 9480 started...
5 9480 started...
6 9480 started...
7 9480 started...
8 9480 started...
9 9480 started...
10 9480 started...
11 9480 started...
12 9480 started...
13 9480 started...
14 9480 started...
15 9480 started...
16 9480 started...
17 9480 started...
18 9480 started...
19 9480 started...
20 9480 started...
21 9480 started...
22 9480 started...
23 9480 started...
24 9480 started...
25 9480 started...
26 9480 started...
27 9480 started...
28 9480 started...
29 9480 started...
30 9480 started...
31 9480 started...
32 9480 started...
33 9480 started...
34 9480 started...
35 9480 started...
36 9480 started...
37 9480 started...
38 9480 started...
39 9480 started...
40 9480 started...
41 9480 started...
42 9480 started...
43 9480 started...
44 9480 started...
45 9480 started...
46 9480 started...
47 9480 started...
48 9480 started...
49 9480 started...
50 9480 started...
51 9480 started...
52 9480 started...
53 9480 started...
54

In [7]:
from tensorflow.keras.utils import to_categorical
y_train_mas_cat = to_categorical(y_train_mas)
y_test_mas_cat = to_categorical(y_test_mas)

2022-01-11 19:13:24.613402: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-01-11 19:13:24.613464: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [8]:
X_train_mas[:,:-120].shape

(350, 7452, 22)

In [9]:
X_train1, X_val1, y_train1, y_val1 = X_train_mas[:,:-500], X_train_mas[:,-500:], y_train_mas_cat[:,:-500], y_train_mas_cat[:,-500:]

In [10]:
X_train1.shape

(350, 7072, 22)

In [11]:
X_train1[0].shape

(7072, 22)

## Use NN Multi-class Classification

In [12]:
from tensorflow.keras import models, layers

from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import RMSprop

def init_model():
    model = models.Sequential()
    model.add(layers.Dense(50, activation="relu", input_shape=X_train1[0].shape))
    model.add(layers.Dropout(0.2))
    model.add(layers.Dense(25, activation="tanh"))
    model.add(layers.Dropout(0.2))
    model.add(layers.Dense(15, activation="relu"))
    model.add(layers.Dropout(0.2))
    model.add(layers.Dense(7, activation="softmax"))
    
    model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    
    return model

es = EarlyStopping(patience=5, restore_best_weights=True)

model = init_model()

history = model.fit(X_train1, y_train1, validation_data=(X_val1, y_val1),
                   batch_size=16, epochs=500, callbacks=[es], verbose=1)

2022-01-11 19:13:29.596067: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:923] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-01-11 19:13:29.596624: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-01-11 19:13:29.596742: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2022-01-11 19:13:29.596827: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory
2022-01-11 19:13:29.596899: W tensorflow/stream_executor/platform/default/dso_loader.cc:6

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500


Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78/500
Epoch 79/500
Epoch 80/500
Epoch 81/500
Epoch 82/500
Epoch 83/500
Epoch 84/500
Epoch 85/500
Epoch 86/500
Epoch 87/500
Epoch 88/500
Epoch 89/500
Epoch 90/500
Epoch 91/500
Epoch 92/500
Epoch 93/500
Epoch 94/500
Epoch 95/500
Epoch 96/500
Epoch 97/500
Epoch 98/500
Epoch 99/500
Epoch 100/500
Epoch 101/500
Epoch 102/500
Epoch 103/500
Epoch 104/500
Epoch 105/500
Epoch 106/500
Epoch 107/500
Epoch 108/500
Epoch 109/500
Epoch 110/500
Epoch 111/500
Epoch 112/500


Epoch 113/500
Epoch 114/500
Epoch 115/500
Epoch 116/500
Epoch 117/500
Epoch 118/500
Epoch 119/500
Epoch 120/500
Epoch 121/500
Epoch 122/500
Epoch 123/500
Epoch 124/500
Epoch 125/500
Epoch 126/500
Epoch 127/500
Epoch 128/500


In [13]:
model.evaluate(X_test_mas, y_test_mas_cat)



[1.274998664855957, 0.4745106101036072]

In [247]:
from tensorflow.keras import models, layers

from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import RMSprop

def init_model():
    model = models.Sequential()
    model.add(layers.LSTM(units=10, activation="tanh", input_shape=X_train1[0].shape))
    model.add(layers.Dense(25, activation="relu"))
    model.add(layers.Dropout(0.2))
    model.add(layers.Dense(15, activation="relu"))
    model.add(layers.Dropout(0.2))
    model.add(layers.Dense(1, activation="linear"))
    
    model.compile(loss="mse", optimizer="rmsprop", metrics=["mae"])
    
    return model

es = EarlyStopping(patience=5, restore_best_weights=True)

model = init_model()

history = model.fit(X_train1, y_train1, validation_data=(X_val1, y_val1),
                   batch_size=32, epochs=50, callbacks=[es], verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50


## Use traditional machine learning

In [14]:
from sklearn.linear_model import LogisticRegression

log_model = LogisticRegression(solver='liblinear')

log_model.fit(X_train_mas[0], y_train_mas[0])
log_model.score(X_test_mas[0], y_test_mas[0])

0.387216059165346

In [15]:
y_train_mas[0]

array([1, 3, 3, ..., 5, 4, 4])

In [16]:
from xgboost import XGBRegressor

xgb = XGBRegressor(n_jobs=-1)
xgb.fit(X_train_mas[0], y_train_mas[0])
xgb.score(X_test_mas[0], y_test_mas[0])

0.5830805280810113