<a href="https://colab.research.google.com/github/alickchoi/STAT5106-Lab/blob/main/Week_6_data_gov_API_Example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# MTR Next Train
The API page: https://data.gov.hk/tc-data/dataset/mtr-data2-nexttrain-data
<br><br>
Provides arrival time information for up to the next four trains on each of the following lines:
- Airport Express
- Tung Chung Line
- Tuen Ma Line
- Tseung Kwan O Line
- East Rail Line
- South Island Line
- Tsuen Wan Line
- Island Line
- Kwun Tong Line
<br>
<img src="https://drive.google.com/uc?id=13z5Dla1pHP-hhJ8APBhiJVoeJgDl6a9u"></img>
<br><br>
Data Dictionary: https://opendata.mtr.com.hk/doc/Next_Train_DataDictionary_v1.2.pdf <br>
API Spec: https://opendata.mtr.com.hk/doc/Next_Train_API_Spec_v1.2.pdf

In [1]:
import urllib.request, urllib.parse
import json
import pandas as pd

In [2]:
# Fetch the next-train schedule for one station from the MTR Next Train API

serviceurl = 'https://rt.data.gov.hk/v1/transport/mtr/getSchedule.php'

# Line code and station code, as listed in the API spec linked above
params = {'line': 'EAL' , 'sta': 'UNI'}
url = serviceurl + '?' + urllib.parse.urlencode(params)

# The context manager closes the HTTP response once the body is read;
# the timeout keeps the cell from hanging forever on a stalled connection.
with urllib.request.urlopen(url, timeout=30) as resp:
    data = resp.read().decode()
js = json.loads(data)

In [3]:
js

{'sys_time': '2025-10-16 19:33:13',
 'curr_time': '2025-10-16 19:33:07',
 'data': {'EAL-UNI': {'curr_time': '2025-10-16 19:33:07',
   'sys_time': '2025-10-16 19:33:13',
   'UP': [{'seq': '1',
     'dest': 'LMC',
     'plat': '1',
     'time': '2025-10-16 19:34:07',
     'ttnt': '1',
     'valid': 'Y',
     'source': '-',
     'route': '',
     'timeType': 'A'},
    {'seq': '2',
     'dest': 'LOW',
     'plat': '1',
     'time': '2025-10-16 19:38:07',
     'ttnt': '5',
     'valid': 'Y',
     'source': '-',
     'route': '',
     'timeType': 'A'},
    {'seq': '3',
     'dest': 'LMC',
     'plat': '1',
     'time': '2025-10-16 19:41:07',
     'ttnt': '8',
     'valid': 'Y',
     'source': '-',
     'route': '',
     'timeType': 'A'},
    {'seq': '4',
     'dest': 'LOW',
     'plat': '1',
     'time': '2025-10-16 19:44:07',
     'ttnt': '11',
     'valid': 'Y',
     'source': '-',
     'route': '',
     'timeType': 'A'}],
   'DOWN': [{'seq': '1',
     'dest': 'ADM',
     'plat': '2',
    

In [4]:
js.keys()

dict_keys(['sys_time', 'curr_time', 'data', 'isdelay', 'status', 'message'])

In [5]:
js['data']['EAL-UNI']['DOWN']

[{'seq': '1',
  'dest': 'ADM',
  'plat': '2',
  'time': '2025-10-16 19:34:07',
  'ttnt': '1',
  'valid': 'Y',
  'source': '-',
  'route': '',
  'timeType': 'A'},
 {'seq': '2',
  'dest': 'ADM',
  'plat': '2',
  'time': '2025-10-16 19:36:07',
  'ttnt': '3',
  'valid': 'Y',
  'source': '-',
  'route': '',
  'timeType': 'A'},
 {'seq': '3',
  'dest': 'ADM',
  'plat': '2',
  'time': '2025-10-16 19:38:07',
  'ttnt': '5',
  'valid': 'Y',
  'source': '-',
  'route': '',
  'timeType': 'A'},
 {'seq': '4',
  'dest': 'ADM',
  'plat': '2',
  'time': '2025-10-16 19:43:07',
  'ttnt': '10',
  'valid': 'Y',
  'source': '-',
  'route': '',
  'timeType': 'A'}]

In [6]:
pd.DataFrame(js['data']['EAL-UNI']['DOWN'])

Unnamed: 0,seq,dest,plat,time,ttnt,valid,source,route,timeType
0,1,ADM,2,2025-10-16 19:34:07,1,Y,-,,A
1,2,ADM,2,2025-10-16 19:36:07,3,Y,-,,A
2,3,ADM,2,2025-10-16 19:38:07,5,Y,-,,A
3,4,ADM,2,2025-10-16 19:43:07,10,Y,-,,A


In [7]:
# Reshape the API response into a single pandas DataFrame

# Build the response key from the request parameters instead of hard-coding
# 'EAL-UNI', so changing `params` in the fetch cell is enough to switch station.
station_key = f"{params['line']}-{params['sta']}"
schedule = js['data'][station_key]

# NOTE(review): presumably a station can report only one direction (e.g. a
# terminus); a missing or empty list becomes an empty frame instead of a KeyError.
df_up = pd.DataFrame(schedule.get('UP') or []).assign(bound='U')
df_down = pd.DataFrame(schedule.get('DOWN') or []).assign(bound='D')

# Skip empty frames so pd.concat never has to guess dtypes from them
non_empty = [part for part in (df_up, df_down) if not part.empty]
df = pd.concat(non_empty, ignore_index=True) if non_empty else pd.DataFrame()
df['curr_time'] = js['curr_time']

In [8]:
df

Unnamed: 0,seq,dest,plat,time,ttnt,valid,source,route,timeType,bound,curr_time
0,1,LMC,1,2025-10-16 19:34:07,1,Y,-,,A,U,2025-10-16 19:33:07
1,2,LOW,1,2025-10-16 19:38:07,5,Y,-,,A,U,2025-10-16 19:33:07
2,3,LMC,1,2025-10-16 19:41:07,8,Y,-,,A,U,2025-10-16 19:33:07
3,4,LOW,1,2025-10-16 19:44:07,11,Y,-,,A,U,2025-10-16 19:33:07
4,1,ADM,2,2025-10-16 19:34:07,1,Y,-,,A,D,2025-10-16 19:33:07
5,2,ADM,2,2025-10-16 19:36:07,3,Y,-,,A,D,2025-10-16 19:33:07
6,3,ADM,2,2025-10-16 19:38:07,5,Y,-,,A,D,2025-10-16 19:33:07
7,4,ADM,2,2025-10-16 19:43:07,10,Y,-,,A,D,2025-10-16 19:33:07


In [9]:
# get the real station name
df_station = pd.read_csv('https://opendata.mtr.com.hk/data/mtr_lines_and_stations.csv')

In [10]:
df_station

Unnamed: 0,Line Code,Direction,Station Code,Station ID,Chinese Name,English Name,Sequence
0,AEL,DT,AWE,56.0,博覽館,AsiaWorld-Expo,1.0
1,AEL,DT,AIR,47.0,機場,Airport,2.0
2,AEL,DT,TSY,46.0,青衣,Tsing Yi,3.0
3,AEL,DT,KOW,45.0,九龍,Kowloon,4.0
4,AEL,DT,HOK,44.0,香港,Hong Kong,5.0
...,...,...,...,...,...,...,...
268,SIL,DT,OCP,86.0,海洋公園,Ocean Park,4.0
269,SIL,DT,ADM,2.0,金鐘,Admiralty,5.0
270,,,,,,,
271,,,,,,,


In [11]:
df_station.dropna(subset=['Station Code'])[['Station Code', 'Chinese Name', 'English Name']].query('`Station Code`=="ADM"')

Unnamed: 0,Station Code,Chinese Name,English Name
27,ADM,金鐘,Admiralty
41,ADM,金鐘,Admiralty
42,ADM,金鐘,Admiralty
56,ADM,金鐘,Admiralty
81,ADM,金鐘,Admiralty
92,ADM,金鐘,Admiralty
242,ADM,金鐘,Admiralty
245,ADM,金鐘,Admiralty
260,ADM,金鐘,Admiralty
269,ADM,金鐘,Admiralty


In [12]:
df_station.dropna(subset=['Line Code'])[['Station Code', 'Chinese Name', 'English Name']].drop_duplicates()

Unnamed: 0,Station Code,Chinese Name,English Name
0,AWE,博覽館,AsiaWorld-Expo
1,AIR,機場,Airport
2,TSY,青衣,Tsing Yi
3,KOW,九龍,Kowloon
4,HOK,香港,Hong Kong
...,...,...,...
241,TST,尖沙咀,Tsim Sha Tsui
261,OCP,海洋公園,Ocean Park
262,WCH,黃竹坑,Wong Chuk Hang
263,LET,利東,Lei Tung


In [13]:
# Lookup table with exactly one row per station code, so the left merge below
# can never multiply the schedule rows.
name_columns = ['Station Code', 'Chinese Name', 'English Name']
df_station_mapping = (
    df_station
    .dropna(subset=['Line Code'])[name_columns]
    .drop_duplicates(subset=['Station Code'])
)

# Remove name columns left over from a previous run of this cell first;
# otherwise re-executing it would add suffixed duplicates (`..._x`, `..._y`).
df = (
    df
    .drop(columns=name_columns, errors='ignore')
    .merge(df_station_mapping, how='left', left_on='dest', right_on='Station Code')
)

In [14]:
df

Unnamed: 0,seq,dest,plat,time,ttnt,valid,source,route,timeType,bound,curr_time,Station Code,Chinese Name,English Name
0,1,LMC,1,2025-10-16 19:34:07,1,Y,-,,A,U,2025-10-16 19:33:07,LMC,落馬洲,Lok Ma Chau
1,2,LOW,1,2025-10-16 19:38:07,5,Y,-,,A,U,2025-10-16 19:33:07,LOW,羅湖,Lo Wu
2,3,LMC,1,2025-10-16 19:41:07,8,Y,-,,A,U,2025-10-16 19:33:07,LMC,落馬洲,Lok Ma Chau
3,4,LOW,1,2025-10-16 19:44:07,11,Y,-,,A,U,2025-10-16 19:33:07,LOW,羅湖,Lo Wu
4,1,ADM,2,2025-10-16 19:34:07,1,Y,-,,A,D,2025-10-16 19:33:07,ADM,金鐘,Admiralty
5,2,ADM,2,2025-10-16 19:36:07,3,Y,-,,A,D,2025-10-16 19:33:07,ADM,金鐘,Admiralty
6,3,ADM,2,2025-10-16 19:38:07,5,Y,-,,A,D,2025-10-16 19:33:07,ADM,金鐘,Admiralty
7,4,ADM,2,2025-10-16 19:43:07,10,Y,-,,A,D,2025-10-16 19:33:07,ADM,金鐘,Admiralty


# Traffic Snapshot Image
The API page: https://data.gov.hk/tc-data/dataset/hk-td-tis_2-traffic-snapshot-images
<br><br>
The traffic snapshot images are captured by the closed circuit televisions (CCTV) and the traffic detectors installed respectively at 194 locations and 773 locations of major roads in Hong Kong for you to visualise the latest traffic conditions.
<br><br>
<img src="https://drive.google.com/uc?id=1422KhiRBr1ZSrQ3xoAQWY_W2NTtAE7ux"></img>
<br><br>
Data Dictionary: https://static.data.gov.hk/td/traffic-snapshot-images/en/Summary_of_traffic_snapshot_images.pdf


In [None]:
import urllib.request, urllib.parse
import json
import pandas as pd

In [None]:
import ssl

# Build an SSL context that skips hostname and certificate verification.
# NOTE(review): this removes TLS protection against man-in-the-middle attacks.
# The urlopen / requests calls visible in this section do not pass `ctx`, so it
# appears to be unused here — confirm whether this cell is still needed.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

In [None]:
# Load the location table of the traffic cameras.
# The explicit `encoding` and `sep` arguments match how this file is stored
# (UTF-16, tab-separated); preview a raw file first to find such settings.
camera_csv_url = 'https://static.data.gov.hk/td/traffic-snapshot-images/code/Traffic_Camera_Locations_En.csv'
df_cameras = pd.read_csv(camera_csv_url, encoding="utf-16", sep='\t')

In [None]:
df_cameras.head()

In [None]:
import urllib.request

# Request the first camera image with the standard-library urllib client.
# Different HTTP client packages may get different results from the same URL.
url = df_cameras.url[0]
# The context manager closes the connection once the status has been shown
with urllib.request.urlopen(url, timeout=30) as resp:
    print(resp.status)

### Instead of the urllib package, you can use the requests package

In [None]:
df_cameras.url[0]

In [None]:
from io import BytesIO

from PIL import Image
import requests

# Show the image of the first camera
url = df_cameras.url[0]
resp = requests.get(url, timeout=30)
print(resp.status_code)
# Stop on an HTTP error instead of handing an error page to the image decoder
resp.raise_for_status()
# resp.content is the fully downloaded, decoded body; resp.raw is the raw
# socket stream, which requests leaves undecoded.
im = Image.open(BytesIO(resp.content))
im

In [None]:
# Download all images
def download_image(url):
    """Download `url` into the current working directory.

    The local file name is the last component of the URL path.

    Args:
        url: HTTP(S) address of the file to download.

    Returns:
        True if the file was saved; False if the server answered with an HTTP
        error status (nothing is written in that case).
    """
    import os
    import urllib.parse

    import requests

    # Use only the URL path, so a query string never ends up in the file name
    filename = os.path.basename(urllib.parse.urlsplit(url).path)

    # The context manager releases the connection even if writing fails
    with requests.get(url, stream=True, timeout=30) as resp:
        if not resp.ok:
            # Do not save an error page under an image file name
            print(f'{url} failed with HTTP status {resp.status_code}.')
            return False
        with open(filename, 'wb') as f:
            for chunk in resp.iter_content(chunk_size=8192):
                f.write(chunk)

    print(f'{url} downloaded.')
    return True

# Only the first 10 cameras are fetched here; iterate over the whole
# `df_cameras.url` column instead to download every image.
for i, url in df_cameras.url.iloc[0:10].items():
    print(f'{i} / {df_cameras.shape[0]}')
    download_image(url)

# Carpark Vacancy

The API page is here. ([In Traditional Chinese](https://data.gov.hk/tc-data/dataset/hk-dpo-datagovhk1-carpark-info-vacancy/resource/f4c792c6-071c-4a64-888b-afeea33d5ad7), [In English](https://data.gov.hk/en-data/dataset/hk-dpo-datagovhk1-carpark-info-vacancy/resource/01752c62-a6b6-4ddc-bf2d-25efccadc143))

<img src="https://drive.google.com/uc?id=1-MdF-gsRTdV4LmYe5IKY0v5mLlnthT-b"></img>


But how do you import it into your Python program? <br>


In [None]:
import urllib.request, urllib.parse, urllib.error
import json
import ssl
import pandas as pd

serviceurl = 'https://api.data.gov.hk/v1/carpark-info-vacancy'

# Ignore SSL certificate errors
# NOTE(review): hostname and certificate verification are switched off, which
# exposes the request to man-in-the-middle tampering. Behaviour is kept as in
# the original cell — confirm whether the default verifying context works.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# Query parameters sent to the API
parms = {'data': 'info', 'vehicleTypes': 'privateCar'}
url = serviceurl + '?' + urllib.parse.urlencode(parms)

print('Retrieving', url)
# The context manager closes the HTTP response once the body is read;
# the timeout keeps the cell from hanging forever on a stalled connection.
with urllib.request.urlopen(url, context=ctx, timeout=30) as uh:
    data = uh.read().decode()

print(f'Retrieved data with length {len(data)}.')

## Loading the first row for understanding the structure

In [None]:
js = json.loads(data)
js_result = js['results']

In [None]:
js_row = js_result[0]
js_row

{'park_Id': '10',
 'name': 'Kai Tak Cruise Terminal Car Park 1',
 'nature': 'commercial',
 'carpark_Type': 'multi-storey',
 'address': {'buildingName': 'Kai Tak Cruise Terminal',
  'streetName': 'Shing Fung Road',
  'buildingNo': '33',
  'floor': '1',
  'subDistrict': 'Kowloon Bay',
  'dcDistrict': 'Kwun Tong District',
  'region': 'KLN'},
 'displayAddress': '1st floor, Kai Tak Cruise Terminal, 33 Shing Fung Road, Kowloon Bay, KLN',
 'district': 'Kwun Tong District',
 'latitude': 22.3062049,
 'longitude': 114.21309471,
 'contactNo': '+852 3465 6888, 09:30-18:00 Mon-Fri, except public holiday',
 'renditionUrls': {'square': 'https://sps-opendata.pilotsmartke.gov.hk/rest/getRendition/fs-1%3A693265207413252869411532657339312395903827562313.JPG/square.png',
  'thumbnail': 'https://sps-opendata.pilotsmartke.gov.hk/rest/getRendition/fs-1%3A693265207413252869411532657339312395903827562313.JPG/thumbnail.png',
  'banner': 'https://sps-opendata.pilotsmartke.gov.hk/rest/getRendition/fs-1%3A6932652

In [None]:
# Extracting renditionUrls

def get_url_image(js_row):
    """Return the first rendition image URL of a car park record.

    Args:
        js_row: one car park record (dict) from the API `results` list.

    Returns:
        The first value of the record's `renditionUrls` mapping, or None when
        the key is missing, null, or empty.
    """
    # `or {}` also covers a present-but-null value, which would otherwise
    # raise AttributeError on `.values()`
    rendition_urls = js_row.get('renditionUrls') or {}
    return next(iter(rendition_urls.values()), None)

In [None]:
get_url_image(js_row)

In [None]:
# Extracting privateCar

def get_todayinfo(js_row, today=None):
    """Extract today's private-car hourly charge and space count from a record.

    Args:
        js_row: one car park record (dict) from the API `results` list.
        today: optional `date`/`datetime` to evaluate for; defaults to the
            current local date. Mainly useful for reproducible runs and tests.

    Returns:
        dict with the keys `periodStart`, `periodEnd`, `price`, `space`,
        `today` (YYYY-MM-DD) and `today_weekday` (MON..SUN). The charge fields
        stay None when no hourly charge lists today's weekday; if several
        entries match, the last one wins. `space` defaults to 0.
    """
    from datetime import datetime as dt

    # Fixed English codes indexed by weekday() (Monday == 0). strftime('%a')
    # depends on the process locale and would not match under a non-English one.
    weekday_codes = ('MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN')

    if today is None:
        today = dt.today()
    today_date_str = today.strftime('%Y-%m-%d')
    today_weekday = weekday_codes[today.weekday()]

    dict_todayinfo = {'periodStart': None, 'periodEnd': None, 'price': None, 'space': 0}
    dict_todayinfo['today'] = today_date_str
    dict_todayinfo['today_weekday'] = today_weekday

    private_car = js_row.get('privateCar')
    if not private_car:
        return dict_todayinfo

    # Missing or null `hourlyCharges` / `weekdays` are treated as empty
    # instead of raising KeyError.
    for hourly_charge in private_car.get('hourlyCharges') or []:
        if today_weekday in (hourly_charge.get('weekdays') or []):
            dict_todayinfo['periodStart'] = hourly_charge.get('periodStart', None)
            dict_todayinfo['periodEnd'] = hourly_charge.get('periodEnd', None)
            dict_todayinfo['price'] = hourly_charge.get('price', None)

    dict_todayinfo['space'] = private_car.get('space', 0)

    return dict_todayinfo


In [None]:
get_todayinfo(js_row)

In [None]:
# Put everything together: one flat record per car park

columns_needed = ['park_Id', 'name', 'displayAddress', 'district', 'latitude', 'longitude', 'opening_status', 'facilities', 'paymentMethods', 'modifiedDate']

# The comprehension variable `row` stays local, so the `js_row` sample that the
# earlier cells display is not overwritten by this cell.
# Key order per record: selected columns, then `url_image`, then today's info.
out_dict_rows = [
    {col: row.get(col, None) for col in columns_needed}
    | {'url_image': get_url_image(row)}
    | get_todayinfo(row)
    for row in js_result
]

df_carpark = pd.DataFrame(out_dict_rows)

In [None]:
df_carpark

# Weather
The API Page: https://data.gov.hk/tc-data/dataset/hk-hko-rss-current-weather-report

In [None]:
import urllib.request, urllib.parse
import json
import pandas as pd

import ssl

# Build an SSL context that skips hostname and certificate verification.
# NOTE(review): this removes TLS protection against man-in-the-middle attacks.
# The urlopen call visible in this section does not pass `ctx`, so it appears
# to be unused here — confirm whether it is still needed.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

API reference is [here](https://data.weather.gov.hk/weatherAPI/doc/HKO_Open_Data_API_Documentation_tc.pdf)
```
dataType_value = {'flw': '本港地區天氣預報'
, 'fnd': '九天天氣預報'
, 'rhrread': '本港地區天氣報告'
, 'warnsum': '天氣警告一覽'
, 'warningInfo': '詳細天氣警告資訊'
, 'swt': '特別天氣提示'}
```

<img src="https://drive.google.com/uc?id=1447Hgdxjb3QMUfCzaCeY6cT6Uce14ekc"></img>


In [None]:
# https://data.weather.gov.hk/weatherAPI/opendata/weather.php?dataType=rhrread&lang=tc

import urllib.request, urllib.parse
import json
import pandas as pd

import ssl

# Build an SSL context that skips hostname and certificate verification.
# NOTE(review): this removes TLS protection against man-in-the-middle attacks.
# The urlopen call visible in this section does not pass `ctx`, so it appears
# to be unused here — confirm whether it is still needed.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

In [None]:
# Fetch the current weather report and combine temperature with rainfall
serviceurl = 'https://data.weather.gov.hk/weatherAPI/opendata/weather.php'
params = {'dataType': 'rhrread', 'lang': 'tc'}

url = serviceurl + '?' + urllib.parse.urlencode(params)
# The context manager closes the HTTP response once the body is read;
# the timeout keeps the cell from hanging forever on a stalled connection.
with urllib.request.urlopen(url, timeout=30) as resp:
    data = resp.read().decode()
js = json.loads(data)

# Column name corrected from the misspelled 'temperture'
df_temp = (
    pd.DataFrame(js['temperature']['data'])
    .rename(columns={'value': 'temperature', 'unit': 'temp_unit'})
)
df_rainfall = (
    pd.DataFrame(js['rainfall']['data'])
    .drop(columns=['main'])
    .rename(columns={'max': 'rainfall', 'unit': 'rainfall_unit'})
)

# Left merge keeps every temperature row; places without a rainfall entry get NaN
df = df_temp.merge(df_rainfall, how='left', on='place')

In [None]:
df

## Historical Weather

Of course you can just search the API site below
<img src="https://drive.google.com/uc?id=15FIn8LPUuBGe37SopwMew0kSgCMdJJ8E"></img>

But how do you import it into your Python program? <br>
Alan will demonstrate in the lecture.
