# Library Imports

In [1]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from functools import reduce
import ssl

# Constants and Function Definitions

In [2]:
# Swap the ssl module's default HTTPS context factory for its unverified variant.
# This is a module-wide effect and relies on private ssl attributes.
# NOTE(review): presumably this lets pd.read_csv fetch the HKO URLs without
# certificate verification — confirm it is still required, since it turns off TLS validation.
ssl._create_default_https_context = ssl._create_unverified_context
# Keyword arguments forwarded to pd.read_csv by read_csv(): use row 1 as the header,
# ignore the last 3 lines of the file, and parse with the python engine.
arg_read_csv = {'header':1, 'skipfooter':3, 'engine':'python'}
# Join keys used by merge().
col_merge = ['date', 'region']
# dataType values requested through temperature().
data_type_list_temperature = ['CLMTEMP', 'CLMMAXT', 'CLMMINT']
# Hard-coded station codes that are pooled with the scraped ones when building the station list.
station_temperature = ['CCH','CWB','HKA','HKO','HKP','HKS','HPV','JKB','KLT','KP','KSC','KTG','LFS','NGP','PEN','PLC','SE1','SEK','SHA','SKG','SKW','SSH','SSP','STY','TC','TKL','TMS','TPO','TU1','TW','TWN','TY1','TYW','VP1','WGL','WLP','WTS','YCT']
# Further hard-coded station codes for the same pool. The name is spelled "staion" and is
# referenced under that spelling later in the notebook, so it must be renamed in both places at once.
staion_other = ['BHD','CCB','CCH','CPH','CP1','CWB','GI','HKA','HKO','HKP','HKS','HPV','JKB','KLT','KP','KSC','KTG','LAM','LFS','NGP','NP','PEN','PLC','SC','SE','SE1','SEK','SF','SHA','SHL','SKG','SKW','SSH','SSP','STY','TC','TKL','TME','TMS','TPK','TPO','TUN','TU1','TW','TWN','TY1','TYW','VP1','WGL','WLP','WTS','YCT']

In [3]:
def get_link_list(data):
  """Collect the "ALL" download links listed on a data.gov.hk HKO dataset page.

  Parameters
  ----------
  data : str
      Dataset slug appended to ``https://data.gov.hk/tc-data/dataset/hk-hko-rss-``.

  Returns
  -------
  list of str
      ``href`` values of the download buttons whose URL contains ``"ALL"``,
      in the order they appear on the page.

  Raises
  ------
  requests.HTTPError
      If the dataset page answers with an error status. Without this check an
      error page would silently produce an empty list.
  requests.Timeout
      If the server does not respond within the timeout.
  """
  # requests waits indefinitely unless a timeout is given, which would hang the notebook.
  response = requests.get(f"https://data.gov.hk/tc-data/dataset/hk-hko-rss-{data}", timeout=30)
  response.raise_for_status()
  soup = BeautifulSoup(response.text, 'html.parser')
  download_buttons = soup.find_all('a', "dataset-details__list-item-download btn btn--light")
  hrefs = (tag.get("href") for tag in download_buttons)
  # tag.get may return None (no href) or a non-string value; keep plain string URLs only.
  return [href for href in hrefs if isinstance(href, str) and "ALL" in href]

In [4]:
def digit_or_null(value):
    """Convert ``value`` to ``float``, or return ``np.nan`` when it is not numeric.

    Parameters
    ----------
    value : object
        Anything accepted by ``float()`` (numbers, numeric strings), or an
        arbitrary placeholder such as a text marker or ``None``.

    Returns
    -------
    float
        The converted number, or ``np.nan`` if the conversion fails.
    """
    try:
        # Convert once and return the result directly.
        return float(value)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "not a number"; anything else
        # (e.g. KeyboardInterrupt) must propagate instead of being swallowed.
        return np.nan

In [5]:
def read_csv(link, year=0):
  """Load one daily CSV and return its measurement column as a date-indexed Series.

  Parameters
  ----------
  link : str
      URL or path handed to ``pd.read_csv`` together with ``arg_read_csv``.
  year : int, default 0
      Rows dated before this year are dropped.

  Returns
  -------
  pandas.Series
      Values of the measurement column passed through ``digit_or_null``,
      indexed by ``date`` and named after the file's first header column.
  """
  raw = pd.read_csv(link, **arg_read_csv)
  value_col = raw.columns[0]
  # After reset_index the index levels appear as level_0..level_3. The mapping is applied
  # in a single rename, so level_3 takes the header name while the column that
  # previously carried that name becomes 'completeness'.
  new_names = {'level_0': 'year', 'level_1': 'month', 'level_2': 'day', 'level_3': value_col, value_col: 'completeness'}
  rows = raw.reset_index().iloc[1:].dropna(subset='level_3').rename(columns=new_names)
  # Drop rows labelled 1900-02-29.
  # NOTE(review): presumably because that date does not exist and would break to_datetime — confirm.
  keep = rows['year'].ne('1900') | rows['month'].ne('2') | rows['day'].ne('29')
  dated = rows[keep].copy()
  dated['date'] = pd.to_datetime(dated[['year', 'month', 'day']])
  dated = dated.set_index('date')
  recent = dated[dated.index.year >= year]
  return recent[value_col].apply(digit_or_null)

In [6]:
def temperature(data_type, station, year=0):
  """Download one series for one station from the HKO open-data API.

  Parameters
  ----------
  data_type : str
      Value of the ``dataType`` query parameter.
  station : str
      Value of the ``station`` query parameter.
  year : int, default 0
      Rows dated before this year are dropped.

  Returns
  -------
  pandas.Series
      Values of the measurement column passed through ``digit_or_null``,
      indexed by ``date`` and named after the file's first header column.
  """
  url = f"https://data.weather.gov.hk/weatherAPI/opendata/opendata.php?dataType={data_type}&rformat=csv&station={station}"
  raw = pd.read_csv(url, engine='python', header=1, skipfooter=3, sep=None)
  value_col = raw.columns[0]
  # The first column is split on commas by hand into year, month, day, value, completeness.
  fields = raw.iloc[1:, 0].str.split(",", expand=True)
  fields = fields.rename(columns={0: 'year', 1: 'month', 2: 'day', 3: value_col, 4: 'completeness'})
  # Drop rows labelled 1900-02-29.
  # NOTE(review): presumably because that date does not exist and would break to_datetime — confirm.
  keep = fields['year'].ne('1900') | fields['month'].ne('2') | fields['day'].ne('29')
  dated = fields[keep].reset_index().copy()
  dated['date'] = pd.to_datetime(dated[['year', 'month', 'day']])
  dated = dated.set_index('date')
  recent = dated[dated.index.year >= year]
  return recent[value_col].apply(digit_or_null)

In [7]:
def melt(df):
  """Reshape date-indexed station columns into long ``date`` / ``region`` / value rows.

  Column labels are expected in one of two forms: ``"<type> - <region>"`` or
  ``"Daily <type> at the <region>"``. The region goes into the ``region``
  column; the most frequent type becomes the name of the value column.

  Parameters
  ----------
  df : pandas.DataFrame or pandas.Series
      Data whose index is named ``date``. A Series is converted to a
      one-column frame first.

  Returns
  -------
  pandas.DataFrame
      Columns ``date``, ``region`` and one value column named after the type.
  """
  def _region_of(label):
    # Region is the text after "- " or after "at the ".
    if "-" in label:
      return label.split("- ")[1]
    return label.split("at the ")[1]

  def _type_of(label):
    # Type is the text before " -", or between "Daily " and " at the".
    if "-" in label:
      return label.split(" -")[0]
    return label.split(" at the")[0].split("Daily ")[1]

  wide = df.copy().to_frame() if isinstance(df, pd.Series) else df
  long_form = pd.melt(wide.reset_index(), id_vars=['date'])
  labels = long_form['variable']
  long_form['region'] = labels.apply(_region_of).str.rstrip()
  long_form['type'] = labels.apply(_type_of)
  # The most common parsed type names the value column.
  value_name = long_form['type'].value_counts().index[0]
  return long_form[['date', 'region', 'value']].rename(columns={'value': value_name})

In [8]:
def merge(list_df):
    """Outer-join a sequence of DataFrames, left to right, on the ``col_merge`` columns.

    Parameters
    ----------
    list_df : iterable of pandas.DataFrame
        Frames that all contain the ``col_merge`` key columns.

    Returns
    -------
    pandas.DataFrame
        The accumulated outer join; a single input frame is returned unchanged.
    """
    def _outer_join(left, right):
        return pd.merge(left, right, on=col_merge, how='outer')

    return reduce(_outer_join, list_df)

In [9]:
def melt_merge(data, year=0):
    """Read every "ALL" CSV of a dataset, melt each one, and outer-join the results.

    Parameters
    ----------
    data : str
        Dataset slug passed to ``get_link_list``.
    year : int, default 0
        Earliest year to keep, forwarded to ``read_csv``.

    Returns
    -------
    pandas.DataFrame
        Result of ``merge`` over the melted per-link frames.
    """
    melted_frames = []
    for link_csv in get_link_list(data):
        melted_frames.append(melt(read_csv(link_csv, year)))
    return merge(melted_frames)

In [10]:
def concat_melt(data, year=0):
    """Read every "ALL" CSV of a dataset, align them side by side, and melt once.

    Parameters
    ----------
    data : str
        Dataset slug passed to ``get_link_list``.
    year : int, default 0
        Earliest year to keep, forwarded to ``read_csv``.

    Returns
    -------
    pandas.DataFrame
        Long-format frame produced by ``melt`` from the column-wise concatenation.
    """
    series_per_link = [read_csv(link, year) for link in get_link_list(data)]
    wide = pd.concat(series_per_link, axis=1)
    return melt(wide)

In [74]:
def get_station_list(id_list):
    """List the station code embedded in every download link of the given datasets.

    Parameters
    ----------
    id_list : iterable of str
        Dataset slugs passed one by one to ``get_link_list``.

    Returns
    -------
    list of str
        The second underscore-separated token of each link's file name,
        in link order; duplicates are kept.
    """
    stations = []
    for dataset_id in id_list:
        for link in get_link_list(dataset_id):
            file_name = link.split("/")[-1]
            stations.append(file_name.split("_")[1])
    return stations

In [None]:
def get_data_series_list(id_list):
    """List the series code embedded in every download link of the given datasets.

    Parameters
    ----------
    id_list : iterable of str
        Dataset slugs passed one by one to ``get_link_list``.

    Returns
    -------
    list of str
        The third underscore-separated token of each link's file name,
        in link order; duplicates are kept.
    """
    series_codes = []
    for dataset_id in id_list:
        for link in get_link_list(dataset_id):
            file_name = link.split("/")[-1]
            series_codes.append(file_name.split("_")[2])
    return series_codes

In [11]:
def unique_sort(input_list):
    """Return the distinct items of ``input_list`` as a new ascending list.

    Parameters
    ----------
    input_list : iterable
        Hashable, mutually comparable items.

    Returns
    -------
    list
        Unique items in ascending order.
    """
    return sorted(set(input_list))

In [12]:
def get_series_station(series, station, year=0):
    """Read the full daily CSV of one series at one station from the ``weatherAPI/cis`` path.

    Parameters
    ----------
    series : str
        Series code used in the file name.
    station : str
        Station code used in both the directory and the file name.
    year : int, default 0
        Earliest year to keep, forwarded to ``read_csv``.

    Returns
    -------
    pandas.Series
        Whatever ``read_csv`` returns for the built URL.
    """
    return read_csv(
        f"https://data.weather.gov.hk/weatherAPI/cis/csvfile/{station}/ALL/daily_{station}_{series}_ALL.csv",
        year,
    )

In [101]:
def get_wind_station(series, station, year=0):
    """Read the full daily CSV of one wind series at one station from the ``cis`` path.

    The URL differs from ``get_series_station`` only in lacking the
    ``weatherAPI`` path segment.

    Parameters
    ----------
    series : str
        Series code used in the file name.
    station : str
        Station code used in both the directory and the file name.
    year : int, default 0
        Earliest year to keep, forwarded to ``read_csv``.

    Returns
    -------
    pandas.Series
        Whatever ``read_csv`` returns for the built URL.
    """
    return read_csv(
        f"https://data.weather.gov.hk/cis/csvfile/{station}/ALL/daily_{station}_{series}_ALL.csv",
        year,
    )

In [13]:
def merge_all_station_in_all_series(data_type_list, station_list, func, year=0):
    """Fetch every (series, station) combination and combine them into one long table.

    For each series, ``func`` is called once per station. The stations that
    succeed are concatenated column-wise and melted, and the per-series frames
    are then outer-joined with ``merge``.

    Parameters
    ----------
    data_type_list : iterable of str
        Series codes, passed as the first argument of ``func``.
    station_list : iterable of str
        Station codes, passed as the second argument of ``func``.
    func : callable
        ``func(data_type, station, year)`` returning a date-indexed Series.
    year : int, default 0
        Earliest year to keep, forwarded to ``func``.

    Returns
    -------
    pandas.DataFrame
        Outer join of the melted per-series frames.

    Raises
    ------
    ValueError
        If no combination at all could be fetched.
    """
    import warnings  # local import keeps this cell self-contained

    frames_per_series = []
    for data_type in data_type_list:
        station_series = []
        for station in station_list:
            try:
                result = func(data_type, station, year)
            except Exception:
                # Best effort: a failing combination is skipped (presumably because not every
                # station publishes every series). Catching Exception rather than everything
                # lets KeyboardInterrupt / SystemExit stop a long-running download loop.
                continue
            else:
                station_series.append(result)
        if not station_series:
            # pd.concat([]) would raise an opaque "No objects to concatenate"; skip the series instead.
            warnings.warn(f"No station returned data for series {data_type!r}; skipping it.")
            continue
        frames_per_series.append(melt(pd.concat(station_series, axis=1)))
    if not frames_per_series:
        raise ValueError("No data could be fetched for any requested series/station combination.")
    return merge(frames_per_series)

# All Combinations of Data Series and Stations

## Find All Stations

In [19]:
# Dataset slugs for the non-wind daily series; get_link_list appends each one to the
# "https://data.gov.hk/tc-data/dataset/hk-hko-rss-" URL prefix.
data_id_list = ["daily-maximum-mean-heat-index", 
                "daily-mean-amount-of-cloud", 
                "daily-mean-pressure",
                "daily-total-rainfall",
                "daily-mean-relative-humidity",
                "daily-global-solar-radiation",
                "daily-maximum-mean-uv-index",
                "daily-total-bright-sunshine"]

In [28]:
# Dataset slugs for the wind series. They are kept in a separate list because their
# CSV files are later fetched through get_wind_station, which uses a different URL path.
wind_id_list = ["daily-prevailing-wind-direction",
                "daily-mean-wind-speed"]

In [75]:
# Station codes parsed from the download links of the wind datasets (requests the dataset pages).
station_wind = get_station_list(wind_id_list)

In [76]:
# Station codes parsed from the download links of the non-wind datasets (requests the dataset pages).
station_data = get_station_list(data_id_list)

In [77]:
# Pool the scraped and the hard-coded station codes; duplicates are still present at this point.
station_all = station_data + station_temperature + station_wind + staion_other

In [78]:
# De-duplicate the pooled station codes and sort them alphabetically.
station_unique_sort = unique_sort(station_all)

In [81]:
station_unique_sort

['BHD',
 'CCB',
 'CCH',
 'CP1',
 'CPH',
 'CWB',
 'GI',
 'HKA',
 'HKO',
 'HKP',
 'HKS',
 'HPV',
 'JKB',
 'KLT',
 'KP',
 'KSC',
 'KTG',
 'LAM',
 'LFS',
 'NGP',
 'NP',
 'PEN',
 'PLC',
 'SC',
 'SE',
 'SE1',
 'SEK',
 'SF',
 'SHA',
 'SHL',
 'SKG',
 'SKW',
 'SSH',
 'SSP',
 'STY',
 'TC',
 'TKL',
 'TME',
 'TMS',
 'TPK',
 'TPO',
 'TU1',
 'TUN',
 'TW',
 'TWN',
 'TY1',
 'TYW',
 'VP1',
 'WGL',
 'WLP',
 'WTS',
 'YCT']

## Find All Data Series

In [96]:
# Distinct series codes parsed from the download links of the non-wind datasets.
data_series = unique_sort(get_data_series_list(data_id_list))

In [97]:
# Distinct series codes parsed from the download links of the wind datasets.
wind_series = unique_sort(get_data_series_list(wind_id_list))

In [98]:
data_series

['CLD', 'GSR', 'MAXHKHI', 'MAXUV', 'MEANHKHI', 'MSLP', 'RF', 'RH', 'SUN', 'UV']

In [99]:
wind_series

['PDIR', 'WSPD']

In [106]:
data_type_list_temperature

['CLMTEMP', 'CLMMAXT', 'CLMMINT']

## Combine All Series across All Stations

In [100]:
# Try every (series, station) pair for the non-wind series and keep rows from 2014 onwards.
# Pairs whose download or parsing fails are skipped inside merge_all_station_in_all_series.
df_data = merge_all_station_in_all_series(data_series, station_unique_sort, get_series_station, 2014)

In [102]:
# Same procedure for the wind series, fetched through the wind-specific URL builder.
df_wind = merge_all_station_in_all_series(wind_series, station_unique_sort, get_wind_station, 2014)

In [103]:
# Same procedure for the temperature series, fetched through the open-data API via temperature().
df_temp = merge_all_station_in_all_series(data_type_list_temperature, station_unique_sort, temperature, 2014)

In [104]:
df_data

Unnamed: 0,date,region,Mean Amount of Cloud (%),Global Solar Radiation (MJ/m&sup2;),Max HKHI,Max UV Indices(15-minute average),Mean HKHI,Mean Pressure (hPa),Total Rainfall (mm),Mean Relative Humidity (%),Total Bright Sunshine (hours),Mean UV Indices(7 a.m. to 6 p.m.)
0,2014-01-01,Cheung Chau,,,,,,1019.0,0.0,50.0,,
1,2014-01-01,Ching Pak House(Tsing Yi),,,,,,,0.0,41.0,,
2,2014-01-01,Clear Water Bay,,,,,,,,,,
3,2014-01-01,Green Island,,,,,,,0.0,,,
4,2014-01-01,Happy Valley,,,,,,,0.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
155488,2024-11-30,Tuen Mun Children and Juvenile Home,,,,,,,0.0,47.0,,
155489,2024-11-30,Tuen Mun Government Offices,,,,,,,,,,
155490,2024-11-30,Waglan Island,,,,,,1017.7,0.0,57.0,,
155491,2024-11-30,Wetland Park,,,,,,1017.6,0.0,65.0,,


In [106]:
df_wind

Unnamed: 0,date,region,Prevailing Wind Direction (°),Mean Wind Speed (km/h)
0,2014-01-01,Bluff Head,80.0,9.7
1,2014-01-01,Central Pier,80.0,7.0
2,2014-01-01,Cheung Chau,10.0,13.7
3,2014-01-01,Cheung Chau Beach,350.0,9.0
4,2014-01-01,Ching Pak House(Tsing Yi),,
...,...,...,...,...
131566,2024-11-30,Tseung Kwan O,350.0,3.5
131567,2024-11-30,Tuen Mun Government Offices,340.0,4.7
131568,2024-11-30,Waglan Island,80.0,14.2
131569,2024-11-30,Wetland Park,170.0,1.8


In [105]:
df_temp

Unnamed: 0,date,region,Mean Temperature (°C),Maximum Temperature (°C),Minimum Temperature (°C)
0,2014-01-01,Bluff Head,16.3,20.8,13.7
1,2014-01-01,Cheung Chau,15.7,20.6,12.3
2,2014-01-01,Ching Pak House(Tsing Yi),16.4,21.1,12.8
3,2014-01-01,Clear Water Bay,,,
4,2014-01-01,Happy Valley,13.9,21.0,8.7
...,...,...,...,...,...
163462,2024-11-30,Tuen Mun Government Offices,,,
163463,2024-11-30,Waglan Island,19.3,23.4,16.9
163464,2024-11-30,Wetland Park,16.9,24.2,11.6
163465,2024-11-30,Wong Chuk Hang,17.2,23.2,13.1


In [107]:
# Outer-join the three tables on date and region, so a (date, region) row is kept
# even when it appears in only one of them.
data_all = merge([df_data, df_wind, df_temp])

In [108]:
data_all

Unnamed: 0,date,region,Mean Amount of Cloud (%),Global Solar Radiation (MJ/m&sup2;),Max HKHI,Max UV Indices(15-minute average),Mean HKHI,Mean Pressure (hPa),Total Rainfall (mm),Mean Relative Humidity (%),Total Bright Sunshine (hours),Mean UV Indices(7 a.m. to 6 p.m.),Prevailing Wind Direction (°),Mean Wind Speed (km/h),Mean Temperature (°C),Maximum Temperature (°C),Minimum Temperature (°C)
0,2014-01-01,Bluff Head,,,,,,,,,,,80.0,9.7,16.3,20.8,13.7
1,2014-01-01,Central Pier,,,,,,,,,,,80.0,7.0,,,
2,2014-01-01,Cheung Chau,,,,,,1019.0,0.0,50.0,,,10.0,13.7,15.7,20.6,12.3
3,2014-01-01,Cheung Chau Beach,,,,,,,,,,,350.0,9.0,,,
4,2014-01-01,Ching Pak House(Tsing Yi),,,,,,,0.0,41.0,,,,,16.4,21.1,12.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207319,2024-11-30,Tuen Mun Government Offices,,,,,,,,,,,340.0,4.7,,,
207320,2024-11-30,Waglan Island,,,,,,1017.7,0.0,57.0,,,80.0,14.2,19.3,23.4,16.9
207321,2024-11-30,Wetland Park,,,,,,1017.6,0.0,65.0,,,170.0,1.8,16.9,24.2,11.6
207322,2024-11-30,Wong Chuk Hang,,,,,,,,62.0,,,220.0,2.7,17.2,23.2,13.1


## Final Formatting

In [109]:
# Select the columns in their final order, then sort rows by date and region
# and replace the index with a fresh 0..n-1 range.
data_final = data_all[['date', 
                       'region', 
                       'Mean HKHI', 
                       'Max HKHI', 
                       'Mean Amount of Cloud (%)', 
                       'Mean Pressure (hPa)', 
                       'Total Rainfall (mm)', 
                       'Mean Relative Humidity (%)', 
                       'Maximum Temperature (°C)', 
                       'Minimum Temperature (°C)', 
                       'Mean Temperature (°C)', 
                       'Global Solar Radiation (MJ/m&sup2;)', 
                       'Max UV Indices(15-minute average)', 
                       'Mean UV Indices(7 a.m. to 6 p.m.)', 
                       'Total Bright Sunshine (hours)',
                       'Prevailing Wind Direction (°)',
                       'Mean Wind Speed (km/h)'
                       ]].sort_values(["date", "region"]).reset_index(drop=True)

In [110]:
data_final

Unnamed: 0,date,region,Mean HKHI,Max HKHI,Mean Amount of Cloud (%),Mean Pressure (hPa),Total Rainfall (mm),Mean Relative Humidity (%),Maximum Temperature (°C),Minimum Temperature (°C),Mean Temperature (°C),Global Solar Radiation (MJ/m&sup2;),Max UV Indices(15-minute average),Mean UV Indices(7 a.m. to 6 p.m.),Total Bright Sunshine (hours),Prevailing Wind Direction (°),Mean Wind Speed (km/h)
0,2014-01-01,Bluff Head,,,,,,,20.8,13.7,16.3,,,,,80.0,9.7
1,2014-01-01,Central Pier,,,,,,,,,,,,,,80.0,7.0
2,2014-01-01,Cheung Chau,,,,1019.0,0.0,50.0,20.6,12.3,15.7,,,,,10.0,13.7
3,2014-01-01,Cheung Chau Beach,,,,,,,,,,,,,,350.0,9.0
4,2014-01-01,Ching Pak House(Tsing Yi),,,,,0.0,41.0,21.1,12.8,16.4,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207319,2024-11-30,Tuen Mun Government Offices,,,,,,,,,,,,,,340.0,4.7
207320,2024-11-30,Waglan Island,,,,1017.7,0.0,57.0,23.4,16.9,19.3,,,,,80.0,14.2
207321,2024-11-30,Wetland Park,,,,1017.6,0.0,65.0,24.2,11.6,16.9,,,,,170.0,1.8
207322,2024-11-30,Wong Chuk Hang,,,,,,62.0,23.2,13.1,17.2,,,,,220.0,2.7


## Save as CSV

In [111]:
# Write the final table to the working directory. to_csv is called with its defaults,
# so the row index is written as well, as an unnamed first column.
# NOTE(review): pass index=False if consumers do not need that column — confirm before changing the file format.
data_final.to_csv("data_final.csv")