# Library Importation

In [1]:
# data manipulation
import numpy as np
import pandas as pd
from functools import reduce
# web scraping
import requests
from bs4 import BeautifulSoup
# ssl certification
import ssl

# Constants and Function Definitions

In [2]:
# SSL: replace the ssl module's default HTTPS context factory with the unverified one.
# NOTE(review): this disables TLS certificate verification for every caller that relies on
# the default context in this process — presumably needed so the CSV downloads below work
# on machines with a broken CA bundle; prefer fixing the certificates instead.
ssl._create_default_https_context = ssl._create_unverified_context
# pre-defined keyword arguments for pd.read_csv
arg_read_csv = {'header':1, 'skipfooter':3, 'engine':'python'}
# pre-defined join keys for pd.merge
col_merge = ['date', 'region']
# 3 series for temperature (Mean, Max, Min)
data_type_list_temperature = ['CLMTEMP', 'CLMMAXT', 'CLMMINT']
# known stations for temperature
station_temperature = ['CCH','CWB','HKA','HKO','HKP','HKS','HPV','JKB','KLT','KP','KSC','KTG','LFS','NGP','PEN','PLC','SE1','SEK','SHA','SKG','SKW','SSH','SSP','STY','TC','TKL','TMS','TPO','TU1','TW','TWN','TY1','TYW','VP1','WGL','WLP','WTS','YCT']
# known stations for series other than temperature
station_other = ['BHD','CCB','CCH','CPH','CP1','CWB','GI','HKA','HKO','HKP','HKS','HPV','JKB','KLT','KP','KSC','KTG','LAM','LFS','NGP','NP','PEN','PLC','SC','SE','SE1','SEK','SF','SHA','SHL','SKG','SKW','SSH','SSP','STY','TC','TKL','TME','TMS','TPK','TPO','TUN','TU1','TW','TWN','TY1','TYW','VP1','WGL','WLP','WTS','YCT']
# backward-compatible alias: the misspelled name is still referenced later in the notebook
staion_other = station_other

In [3]:
def get_link_list(data, timeout=30):
  """Return the "ALL" CSV download links listed on a data.gov.hk dataset page.

  Used for every data series except temperature.

  Parameters
  ----------
  data : str
      Dataset id; it is appended to ``hk-hko-rss-`` to form the page URL.
  timeout : float, optional
      Seconds to wait for the server before giving up. Without a timeout a
      stalled connection would block the notebook indefinitely.

  Returns
  -------
  list of str
      ``href`` values, in page order, of the download buttons whose URL
      contains ``"ALL"``.

  Raises
  ------
  requests.HTTPError
      If the dataset page answers with an HTTP error status.
  """
  response = requests.get(f"https://data.gov.hk/tc-data/dataset/hk-hko-rss-{data}", timeout=timeout)
  # fail loudly instead of scraping an error page and silently returning no links
  response.raise_for_status()
  soup = BeautifulSoup(response.text, 'html.parser')
  buttons = soup.find_all('a', "dataset-details__list-item-download btn btn--light")
  # tag.get returns a non-string when the anchor has no href -> keep string links only
  return [link for tag in buttons if isinstance(link := tag.get("href"), str) and "ALL" in link]

In [4]:
def digit_or_null(value):
    """Convert ``value`` to ``float``, or return NaN when it is not numeric.

    Parameters
    ----------
    value : object
        Anything ``float()`` might accept (number, numeric string, ...).

    Returns
    -------
    float
        ``float(value)`` on success, ``np.nan`` when the conversion fails.
    """
    try:
        return float(value)
    except (TypeError, ValueError, OverflowError):
        # only conversion failures mean "no data"; KeyboardInterrupt and
        # SystemExit must keep propagating so a long run can be stopped
        return np.nan

In [5]:
def read_csv(link, year=0):
  """Load one non-temperature CSV and return its values as a date-indexed Series.

  Parameters
  ----------
  link : str
      Location of the CSV, passed straight to ``pd.read_csv`` together with
      the shared ``arg_read_csv`` options.
  year : int, optional
      Only rows whose date falls in this year or later are kept.

  Returns
  -------
  pandas.Series
      Indexed by ``date`` and named after the file's first column header;
      each value is passed through ``digit_or_null``.
  """
  raw = pd.read_csv(link, **arg_read_csv)
  series_name = raw.columns[0]
  # assumes the data rows carry more fields than the header row, so pandas exposes
  # year/month/day/value as index levels level_0..level_3 — TODO confirm against the feed
  records = (
      raw.reset_index()
         .iloc[1:]  # first row is skipped; presumably a secondary header line
         .dropna(subset='level_3')
         .rename(columns={'level_0': 'year', 'level_1': 'month', 'level_2': 'day', 'level_3': series_name, series_name:'completeness'})
  )
  # 29 Feb 1900 is not a calendar date and cannot be converted by pd.to_datetime
  impossible_day = (records['year'] == '1900') & (records['month'] == '2') & (records['day'] == '29')
  records = records[~impossible_day].copy()
  records['date'] = pd.to_datetime(records[['year','month','day']])
  dated = records.set_index('date')
  recent = dated[dated.index.year >= year]
  return recent[series_name].apply(digit_or_null)

In [6]:
def temperature(data_type, station, year=0):
  """Download one temperature series for one station as a date-indexed Series.

  Parameters
  ----------
  data_type : str
      Value of the ``dataType`` query parameter (e.g. an entry of
      ``data_type_list_temperature``).
  station : str
      Value of the ``station`` query parameter.
  year : int, optional
      Only rows whose date falls in this year or later are kept.

  Returns
  -------
  pandas.Series
      Indexed by ``date`` and named after the file's first column header;
      each value is passed through ``digit_or_null``.
  """
  url = f"https://data.weather.gov.hk/weatherAPI/opendata/opendata.php?dataType={data_type}&rformat=csv&station={station}"
  raw = pd.read_csv(url, sep=None, header=1, skipfooter=3, engine='python')
  series_name = raw.columns[0]
  # every record sits in the first column as one comma-joined string; the first row is skipped
  # (presumably a secondary header line — TODO confirm against the feed)
  fields = raw.iloc[1:, 0].str.split(",", expand=True)
  records = fields.rename(columns={0: 'year', 1: 'month', 2: 'day', 3: series_name, 4:'completeness'})
  # 29 Feb 1900 is not a calendar date and cannot be converted by pd.to_datetime
  impossible_day = (records['year'] == '1900') & (records['month'] == '2') & (records['day'] == '29')
  records = records[~impossible_day].reset_index().copy()
  records['date'] = pd.to_datetime(records[['year','month','day']])
  dated = records.set_index('date')
  recent = dated[dated.index.year >= year]
  return recent[series_name].apply(digit_or_null)

In [7]:
def melt(df):
  """Reshape a one-column-per-station table into a long 3-column frame.

  Column titles are expected in one of two forms:
  ``"<series> - <station>"`` or ``"Daily <series> at the <station>"``.

  Parameters
  ----------
  df : pandas.DataFrame or pandas.Series
      Indexed by ``date``; each column (or the Series name) is a title in
      one of the forms above.

  Returns
  -------
  pandas.DataFrame
      Columns ``date``, ``region`` (station, trailing whitespace removed)
      and one value column named after the most frequent series title.
  """
  if isinstance(df, pd.Series):
    df = df.to_frame()
  df_melt = pd.melt(df.reset_index(), id_vars=['date'])
  # Branch on the "- " delimiter that is actually split on. Testing for a bare "-"
  # sent hyphenated "at the" titles down the wrong branch, where they raised IndexError.
  df_melt['region'] = df_melt['variable'].apply(
      lambda x: x.split("- ")[1] if "- " in x else x.split("at the ")[1]).str.rstrip()
  df_melt['type'] = df_melt['variable'].apply(
      lambda x: x.split(" -")[0] if "- " in x else x.split(" at the")[0].split("Daily ")[1])
  col_name = df_melt['type'].value_counts().index[0]
  return df_melt[['date', 'region', 'value']].rename(columns={'value':col_name})

In [8]:
def merge(list_df):
    """Outer-join a sequence of frames, left to right, on the shared ``col_merge`` keys.

    Parameters
    ----------
    list_df : iterable of pandas.DataFrame
        Frames that each contain the ``col_merge`` columns.

    Returns
    -------
    pandas.DataFrame
        The accumulated outer join; a single-element input is returned as is.
    """
    def _outer_join(left, right):
        # keep every (date, region) pair that appears on either side
        return pd.merge(left, right, on=col_merge, how='outer')

    return reduce(_outer_join, list_df)

In [11]:
def get_station_list(id_list):
    """Collect the station code of every download link found for the given dataset ids.

    Parameters
    ----------
    id_list : iterable of str
        Dataset ids, each handed to ``get_link_list``.

    Returns
    -------
    list of str
        One entry per link (duplicates are kept): the second ``_``-separated
        token of the link's file name.
    """
    stations = []
    for dataset_id in id_list:
        for link in get_link_list(dataset_id):
            file_name = link.split("/")[-1]
            stations.append(file_name.split("_")[1])
    return stations

In [12]:
def get_data_series_list(id_list):
    """Collect the series code of every download link found for the given dataset ids.

    One dataset id may expose more than one series
    (e.g. daily-maximum-mean-heat-index).

    Parameters
    ----------
    id_list : iterable of str
        Dataset ids, each handed to ``get_link_list``.

    Returns
    -------
    list of str
        One entry per link (duplicates are kept): the third ``_``-separated
        token of the link's file name.
    """
    series_codes = []
    for dataset_id in id_list:
        for link in get_link_list(dataset_id):
            file_name = link.split("/")[-1]
            series_codes.append(file_name.split("_")[2])
    return series_codes

In [13]:
def unique_sort(input_list):
    """Return the distinct values of ``input_list`` as a new list in ascending order."""
    return sorted(set(input_list))

In [14]:
def get_series_station(series, station, year=0):
    """Load one non-wind series for one station through ``read_csv``.

    Parameters
    ----------
    series : str
        Series code used in the CSV file name.
    station : str
        Station code used in both the directory and the file name.
    year : int, optional
        Forwarded to ``read_csv`` as its ``year`` argument.

    Returns
    -------
    Whatever ``read_csv`` returns for the constructed URL.
    """
    return read_csv(
        f"https://data.weather.gov.hk/weatherAPI/cis/csvfile/{station}/ALL/daily_{station}_{series}_ALL.csv",
        year,
    )

In [15]:
def get_wind_station(series, station, year=0):
    """Load one wind series for one station through ``read_csv``.

    The URL has no ``weatherAPI/`` path segment, unlike the one built by
    ``get_series_station``.

    Parameters
    ----------
    series : str
        Series code used in the CSV file name.
    station : str
        Station code used in both the directory and the file name.
    year : int, optional
        Forwarded to ``read_csv`` as its ``year`` argument.

    Returns
    -------
    Whatever ``read_csv`` returns for the constructed URL.
    """
    return read_csv(
        f"https://data.weather.gov.hk/cis/csvfile/{station}/ALL/daily_{station}_{series}_ALL.csv",
        year,
    )

In [16]:
def merge_all_station_in_all_series(data_type_list, station_list, func, year=0):
    """Fetch every (series, station) pair and combine the results into one frame.

    For each series, ``func`` is tried for every station; the results that
    load are concatenated column-wise and reshaped with ``melt``. The
    per-series frames are then combined with ``merge``.

    Parameters
    ----------
    data_type_list : iterable of str
        Series codes, passed as the first argument of ``func``.
    station_list : iterable of str
        Station codes, passed as the second argument of ``func``.
    func : callable
        ``func(data_type, station, year)`` returning a date-indexed Series.
    year : int, optional
        Forwarded unchanged to ``func``.

    Returns
    -------
    Whatever ``merge`` returns for the list of per-series frames.

    Raises
    ------
    ValueError
        If no station returned data for one of the series.
    """
    series_all_station_all = []
    for data_type in data_type_list:
        series_one_station_all = []
        for station in station_list:
            try:
                result = func(data_type, station, year)
            except Exception:
                # Best effort: not every station publishes every series, so a failed
                # load just skips the pair. Only Exception is caught, so
                # KeyboardInterrupt/SystemExit can still stop this long loop.
                continue
            series_one_station_all.append(result)
        if not series_one_station_all:
            # pd.concat would raise a ValueError here anyway; say which series failed
            raise ValueError(f"no station returned data for series {data_type!r}")
        series_all_station_all.append(melt(pd.concat(series_one_station_all, axis=1)))
    return merge(series_all_station_all)

# All Combinations of Data Series and Stations

## Find All Stations

In [17]:
# dataset ids of the non-wind series; each is looked up on data.gov.hk by get_link_list
data_id_list = [
    "daily-maximum-mean-heat-index",
    "daily-mean-amount-of-cloud",
    "daily-mean-pressure",
    "daily-total-rainfall",
    "daily-mean-relative-humidity",
    "daily-global-solar-radiation",
    "daily-maximum-mean-uv-index",
    "daily-total-bright-sunshine",
]

In [18]:
# dataset ids of the wind series; kept apart because they are fetched with get_wind_station
wind_id_list = [
    "daily-prevailing-wind-direction",
    "daily-mean-wind-speed",
]

In [19]:
# station codes taken from the wind download links (one entry per link, so codes repeat)
station_wind = get_station_list(wind_id_list)

In [20]:
# station codes taken from the non-wind download links (one entry per link, so codes repeat)
station_data = get_station_list(data_id_list)

In [21]:
# pool the scraped station codes with the hard-coded lists; duplicates are still present here
station_all = station_data + station_temperature + station_wind + staion_other

In [22]:
# de-duplicated station codes in ascending order
station_unique_sort = unique_sort(station_all)

## Find All Data Series

In [23]:
# distinct non-wind series codes found in the download file names
data_series = unique_sort(get_data_series_list(data_id_list))

In [24]:
# distinct wind series codes found in the download file names
wind_series = unique_sort(get_data_series_list(wind_id_list))

## Combine All Series for All Stations

In [25]:
# try every (non-wind series, station) pair from 2014 onwards; pairs that fail to load are skipped
df_data = merge_all_station_in_all_series(data_series, station_unique_sort, get_series_station, 2014)

In [26]:
# try every (wind series, station) pair from 2014 onwards; pairs that fail to load are skipped
df_wind = merge_all_station_in_all_series(wind_series, station_unique_sort, get_wind_station, 2014)

In [27]:
# try every (temperature series, station) pair from 2014 onwards; pairs that fail to load are skipped
df_temp = merge_all_station_in_all_series(data_type_list_temperature, station_unique_sort, temperature, 2014)

In [28]:
# outer-join the three groups of series on date and region
data_all = merge([df_data, df_wind, df_temp])

## Final Formatting

In [29]:
# fix the column order, then order rows by date and region with a fresh 0..n-1 index
data_final = (
    data_all[[
        'date',
        'region',
        'Mean HKHI',
        'Max HKHI',
        'Mean Amount of Cloud (%)',
        'Mean Pressure (hPa)',
        'Total Rainfall (mm)',
        'Mean Relative Humidity (%)',
        'Maximum Temperature (°C)',
        'Minimum Temperature (°C)',
        'Mean Temperature (°C)',
        'Global Solar Radiation (MJ/m&sup2;)',
        'Max UV Indices(15-minute average)',
        'Mean UV Indices(7 a.m. to 6 p.m.)',
        'Total Bright Sunshine (hours)',
        'Prevailing Wind Direction (°)',
        'Mean Wind Speed (km/h)',
    ]]
    .sort_values(["date", "region"])
    .reset_index(drop=True)
)

In [30]:
# show the combined table as the cell output
data_final

Unnamed: 0,date,region,Mean HKHI,Max HKHI,Mean Amount of Cloud (%),Mean Pressure (hPa),Total Rainfall (mm),Mean Relative Humidity (%),Maximum Temperature (°C),Minimum Temperature (°C),Mean Temperature (°C),Global Solar Radiation (MJ/m&sup2;),Max UV Indices(15-minute average),Mean UV Indices(7 a.m. to 6 p.m.),Total Bright Sunshine (hours),Prevailing Wind Direction (°),Mean Wind Speed (km/h)
0,2014-01-01,Bluff Head,,,,,,,20.8,13.7,16.3,,,,,80.0,9.7
1,2014-01-01,Central Pier,,,,,,,,,,,,,,80.0,7.0
2,2014-01-01,Cheung Chau,,,,1019.0,0.0,50.0,20.6,12.3,15.7,,,,,10.0,13.7
3,2014-01-01,Cheung Chau Beach,,,,,,,,,,,,,,350.0,9.0
4,2014-01-01,Ching Pak House(Tsing Yi),,,,,0.0,41.0,21.1,12.8,16.4,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
208931,2024-12-31,Tuen Mun Government Offices,,,,,,,,,,,,,,20.0,7.5
208932,2024-12-31,Waglan Island,,,,1019.2,0.0,64.0,23.8,17.0,19.0,,,,,350.0,14.8
208933,2024-12-31,Wetland Park,,,,1019.3,0.0,60.0,25.8,13.7,19.0,,,,,40.0,1.5
208934,2024-12-31,Wong Chuk Hang,,,,,,55.0,23.7,14.2,20.0,,,,,130.0,5.3


## Save as csv

In [31]:
# Uncomment the next line to write the combined table to disk.
# data_final.to_csv("data_final.csv")