# Library Importation

In [1]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from functools import reduce
import ssl

# Constant and function definition

In [95]:
ssl._create_default_https_context = ssl._create_unverified_context
arg_read_csv = {'header':1, 'skipfooter':3, 'engine':'python'}
col_merge = ['date', 'region']
data_type_list_temperature = ['CLMTEMP', 'CLMMAXT', 'CLMMINT']
station_temperature = ['CCH','CWB','HKA','HKO','HKP','HKS','HPV','JKB','KLT','KP','KSC','KTG','LFS','NGP','PEN','PLC','SE1','SEK','SHA','SKG','SKW','SSH','SSP','STY','TC','TKL','TMS','TPO','TU1','TW','TWN','TY1','TYW','VP1','WGL','WLP','WTS','YCT']
staion_other = ['BHD','CCB','CCH','CPH','CP1','CWB','GI','HKA','HKO','HKP','HKS','HPV','JKB','KLT','KP','KSC','KTG','LAM','LFS','NGP','NP','PEN','PLC','SC','SE','SE1','SEK','SF','SHA','SHL','SKG','SKW','SSH','SSP','STY','TC','TKL','TME','TMS','TPK','TPO','TUN','TU1','TW','TWN','TY1','TYW','VP1','WGL','WLP','WTS','YCT']

In [3]:
def get_link_list(data):
  response = requests.get(f"https://data.gov.hk/tc-data/dataset/hk-hko-rss-{data}")
  soup = BeautifulSoup(response.text, 'html.parser')
  result = soup.find_all('a', "dataset-details__list-item-download btn btn--light")
  link_list = []
  for tag in result:
    if isinstance(link:=tag.get("href"), str) and "ALL" in link:
      link_list.append(link)
  return link_list

In [4]:
def digit_or_null(value):
    try:
        float(value)
    except:
        return np.nan
    else:
        return float(value)

In [5]:
def read_csv(link, year=0):
  df = pd.read_csv(link, **arg_read_csv)
  col_name = df.columns[0]
  df_temp = df.reset_index().iloc[1:].dropna(subset='level_3')
  df_rename = df_temp.rename(columns={'level_0': 'year', 'level_1': 'month', 'level_2': 'day', 'level_3': col_name, col_name:'completeness'})
  df_valid_data = df_rename[(df_rename['year'] != '1900') | (df_rename['month'] != '2') | (df_rename['day'] != '29')]
  df_valid_data = df_valid_data.copy()
  df_valid_data['date'] = pd.to_datetime(df_valid_data[['year','month','day']])
  df_date = df_valid_data.set_index('date')
  df_date_filter = df_date[df_date.index.year >= year]
  df_final = df_date_filter[col_name].apply(lambda x: digit_or_null(x))
  return df_final

In [6]:
def temperature(data_type, station, year=0):
  df = pd.read_csv(f"https://data.weather.gov.hk/weatherAPI/opendata/opendata.php?dataType={data_type}&rformat=csv&station={station}",
            engine='python',
            header=1,
            skipfooter=3,
            sep=None)
  col_name = df.columns[0]
  df_temp = df.iloc[1:,0].str.split(",", expand=True)
  df_rename = df_temp.rename(columns={0: 'year', 1: 'month', 2: 'day', 3: col_name, 4:'completeness'})
  df_valid_data = df_rename[(df_rename['year'] != '1900') | (df_rename['month'] != '2') | (df_rename['day'] != '29')].reset_index()
  df_valid_data = df_valid_data.copy()
  df_valid_data['date'] = pd.to_datetime(df_valid_data[['year','month','day']])
  df_date = df_valid_data.set_index('date')
  df_date_filter = df_date[df_date.index.year >= year]
  df_final = df_date_filter[col_name].apply(lambda x: digit_or_null(x))
  return df_final

In [7]:
def melt(df):
  if isinstance(df, pd.Series):
    df = df.copy().to_frame()
  df_melt = pd.melt(df.reset_index(), id_vars=['date'])
  df_melt['region'] = df_melt['variable'].apply(lambda x: x.split("- ")[1] if "-" in x else x.split("at the ")[1]).str.rstrip()
  df_melt['type'] = df_melt['variable'].apply(lambda x: x.split(" -")[0] if "-" in x else x.split(" at the")[0].split("Daily ")[1])
  col_name = df_melt['type'].value_counts().index[0]
  return df_melt[['date', 'region', 'value']].rename(columns={'value':col_name})

In [8]:
def merge(list_df): 
    return reduce(lambda  left, right: pd.merge(left, right, on=col_merge, how='outer'), list_df)

In [9]:
def melt_merge(data, year=0):   
    return merge([melt(read_csv(link_csv,year)) for link_csv in get_link_list(data)])

In [10]:
def concat_melt(data,year=0):
    return melt(pd.concat([read_csv(link,year) for link in get_link_list(data)], axis=1))

In [None]:
def unique_sort(input_list):
    unique = set(input_list)
    sorted = list(unique)
    sorted.sort()
    return sorted

In [None]:
def get_series_station(series, station, year=0):
    link = f"https://data.weather.gov.hk/weatherAPI/cis/csvfile/{station}/ALL/daily_{station}_{series}_ALL.csv"
    return read_csv(link, year)

In [None]:
def merge_all_station_in_all_series(data_type_list, station_list, func, year=0):
    series_all_station_all = []
    series_one_station_all = []
    for data_type in data_type_list:
        for station in station_list:
            try:
                result = func(data_type, station, year)
            except:
                pass
            else:
                series_one_station_all.append(result)
        series_all_station_all.append(melt(pd.concat(series_one_station_all, axis=1)))
        series_one_station_all = []
    return merge(series_all_station_all)

# Data Series through Web Scraping

## Daily Maximum Mean Heat Index

In [11]:
daily_maximum_mean_heat_index = melt_merge("daily-maximum-mean-heat-index", 2014)

In [12]:
daily_maximum_mean_heat_index

Unnamed: 0,date,region,Mean HKHI,Max HKHI
0,2014-05-30,King's Park,28.3,30.1
1,2014-05-31,King's Park,28.6,30.3
2,2014-06-01,King's Park,29.0,30.2
3,2014-06-02,King's Park,28.5,29.8
4,2014-06-03,King's Park,27.9,30.1
...,...,...,...,...
3803,2024-10-27,King's Park,24.6,25.9
3804,2024-10-28,King's Park,21.8,23.3
3805,2024-10-29,King's Park,21.9,23.9
3806,2024-10-30,King's Park,23.2,25.4


## Daily Mean Amount Of Cloud

In [13]:
daily_mean_amount_of_cloud = melt_merge("daily-mean-amount-of-cloud", 2014)

In [14]:
daily_mean_amount_of_cloud

Unnamed: 0,date,region,Mean Amount of Cloud (%)
0,2014-01-01,Hong Kong Observatory,0.0
1,2014-01-02,Hong Kong Observatory,49.0
2,2014-01-03,Hong Kong Observatory,28.0
3,2014-01-04,Hong Kong Observatory,0.0
4,2014-01-05,Hong Kong Observatory,2.0
...,...,...,...
3982,2024-11-26,Hong Kong Observatory,45.0
3983,2024-11-27,Hong Kong Observatory,45.0
3984,2024-11-28,Hong Kong Observatory,40.0
3985,2024-11-29,Hong Kong Observatory,32.0


## Daily Mean Pressure

In [15]:
daily_mean_pressure = concat_melt("daily-mean-pressure", 2014)

In [16]:
daily_mean_pressure

Unnamed: 0,date,region,Mean Pressure (hPa)
0,2014-01-01,Hong Kong International Airport,1019.3
1,2014-01-02,Hong Kong International Airport,1016.6
2,2014-01-03,Hong Kong International Airport,1015.9
3,2014-01-04,Hong Kong International Airport,1018.0
4,2014-01-05,Hong Kong International Airport,1018.5
...,...,...,...
47839,2024-11-26,Wetland Park,1019.6
47840,2024-11-27,Wetland Park,1021.3
47841,2024-11-28,Wetland Park,1022.2
47842,2024-11-29,Wetland Park,1021.0


## Daily Total Rainfall

In [17]:
daily_total_rainfall = concat_melt("daily-total-rainfall", 2014)

In [18]:
daily_total_rainfall

Unnamed: 0,date,region,Total Rainfall (mm)
0,2014-01-01,Hong Kong International Airport,0.0
1,2014-01-02,Hong Kong International Airport,0.0
2,2014-01-03,Hong Kong International Airport,0.0
3,2014-01-04,Hong Kong International Airport,0.0
4,2014-01-05,Hong Kong International Airport,0.0
...,...,...,...
99670,2024-11-26,Wetland Park,2.0
99671,2024-11-27,Wetland Park,0.0
99672,2024-11-28,Wetland Park,0.0
99673,2024-11-29,Wetland Park,0.0


## Daily Mean Relative Humidity

In [19]:
daily_mean_relative_humidity = concat_melt("daily-mean-relative-humidity", 2014)

In [20]:
# Missed New Tsing Yi Station
daily_mean_relative_humidity

Unnamed: 0,date,region,Mean Relative Humidity (%)
0,2014-01-01,Hong Kong International Airport,45.0
1,2014-01-02,Hong Kong International Airport,67.0
2,2014-01-03,Hong Kong International Airport,63.0
3,2014-01-04,Hong Kong International Airport,46.0
4,2014-01-05,Hong Kong International Airport,48.0
...,...,...,...
95683,2024-11-26,Wong Chuk Hang,60.0
95684,2024-11-27,Wong Chuk Hang,44.0
95685,2024-11-28,Wong Chuk Hang,33.0
95686,2024-11-29,Wong Chuk Hang,36.0


## Daily Maximum, Mean and Minimum Temperatures

In [21]:
daily_temperature = merge([melt(pd.concat([temperature(data_type, station, 2014) for station in station_temperature], axis=1)) for data_type in data_type_list_temperature])

In [22]:
daily_temperature

Unnamed: 0,date,region,Mean Temperature (°C),Maximum Temperature (°C),Minimum Temperature (°C)
0,2014-01-01,Cheung Chau,15.7,20.6,12.3
1,2014-01-01,Clear Water Bay,,,
2,2014-01-01,Happy Valley,13.9,21.0,8.7
3,2014-01-01,Hong Kong International Airport,16.1,19.5,11.5
4,2014-01-01,Hong Kong Observatory,15.7,18.9,13.0
...,...,...,...,...,...
151501,2024-11-30,Tuen Mun Children and Juvenile Home,18.1,23.7,14.7
151502,2024-11-30,Waglan Island,19.3,23.4,16.9
151503,2024-11-30,Wetland Park,16.9,24.2,11.6
151504,2024-11-30,Wong Chuk Hang,17.2,23.2,13.1


## Daily Global Solar Radiation

In [23]:
daily_global_solar_radiation = concat_melt("daily-global-solar-radiation", 2014)

In [24]:
daily_global_solar_radiation

Unnamed: 0,date,region,Global Solar Radiation (MJ/m&sup2;)
0,2014-01-01,King's Park,16.11
1,2014-01-02,King's Park,11.77
2,2014-01-03,King's Park,13.92
3,2014-01-04,King's Park,13.49
4,2014-01-05,King's Park,14.17
...,...,...,...
7969,2024-11-26,Kau Sai Chau,17.80
7970,2024-11-27,Kau Sai Chau,17.41
7971,2024-11-28,Kau Sai Chau,18.00
7972,2024-11-29,Kau Sai Chau,18.22


## Daily Maximum Mean UV Index

In [25]:
daily_maximum_mean_uv_index = melt_merge("daily-maximum-mean-uv-index", 2014)

In [26]:
daily_maximum_mean_uv_index

Unnamed: 0,date,region,Mean UV Indices(7 a.m. to 6 p.m.),Max UV Indices(15-minute average)
0,2014-01-01,King's Park,2.0,5.0
1,2014-01-02,King's Park,2.0,5.0
2,2014-01-03,King's Park,2.0,4.0
3,2014-01-04,King's Park,2.0,4.0
4,2014-01-05,King's Park,2.0,4.0
...,...,...,...,...
3982,2024-11-26,King's Park,3.0,6.0
3983,2024-11-27,King's Park,2.0,5.0
3984,2024-11-28,King's Park,3.0,6.0
3985,2024-11-29,King's Park,3.0,6.0


## Daily Total Bright Sunshine

In [27]:
daily_total_bright_sunshine = melt_merge("daily-total-bright-sunshine", 2014)

In [28]:
daily_total_bright_sunshine

Unnamed: 0,date,region,Total Bright Sunshine (hours)
0,2014-01-01,King's Park,9.5
1,2014-01-02,King's Park,4.5
2,2014-01-03,King's Park,8.6
3,2014-01-04,King's Park,8.8
4,2014-01-05,King's Park,9.0
...,...,...,...
3982,2024-11-26,King's Park,9.8
3983,2024-11-27,King's Park,9.8
3984,2024-11-28,King's Park,9.8
3985,2024-11-29,King's Park,9.8


## Combine All

In [29]:
data_list = [daily_maximum_mean_heat_index, 
             daily_mean_amount_of_cloud, 
             daily_mean_pressure, 
             daily_total_rainfall, 
             daily_mean_relative_humidity, 
             daily_temperature, 
             daily_global_solar_radiation, 
             daily_maximum_mean_uv_index, 
             daily_total_bright_sunshine]

In [30]:
data = merge(data_list)

In [31]:
data

Unnamed: 0,date,region,Mean HKHI,Max HKHI,Mean Amount of Cloud (%),Mean Pressure (hPa),Total Rainfall (mm),Mean Relative Humidity (%),Mean Temperature (°C),Maximum Temperature (°C),Minimum Temperature (°C),Global Solar Radiation (MJ/m&sup2;),Mean UV Indices(7 a.m. to 6 p.m.),Max UV Indices(15-minute average),Total Bright Sunshine (hours)
0,2014-01-01,Cheung Chau,,,,1019.0,0.0,50.0,15.7,20.6,12.3,,,,
1,2014-01-01,Ching Pak House(Tsing Yi),,,,,0.0,,,,,,,,
2,2014-01-01,Clear Water Bay,,,,,,,,,,,,,
3,2014-01-01,Happy Valley,,,,,,,13.9,21.0,8.7,,,,
4,2014-01-01,Hong Kong International Airport,,,,1019.3,0.0,45.0,16.1,19.5,11.5,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159475,2024-11-30,Tuen Mun Children and Juvenile Home,,,,,0.0,47.0,18.1,23.7,14.7,,,,
159476,2024-11-30,Waglan Island,,,,1017.7,0.0,57.0,19.3,23.4,16.9,,,,
159477,2024-11-30,Wetland Park,,,,1017.6,0.0,65.0,16.9,24.2,11.6,,,,
159478,2024-11-30,Wong Chuk Hang,,,,,,62.0,17.2,23.2,13.1,,,,


In [32]:
# data.to_csv("data_v1.csv")

# All Combination between Data Series and Station

## Find All Station

In [33]:
data_id_list = ["daily-maximum-mean-heat-index", 
                "daily-mean-amount-of-cloud", 
                "daily-mean-pressure",
                "daily-total-rainfall",
                "daily-mean-relative-humidity",
                "daily-global-solar-radiation",
                "daily-maximum-mean-uv-index",
                "daily-total-bright-sunshine"]

In [34]:
data_link_list = [item for sublist in [get_link_list(data_id) for data_id in data_id_list] for item in sublist]

In [35]:
data_link_list

['https://data.weather.gov.hk/weatherAPI/cis/csvfile/KP/ALL/daily_KP_MEANHKHI_ALL.csv',
 'https://data.weather.gov.hk/weatherAPI/cis/csvfile/KP/ALL/daily_KP_MAXHKHI_ALL.csv',
 'https://data.weather.gov.hk/weatherAPI/cis/csvfile/HKO/ALL/daily_HKO_CLD_ALL.csv',
 'https://data.weather.gov.hk/weatherAPI/cis/csvfile/HKA/ALL/daily_HKA_MSLP_ALL.csv',
 'https://data.weather.gov.hk/weatherAPI/cis/csvfile/CCH/ALL/daily_CCH_MSLP_ALL.csv',
 'https://data.weather.gov.hk/weatherAPI/cis/csvfile/HKO/ALL/daily_HKO_MSLP_ALL.csv',
 'https://data.weather.gov.hk/weatherAPI/cis/csvfile/LFS/ALL/daily_LFS_MSLP_ALL.csv',
 'https://data.weather.gov.hk/weatherAPI/cis/csvfile/PEN/ALL/daily_PEN_MSLP_ALL.csv',
 'https://data.weather.gov.hk/weatherAPI/cis/csvfile/SHA/ALL/daily_SHA_MSLP_ALL.csv',
 'https://data.weather.gov.hk/weatherAPI/cis/csvfile/SEK/ALL/daily_SEK_MSLP_ALL.csv',
 'https://data.weather.gov.hk/weatherAPI/cis/csvfile/SSH/ALL/daily_SSH_MSLP_ALL.csv',
 'https://data.weather.gov.hk/weatherAPI/cis/csvfile

In [84]:
station_all = [data_link.split("/")[6] for data_link in data_link_list] + station_temperature + staion_other

In [100]:
station_unique_sort = unique_sort(station_all)

In [88]:
station_unique_sort

['BHD',
 'CCB',
 'CCH',
 'CP1',
 'CPH',
 'CWB',
 'GI',
 'HKA',
 'HKO',
 'HKP',
 'HKS',
 'HPV',
 'JKB',
 'KLT',
 'KP',
 'KSC',
 'KTG',
 'LAM',
 'LFS',
 'NGP',
 'NP',
 'PEN',
 'PLC',
 'SC',
 'SE',
 'SE1',
 'SEK',
 'SF',
 'SHA',
 'SHL',
 'SKG',
 'SKW',
 'SSH',
 'SSP',
 'STY',
 'TC',
 'TKL',
 'TME',
 'TMS',
 'TPK',
 'TPO',
 'TU1',
 'TUN',
 'TW',
 'TWN',
 'TY1',
 'TYW',
 'VP1',
 'WGL',
 'WLP',
 'WTS',
 'YCT']

## Find All Data Series

In [107]:
data_series_all_excl_temp = [data_link.split("/")[8].split("_")[2] for data_link in data_link_list]

In [112]:
data_series_unique_sort_excl_temp = unique_sort(data_series_all_excl_temp)

In [113]:
data_series_unique_sort_excl_temp

['CLD', 'GSR', 'MAXHKHI', 'MAXUV', 'MEANHKHI', 'MSLP', 'RF', 'RH', 'SUN', 'UV']

In [106]:
data_type_list_temperature

['CLMTEMP', 'CLMMAXT', 'CLMMINT']

## Combine All Series in All Station

In [153]:
data_excl_temp = merge_all_station_in_all_series(data_series_unique_sort_excl_temp, station_unique_sort, get_series_station, 2014)

In [154]:
data_temp = merge_all_station_in_all_series(data_type_list_temperature, station_unique_sort, temperature, 2014)

In [155]:
data_excl_temp

Unnamed: 0,date,region,Mean Amount of Cloud (%),Global Solar Radiation (MJ/m&sup2;),Max HKHI,Max UV Indices(15-minute average),Mean HKHI,Mean Pressure (hPa),Total Rainfall (mm),Mean Relative Humidity (%),Total Bright Sunshine (hours),Mean UV Indices(7 a.m. to 6 p.m.)
0,2014-01-01,Cheung Chau,,,,,,1019.0,0.0,50.0,,
1,2014-01-01,Ching Pak House(Tsing Yi),,,,,,,0.0,41.0,,
2,2014-01-01,Clear Water Bay,,,,,,,,,,
3,2014-01-01,Green Island,,,,,,,0.0,,,
4,2014-01-01,Happy Valley,,,,,,,0.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
155488,2024-11-30,Tuen Mun Children and Juvenile Home,,,,,,,0.0,47.0,,
155489,2024-11-30,Tuen Mun Government Offices,,,,,,,,,,
155490,2024-11-30,Waglan Island,,,,,,1017.7,0.0,57.0,,
155491,2024-11-30,Wetland Park,,,,,,1017.6,0.0,65.0,,


In [156]:
data_temp

Unnamed: 0,date,region,Mean Temperature (°C),Maximum Temperature (°C),Minimum Temperature (°C)
0,2014-01-01,Bluff Head,16.3,20.8,13.7
1,2014-01-01,Cheung Chau,15.7,20.6,12.3
2,2014-01-01,Ching Pak House(Tsing Yi),16.4,21.1,12.8
3,2014-01-01,Clear Water Bay,,,
4,2014-01-01,Happy Valley,13.9,21.0,8.7
...,...,...,...,...,...
163462,2024-11-30,Tuen Mun Government Offices,,,
163463,2024-11-30,Waglan Island,19.3,23.4,16.9
163464,2024-11-30,Wetland Park,16.9,24.2,11.6
163465,2024-11-30,Wong Chuk Hang,17.2,23.2,13.1


In [158]:
data_all = pd.merge(data_excl_temp, data_temp, on=col_merge)

## Final Formatting

In [176]:
data_final = data_all[['date', 
                       'region', 
                       'Mean HKHI', 
                       'Max HKHI', 
                       'Mean Amount of Cloud (%)', 
                       'Mean Pressure (hPa)', 
                       'Total Rainfall (mm)', 
                       'Mean Relative Humidity (%)', 
                       'Maximum Temperature (°C)', 
                       'Minimum Temperature (°C)', 
                       'Mean Temperature (°C)', 
                       'Global Solar Radiation (MJ/m&sup2;)', 
                       'Max UV Indices(15-minute average)', 
                       'Mean UV Indices(7 a.m. to 6 p.m.)', 
                       'Total Bright Sunshine (hours)']].sort_values(["region", "date"]).reset_index(drop=True)

In [177]:
data_final

Unnamed: 0,date,region,Mean HKHI,Max HKHI,Mean Amount of Cloud (%),Mean Pressure (hPa),Total Rainfall (mm),Mean Relative Humidity (%),Maximum Temperature (°C),Minimum Temperature (°C),Mean Temperature (°C),Global Solar Radiation (MJ/m&sup2;),Max UV Indices(15-minute average),Mean UV Indices(7 a.m. to 6 p.m.),Total Bright Sunshine (hours)
0,2014-01-01,Cheung Chau,,,,1019.0,0.0,50.0,20.6,12.3,15.7,,,,
1,2014-01-02,Cheung Chau,,,,1016.5,0.0,77.0,20.4,13.2,16.6,,,,
2,2014-01-03,Cheung Chau,,,,1015.6,0.0,67.0,24.4,16.6,19.5,,,,
3,2014-01-04,Cheung Chau,,,,1017.5,0.0,51.0,23.2,15.2,18.4,,,,
4,2014-01-05,Cheung Chau,,,,1018.5,0.0,56.0,18.9,13.4,16.2,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143527,2024-11-26,Wong Chuk Hang,,,,,,60.0,27.1,18.2,21.7,,,,
143528,2024-11-27,Wong Chuk Hang,,,,,,44.0,24.6,16.9,19.6,,,,
143529,2024-11-28,Wong Chuk Hang,,,,,,33.0,24.0,16.8,19.6,,,,
143530,2024-11-29,Wong Chuk Hang,,,,,,36.0,23.3,13.7,18.1,,,,


## Save as csv

In [178]:
data_final.to_csv("data_final.csv")