In [1]:
import pandas as pd
import numpy as np

## HK daily mean air pollutant concentrations, 2008-2018
Source: https://cd.epic.epd.gov.hk/EPICDI/air/station/?lang=en

    - Data is provided by Hong Kong Environmental Protection Department


In [2]:
# Trial run: load one month (January 2008) to inspect the raw file layout.
# Explicit column names are supplied, so every line of the file -- including
# the leading "Remarks" notes -- is read as a data row.
col_names = ['date','Station','CO','PM25','NO_2','NOx','O_3','PM10','SO2']

df = pd.read_csv(
    "././HK_pollution_data/200801.csv",
    names=col_names,
    na_values='N.A.',  # the files mark missing readings as 'N.A.'
)
df

Unnamed: 0,date,Station,CO,PM25,NO_2,NOx,O_3,PM10,SO2
0,Remarks:,,,,,,,,
1,1. All Pollutant unit in μg/m3 except CO which...,,,,,,,,
2,2. N.A. = data not available,,,,,,,,
3,3. CO = Carbon Monoxide,,,,,,,,
4,4. FSP = Fine Suspended Particulates,,,,,,,,
5,5. NO2 = Nitrogen Dioxide,,,,,,,,
6,6. NOX = Nitrogen Oxides,,,,,,,,
7,7. O3 = Ozone,,,,,,,,
8,8. RSP = Respirable Suspended Particulates,,,,,,,,
9,9. SO2 = Sulphur Dioxide,,,,,,,,


In [3]:
# Remove row labels 0-10 from the trial frame (the "Remarks" preamble; label 10
# is presumably the file's own header line -- verify), renumber the rows from
# zero, and parse the day-first date strings into datetimes.
df_2 = (
    df
    .drop(index=range(11))
    .reset_index(drop=True)
    .assign(date=lambda trimmed: pd.to_datetime(trimmed['date'], dayfirst=True, format='%d-%m-%Y'))
)
df_2

Unnamed: 0,date,Station,CO,PM25,NO_2,NOx,O_3,PM10,SO2
0,2008-01-01,CENTRAL/WESTERN,,,51,63,47,70,28.0
1,2008-01-02,CENTRAL/WESTERN,,,67,85,45,64,35.0
2,2008-01-03,CENTRAL/WESTERN,,,101,144,37,87,45.0
3,2008-01-04,CENTRAL/WESTERN,,,69,95,44,59,22.0
4,2008-01-05,CENTRAL/WESTERN,,,99,126,40,76,37.0
5,2008-01-06,CENTRAL/WESTERN,,,68,80,49,53,20.0
6,2008-01-07,CENTRAL/WESTERN,,,129,296,15,82,58.0
7,2008-01-08,CENTRAL/WESTERN,,,75,87,48,67,30.0
8,2008-01-09,CENTRAL/WESTERN,,,69,80,49,64,27.0
9,2008-01-10,CENTRAL/WESTERN,,,69,84,45,91,22.0


## Align column names 

The original data use Respirable Suspended Particulates (RSP) instead of PM10 and Fine Suspended Particulates (FSP) instead of PM25. The column names are aligned with the DataFrames for the other cities.

In [4]:
# Load every monthly export from January 2008 through December 2018 and stack
# them into a single frame, `hk_pollution`.
year = list(range(2008,2019))
month = list(range(1,13))

# format month number to 2 digits with lead zero
formatter = "{:02d}".format
month = [formatter(m) for m in month]

col_names = ['date','Station','CO','PM25','NO_2','NOx','O_3','PM10','SO_2']

# Gather the monthly frames in a list and concatenate once at the end;
# concatenating inside the loop re-copies the accumulated data on every pass.
monthly_frames = []
for y in year:
    for m in month:
        df = pd.read_csv(f"./HK_pollution_data/{y}{m}.csv",na_values='N.A.',names=col_names)
        # Row labels 0-10 are not measurements: they hold the file's "Remarks"
        # preamble (label 10 is presumably the file's own header line -- verify).
        df = df.drop(list(range(0,11))).reset_index(drop=True)
        monthly_frames.append(df)

# 'date' is left as the raw day-first text (dd-mm-yyyy) here; it must be parsed
# with an explicit day-first format, never with the month-first default.
# The row labels restart at 0 for every month because the index is not reset.
hk_pollution = pd.concat(monthly_frames)
        
        

## Edit columns 

In [5]:
# Build the final Hong Kong frame in one chain so that no column is ever
# assigned onto a slice of the original (which raises SettingWithCopyWarning):
#   1. keep only the pollutant columns, in the order used by the DataFrames of
#      the other cities -- 'Station' is dropped simply by not being selected;
#   2. parse the day-first date strings (dd-mm-yyyy) into datetimes;
#   3. tag every row with the city name.
hk_pollution = (
    hk_pollution
    .loc[:, ['date','CO','NO_2','NOx','O_3','PM10','PM25','SO_2']]
    .assign(
        date=lambda frame: pd.to_datetime(frame['date'], dayfirst=True, format='%d-%m-%Y'),
        city='hong kong',
    )
)

In [6]:
# Replace the row labels with a fresh 0..n-1 index (drop=True discards the old
# labels instead of keeping them as a column), then display the frame.
hk_pollution = hk_pollution.reset_index(drop=True)
hk_pollution

Unnamed: 0,date,CO,NO_2,NOx,O_3,PM10,PM25,SO_2,city
0,2008-01-01,,51,63,47,70,,28,hong kong
1,2008-01-02,,67,85,45,64,,35,hong kong
2,2008-01-03,,101,144,37,87,,45,hong kong
3,2008-01-04,,69,95,44,59,,22,hong kong
4,2008-01-05,,99,126,40,76,,37,hong kong
...,...,...,...,...,...,...,...,...,...
4011,2018-12-27,,63,102,35,43,32,6,hong kong
4012,2018-12-28,,40,52,37,32,22,2,hong kong
4013,2018-12-29,,39,57,16,27,18,2,hong kong
4014,2018-12-30,,37,51,23,47,34,3,hong kong


In [7]:
# Persist the cleaned Hong Kong frame as a pickle file for later use.
pd.to_pickle(hk_pollution, '.././pickles/hk_pollution.pkl')