In [1]:
import pandas as pd
import numpy as np

# Madrid pollution

In [3]:
def open_clean_csv(year):

    pollution=pd.read_csv(f"./pollution_madrid/madrid_{year}.csv")
    
    if "NOx" not in list(pollution.columns):
        pollution=pollution.assign(NOx="nan")
        
    col_to_keep=["date","CO","NO_2","NOx", "O_3", "PM10", "PM25", "SO_2"]
    cols_to_drop=[col for col in pollution if col not in col_to_keep]
    
    pollution.drop(cols_to_drop,axis=1,inplace=True)
    pollution["date"]=pollution["date"].str.split(" ").str[0]
    
    new_df = pollution.groupby("date").mean().reset_index()
    if "NOx" not in list(new_df.columns):
        new_df=new_df.assign(NOx="nan")
    new_df = new_df[["date","CO","NO_2","NOx", "O_3", "PM10", "PM25", "SO_2"]]
    return new_df

In [4]:
poll_mad = pd.DataFrame()
for year in range(2008,2019):
    df_year=open_clean_csv(year)
    poll_mad=poll_mad.append(df_year,ignore_index=True)
poll_mad=poll_mad.assign(city="madrid")

In [5]:
poll_mad

Unnamed: 0,date,CO,NO_2,NOx,O_3,PM10,PM25,SO_2,city
0,2008-01-01,0.652554,67.084565,136.804,9.280977,39.855419,25.650870,19.815357,madrid
1,2008-01-02,0.531806,66.552372,126.825,9.750819,21.005401,15.767895,14.897970,madrid
2,2008-01-03,0.360000,50.410930,75.0393,30.313847,9.173423,5.617396,10.923919,madrid
3,2008-01-04,0.458993,58.247292,102.346,20.756432,21.390939,13.236000,12.875144,madrid
4,2008-01-05,0.389635,47.282292,74.3136,20.405407,18.745682,12.485729,11.804503,madrid
...,...,...,...,...,...,...,...,...,...
3779,2018-04-27,0.249583,26.074783,37.9391,65.695783,26.278317,10.256944,3.677966,madrid
3780,2018-04-28,0.212917,13.512153,17.8958,71.706587,10.971154,5.930556,3.604167,madrid
3781,2018-04-29,0.218333,11.098958,14.6979,78.044643,6.955128,3.229167,3.591667,madrid
3782,2018-04-30,0.236250,20.576389,27.5503,68.814925,6.862179,3.354167,3.800000,madrid


# Hong Kong

## HK daily means on air pollutant index 2008-2018 
Source: https://cd.epic.epd.gov.hk/EPICDI/air/station/?lang=en

    - Data is provided by Hong Kong Environmental Protection Department

In [7]:
# Trials
col_names = ['date','Station','CO','PM25','NO_2','NOx','O_3','PM10','SO2']

df = pd.read_csv("./Hong Kong Data/HK_pollution_data/200801.csv",na_values='N.A.',names=col_names)
df

Unnamed: 0,date,Station,CO,PM25,NO_2,NOx,O_3,PM10,SO2
0,Remarks:,,,,,,,,
1,1. All Pollutant unit in μg/m3 except CO which...,,,,,,,,
2,2. N.A. = data not available,,,,,,,,
3,3. CO = Carbon Monoxide,,,,,,,,
4,4. FSP = Fine Suspended Particulates,,,,,,,,
5,5. NO2 = Nitrogen Dioxide,,,,,,,,
6,6. NOX = Nitrogen Oxides,,,,,,,,
7,7. O3 = Ozone,,,,,,,,
8,8. RSP = Respirable Suspended Particulates,,,,,,,,
9,9. SO2 = Sulphur Dioxide,,,,,,,,


In [8]:
df_2 = df.drop(list(range(0,11))).copy()
df_2['date'] = pd.to_datetime(df_2['date'])
df_2

Unnamed: 0,date,Station,CO,PM25,NO_2,NOx,O_3,PM10,SO2
11,2008-01-01,CENTRAL/WESTERN,,,51,63,47,70,28.0
12,2008-02-01,CENTRAL/WESTERN,,,67,85,45,64,35.0
13,2008-03-01,CENTRAL/WESTERN,,,101,144,37,87,45.0
14,2008-04-01,CENTRAL/WESTERN,,,69,95,44,59,22.0
15,2008-05-01,CENTRAL/WESTERN,,,99,126,40,76,37.0
16,2008-06-01,CENTRAL/WESTERN,,,68,80,49,53,20.0
17,2008-07-01,CENTRAL/WESTERN,,,129,296,15,82,58.0
18,2008-08-01,CENTRAL/WESTERN,,,75,87,48,67,30.0
19,2008-09-01,CENTRAL/WESTERN,,,69,80,49,64,27.0
20,2008-10-01,CENTRAL/WESTERN,,,69,84,45,91,22.0


## Align column names
Original data use Respirable Suspended Particulates (RSP) instead of PM10 and Fine Suspended Particulates (FSP) instead PM25. Align columns' names with DataFrames in the other cities.

In [10]:
year = list(range(2008,2019))
month = list(range(1,13))

# format month number to 2 digits with lead zero
formatter = "{:02d}".format
month = [formatter(m) for m in month]

col_names = ['date','Station','CO','PM25','NO_2','NOx','O_3','PM10','SO_2']

hk_pollution = pd.DataFrame(columns=col_names)
hk_pollution

for y in year:
    for m in month:
        df = pd.read_csv(f"./Hong Kong Data/HK_pollution_data/{y}{m}.csv",na_values='N.A.',names=col_names)
        df = df.drop(list(range(0,11))).copy()
        df['date'] = pd.to_datetime(df['date'])
        hk_pollution = pd.concat([hk_pollution,df])

# Edit columns

In [11]:
# Drop 'Station' column 
hk_pollution.drop('Station',axis=1)

# Align column names with DataFrames in other cities 
hk_pollution = hk_pollution[['date','CO','NO_2','NOx','O_3','PM10','PM25','SO_2']]

# Add 'city' column
hk_pollution = hk_pollution.assign(city='Hong Kong')

In [12]:
hk_pollution = hk_pollution.reset_index(drop=True)
hk_pollution

Unnamed: 0,date,CO,NO_2,NOx,O_3,PM10,PM25,SO_2,city
0,2008-01-01,,51,63,47,70,,28,Hong Kong
1,2008-02-01,,67,85,45,64,,35,Hong Kong
2,2008-03-01,,101,144,37,87,,45,Hong Kong
3,2008-04-01,,69,95,44,59,,22,Hong Kong
4,2008-05-01,,99,126,40,76,,37,Hong Kong
...,...,...,...,...,...,...,...,...,...
4011,2018-12-27,,63,102,35,43,32,6,Hong Kong
4012,2018-12-28,,40,52,37,32,22,2,Hong Kong
4013,2018-12-29,,39,57,16,27,18,2,Hong Kong
4014,2018-12-30,,37,51,23,47,34,3,Hong Kong
