In [1]:
import pandas as pd
import numpy as np

# Madrid pollution

In [2]:
def open_clean_csv(year):

    pollution=pd.read_csv(f"./pollution_madrid/madrid_{year}.csv")
    
    if "NOx" not in list(pollution.columns):
        pollution=pollution.assign(NOx=np.nan)
        
    col_to_keep=["date","CO","NO_2","NOx", "O_3", "PM10", "PM25", "SO_2"]
    cols_to_drop=[col for col in pollution if col not in col_to_keep]
    
    pollution.drop(cols_to_drop,axis=1,inplace=True)
    pollution["date"]=pollution["date"].str.split(" ").str[0]
    
    new_df = pollution.groupby("date").mean().reset_index()
    if "NOx" not in list(new_df.columns):
        new_df=new_df.assign(NOx=np.nan)
    new_df = new_df[["date","CO","NO_2","NOx", "O_3", "PM10", "PM25", "SO_2"]]
    return new_df

In [3]:
poll_mad = pd.DataFrame()
for year in range(2008,2019):
    df_year=open_clean_csv(year)
    poll_mad=poll_mad.append(df_year,ignore_index=True)
poll_mad=poll_mad.assign(city="madrid")

In [4]:
poll_mad

Unnamed: 0,date,CO,NO_2,NOx,O_3,PM10,PM25,SO_2,city
0,2008-01-01,0.652554,67.084565,136.804130,9.280977,39.855419,25.650870,19.815357,madrid
1,2008-01-02,0.531806,66.552372,126.825304,9.750819,21.005401,15.767895,14.897970,madrid
2,2008-01-03,0.360000,50.410930,75.039279,30.313847,9.173423,5.617396,10.923919,madrid
3,2008-01-04,0.458993,58.247292,102.346346,20.756432,21.390939,13.236000,12.875144,madrid
4,2008-01-05,0.389635,47.282292,74.313590,20.405407,18.745682,12.485729,11.804503,madrid
...,...,...,...,...,...,...,...,...,...
3779,2018-04-27,0.249583,26.074783,37.939130,65.695783,26.278317,10.256944,3.677966,madrid
3780,2018-04-28,0.212917,13.512153,17.895833,71.706587,10.971154,5.930556,3.604167,madrid
3781,2018-04-29,0.218333,11.098958,14.697917,78.044643,6.955128,3.229167,3.591667,madrid
3782,2018-04-30,0.236250,20.576389,27.550347,68.814925,6.862179,3.354167,3.800000,madrid


# Hong Kong

## HK daily means on air pollutant index 2008-2018 
Source: https://cd.epic.epd.gov.hk/EPICDI/air/station/?lang=en

    - Data is provided by Hong Kong Environmental Protection Department

In [5]:
# Trials
col_names = ['date','Station','CO','PM25','NO_2','NOx','O_3','PM10','SO2']

df = pd.read_csv("./Hong Kong Data/HK_pollution_data/200801.csv",na_values='N.A.',names=col_names)
df

Unnamed: 0,date,Station,CO,PM25,NO_2,NOx,O_3,PM10,SO2
0,Remarks:,,,,,,,,
1,1. All Pollutant unit in μg/m3 except CO which...,,,,,,,,
2,2. N.A. = data not available,,,,,,,,
3,3. CO = Carbon Monoxide,,,,,,,,
4,4. FSP = Fine Suspended Particulates,,,,,,,,
5,5. NO2 = Nitrogen Dioxide,,,,,,,,
6,6. NOX = Nitrogen Oxides,,,,,,,,
7,7. O3 = Ozone,,,,,,,,
8,8. RSP = Respirable Suspended Particulates,,,,,,,,
9,9. SO2 = Sulphur Dioxide,,,,,,,,


In [6]:
df_2 = df.drop(list(range(0,11))).copy()
df_2['date'] = pd.to_datetime(df_2['date'])
df_2

Unnamed: 0,date,Station,CO,PM25,NO_2,NOx,O_3,PM10,SO2
11,2008-01-01,CENTRAL/WESTERN,,,51,63,47,70,28.0
12,2008-02-01,CENTRAL/WESTERN,,,67,85,45,64,35.0
13,2008-03-01,CENTRAL/WESTERN,,,101,144,37,87,45.0
14,2008-04-01,CENTRAL/WESTERN,,,69,95,44,59,22.0
15,2008-05-01,CENTRAL/WESTERN,,,99,126,40,76,37.0
16,2008-06-01,CENTRAL/WESTERN,,,68,80,49,53,20.0
17,2008-07-01,CENTRAL/WESTERN,,,129,296,15,82,58.0
18,2008-08-01,CENTRAL/WESTERN,,,75,87,48,67,30.0
19,2008-09-01,CENTRAL/WESTERN,,,69,80,49,64,27.0
20,2008-10-01,CENTRAL/WESTERN,,,69,84,45,91,22.0


## Align column names
Original data use Respirable Suspended Particulates (RSP) instead of PM10 and Fine Suspended Particulates (FSP) instead PM25. Align columns' names with DataFrames in the other cities.

In [7]:
year = list(range(2008,2019))
month = list(range(1,13))

# format month number to 2 digits with lead zero
formatter = "{:02d}".format
month = [formatter(m) for m in month]

col_names = ['date','Station','CO','PM25','NO_2','NOx','O_3','PM10','SO_2']

hk_pollution = pd.DataFrame(columns=col_names)
hk_pollution

for y in year:
    for m in month:
        df = pd.read_csv(f"./Hong Kong Data/HK_pollution_data/{y}{m}.csv",na_values='N.A.',names=col_names)
        df = df.drop(list(range(0,11))).copy()
        df['date'] = pd.to_datetime(df['date'])
        hk_pollution = pd.concat([hk_pollution,df])

# Edit columns

In [8]:
# Drop 'Station' column 
hk_pollution.drop('Station',axis=1)

# Align column names with DataFrames in other cities 
hk_pollution = hk_pollution[['date','CO','NO_2','NOx','O_3','PM10','PM25','SO_2']]

# Add 'city' column
hk_pollution = hk_pollution.assign(city='Hong Kong')

In [9]:
hk_pollution = hk_pollution.reset_index(drop=True)
hk_pollution

Unnamed: 0,date,CO,NO_2,NOx,O_3,PM10,PM25,SO_2,city
0,2008-01-01,,51,63,47,70,,28,Hong Kong
1,2008-02-01,,67,85,45,64,,35,Hong Kong
2,2008-03-01,,101,144,37,87,,45,Hong Kong
3,2008-04-01,,69,95,44,59,,22,Hong Kong
4,2008-05-01,,99,126,40,76,,37,Hong Kong
...,...,...,...,...,...,...,...,...,...
4011,2018-12-27,,63,102,35,43,32,6,Hong Kong
4012,2018-12-28,,40,52,37,32,22,2,Hong Kong
4013,2018-12-29,,39,57,16,27,18,2,Hong Kong
4014,2018-12-30,,37,51,23,47,34,3,Hong Kong


# Berlin pollution

In [10]:
berlin_district = {"Wedding" : "mc010",
                  "Schöneberg" : "mc018",
                  "Marienfelde" : "mc027",
                  "Grunewald" : "mc032",
                  "Neukölln" : "mc042",
                  "Buch" : "mc077",
                  "Friedrichshagen" : "mc085",
                  "Messwagen Leipziger Str." : "mw088",
                  "Hardenbergplatz" : "mc115",
                  "Schildhornstraße"  :"mc117",
                  "Mariendorfer Damm" : "mc124",
                  "Silbersteinstraße" : "mc143",
                  "Frohnau" : "mc145",
                  "Mitte" : "mc171",
                  "Frankfurter Allee" : "mc174",
                  "Karl-Marx-Straße" : "mc220",
                  "Karlshorst" : "mc282"}

berlin_period = {"Dayly" : "24h",
                "Hourly" : "1h",
                "Monthly" : "1m"}

In [11]:
# load multiple csv of website, for all districts of berlin
period  = berlin_period["Dayly"]
start = "01.01.2000"
end = "30.10.2019"
new_df=pd.DataFrame()
for key, value in berlin_district.items():
    csv_downloadlink = "https://luftdaten.berlin.de/station/{0}.csv?group=pollution&period={1}&timespan=custom&start%5Bdate%5D={2}&start%5Bhour%5D=00&end%5Bdate%5D={3}&end%5Bhour%5D=00".format(value, period, start, end)
    district = pd.read_csv(csv_downloadlink, sep=";", header=[1,2])
    district = district.drop([district.index[0]])

    district.columns = [f'{i}{j}' for i, j in district.columns]
    district = district.rename(columns={"Feinstaub (PM10)µg/m³" : "PM10",
                                    "Kohlenmonoxidmg/m³" : "CO",
                                    "Stickoxideµg/m³": "NOx",
                                    "Stickstoffdioxidµg/m³" : "NO_2", 
                                    "Ozonµg/m³" : "O_3",
                                    "Stickstoffmonoxidµg/m³" : "NO",
                                    "Schwefeldioxidµg/m³" : "SO_2", 
                                    "Ortschaft" : "city",
                                    "MesskomponenteEinheit" : "date"})
    
    if "CO" not in list(district.columns):
        district=district.assign(CO=np.nan)
    if "SO_2" not in list(district.columns):
        district=district.assign(SO_2=np.nan)
    if "PM10" not in list(district.columns):
        district=district.assign(PM10=np.nan)
    if "O_3" not in list(district.columns):
        district=district.assign(O_3=np.nan)
    district=district[["date","CO","NO_2","NOx", "O_3", "PM10", "SO_2"]]
    new_df=new_df.append(district,ignore_index=True)

In [12]:
new_df["date"] = pd.to_datetime(new_df["date"], infer_datetime_format=True)
new_df

Unnamed: 0,date,CO,NO_2,NOx,O_3,PM10,SO_2
0,2009-10-29,,31,45,10,18,
1,2009-10-30,,35,60,11,17,
2,2009-10-31,,30,40,17,28,
3,2009-11-01,,25,35,13,47,
4,2009-11-02,,36,62,7,34,
...,...,...,...,...,...,...,...
59154,2019-10-26,,11,13,,,
59155,2019-10-27,,14,16,,,
59156,2019-10-28,,27,36,,,
59157,2019-10-29,,31,55,,,


In [13]:
new_df=new_df.astype({"date":"datetime64","CO":"float","NO_2":"float","NOx":"float", "O_3":"float", "PM10":"float", "SO_2":"float"})
new_df = new_df.groupby("date").mean().reset_index()
new_df["city"] = "berlin"
new_df["PM25"] = np.nan
poll_ber = new_df[["date","CO","NO_2","NOx", "O_3", "PM10", "PM25", "SO_2","city"]]
poll_ber

Unnamed: 0,date,CO,NO_2,NOx,O_3,PM10,PM25,SO_2,city
0,2009-10-29,0.500000,33.812500,71.687500,16.142857,19.583333,,1.5,berlin
1,2009-10-30,0.700000,39.312500,94.437500,15.571429,21.833333,,1.5,berlin
2,2009-10-31,0.650000,33.312500,70.875000,18.571429,32.750000,,4.0,berlin
3,2009-11-01,0.600000,25.000000,48.625000,18.285714,50.333333,,6.0,berlin
4,2009-11-02,0.850000,39.312500,95.812500,13.000000,35.916667,,4.0,berlin
...,...,...,...,...,...,...,...,...,...
3649,2019-10-26,0.300000,20.117647,35.058824,38.666667,13.833333,,1.0,berlin
3650,2019-10-27,0.233333,17.000000,26.294118,46.666667,11.750000,,1.0,berlin
3651,2019-10-28,0.333333,28.176471,56.647059,34.000000,12.083333,,2.0,berlin
3652,2019-10-29,0.266667,30.411765,52.647059,24.444444,14.166667,,0.0,berlin


# Merging all pollution dataframes

In [14]:
all_pollution=pd.concat([poll_ber,hk_pollution,poll_mad],ignore_index=True)
all_pollution

Unnamed: 0,date,CO,NO_2,NOx,O_3,PM10,PM25,SO_2,city
0,2009-10-29 00:00:00,0.5,33.8125,71.6875,16.1429,19.5833,,1.5,berlin
1,2009-10-30 00:00:00,0.7,39.3125,94.4375,15.5714,21.8333,,1.5,berlin
2,2009-10-31 00:00:00,0.65,33.3125,70.875,18.5714,32.75,,4,berlin
3,2009-11-01 00:00:00,0.6,25,48.625,18.2857,50.3333,,6,berlin
4,2009-11-02 00:00:00,0.85,39.3125,95.8125,13,35.9167,,4,berlin
...,...,...,...,...,...,...,...,...,...
11449,2018-04-27,0.249583,26.0748,37.9391,65.6958,26.2783,10.2569,3.67797,madrid
11450,2018-04-28,0.212917,13.5122,17.8958,71.7066,10.9712,5.93056,3.60417,madrid
11451,2018-04-29,0.218333,11.099,14.6979,78.0446,6.95513,3.22917,3.59167,madrid
11452,2018-04-30,0.23625,20.5764,27.5503,68.8149,6.86218,3.35417,3.8,madrid


In [16]:
# Creating new column
all_pollution["date"] = pd.to_datetime(all_pollution["date"], infer_datetime_format=True)
all_pollution.dtypes

date    datetime64[ns]
CO              object
NO_2            object
NOx             object
O_3             object
PM10            object
PM25            object
SO_2            object
city            object
dtype: object

In [17]:
all_pollution

Unnamed: 0,date,CO,NO_2,NOx,O_3,PM10,PM25,SO_2,city
0,2009-10-29,0.5,33.8125,71.6875,16.1429,19.5833,,1.5,berlin
1,2009-10-30,0.7,39.3125,94.4375,15.5714,21.8333,,1.5,berlin
2,2009-10-31,0.65,33.3125,70.875,18.5714,32.75,,4,berlin
3,2009-11-01,0.6,25,48.625,18.2857,50.3333,,6,berlin
4,2009-11-02,0.85,39.3125,95.8125,13,35.9167,,4,berlin
...,...,...,...,...,...,...,...,...,...
11449,2018-04-27,0.249583,26.0748,37.9391,65.6958,26.2783,10.2569,3.67797,madrid
11450,2018-04-28,0.212917,13.5122,17.8958,71.7066,10.9712,5.93056,3.60417,madrid
11451,2018-04-29,0.218333,11.099,14.6979,78.0446,6.95513,3.22917,3.59167,madrid
11452,2018-04-30,0.23625,20.5764,27.5503,68.8149,6.86218,3.35417,3.8,madrid


### Creating new column for the year

In [18]:
pollution_final=all_pollution.copy()

In [25]:
pollution_final=pollution_final.assign(year=pollution_final['date'].dt.year)
pollution_final

Unnamed: 0,date,CO,NO_2,NOx,O_3,PM10,PM25,SO_2,city,year
0,2009-10-29,0.5,33.8125,71.6875,16.1429,19.5833,,1.5,berlin,2009
1,2009-10-30,0.7,39.3125,94.4375,15.5714,21.8333,,1.5,berlin,2009
2,2009-10-31,0.65,33.3125,70.875,18.5714,32.75,,4,berlin,2009
3,2009-11-01,0.6,25,48.625,18.2857,50.3333,,6,berlin,2009
4,2009-11-02,0.85,39.3125,95.8125,13,35.9167,,4,berlin,2009
...,...,...,...,...,...,...,...,...,...,...
11449,2018-04-27,0.249583,26.0748,37.9391,65.6958,26.2783,10.2569,3.67797,madrid,2018
11450,2018-04-28,0.212917,13.5122,17.8958,71.7066,10.9712,5.93056,3.60417,madrid,2018
11451,2018-04-29,0.218333,11.099,14.6979,78.0446,6.95513,3.22917,3.59167,madrid,2018
11452,2018-04-30,0.23625,20.5764,27.5503,68.8149,6.86218,3.35417,3.8,madrid,2018
