Project: Predicting mean values and limit-exceedance days of fine Particulate Matter (PM2.5) in the air - Milan (Italy).

Student: **Alessandro Monolo** | 1790210

Lecturer: Jonas Moons

Fundamentals of Machine Learning - Master Data-Driven Design, Hogeschool Utrecht.

August 2021 - Block E

## Data cleaning and Pre-Processing of NO2 Dataset in Milan from 2014 to 2019

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# I will import, clean and merge all the NO2 - Data Frames from 2014 to 2019

### Harmful Element NO2 Dataset of 2014

In [3]:
# Importing dataset of 2014, skipping the first row

In [4]:
NO2_2014 = pd.read_csv("2014.csv", encoding="utf-8", skiprows=1)

In [5]:
NO2_2014.drop(index=NO2_2014.index[0], axis=0, inplace=True)

In [6]:
NO2_2014.reset_index(inplace=True)

In [7]:
# After resetting the df index I set the new column names

In [8]:
NO2_2014.rename(columns={"index": "TimeStamp", "-999 Valore mancante o invalido": "NO2 µg/m³"}, inplace=True)

In [9]:
# Collect the index labels of the rows whose value is -999 (missing or invalid reading) so they can be dropped from the df

In [10]:
indexNames = NO2_2014[NO2_2014['NO2 µg/m³'] == '-999' ].index

In [11]:
NO2_2014.drop(indexNames , inplace=True)

In [12]:
# Dropping any NaN values from the df

In [13]:
# dropna() returns a new frame and leaves the original untouched, so apply it
# in place to actually remove the NaN rows; the bare name keeps the cell preview.
NO2_2014.dropna(inplace=True)
NO2_2014

Unnamed: 0,TimeStamp,NO2 µg/m³
0,2014/01/01 01:00,69.8
1,2014/01/01 02:00,65.4
2,2014/01/01 03:00,68.2
3,2014/01/01 04:00,80.3
4,2014/01/01 05:00,73.9
...,...,...
8755,2014/12/31 20:00,42.5
8756,2014/12/31 21:00,51.0
8757,2014/12/31 22:00,57.5
8758,2014/12/31 23:00,55.8


In [14]:
# Transforming object column to a numeric values column

In [15]:
NO2_2014["NO2 µg/m³"] = pd.to_numeric(NO2_2014["NO2 µg/m³"], errors = 'coerce')

In [16]:
# Parse the TimeStamp strings into a new datetime column named DateTime:

In [17]:
# The raw timestamps look like "2014/01/01 01:00", so the format has to spell out
# the "/" separators and the minutes; strict format matching (pandas >= 2.0)
# raises on a format that does not match the data.
NO2_2014['DateTime'] = pd.to_datetime(NO2_2014['TimeStamp'], format='%Y/%m/%d %H:%M')

In [18]:
# Drop the TimeStamp column, which is no longer needed

In [19]:
NO2_2014.drop('TimeStamp', axis=1, inplace=True)

In [20]:
# Set Datetime column as the new index of the df

In [21]:
NO2_2014_index = NO2_2014.set_index('DateTime')

In [22]:
# Compute the daily mean values from the hourly readings of each day in the df

In [23]:
df_NO2_2014 = NO2_2014_index.resample('D').mean()

In [24]:
df_NO2_2014.reset_index(inplace=True)

In [25]:
# Drop the last row, since it belongs to 1 January of the following year

In [26]:
df_NO2_2014 = df_NO2_2014[:-1]

In [27]:
df_NO2_2014.shape

(365, 2)

In [28]:
# Repeat the same process for all the NO2 df of 2015, 2016, 2017,2018 and 2019.

### Harmful Element NO2 Dataset of 2015

In [29]:
NO2_2015 = pd.read_csv("2015.csv", encoding="utf-8", skiprows=1)

In [30]:
NO2_2015.drop(index=NO2_2015.index[0], axis=0, inplace=True)

In [31]:
NO2_2015.reset_index(inplace=True)

In [32]:
NO2_2015.rename(columns={"index": "TimeStamp", "-999 Valore mancante o invalido": "NO2 µg/m³"}, inplace=True)

In [33]:
indexNames = NO2_2015[NO2_2015['NO2 µg/m³'] == '-999' ].index

In [34]:
NO2_2015.drop(indexNames , inplace=True)

In [35]:
# dropna() returns a new frame and leaves the original untouched, so apply it
# in place to actually remove the NaN rows; the bare name keeps the cell preview.
NO2_2015.dropna(inplace=True)
NO2_2015

Unnamed: 0,TimeStamp,NO2 µg/m³
0,2015/01/01 01:00,57.5
1,2015/01/01 02:00,48.2
2,2015/01/01 03:00,45.5
3,2015/01/01 04:00,43.5
4,2015/01/01 05:00,47.7
...,...,...
8755,2015/12/31 20:00,45.1
8756,2015/12/31 21:00,48.7
8757,2015/12/31 22:00,45.4
8758,2015/12/31 23:00,44.1


In [36]:
NO2_2015["NO2 µg/m³"] = pd.to_numeric(NO2_2015["NO2 µg/m³"], errors = 'coerce')

In [37]:
# The raw timestamps look like "2015/01/01 01:00", so the format has to spell out
# the "/" separators and the minutes; strict format matching (pandas >= 2.0)
# raises on a format that does not match the data.
NO2_2015['DateTime'] = pd.to_datetime(NO2_2015['TimeStamp'], format='%Y/%m/%d %H:%M')

In [38]:
NO2_2015.drop('TimeStamp', axis=1, inplace=True)

In [39]:
NO2_2015_index = NO2_2015.set_index('DateTime')

In [40]:
df_NO2_2015 = NO2_2015_index.resample('D').mean()

In [41]:
df_NO2_2015.reset_index(inplace=True)

In [42]:
df_NO2_2015 = df_NO2_2015[:-1]

In [43]:
df_NO2_2015.shape

(365, 2)

### Harmful Element NO2 Dataset of 2016

In [44]:
NO2_2016 = pd.read_csv("2016.csv", encoding="utf-8", skiprows=1)

In [45]:
NO2_2016.drop(index=NO2_2016.index[0], axis=0, inplace=True)

In [46]:
NO2_2016.reset_index(inplace=True)

In [47]:
NO2_2016.rename(columns={"index": "TimeStamp", "-999 Valore mancante o invalido": "NO2 µg/m³"}, inplace=True)

In [48]:
indexNames = NO2_2016[NO2_2016['NO2 µg/m³'] == '-999' ].index

In [49]:
NO2_2016.drop(indexNames , inplace=True)

In [50]:
# dropna() returns a new frame and leaves the original untouched, so apply it
# in place to actually remove the NaN rows; the bare name keeps the cell preview.
NO2_2016.dropna(inplace=True)
NO2_2016

Unnamed: 0,TimeStamp,NO2 µg/m³
0,2016/01/01 01:00,40.2
1,2016/01/01 02:00,42.3
2,2016/01/01 03:00,40.9
3,2016/01/01 04:00,37.6
4,2016/01/01 05:00,35.1
...,...,...
8779,2016/12/31 20:00,83.4
8780,2016/12/31 21:00,96.7
8781,2016/12/31 22:00,103.7
8782,2016/12/31 23:00,107.6


In [51]:
NO2_2016["NO2 µg/m³"] = pd.to_numeric(NO2_2016["NO2 µg/m³"], errors = 'coerce')

In [52]:
# The raw timestamps look like "2016/01/01 01:00", so the format has to spell out
# the "/" separators and the minutes; strict format matching (pandas >= 2.0)
# raises on a format that does not match the data.
NO2_2016['DateTime'] = pd.to_datetime(NO2_2016['TimeStamp'], format='%Y/%m/%d %H:%M')

In [53]:
NO2_2016.drop('TimeStamp', axis=1, inplace=True)

In [54]:
NO2_2016_index = NO2_2016.set_index('DateTime')

In [55]:
df_NO2_2016 = NO2_2016_index.resample('D').mean()

In [56]:
df_NO2_2016.reset_index(inplace=True)

In [57]:
df_NO2_2016 = df_NO2_2016[:-1]

In [58]:
df_NO2_2016.shape

(366, 2)

### Harmful Element NO2 Dataset of 2017

In [59]:
NO2_2017 = pd.read_csv("2017.csv", encoding="utf-8", skiprows=1)

In [60]:
NO2_2017.drop(index=NO2_2017.index[0], axis=0, inplace=True)

In [61]:
NO2_2017.reset_index(inplace=True)

In [62]:
NO2_2017.rename(columns={"index": "TimeStamp", "-999 Valore mancante o invalido": "NO2 µg/m³"}, inplace=True)

In [63]:
indexNames = NO2_2017[NO2_2017['NO2 µg/m³'] == '-999' ].index

In [64]:
NO2_2017.drop(indexNames , inplace=True)

In [65]:
# dropna() returns a new frame and leaves the original untouched, so apply it
# in place to actually remove the NaN rows; the bare name keeps the cell preview.
NO2_2017.dropna(inplace=True)
NO2_2017

Unnamed: 0,TimeStamp,NO2 µg/m³
0,2017/01/01 01:00,109.3
1,2017/01/01 02:00,107.1
2,2017/01/01 03:00,100.1
3,2017/01/01 04:00,88.1
4,2017/01/01 05:00,79.4
...,...,...
8755,2017/12/31 20:00,68.6
8756,2017/12/31 21:00,66.5
8757,2017/12/31 22:00,65.6
8758,2017/12/31 23:00,63.1


In [66]:
NO2_2017["NO2 µg/m³"] = pd.to_numeric(NO2_2017["NO2 µg/m³"], errors = 'coerce')

In [67]:
# The raw timestamps look like "2017/01/01 01:00", so the format has to spell out
# the "/" separators and the minutes; strict format matching (pandas >= 2.0)
# raises on a format that does not match the data.
NO2_2017['DateTime'] = pd.to_datetime(NO2_2017['TimeStamp'], format='%Y/%m/%d %H:%M')

In [68]:
NO2_2017.drop('TimeStamp', axis=1, inplace=True)

In [69]:
NO2_2017_index = NO2_2017.set_index('DateTime')

In [70]:
df_NO2_2017 = NO2_2017_index.resample('D').mean()

In [71]:
df_NO2_2017.reset_index(inplace=True)

In [72]:
df_NO2_2017 = df_NO2_2017[:-1]

In [73]:
df_NO2_2017.shape

(365, 2)

### Harmful Element NO2 Dataset of 2018

In [74]:
NO2_2018 = pd.read_csv("2018.csv", encoding="utf-8", skiprows=1)

In [75]:
NO2_2018.drop(index=NO2_2018.index[0], axis=0, inplace=True)

In [76]:
NO2_2018.reset_index(inplace=True)

In [77]:
NO2_2018.rename(columns={"index": "TimeStamp", "-999 Valore mancante o invalido": "NO2 µg/m³"}, inplace=True)

In [78]:
indexNames = NO2_2018[NO2_2018['NO2 µg/m³'] == '-999' ].index

In [79]:
NO2_2018.drop(indexNames , inplace=True)

In [80]:
# dropna() returns a new frame and leaves the original untouched, so apply it
# in place to actually remove the NaN rows; the bare name keeps the cell preview.
NO2_2018.dropna(inplace=True)
NO2_2018

Unnamed: 0,TimeStamp,NO2 µg/m³
0,2018/01/01 01:00,56.2
1,2018/01/01 02:00,54.3
2,2018/01/01 03:00,51.6
3,2018/01/01 04:00,55.0
4,2018/01/01 05:00,53.2
...,...,...
8755,2018/12/31 20:00,57.3
8756,2018/12/31 21:00,56.5
8757,2018/12/31 22:00,58.2
8758,2018/12/31 23:00,53.8


In [81]:
NO2_2018["NO2 µg/m³"] = pd.to_numeric(NO2_2018["NO2 µg/m³"], errors = 'coerce')

In [82]:
# The raw timestamps look like "2018/01/01 01:00", so the format has to spell out
# the "/" separators and the minutes; strict format matching (pandas >= 2.0)
# raises on a format that does not match the data.
NO2_2018['DateTime'] = pd.to_datetime(NO2_2018['TimeStamp'], format='%Y/%m/%d %H:%M')

In [83]:
NO2_2018.drop('TimeStamp', axis=1, inplace=True)

In [84]:
NO2_2018_index = NO2_2018.set_index('DateTime')

In [85]:
df_NO2_2018 = NO2_2018_index.resample('D').mean()

In [86]:
df_NO2_2018.reset_index(inplace=True)

In [87]:
df_NO2_2018 = df_NO2_2018[:-1]

In [88]:
df_NO2_2018.shape

(365, 2)

### Harmful Element NO2 Dataset of 2019

In [89]:
NO2_2019 = pd.read_csv("2019.csv", encoding="utf-8", skiprows=1)

In [90]:
NO2_2019.drop(index=NO2_2019.index[0], axis=0, inplace=True)

In [91]:
NO2_2019.reset_index(inplace=True)

In [92]:
NO2_2019.rename(columns={"index": "TimeStamp", "-999 Valore mancante o invalido": "NO2 µg/m³"}, inplace=True)

In [93]:
indexNames = NO2_2019[NO2_2019['NO2 µg/m³'] == '-999' ].index

In [94]:
NO2_2019.drop(indexNames , inplace=True)

In [95]:
# dropna() returns a new frame and leaves the original untouched, so apply it
# in place to actually remove the NaN rows; the bare name keeps the cell preview.
NO2_2019.dropna(inplace=True)
NO2_2019

Unnamed: 0,TimeStamp,NO2 µg/m³
0,2019/01/01 01:00,51.9
1,2019/01/01 02:00,67.0
2,2019/01/01 03:00,67.7
3,2019/01/01 04:00,50.8
4,2019/01/01 05:00,50.0
...,...,...
8755,2019/12/31 20:00,66.0
8756,2019/12/31 21:00,67.8
8757,2019/12/31 22:00,65.4
8758,2019/12/31 23:00,65.0


In [96]:
NO2_2019["NO2 µg/m³"] = pd.to_numeric(NO2_2019["NO2 µg/m³"], errors = 'coerce')

In [97]:
# The raw timestamps look like "2019/01/01 01:00", so the format has to spell out
# the "/" separators and the minutes; strict format matching (pandas >= 2.0)
# raises on a format that does not match the data.
NO2_2019['DateTime'] = pd.to_datetime(NO2_2019['TimeStamp'], format='%Y/%m/%d %H:%M')

In [98]:
NO2_2019.drop('TimeStamp', axis=1, inplace=True)

In [99]:
NO2_2019_index = NO2_2019.set_index('DateTime')

In [100]:
df_NO2_2019 = NO2_2019_index.resample('D').mean()

In [101]:
df_NO2_2019.reset_index(inplace=True)

In [102]:
df_NO2_2019 = df_NO2_2019[:-1]

In [103]:
df_NO2_2019.shape

(365, 2)

In [104]:
df_NO2_2019.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 365 entries, 0 to 364
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   DateTime   365 non-null    datetime64[ns]
 1   NO2 µg/m³  356 non-null    float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 5.8 KB


In [105]:
df_NO2_2019

Unnamed: 0,DateTime,NO2 µg/m³
0,2019-01-01,47.273913
1,2019-01-02,37.833333
2,2019-01-03,51.704167
3,2019-01-04,76.954167
4,2019-01-05,82.070833
...,...,...
360,2019-12-27,65.095833
361,2019-12-28,66.483333
362,2019-12-29,53.404167
363,2019-12-30,39.775000


**Last check**

In [106]:
print(df_NO2_2014.shape)
print(df_NO2_2015.shape)
print(df_NO2_2016.shape) # Leap year with one day more
print(df_NO2_2017.shape)
print(df_NO2_2018.shape)
print(df_NO2_2019.shape)

(365, 2)
(365, 2)
(366, 2)
(365, 2)
(365, 2)
(365, 2)


### Merging all the NO2 datasets into a CSV file:

In [107]:
# Create a list of dataframes:

In [108]:
data_frames = [df_NO2_2014, df_NO2_2015, df_NO2_2016, df_NO2_2017, df_NO2_2018, df_NO2_2019]

In [109]:
# Concatenate all the yearly dataframes into one complete dataframe, stacking them row-wise (axis=0):

In [110]:
# Stack the yearly frames row-wise. ignore_index=True gives the merged frame a
# fresh 0..n-1 index instead of repeating each year's own 0-based labels, so
# label-based lookups on the merged frame address exactly one row.
Milan_NO2_2014_2019 = pd.concat(data_frames, join='outer', axis=0, ignore_index=True)

In [111]:
# Finally, save the merged 2014-2019 dataframe of daily means to a CSV file:

In [112]:
Milan_NO2_2014_2019.to_csv("Milan_NO2_2014_2019.csv", index=False)

In [113]:
df_NO2 = pd.read_csv("Milan_NO2_2014_2019.csv")

In [114]:
df_NO2

Unnamed: 0,DateTime,NO2 µg/m³
0,2014-01-01,80.078261
1,2014-01-02,70.520833
2,2014-01-03,54.245833
3,2014-01-04,52.395833
4,2014-01-05,53.850000
...,...,...
2186,2019-12-27,65.095833
2187,2019-12-28,66.483333
2188,2019-12-29,53.404167
2189,2019-12-30,39.775000
