## On this Notebook, I am going to access to Pollutant web from Catalonia's government, in order to download automatically all DATA available from Tarragona (Bonavista), Pollutant station selected, since the beginning (1991). 

# 1. Download DATA

In [1]:
#Install Sodapy pack to download data from Pollutant website Catalonia's government
! pip install sodapy



In [2]:
#Import libraries required
import os
import pandas as pd
import numpy as np
from sodapy import Socrata
import datetime

In [3]:
#Download data info from web. It is not needed token because we will download data once.
socrata_domain = "analisi.transparenciacatalunya.cat"
socrata_dataset_identifier = "tasf-thgu"
#socrata_token = os.environ.get("None") --> not needed, but just in case, here we have the sentence to include it.

client = Socrata(socrata_domain, None)
print(
    "Domain: {domain:}\nSession: {session:}\nURI Prefix: {uri_prefix:}".format(
        **client.__dict__
    )
)

metadata = client.get_metadata(socrata_dataset_identifier)
[x["name"] for x in metadata["columns"]]

results = client.get(socrata_dataset_identifier,
                    limit=100000, 
                    nom_estacio="Tarragona (Bonavista)")    
    
df = pd.DataFrame.from_dict(results)
df.head()



Domain: analisi.transparenciacatalunya.cat
Session: <requests.sessions.Session object at 0x7f318f164c50>
URI Prefix: https://


Unnamed: 0,codi_eoi,nom_estacio,data,magnitud,contaminant,unitats,tipus_estacio,area_urbana,codi_ine,municipi,...,h19,h20,h21,h22,h23,h24,altitud,latitud,longitud,geocoded_column
0,43148003,Tarragona (Bonavista),2014-07-28T00:00:00.000,11,PM1,µg/m3,industrial,peri-urban,43148,Tarragona,...,10,9,7,8,10,11,39,41.11591,1.191999,"{'type': 'Point', 'coordinates': [1.191999, 41..."
1,43148003,Tarragona (Bonavista),2014-05-04T00:00:00.000,7,NO,µg/m3,industrial,peri-urban,43148,Tarragona,...,3,2,1,1,1,3,39,41.11591,1.191999,"{'type': 'Point', 'coordinates': [1.191999, 41..."
2,43148003,Tarragona (Bonavista),2014-04-15T00:00:00.000,11,PM1,µg/m3,industrial,peri-urban,43148,Tarragona,...,10,14,14,17,20,31,39,41.11591,1.191999,"{'type': 'Point', 'coordinates': [1.191999, 41..."
3,43148003,Tarragona (Bonavista),2014-10-09T00:00:00.000,11,PM1,µg/m3,industrial,peri-urban,43148,Tarragona,...,19,15,18,16,22,26,39,41.11591,1.191999,"{'type': 'Point', 'coordinates': [1.191999, 41..."
4,43148003,Tarragona (Bonavista),2014-05-15T00:00:00.000,12,NOX,µg/m3,industrial,peri-urban,43148,Tarragona,...,6,3,3,13,6,5,39,41.11591,1.191999,"{'type': 'Point', 'coordinates': [1.191999, 41..."


In [4]:
#Data object should be modified to datetime, to later on be merged with Meteorological data.
df['data'] = pd.to_datetime(df['data']) 

In [5]:
#Let's check dataframe size
df.shape

(67847, 40)

In [6]:
#Let's check type features
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67847 entries, 0 to 67846
Data columns (total 40 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   codi_eoi         67847 non-null  object        
 1   nom_estacio      67847 non-null  object        
 2   data             67847 non-null  datetime64[ns]
 3   magnitud         67847 non-null  object        
 4   contaminant      67847 non-null  object        
 5   unitats          67847 non-null  object        
 6   tipus_estacio    67847 non-null  object        
 7   area_urbana      67847 non-null  object        
 8   codi_ine         67847 non-null  object        
 9   municipi         67847 non-null  object        
 10  codi_comarca     67847 non-null  object        
 11  nom_comarca      67847 non-null  object        
 12  h01              65670 non-null  object        
 13  h02              65495 non-null  object        
 14  h03              66307 non-null  objec

In [7]:
#Date is sorted by ascending value
df1 = df.sort_values('data',ascending=True)
df1

Unnamed: 0,codi_eoi,nom_estacio,data,magnitud,contaminant,unitats,tipus_estacio,area_urbana,codi_ine,municipi,...,h19,h20,h21,h22,h23,h24,altitud,latitud,longitud,geocoded_column
23925,43148003,Tarragona (Bonavista),1991-04-01,6,CO,mg/m3,industrial,peri-urban,43148,Tarragona,...,0.4,0.4,0.5,0.9,1.1,0.8,39,41.11591,1.191999,"{'type': 'Point', 'coordinates': [1.191999, 41..."
24584,43148003,Tarragona (Bonavista),1991-04-01,7,NO,µg/m3,industrial,peri-urban,43148,Tarragona,...,4,3,3,5,9,4,39,41.11591,1.191999,"{'type': 'Point', 'coordinates': [1.191999, 41..."
24780,43148003,Tarragona (Bonavista),1991-04-01,44,HCNM,ppm,industrial,peri-urban,43148,Tarragona,...,0,0,0,0,0,0,39,41.11591,1.191999,"{'type': 'Point', 'coordinates': [1.191999, 41..."
24227,43148003,Tarragona (Bonavista),1991-04-01,65,H2S,µg/m3,industrial,peri-urban,43148,Tarragona,...,0.7,1,1.3,1.3,1.8,1.3,39,41.11591,1.191999,"{'type': 'Point', 'coordinates': [1.191999, 41..."
26746,43148003,Tarragona (Bonavista),1991-04-01,8,NO2,µg/m3,industrial,peri-urban,43148,Tarragona,...,41,51,66,106,135,109,39,41.11591,1.191999,"{'type': 'Point', 'coordinates': [1.191999, 41..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67268,43148003,Tarragona (Bonavista),2021-07-19,11,PM1,µg/m3,industrial,suburban,43148,Tarragona,...,,,,,,,39,41.11591,1.1919986,"{'type': 'Point', 'coordinates': [1.1919986, 4..."
67269,43148003,Tarragona (Bonavista),2021-07-19,8,NO2,µg/m3,industrial,suburban,43148,Tarragona,...,,,,,,,39,41.11591,1.1919986,"{'type': 'Point', 'coordinates': [1.1919986, 4..."
67270,43148003,Tarragona (Bonavista),2021-07-19,9,PM2.5,µg/m3,industrial,suburban,43148,Tarragona,...,,,,,,,39,41.11591,1.1919986,"{'type': 'Point', 'coordinates': [1.1919986, 4..."
67277,43148003,Tarragona (Bonavista),2021-07-19,12,NOX,µg/m3,industrial,suburban,43148,Tarragona,...,,,,,,,39,41.11591,1.1919986,"{'type': 'Point', 'coordinates': [1.1919986, 4..."


# 2. Select time period to be analyzed

In [8]:
#Desired period to analyze is 2010-2020. Period decision is made because of many values from 2021 have not been confirmed yet.
df2 = df1[(df1['data'] >= '2010-01-01') & (df1['data'] <= '2020-12-31')]
df2

Unnamed: 0,codi_eoi,nom_estacio,data,magnitud,contaminant,unitats,tipus_estacio,area_urbana,codi_ine,municipi,...,h19,h20,h21,h22,h23,h24,altitud,latitud,longitud,geocoded_column
65531,43148003,Tarragona (Bonavista),2010-01-01,1,SO2,µg/m3,industrial,peri-urban,43148,Tarragona,...,1,1,1,1,1,1,39,41.11591,1.191999,"{'type': 'Point', 'coordinates': [1.191999, 41..."
64476,43148003,Tarragona (Bonavista),2010-01-01,65,H2S,µg/m3,industrial,peri-urban,43148,Tarragona,...,1,1,1,1.1,1.1,1.3,39,41.11591,1.191999,"{'type': 'Point', 'coordinates': [1.191999, 41..."
65109,43148003,Tarragona (Bonavista),2010-01-01,8,NO2,µg/m3,industrial,peri-urban,43148,Tarragona,...,3,7,3,3,7,5,39,41.11591,1.191999,"{'type': 'Point', 'coordinates': [1.191999, 41..."
64939,43148003,Tarragona (Bonavista),2010-01-01,7,NO,µg/m3,industrial,peri-urban,43148,Tarragona,...,1,1,1,1,1,1,39,41.11591,1.191999,"{'type': 'Point', 'coordinates': [1.191999, 41..."
64100,43148003,Tarragona (Bonavista),2010-01-01,6,CO,mg/m3,industrial,peri-urban,43148,Tarragona,...,0.2,0.2,0.2,0.2,0.2,0.2,39,41.11591,1.191999,"{'type': 'Point', 'coordinates': [1.191999, 41..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66931,43148003,Tarragona (Bonavista),2020-12-31,10,PM10,µg/m3,industrial,suburban,43148,Tarragona,...,22,22,23,27,25,23,39,41.11591,1.1919986,"{'type': 'Point', 'coordinates': [1.1919986, 4..."
66933,43148003,Tarragona (Bonavista),2020-12-31,12,NOX,µg/m3,industrial,suburban,43148,Tarragona,...,19,27,37,51,35,22,39,41.11591,1.1919986,"{'type': 'Point', 'coordinates': [1.1919986, 4..."
66937,43148003,Tarragona (Bonavista),2020-12-31,8,NO2,µg/m3,industrial,suburban,43148,Tarragona,...,18,25,32,40,32,20,39,41.11591,1.1919986,"{'type': 'Point', 'coordinates': [1.1919986, 4..."
66938,43148003,Tarragona (Bonavista),2020-12-31,9,PM2.5,µg/m3,industrial,suburban,43148,Tarragona,...,4,19,17,20,22,20,39,41.11591,1.1919986,"{'type': 'Point', 'coordinates': [1.1919986, 4..."


In [9]:
#Let's check new dataframe size
df2.shape

(28283, 40)

In [10]:
#Show columns
df2.columns

Index(['codi_eoi', 'nom_estacio', 'data', 'magnitud', 'contaminant', 'unitats',
       'tipus_estacio', 'area_urbana', 'codi_ine', 'municipi', 'codi_comarca',
       'nom_comarca', 'h01', 'h02', 'h03', 'h04', 'h05', 'h06', 'h07', 'h08',
       'h09', 'h10', 'h11', 'h12', 'h13', 'h14', 'h15', 'h16', 'h17', 'h18',
       'h19', 'h20', 'h21', 'h22', 'h23', 'h24', 'altitud', 'latitud',
       'longitud', 'geocoded_column'],
      dtype='object')

In [11]:
#Features values should be transformed to float to be interpreted mathematically
df2[['h01', 'h02', 'h03', 'h04', 'h05', 'h06', 'h07', 'h08',
       'h09', 'h10', 'h11', 'h12', 'h13', 'h14', 'h15', 'h16', 'h17', 'h18',
       'h19', 'h20', 'h21', 'h22', 'h23', 'h24']] = df2[['h01', 'h02', 'h03', 'h04', 'h05', 'h06', 'h07', 'h08',
       'h09', 'h10', 'h11', 'h12', 'h13', 'h14', 'h15', 'h16', 'h17', 'h18',
       'h19', 'h20', 'h21', 'h22', 'h23', 'h24']].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [12]:
#Let's confirm type features modified
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28283 entries, 65531 to 66936
Data columns (total 40 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   codi_eoi         28283 non-null  object        
 1   nom_estacio      28283 non-null  object        
 2   data             28283 non-null  datetime64[ns]
 3   magnitud         28283 non-null  object        
 4   contaminant      28283 non-null  object        
 5   unitats          28283 non-null  object        
 6   tipus_estacio    28283 non-null  object        
 7   area_urbana      28283 non-null  object        
 8   codi_ine         28283 non-null  object        
 9   municipi         28283 non-null  object        
 10  codi_comarca     28283 non-null  object        
 11  nom_comarca      28283 non-null  object        
 12  h01              27277 non-null  float64       
 13  h02              27121 non-null  float64       
 14  h03              27818 non-null  f

In [13]:
#Let's create a column at the end of dataframe with average daily value and maximum value for any feature
ave = df2.loc[: , "h01":"h24"]
df2['promedio'] = ave.mean(axis=1)
maxi = df2.loc[: , "h01":"h24"]
df2['maximo'] = maxi.max(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [14]:
#Let's confirm dataframe
df2.head()

Unnamed: 0,codi_eoi,nom_estacio,data,magnitud,contaminant,unitats,tipus_estacio,area_urbana,codi_ine,municipi,...,h21,h22,h23,h24,altitud,latitud,longitud,geocoded_column,promedio,maximo
65531,43148003,Tarragona (Bonavista),2010-01-01,1,SO2,µg/m3,industrial,peri-urban,43148,Tarragona,...,1.0,1.0,1.0,1.0,39,41.11591,1.191999,"{'type': 'Point', 'coordinates': [1.191999, 41...",1.0,1.0
64476,43148003,Tarragona (Bonavista),2010-01-01,65,H2S,µg/m3,industrial,peri-urban,43148,Tarragona,...,1.0,1.1,1.1,1.3,39,41.11591,1.191999,"{'type': 'Point', 'coordinates': [1.191999, 41...",1.108333,1.4
65109,43148003,Tarragona (Bonavista),2010-01-01,8,NO2,µg/m3,industrial,peri-urban,43148,Tarragona,...,3.0,3.0,7.0,5.0,39,41.11591,1.191999,"{'type': 'Point', 'coordinates': [1.191999, 41...",2.625,7.0
64939,43148003,Tarragona (Bonavista),2010-01-01,7,NO,µg/m3,industrial,peri-urban,43148,Tarragona,...,1.0,1.0,1.0,1.0,39,41.11591,1.191999,"{'type': 'Point', 'coordinates': [1.191999, 41...",1.0,1.0
64100,43148003,Tarragona (Bonavista),2010-01-01,6,CO,mg/m3,industrial,peri-urban,43148,Tarragona,...,0.2,0.2,0.2,0.2,39,41.11591,1.191999,"{'type': 'Point', 'coordinates': [1.191999, 41...",0.2,0.2


In [15]:
#Let's reduce dataframe to columns desired: date, pollutant, average and maximum value
df3=df2[['data','contaminant','promedio','maximo']]
df3

Unnamed: 0,data,contaminant,promedio,maximo
65531,2010-01-01,SO2,1.000000,1.0
64476,2010-01-01,H2S,1.108333,1.4
65109,2010-01-01,NO2,2.625000,7.0
64939,2010-01-01,NO,1.000000,1.0
64100,2010-01-01,CO,0.200000,0.2
...,...,...,...,...
66931,2020-12-31,PM10,8.916667,27.0
66933,2020-12-31,NOX,12.583333,51.0
66937,2020-12-31,NO2,10.791667,40.0
66938,2020-12-31,PM2.5,6.708333,22.0


# 3. Structuring Dataset

In [16]:
#Let's generate a pivot table to allocate pollutant in columns as features and date samples as rows
table = df3.pivot_table(index='data', columns='contaminant', aggfunc=np.sum)
table

Unnamed: 0_level_0,maximo,maximo,maximo,maximo,maximo,maximo,maximo,maximo,maximo,promedio,promedio,promedio,promedio,promedio,promedio,promedio,promedio,promedio
contaminant,CO,H2S,NO,NO2,NOX,PM1,PM10,PM2.5,SO2,CO,H2S,NO,NO2,NOX,PM1,PM10,PM2.5,SO2
data,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
2010-01-01,0.2,1.4,1.0,7.0,,,,,1.0,0.200000,1.108333,1.000000,2.625000,,,,,1.000000
2010-01-02,0.2,1.8,13.0,44.0,,,,,1.0,0.200000,1.137500,2.250000,12.916667,,,,,1.000000
2010-01-03,0.3,1.6,17.0,48.0,,,,,7.0,0.204167,1.158333,3.625000,22.166667,,,,,1.875000
2010-01-04,0.2,2.0,26.0,42.0,,,,,4.0,0.200000,1.382609,8.217391,28.304348,,,,,1.652174
2010-01-05,0.4,1.4,23.0,44.0,,,,,2.0,0.220833,1.037500,5.750000,24.958333,,,,,1.083333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-12-27,,3.1,13.0,44.0,63.0,23.0,28.0,24.0,5.0,,1.408333,2.833333,13.625000,17.416667,8.291667,10.458333,8.916667,1.500000
2020-12-28,,1.6,6.0,27.0,37.0,4.0,26.0,6.0,9.0,,1.208333,1.541667,7.833333,9.666667,2.083333,7.541667,3.083333,5.416667
2020-12-29,,1.5,5.0,15.0,16.0,8.0,14.0,9.0,4.0,,1.150000,1.250000,6.791667,8.166667,2.458333,4.708333,3.208333,2.333333
2020-12-30,,1.3,3.0,20.0,25.0,2.0,42.0,5.0,1.0,,1.120833,1.208333,5.458333,6.916667,1.416667,7.500000,2.250000,1.000000


In [17]:
#Reduce a unique column row
table = pd.DataFrame(table.to_records())
table

Unnamed: 0,data,"('maximo', 'CO')","('maximo', 'H2S')","('maximo', 'NO')","('maximo', 'NO2')","('maximo', 'NOX')","('maximo', 'PM1')","('maximo', 'PM10')","('maximo', 'PM2.5')","('maximo', 'SO2')","('promedio', 'CO')","('promedio', 'H2S')","('promedio', 'NO')","('promedio', 'NO2')","('promedio', 'NOX')","('promedio', 'PM1')","('promedio', 'PM10')","('promedio', 'PM2.5')","('promedio', 'SO2')"
0,2010-01-01,0.2,1.4,1.0,7.0,,,,,1.0,0.200000,1.108333,1.000000,2.625000,,,,,1.000000
1,2010-01-02,0.2,1.8,13.0,44.0,,,,,1.0,0.200000,1.137500,2.250000,12.916667,,,,,1.000000
2,2010-01-03,0.3,1.6,17.0,48.0,,,,,7.0,0.204167,1.158333,3.625000,22.166667,,,,,1.875000
3,2010-01-04,0.2,2.0,26.0,42.0,,,,,4.0,0.200000,1.382609,8.217391,28.304348,,,,,1.652174
4,2010-01-05,0.4,1.4,23.0,44.0,,,,,2.0,0.220833,1.037500,5.750000,24.958333,,,,,1.083333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3987,2020-12-27,,3.1,13.0,44.0,63.0,23.0,28.0,24.0,5.0,,1.408333,2.833333,13.625000,17.416667,8.291667,10.458333,8.916667,1.500000
3988,2020-12-28,,1.6,6.0,27.0,37.0,4.0,26.0,6.0,9.0,,1.208333,1.541667,7.833333,9.666667,2.083333,7.541667,3.083333,5.416667
3989,2020-12-29,,1.5,5.0,15.0,16.0,8.0,14.0,9.0,4.0,,1.150000,1.250000,6.791667,8.166667,2.458333,4.708333,3.208333,2.333333
3990,2020-12-30,,1.3,3.0,20.0,25.0,2.0,42.0,5.0,1.0,,1.120833,1.208333,5.458333,6.916667,1.416667,7.500000,2.250000,1.000000


In [18]:
#Columns are going to be renamed
table.columns

Index(['data', '('maximo', 'CO')', '('maximo', 'H2S')', '('maximo', 'NO')',
       '('maximo', 'NO2')', '('maximo', 'NOX')', '('maximo', 'PM1')',
       '('maximo', 'PM10')', '('maximo', 'PM2.5')', '('maximo', 'SO2')',
       '('promedio', 'CO')', '('promedio', 'H2S')', '('promedio', 'NO')',
       '('promedio', 'NO2')', '('promedio', 'NOX')', '('promedio', 'PM1')',
       '('promedio', 'PM10')', '('promedio', 'PM2.5')', '('promedio', 'SO2')'],
      dtype='object')

In [19]:
#Let's rename columns
table.columns = ['data','CO max','H2S max','NO max','NO2 max','NOx max','PM1 max','PM10 max','PM2.5 max','SO2 max','CO','H2S','NO','NO2','NOx','PM1','PM10','PM2.5','SO2']
table

Unnamed: 0,data,CO max,H2S max,NO max,NO2 max,NOx max,PM1 max,PM10 max,PM2.5 max,SO2 max,CO,H2S,NO,NO2,NOx,PM1,PM10,PM2.5,SO2
0,2010-01-01,0.2,1.4,1.0,7.0,,,,,1.0,0.200000,1.108333,1.000000,2.625000,,,,,1.000000
1,2010-01-02,0.2,1.8,13.0,44.0,,,,,1.0,0.200000,1.137500,2.250000,12.916667,,,,,1.000000
2,2010-01-03,0.3,1.6,17.0,48.0,,,,,7.0,0.204167,1.158333,3.625000,22.166667,,,,,1.875000
3,2010-01-04,0.2,2.0,26.0,42.0,,,,,4.0,0.200000,1.382609,8.217391,28.304348,,,,,1.652174
4,2010-01-05,0.4,1.4,23.0,44.0,,,,,2.0,0.220833,1.037500,5.750000,24.958333,,,,,1.083333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3987,2020-12-27,,3.1,13.0,44.0,63.0,23.0,28.0,24.0,5.0,,1.408333,2.833333,13.625000,17.416667,8.291667,10.458333,8.916667,1.500000
3988,2020-12-28,,1.6,6.0,27.0,37.0,4.0,26.0,6.0,9.0,,1.208333,1.541667,7.833333,9.666667,2.083333,7.541667,3.083333,5.416667
3989,2020-12-29,,1.5,5.0,15.0,16.0,8.0,14.0,9.0,4.0,,1.150000,1.250000,6.791667,8.166667,2.458333,4.708333,3.208333,2.333333
3990,2020-12-30,,1.3,3.0,20.0,25.0,2.0,42.0,5.0,1.0,,1.120833,1.208333,5.458333,6.916667,1.416667,7.500000,2.250000,1.000000


In [20]:
#Summary of table got
table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3992 entries, 0 to 3991
Data columns (total 19 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   data       3992 non-null   datetime64[ns]
 1   CO max     343 non-null    float64       
 2   H2S max    3961 non-null   float64       
 3   NO max     3952 non-null   float64       
 4   NO2 max    3952 non-null   float64       
 5   NOx max    3323 non-null   float64       
 6   PM1 max    2638 non-null   float64       
 7   PM10 max   3073 non-null   float64       
 8   PM2.5 max  3066 non-null   float64       
 9   SO2 max    3975 non-null   float64       
 10  CO         343 non-null    float64       
 11  H2S        3961 non-null   float64       
 12  NO         3952 non-null   float64       
 13  NO2        3952 non-null   float64       
 14  NOx        3323 non-null   float64       
 15  PM1        2638 non-null   float64       
 16  PM10       3073 non-null   float64       


# 4. Export Dataset

In [21]:
#Finally, Pollutant dataset is exported to csv to be merged with Meteorological dataset
table.to_csv(r'Data_sets/Pollutant_ready.csv')