# Exploratory Data Analysis of the measurements

## Initialization

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math as math

In [None]:
# Directory that holds the input data; file names are appended directly to this
# string, so the trailing slash must be kept
dir_files = './03-dados-qualar-longo-corrigido/'

In [None]:
# Full path of the CSV file containing the measurements
ficheiro_medicoes = f'{dir_files}03-medicoes-longo-AML.csv'

In [None]:
# Load the measurements: the first CSV column becomes the index (with date parsing
# enabled for it) and ',' is treated as the thousands separator in numeric fields
medicoes = pd.read_csv(ficheiro_medicoes, thousands=',', index_col=0, parse_dates=True)

## Evaluate, for each pollutant, whether the distribution of measurement values varies across stations

In [None]:
# Display the DataFrame
medicoes

In [None]:
# Create the pollutants list: the distinct values found in the 'variable' column
lista_poluentes = medicoes['variable'].unique()
lista_poluentes

In [None]:
# Create the stations list: the distinct values found in the 'Estação' column
lista_estacoes = medicoes['Estação'].unique()
lista_estacoes

In [None]:
# Function that returns two random stations
def duas_estacoes():
    """Draw two distinct stations at random from ``lista_estacoes``."""
    quantidade = 2
    # replace=False guarantees the two stations are different
    return np.random.choice(lista_estacoes, quantidade, replace=False)

In [None]:
# Function that returns three years of measurements from a list of available years
def tres_anos(anos_disponiveis):
    """Pick up to three distinct years at random, in ascending order.

    Parameters
    ----------
    anos_disponiveis : sequence
        One-dimensional collection of the years to choose from
        (for example the result of ``Series.unique()``).

    Returns
    -------
    numpy.ndarray
        Three distinct years sorted ascending, or every available year
        (sorted) when fewer than three are supplied.
    """
    # Sampling without replacement cannot draw more items than exist, so cap
    # the sample size for stations with fewer than three years of data.
    quantidade = min(3, len(anos_disponiveis))
    tres = np.random.choice(anos_disponiveis, size=quantidade, replace=False)
    tres.sort()
    return tres

In [None]:
# Arbitrary palette; its only purpose is to make the individual plots easier to tell apart
lista_cores = [
    'blue',
    'red',
    'green',
    'orange',
    'purple',
    'cyan',
    'magenta',
    'yellow',
    'brown',
    'pink',
    'olive',
]

In [None]:
# Function that returns any desired number of colors
def diferentes_cores(numero_cores):
    """Return ``numero_cores`` colors drawn at random from ``lista_cores``.

    Parameters
    ----------
    numero_cores : int
        How many colors are needed.

    Returns
    -------
    numpy.ndarray
        The chosen color names. They are all distinct as long as
        ``numero_cores`` does not exceed the palette size; beyond that,
        colors are allowed to repeat.
    """
    # Sampling without replacement raises ValueError when more items are
    # requested than the palette holds, so permit repeats only in that case.
    repetir = numero_cores > len(lista_cores)
    return np.random.choice(lista_cores, size=numero_cores, replace=repetir)

In [None]:
### NOTE: The following commands were executed using fixed lists, since using a function that returns multiple
### stations or different pollutants may cause issues, as not all stations measure all pollutants.
### That said, randomness-based functions were created to test these scenarios.

In [None]:
# Histograms of SO2 measurement values at a fixed pair of stations, one figure per
# station, to compare how the values are distributed
estacoes = ['Mem Martins', 'Reboleira']
cores = diferentes_cores(len(estacoes))
hist_data = medicoes[medicoes['variable'] == 'SO2']

for estacao, cor in zip(estacoes, cores):
    # Keep only the rows belonging to the current station
    hist = hist_data[hist_data['Estação'] == estacao]
    # Logarithmic histogram
    hist.plot(
        kind='hist',
        column='value',
        log=True,
        xlabel='Value',
        title=estacao,
        color=cor,
    )

In [None]:
# Box plots of SO2 measurement values at a fixed pair of stations, one figure per
# station, to compare how the values are distributed
estacoes = ['Escavadeira', 'Fidalguinhos']
cores = diferentes_cores(len(estacoes))
box_data = medicoes[medicoes['variable'] == 'SO2']

for estacao, cor in zip(estacoes, cores):
    box = box_data[box_data['Estação'] == estacao]
    # Keep only values at or below the 3rd quartile (used here to drop outliers)
    ter_quartil = box['value'].quantile(0.75)
    box = box[box['value'] <= ter_quartil]
    # BoxPlot
    box.plot(kind='box', column='value', title=estacao, color=cor)

## Evaluate, for each pollutant, whether the distribution of measurement values varies over time

In [None]:
# Function that computes the decade from a given year
def calc_dec(Ano):
    """Return the decade of ``Ano`` within its century, as a multiple of ten.

    The century is discarded, so 1995 maps to 90 and 2003 maps to 0.
    """
    # Remainder after removing whole centuries (e.g. 1995 -> 95)
    _, ano_no_seculo = divmod(Ano, 100)
    # Drop the units digit to land on the start of the decade
    return math.floor(ano_no_seculo / 10) * 10

In [None]:
# Add the year (taken from the index) and the decade derived from it as new columns
medicoes['Ano'] = medicoes.index.year
medicoes['Década'] = medicoes['Ano'].apply(calc_dec)

In [None]:
# Display the DataFrame to check the newly added columns
medicoes

In [None]:
# Histograms of O3 measurement values at a single station, one figure per sampled
# year, to compare how the distribution changes between years
estacao = 'Alto Seixalinho'
# Select the pollutant and the station in a single step
hist_data = medicoes[(medicoes['variable'] == 'O3') & (medicoes['Estação'] == estacao)]
print('Estudo para a estação:')
print(estacao)
print()

anos = tres_anos(hist_data['Ano'].unique())
cores = diferentes_cores(len(anos))

print('Anos estudados:')
for ano, cor in zip(anos, cores):
    print(ano)
    titulo = f'{estacao} {ano!s}'
    hist = hist_data[hist_data['Ano'] == ano]
    # Logarithmic histogram
    hist.plot(
        kind='hist',
        column='value',
        log=True,
        xlabel='Value',
        title=titulo,
        color=cor,
    )

In [None]:
# Box plots of NO2 measurement values at a single station, one figure per sampled
# year, to compare how the distribution changes between years
box_data = medicoes.loc[medicoes['variable'] == 'NO2']
estacao = 'Alverca'
print('Estudo para a estação:')
print(estacao)
print()

box_data = box_data.loc[box_data['Estação'] == estacao]
# Sample the years from this cell's own selection (NO2 at this station), not from
# the histogram data left over from an earlier cell
anos = tres_anos(box_data['Ano'].unique())
cores = diferentes_cores(len(anos))

print('Anos estudados:')
for ano, cor in zip(anos, cores):
    print(ano)
    titulo = estacao + ' ' + str(ano)
    # Start from the current year's rows on every iteration, so each plot is
    # independent of the previous one
    box = box_data.loc[box_data['Ano'] == ano]
    # Keep only values at or below the 3rd quartile (used here to drop outliers)
    ter_quartil = box['value'].quantile(0.75)
    box = box.loc[box['value'] <= ter_quartil]
    # BoxPlot
    box.plot(kind='box', column='value', title=titulo, color=cor)

## Now we will analyze the evolution over the decades

In [None]:
# Display the DataFrame
medicoes

In [None]:
# Decades list: the distinct values found in the 'Década' column
lista_decadas = medicoes['Década'].unique()
lista_decadas

In [None]:
# Histograms of NO2 measurement values for a fixed pair of stations, one figure per
# station and decade, to compare how the distribution changes across decades
estacoes = ['Avenida da Liberdade', 'Alfragide/Amadora']
cores = diferentes_cores(len(estacoes))
hist_data = medicoes[medicoes['variable'] == 'NO2']

for estacao, cor in zip(estacoes, cores):
    # The station selection does not depend on the decade, so do it once per station
    medicoes_estacao = hist_data[hist_data['Estação'] == estacao]
    for decada in lista_decadas:
        titulo = f'{estacao} {decada!s}'
        hist = medicoes_estacao[medicoes_estacao['Década'] == decada]
        # Logarithmic histogram
        hist.plot(
            kind='hist',
            column='value',
            log=True,
            xlabel='Value',
            title=titulo,
            color=cor,
        )

In [None]:
# Box plots of PM2.5 measurement values for a fixed pair of stations, one figure per
# station and decade, to compare how the distribution changes across decades
estacoes = ['Entrecampos', 'Olivais']
cores = diferentes_cores(len(estacoes))
box_data = medicoes[medicoes['variable'] == 'PM2.5']

for estacao, cor in zip(estacoes, cores):
    # The station selection does not depend on the decade, so do it once per station
    medicoes_estacao = box_data[box_data['Estação'] == estacao]
    for decada in lista_decadas:
        titulo = f'{estacao} {decada!s}'
        box = medicoes_estacao[medicoes_estacao['Década'] == decada]
        # Keep only values at or below the 3rd quartile (used here to drop outliers)
        ter_quartil = box['value'].quantile(0.75)
        box = box[box['value'] <= ter_quartil]
        # BoxPlot
        box.plot(kind='box', column='value', title=titulo, color=cor)