# Conversion to long format and basic corrections

## Initialization

In [None]:
import os

import numpy as np
import pandas as pd

In [None]:
# Folder the input data is read from
dir_original_files = "./02-dados-qualar-integrados/"

# Folder the results of this notebook are written to
dir_destination = "./03-dados-qualar-longo-corrigido/"

In [None]:
# Full path of the CSV file holding the measurements
ficheiro_medicoes = f"{dir_original_files}df_estacoes.csv"

In [None]:
# Load the measurements: the first CSV column becomes the index and ','
# is interpreted as the thousands separator in numeric fields
df = pd.read_csv(
    ficheiro_medicoes,
    index_col=0,
    thousands=',',
)

In [None]:
# Display the loaded DataFrame for a visual check
df

## Renaming columns
We will use simpler names:

    - Sulfur Dioxide (µg/m3): SO2
    - Particulate Matter < 10 µm (µg/m3): PM10
    - Ozone (µg/m3): O3
    - Nitrogen Dioxide (µg/m3): NO2
    - Carbon Monoxide (mg/m3): CO
    - Benzene (µg/m3): Benzeno
    - Particulate Matter < 2.5 µm (µg/m3): PM2.5

In [None]:
# Replace the long, unit-suffixed column labels with short pollutant codes
df.rename(
    columns={
        'Dióxido de Enxofre (µg/m3)': 'SO2',
        'Partículas < 10 µm (µg/m3)': 'PM10',
        'Dióxido de Azoto (µg/m3)': 'NO2',
        'Ozono (µg/m3)': 'O3',
        'Monóxido de Carbono (mg/m3)': 'CO',
        'Benzeno (µg/m3)': 'Benzeno',
        'Partículas < 2.5 µm (µg/m3)': 'PM2.5',
    },
    inplace=True,
)
# Show the frame to confirm the new labels
df

In [None]:
# Benzene and CO measurements are not used later on, so discard both columns
df.drop(labels=['Benzeno', 'CO'], axis='columns', inplace=True)
# Show the frame to confirm the columns are gone
df

## Convert to long format

In [None]:
# Unpivot the pollutant columns: every row is then identified by the date
# (kept as the index via ignore_index=False), the station and the pollutant
df = pd.melt(df, id_vars=['Estação'], ignore_index=False)
# Show the long-format frame
df

In [None]:
# Report whether any cell of the long-format frame is missing
print('Existem valores nulos:')
print(df.isna().to_numpy().any())

In [None]:
# Discard every row that contains at least one missing entry
df.dropna(axis=0, how='any', inplace=True)
# Confirm that no missing entries remain, then show the frame
print('Existem valores nulos:')
print(df.isna().to_numpy().any())
df

## Convert negative values to zero
Due to sensor imprecision, negative values are sometimes recorded. We will replace all negative values with 0.

In [None]:
# Collect the rows whose measurement is below zero and report whether any exist
negativos = df[df['value'] < 0]
print('Existem valores negativos:')
print(not negativos.empty)

In [None]:
# Replace negative measurements with zero.
# Series.clip(lower=0) is vectorized: it raises every value below 0 to 0 and
# leaves the others untouched, without calling a Python lambda once per row.
df['value'] = df['value'].clip(lower=0)
# Check result: no negative measurement should remain
negativos = df.loc[df['value'] < 0]
print('Existem valores negativos:')
print(not negativos.empty)
df

## Save the new CSV

In [None]:
# Make sure the destination directory exists; to_csv does not create it
os.makedirs(dir_destination, exist_ok=True)
# Save the result as CSV, writing the index as the first column
df.to_csv(dir_destination + '03-medicoes-longo.csv', index=True)

## Save a new CSV containing only measurements from AML stations (Lisbon Metropolitan Area – north and south)

This subset of the data can be used in later stages if the full dataset proves to be too large.

In [None]:
# Using the stations data (estacoes.xlsx), keep only measurements from AML stations
# Regions DataFrame containing the list of stations and their regions
regioes = pd.read_excel(dir_original_files + 'estacoes.xlsx')
# Keep the stations whose region label contains 'AML'.
# na=False counts a missing region as "no match"; without it the mask holds NaN
# and the boolean indexing raises. regex=False matches 'AML' as plain text.
regioes = regioes[regioes['Região'].str.contains('AML', na=False, regex=False)]
# Convert the station column into a list
lista_regioes = regioes['Estação'].to_list()
# Keep the measurements whose station is in the list of AML stations
df_filtrado = df[df['Estação'].isin(lista_regioes)]
df_filtrado

In [None]:
# Make sure the destination directory exists; to_csv does not create it
os.makedirs(dir_destination, exist_ok=True)
# Save the AML subset as CSV, writing the index as the first column
df_filtrado.to_csv(dir_destination + '03-medicoes-longo-AML.csv', index=True)