In [None]:
# load the yearly datasets, count the stations that are complete in each year and save the cleaned data

import pandas as pd
import numpy as np

# function to add week number to the data frame

def add_week(data_frame):
  """Add a 'Week' column holding the ISO calendar week of each 'Time' value.

  The 'Time' column is converted to datetime and written back, so the frame
  passed in is modified in place; the same frame is also returned.

  Note: the week comes from the ISO calendar, so dates close to a year
  boundary can be labelled with week 52/53 (of the previous ISO year) or
  week 1 (of the next ISO year).
  """
  timestamps = pd.to_datetime(data_frame['Time'])
  data_frame['Time'] = timestamps
  data_frame['Week'] = timestamps.dt.isocalendar().week
  return data_frame

# drop NaN in pm25, add week and log_pm25 and keep only stations with complete pm25 values

# raw yearly files and the matching year labels used to name the cleaned files
file_list = ['dataset_2016.csv', 'dataset_2017.csv', 'dataset_2018.csv', 'dataset_2019.csv', 'dataset_2020.csv', 'dataset_2021.csv']
years = ['2016', '2017', '2018', '2019', '2020', '2021']
df_complete_id_stations = []  # per year: number of stations kept as "complete"
path = "../data/"

for file, year in zip(file_list, years):
    # keep only the rows where the target AQ_pm25 is observed
    df = pd.read_csv(path + file).dropna(subset=['AQ_pm25'])

    # a station counts as complete when it has as many rows as the
    # best-covered station of that year
    station_counts = df['IDStations'].value_counts()
    max_occurrences = station_counts.max()
    stations_max_occurrences = station_counts[station_counts == max_occurrences].index.tolist()

    # explicit copy: new columns are added below, and writing them onto a
    # boolean-filtered slice triggers pandas' SettingWithCopyWarning
    df = df[df['IDStations'].isin(stations_max_occurrences)].copy()
    df['log_pm25'] = np.log(df['AQ_pm25'] + 1)
    df = add_week(df)

    name_file = f'dataset_{year}_cleaned.csv'
    df.to_csv(path + name_file, index=False)
    number_stations_max_occurrences = len(stations_max_occurrences)
    df_complete_id_stations.append(number_stations_max_occurrences)

print("Results:")
for idx, stations_52 in enumerate(df_complete_id_stations, 1):
    print(f"Dataset {idx}: Number of station id that appear all the weeks: {stations_52}")

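# in dataset_2021_cleaned.csv the week numbers are misleading: add_week uses the ISO calendar,
# where 1 January 2021 belongs to week 53 of ISO year 2020, so 1/1 is labelled week 53 --> don't use it!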

In [None]:
# we use year 2019

# cleaned 2019 file written by the cleaning loop above
df = pd.read_csv(f"{path}dataset_2019_cleaned.csv")
df

In [None]:
# count the missing entries of every column
missing_values = df.isna().sum()

# show the NaN total of each covariate
print(missing_values)

In [None]:
# drop from dataset covariates AQ_co, AQ_nh3, AQ_so2, LA_soil_use
covariates_to_drop = ['AQ_co', 'AQ_nh3', 'AQ_so2', 'LA_soil_use']

# `df` is rebound here, so running this cell a second time would look for
# columns that are already gone; errors="ignore" keeps the cell re-runnable
# instead of raising KeyError.
df = df.drop(columns=covariates_to_drop, errors="ignore")
df

In [None]:
# NaN values for stations: report the NaN count of the selected covariates for
# each station and remember the stations where a covariate has more than 24 NaN
interested_covariates = ['AQ_pm10', 'AQ_nox', 'AQ_no2', 'LI_pigs', 'LI_bovine']
station_grouped = df.groupby('IDStations')
station_missing_values = {}  # station id -> covariates above the threshold

for station, df_station in station_grouped:
    print(f"Station: {station}")
    missing_values = df_station[interested_covariates].isna().sum()
    over_threshold = missing_values[missing_values > 24]
    if not over_threshold.empty:
        station_missing_values[station] = list(over_threshold.index)
    print(missing_values)

# print stations with missing values
for station, covariate_list in station_missing_values.items():
    print(f"Station: {station} - Covariate with missing values: {covariate_list}")

In [None]:
# drop the stations flagged as having too many missing values
IDStations_to_drop = list(station_missing_values)
keep_rows = ~df['IDStations'].isin(IDStations_to_drop)
df_complete = df[keep_rows]

# count the NaN left in every covariate after removing those stations
missing_values = df_complete.isna().sum()

# show the NaN total of each covariate
print(missing_values)

In [None]:
# fill NaN in df_complete

# (row position, column position) of every remaining NaN cell, printed for inspection
nan_indices, nan_columns = np.where(pd.isnull(df_complete))
print(nan_indices)
print(nan_columns)

# Each NaN is replaced by the mean of the nearest valid value before it and the
# nearest valid value after it. The gaps are located from the data itself and
# the neighbours are searched within the same station, so the result does not
# depend on fixed row/column positions and never mixes two stations.
# Assumes the rows of each station are stored in chronological order -- TODO confirm.
df_filled = df_complete.copy()
gap_columns = [
    column
    for column in df_filled.select_dtypes(include="number").columns
    if column != "IDStations" and df_filled[column].isna().any()
]
if gap_columns:
    per_station = df_complete.groupby("IDStations", sort=False)[gap_columns]
    previous_valid = per_station.ffill()
    next_valid = per_station.bfill()
    # at the start/end of a station's series only one neighbour exists: reuse it
    previous_valid, next_valid = (
        previous_valid.fillna(next_valid),
        next_valid.fillna(previous_valid),
    )
    neighbour_mean = (previous_valid + next_valid) / 2
    # fillna only touches the NaN cells; observed values are left untouched
    df_filled[gap_columns] = df_filled[gap_columns].fillna(neighbour_mean)

# NOTE(review): the row index is written to the file as well (no index=False),
# unlike the cleaned yearly files -- confirm that readers of this file expect it.
df_filled.to_csv(path + "dataset_2019_filled.csv")

# find NaN values
missing_values = df_filled.isnull().sum()


# print NaN number for every covariate
print(missing_values)

In [None]:
# create a function that from a data_frame return a dictionary with id_station and time series of log_pm25 values

def create_time_series(data_frame):
    """Map every station id to its log_pm25 series indexed by week.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Must contain the columns 'IDStations', 'Week' and 'log_pm25'.

    Returns
    -------
    dict
        Keys are the distinct 'IDStations' values in order of first
        appearance; each value is the station's 'log_pm25' Series with the
        'Week' column as index (rows keep the order they have in data_frame).
    """
    station_ids = data_frame['IDStations']
    return {
        station_id: data_frame.loc[station_ids == station_id].set_index('Week')['log_pm25']
        for station_id in station_ids.unique()
    }

dict_time_series = create_time_series(df_filled)
# access a particular station --> dict_time_series['station_name']
# access its log_pm25 values --> dict_time_series['station_name'].values
station_669_series = dict_time_series['669']
print(station_669_series.index)
print(station_669_series.values)
# access the log_pm25 value of a specific week (e.g. week 13) --> dict_time_series['station_name'][13]



In [None]:
def create_matrix_from_dict(dict_stations):
    """Stack the per-station series into a 2-D float array, one row per station.

    Parameters
    ----------
    dict_stations : dict
        Maps a station id to a sequence of values (e.g. a pandas Series).

    Returns
    -------
    numpy.ndarray
        Shape (number of stations, length of the longest series). Rows follow
        the iteration order of the dictionary. Values are copied by position
        (the series index is not used for alignment); shorter series are
        padded with NaN on the right. An empty dictionary gives a (0, 0) array.
    """
    num_stations = len(dict_stations)
    # default=0 keeps an empty dictionary from raising ValueError in max()
    max_length = max((len(series) for series in dict_stations.values()), default=0)

    # start from an all-NaN matrix so that missing tail values stay NaN
    matrix = np.full((num_stations, max_length), np.nan)

    for row, series in enumerate(dict_stations.values()):
        matrix[row, :len(series)] = np.asarray(series)

    return matrix

# one row per station, one column per position in the station's series
log_pm25_matrix = create_matrix_from_dict(dict_time_series)

# show the matrix followed by its (stations, length) shape
print(log_pm25_matrix, log_pm25_matrix.shape, sep="\n")