In [53]:
# load the yearly datasets, count the stations with complete pm25 data in each year and save the cleaned data

import pandas as pd
import numpy as np

# function to add week number to the data frame

def add_week(data_frame):
  """Parse the 'Time' column and attach the matching ISO week number.

  The frame is modified in place: 'Time' is replaced by its
  ``pd.to_datetime`` conversion and a 'Week' column is added (or
  overwritten) with the ISO-8601 week of each timestamp. The same frame
  object is returned for convenience.

  Note: ISO weeks do not always follow the calendar year, so a date in
  early January can be labelled week 52/53 (e.g. 2021-01-01 -> week 53).
  """
  timestamps = pd.to_datetime(data_frame['Time'])
  data_frame['Time'] = timestamps
  data_frame['Week'] = timestamps.dt.isocalendar().week
  return data_frame

# drop NaN in pm25, add week and log_pm25 and keep only stations with complete pm25 values

file_list = ['dataset_2016.csv', 'dataset_2017.csv', 'dataset_2018.csv', 'dataset_2019.csv', 'dataset_2020.csv', 'dataset_2021.csv']
years = ['2016', '2017', '2018', '2019', '2020', '2021']
df_complete_id_stations = []  # one entry per file: number of stations kept for that year

for file, year in zip(file_list, years):
  # Read the raw file and discard rows without a pm25 measurement.
  # (No inplace mutation: the result is a fresh frame.)
  df = pd.read_csv(file).dropna(subset=['AQ_pm25'])

  # A station is kept when it has as many remaining rows as the
  # best-covered station of that file.
  station_counts = df['IDStations'].value_counts()
  max_occurrences = station_counts.max()
  stations_max_occurrences = station_counts[station_counts == max_occurrences].index.tolist()

  # .copy() makes the filtered frame independent of the one it was sliced
  # from, so the column assignments below (here and inside add_week) are
  # not chained assignments on a slice and raise no SettingWithCopyWarning.
  df = df[df['IDStations'].isin(stations_max_occurrences)].copy()
  df['log_pm25'] = np.log(df['AQ_pm25']+1)
  df = add_week(df)

  # Save the cleaned data of this year next to the raw file.
  name_file = f'dataset_{year}_cleaned.csv'
  df.to_csv(name_file, index=False)

  number_stations_max_occurrences = len(stations_max_occurrences)
  df_complete_id_stations.append(number_stations_max_occurrences)

print("Results:")
for idx, stations_52 in enumerate(df_complete_id_stations, 1):
    print(f"Dataset {idx}: Number of station id that appear all the weeks: {stations_52}")

# in dataset_2021_cleaned.csv the weeks are wrong: 2021-01-01 belongs to ISO week 53 of 2020, so it is labelled week 53 --> don't use it!

Results:
Dataset 1: Number of station id that appear all the weeks: 42
Dataset 2: Number of station id that appear all the weeks: 41
Dataset 3: Number of station id that appear all the weeks: 43
Dataset 4: Number of station id that appear all the weeks: 46
Dataset 5: Number of station id that appear all the weeks: 45
Dataset 6: Number of station id that appear all the weeks: 29


In [56]:
# we use year 2019

# Read back the cleaned 2019 file and display it.
cleaned_2019_csv = 'dataset_2019_cleaned.csv'
df = pd.read_csv(cleaned_2019_csv)
df

Unnamed: 0,IDStations,Latitude,Longitude,Time,Altitude,AQ_pm10,AQ_pm25,AQ_co,AQ_nh3,AQ_nox,...,EM_nox_sum,EM_so2_sum,LI_pigs,LI_bovine,LA_hvi,LA_lvi,LA_land_use,LA_soil_use,log_pm25,Week
0,1264,46.167852,9.87921,2019-01-04,290,42.000000,31.714286,,,89.381429,...,10.568,2.6433,0.287400,4.647000,3.996143,1.232714,112,17.0,3.487812,1
1,1264,46.167852,9.87921,2019-01-11,290,33.428571,26.857143,,,89.350000,...,10.611,2.6905,0.287400,4.647000,3.994429,1.231000,112,17.0,3.327089,2
2,1264,46.167852,9.87921,2019-01-18,290,34.857143,28.714286,,,78.017143,...,10.650,2.7215,0.294243,4.647000,3.995000,1.231429,112,17.0,3.391628,3
3,1264,46.167852,9.87921,2019-01-25,290,42.857143,35.142857,,,89.498571,...,10.586,2.7023,0.335300,4.647000,3.996000,1.232714,112,17.0,3.587479,4
4,1264,46.167852,9.87921,2019-02-01,290,33.571429,30.428571,,,95.164286,...,10.317,2.5831,0.335300,4.647000,3.996857,1.234143,112,17.0,3.447717,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2387,STA.IT2282A,45.439700,8.62040,2019-11-29,154,24.000000,18.142857,,,,...,72.580,11.2580,63.570000,7.836714,4.138286,1.009229,112,,2.951930,48
2388,STA.IT2282A,45.439700,8.62040,2019-12-06,154,43.000000,33.857143,,,,...,72.370,11.3220,63.570000,7.846714,4.138000,0.984329,112,,3.551258,49
2389,STA.IT2282A,45.439700,8.62040,2019-12-13,154,28.571429,23.000000,,,,...,70.484,11.1090,63.562857,7.852286,4.137571,0.962929,112,,3.178054,50
2390,STA.IT2282A,45.439700,8.62040,2019-12-20,154,15.285714,8.714286,,,,...,68.036,10.7940,63.560000,7.860000,4.136714,0.953657,112,,2.273598,51


In [57]:
# create a function that, given a data frame, returns a dictionary mapping each station id to its time series of log_pm25 values

def create_time_series(data_frame):
    """Split a data frame into one log_pm25 series per station.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Must contain the columns 'IDStations', 'Week' and 'log_pm25'.

    Returns
    -------
    dict
        Maps each distinct value of 'IDStations' (in order of first
        appearance) to a Series of that station's 'log_pm25' values
        indexed by 'Week'.
    """
    station_column = data_frame['IDStations']
    return {
        # Rows of this station, re-indexed by week, reduced to the log_pm25 column.
        station_id: data_frame.loc[station_column == station_id].set_index('Week')['log_pm25']
        for station_id in station_column.unique()
    }

dict_time_series = create_time_series(df)

# Usage:
#   series of a particular station      --> dict_time_series['station_name']
#   its log_pm25 values as an array     --> dict_time_series['station_name'].values
#   value of a specific week (e.g. 13)  --> dict_time_series['station_name'][13]
example_series = dict_time_series['STA.IT2282A']
print(example_series.index)
print(example_series.values)



Int64Index([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
            18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
            35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
            52],
           dtype='int64', name='Week')
[3.46126162 3.31158522 3.72051654 3.73084365 3.17208366 3.34739468
 3.91487607 3.34739468 2.50143595 2.22847712 2.25878247 2.79902198
 2.95936463 2.58668934 2.63905733 2.18122424 1.94591015 2.02438176
 2.00533357 2.16496372 2.34454929 2.63905733 2.66921037 2.7080502
 2.83321334 2.7902883  2.56494936 2.10006083 2.84158159 2.47293046
 2.49869997 2.62880083 2.47293046 2.89037176 2.62880083 2.24374459
 3.04452244 2.82477448 2.71752895 2.89827694 2.95192965 2.94443898
 3.06472515 2.57768838 2.44853901 2.56494936 2.58668934 2.95192965
 3.55125808 3.17805383 2.27359756 3.81928095]


In [61]:
def create_matrix_from_dict(dict_stations):
    """Stack the per-station series into a 2-D float array.

    Parameters
    ----------
    dict_stations : dict
        Maps a station id to a pandas Series of values (as returned by
        ``create_time_series``).

    Returns
    -------
    numpy.ndarray
        Array of shape (number of stations, length of the longest series).
        Row ``i`` holds the values of the i-th dictionary entry. Values are
        copied by position starting at column 0 (the series index is not
        used for alignment); shorter series are padded with NaN on the
        right. An empty dictionary yields an array of shape (0, 0).
    """
    all_series = list(dict_stations.values())

    # default=0 keeps max() from raising ValueError when there are no stations.
    max_length = max((len(series) for series in all_series), default=0)

    # Start from an all-NaN matrix so that unfilled cells mark missing values.
    matrix = np.full((len(all_series), max_length), np.nan)

    for row, series in enumerate(all_series):
        matrix[row, :len(series)] = series.values

    return matrix

# Turn the per-station series into one array (one row per station), then show it and its shape.
log_pm25_matrix = create_matrix_from_dict(dict_time_series)
print(log_pm25_matrix, log_pm25_matrix.shape, sep='\n')

[[3.48781185 3.32708941 3.39162793 ... 3.64133851 3.19575341 3.63381968]
 [3.88890059 3.52636052 3.74106521 ... 3.48343548 2.7080502  4.05797692]
 [4.05550473 3.79066215 3.83483337 ... 3.63758616 3.05803616 4.15888308]
 ...
 [3.47903987 3.03082359 3.31678004 ... 2.88240359 2.59738463 3.44316158]
 [3.9539872  3.74782199 3.07136969 ... 3.2358734  2.6492097  3.68887945]
 [3.46126162 3.31158522 3.72051654 ... 3.17805383 2.27359756 3.81928095]]
(46, 52)
