In [42]:
%reset -f
%config InteractiveShell.ast_node_interactivity = 'all'

# Fetching all Air Quality datasets into their dataframes
# Perform immediate concatenation per year
import os
import pandas as pd

# Initialize (do not add extra datasets to dir)
dir = 'CAdata/'
colidx = [0, 2, 4, 17]     # column indexes to use
innerkeys = ['Date', 'Site ID', 'COUNTY']

# One CO file per year seeds that year's dataframe.
# NOTE(review): parse_dates=True only asks pandas to parse the index; the
# displayed Date values remain MM/DD/YYYY strings -- confirm whether real
# date parsing was intended.
co_files = ('cf-2020-co.csv', 'cf-2021-co.csv', 'cf-2022-co.csv')
dataA2020, dataA2021, dataA2022 = (
    pd.read_csv(dir + co_file, parse_dates=True, usecols=colidx)
    for co_file in co_files
)

# Outer-merge every non-CO dataset in `dir` into the frame of the year that
# appears in its file name, joining on `innerkeys`.
#
# os.scandir() yields entries in arbitrary order, so iterate in sorted name
# order: the merge order decides the column order of the yearly frames, and
# it must be reproducible across machines and filesystems.
yearly = {'2020': dataA2020, '2021': dataA2021, '2022': dataA2022}

with os.scandir(dir) as datasets:
    for dataset in sorted(datasets, key=lambda entry: entry.name):
        # The CO files were already loaded as the seed frames.
        if not dataset.is_file() or 'co' in dataset.name:
            continue

        # First year whose label occurs in the file name (2020, 2021, 2022).
        year = next((label for label in yearly if label in dataset.name), None)
        if year is None:
            # Not one of the yearly datasets: skip it without reading it.
            continue

        temp = pd.read_csv(dataset, parse_dates=True, usecols=colidx)
        yearly[year] = pd.merge(yearly[year], temp, how='outer', on=innerkeys)

dataA2020, dataA2021, dataA2022 = yearly['2020'], yearly['2021'], yearly['2022']

def _daily_mean(frame):
    """Return one row per Date: the mean over sites of each site's daily mean.

    Rows are first averaged per (Date, Site ID), then those per-site values
    are averaged per Date. ``numeric_only=True`` keeps non-numeric columns
    (such as the COUNTY merge key) out of the mean; without it pandas >= 2.0
    raises ``TypeError`` instead of silently dropping them.
    """
    per_site = frame.groupby(by=['Date', 'Site ID']).mean(numeric_only=True)
    return per_site.groupby(by=['Date']).mean(numeric_only=True)


dataA2020 = _daily_mean(dataA2020)
dataA2021 = _daily_mean(dataA2021)
dataA2022 = _daily_mean(dataA2022)

# Stack the three yearly frames into a single frame indexed by Date.
dataA = pd.concat([dataA2020, dataA2021, dataA2022])

# NOTE(review): 'PM2.5 Conc (ug/m3 LC' lacks a closing parenthesis; it is left
# unchanged here because later code may look the column up by this exact name.
new_names = ['CO conc (ppm)', 'NO2 conc (ppb)', 'Ozone conc (ppm)',
             'Pb Conc (ug/m3 SC)', 'PM10 Conc (ug/m3 SC)',
             'PM2.5 Conc (ug/m3 LC', 'SO2 Conc (ppb)']
dataA2020
dataA2021
dataA2022
dataA

# Rename columns
# Map each source column to its short name explicitly instead of by position,
# so a different column order can never attach a label to the wrong pollutant.
source_names = ['Daily Max 8-hour CO Concentration',
                'Daily Max 1-hour NO2 Concentration',
                'Daily Max 8-hour Ozone Concentration',
                'Daily Mean Pb Concentration',
                'Daily Mean PM10 Concentration',
                'Daily Mean PM2.5 Concentration',
                'Daily Max 1-hour SO2 Concentration']
# errors='raise' turns a missing source column into a KeyError rather than a
# silently skipped rename.
dataA.rename(columns=dict(zip(source_names, new_names)),
             inplace=True, errors='raise')

dataA

import seaborn as sns

# Count the rows that contain at least one null value, next to the total row
# count, to decide whether to drop or impute them.
# Dataset A impute
rows_with_nulls = dataA.isna().any(axis=1)
print("A: Number of entries with null values:", rows_with_nulls.sum())
print("A: Number of entries:", len(dataA))

# These imports are required: IterativeImputer is experimental, so enable_iterative_imputer must be imported before it.

from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer   # Important!
from sklearn.impute import IterativeImputer     # default imputer is BayesianRidge

from sklearn.linear_model import BayesianRidge

# Initialize imputer
# A fixed random_state keeps the run reproducible; verbose=True prints the
# imputer's progress messages.
imp = IterativeImputer(random_state=1, max_iter=100, verbose=True)

# Fit on dataA and write the completed matrix back in place, so the frame
# keeps its existing index and column labels.
imputed_values = imp.fit_transform(dataA)
dataA[:] = imputed_values

dataA

Unnamed: 0_level_0,Daily Max 8-hour CO Concentration,Daily Max 1-hour NO2 Concentration,Daily Max 8-hour Ozone Concentration,Daily Mean Pb Concentration,Daily Mean PM10 Concentration,Daily Mean PM2.5 Concentration,Daily Max 1-hour SO2 Concentration
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
01/01/2020,0.606780,19.129570,0.030783,0.008000,17.211765,13.533446,1.236000
01/02/2020,0.643220,24.476087,0.028740,0.013000,16.750000,11.087611,1.228000
01/03/2020,0.794915,28.616304,0.027643,0.018990,19.861446,13.664307,1.236000
01/04/2020,0.817797,24.173913,0.031677,0.011233,19.096774,11.571699,1.140000
01/05/2020,0.714407,20.461828,0.033058,0.009000,13.103659,9.578448,0.884000
...,...,...,...,...,...,...,...
12/27/2020,0.545614,17.241053,0.030484,0.012000,13.506024,8.070082,0.880769
12/28/2020,0.414912,17.989247,0.033281,0.004000,8.666667,5.755833,0.596154
12/29/2020,0.596552,24.449468,0.028915,0.005167,10.402899,7.589557,0.996154
12/30/2020,0.693966,26.431915,0.029243,,14.378049,9.968835,0.960000


Unnamed: 0_level_0,Daily Max 8-hour CO Concentration,Daily Max 1-hour NO2 Concentration,Daily Max 8-hour Ozone Concentration,Daily Mean Pb Concentration,Daily Mean PM10 Concentration,Daily Mean PM2.5 Concentration,Daily Max 1-hour SO2 Concentration
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
01/01/2021,0.642982,20.597872,0.030336,0.021500,15.517045,13.344805,1.100000
01/02/2021,0.625439,20.771053,0.028085,0.010000,17.964706,12.023790,0.792593
01/03/2021,0.613158,20.455319,0.028695,0.008000,18.927711,13.015200,0.842308
01/04/2021,0.613158,22.205789,0.031188,0.007455,18.474926,9.849903,0.866667
01/05/2021,0.628947,25.500532,0.027467,0.027000,17.523810,10.025600,0.980000
...,...,...,...,...,...,...,...
12/27/2021,0.429808,16.375000,0.034460,0.000650,8.610465,3.715428,0.683333
12/28/2021,0.367308,19.036667,0.034358,,7.035714,4.756911,0.475000
12/29/2021,0.388462,17.678022,0.030295,0.000650,6.331325,4.473016,0.486957
12/30/2021,0.439623,18.481183,0.026066,0.005625,7.711009,5.348130,1.139130


Unnamed: 0_level_0,Daily Max 8-hour CO Concentration,Daily Max 1-hour NO2 Concentration,Daily Max 8-hour Ozone Concentration,Daily Mean Pb Concentration,Daily Mean PM10 Concentration,Daily Mean PM2.5 Concentration,Daily Max 1-hour SO2 Concentration
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
01/01/2022,0.541837,19.519231,0.033708,,13.477528,10.529359,0.600000
01/02/2022,0.602041,23.307065,0.031556,,12.318681,9.344264,1.005000
01/03/2022,0.680612,25.995055,0.031120,,15.823864,8.316667,0.860000
01/04/2022,0.652041,26.605618,0.029597,,17.174419,8.361370,0.842105
01/05/2022,0.687755,26.740217,0.028671,0.008412,17.867868,8.859782,0.761905
...,...,...,...,...,...,...,...
12/27/2022,0.693750,23.434337,0.028159,,18.103261,7.348347,0.814286
12/28/2022,0.453125,19.014118,0.029987,,11.724014,7.036765,0.904545
12/29/2022,0.394792,16.314535,0.029263,0.004000,11.717391,5.935887,0.509091
12/30/2022,0.343750,13.708824,0.023378,,13.032967,4.111475,2.665217


Unnamed: 0_level_0,Daily Max 8-hour CO Concentration,Daily Max 1-hour NO2 Concentration,Daily Max 8-hour Ozone Concentration,Daily Mean Pb Concentration,Daily Mean PM10 Concentration,Daily Mean PM2.5 Concentration,Daily Max 1-hour SO2 Concentration
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
01/01/2020,0.606780,19.129570,0.030783,0.008000,17.211765,13.533446,1.236000
01/02/2020,0.643220,24.476087,0.028740,0.013000,16.750000,11.087611,1.228000
01/03/2020,0.794915,28.616304,0.027643,0.018990,19.861446,13.664307,1.236000
01/04/2020,0.817797,24.173913,0.031677,0.011233,19.096774,11.571699,1.140000
01/05/2020,0.714407,20.461828,0.033058,0.009000,13.103659,9.578448,0.884000
...,...,...,...,...,...,...,...
12/27/2022,0.693750,23.434337,0.028159,,18.103261,7.348347,0.814286
12/28/2022,0.453125,19.014118,0.029987,,11.724014,7.036765,0.904545
12/29/2022,0.394792,16.314535,0.029263,0.004000,11.717391,5.935887,0.509091
12/30/2022,0.343750,13.708824,0.023378,,13.032967,4.111475,2.665217


Unnamed: 0_level_0,CO conc (ppm),NO2 conc (ppb),Ozone conc (ppm),Pb Conc (ug/m3 SC),PM10 Conc (ug/m3 SC),PM2.5 Conc (ug/m3 LC,SO2 Conc (ppb)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
01/01/2020,0.606780,19.129570,0.030783,0.008000,17.211765,13.533446,1.236000
01/02/2020,0.643220,24.476087,0.028740,0.013000,16.750000,11.087611,1.228000
01/03/2020,0.794915,28.616304,0.027643,0.018990,19.861446,13.664307,1.236000
01/04/2020,0.817797,24.173913,0.031677,0.011233,19.096774,11.571699,1.140000
01/05/2020,0.714407,20.461828,0.033058,0.009000,13.103659,9.578448,0.884000
...,...,...,...,...,...,...,...
12/27/2022,0.693750,23.434337,0.028159,,18.103261,7.348347,0.814286
12/28/2022,0.453125,19.014118,0.029987,,11.724014,7.036765,0.904545
12/29/2022,0.394792,16.314535,0.029263,0.004000,11.717391,5.935887,0.509091
12/30/2022,0.343750,13.708824,0.023378,,13.032967,4.111475,2.665217


A: Number of entries with null values: 359
A: Number of entries: 1096
[IterativeImputer] Completing matrix with shape (1096, 7)
[IterativeImputer] Change: 0.013314017907579904, scaled tolerance: 0.2222875 
[IterativeImputer] Early stopping criterion reached.


Unnamed: 0_level_0,CO conc (ppm),NO2 conc (ppb),Ozone conc (ppm),Pb Conc (ug/m3 SC),PM10 Conc (ug/m3 SC),PM2.5 Conc (ug/m3 LC,SO2 Conc (ppb)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
01/01/2020,0.606780,19.129570,0.030783,0.008000,17.211765,13.533446,1.236000
01/02/2020,0.643220,24.476087,0.028740,0.013000,16.750000,11.087611,1.228000
01/03/2020,0.794915,28.616304,0.027643,0.018990,19.861446,13.664307,1.236000
01/04/2020,0.817797,24.173913,0.031677,0.011233,19.096774,11.571699,1.140000
01/05/2020,0.714407,20.461828,0.033058,0.009000,13.103659,9.578448,0.884000
...,...,...,...,...,...,...,...
12/27/2022,0.693750,23.434337,0.028159,0.015970,18.103261,7.348347,0.814286
12/28/2022,0.453125,19.014118,0.029987,0.015678,11.724014,7.036765,0.904545
12/29/2022,0.394792,16.314535,0.029263,0.004000,11.717391,5.935887,0.509091
12/30/2022,0.343750,13.708824,0.023378,0.017044,13.032967,4.111475,2.665217
