# Design phase 

 steps:

- Implement Functions 
- Load data 
- Add location (latitude and longitude of sensor stations) to the data frame.
- Visualize the extent and distribution of missing values.
- Visualizing simple methods for estimating missing values.
- Run the nearest neighbor method to establish baseline.

**Neural Network**
- Prepare the data to train a neural network.
- Split the data into training and testing sets.
- Train and test a neural network model for estimating missing data.
- Visualize the results of the baseline model compared to the neural network.
- Estimate missing sensor measurements across all pollutants.

 # Imports





In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
import math 
import re 
import warnings 
import torch 
import tensorflow  as tf
import ipywidgets as widgets
from tensorflow import keras
from keras import layers 
from ipywidgets import interact
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_absolute_error 
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer 
from typing import List , Tuple , Dict 
from datetime import datetime


# Ignore every warning for the rest of the session. This keeps cell output
# short but also hides deprecation warnings raised by library calls.
warnings.simplefilter('ignore')

# Font sizes shared by the matplotlib figures drawn in this notebook.
FONT_SIZE_TICKS = 14
FONT_SIZE_TITLE=20
FONT_SIZE_AXES =16


# Function Implementation

In [43]:
# Names of the pollutant columns. The plotting and imputation helpers read
# this module-level list when they are called.
pollutants_list = ['PM2.5','PM10','NO','NO2','NOX','CO','OZONE']

# fixed date 

def fixed_date(df: pd.core.frame.DataFrame , date_column: str)->List[str]:
    """Return the date strings of ``df[date_column]`` with the hour lowered by one.

    Every value must be a string whose characters 11-12 hold a two-digit
    hour; the rest of the string is copied unchanged.  The new hour is
    written as two digits (tens digit, then units digit).

    NOTE: an input hour of "00" yields -1 internally, which this digit
    arithmetic renders as "09"; inputs are therefore expected to use hours
    01-24.

    Args:
        df: Frame holding the date strings.
        date_column: Name of the column with the date strings.

    Returns:
        One adjusted string per row, in row order.
    """

    def shift_back(stamp):
        hour = int(stamp[11:13]) - 1
        two_digits = str(int(hour / 10)) + str(int(hour % 10))
        return stamp[:11] + two_digits + stamp[13:]

    return [shift_back(stamp) for stamp in df[date_column]]
    
# parse dms 

def pars_dms(coor: str)-> float:
    """Convert a degrees/minutes/seconds coordinate string to decimal degrees.

    The string is split on every run of non-word characters, so inputs such
    as ``4°47'01.5"N`` or ``74° 5'36.46"W`` are accepted.  Seconds may be
    written with or without a decimal part.

    Args:
        coor: Coordinate text ending in a hemisphere letter (N, S, E or W).

    Returns:
        The coordinate in decimal degrees; negative for the southern and
        western hemispheres.

    Raises:
        ValueError: If the text does not contain degrees, minutes, seconds
            and a trailing hemisphere letter.
    """
    # Raw string: '\d' and '\w' are invalid escapes in a plain literal.
    # Empty tokens (from leading/trailing separators) are discarded so they
    # cannot shift the positions of the real fields.
    tokens = [token for token in re.split(r'[^\d\w]+', coor) if token]
    if len(tokens) < 4 or not tokens[-1].isalpha():
        raise ValueError(f'cannot parse DMS coordinate: {coor!r}')

    degree = float(tokens[0])
    minutes = float(tokens[1])
    # The split breaks "38.9" into "38" and "9"; integer seconds stay as a
    # single token.  Re-joining with '.' handles both spellings.
    seconds = float('.'.join(tokens[2:-1]))
    direction = tokens[-1].upper()

    dec_coor = degree + minutes / 60 + seconds / 3600
    if direction in ('S', 'W'):
        dec_coor *= -1
    return dec_coor
    
    


# time series plot 

def create_time_series_plot(df: pd.core.frame.DataFrame  , start_date: datetime , end_date:datetime):
    """Display an interactive time-series plot of one pollutant at one station.

    Three widgets drive the plot: a station dropdown (the unique values of
    ``df['Station']``), a pollutant dropdown (``pollutants_list``) and a
    date-range slider with one step per day from ``start_date`` to
    ``end_date``.  If ``df`` has a ``<pollutant>_imputed_flag`` column, rows
    with a non-null flag are drawn in red as 'Imputed' and the remaining rows
    as 'Real'; otherwise a single 'Real' line is drawn.

    Args:
        df: Frame with 'Station' and 'DateTime' columns plus the pollutant
            columns.
        start_date: First day offered by the date slider.
        end_date: Last day offered by the date slider.
    """

    def interactive_time(station , target , range_date):
        plt.figure(figsize=(15,6))
        # One mask built from ``df`` itself, so the mask index always matches
        # the frame being filtered.  Both range ends are exclusive.
        in_view = (
            (df['Station'] == station)
            & (df['DateTime'] > range_date[0])
            & (df['DateTime'] < range_date[1])
        )
        df_station = df[in_view]

        flag_column = f'{target}_imputed_flag'
        if flag_column in df_station:
            # Split the series in two and blank the values that do not belong
            # to each part, so the two lines are drawn in different colours.
            columns = ['DateTime', target, flag_column]
            is_imputed = df_station[flag_column].notnull()
            # Explicit copies: the values are overwritten below.
            imputed_data = df_station[columns].copy()
            imputed_data.loc[~is_imputed, target] = None
            original_data = df_station[columns].copy()
            original_data.loc[is_imputed, target] = None
            plt.plot(imputed_data["DateTime"], imputed_data[target], 'r-', label='Imputed')
            plt.plot(original_data["DateTime"], original_data[target], '-', label='Real')
            plt.legend()
        else:
            plt.plot(df_station['DateTime'], df_station[target], '-', label='Real')
        plt.title(f'temporal change of {target}', fontsize=FONT_SIZE_TITLE)
        plt.ylabel(f'{target} concenteration', fontsize=FONT_SIZE_AXES)
        plt.xticks(rotation=20, fontsize=FONT_SIZE_TICKS)
        plt.yticks(fontsize=FONT_SIZE_TICKS)
        plt.show()

    # The keyword is ``description``; the misspelt ``discription`` was
    # silently ignored, so the labels never reached the widgets.
    station_widget = widgets.Dropdown(
        options=df.Station.unique(),
        description='Station',
    )

    pollutant_widget = widgets.Dropdown(
        options=pollutants_list,
        description='Pollutant',
        value='PM2.5',
    )

    dates = pd.date_range(start_date, end_date, freq='D')
    options = [(date.strftime(' %d/%m/%Y '), date) for date in dates]
    index = (0, len(options) - 1)

    selection_dates = widgets.SelectionRangeSlider(
        options=options,
        index=index,
        description='Dates',
        orientation='horizontal',
        layout={'width': '500px'},
    )

    interact(interactive_time, station=station_widget, target=pollutant_widget, range_date=selection_dates)




# Visualizing missing values estimation 


def visualizing_missing_values_estimation(df: pd.core.frame.DataFrame , day:datetime):
    '''Interactively compare two simple gap-filling methods with the real data.

    The rows of ``df`` recorded on ``day`` are selected.  For the chosen
    station and pollutant a window of consecutive rows is blanked out and
    then re-estimated twice: with a 1-nearest-neighbour imputer fitted on
    (time_discriminator, Latitude, Longitude, pollutant) over all stations
    of that day, and by carrying the last known value forward.  Both
    estimates are plotted next to the actual values.

    Args:
        df: Frame with 'DateTime', 'Station', 'Latitude', 'Longitude' and the
            pollutant columns.
        day: The day to inspect; only its date part is used.

    Returns:
        Whatever ``interact`` returns for the inner ``selector`` callback.
    '''

    day = day.date()
    # Vectorised day filter instead of a row-wise ``apply``.
    sample = df[df['DateTime'].dt.date == day]

    def draw(sample, station, missing_index, target):
        sample = sample.copy()
        sample.insert(
            0,
            'time_discriminator',
            (sample['DateTime'].dt.dayofyear * 100000 + sample['DateTime'].dt.hour * 100).values,
            True,
        )
        knn_columns = ['time_discriminator', 'Latitude', 'Longitude', target]

        # Rows of the chosen station, renumbered 0..n-1 so that
        # ``missing_index`` addresses them by position within the day.
        real = sample[sample['Station'] == station].reset_index()
        example = real.copy()
        example.loc[missing_index, target] = float('NaN')

        # The gap plus one row on each side, so the lines join the real data.
        around_gap = [missing_index[0] - 1] + missing_index + [missing_index[-1] + 1]

        plt.figure(figsize=(10, 5))
        plt.plot(around_gap, real.loc[around_gap][target], 'r--o', label='actual values')

        # Hide the same measurements in the all-station sample before fitting,
        # so the imputer cannot simply return the true values.
        hidden = sample.reset_index()
        to_nan = (
            hidden['DateTime'].isin(example.loc[missing_index, 'DateTime'])
            & (hidden['Station'] == station)
        )
        hidden.loc[to_nan, target] = float('NaN')

        imputer = KNNImputer(n_neighbors=1)
        imputer.fit(hidden[knn_columns])
        example[f'new_{target}'] = imputer.transform(example[knn_columns])[:, 3]

        plt.plot(around_gap, example.loc[around_gap][f'new_{target}'], 'g--o', label='nearest neighbor')
        plt.plot(example.index, example[target], '-*')

        # Fill the gap with the last known value.  ``Series.ffill`` replaces
        # the deprecated ``fillna(method='ffill')`` spelling.
        example[f'ffill_{target}'] = example[target].ffill()
        plt.plot(around_gap, example.loc[around_gap][f'ffill_{target}'], 'y--*', label='last known value')

        plt.title('Estimation missing value', fontsize=FONT_SIZE_TITLE)
        plt.xlabel('Hours of day', fontsize=FONT_SIZE_AXES)
        plt.ylabel(f'{target} Concenteration', fontsize=FONT_SIZE_AXES)
        plt.xticks(fontsize=FONT_SIZE_TICKS)
        plt.yticks(fontsize=FONT_SIZE_TICKS)
        plt.legend(loc='upper left', fontsize=FONT_SIZE_TICKS)
        plt.show()

    def selector(station, hour_start, window_size, target):
        missing_index_list = list(range(hour_start, hour_start + window_size))
        draw(sample=sample,
             station=station,
             missing_index=missing_index_list,
             target=target)

    # Widgets.  The keyword is ``description``; the misspelt ``discription``
    # was silently ignored, so the labels never reached the widgets.
    stations = df.Station.unique()
    station_widget = widgets.Dropdown(
        options=stations,
        description='Station',
        # Keep 'USM' as the default when present; fall back to the first
        # station so other data sets do not fail on an invalid default.
        value='USM' if 'USM' in stations else stations[0],
    )
    target_widget = widgets.Dropdown(
        options=pollutants_list,
        description='Pollutant',
        value='PM2.5',
    )
    start_hour_widget = widgets.Dropdown(
        options=[2, 3, 4, 5, 6, 7, 8, 9, 10],
        description='Hour start',
        value=3,
    )
    window_size_widget = widgets.Dropdown(
        options=[1, 2, 3, 5, 6, 9, 12],
        description='Window Size',
        value=1,
    )

    return interact(selector, station=station_widget, hour_start=start_hour_widget,
                    window_size=window_size_widget, target=target_widget)

        

# calculate Mean Absolute Error  for nearest Station 



def calculate_MAE_KNN(df:pd.core.frame.DataFrame , target:str)-> Dict[str,float]:
    """Score the 1-nearest-neighbour baseline on rows with no missing values.

    Rows containing any null are dropped, a ``time_discriminator`` column
    (day of year * 100000 + hour * 100) is added, and the rest is split
    80/20 with a fixed seed.  A ``KNNImputer(n_neighbors=1)`` is fitted on
    the training part; the target of every test row is hidden and imputed,
    and the result is compared with the true values.

    Args:
        df: Frame with 'DateTime', 'Latitude', 'Longitude' and ``target``.
        target: Name of the pollutant column to impute.

    Returns:
        ``{'MAE': <mean absolute error on the test split>}``.
    """
    knn_columns = ['time_discriminator', 'Latitude', 'Longitude', target]

    complete = df.dropna()
    complete.insert(
        0,
        'time_discriminator',
        (complete['DateTime'].dt.dayofyear * 100000 + complete['DateTime'].dt.hour * 100).values,
        True,
    )
    train_data, test_data = train_test_split(complete, test_size=0.2, random_state=57)

    imputer = KNNImputer(n_neighbors=1)
    imputer.fit(train_data[knn_columns])

    # Hide the target of every test row so the imputer has to estimate it.
    y_test = test_data[target].values
    hidden = test_data[knn_columns].copy()
    hidden[target] = float('NaN')
    y_predict = imputer.transform(hidden)[:, 3]

    # scikit-learn's signature is (y_true, y_pred).
    return {'MAE': mean_absolute_error(y_test, y_predict)}
    
    

    
    
    
    
def create_model(input_size:int)-> tf.keras.Model:
    """Build and compile a three-layer fully connected regression network.

    Layers: Dense(64, relu) -> Dense(32, relu) -> Dense(1).  The model is
    compiled with mean-squared-error loss, an RMSprop optimizer created with
    0.007 as its first argument, and mean absolute error as a metric.

    Args:
        input_size: Number of input features.

    Returns:
        The compiled, untrained model.
    """
    network = keras.Sequential()
    network.add(keras.layers.Dense(64, activation='relu', input_shape=[input_size]))
    network.add(keras.layers.Dense(32, activation='relu'))
    network.add(keras.layers.Dense(1))

    network.compile(
        loss='mse',
        optimizer=keras.optimizers.RMSprop(0.007),
        metrics=['mae'],
    )
    return network



def train_and_test_model(
                            features_name:List[str] ,
                            target: str  ,
                            train_data: pd.core.frame.DataFrame ,
                            test_data : pd.core.frame.DataFrame ,
                            model : keras.Model      ,
                            number_epochs : int=100 ,
                            batch_size: int=64)-> Tuple[keras.Model  , StandardScaler ,Dict[str ,float]]:
    """Train ``model`` on standardised features and score it on the test set.

    A ``StandardScaler`` is fitted on the training features only and then
    applied to the test features.

    Args:
        features_name: Columns used as model inputs.
        target: Column to predict.
        train_data: Rows used for fitting the scaler and the model.
        test_data: Rows used for the evaluation.
        model: An already compiled Keras model; it is trained in place.
        number_epochs: Number of training epochs.
        batch_size: Mini-batch size.

    Returns:
        ``(model, scaler, {'MAE': <test mean absolute error>})``.
    """
    scaler = StandardScaler()
    X_train = scaler.fit_transform(train_data[features_name])
    y_train = train_data[target]
    X_test = scaler.transform(test_data[features_name])
    y_test = test_data[target]

    # Train the model that was passed in (it is built by the caller).
    model.fit(X_train, y_train, batch_size=batch_size, epochs=number_epochs)

    # Flatten the prediction so it has the same 1-D shape as ``y_test``.
    y_pred = np.asarray(model.predict(X_test)).ravel()
    # scikit-learn's signature is (y_true, y_pred).
    MAE = {'MAE': mean_absolute_error(y_test, y_pred)}

    return model, scaler, MAE
    

    
def create_plot_with_predictions(df        : pd.core.frame.DataFrame ,
                                 model     : keras.Model             ,
                                 scaler    : StandardScaler          ,
                                 target    : str                     ,
                                 start_date: datetime                ,
                                 end_date  : datetime):
    """Interactively compare nearest-neighbour and neural-network estimates.

    Rows of ``df`` strictly between ``start_date`` and ``end_date`` are used.
    For the selected station a window of consecutive rows is blanked out and
    estimated twice: with a 1-nearest-neighbour imputer and with ``model``.
    Both estimates are plotted next to the actual values.

    NOTE: the model inputs are taken from the module-level ``features_name``
    list, which must be defined before the widgets are used and must match
    the columns ``scaler`` was fitted on.

    Args:
        df: Frame without missing values that contains 'DateTime', 'Station',
            'Latitude', 'Longitude', ``target`` and the feature columns.
        model: Trained model used for the predictions.
        scaler: Scaler fitted on the training features.
        target: Pollutant column to estimate.
        start_date: Exclusive lower bound of the plotted period.
        end_date: Exclusive upper bound of the plotted period.
    """

    def draw(sample, station, prediction, missing_index):
        sample = sample.copy()
        sample.insert(
            0,
            'time_discriminator',
            (sample['DateTime'].dt.dayofyear * 100000 + sample['DateTime'].dt.hour * 100).values,
            True,
        )
        knn_columns = ['time_discriminator', 'Latitude', 'Longitude', target]

        # Rows of the chosen station, renumbered 0..n-1 so that
        # ``missing_index`` and ``prediction`` address the same rows.
        real_data = sample[sample['Station'] == station].reset_index()
        example = real_data.copy()
        example.loc[missing_index, target] = float('NaN')

        # The gap plus one row on each side, so the lines join the real data.
        around_gap = [missing_index[0] - 1] + missing_index + [missing_index[-1] + 1]

        plt.plot(around_gap, real_data.loc[around_gap][target], 'r--o', label='actual values')

        # Hide the same measurements in the all-station sample before fitting,
        # so the imputer cannot simply return the true values.
        hidden = sample.reset_index()
        to_nan = (
            hidden['DateTime'].isin(example.loc[missing_index, 'DateTime'])
            & (hidden['Station'] == station)
        )
        hidden.loc[to_nan, target] = float('NaN')

        imputer = KNNImputer(n_neighbors=1)
        imputer.fit(hidden[knn_columns])
        example[f'new_{target}'] = imputer.transform(example[knn_columns])[:, 3]
        plt.plot(around_gap, example.loc[around_gap][f'new_{target}'], 'g--o', label='nearest neighbor')
        plt.plot(example.index, example[target], '-*')

        # Neural-network estimate.  The (n, 1) model output is flattened so a
        # plain 1-D array is written into the column.
        example[f'nn_{target}'] = example[target].copy()
        example.loc[missing_index, f'nn_{target}'] = np.asarray(prediction).ravel()[np.array(missing_index)]
        plt.plot(around_gap, example.loc[around_gap][f'nn_{target}'], 'y--*', label='neural network')

        plt.title('Value Predictions', fontsize=FONT_SIZE_TITLE)
        plt.xlabel('Index', fontsize=FONT_SIZE_AXES)
        plt.ylabel(f'{target} concentration', fontsize=FONT_SIZE_AXES)
        plt.xticks(fontsize=FONT_SIZE_TICKS)
        plt.yticks(fontsize=FONT_SIZE_TICKS)
        plt.legend(loc='upper left', fontsize=FONT_SIZE_TICKS)

    def plot_prediction(station, size, start_index):
        # Deliberately broad: any failure for the chosen widget values (for
        # example a window that runs past the available rows) is reported
        # instead of breaking the interactive widget.
        try:
            in_range = (df['DateTime'] > start_date) & (df['DateTime'] < end_date)
            data = df[in_range]

            station_rows = data[data.Station == station]
            X_test = scaler.transform(station_rows[features_name])
            y_prediction = model.predict(X_test)

            plt.figure(figsize=(10, 5))
            draw(data, station, y_prediction, list(range(start_index, start_index + size)))
            plt.show()
        except Exception as e:
            print('The selected range cannot be plotted due to missing values. Please select other values.\n')
            print(e)

    # Widgets.  The keyword is ``description``; the misspelt ``discription``
    # was silently ignored, so the labels never reached the widgets.
    station_widget = widgets.Dropdown(
        options=df.Station.unique(),
        description='Station',
    )
    window_size_widget = widgets.Dropdown(
        options=[1, 2, 3, 5, 6, 12, 24],
        description='Window Size',
    )
    index_selector = widgets.IntSlider(value=1,
                                       min=1,
                                       max=48,
                                       step=1,
                                       description='Index')

    interact(plot_prediction, station=station_widget, size=window_size_widget, start_index=index_selector)
    

    

    
def imput_non_target_missing_values_interpolate(
        df_with_missing: pd.core.frame.DataFrame ,
        features_name:List[str] ,
        target: str ,
        first_value_fill: float = 12)-> pd.core.frame.DataFrame:
    '''
    Impute the non-target pollutant columns by linear interpolation.

    The result can then be fed to the neural network that imputes the
    target column.  ``df_with_missing`` itself is left untouched; all work
    happens on a copy of the feature columns.

    Args:
        df_with_missing: Frame holding the feature columns and all pollutant
            columns named in ``pollutants_list``.
        features_name: Columns to return (and interpolate).
        target: Pollutant that is NOT imputed here.
        first_value_fill: Value written into the first row of a pollutant
            column when that row is missing, because linear interpolation
            cannot fill a leading gap.

    Returns:
        The interpolated feature columns joined with one
        ``<pollutant>_imputed_flag`` column per non-target pollutant; the
        flag is 'interpolated' where the original value was missing and
        None elsewhere.
    '''
    pollutants_except_target = [i for i in pollutants_list if i != target]

    # Flag the cells that are missing in the original data.  Building a fresh
    # frame avoids writing into a slice of the caller's frame.
    imputed_flag = pd.DataFrame(
        {
            f'{pollutant}_imputed_flag': np.where(
                df_with_missing[pollutant].isnull(), 'interpolated', None
            )
            for pollutant in pollutants_except_target
        },
        index=df_with_missing.index,
    )

    # Work on a copy so the caller's frame is not modified.
    features = df_with_missing[features_name].copy()
    if len(features.index) > 0:
        first_label = features.index[0]
        for pollutant in pollutants_except_target:
            # Seed a missing first value; interpolation cannot fill it.
            if pollutant in features.columns and features.loc[[first_label], pollutant].isnull().any():
                features.loc[[first_label], pollutant] = first_value_fill

    imputed_value = features.interpolate(method='linear')
    return imputed_value.join(imputed_flag)



            
        
    
def imput_target_missing_values_neural_network(
    df_with_missing :pd.core.frame.DataFrame ,
    model:keras.Model ,
    scaler:StandardScaler ,
    baseline_imputed:pd.core.frame.DataFrame ,
    target:str)->pd.core.frame.DataFrame:
    """Fill the missing ``target`` values with neural-network predictions.

    Args:
        df_with_missing: Original frame; supplies the metadata columns and
            the target column with its gaps.
        model: Trained model that predicts ``target`` from the features.
        scaler: Scaler fitted on the training features.
        baseline_imputed: Feature columns with the non-target gaps already
            filled, plus their ``*_imputed_flag`` columns.
        target: Pollutant column to impute.

    Returns:
        A frame with the metadata columns, every pollutant in
        ``pollutants_list`` and one ``<pollutant>_imputed_flag`` column per
        pollutant.  The target flag is 'neural network' where the value was
        predicted and None elsewhere.
    """
    # Metadata columns that are kept in the output.
    metadata_columns = ['DateTime', 'Station', 'Latitude', 'Longitude']

    # Non-target pollutant values and their flags, for the final join.
    baseline_imputed_data_and_flag = baseline_imputed[
        [i for i in list(baseline_imputed) if i in pollutants_list or 'flag' in i]
    ]

    # Flag the rows that will be imputed.  Building a fresh frame avoids
    # writing into a slice of the caller's frame.
    is_missing = df_with_missing[target].isnull()
    imputed_flag = pd.DataFrame(
        {f'{target}_imputed_flag': np.where(is_missing, 'neural network', None)},
        index=df_with_missing.index,
    )

    data_with_imputed = df_with_missing.copy()
    # Skip the prediction when nothing is missing: the scaler rejects an
    # input with zero rows.
    if is_missing.any():
        # The network cannot take the flag columns, and only rows with a
        # missing target need a prediction.
        feature_columns = [i for i in list(baseline_imputed.columns) if 'flag' not in i]
        rows_to_predict = baseline_imputed[feature_columns][is_missing]
        predicted_target = model.predict(scaler.transform(rows_to_predict))
        # Flatten the (k, 1) model output before writing it into the column.
        data_with_imputed.loc[is_missing, target] = np.asarray(predicted_target).ravel()

    final_data = (
        data_with_imputed[metadata_columns + [target]]
        .join(imputed_flag)
        .join(baseline_imputed_data_and_flag)
    )

    # Reorder the columns: metadata, pollutant values, then the flags.
    rearange_columns = metadata_columns + pollutants_list + [f'{i}_imputed_flag' for i in pollutants_list]
    return final_data[rearange_columns]
    
    
    
    


In [58]:
# Load the raw sensor measurements.
path_dir = 'E:\\projects\\air_quality\\data'
data_file = 'RMCAB_air_quality_sensor_data.csv'
station_file = 'stations_loc.csv'

data = pd.read_csv(path_dir + '\\' + data_file)
# Shift the hour field back by one, then parse the day-first date strings.
data['DateTime'] = pd.to_datetime(fixed_date(data, 'DateTime'), dayfirst=True)
# The ozone column has a Spanish name; the rest of the notebook uses 'OZONE'.
data.rename(columns={'OZONO': 'OZONE'}, inplace=True)

data.head()

Unnamed: 0,PM10,PM2.5,NO,NO2,NOX,CO,OZONE,Station,DateTime
0,56.6,32.7,7.504,15.962,23.493,0.44924,2.431,USM,2021-01-01 00:00:00
1,59.3,39.3,16.56,17.866,34.426,0.69832,1.121,USM,2021-01-01 01:00:00
2,96.4,70.8,22.989,17.802,40.791,0.88243,1.172,USM,2021-01-01 02:00:00
3,108.3,81.0,3.704,9.886,13.591,0.29549,6.565,USM,2021-01-01 03:00:00
4,87.7,56.1,2.098,9.272,11.371,0.16621,9.513,USM,2021-01-01 04:00:00


# add Location ( latitude and longitude of sensor stations) to the data frame



In [59]:
# Read the station metadata file from the same folder as the sensor data.
station_location_data = pd.read_csv('\\'.join((path_dir, station_file)))
station_location_data.head()

Unnamed: 0,estacion,Sigla,Latitud,Longitud,Altitud (m),Altura (m),Localidad,Tipo de zona,Tipo de estación,Dirección
0,guaymaral,GYR,"4°47'01.5""N","74°02'38.9""W",2580,0,Suba,Sub urbana,De fondo,Autopista Norte # 205-59
1,usaquen,USQ,"4°42'37.26""N","74°1'49.50""W",2570,10,Usaquén,Urbana,De fondo,Carrera 7B Bis # 132-11
2,suba,SUB,"4°45'40.49""N","74° 5'36.46""W",2571,6,Suba,Sub urbana,De fondo,Carrera 111 # 159A-61
3,bolivia,BOL,"4°44'08.9""N","74°07'33.2""W",2574,0,Engativá,Sub urbana,De fondo,Avenida Calle 80 # 121-98
4,las_ferias,LFR,"4°41'26.52""N","74°4'56.94""W",2552,0,Engativá,Urbana,De tráfico,Avenida Calle 80 # 69Q-50


 As we can see, the file gives the location of every sensor station. Let's add it to the air-pollution data,
 but the column names are in Spanish, so we translate them into English first.
 - We need the columns ( Sigla : 'Station' , Latitud : 'Latitude' , Longitud : 'Longitude' ); they hold the information we need.
 - Then we need latitude and longitude as decimal numbers rather than degrees/minutes/seconds strings.
 
 Let's do it.

In [60]:
# Keep the three columns we need and give them English names.
station = station_location_data[['Sigla', 'Latitud', 'Longitud']].rename(
    columns={'Sigla': 'Station', 'Latitud': 'Latitude', 'Longitud': 'Longitude'}
)

# Convert the degrees/minutes/seconds strings to decimal degrees.
station = station.assign(
    Latitude=station['Latitude'].apply(pars_dms),
    Longitude=station['Longitude'].apply(pars_dms),
)

station.head()

Unnamed: 0,Station,Latitude,Longitude
0,GYR,4.78375,-74.044139
1,USQ,4.71035,-74.030417
2,SUB,4.761247,-74.093461
3,BOL,4.735806,-74.125889
4,LFR,4.6907,-74.082483


# Merge

In [61]:

# Attach each station's coordinates; stations without a location are dropped.
data = data.merge(station, on='Station', how='inner')
data.head()

Unnamed: 0,PM10,PM2.5,NO,NO2,NOX,CO,OZONE,Station,DateTime,Latitude,Longitude
0,56.6,32.7,7.504,15.962,23.493,0.44924,2.431,USM,2021-01-01 00:00:00,4.532056,-74.117139
1,59.3,39.3,16.56,17.866,34.426,0.69832,1.121,USM,2021-01-01 01:00:00,4.532056,-74.117139
2,96.4,70.8,22.989,17.802,40.791,0.88243,1.172,USM,2021-01-01 02:00:00,4.532056,-74.117139
3,108.3,81.0,3.704,9.886,13.591,0.29549,6.565,USM,2021-01-01 03:00:00,4.532056,-74.117139
4,87.7,56.1,2.098,9.272,11.371,0.16621,9.513,USM,2021-01-01 04:00:00,4.532056,-74.117139


# Count Null Values

In [7]:
# Number of missing values in each column.
data.isna().sum()

PM10         20014
PM2.5        15312
NO           27664
NO2          27662
NOX          27668
CO           31238
OZONE        32132
Station          0
DateTime         0
Latitude         0
Longitude        0
dtype: int64

# Visualizing Missing Data In a Time Series

In [8]:
# Range offered by the date slider of the interactive plot.
start_date = datetime(2021, 1, 1)
end_date   = datetime(2021, 1, 31)

create_time_series_plot(data , start_date , end_date)

interactive(children=(Dropdown(description='station', options=('USM', 'BOL', 'SUB', 'TUN', 'LFR', 'PTE', 'MAM'…

# Visualizing Simple Methods For Estimating missing values

In [9]:
# Day whose measurements are used to compare the simple estimation methods.
day = datetime(2021,5,6)


visualizing_missing_values_estimation(data , day)

interactive(children=(Dropdown(description='station', options=('USM', 'BOL', 'SUB', 'TUN', 'LFR', 'PTE', 'MAM'…

<function __main__.visualizing_missing_values_estimation.<locals>.selector(station, hour_start, window_size, target)>

In [10]:
# Scores of every model, keyed by model name; start with the KNN baseline.
regression_scores = {'baseline_model': calculate_MAE_KNN(data, target='PM2.5')}

In [11]:
# Display the baseline result.
regression_scores['baseline_model']

{'MAE': 8.035857704059362}

# prepare the data to train neural network

In [62]:
# Add the day of the week and the hour of the day as features.
data2 = data.copy()
data2['week_day'] = data2['DateTime'].dt.weekday
data2['hour'] = data2['DateTime'].dt.hour

# One-hot encode the station identifier and append the indicator columns.
one_hot = pd.get_dummies(data2['Station'], prefix='Station')
data2 = data2.join(one_hot)

# Training needs complete rows only, so drop every row that has a null.
data_no_missing = data2.dropna()

no_nulls = data_no_missing['PM2.5'].isnull().sum()
if not no_nulls:
    print('data prepared successfully and missing values removed')




data prepared successfully and missing values removed


In [13]:
# Display the one-hot encoded station columns.
one_hot

Unnamed: 0,Station_7MA,Station_BOL,Station_CBV,Station_CDAR,Station_COL,Station_CSE,Station_FTB,Station_GYR,Station_JAZ,Station_KEN,Station_LFR,Station_MAM,Station_MOV2,Station_PTE,Station_SCR,Station_SUB,Station_TUN,Station_USM,Station_USQ
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166435,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
166436,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
166437,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
166438,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [14]:
# List the station codes present in the data.
print(list(data['Station'].unique()))

['USM', 'BOL', 'SUB', 'TUN', 'LFR', 'PTE', 'MAM', 'CBV', 'CDAR', 'SCR', 'FTB', 'JAZ', '7MA', 'CSE', 'GYR', 'COL', 'MOV2', 'USQ', 'KEN']


# Split data into train and test data 


In [15]:
# Hold out 10% of the complete rows for testing; the seed is fixed so the
# split is reproducible.
train_data , test_data  = train_test_split(data_no_missing ,test_size=0.1 , random_state =57)


print(f'train shape : {train_data.shape}')
print(f'test shape  : {test_data.shape}')

train shape : (82473, 32)
test shape  : (9164, 32)


# Train and Test model

In [16]:
target = 'PM2.5'
# NOTE: this rebinds the module-level list with 'PM10' listed first.
pollutants_list = ['PM10' ,'PM2.5' , 'NO' ,'NO2' , 'NOX' , 'CO' ,'OZONE']
pollutants_except_target = [name for name in pollutants_list if name != target]
# Model inputs: calendar features, station indicators, the other pollutants.
features_name = ['week_day', 'hour', *one_hot.columns, *pollutants_except_target]

# Build the network, then train and evaluate it.
model = create_model(input_size=len(features_name))
model, scaler, MAE = train_and_test_model(
    features_name,
    target,
    train_data,
    test_data,
    model,
    number_epochs=100,
    batch_size=64,
)

regression_scores['neural network'] = MAE



Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [17]:
# Print the score of every model side by side.
for model_name , score in regression_scores.items():
    print(f'{model_name}:\t{score}')

baseline_model:	{'MAE': 8.035857704059362}
neural network:	{'MAE': 4.049615209636497}


# Visualizing the result from baseline model and neural network

In [34]:
# Period used for the comparison plot; both bounds are exclusive inside
# create_plot_with_predictions.
start_date = datetime(2021,2,1)
end_date   = datetime(2021,2,3)

create_plot_with_predictions(    df        = data_no_missing ,
                                 model     = model           ,
                                 scaler    = scaler          ,
                                 target    =target           ,
                                 start_date=start_date       ,
                                 end_date  =end_date)



interactive(children=(Dropdown(description='station', options=('USM', 'BOL', 'SUB', 'TUN', 'LFR', 'PTE', 'MAM'…

# Estimate missing sensors measurements across all pollutants


Two methods are used to fill the gaps: linear interpolation for the non-target pollutants and the neural network for the missing PM2.5 values.

In [53]:
# Estimate the missing values of every pollutant except PM2.5 with linear
# interpolation (the called helper interpolates; it is not nearest neighbor).
imputed_with_baseline_model = imput_non_target_missing_values_interpolate(data2 , features_name , target)

In [63]:
# Estimate the missing PM2.5 values with the trained neural network, using the
# interpolated non-target pollutants as inputs.
imputed_with_nn = imput_target_missing_values_neural_network(df_with_missing = data2 ,
                                                            model = model ,
                                                            scaler = scaler ,
                                                            target = target,
                                                            baseline_imputed= imputed_with_baseline_model)




In [64]:
# Count the nulls left in each pollutant column after imputation.

imputed_with_nn[pollutants_list].isnull().sum()

PM2.5    0
PM10     0
NO       0
NO2      0
NOX      0
CO       0
OZONE    0
dtype: int64

In [66]:
# Display the first rows of the imputed data, including the flag columns.

imputed_with_nn.head()

Unnamed: 0,DateTime,Station,Latitude,Longitude,PM2.5,PM10,NO,NO2,NOX,CO,OZONE,PM2.5_imputed_flag,PM10_imputed_flag,NO_imputed_flag,NO2_imputed_flag,NOX_imputed_flag,CO_imputed_flag,OZONE_imputed_flag
0,2021-01-01 00:00:00,USM,4.532056,-74.117139,32.7,56.6,7.504,15.962,23.493,0.44924,2.431,,,,,,,
1,2021-01-01 01:00:00,USM,4.532056,-74.117139,39.3,59.3,16.56,17.866,34.426,0.69832,1.121,,,,,,,
2,2021-01-01 02:00:00,USM,4.532056,-74.117139,70.8,96.4,22.989,17.802,40.791,0.88243,1.172,,,,,,,
3,2021-01-01 03:00:00,USM,4.532056,-74.117139,81.0,108.3,3.704,9.886,13.591,0.29549,6.565,,,,,,,
4,2021-01-01 04:00:00,USM,4.532056,-74.117139,56.1,87.7,2.098,9.272,11.371,0.16621,9.513,,,,,,,


# Visualizing the results of filling in missing PM2.5 values

In [67]:
# Plot the imputed data for March-April 2021; the frame carries
# *_imputed_flag columns, which the plot function uses to draw imputed values
# separately from real ones.
start_date = datetime(2021, 3, 1)
end_date = datetime(2021, 4, 30)
create_time_series_plot(imputed_with_nn ,start_date ,end_date)


interactive(children=(Dropdown(description='station', options=('USM', 'BOL', 'SUB', 'TUN', 'LFR', 'PTE', 'MAM'…