For real-time ozone predictions, the model inputs will be weather forecasts rather than the actual future weather. To account for this difference, add random noise to the recorded future-weather features so that they carry forecast-like uncertainty.

In [None]:
import pandas as pd
import numpy as np
from datetime import date
import calendar

In [193]:
#load the lagged ozone data; the unnamed first csv column holds the timestamps
df = (
    pd.read_csv('ozone_8hr_lags.csv')
    .rename(columns={"Unnamed: 0": "Date"})
    .set_index('Date')
)
#turn the string index into a real datetime index
df.index = pd.to_datetime(df.index)
#discard every row that has at least one missing value
df = df.dropna()
df.head()

Unnamed: 0_level_0,Temp,Ozone,Pressure,Humidity,Wind Speed,Temp-8,Ozone-8,Pressure-8,Humidity-8,Wind Speed-8,...,Humidity+40,Wind Speed+40,Temp+48,Ozone+48,Pressure+48,Humidity+48,Wind Speed+48,Month,Hour,Day
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-04-02 00:00:00,59.8,0.014,1010.0,81.0,3.2,67.8,0.054,1009.0,77.0,7.0,...,66.0,6.7,61.6,0.011,1008.0,74.0,2.4,Apr,0,Sat
2016-04-02 08:00:00,75.0,0.077,1010.0,57.0,8.2,59.8,0.014,1010.0,81.0,3.2,...,74.0,2.4,78.0,0.058,1009.0,62.0,9.3,Apr,8,Sat
2016-04-02 16:00:00,72.6,0.064,1007.0,76.0,7.0,75.0,0.077,1010.0,57.0,8.2,...,62.0,9.3,74.0,0.06,1007.0,60.0,8.6,Apr,16,Sat
2016-04-03 00:00:00,62.0,0.015,1007.0,78.0,4.0,72.6,0.064,1007.0,76.0,7.0,...,60.0,8.6,64.5,0.007,1008.0,58.0,3.2,Apr,0,Sun
2016-04-03 08:00:00,78.1,0.077,1007.0,43.0,10.7,62.0,0.015,1007.0,78.0,4.0,...,58.0,3.2,85.7,0.068,1008.0,40.0,7.9,Apr,8,Sun


In [195]:
#function that adds noise to a feature
def add_noise(df2, holder, years=range(2016, 2019), hours=(0, 8, 16)):
    """Add zero-mean Gaussian noise to one column, group by group.

    Rows are grouped by (year, month, value of the 'Hour' column). For each
    group the sample standard deviation of ``holder`` is computed, and noise
    drawn from a normal distribution with mean 0 and that standard deviation
    is added to the group's values.

    Parameters
    ----------
    df2 : pandas.DataFrame
        Frame with a datetime index (``.index.year`` / ``.index.month`` are
        used) and an 'Hour' column. It is modified in place.
    holder : str
        Name of the column that receives the noise.
    years : iterable of int, optional
        Calendar years to process. Defaults to 2016, 2017 and 2018.
    hours : iterable of int, optional
        Values of the 'Hour' column to process. Defaults to 0, 8 and 16.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in as ``df2``.
    """
    #compute the calendar parts of the index once instead of once per mask
    index_year = df2.index.year
    index_month = df2.index.month
    #materialize so a one-shot iterable can be reused for every year/month
    hours = tuple(hours)

    for i in years:
        #go through all 12 months
        for j in range(1, 13):
            in_month = (index_year == i) & (index_month == j)
            for hour in hours:
                mask = (df2['Hour'] == hour) & in_month
                values = df2.loc[mask, holder]
                stdev = values.std()
                #empty or single-row groups have no defined stdev; adding
                #noise there would overwrite the data with NaN, so skip them
                if np.isnan(stdev):
                    continue
                #size the noise from this group's own row count; the hour
                #slots of one month do not always have the same number of rows
                noise = np.random.normal(0, stdev, len(values))
                df2.loc[mask, holder] = values.values + noise
    return df2

In [196]:
#add noise to every future weather feature, one forecast horizon at a time
#(feature is the outer loop so the call order matches a feature-by-feature listing)
df2 = df
for feature in ('Temp', 'Pressure', 'Humidity', 'Wind Speed'):
    for horizon in (8, 16, 24, 32, 40, 48):
        df2 = add_noise(df2, f'{feature}+{horizon}')

In [197]:
#preview the first rows of the frame returned by add_noise
df2.head()

Unnamed: 0_level_0,Temp,Ozone,Pressure,Humidity,Wind Speed,Temp-8,Ozone-8,Pressure-8,Humidity-8,Wind Speed-8,...,Humidity+40,Wind Speed+40,Temp+48,Ozone+48,Pressure+48,Humidity+48,Wind Speed+48,Month,Hour,Day
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-04-02 00:00:00,59.8,0.014,1010.0,81.0,3.2,67.8,0.054,1009.0,77.0,7.0,...,78.283179,5.085063,64.243641,0.011,1007.763574,73.931115,3.15429,Apr,0,Sat
2016-04-02 08:00:00,75.0,0.077,1010.0,57.0,8.2,59.8,0.014,1010.0,81.0,3.2,...,96.71916,2.283628,70.722498,0.058,1008.797404,46.476841,6.526764,Apr,8,Sat
2016-04-02 16:00:00,72.6,0.064,1007.0,76.0,7.0,75.0,0.077,1010.0,57.0,8.2,...,61.736666,11.318968,68.168729,0.06,1007.532273,24.662359,7.650176,Apr,16,Sat
2016-04-03 00:00:00,62.0,0.015,1007.0,78.0,4.0,72.6,0.064,1007.0,76.0,7.0,...,47.711675,7.786286,65.234036,0.007,1011.128792,58.606405,-0.018919,Apr,0,Sun
2016-04-03 08:00:00,78.1,0.077,1007.0,43.0,10.7,62.0,0.015,1007.0,78.0,4.0,...,60.25524,1.698319,87.443534,0.068,1006.013352,41.274911,6.848243,Apr,8,Sun


In [198]:
#write the noised frame to 'uncertainty.csv'
df2.to_csv('uncertainty.csv')