In [389]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
%matplotlib inline

In [390]:
# Load the sensor data set; the first CSV column (the MST timestamp string) becomes the index.
df_P = pd.read_csv(filepath_or_buffer='data/D0_filt_10mino3.csv', index_col=0)
# Parenthesised print works under both Python 2 and Python 3.
print(df_P[:5])

                       O3_ppb    UnixTime       e2v03       Temp         Rh  \
DATE (MM/DD/YYYY)_MST                                                         
7/10/14 20:25              41  1405023930  114.714286  45.800000  11.400000   
7/10/14 20:26              39  1405023991  120.428571  45.814286  11.314286   
7/10/14 20:27              44  1405024053  119.142857  45.857143  11.300000   
7/10/14 20:28              47  1405024110  117.500000  45.900000  11.300000   
7/10/14 20:29              44  1405024167  120.285714  45.900000  11.457143   

                       Zenith Angle [degrees]  days from start  ref_o3_smooth  \
DATE (MM/DD/YYYY)_MST                                                           
7/10/14 20:25                        99.48960                0             41   
7/10/14 20:26                        99.63789                0             39   
7/10/14 20:27                        99.78580                0             44   
7/10/14 20:28                        99.9

In [391]:
# Drop the two columns at positions 7 and 8 (position 7 is 'ref_o3_smooth' in the preview above).
# NOTE: the selection is positional and the drop is in place, so re-running this cell
# on the already-trimmed frame will not target the same columns.
df_P.drop(df_P.columns[[7,8]], axis=1, inplace=True)
# Parenthesised print works under both Python 2 and Python 3.
print(df_P[:5])

                       O3_ppb    UnixTime       e2v03       Temp         Rh  \
DATE (MM/DD/YYYY)_MST                                                         
7/10/14 20:25              41  1405023930  114.714286  45.800000  11.400000   
7/10/14 20:26              39  1405023991  120.428571  45.814286  11.314286   
7/10/14 20:27              44  1405024053  119.142857  45.857143  11.300000   
7/10/14 20:28              47  1405024110  117.500000  45.900000  11.300000   
7/10/14 20:29              44  1405024167  120.285714  45.900000  11.457143   

                       Zenith Angle [degrees]  days from start  
DATE (MM/DD/YYYY)_MST                                           
7/10/14 20:25                        99.48960                0  
7/10/14 20:26                        99.63789                0  
7/10/14 20:27                        99.78580                0  
7/10/14 20:28                        99.93332                0  
7/10/14 20:29                       100.08045           

#### Define the slope functions.

In [392]:
def find_lag_slope(df_P, int_min, data_col):
    """Average row-to-row change of ``data_col`` over the rows preceding each row.

    For row ``i`` (``i >= int_min``) the result is the mean of the first
    differences at positions ``i - int_min`` .. ``i - 1``; the row itself is
    not part of its own window. NaN differences (such as the very first one)
    are skipped by the pandas mean.

    Side effect: the first differences are stored in ``df_P['diff']``,
    overwriting any existing column of that name.

    Parameters
    ----------
    df_P : pandas.DataFrame
        Frame holding ``data_col``; modified in place (see side effect).
    int_min : int
        Window length in rows.
    data_col : str
        Name of the column to differentiate.

    Returns
    -------
    list
        One value per row of ``df_P``. Rows without a full preceding window
        hold float NaN (not the string ``'NaN'``), so the list yields a
        numeric column when assigned to a DataFrame.
    """
    df_P['diff'] = df_P[data_col].diff()
    diffs = df_P['diff']
    n_rows = len(diffs)

    # Never pad beyond the frame length, so the result always has n_rows entries.
    slope = [np.nan] * min(int_min, n_rows)
    for i in range(int_min, n_rows):
        # Explicit positional slice of the int_min differences before row i.
        slope.append(diffs.iloc[i - int_min:i].mean())

    return slope

In [393]:
def find_lead_slope(df_P, int_min, data_col):
    """Average row-to-row change of ``data_col`` over a window starting at each row.

    For row ``i`` the result is the mean of the first differences at
    positions ``i`` .. ``i + int_min - 1``. NaN differences (such as the very
    first one) are skipped by the pandas mean.

    Side effect: the first differences are stored in ``df_P['diff']``,
    overwriting any existing column of that name.

    Parameters
    ----------
    df_P : pandas.DataFrame
        Frame holding ``data_col``; modified in place (see side effect).
    int_min : int
        Window length in rows.
    data_col : str
        Name of the column to differentiate.

    Returns
    -------
    list
        One value per row of ``df_P``. The last ``int_min`` rows have no
        window computed and hold float NaN (not the string ``'NaN'``).
    """
    df_P['diff'] = df_P[data_col].diff()
    diffs = df_P['diff']
    n_rows = len(diffs)

    # Number of rows that get a computed window; zero when the frame is too short.
    n_full = max(n_rows - int_min, 0)
    slope = [diffs.iloc[i:i + int_min].mean() for i in range(n_full)]
    # Pad the tail so the result always has exactly n_rows entries.
    slope.extend([np.nan] * (n_rows - n_full))

    return slope

#### Define functions that create features representing the area under the ozone, temperature, and humidity curves.

In [394]:
def find_lag_integral(df_P, int_min, data_col):
    """Trapezoidal area under ``data_col`` over the rows preceding each row.

    For row ``i`` (``i >= int_min``) the samples at positions
    ``i - int_min`` .. ``i - 1`` are integrated with unit spacing; the row
    itself is not part of its own window.

    Parameters
    ----------
    df_P : pandas.DataFrame
        Frame holding ``data_col``.
    int_min : int
        Number of samples in each window.
    data_col : str
        Name of the column to integrate.

    Returns
    -------
    list
        One value per row of ``df_P``. Rows without a full preceding window
        hold float NaN (not the string ``'NaN'``), so the list yields a
        numeric column when assigned to a DataFrame.
    """
    # np.trapz was renamed np.trapezoid in NumPy 2.0; use whichever exists.
    trapezoid = getattr(np, 'trapezoid', None) or np.trapz

    # Pull the array out once instead of re-slicing the Series on every step.
    values = df_P[data_col].values
    n_rows = len(values)

    # Never pad beyond the frame length, so the result always has n_rows entries.
    area_curve = [np.nan] * min(int_min, n_rows)
    for i in range(int_min, n_rows):
        area_curve.append(trapezoid(values[i - int_min:i]))

    return area_curve

In [395]:
def find_lead_integral(df_P, int_min, data_col):
    """Trapezoidal area under ``data_col`` over a window starting at each row.

    For row ``i`` the samples at positions ``i`` .. ``i + int_min - 1`` are
    integrated with unit spacing.

    Parameters
    ----------
    df_P : pandas.DataFrame
        Frame holding ``data_col``.
    int_min : int
        Number of samples in each window.
    data_col : str
        Name of the column to integrate.

    Returns
    -------
    list
        One value per row of ``df_P``. The last ``int_min`` rows have no
        window computed and hold float NaN (not the string ``'NaN'``).
    """
    # np.trapz was renamed np.trapezoid in NumPy 2.0; use whichever exists.
    trapezoid = getattr(np, 'trapezoid', None) or np.trapz

    # Pull the array out once instead of re-slicing the Series on every step.
    values = df_P[data_col].values
    n_rows = len(values)

    # Number of rows that get a computed window; zero when the frame is too short.
    n_full = max(n_rows - int_min, 0)
    area_curve = [trapezoid(values[i:i + int_min]) for i in range(n_full)]
    # Pad the tail so the result always has exactly n_rows entries.
    area_curve.extend([np.nan] * (n_rows - n_full))

    return area_curve

In [396]:
# The window runs from start_min_before rows back to end_min_before rows back,
# so start_min_before must be greater than end_min_before.
def sliding_integral_lag(df_P, start_min_before, end_min_before, column):
    """Sliding sum of ``column`` over a window that ends before each row.

    For row ``k`` the result is the sum of the values at positions
    ``k - start_min_before + 1`` .. ``k - end_min_before``.

    Parameters
    ----------
    df_P : pandas.DataFrame
        Frame holding ``column``.
    start_min_before : int
        How many rows back the window reaches.
    end_min_before : int
        How many rows back the window stops.
    column : str
        Name of the column to sum. (Previously ignored: ``'e2v03'`` was
        hard-coded, so every caller got ozone sums.)

    Returns
    -------
    numpy.ndarray
        One value per row of ``df_P``; rows whose window is not fully
        available are NaN.

    Raises
    ------
    ValueError
        If ``start_min_before`` is not greater than ``end_min_before``.
    """
    interval = start_min_before - end_min_before
    if interval <= 0:
        raise ValueError(
            'start_min_before (%r) must be greater than end_min_before (%r)'
            % (start_min_before, end_min_before))

    # Shifting by end_min_before makes every window end that many rows back.
    shifted = df_P[column].shift(end_min_before).values
    n_rows = len(shifted)
    if n_rows < interval:
        # Too few rows for a single window: everything is undefined.
        return np.full(n_rows, np.nan)

    # Convolving with a box of ones gives the running sum of `interval` samples.
    window_sums = np.convolve(shifted, np.ones(interval), 'valid')
    # 'valid' drops the first interval-1 positions; pad them back as NaN.
    return np.concatenate((np.full(interval - 1, np.nan), window_sums))

In [397]:
# The window runs from start_min_after rows ahead up to (but not including)
# end_min_after rows ahead, so start_min_after must be less than end_min_after.
def sliding_integral_lead(df_P, start_min_after, end_min_after, column):
    """Sliding sum of ``column`` over a window that lies ahead of each row.

    With ``0 <= start_min_after < end_min_after``, the value for row ``k`` is
    the sum of the samples at positions ``k + start_min_after`` ..
    ``k + end_min_after - 1``. The last
    ``start_min_after + end_min_after - 1`` rows are NaN.

    No argument validation is performed; a non-positive
    ``end_min_after - start_min_after`` leaves the kernel without a
    meaningful window.

    Parameters
    ----------
    df_P : pandas.DataFrame
        Frame holding ``column``.
    start_min_after : int
        Offset (in rows) of the first sample of the window.
    end_min_after : int
        Offset (in rows) just past the last sample of the window.
    column : str
        Name of the column to sum.

    Returns
    -------
    numpy.ndarray
        One value per row of ``df_P``. NaNs already present in the column
        stay in place and propagate to the sums that touch them.
    """
    interval = end_min_after - start_min_after
    shifted = df_P[column].shift(start_min_after).values
    # Drop only the start_min_after NaNs that shift() introduced at the front.
    # Filtering every NaN would also remove genuine gaps in the data and
    # shorten and misalign the result.
    a = shifted[start_min_after:]
    # Kernel of length end_min_after whose first `interval` taps are one.
    v = np.zeros(end_min_after)
    v[0:interval] = 1
    out = np.convolve(a, v, 'valid')
    # Pad the tail so the result has one entry per row of df_P.
    tail = np.array([float('nan')] * (start_min_after + end_min_after - 1))
    out = np.concatenate((out, tail))
    return out

### Call the integral and slope functions and add the results to the dataframe.

In [398]:
def make_func_caller_find_lag_integral(df_P, min_time, max_time, interval, column):
    """Add one lagging-integral column per window length to ``df_P``.

    Window lengths run from ``min_time`` in steps of ``interval`` while they
    do not exceed ``max_time``. Each result of ``find_lag_integral`` is
    stored as ``'<column>_int_lag_<window>'``.

    Parameters
    ----------
    df_P : pandas.DataFrame
        Frame to read ``column`` from and add the feature columns to.
    min_time, max_time : number
        Smallest window length and inclusive upper limit.
    interval : number
        Step between successive window lengths; must be positive.
    column : str
        Source column name.

    Returns
    -------
    pandas.DataFrame
        The same ``df_P`` object, modified in place.

    Raises
    ------
    ValueError
        If ``interval`` is not positive (the loop would never terminate).
    """
    if interval <= 0 and min_time <= max_time:
        raise ValueError('interval must be positive, got %r' % (interval,))

    window = min_time
    while window <= max_time:
        name = str(column) + '_int_lag_' + str(window)
        df_P[name] = find_lag_integral(df_P, window, column)
        window += interval
    return df_P

def make_func_caller_find_lead_integral(df_P, min_time, max_time, interval, column):
    """Add one leading-integral column per window length to ``df_P``.

    Window lengths run from ``min_time`` in steps of ``interval`` while they
    do not exceed ``max_time``. Each result of ``find_lead_integral`` is
    stored as ``'<column>_int_lead_<window>'``.

    Parameters
    ----------
    df_P : pandas.DataFrame
        Frame to read ``column`` from and add the feature columns to.
    min_time, max_time : number
        Smallest window length and inclusive upper limit.
    interval : number
        Step between successive window lengths; must be positive.
    column : str
        Source column name.

    Returns
    -------
    pandas.DataFrame
        The same ``df_P`` object, modified in place.

    Raises
    ------
    ValueError
        If ``interval`` is not positive (the loop would never terminate).
    """
    if interval <= 0 and min_time <= max_time:
        raise ValueError('interval must be positive, got %r' % (interval,))

    window = min_time
    while window <= max_time:
        name = str(column) + '_int_lead_' + str(window)
        df_P[name] = find_lead_integral(df_P, window, column)
        window += interval
    return df_P

def make_func_caller_sliding_integral_lag(df_P, max_end_min_before, min_end_min_before, frequency, interval_size, column):
    """Add a family of lagging sliding-window sums to ``df_P``.

    The window end steps from ``min_end_min_before`` in increments of
    ``frequency`` while it does not exceed ``max_end_min_before``; each
    window starts ``interval_size`` rows further back. Each result of
    ``sliding_integral_lag`` is stored as
    ``'<column>_int_slide_<start>_to_<end>_lag'``.

    Note the parameter order: the maximum end offset comes before the minimum.

    Parameters
    ----------
    df_P : pandas.DataFrame
        Frame to read ``column`` from and add the feature columns to.
    max_end_min_before, min_end_min_before : number
        Inclusive upper limit and first value of the window end offset.
    frequency : number
        Step between successive window end offsets; must be positive.
    interval_size : number
        Window length in rows.
    column : str
        Source column name.

    Returns
    -------
    pandas.DataFrame
        The same ``df_P`` object, modified in place.

    Raises
    ------
    ValueError
        If ``frequency`` is not positive (the loop would never terminate).
    """
    if frequency <= 0 and min_end_min_before <= max_end_min_before:
        raise ValueError('frequency must be positive, got %r' % (frequency,))

    window_end = min_end_min_before
    while window_end <= max_end_min_before:
        window_start = window_end + interval_size
        name = (str(column) + '_int_slide_' + str(window_start)
                + '_to_' + str(window_end) + '_lag')
        df_P[name] = sliding_integral_lag(df_P, window_start, window_end, column)
        window_end += frequency
    return df_P

def make_func_caller_sliding_integral_lead(df_P, min_start_min_after, max_start_min_after, frequency, interval_size, column):
    """Add a family of leading sliding-window sums to ``df_P``.

    The window start steps from ``min_start_min_after`` in increments of
    ``frequency`` while it does not exceed ``max_start_min_after``; each
    window ends ``interval_size`` rows after its start. Each result of
    ``sliding_integral_lead`` is stored as
    ``'<column>_int_slide_<start>_to_<end>_lead'``.

    Parameters
    ----------
    df_P : pandas.DataFrame
        Frame to read ``column`` from and add the feature columns to.
    min_start_min_after, max_start_min_after : number
        First value and inclusive upper limit of the window start offset.
    frequency : number
        Step between successive window start offsets; must be positive.
    interval_size : number
        Window length in rows.
    column : str
        Source column name.

    Returns
    -------
    pandas.DataFrame
        The same ``df_P`` object, modified in place.

    Raises
    ------
    ValueError
        If ``frequency`` is not positive (the loop would never terminate).
    """
    if frequency <= 0 and min_start_min_after <= max_start_min_after:
        raise ValueError('frequency must be positive, got %r' % (frequency,))

    window_start = min_start_min_after
    while window_start <= max_start_min_after:
        window_end = window_start + interval_size
        name = (str(column) + '_int_slide_' + str(window_start)
                + '_to_' + str(window_end) + '_lead')
        # Pass (start, end) in that order. They used to be swapped, which
        # handed sliding_integral_lead a negative window length.
        df_P[name] = sliding_integral_lead(df_P, window_start, window_end, column)
        window_start += frequency
    return df_P

def make_func_caller_find_lag_slope(df_P, min_time, max_time, interval, column):
    """Add one lagging-slope column per window length to ``df_P``.

    Window lengths run from ``min_time`` in steps of ``interval`` while they
    do not exceed ``max_time``. Each result of ``find_lag_slope`` is stored
    as ``'<column>_slope_lag_<window>'``.

    Parameters
    ----------
    df_P : pandas.DataFrame
        Frame to read ``column`` from and add the feature columns to.
    min_time, max_time : number
        Smallest window length and inclusive upper limit.
    interval : number
        Step between successive window lengths; must be positive.
    column : str
        Source column name.

    Returns
    -------
    pandas.DataFrame
        The same ``df_P`` object, modified in place.

    Raises
    ------
    ValueError
        If ``interval`` is not positive (the loop would never terminate).
    """
    if interval <= 0 and min_time <= max_time:
        raise ValueError('interval must be positive, got %r' % (interval,))

    window = min_time
    while window <= max_time:
        name = str(column) + '_slope_lag_' + str(window)
        df_P[name] = find_lag_slope(df_P, window, column)
        window += interval
    return df_P

def make_func_caller_find_lead_slope(df_P, min_time, max_time, interval, column):
    """Add one leading-slope column per window length to ``df_P``.

    Window lengths run from ``min_time`` in steps of ``interval`` while they
    do not exceed ``max_time``. Each result of ``find_lead_slope`` is stored
    as ``'<column>_slope_lead_<window>'``.

    Parameters
    ----------
    df_P : pandas.DataFrame
        Frame to read ``column`` from and add the feature columns to.
    min_time, max_time : number
        Smallest window length and inclusive upper limit.
    interval : number
        Step between successive window lengths; must be positive.
    column : str
        Source column name.

    Returns
    -------
    pandas.DataFrame
        The same ``df_P`` object, modified in place.

    Raises
    ------
    ValueError
        If ``interval`` is not positive (the loop would never terminate).
    """
    if interval <= 0 and min_time <= max_time:
        raise ValueError('interval must be positive, got %r' % (interval,))

    window = min_time
    while window <= max_time:
        name = str(column) + '_slope_lead_' + str(window)
        df_P[name] = find_lead_slope(df_P, window, column)
        window += interval
    return df_P

def make_func_caller_find_sliding_slope(df_P, start_bef_or_aft_point, end_bef_or_aft_point, size, column):
    """Add a single sliding-slope column of window ``size`` to ``df_P``.

    The result of ``find_sliding_slope`` is stored as
    ``'<column>_slope_slide_<size>'``.

    The former ``while i <= size`` loop started at ``size`` and stepped by
    ``size``, so it ran exactly once for a positive ``size`` and never
    terminated otherwise; it is replaced by one guarded call.

    NOTE(review): ``find_sliding_slope`` is not defined in the portion of the
    notebook shown here. Confirm it exists before calling this function.

    Parameters
    ----------
    df_P : pandas.DataFrame
        Frame to add the feature column to.
    start_bef_or_aft_point, end_bef_or_aft_point
        Passed through unchanged to ``find_sliding_slope``.
    size : number
        Window size; must be positive.
    column : str
        Source column name.

    Returns
    -------
    pandas.DataFrame
        The same ``df_P`` object, modified in place.

    Raises
    ------
    ValueError
        If ``size`` is not positive.
    """
    if size <= 0:
        raise ValueError('size must be positive, got %r' % (size,))

    name = str(column) + '_slope_slide_' + str(size)
    df_P[name] = find_sliding_slope(df_P, start_bef_or_aft_point, end_bef_or_aft_point, size, column)
    return df_P

In [399]:
# Source columns the features are derived from: o3, temp, and rh.
# Each feature family is generated for all three columns before the next
# family starts, which fixes the order of the new columns in df_P.
sensor_cols = ('e2v03', 'Temp', 'Rh')

# lag integrals, window lengths 5, 20, ..., 110 (start 5, step 15, limit 120)
for sensor in sensor_cols:
    make_func_caller_find_lag_integral(df_P, 5, 120, 15, sensor)

# lead integrals over the same window lengths
for sensor in sensor_cols:
    make_func_caller_find_lead_integral(df_P, 5, 120, 15, sensor)

# lag sliding integrals, window sizes 5 and 10
# inputs- (df_P, max_end_min_before, min_end_min_before, frequency, interval_size, column)
for sensor in sensor_cols:
    make_func_caller_sliding_integral_lag(df_P, 15, 5, 5, 5, sensor)
    make_func_caller_sliding_integral_lag(df_P, 20, 5, 5, 10, sensor)

# lead sliding integrals, window sizes 5 and 10
# inputs- (df_P, min_start_min_after, max_start_min_after, frequency, interval_size, column)
for sensor in sensor_cols:
    make_func_caller_sliding_integral_lead(df_P, 2, 10, 2, 5, sensor)
    make_func_caller_sliding_integral_lead(df_P, 2, 10, 2, 10, sensor)

# lag slopes
for sensor in sensor_cols:
    make_func_caller_find_lag_slope(df_P, 5, 120, 15, sensor)

# lead slopes
for sensor in sensor_cols:
    make_func_caller_find_lead_slope(df_P, 5, 120, 15, sensor)

# Snapshot of every column name now present in the frame.
features = list(df_P.columns)

In [400]:
# Display the list of column names captured from df_P in the previous cell.
features

['O3_ppb',
 'UnixTime',
 'e2v03',
 'Temp',
 'Rh',
 'Zenith Angle [degrees]',
 'days from start',
 'e2v03_int_lag_5',
 'e2v03_int_lag_20',
 'e2v03_int_lag_35',
 'e2v03_int_lag_50',
 'e2v03_int_lag_65',
 'e2v03_int_lag_80',
 'e2v03_int_lag_95',
 'e2v03_int_lag_110',
 'Temp_int_lag_5',
 'Temp_int_lag_20',
 'Temp_int_lag_35',
 'Temp_int_lag_50',
 'Temp_int_lag_65',
 'Temp_int_lag_80',
 'Temp_int_lag_95',
 'Temp_int_lag_110',
 'Rh_int_lag_5',
 'Rh_int_lag_20',
 'Rh_int_lag_35',
 'Rh_int_lag_50',
 'Rh_int_lag_65',
 'Rh_int_lag_80',
 'Rh_int_lag_95',
 'Rh_int_lag_110',
 'e2v03_int_lead_5',
 'e2v03_int_lead_20',
 'e2v03_int_lead_35',
 'e2v03_int_lead_50',
 'e2v03_int_lead_65',
 'e2v03_int_lead_80',
 'e2v03_int_lead_95',
 'e2v03_int_lead_110',
 'Temp_int_lead_5',
 'Temp_int_lead_20',
 'Temp_int_lead_35',
 'Temp_int_lead_50',
 'Temp_int_lead_65',
 'Temp_int_lead_80',
 'Temp_int_lead_95',
 'Temp_int_lead_110',
 'Rh_int_lead_5',
 'Rh_int_lead_20',
 'Rh_int_lead_35',
 'Rh_int_lead_50',
 'Rh_int_lea

In [401]:
# Drop the first and last 120 rows of the dataframe to remove the rows the
# lag/lead windows padded with NaN.
# .iloc makes the positional slice explicit; .ix has been removed from pandas.
df_P = df_P.iloc[120:len(df_P) - 120]

In [404]:
# Single-sensor terms (ozone, temp, rh): square root, square, cube and natural log.
# Each entry is (output name stem, source column); list order fixes the column order.
for stem, source in [('O3', 'e2v03'), ('temp', 'Temp'), ('rh', 'Rh')]:
    df_P[stem + '_sqrt'] = np.sqrt(df_P[source].astype(float))
    df_P[stem + '_sq'] = df_P[source]**2
    df_P[stem + '_cu'] = df_P[source]**3
    df_P['ln_' + stem] = np.log(df_P[source])

# Windowed features that take part in the interactions: lag first, then lead,
# each at 35, 65 and 95; for every window the integral precedes the slope.
window_suffixes = []
for direction in ('lag', 'lead'):
    for minutes in (35, 65, 95):
        for kind in ('int', 'slope'):
            window_suffixes.append('_' + kind + '_' + direction + '_' + str(minutes))

# Pairwise interactions: ozone/temp, ozone/rh, temp/rh.
# Each entry is (product column name, windowed name stem, first column, second column).
pair_specs = [
    ('03_mult_temp', 'o3_temp', 'e2v03', 'Temp'),
    ('03_mult_rh', 'o3_rh', 'e2v03', 'Rh'),
    ('temp_mult_rh', 'temp_rh', 'Temp', 'Rh'),
]
for product_name, window_stem, first, second in pair_specs:
    product = df_P[first]*df_P[second]
    df_P[product_name] = product
    df_P[product_name + '_sq'] = product**2
    df_P[product_name + '_cu'] = product**3
    df_P['ln_' + product_name] = np.log(df_P[product_name])

    for suffix in window_suffixes:
        df_P[window_stem + suffix] = df_P[first + suffix] * df_P[second + suffix]

# Three-way ozone/rh/temp interaction.
triple = df_P['e2v03']*df_P['Rh']*df_P['Temp']
df_P['03_mult_rh_&_temp'] = triple
df_P['03_mult_rh_&_temp_sq'] = triple**2
df_P['03_mult_rh_&_temp_cu'] = triple**3
# The log here is taken of the cubed product, as the column name says.
df_P['ln_03_mult_rh_&_temp_cu'] = np.log(df_P['03_mult_rh_&_temp_cu'])

for suffix in window_suffixes:
    df_P['o3_temp_rh' + suffix] = df_P['Temp' + suffix] * df_P['Rh' + suffix] * df_P['e2v03' + suffix]


print(df_P[:5])

                       O3_ppb    UnixTime      e2v03       Temp         Rh  \
DATE (MM/DD/YYYY)_MST                                                        
7/10/14 22:25              57  1405031127  89.285714  43.900000  11.514286   
7/10/14 22:26              56  1405031188  88.000000  43.900000  11.485714   
7/10/14 22:27              52  1405031250  92.428571  43.900000  11.600000   
7/10/14 22:28              57  1405031311  89.714286  43.928571  11.585714   
7/10/14 22:29              49  1405031373  91.285714  43.985714  11.742857   

                       Zenith Angle [degrees]  days from start  \
DATE (MM/DD/YYYY)_MST                                            
7/10/14 22:25                       113.79576                0   
7/10/14 22:26                       113.87865                0   
7/10/14 22:27                       113.96083                0   
7/10/14 22:28                       114.04229                0   
7/10/14 22:29                       114.12302            

In [405]:
# Drop a further 5 rows from each end of the dataframe.
# .iloc makes the positional slices explicit; .ix has been removed from pandas.
df_P = df_P.iloc[5:len(df_P) - 5]
print(df_P.iloc[:5])

                       O3_ppb    UnixTime       e2v03       Temp         Rh  \
DATE (MM/DD/YYYY)_MST                                                         
7/10/14 22:30              33  1405031430  101.166667  43.966667  11.916667   
7/10/14 22:31              48  1405031487   98.142857  43.942857  11.742857   
7/10/14 22:32              51  1405031548   97.142857  43.928571  11.600000   
7/10/14 22:33              47  1405031610   94.000000  43.942857  11.628571   
7/10/14 22:34              41  1405031671   97.714286  43.914286  11.642857   

                       Zenith Angle [degrees]  days from start  \
DATE (MM/DD/YYYY)_MST                                            
7/10/14 22:30                       114.20303                0   
7/10/14 22:31                       114.28231                0   
7/10/14 22:32                       114.36086                0   
7/10/14 22:33                       114.43868                0   
7/10/14 22:34                       114.51576     

In [406]:
# Write the feature dataframe, including its index, to CSV.
df_P.to_csv(path_or_buf = 'data/D0_raw_features.csv')