In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
%matplotlib inline

In [82]:
df_P = pd.io.parsers.read_csv(filepath_or_buffer = 'data/D0_filt_10mino3.csv',index_col = 0)
print df_P[:5]

                       O3_ppb    UnixTime       e2v03       Temp         Rh  \
DATE (MM/DD/YYYY)_MST                                                         
7/10/14 20:25              41  1405023930  114.714286  45.800000  11.400000   
7/10/14 20:26              39  1405023991  120.428571  45.814286  11.314286   
7/10/14 20:27              44  1405024053  119.142857  45.857143  11.300000   
7/10/14 20:28              47  1405024110  117.500000  45.900000  11.300000   
7/10/14 20:29              44  1405024167  120.285714  45.900000  11.457143   

                       Zenith Angle [degrees]  days from start  ref_o3_smooth  \
DATE (MM/DD/YYYY)_MST                                                           
7/10/14 20:25                        99.48960                0             41   
7/10/14 20:26                        99.63789                0             39   
7/10/14 20:27                        99.78580                0             44   
7/10/14 20:28                        99.9

In [83]:
df_P.drop(df_P.columns[[7,8]], axis=1, inplace=True)
print df_P[:5]

                       O3_ppb    UnixTime       e2v03       Temp         Rh  \
DATE (MM/DD/YYYY)_MST                                                         
7/10/14 20:25              41  1405023930  114.714286  45.800000  11.400000   
7/10/14 20:26              39  1405023991  120.428571  45.814286  11.314286   
7/10/14 20:27              44  1405024053  119.142857  45.857143  11.300000   
7/10/14 20:28              47  1405024110  117.500000  45.900000  11.300000   
7/10/14 20:29              44  1405024167  120.285714  45.900000  11.457143   

                       Zenith Angle [degrees]  days from start  
DATE (MM/DD/YYYY)_MST                                           
7/10/14 20:25                        99.48960                0  
7/10/14 20:26                        99.63789                0  
7/10/14 20:27                        99.78580                0  
7/10/14 20:28                        99.93332                0  
7/10/14 20:29                       100.08045           

####Define the slope function.

In [67]:
def find_lag_slope(df_P, int_min, data_col):
    df_P['diff'] = df_P[data_col].diff()
    
    slope = []
    for i in range(int_min,len(df_P['diff'])):
        top = i-int_min
        slope.append(df_P['diff'][top:i].mean())
    
    for i in range(0,int_min):
        slope.insert(i, 'NaN')

    return slope

In [68]:
def find_lead_slope(df_P, int_min, data_col):
    df_P['diff'] = df_P[data_col].diff()
    slope = []
    for i in range(0,len(df_P[data_col])-int_min):
        top = i + int_min
        slope.append(df_P['diff'][i:top].mean())
    
    for i in range(len(df_P[data_col])-int_min, len(df_P[data_col])):
        slope.insert(i, 'NaN')

    return slope

####Define functions that create features that represent the area under the temp and humidity curves.

In [69]:
def find_lag_integral(df_P, int_min, data_col):
    area_curve = []

    for i in range(int_min,len(df_P[data_col])):
        top = i - int_min
        area_curve.append(np.trapz(df_P[data_col][top:i]))
    
    for i in range(0,int_min):
        area_curve.insert(i, 'NaN')
    
    return area_curve

In [70]:
def find_lead_integral(df_P, int_min, data_col):
    area_curve = []

    for i in range(0,len(df_P[data_col])-int_min):
        top = i + int_min
        area_curve.append(np.trapz(df_P[data_col][i:top]))
    
    for i in range(len(df_P[data_col]-int_min), len(df_P[data_col])):
        area_curve.insert(i, 'NaN')
    
    return area_curve

In [71]:
#relative start time must be less than relative end time
def sliding_integral_lag(df_P, start_min_before, end_min_before, column):
    interval = start_min_before - end_min_before 
    a = df_P['e2v03'].shift(end_min_before).values
    v = np.zeros(interval)
    v[0:interval] = 1
    out = np.convolve(a, v, 'valid')
    out = np.concatenate((np.array([float('nan')] * interval), out))
    out = np.delete(out, len(out)-1)
    return out

In [22]:
#0-5 minutes before, i = 5
a = df_P['e2v03'].shift(0)[:25].values
v = np.zeros(5)
v[0:5] = 1
out = np.convolve(a, v, 'valid')
out = np.concatenate((np.array([float('nan')] * 5), out))
out = np.delete(out, len(out)-1)
print out
print len(a)
print len(out)

[         nan          nan          nan          nan          nan
  592.0714285  594.0714285  587.3571428  575.5        563.8571429
  548.4047619  537.1190476  533.6904762  535.6904762  537.547619
  538.4285714  539.5        537.2142857  536.2142857  534.5        541.0714286
  546.1428572  546.6428572  544.2142858  544.5000001]
25
25


In [24]:
#5-10 minutes before
a = df_P['e2v03'].shift(5)[:25].values
v = np.zeros(5)
v[0:5] = 1
out = np.convolve(a, v, 'valid')
out = np.concatenate((np.array([float('nan')] * 5), out))
out = np.delete(out, len(out)-1)
print out
print len(out)

[         nan          nan          nan          nan          nan
          nan          nan          nan          nan          nan
  592.0714285  594.0714285  587.3571428  575.5        563.8571429
  548.4047619  537.1190476  533.6904762  535.6904762  537.547619
  538.4285714  539.5        537.2142857  536.2142857  534.5      ]
25


In [49]:
#0-10 minutes before
a = df_P['e2v03'].shift(0).values
v = np.zeros(10)
v[0:10] = 1
out = np.convolve(a, v, 'valid')
out = np.concatenate((np.array([float('nan')] * 5), out))
out = np.delete(out, len(out)-1)
print out
print len(out)

[          nan           nan           nan ...,  976.42857138  975.28571423
  975.42857138]
10073


In [47]:
#10-15 minutes before
a = df_P['e2v03'].shift(10).values
v = np.zeros(10)
v[0:5] = 1
out = np.convolve(a, v, 'valid')
out = np.concatenate((np.array([float('nan')] * 5), out))
out = np.delete(out, len(out)-1)
print out[:20]
print len(out)
print len(a)

[         nan          nan          nan          nan          nan
          nan          nan          nan          nan          nan
          nan          nan          nan          nan          nan
  548.4047619  537.1190476  533.6904762  535.6904762  537.547619 ]
10073
10078


###Call the slope functions and add them to the dataframe.

In [72]:
def make_func_caller_find_lag_integral(df_P, min_time, max_time, interval, column):
    i = min_time
    while i <= max_time:    
        df_P[str(column) + '_int' + '_lag_' + str(i)] = find_lag_integral(df_P, i, column)      
        i += interval
    return df_P  

def make_func_caller_find_lead_integral(df_P, min_time, max_time, interval, column):
    i = min_time
    while i <= max_time:    
        df_P[str(column) + '_int' + '_lead_' + str(i)] = find_lead_integral(df_P, i, column)      
        i += interval
    return df_P 

def make_func_caller_sliding_integral_lag(df_P, max_end_min_before, min_end_min_before, frequency, interval_size, column):
    i = min_end_min_before
    while i <= max_end_min_before:
        start = i + interval_size
        df_P[str(column) + '_int' + '_slide_' + str(start) + '_to_' + str(i) + '_before'] = sliding_integral_lag(df_P, start, i, column)      
        i += frequency
    return df_P 

def make_func_caller_find_lag_slope(df_P, min_time, max_time, interval, column):
    i = min_time
    while i <= max_time:    
        df_P[str(column) + '_slope' + '_lag_' + str(i)] = find_lag_slope(df_P, i, column)      
        i += interval
    return df_P 

def make_func_caller_find_lead_slope(df_P, min_time, max_time, interval, column):
    i = min_time
    while i <= max_time:    
        df_P[str(column) + '_slope' + '_lead_' + str(i)] = find_lead_slope(df_P, i, column)      
        i += interval
    return df_P 

def make_func_caller_find_sliding_slope(df_P, start_bef_or_aft_point, end_bef_or_aft_point, size, column):
    i = size
    while i <= size:    
        df_P[str(column) + '_slope' + '_slide_' + str(i)] = find_sliding_slope(df_P, start_bef_or_aft_point, end_bef_or_aft_point, i, column)      
        i += size
    return df_P

In [None]:
#call the lag int function for o3, temp, and rh
make_func_caller_find_lag_integral(df_P, 1, 115, 10, 'e2v03')
make_func_caller_find_lag_integral(df_P, 1, 115, 10, 'Temp')
make_func_caller_find_lag_integral(df_P, 1, 115, 10, 'Rh')

#call the lead int for o3, temp, and rh
make_func_caller_find_lead_integral(df_P, 1, 115, 10, 'e2v03')
make_func_caller_find_lead_integral(df_P, 1, 115, 10, 'Temp')
make_func_caller_find_lead_integral(df_P, 1, 115, 10, 'Rh')

#call the lag sliding integral function
# inputs- (df_P, max_end_min_before, min_end_min_before, frequency, interval_size, column)
make_func_caller_sliding_integral_lag(df_P, 90, 0, 5, 5, 'e2v03')
make_func_caller_sliding_integral_lag(df_P, 90, 0, 5, 10, 'e2v03')
make_func_caller_sliding_integral_lag(df_P, 90, 0, 5, 15, 'e2v03')
make_func_caller_sliding_integral_lag(df_P, 90, 0, 5, 30, 'e2v03')

#call the lag slope for o3, temp, and rh
make_func_caller_find_lag_slope(df_P, 1, 115, 10, 'e2v03')
make_func_caller_find_lag_slope(df_P, 1, 115, 10, 'Temp')
make_func_caller_find_lag_slope(df_P, 1, 115, 10, 'Rh')

#call the lead slope for o3, temp, and rh
make_func_caller_find_lead_slope(df_P, 1, 115, 10, 'e2v03')
make_func_caller_find_lead_slope(df_P, 1, 115, 10, 'Temp')
make_func_caller_find_lead_slope(df_P, 1, 115, 10, 'Rh')

df_P[:5]

In [85]:
df_P

Unnamed: 0_level_0,O3_ppb,UnixTime,e2v03,Temp,Rh,Zenith Angle [degrees],days from start,e2v03_int_lag_1,e2v03_int_lag_11,e2v03_int_lag_21,...,Rh_int_lag_21,Rh_int_lag_31,Rh_int_lag_41,Rh_int_lag_51,Rh_int_lag_61,Rh_int_lag_71,Rh_int_lag_81,Rh_int_lag_91,Rh_int_lag_101,Rh_int_lag_111
DATE (MM/DD/YYYY)_MST,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7/10/14 20:25,41,1405023930,114.714286,45.800000,11.400000,99.48960,0,,,,...,,,,,,,,,,
7/10/14 20:26,39,1405023991,120.428571,45.814286,11.314286,99.63789,0,0,,,...,,,,,,,,,,
7/10/14 20:27,44,1405024053,119.142857,45.857143,11.300000,99.78580,0,0,,,...,,,,,,,,,,
7/10/14 20:28,47,1405024110,117.500000,45.900000,11.300000,99.93332,0,0,,,...,,,,,,,,,,
7/10/14 20:29,44,1405024167,120.285714,45.900000,11.457143,100.08045,0,0,,,...,,,,,,,,,,
7/10/14 20:30,45,1405024228,116.714286,45.900000,11.271429,100.22718,0,0,,,...,,,,,,,,,,
7/10/14 20:31,49,1405024290,113.714286,45.842857,11.114286,100.37352,0,0,,,...,,,,,,,,,,
7/10/14 20:32,55,1405024351,107.285714,45.800000,11.042857,100.51946,0,0,,,...,,,,,,,,,,
7/10/14 20:33,58,1405024412,105.857143,45.800000,11.000000,100.66500,0,0,,,...,,,,,,,,,,
7/10/14 20:34,58,1405024470,104.833333,45.733333,10.883333,100.81014,0,0,,,...,,,,,,,,,,


In [87]:
#Delete the first and last 115 rows of the dataframe to remove all NaNs.
df_P = df_P.ix[115:len(df_P['e2v03'])-115]
print df_P.ix[:5]

                       O3_ppb    UnixTime      e2v03       Temp         Rh  \
DATE (MM/DD/YYYY)_MST                                                        
7/11/14 0:49               45  1405039772  90.142857  38.200000  18.500000   
7/11/14 0:50               43  1405039833  92.000000  38.142857  18.500000   
7/11/14 0:51               45  1405039890  92.000000  38.116667  18.500000   
7/11/14 0:52               47  1405039947  93.714286  38.114286  18.500000   
7/11/14 0:53               48  1405040008  96.142857  38.100000  18.542857   

                       Zenith Angle [degrees]  days from start  \
DATE (MM/DD/YYYY)_MST                                            
7/11/14 0:49                        117.36216                0   
7/11/14 0:50                        117.32460                0   
7/11/14 0:51                        117.28620                0   
7/11/14 0:52                        117.24695                0   
7/11/14 0:53                        117.20686            

In [55]:
#df_P.drop(df_P.columns[[9]], axis=1, inplace=True)
#print df_P[:5]

In [88]:
#ozone terms
df_P['O3_sqrt'] = np.sqrt(df_P['e2v03'].astype(float))
df_P['O3_sq'] = df_P['e2v03']**2
df_P['O3_cu'] = df_P['e2v03']**3
df_P['ln_O3'] = np.log(df_P['e2v03'])


#temp terms
df_P['temp_sqrt'] = np.sqrt(df_P['Temp'].astype(float))
df_P['temp_sq'] = df_P['Temp']**2
df_P['temp_cu'] = df_P['Temp']**3
df_P['ln_temp'] = np.log(df_P['Temp'])


#rh terms
df_P['rh_sqrt'] = np.sqrt(df_P['Rh'].astype(float))
df_P['rh_sq'] = df_P['Rh']**2
df_P['rh_cu'] = df_P['Rh']**3
df_P['ln_rh'] = np.log(df_P['Rh'])


#ozone/temp interactions
df_P['03_mult_temp'] = df_P['e2v03']*df_P['Temp']
df_P['03_mult_temp_sq'] = (df_P['e2v03']*df_P['Temp'])**2
df_P['03_mult_temp_cu'] = (df_P['e2v03']*df_P['Temp'])**3
df_P['ln_03_mult_temp'] = np.log(df_P['03_mult_temp'])


#ozone/rh interactions
df_P['03_mult_rh'] = df_P['e2v03']*df_P['Rh']
df_P['03_mult_rh_sq'] = (df_P['e2v03']*df_P['Rh'])**2
df_P['03_mult_rh_cu'] = (df_P['e2v03']*df_P['Rh'])**3
df_P['ln_03_mult_rh'] = np.log(df_P['03_mult_rh'])

#temp/rh interactions
df_P['temp_mult_rh'] = df_P['Temp']*df_P['Rh']
df_P['temp_mult_rh_sq'] = (df_P['Temp']*df_P['Rh'])**2
df_P['temp_mult_rh_cu'] = (df_P['Temp']*df_P['Rh'])**3
df_P['ln_temp_mult_rh'] = np.log(df_P['temp_mult_rh'])


#ozone/rh/temp interactions
df_P['03_mult_rh_&_temp'] = df_P['e2v03']*df_P['Rh']*df_P['Temp']
df_P['03_mult_rh_&_temp_sq'] = (df_P['e2v03']*df_P['Rh']*df_P['Temp'])**2
df_P['03_mult_rh_&_temp_cu'] = (df_P['e2v03']*df_P['Rh']*df_P['Temp'])**3
df_P['ln_03_mult_rh_&_temp_cu'] = np.log(df_P['03_mult_rh_&_temp_cu'])

print df_P[:5]

                       O3_ppb    UnixTime      e2v03       Temp         Rh  \
DATE (MM/DD/YYYY)_MST                                                        
7/11/14 0:49               45  1405039772  90.142857  38.200000  18.500000   
7/11/14 0:50               43  1405039833  92.000000  38.142857  18.500000   
7/11/14 0:51               45  1405039890  92.000000  38.116667  18.500000   
7/11/14 0:52               47  1405039947  93.714286  38.114286  18.500000   
7/11/14 0:53               48  1405040008  96.142857  38.100000  18.542857   

                       Zenith Angle [degrees]  days from start  \
DATE (MM/DD/YYYY)_MST                                            
7/11/14 0:49                        117.36216                0   
7/11/14 0:50                        117.32460                0   
7/11/14 0:51                        117.28620                0   
7/11/14 0:52                        117.24695                0   
7/11/14 0:53                        117.20686            

In [89]:
df_P.to_csv(path_or_buf = 'data/D0_raw_features.csv')