In [123]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
%matplotlib inline

In [124]:
#df_P = pd.read_csv(filepath_or_buffer = 'D3_pre.csv', parse_dates = [[1,2]], index_col = False)  
#df_P = df_P.set_index('YYYY/MM/DD_HH:MM:SS')
#df_P.drop(df_P.columns[[0,2,3,4,5,6,9,10,11,12,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40]], axis=1, inplace=True)
# Load the filtered table; the first CSV column (the timestamp strings shown in
# the output below) becomes the index. No parse_dates is requested, so the index
# stays as plain strings rather than datetimes.
df_P = pd.read_csv('F6_filt_10mino3.csv', index_col=0)
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(df_P[:5])

                       O3_ppb      UnixTime      e2v03       Temp         Rh  \
DATE (MM/DD/YYYY)_MST                                                          
2014-07-11 02:00:00        39  1.405044e+09  93.115385  34.188462  24.403846   
2014-07-11 02:01:00        36  1.405044e+09  80.600000  34.120000  24.380000   
2014-07-11 02:02:00        35  1.405044e+09  75.920000  34.100000  24.444000   
2014-07-11 02:03:00        36  1.405044e+09  81.346154  34.100000  24.361538   
2014-07-11 02:04:00        36  1.405044e+09  83.240000  34.080000  24.632000   

                       Zenith Angle [degrees]  days from start  ref_o3_smooth  \
DATE (MM/DD/YYYY)_MST                                                           
2014-07-11 02:00:00                 112.71042                0             39   
2014-07-11 02:01:00                 112.61820                0             36   
2014-07-11 02:02:00                 112.52530                0             35   
2014-07-11 02:03:00               

#### Define functions that create features representing the area under the temperature and humidity curves.

In [125]:
def make_temp_time_integral_features(df_P,int_min):
    """Return the trailing-window area under the ``Temp`` curve, one value per row.

    For every row ``i >= int_min`` the value is the trapezoidal integral (unit
    spacing between samples) of the ``int_min`` samples that precede row ``i``,
    i.e. ``Temp[i - int_min:i]``; row ``i`` itself is not part of its own
    window. Rows that do not have a complete window behind them get NaN.

    Parameters
    ----------
    df_P : pandas.DataFrame
        Frame with a numeric ``Temp`` column.
    int_min : int
        Window length, counted in rows. (The name suggests minutes, which
        holds only if the data is sampled once per minute.)

    Returns
    -------
    list of float
        Exactly ``len(df_P)`` values: ``min(int_min, len(df_P))`` leading
        ``numpy.nan`` entries followed by the window integrals.

    Raises
    ------
    ValueError
        If ``int_min`` is smaller than 1.
    """
    if int_min < 1:
        raise ValueError("int_min must be a positive number of rows")

    # np.trapz was renamed np.trapezoid in NumPy 2.0; use whichever is available.
    integrate = getattr(np, "trapezoid", None) or np.trapz

    temps = np.asarray(df_P['Temp'], dtype=float)
    n_rows = len(temps)

    # Pad with real NaN (not the string 'NaN') so the resulting column stays
    # numeric, and never pad beyond the length of the frame.
    area_curve = [np.nan] * min(int_min, n_rows)
    area_curve.extend(
        integrate(temps[i - int_min:i]) for i in range(int_min, n_rows)
    )
    return area_curve

In [126]:
def make_rh_time_integral_features(df_P,int_min):
    """Return the trailing-window area under the ``Rh`` curve, one value per row.

    For every row ``i >= int_min`` the value is the trapezoidal integral (unit
    spacing between samples) of the ``int_min`` samples that precede row ``i``,
    i.e. ``Rh[i - int_min:i]``; row ``i`` itself is not part of its own
    window. Rows that do not have a complete window behind them get NaN.

    Parameters
    ----------
    df_P : pandas.DataFrame
        Frame with a numeric ``Rh`` column.
    int_min : int
        Window length, counted in rows. (The name suggests minutes, which
        holds only if the data is sampled once per minute.)

    Returns
    -------
    list of float
        Exactly ``len(df_P)`` values: ``min(int_min, len(df_P))`` leading
        ``numpy.nan`` entries followed by the window integrals.

    Raises
    ------
    ValueError
        If ``int_min`` is smaller than 1.
    """
    if int_min < 1:
        raise ValueError("int_min must be a positive number of rows")

    # np.trapz was renamed np.trapezoid in NumPy 2.0; use whichever is available.
    integrate = getattr(np, "trapezoid", None) or np.trapz

    humidity = np.asarray(df_P['Rh'], dtype=float)
    n_rows = len(humidity)

    # Insert NaNs at the beginning of the series, where there are not yet
    # int_min earlier samples to integrate over. Real NaN (not the string
    # 'NaN') keeps the resulting column numeric; never pad past the frame length.
    area_curve = [np.nan] * min(int_min, n_rows)
    area_curve.extend(
        integrate(humidity[i - int_min:i]) for i in range(int_min, n_rows)
    )
    return area_curve

### Call the integration functions and add the results to the dataframe.

In [127]:
# Add one integral-feature column per (signal, window) pair. The humidity
# columns are created first and the temperature columns second, each from the
# longest window to the shortest, so the column order of df_P is unchanged.
for column_prefix, build_feature in (
    ('rh_int_', make_rh_time_integral_features),
    ('temp_int_', make_temp_time_integral_features),
):
    for window in (90, 75, 60, 45, 30):
        df_P[column_prefix + str(window)] = build_feature(df_P, window)

In [128]:
# Delete the first 90 rows of the dataframe to remove all NaNs: 90 is the
# longest integration window requested above, so earlier rows have no complete
# window behind them.
# .ix was removed in pandas 1.0. On this non-integer index, .ix[90:] sliced by
# position, which .iloc does explicitly. .copy() makes df_P an independent
# frame so the column assignments that follow do not write into a slice.
df_P = df_P.iloc[90:].copy()


In [129]:
#df_P.drop(df_P.columns[[9]], axis=1, inplace=True)
#print df_P[:5]

In [130]:
# Derive polynomial, logarithmic and interaction features from the e2v03
# (ozone), Temp and Rh columns. Column names and creation order are kept
# exactly as before because they define the layout of the exported table.
# NOTE(review): np.log returns -inf/NaN for values <= 0 - confirm that e2v03,
# Temp and Rh are strictly positive in this data set.
# NOTE(review): the interaction columns are spelled with a zero ('03_...')
# while the ozone terms use the letter O ('O3_...'); left unchanged so that
# existing consumers of the column names keep working.

#ozone terms
df_P['O3_sq'] = df_P['e2v03']**2
df_P['O3_cu'] = df_P['e2v03']**3
df_P['ln_O3'] = np.log(df_P['e2v03'])

#temp terms
df_P['temp_sq'] = df_P['Temp']**2
df_P['temp_cu'] = df_P['Temp']**3
df_P['ln_temp'] = np.log(df_P['Temp'])
# Cast to float, not int: the cast is only needed to give np.log a numeric
# dtype, and an int cast would truncate the integral before the log is taken.
df_P['ln_temp_int_90'] = np.log(df_P['temp_int_90'].astype(float))

#rh terms
df_P['rh_sq'] = df_P['Rh']**2
df_P['rh_cu'] = df_P['Rh']**3
df_P['ln_rh'] = np.log(df_P['Rh'])
# Same float cast as for the temperature integral above.
df_P['ln_rh_int_90'] = np.log(df_P['rh_int_90'].astype(float))

#ozone/temp interactions (the product is computed once and reused)
df_P['03_mult_temp'] = df_P['e2v03']*df_P['Temp']
df_P['03_mult_temp_sq'] = df_P['03_mult_temp']**2
df_P['03_mult_temp_cu'] = df_P['03_mult_temp']**3
df_P['ln_03_mult_temp'] = np.log(df_P['03_mult_temp'])

#ozone/rh interactions
df_P['03_mult_rh'] = df_P['e2v03']*df_P['Rh']
df_P['03_mult_rh_sq'] = df_P['03_mult_rh']**2
df_P['03_mult_rh_cu'] = df_P['03_mult_rh']**3
df_P['ln_03_mult_rh'] = np.log(df_P['03_mult_rh'])

#temp/rh interactions
df_P['temp_mult_rh'] = df_P['Temp']*df_P['Rh']
df_P['temp_mult_rh_sq'] = df_P['temp_mult_rh']**2
df_P['temp_mult_rh_cu'] = df_P['temp_mult_rh']**3
df_P['ln_temp_mult_rh'] = np.log(df_P['temp_mult_rh'])


#ozone/rh/temp interactions
df_P['03_mult_rh_&_temp'] = df_P['e2v03']*df_P['Rh']*df_P['Temp']
df_P['03_mult_rh_&_temp_sq'] = df_P['03_mult_rh_&_temp']**2
df_P['03_mult_rh_&_temp_cu'] = df_P['03_mult_rh_&_temp']**3
# NOTE(review): unlike the other ln_* interaction columns, this one is the log
# of the *cubed* product (as its name says) - confirm that is intended.
df_P['ln_03_mult_rh_&_temp_cu'] = np.log(df_P['03_mult_rh_&_temp_cu'])

# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(df_P[:5])

                       O3_ppb      UnixTime      e2v03    Temp         Rh  \
DATE (MM/DD/YYYY)_MST                                                       
2014-07-11 03:30:00        31  1.405049e+09  71.560000  32.200  25.232000   
2014-07-11 03:31:00        30  1.405049e+09  64.200000  32.200  24.976000   
2014-07-11 03:32:00        29  1.405050e+09  61.960000  32.200  24.996000   
2014-07-11 03:33:00        26  1.405050e+09  59.640000  32.132  25.128000   
2014-07-11 03:34:00        22  1.405050e+09  55.423077  32.100  25.242308   

                       Zenith Angle [degrees]  days from start  ref_o3_smooth  \
DATE (MM/DD/YYYY)_MST                                                           
2014-07-11 03:30:00                 102.04297                0             31   
2014-07-11 03:31:00                 101.90141                0             30   
2014-07-11 03:32:00                 101.75942                0             29   
2014-07-11 03:33:00                 101.61701          

In [131]:
# Write the feature table, index included, to a CSV file in the working directory.
df_P.to_csv('F6_clean_features.csv')