In [1]:
import numpy as np
import pandas as pd
import datetime, pytz, calendar, urllib2, json

In [2]:
# Names of the channels targeted by this notebook.
target_channels = [
    "Benzene",
    "Toluene",
    "Xylene",
    "Hydrogen_Sulfide",
    "m_p_Xylene",
    "o_Xylene",
    "Black_Carbon",
    "Ethylbenzene",
    "Sulfur_Dioxide",
    "voc",
    "dust",
]

In [39]:
def exec_ipynb(url):
    """Execute every code cell of a notebook in this module's global namespace.

    url -- an http(s) URL or a local file path pointing at an .ipynb file.

    Both notebook layouts are supported: nbformat 4 ('cells' with 'source')
    and the older worksheet layout ('worksheets'[0]['cells'] with 'input').
    All code cells are joined into one program and executed in a single call,
    so names defined by the notebook become globals here.

    NOTE: this runs whatever code the notebook contains; only point it at
    notebooks you trust.
    """
    import json, re
    from contextlib import closing

    if re.match(r'https?:', url):
        # Imported lazily so that loading a local path does not require urllib2.
        import urllib2
        handle = urllib2.urlopen(url)
    else:
        handle = open(url)
    # closing() guarantees the handle is released; it works for plain files and
    # for urllib2 responses, which are not context managers themselves.
    with closing(handle) as src:
        jsonNb = json.loads(src.read())

    #check for the modified formatting of Jupyter Notebook v4
    if jsonNb['nbformat'] == 4:
        cells, key = jsonNb['cells'], 'source'
    else:
        cells, key = jsonNb['worksheets'][0]['cells'], 'input'
    code = '\n'.join([''.join(cell[key]) for cell in cells if cell['cell_type'] == 'code'])
    # Tuple form: equivalent to `exec code in globals()` on Python 2 and a
    # regular function call on Python 3.
    exec(code, globals())

# Run the code cells of the ESDR helper notebook in this notebook's global
# namespace (the path is resolved relative to the current working directory).
exec_ipynb('python-utils/esdr-library.ipynb')

In [3]:
#returns start and end timestamps of provided date
def getEpochTimeBounds(d, duration):
    """Return UTC epoch-second bounds for a span of US/Pacific calendar days.

    d        -- object exposing year/month/day (e.g. datetime.date); the span
                starts at local midnight of that day
    duration -- length of the span in days (passed to datetime.timedelta)

    Returns {'start': <epoch seconds>, 'end': <epoch seconds>} where 'end' is
    the US/Pacific wall-clock time `duration` days after the start.
    """
    pacific = pytz.timezone("US/Pacific")
    naive_start = datetime.datetime(d.year, d.month, d.day)
    naive_end = naive_start + datetime.timedelta(days=duration)

    # Localize each wall-clock time on its own. Adding a timedelta to an
    # already-localized pytz datetime keeps the start's UTC offset, which puts
    # the end an hour off whenever the span crosses a daylight-saving change.
    start = calendar.timegm(pacific.localize(naive_start).utctimetuple())
    end = calendar.timegm(pacific.localize(naive_end).utctimetuple())
    return {'start': start, 'end': end}
#getEpochTimeBounds(datetime.date(2017,2,1))

In [83]:
def makeDataFrameFromEsdr(feed, channel, timeOptions=None):
    """Download channel data for an ESDR feed and return it as a DataFrame.

    feed        -- feed id, interpolated into the export URL
    channel     -- channel name (or comma-separated names), interpolated into
                   the export URL
    timeOptions -- dict holding either
                     'bounds': {'start': ..., 'end': ...} used as from/to, or
                     'day': a date, plus optional 'duration' in days
                     (defaults to 1), converted with getEpochTimeBounds

    Returns a DataFrame indexed by 'Time' with one column per entry of the
    response's 'channel_names', each named by the third dot-separated part of
    that entry.

    Raises ValueError when neither 'bounds' nor 'day' is supplied. Any error
    raised while downloading or parsing is reported and then re-raised, instead
    of surfacing later as an unrelated NameError.
    """
    from contextlib import closing

    options = timeOptions or {}
    bounds = options.get('bounds')
    if bounds is None:
        day = options.get('day')
        if day is None:
            raise ValueError("timeOptions must contain either 'bounds' or 'day'")
        bounds = getEpochTimeBounds(day, options.get('duration') or 1)

    url = "https://esdr.cmucreatelab.org/api/v1/feeds/%s/channels/%s/export?from=%s&to=%s&format=json" % (feed, channel, bounds['start'], bounds['end'])
    where = "feed %s, channel %s, time %s" % (feed, channel, bounds['start'])
    try:
        # closing() releases the connection even if reading or parsing fails.
        with closing(urllib2.urlopen(url)) as response:
            r = json.loads(response.read())
    except Exception:
        print("error loading data from ESDR: " + where)
        raise
    print("loaded " + str(len(r['data'])) + " data points for " + where)

    cols = ['Time'] + [name.split('.')[2] for name in r['channel_names']]
    return pd.DataFrame(r["data"], columns=cols).set_index(['Time'])
#df = makeDataFrameFromEsdr("4910","Benzene",{'day':datetime.date(2017,2,1)})

In [118]:
def maxHourlyAverage(df, feed="4910", delta=1800):
    """Print the highest centered average of df's first column and the
    prevailing wind quadrant during that window.

    df    -- DataFrame whose first column holds the readings. Index values are
             compared numerically and sent to ESDR as from/to timestamps
             (presumably epoch seconds -- confirm against the data source).
             An 'avg' column is added to df in place.
    feed  -- ESDR feed id (string) queried for Wind_Direction and
             Wind_Speed_MPH; defaults to the feed this notebook always used
    delta -- half-width of the averaging window in index units; the default
             of 1800 on each side gives a one-hour window for second-based
             timestamps

    Raises ValueError when df has no rows.
    """
    if df.empty:
        raise ValueError("maxHourlyAverage needs at least one reading")

    def avg(x):
        # mean of the first column over the window [x - delta, x + delta]
        ser = df.iloc[(df.index >= x - delta) & (df.index <= x + delta), 0]
        return ser.mean()

    df['avg'] = pd.Series(data=df.index, index=df.index).apply(avg)
    maxAvg = df.nlargest(1, 'avg')

    #get wind data for hour with highest average
    peak = maxAvg.index[0]
    bounds = {'start': peak - delta, 'end': peak + delta}
    wind = makeDataFrameFromEsdr(feed, "Wind_Direction,Wind_Speed_MPH", {'bounds': bounds})

    #break into quadrants and select the prevailing one
    quads = [0, 90, 180, 270, 360]
    quad_names = ['NE', 'SE', 'SW', 'NW']
    # pd.cut bins are right-closed, so without include_lowest a direction of
    # exactly 0 degrees would fall outside every quadrant and be dropped.
    wind['Compass_Dir'] = pd.cut(wind['Wind_Direction'], quads, labels=quad_names, include_lowest=True)
    # prevailing quadrant = the one with the largest summed wind speed
    direction = wind.groupby('Compass_Dir').sum().nlargest(1, 'Wind_Speed_MPH').index[0]
    print("The max hourly average is " + str(maxAvg.avg.item()) + " and the prevailing wind direction was towards " + direction)
# NOTE(review): `df` is not assigned in any visible cell (the example load after
# makeDataFrameFromEsdr is commented out), so this call relies on leftover
# kernel state -- define `df` before running.
maxHourlyAverage(df)
# Scratch notes kept from development:
#pd.value_counts(wind['Compass_Dir'])
# Open question: sort into 4 groups by quadrant; is the quadrant with the most values what I'll display? Should I also factor in wind speed (the quadrant with the highest sum of speeds)?
#wind
#df.sort_values('avg',ascending=False)
#df2.loc[df2['avg'] > 1].apply(lambda x: df.loc[(df.index >= x -  )])

loaded 47 data points for feed 4910, channel Wind_Direction,Wind_Speed_MPH, time 1485943440
The max hourly average is 2.84042553191 and the prevailing wind direction was towards SE


In [16]:
def calcDailyMean(df, nd=0):
    """Mean of the 'val' column, optionally counting zero readings as `nd`.

    nd -- non-detect substitute (should represent the chemical's detection
          limit); the default of 0 leaves the readings untouched.
    """
    readings = df['val']
    if nd != 0:
        # zero readings are swapped for the non-detect value before averaging
        readings = df.replace(0.0, nd)['val']
    return readings.mean()
# Example: mean of 'val' with zero readings counted as 0.5.
# NOTE(review): `df` is not assigned in any visible cell -- define it before running.
calcDailyMean(df,0.5)

0.5268156021050452

In [9]:
#total time, in hours, that a detection was present of given chemical or aggregated set of chemicals
def calcHoursDetected(df):
    """Hours with a 'val' reading above zero, at one reading per minute."""
    minutes_detected = len(df[df['val'] > 0])
    return minutes_detected / 60.0
# Example: hours with any detection in `df`.
# NOTE(review): `df` is not assigned in any visible cell -- define it before running.
calcHoursDetected(df)

8.4

In [13]:
#total time, in hours, that detection was greater than health threshold of given chemical
def calcHoursAboveHealthLimit(df, limit):
    """Hours with a 'val' reading strictly above `limit`, at one reading per minute."""
    over_limit = df['val'] > limit
    return len(df[over_limit]) / 60.0
# Example: hours in `df` with a reading above a limit of 1.
# NOTE(review): `df` is not assigned in any visible cell -- define it before running.
calcHoursAboveHealthLimit(df,1)

2.683333333333333

In [12]:
# Scratch check: number of readings above zero (only the 'val' column is kept).
detected = df.loc[df['val'].gt(0), ['val']]
len(detected)

504

In [42]:
# Scratch: pick the rows of df whose index lies within +/- delta of x.
df2 = set()
x, delta = 1486021320, 1800
ser = df[(df.index <= x + delta) & (df.index >= x - delta)]
#pd.merge(ser, df2, how='outer')
#set(ser.index).union(df2)