# Initial naive modeling
This is an attempt to build a simple vector field to see whether its discontinuities are plausible pollution-source locations. Each sensor location in the bounding box becomes a node with four vectors attached, one at each cardinal point of the compass. If the wind is blowing to the North, the vector attached to the north side of the node is drawn pointing away from the node (an emitter) if the pollution is elevated; otherwise it is drawn pointing toward the node (a sink). In this scenario, we would expect to see the following pattern, where P is a source of pollution and each N is a node:  
<-- N <-- P --> N -->

In [8]:
# import libraries
import pandas as pd
import numpy as np 
import warnings
import gmplot
import sys
from copy import deepcopy
sys.path.append("../SingleDayAnalysis/")
from getData import get_data

warnings.filterwarnings('ignore')

In [9]:
# Folder holding the local parquet extract (sep27_full.parquet, downloaded
# from the shared google drive).
# NOTE(review): in the cells visible here, datafolder is only referenced by
# commented-out pd.read_parquet calls; the data actually comes from get_data.
# Confirm whether this path is still needed.

# Ben's local path to the parquet file
datafolder = "../my_stash/data/purpleair/"

# Mark's local path to the parquet file (uncomment to use instead)
# datafolder = "../data/purpleair"

In [3]:
# Ben path
#data_df = pd.read_parquet("{}/sep27_full.parquet".format(datafolder))

# Mark path prior to using getData.py
# data_df = pd.read_parquet("{}/0927Full.parquet".format(datafolder))

In [12]:
# Ask get_data for the cleaned readings inside a bounding box and time window.

# edges of the bounding box, in decimal degrees
LAT_TOP, LAT_BOTTOM = 38.008050, 37.701933       # upper / lower latitude
LON_LEFT, LON_RIGHT = -122.536985, -122.186437   # left / right longitude

# the four corners, each as (lat, lon)
UP_LEFT = (LAT_TOP, LON_LEFT)
UP_RIGHT = (LAT_TOP, LON_RIGHT)
DOWN_RIGHT = (LAT_BOTTOM, LON_RIGHT)
DOWN_LEFT = (LAT_BOTTOM, LON_LEFT)

# date range to pull
START_DATE = '2019/09/01'
END_DATE = '2019/09/30'

# hours of EACH DAY to keep, so that time-of-day effects can be controlled for
START_HOUR = '10'
END_HOUR = '18'

data_df = get_data(
    UP_LEFT, UP_RIGHT, DOWN_RIGHT, DOWN_LEFT,
    START_DATE, END_DATE,
    START_HOUR, END_HOUR,
)

In [13]:
# keep only the readings that actually carry a 2_5um value
data_df = data_df.dropna(subset=['2_5um'])

In [14]:
# preview the first rows of the filtered readings
data_df.head()

Unnamed: 0,0_3um,0_5um,10_0um,1_0um,2_5um,5_0um,a_h,agency_name,aqi,call_sign,...,uptime,variable_wind_info,variable_winds,wban_number,wind_data,wind_direction,wind_speed,year,zipcode,zulu_time
0,921.88,270.26,0.53,54.56,5.24,1.85,,San Francisco Bay Area AQMD,4,,...,59678.0,,,,,,,2019,94112,
1,581.91,169.14,0.38,38.39,4.9,2.93,,San Francisco Bay Area AQMD,4,,...,59688.0,,,,,,,2019,94112,
2,721.61,209.68,0.93,38.59,6.29,2.09,,San Francisco Bay Area AQMD,4,,...,59698.0,,,,,,,2019,94112,
3,625.29,184.16,0.49,49.73,5.23,2.14,,San Francisco Bay Area AQMD,4,,...,59708.0,,,,,,,2019,94112,
4,618.0,184.75,1.97,40.01,6.71,3.51,,San Francisco Bay Area AQMD,4,,...,59718.0,,,,,,,2019,94112,


In [15]:
# attach each sensor's mean 2_5um reading to every one of its rows
average_col = data_df.groupby('sensor_id')['2_5um'].mean()
data_df = (
    data_df
    .set_index('sensor_id')              # index by sensor so the means line up
    .assign(avg_2_5um=average_col)       # aligned on sensor_id
    .reset_index()                       # sensor_id back to a column, fresh 0..n-1 index
)

In [16]:
# define direction degree range
# Each constant is an inclusive (start, end) pair of whole degrees; the
# wind_compass classification compares int(wind_direction) against them.
# NORTH wraps through 0/360, so its start is larger than its end and it has
# to be matched with "start or end" rather than "start and end".
NORTH = (316,45)
EAST = (46,135)
SOUTH = (136,225)
WEST = (226,315)

In [19]:
# Add a categorical column that indicates the wind direction of every reading:
# North, South, East, West, No wind, Missing, ERROR

def _wind_compass_label(direction, speed):
    """Return the wind_compass category for a single reading.

    - direction: raw wind_direction value; it is truncated to whole degrees
      with int() before being compared against NORTH/EAST/SOUTH/WEST
    - speed: raw wind_speed value

    Returns one of:
    - 'Missing'  when direction cannot be converted to an integer
    - 'No wind'  when speed equals 0 (checked before the direction itself)
    - 'ERROR'    when the direction lies outside 0..360 degrees
    - 'North', 'East', 'South', 'West' otherwise
    """
    try:
        degree = int(direction)
    except (TypeError, ValueError, OverflowError):
        # None, NaN, infinity or non-numeric text: there is no usable direction
        return 'Missing'

    if speed == 0:
        return 'No wind'

    # Without this guard the wrap-around test for North (">= 316 or <= 45")
    # also swallows negative values and values above 360, which makes the
    # ERROR category unreachable.
    if not 0 <= degree <= 360:
        return 'ERROR'

    if degree >= NORTH[0] or degree <= NORTH[1]:
        return 'North'
    if EAST[0] <= degree <= EAST[1]:
        return 'East'
    if SOUTH[0] <= degree <= SOUTH[1]:
        return 'South'
    if WEST[0] <= degree <= WEST[1]:
        return 'West'
    return 'ERROR'


# Walk the two columns in lockstep instead of materialising every row with
# data_df.loc[row]; the labels are assigned positionally, so this does not
# depend on data_df having a 0..n-1 index.
wind_compass = [
    _wind_compass_label(direction, speed)
    for direction, speed in zip(data_df['wind_direction'], data_df['wind_speed'])
]
data_df['wind_compass'] = wind_compass

In [20]:
# Count the non-null entries of every column for each wind_compass category.
# Not every category is guaranteed to appear in a given extract.
# NOTE(review): the earlier note here said "this is one day", but START_DATE /
# END_DATE passed to get_data span 2019/09/01 - 2019/09/30 -- confirm which
# period this table is meant to describe.
data_df.groupby('wind_compass').count()

Unnamed: 0_level_0,sensor_id,0_3um,0_5um,10_0um,1_0um,2_5um,5_0um,a_h,agency_name,aqi,...,variable_wind_info,variable_winds,wban_number,wind_data,wind_direction,wind_speed,year,zipcode,zulu_time,avg_2_5um
wind_compass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
East,8248,8248,8248,8248,8248,8248,8248,26,8204,8204,...,8248,8248,8248,8248,8248,8248,8248,8182,8248,8248
Missing,111015,111015,111015,111015,111015,111015,111015,42,110691,110691,...,8302,8302,8302,8302,8302,6799,111015,110143,8302,111015
No wind,11223,11223,11223,11223,11223,11223,11223,34,11212,11212,...,11223,11223,11223,11223,11223,11223,11223,11132,11223,11223
North,44201,44201,44201,44201,44201,44201,44201,76,39221,39221,...,44201,44201,44201,44201,44201,44201,44201,43854,44201,44201
South,30088,30088,30088,30088,30088,30088,30088,39,29955,29955,...,30088,30088,30088,30088,30088,30088,30088,30052,30088,30088
West,608985,608985,608985,608985,608985,608985,608985,336,593725,593725,...,608985,608985,608985,608985,608985,608985,608985,604295,608985,608985


In [21]:
# Build model_df with one row per sensor and the columns
#   sensor_id, lat, lon, avg_2_5um, north_delta, south_delta, east_delta, west_delta
# where <direction>_delta is the sensor's mean 2_5um while the wind was
# classified as that direction minus the sensor's overall mean 2_5um.
# A delta of 0 is stored when the sensor has no readings for that direction.

# Only numeric columns are averaged: calling .mean() on a frame that still
# contains the string column wind_compass raises on recent pandas versions.
numeric_cols = ['lat', 'lon', '2_5um']

# get average for each sensor
sensor_avg = data_df.groupby('sensor_id')[numeric_cols].mean().reset_index()
sensor_avg.rename(columns={"2_5um": "avg_2_5um"}, inplace=True)

# get average for each sensor *given a particular cardinal wind direction*
sensor_avg_wind = (
    data_df.groupby(['sensor_id', 'wind_compass'])[numeric_cols]
    .mean()
    .reset_index()
)

# One row per sensor, one column per direction, NaN where a sensor never saw
# that direction. Reindexing puts the rows in the same order as sensor_avg and
# guarantees all four direction columns exist even if one never occurs.
wind_means = sensor_avg_wind.pivot(index='sensor_id',
                                   columns='wind_compass',
                                   values='2_5um')
wind_means = wind_means.reindex(index=sensor_avg['sensor_id'].values,
                                columns=['North', 'South', 'East', 'West'])

model_df = sensor_avg.copy()
overall_avg = model_df['avg_2_5um'].values

# Whole-column assignment replaces the per-row chained assignment
# (model_df.north_delta[ind] = ...), which triggers SettingWithCopy warnings
# and does not write through when pandas copy-on-write is enabled.
for direction, delta_col in (('North', 'north_delta'),
                             ('South', 'south_delta'),
                             ('East', 'east_delta'),
                             ('West', 'west_delta')):
    directional_avg = wind_means[direction].values
    model_df[delta_col] = np.where(np.isnan(directional_avg),
                                   0.0,
                                   directional_avg - overall_avg)

# save RAM
sensor_avg = None

In [22]:
# inspect the last rows of the per-sensor model table
model_df.tail()

Unnamed: 0,sensor_id,lat,lon,avg_2_5um,north_delta,south_delta,east_delta,west_delta
541,7811,37.800237,-122.456112,3.992599,0.111686,1.066812,-0.869523,-0.10968
542,7824,37.871848,-122.271067,1.221791,0.0,-0.406791,0.0,-0.051124
543,7825,37.871848,-122.271067,1.447313,0.0,-0.902313,0.0,-0.043647
544,8468,37.87515,-122.257743,0.78071,0.675188,-0.237932,-0.32071,-0.01109
545,8469,37.87515,-122.257743,0.923025,0.878257,-0.209691,-0.923025,-0.029962


In [23]:
# map the sensors on google maps api
# arrow drawing code drawn from this repo: https://github.com/selimamrouni/hexagon-and-arrow-for-gmplot/blob/master/gmplot_h_a.ipynb

def arrow(center, length_arrow, angle):
    """
    Build the three line segments (shaft and two head barbs) of an arrow.

    The function takes as input:
    - center: tuple (lat, lon) the arrow is attached to
    - length_arrow: signed length of the arrow, in the same units as center
        - if zero or positive, the head sits at the far end of the shaft, so
          the arrow points outward, away from center
        - if negative, the head sits on center, so the arrow points inward
    - angle: direction of the shaft (in degrees). The shaft end is center plus
      |length_arrow| * (sin(angle), cos(angle)), so 0 runs toward increasing
      lon and 90 toward increasing lat.

    The return is a list of three segments [shaft, barb, barb]; every segment
    is a list of two (lat, lon) points.
    """
    theta = np.pi / 180 * angle
    points_outward = length_arrow >= 0

    def shifted(base, scale, direction):
        # base moved by scale * (sin(direction), cos(direction))
        step = (scale * np.sin(direction), scale * np.cos(direction))
        return tuple(b + s for b, s in zip(base, step))

    # the shaft always runs from center along `angle`, whatever the sign
    shaft_scale = length_arrow if points_outward else -length_arrow
    tip = shifted(center, shaft_scale, theta)

    # the barbs hang off the far end for an outward arrow, off center otherwise
    head_at = tip if points_outward else center
    barb_scale = -0.4 * length_arrow
    barb_ends = [shifted(head_at, barb_scale, theta + spread)
                 for spread in (np.pi / 12, -np.pi / 12)]

    return [[center, tip]] + [[head_at, end] for end in barb_ends]

# get the plot started: centre the map on the middle of the sensors' extent
latitude_list = model_df['lat'].tolist()
longitude_list = model_df['lon'].tolist()
gmap3 = gmplot.GoogleMapPlotter((max(latitude_list) + min(latitude_list)) / 2,
                                (max(longitude_list) + min(longitude_list)) / 2,
                                zoom=11)
# '#FF0000' (no space after '#') is the valid hex code for red
gmap3.scatter(latitude_list, longitude_list, '#FF0000',
              size=40, marker=False)
# If a Google Maps API key is needed, set gmap3.apikey from a value kept
# outside the repository (e.g. an environment variable); never commit the key.

# make the vectors
scaling = .002 # parameter for scaling arrow size

# arrow() adds sin(angle) to lat and cos(angle) to lon, so 0 degrees runs
# toward increasing lon (east) and 90 degrees toward increasing lat (north).
arrow_angles = (
    ('north_delta', 90),
    ('west_delta', 180),
    ('south_delta', -90),
    ('east_delta', 0),
)

for _, sensor in model_df.iterrows():
    sensor_coords = (sensor['lat'], sensor['lon'])

    for delta_col, arrow_angle in arrow_angles:
        delta = sensor[delta_col]
        if delta == 0:
            # a delta of 0 means there is nothing to draw for this direction
            continue

        # each arrow is three two-point segments: shaft, barb, barb
        for segment in arrow(sensor_coords, delta * scaling, arrow_angle):
            segment_lats = [point[0] for point in segment]
            segment_lons = [point[1] for point in segment]
            gmap3.polygon(segment_lats,
                          segment_lons,
                          edge_color="red", edge_width=1,
                          face_color="red", face_alpha=0.1)

# draw the map and send to file
gmap3.draw("data/model_map.html")
print("Done")

Done
