# Initial naive modeling
This is an effort to create a simple vector field to see whether the discontinuities are plausible source locations. Each sensor location in the bounding box becomes a node with four vectors attached, one at each of the cardinal points of the compass. If the wind is blowing to the North, the vector attached to the north side of the node will be shown as pointing away from the node (an emitter) if the pollution is elevated; otherwise the vector will be shown pointing toward the node (a sink). In this scenario, we would expect to see the following pattern, where P is a source of pollution and N are nodes:  
<-- N <-- P --> N -->

In [1]:
# import libraries
import pandas as pd
import numpy as np 
import warnings
warnings.filterwarnings('ignore')

In [None]:
# set bounding box
#UP_LEFT = # (lat, lon)
#UP_RIGHT = # (lat, lon)
#DOWN_RIGHT = # (lat, lon)
#DOWN_LEFT = # (lat, lon)

In [None]:
# set timeframe
#START_DATE = # begin date to start taking data
#END_DATE = # end date to stop taking data
#START_HOUR = # hour EACH DAY to start, this allows us to control for time of day effects
#END_HOUR = # hour EACH DAY to end, this allows us to control for time of day effects

In [None]:
# FUTURE BUILDOUT: get data function

# month_df = get_data(UP_LEFT, UP_RIGHT, DOWN_RIGHT, DOWN_LEFT, START_DATE, END_DATE, START_HOUR, END_HOUR)
# this will call a get_data function that takes the bounding box and timeframe and returns cleaned data


In [2]:
# for now, use one day of data from parquet file
# grab sep27_full.parquet from the shared google drive

# Ben's local path to the parquet file
# (a relative path, so it is resolved against the notebook's working directory;
# point this at wherever you saved sep27_full.parquet)
datafolder = "../my_stash/data"

In [3]:
# load the single-day sample (sep27_full.parquet) from the configured data folder
data_df = pd.read_parquet(f"{datafolder}/sep27_full.parquet")

In [4]:
# drop every row that has no 2_5um reading
data_df = data_df.dropna(subset=['2_5um'])

In [5]:
# per-sensor mean of the 2_5um readings, copied onto every row of that sensor
average_col = data_df.groupby('sensor_id')['2_5um'].mean()
data_df = (
    data_df
    .set_index('sensor_id')          # index by sensor so the means align on sensor_id
    .assign(avg_2_5um=average_col)
    .reset_index()                   # sensor_id becomes a regular column again
)

In [6]:
# define direction degree range
# each compass sector is an inclusive (start, end) pair of degrees
NORTH = (316, 45)    # crosses 0/360, so start > end
EAST = (46, 135)
SOUTH = (136, 225)
WEST = (226, 315)

In [7]:
# preview the first rows; the output should now include the avg_2_5um column added above
data_df.head()

Unnamed: 0,sensor_id,0_3um,0_5um,1_0um,2_5um,5_0um,10_0um,pm1_0,pm10_0,created,...,wind_direction,wind_speed,gusts,gust_speed,variable_winds,variable_wind_info,sys_maint_reqd,num_fields,datetime,avg_2_5um
0,16939,935.56,270.69,45.63,7.76,2.35,0.0,4.57,8.82,201909270000,...,260,10.0,0.0,,0.0,,0.0,23.0,201909300000.0,5.132586
1,16939,935.56,270.69,45.63,7.76,2.35,0.0,4.57,8.82,201909270000,...,250,10.0,0.0,,0.0,,0.0,20.0,201909300000.0,5.132586
2,16939,842.38,244.69,51.99,9.53,3.35,0.0,3.78,8.69,201909270010,...,270,10.0,0.0,,0.0,,0.0,19.0,201909300000.0,5.132586
3,16939,842.38,244.69,51.99,9.53,3.35,0.0,3.78,8.69,201909270010,...,250,13.0,1.0,17.0,0.0,,0.0,20.0,201909300000.0,5.132586
4,16939,832.65,247.2,41.0,6.56,2.14,1.1,3.93,8.15,201909270020,...,260,10.0,0.0,,0.0,,0.0,22.0,201909300000.0,5.132586


In [11]:
# go through the dataframe and add new categorical column that indicates direction:
# North, South, East, West, No wind, Missing, ERROR
#
# NOTE(review): meteorological wind_direction is conventionally the direction the
# wind blows FROM -- confirm that this matches the "blowing to" wording in the intro.


def _in_compass_range(degree, bounds):
    """Return True when *degree* falls inside the inclusive (start, end) *bounds*.

    A pair whose start is greater than its end (NORTH = (316, 45)) wraps through
    0/360, so it is tested as two pieces: start..360 and 0..end. A plain
    ``start <= degree <= end`` test can never match such a pair.
    """
    start, end = bounds
    if start <= end:
        return start <= degree <= end
    return start <= degree <= 360 or 0 <= degree <= end


def _compass_label(direction, speed):
    """Classify one reading.

    Returns 'Missing' when *direction* cannot be converted to an int,
    'No wind' when *speed* equals 0, the matching sector name otherwise,
    and 'ERROR' when the degree lies outside every sector (e.g. < 0 or > 360).
    """
    try:
        degree = int(direction)
    except (TypeError, ValueError, OverflowError):
        # int() raises ValueError for NaN / non-numeric text, TypeError for None,
        # and OverflowError for infinities; anything else is a real bug and
        # should surface instead of being counted as 'Missing'.
        return 'Missing'
    if speed == 0:
        return 'No wind'
    sectors = (('North', NORTH), ('East', EAST), ('South', SOUTH), ('West', WEST))
    for label, bounds in sectors:
        if _in_compass_range(degree, bounds):
            return label
    return 'ERROR'


# Walk the two columns directly instead of calling data_df.loc[row] per record:
# this avoids building a full row Series for every row and no longer depends
# on the frame having a 0..n-1 index.
wind_compass = [
    _compass_label(direction, speed)
    for direction, speed in zip(data_df['wind_direction'], data_df['wind_speed'])
]
data_df['wind_compass'] = wind_compass

In [12]:
# take a look at the counts. this is one day, so it may not show all possibilities.
# NOTE: count() reports the number of non-null values per column within each group,
# so columns with missing data (e.g. gust_speed) show fewer than the group's row count.
data_df.groupby('wind_compass').count()

Unnamed: 0_level_0,sensor_id,0_3um,0_5um,1_0um,2_5um,5_0um,10_0um,pm1_0,pm10_0,created,...,wind_direction,wind_speed,gusts,gust_speed,variable_winds,variable_wind_info,sys_maint_reqd,num_fields,datetime,avg_2_5um
wind_compass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Missing,70,70,70,70,70,70,70,70,70,70,...,66,30,66,6,66,66,66,66,66,70
No wind,6,6,6,6,6,6,6,6,6,6,...,6,6,6,0,6,6,6,6,6,6
South,4880,4880,4880,4880,4880,4880,4880,4880,4880,4880,...,4880,4880,4880,1156,4880,4880,4880,4880,4880,4880
West,140608,140608,140608,140608,140608,140608,140608,140608,140608,140608,...,140608,140608,140608,12051,140608,140608,140608,140608,140608,140608


In [None]:
# calculate average by compass counts

# similar to above but need to do additional groupby on new categorical column 'wind_compass'
# loop through each record, do a trig calculation based on angle to true direction, and sum up. 
# divide for avg and subtract the mean for the sensor from this average to get a +/- vector
# add the five different vector values in a new column called 'vector'

In [1]:
# map the sensors on google maps api

In [None]:
# map the four vectors emerging from each direction off the sensor point