# Initial naive modeling
This is an attempt to build a simple vector field to see whether its discontinuities are plausible pollution-source locations. Each sensor location in the bounding box becomes a node with four vectors attached, one at each of the cardinal points of the compass. If the wind is blowing to the north, the vector attached to the north side of the node is drawn pointing away from the node (an emitter) when the pollution reading is elevated; otherwise the vector is drawn pointing toward the node (a sink). In this scenario, where P is a source of pollution and N are nodes, we would expect to see:  
<-- N <-- P --> N -->

In [1]:
# import libraries
import pandas as pd
import numpy as np 
import warnings
warnings.filterwarnings('ignore')

In [2]:
# set bounding box
#UP_LEFT = # (lat, lon)
#UP_RIGHT = # (lat, lon)
#DOWN_RIGHT = # (lat, lon)
#DOWN_LEFT = # (lat, lon)

In [3]:
# set timeframe
#START_DATE = # begin date to start taking data
#END_DATE = # end date to stop taking data
#START_HOUR = # hour EACH DAY to start, this allows us to control for time of day effects
#END_HOUR = # hour EACH DAY to end, this allows us to control for time of day effects

In [4]:
# FUTURE BUILDOUT: get data function

# month_df = get_data(UP_LEFT, UP_RIGHT, DOWN_RIGHT, DOWN_LEFT, START_DATE, END_DATE, START_HOUR, END_HOUR)
# this is a call to the get_data function, which will take the bounding box and timeframe and return cleaned data


In [5]:
# for now, use one day of data from parquet file
# grab sep27_full.parquet from the shared google drive

# Ben's local path to the parquet file
datafolder = "../my_stash/data"

In [None]:
# Build the path to the one-day sample file, then load it.
parquet_path = "{}/sep27_full.parquet".format(datafolder)
data_df = pd.read_parquet(parquet_path)

### copied in Angshuman's data importing code now because my import wasn't working. I'll fix this soon. -Mark

In [26]:
#!/usr/bin/env python3

import datetime
from datetime import date, timedelta
from os import path
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

# Class for throwing custom errors
class CustomError(Exception):
    """Error carrying a human-readable message for invalid input.

    The message passed to the constructor is kept on ``message`` and is
    what ``str()`` returns for the exception.
    """

    def __init__(self, m):
        """Store the message ``m`` on the exception."""
        super().__init__(m)
        self.message = m

    def __str__(self):
        """Return the stored message."""
        return self.message

# Helper function for getting dates in a given range
def getDates(start, end):
    """Return every calendar day from ``start`` to ``end``, inclusive.

    Both bounds are strings in ``YYYY/MM/DD`` form. The result is a list of
    ``YYYYMMDD`` strings in ascending order; it is empty when ``end`` falls
    before ``start``.
    """
    input_format = "%Y/%m/%d"
    first_day = datetime.datetime.strptime(start, input_format).date()
    last_day = datetime.datetime.strptime(end, input_format).date()

    # Number of whole days between the two bounds (negative if reversed).
    span_days = (last_day - first_day).days

    return [
        (first_day + timedelta(days=offset)).strftime("%Y%m%d")
        for offset in range(span_days + 1)
    ]

# Helper function for loading data into a dataframe
def loadDataframe(files):
    """Concatenate the parquet files named by ``files`` into one dataframe.

    Each entry of ``files`` is a path without the ``.parquet`` extension.
    Entries whose file does not exist are reported on stdout and skipped.
    The result starts from an empty frame with the expected column set, so
    those columns are present even when no file could be loaded.
    """
    expected_columns = [
        '0_3um', '0_5um', '1_0um', '2_5um', '5_0um', '10_0um', 'pm1_0',
        'pm10_0', 'created', 'pm1_0_atm', 'pm2_5_atm', 'pm10_0_atm',
        'uptime', 'rssi', 'temperature', 'humidity', 'pm2_5_cf_1', 'a_h',
        'device_loc_typ', 'high_reading_flag', 'hidden', 'sensor_id',
        'sensor_name', 'lat', 'lon', 'parent_id', 'is_owner', 'city',
        'county', 'zipcode', 'created_at', 'year', 'month', 'day', 'hour',
        'minute', 'wban_number', 'call_sign', 'call_sign2', 'interval',
        'call_sign3', 'zulu_time', 'report_modifier', 'wind_data',
        'wind_direction', 'wind_speed', 'gusts', 'gust_speed',
        'variable_winds', 'variable_wind_info', 'sys_maint_reqd',
        'epa_pm25_unit', 'epa_pm25_value', 'raw_concentration', 'aqi',
        'category', 'site_name', 'agency_name', 'full_aqs_code',
        'intl_aqs_code',
    ]
    combined = pd.DataFrame(columns=expected_columns)

    for stem in files:
        parquet_path = "{}.parquet".format(stem)
        if not path.exists(parquet_path):
            print("File {} does not exist".format(parquet_path))
            continue
        day_frame = pd.read_parquet(parquet_path)
        combined = pd.concat([combined, day_frame], ignore_index=True)

    return combined

# Main function for getting data
def get_data(UP_LEFT, UP_RIGHT, DOWN_RIGHT, DOWN_LEFT, START_DATE, END_DATE, START_HOUR, END_HOUR):
    """Load the daily parquet files and filter them by area and hour of day.

    Parameters
    ----------
    UP_LEFT, DOWN_RIGHT : tuple
        Accepted for a symmetric signature; not used by the filter.
    UP_RIGHT : tuple
        ``(lat, lon)`` corner supplying the maximum latitude and longitude.
    DOWN_LEFT : tuple
        ``(lat, lon)`` corner supplying the minimum latitude and longitude.
    START_DATE, END_DATE : str
        Inclusive date range in ``YYYY/MM/DD`` form.
    START_HOUR, END_HOUR : int or str
        Inclusive hour-of-day window. Numeric strings such as ``'10'`` are
        accepted and converted to numbers before comparing.

    Returns
    -------
    pandas.DataFrame or None
        Rows strictly inside the bounding box whose ``hour`` lies in the
        window, re-indexed from 0. On any error while validating, loading
        or filtering, the error is printed and ``None`` is returned.

    Raises
    ------
    ValueError
        If a date does not reduce to an integer once the slashes are
        removed (this conversion happens before the best-effort section).
    """
    # Create variables from parameters
    startfile = int(START_DATE.replace('/', ''))
    endfile = int(END_DATE.replace('/', ''))
    lat_min = DOWN_LEFT[0]
    lat_max = UP_RIGHT[0]
    lon_min = DOWN_LEFT[1]
    lon_max = UP_RIGHT[1]

    try:
        if startfile > endfile:
            raise CustomError("INPUT ERROR: Start Date is greater than End Date")

        # Callers pass the hours as strings ('10') as well as numbers; convert
        # once so the window is always compared numerically, never as text.
        start_hour = pd.to_numeric(START_HOUR)
        end_hour = pd.to_numeric(END_HOUR)

        df = loadDataframe(getDates(START_DATE, END_DATE))

        # Filter data for input bounding box (bounds are exclusive)
        inside_box = ((df.lat > lat_min) & (df.lat < lat_max)
                      & (df.lon > lon_min) & (df.lon < lon_max))
        df = df[inside_box]

        # Filter data for input hours (bounds are inclusive). The column is
        # coerced too, so a text-typed 'hour' column is handled the same way;
        # values that are not numbers become NaN and are dropped by the mask.
        hours = pd.to_numeric(df.hour, errors='coerce')
        df = df[(hours >= start_hour) & (hours <= end_hour)]
        df.reset_index(inplace=True, drop=True)
        return df
    except Exception as e:
        # Best-effort contract kept from the original: report the problem and
        # hand back None instead of propagating.
        print(e)
        return None


In [29]:
# import sys, os
# sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'SingleDayAnalysis', 'getData'))
# sys.path.append('..SingleDayAnalysis')
# from getData import get_data

# from ..SingleDayAnalysis/getData import get_data

# Corners of the bounding box, each as a (lat, lon) tuple.
UP_LEFT = (38.008050, -122.536985)
UP_RIGHT = (38.008050, -122.186437)
DOWN_RIGHT = (37.701933, -122.186437)
DOWN_LEFT = (37.701933, -122.536985)
# Date range as 'YYYY/MM/DD' strings; start and end are the same day here.
START_DATE = '2019/09/27'
END_DATE = '2019/09/27'
# Hour-of-day window applied to every day in the range.
# NOTE(review): these are strings, not ints -- confirm that get_data and the
# data's 'hour' column agree on the type being compared.
START_HOUR = '10'
END_HOUR = '18'

# Load the rows for the bounding box and timeframe above.
df = get_data(UP_LEFT, UP_RIGHT, DOWN_RIGHT, DOWN_LEFT, START_DATE, END_DATE, START_HOUR, END_HOUR)

ValueError: invalid literal for int() with base 10: '..datapurpleair20190927'

In [None]:
# keep only the rows that have a 2_5um reading
data_df = data_df.dropna(subset=['2_5um'])

In [None]:
# calculate average counts by sensor
# Mean of the 2_5um column for each sensor, indexed by sensor_id.
average_col = data_df.groupby(['sensor_id'])['2_5um'].mean() 
# Index the frame by sensor_id so the assignment below lines each row up
# with the mean of its own sensor.
data_df = data_df.set_index(['sensor_id'])
data_df['avg_2_5um'] = average_col
# Turn sensor_id back into a regular column; this also leaves the frame with
# a fresh default integer index.
data_df = data_df.reset_index() 

In [None]:
# define direction degree range
# Each tuple is (first degree, last degree) of a compass sector; both ends
# are treated as part of the sector.
# NORTH crosses the 360 -> 0 wrap-around, so its first value is larger than
# its second.
NORTH = (316,45)
EAST = (46,135)
SOUTH = (136,225)
WEST = (226,315)

In [None]:
data_df.head()

In [None]:
# go through the dataframe and add new categorical column that indicates direction: 
# North, South, East, West, No wind, Missing, ERROR

def _compass_sector(degree):
    """Return the compass label for a wind direction given in whole degrees.

    Sectors come from the NORTH/EAST/SOUTH/WEST (low, high) tuples, with both
    ends inclusive. A sector whose low bound is greater than its high bound
    (NORTH) wraps through 0/360. Anything outside 0..360, or not covered by a
    sector, is labelled 'ERROR'.
    """
    if not 0 <= degree <= 360:
        return 'ERROR'
    sectors = (('North', NORTH), ('East', EAST), ('South', SOUTH), ('West', WEST))
    for label, (low, high) in sectors:
        if low <= high:
            inside = low <= degree <= high
        else:
            # Wrapping sector: a plain "low <= degree <= high" test can never
            # be true here, so accept either side of the 0/360 seam.
            inside = degree >= low or degree <= high
        if inside:
            return label
    return 'ERROR'


# Walk the two columns directly instead of looking rows up by label: this
# does not depend on the frame having a 0..n-1 index and avoids building a
# row object for every record.
wind_compass = []
for direction, speed in zip(data_df['wind_direction'], data_df['wind_speed']):
    try:
        degree = int(direction)
    except (TypeError, ValueError, OverflowError):
        # None, NaN, infinities and non-numeric text all count as missing.
        wind_compass.append('Missing')
        continue
    if speed == 0:
        wind_compass.append('No wind')
    else:
        wind_compass.append(_compass_sector(degree))
data_df['wind_compass'] = wind_compass

In [None]:
# take a look at the counts. this is one day, so it may not show all possibilities.
data_df.groupby('wind_compass').count()

In [None]:
# calculate average by compass counts

# similar to above but need to do additional groupby on new categorical column 'wind_compass'
# loop through each record, do a trig calculation based on angle to true direction, and sum up. 
# divide for avg and subtract the mean for the sensor from this average to get a +/- vector
# add the five different vector values in a new column called 'vector'

In [None]:
# map the sensors on google maps api

In [None]:
# map the four vectors emerging from each direction off the sensor point