# Build static feature file

In [78]:
# libraries
import pandas as pd
import geopy
from geopy.distance import distance

In [17]:
# define bounding box
UP_LEFT = (38.008050, -122.536985)    
UP_RIGHT = (38.008050, -122.186437)   
DOWN_RIGHT = (37.701933, -122.186437) 
DOWN_LEFT = (37.701933, -122.536985) 

In [18]:
# MAYBE DON'T RELY ON A CSV
# load in the grid from csv file
vsensors_df = pd.read_csv('./data/500m_grid.csv')

In [19]:
# MAYBE DON'T RELY ON A CSV
# get elevations from csv file and merge into dataframe
def getElevations(data_df, filename):
    """
    This function takes as input a dataframe with latitutes and longitudes and 
    the filepath to a csv file with the same latitudes and longitudes and elevations.
    It returns a dataframe that includes the elevations. 
    """
    
    elev_df = pd.read_csv(filename, header='infer')
    elev_df = elev_df[['x','y','elevation']]
    
    data_df = pd.merge(data_df, elev_df)
    if data_df.elevation.isna().sum() > 0:
        print("Error. Not all elevations were found.")
    
    return(data_df)

vsensors_df = getElevations(vsensors_df, './data/grid_elevations.csv')

# set a few grid points at seashore and read from TIF as ocean to 0
vsensors_df.elevation.replace(-32768, 0, inplace=True)

In [20]:
# remove virtual sensors that are in water
vsensors_df = vsensors_df[vsensors_df.in_water == False]

In [21]:
# MAYBE DON'T RELY ON A CSV
# add NDVI vegetation data
vsensors_df.head()

Unnamed: 0,min_lat,max_lat,min_lon,max_lon,x,y,center_lat,center_lon,in_water,ndvi,elevation
0,37.822662,37.82621,-122.536985,-122.532493,0,34,37.824436,-122.534739,False,-2000,17
1,37.82621,37.829757,-122.536985,-122.532493,0,35,37.827984,-122.534739,False,-2000,-32768
2,37.829757,37.833305,-122.536985,-122.532493,0,36,37.831531,-122.534739,False,-2000,-32768
3,37.833305,37.836852,-122.536985,-122.532493,0,37,37.835079,-122.534739,False,5159,55
4,37.836852,37.8404,-122.536985,-122.532493,0,38,37.838626,-122.534739,False,7053,132


In [22]:
def calc_distance(origin, destination):
    """
    Input: two tuples, each containing (lat, lon)
    Output: distance between the two coordinates in kilometers
    """

    origin = geopy.point.Point(origin)
    destination = geopy.point.Point(destination)
    distance = geopy.distance.distance(origin, destination).km

    return distance

In [24]:
# CURRENTLY THIS TAKES 50 MINUTES. SPEED UP WITH LAMBDAS POSSIBLE?
# for each virtual sensor, find k closest
getreading_df = pd.read_json(path_or_buf="https://www.purpleair.com/json") # get fresh data from purple air
k = 5

# get k-NN sensor IDs
kNN_list = []
for grid_coord in rsensors_df.columns[4:]:
    temp_df = rsensors_df[['ID', grid_coord]]
    temp_df.sort_values(by=[grid_coord], axis=0, ascending=True, inplace=True)
    kNN_list.append(list(temp_df.ID[0:k]))
vsensors_df['NN_list'] = kNN_list 

# reduce data to what we need
rsensors_df = pd.DataFrame.from_records(getreading_df.results)
rsensors_df = rsensors_df[['ID', 'Lat', 'Lon']]
rsensors_df = rsensors_df[(rsensors_df.Lat <= UP_LEFT[0]) & (rsensors_df.Lat >= DOWN_LEFT[0]) & 
                    (rsensors_df.Lon >= UP_LEFT[1]) & (rsensors_df.Lon <= UP_RIGHT[1])] # just keep sensors in bounding box
rsensors_df['coords'] = list(zip(rsensors_df.Lat, rsensors_df.Lon))

# remove all the double entries for sensors
rsensors_df.drop_duplicates(subset ="coords", inplace = True) 

# build dataframe of distances between real and virtual sensors
empty_col = [100000] * len(rsensors_df) # put out of bounds large value in new empty column
for row in range(len(vsensors_df)):
    col_name = (vsensors_df.iloc[row].center_lat, vsensors_df.iloc[row].center_lon)
    col_name = str(col_name)
    rsensors_df[col_name] = empty_col

for grid_coord in rsensors_df.columns[4:]:
    for sensor_coord in rsensors_df.coords:
        distance = calc_distance(tuple(float(s) for s in grid_coord.strip("()").split(",")), sensor_coord)
        rsensors_df[grid_coord][rsensors_df.coords == sensor_coord] = distance

# get k-NN sensor IDs
kNN_list = []
for grid_coord in rsensors_df.columns[4:]:
    temp_df = rsensors_df[['ID', grid_coord]]
    temp_df.sort_values(by=[grid_coord], axis=0, ascending=True, inplace=True)
    kNN_list.append(list(temp_df.ID[0:k]))
vsensors_df['NN_list'] = kNN_list

NameError: name 'k' is not defined

In [46]:
# add closest epa sensor

# Read historical epa data from s3
bucket = "capstone-air-pollution"
file_name = "EPA/historical_PM25.csv"  # historical
s3 = boto3.client('s3') 
obj = s3.get_object(Bucket= bucket, Key= file_name) 
epa_df = pd.read_csv(obj['Body']) 

#either use site_name or full_aqs_code... confirm that full_aqs_code is a unique, numeric id for station

epa_dict = {}
for station in epa_df.SiteName.unique():
    row = epa_df[epa_df.SiteName == station].iloc[0]
    epa_dict[station] = (row.Latitude, row.Longitude)

closest_epa = []
for vsensor in range(len(vsensors_df)):
    v_coords = (vsensors_df.iloc[vsensor].center_lat, vsensors_df.iloc[vsensor].center_lon)
    nn_distance = 0
    nn_station = ""
    for station in epa_dict.keys():
        distance = calc_distance(epa_dict[station], v_coords)
        if (nn_station == "" or nn_distance > distance):
            nn_distance = distance
            nn_station = station
    closest_epa.append(nn_station)

vsensors_df['closest_epa'] = closest_epa

# this is very fast, but as a lambda:
# vsensor['closest_epa'] = vsensors_df.apply(lambda x: closest_epa(x), axis=1)
# [bay_and_ocean.contains(pt) for pt in boxes_as_points]

In [92]:
# add closest noaa sensor

# read in all ASOS stations w/ lat lon info
filepath = "ftp://ftp.ncdc.noaa.gov/pub/data/noaa/isd-history.txt"
stations = [] # an array of each read line
for station in pd.read_csv(filepath_or_buffer=filepath , encoding='utf-8', chunksize=1):
    stations.append(station.iloc[0,0])

# parse ugly text file to dataframe    
station_cols = ['usaf','wban_number','descriptor', 'lat','lon','elev_m','begin_date','end_date']
stations = stations[17:] # remove header (meta data and column names)
station_data = [] # an array of arrays, inner arrays are all data for one record, outer array is all records
for station in stations:
    data_start = 0 # position after awk location data in each record
    USAF = station[0:6]
    WBAN = station[7:12]
    data_start = station.find('+')
    location_string = station[12:data_start]
    rest_of_data = station[data_start:].split()    
    station_record = [USAF, WBAN, location_string] + rest_of_data
    station_data.append(station_record)
NOAA_df = pd.DataFrame(station_data, columns = station_cols)

# keep on the current Bay Area sensors
NOAA_df = NOAA_df.astype({'lat': float, 'lon': float, 'wban_number': int, 'end_date': str})
NOAA_df = NOAA_df[(NOAA_df.lat <= UP_LEFT[0]) & (NOAA_df.lat >= DOWN_LEFT[0]) & 
                    (NOAA_df.lon >= UP_LEFT[1]) & (NOAA_df.lon <= UP_RIGHT[1])] # just keep sensors in bounding box
NOAA_df = NOAA_df[NOAA_df.end_date.str.contains("2019")] # just get current sensors, sorta

NOAA_dict = {}
for station in NOAA_df.descriptor.unique():
    row = NOAA_df[NOAA_df.descriptor == station].iloc[0]
    NOAA_dict[station] = (row.lat, row.lon)

closest_NOAA = []
for vsensor in range(len(vsensors_df)):
    v_coords = (vsensors_df.iloc[vsensor].center_lat, vsensors_df.iloc[vsensor].center_lon)
    nn_distance = 0
    nn_station = ""
    for station in NOAA_dict.keys():
        distance = calc_distance(NOAA_dict[station], v_coords)
        if (nn_station == "" or nn_distance > distance):
            nn_distance = distance
            nn_station = station
    closest_NOAA.append(nn_station)

vsensors_df['closest_NOAA'] = closest_NOAA


In [94]:
vsensors_df.to_csv(path_or_buf="./data/static_data.csv", index=True)

DO I NEED THESE?
import numpy as np
import json
from geopy import distance

from time import sleep
import shapely.geometry
import pyproj
import geopandas as gpd
from matplotlib import pyplot as plt
from shapely.geometry import Point
import datetime
from datetime import date, timedelta
from os import path
import statistics
import boto3
import s3fs
import sys
from fastparquet import ParquetFile
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import gmplot
import math
import time
from collections import defaultdict

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 500)

csv built that has:
    virtual sensor grid fields
    elevation
    ocean yes/no
    closest epa/noaa
    NDVI
    closest 5 sensors to virtual sensor grid
static_data.csv
    
each hour, build dataframe from csv adding:
    previous 7 days NOAA wind readings -- (lookup by x,y in csv)
    previous 7 EPA readings -- (lookup by x,y in csv)
    getData() for 7 days -- previous 7 days humidity/temp from nearest PA sensor (lookup by sensor_id in csv, top 5)
    ---- grab the point in time that falls on the hour, so 24 * 7 = 168 time point observations going into model
X_data_df
    
from model:
    get predicted 168 PM2.5 values for each virtual sensor
    average the 168 values down to one
    sort highest to lowest readings
most_polluted.csv
    
Freshest possible:
    NOAA wind
    EPA readings (hourly)
    humidity & temp from nearest PA sensor
    
Occasional refresh:
    NDVI
    closest sensor to virtual sensor grid
    
Never refresh:
    elevation
    ocean yes/no
    closest epa/noaa to virtual sensor grid 
    


