# Build static feature file

In [109]:
# libraries
import pandas as pd
import boto3
import geopy
from geopy.distance import distance
import warnings
warnings.filterwarnings('ignore')

In [110]:
# define bounding box: each corner is a (latitude, longitude) pair built from the four edges
NORTH_LAT, SOUTH_LAT = 38.063446, 37.2781261
WEST_LON, EAST_LON = -122.683496, -121.814281

UP_LEFT = (NORTH_LAT, WEST_LON)
UP_RIGHT = (NORTH_LAT, EAST_LON)
DOWN_RIGHT = (SOUTH_LAT, EAST_LON)
DOWN_LEFT = (SOUTH_LAT, WEST_LON)

# old, smaller bounding box
#UP_LEFT = (38.008050, -122.536985)    
#UP_RIGHT = (38.008050, -122.186437)   
#DOWN_RIGHT = (37.701933, -122.186437) 
#DOWN_LEFT = (37.701933, -122.536985) 

In [111]:
# load in the grid from csv file: one row per grid cell, with its bounds,
# center_lat/center_lon, an in_water flag and an ndvi value already present
vsensors_df = pd.read_csv('./data/bigger_500m_grid.csv')

In [51]:
# preview the first rows of the freshly loaded grid
vsensors_df.head()

Unnamed: 0,min_lat,max_lat,min_lon,max_lon,x,y,center_lat,center_lon,in_water,ndvi
0,37.278126,37.2817,-122.683496,-122.679004,0,0,37.279913,-122.68125,True,-2000
1,37.2817,37.285274,-122.683496,-122.679004,0,1,37.283487,-122.68125,True,-2000
2,37.285274,37.288847,-122.683496,-122.679004,0,2,37.28706,-122.68125,True,-2000
3,37.288847,37.292421,-122.683496,-122.679004,0,3,37.290634,-122.68125,True,-2000
4,37.292421,37.295994,-122.683496,-122.679004,0,4,37.294207,-122.68125,True,-1999


In [112]:
# get elevations from satellite image
import rasterio

# (center_lon, center_lat) for every grid cell, in row order.
# Built column-wise; looking each row up with .iloc is far slower on a large grid.
coords = list(zip(vsensors_df.center_lon, vsensors_df.center_lat))

# sample the raster at every coordinate and keep the first value of each sample
with rasterio.open('data/srtm_12_05.tif') as src:
    elevs = [val[0] for val in src.sample(coords)]

# elevation next to the coordinate it was sampled at (same row order as vsensors_df)
elevs_df = pd.DataFrame({'elevation': elevs, 'lat': vsensors_df.center_lat, 'lon': vsensors_df.center_lon})

In [113]:
# cast the sampled elevations to int
# NOTE(review): the original note here was about negative spots believed to be ocean, but
# this cast leaves negative values as they are. The -32768 / -32767 values are only replaced
# later, on vsensors_df, so elevs_df is saved below with them still in place - confirm intended.
elevs_df.elevation = elevs_df.elevation.astype(int) 

In [114]:
# write the elevation / lat / lon table to disk
elevs_df.to_csv("./data/big_grid_sensor_elevations.csv")

In [115]:
# attach the sampled elevations to the grid; elevs was built in vsensors_df row order
# (this uses the raw sampled list, not the int-cast column of elevs_df)
vsensors_df['elevation'] = elevs

In [116]:
# remove virtual sensors that are in water
# .copy() makes the result an independent frame: later cells add columns to it
# (NN_list, closest_epa, closest_NOAA), which must not be writes to a slice of the full grid
vsensors_df = vsensors_df[vsensors_df.in_water == False].copy()

In [117]:
# preview the land-only grid
# (no NDVI vegetation data is added in this cell: ndvi is already a column of the loaded csv)
vsensors_df.head()

Unnamed: 0,min_lat,max_lat,min_lon,max_lon,x,y,center_lat,center_lon,in_water,ndvi,elevation
178,37.911599,37.915142,-122.683496,-122.679004,0,178,37.913371,-122.68125,False,5662,-1
182,37.925772,37.929315,-122.683496,-122.679004,0,182,37.927544,-122.68125,False,4825,14
183,37.929315,37.932858,-122.683496,-122.679004,0,183,37.931086,-122.68125,False,7641,43
184,37.932858,37.9364,-122.683496,-122.679004,0,184,37.934629,-122.68125,False,7641,72
185,37.9364,37.939943,-122.683496,-122.679004,0,185,37.938172,-122.68125,False,8197,175


In [118]:
# summary statistics of the numeric columns, used here to spot out-of-range values
vsensors_df.describe()

Unnamed: 0,min_lat,max_lat,min_lon,max_lon,x,y,center_lat,center_lon,ndvi,elevation
count,29323.0,29323.0,29323.0,29323.0,29323.0,29323.0,29323.0,29323.0,29323.0,29323.0
mean,37.68004,37.683594,-122.153268,-122.148776,118.049415,112.86011,37.681817,-122.151022,4297.998636,-122.17222
std,0.232086,0.232075,0.22286,0.22286,49.617235,65.280391,0.232081,0.22286,2279.298629,2994.453111
min,37.278126,37.2817,-122.683496,-122.679004,0.0,0.0,37.279913,-122.68125,-2000.0,-32768.0
25%,37.474438,37.478002,-122.319678,-122.315187,81.0,55.0,37.47622,-122.317433,2918.0,21.0
50%,37.695117,37.698671,-122.122049,-122.117557,125.0,117.0,37.696894,-122.119803,4176.0,108.0
75%,37.893878,37.897423,-121.964844,-121.960352,160.0,173.0,37.895651,-121.962598,5905.0,230.0
max,38.060282,38.063819,-121.816622,-121.81213,193.0,220.0,38.062051,-121.814376,9921.0,1123.0


In [123]:
# handle weird negative values: the summary above shows -32768 as the elevation minimum,
# so -32768 / -32767 in the elevation column are replaced with 0.
# Only elevation is touched, so an equal number in any other column is never rewritten.
vsensors_df = vsensors_df.assign(
    elevation=vsensors_df['elevation'].replace([-32768, -32767], 0)
)

In [124]:
# re-check the summary statistics after replacing the -32768 / -32767 values
vsensors_df.describe()

Unnamed: 0,min_lat,max_lat,min_lon,max_lon,x,y,center_lat,center_lon,ndvi,elevation
count,29323.0,29323.0,29323.0,29323.0,29323.0,29323.0,29323.0,29323.0,29323.0,29323.0
mean,37.68004,37.683594,-122.153268,-122.148776,118.049415,112.86011,37.681817,-122.151022,4297.998636,150.494015
std,0.232086,0.232075,0.22286,0.22286,49.617235,65.280391,0.232081,0.22286,2279.298629,154.902819
min,37.278126,37.2817,-122.683496,-122.679004,0.0,0.0,37.279913,-122.68125,-2000.0,-24.0
25%,37.474438,37.478002,-122.319678,-122.315187,81.0,55.0,37.47622,-122.317433,2918.0,21.0
50%,37.695117,37.698671,-122.122049,-122.117557,125.0,117.0,37.696894,-122.119803,4176.0,108.0
75%,37.893878,37.897423,-121.964844,-121.960352,160.0,173.0,37.895651,-121.962598,5905.0,230.0
max,38.060282,38.063819,-121.816622,-121.81213,193.0,220.0,38.062051,-121.814376,9921.0,1123.0


In [126]:
def calc_distance(origin, destination):
    """
    Distance in kilometers between two coordinates.

    Input: origin and destination, each a (lat, lon) tuple
    Output: the .km value of geopy.distance.distance between the two points
    """
    start_point = geopy.point.Point(origin)
    end_point = geopy.point.Point(destination)
    return geopy.distance.distance(start_point, end_point).km

In [127]:
# For each virtual sensor, find the IDs of the k closest real sensors.
# NOTE: this still makes one calc_distance call per (virtual sensor, real sensor) pair,
# so it takes a long time with the big map.
getreading_df = pd.read_json(path_or_buf="https://www.purpleair.com/json") # get fresh data from purple air
k = 5  # number of nearest real sensors kept per virtual sensor

# reduce data to what we need
rsensors_df = pd.DataFrame.from_records(getreading_df.results)
rsensors_df = rsensors_df[['ID', 'Lat', 'Lon']]
in_bounding_box = (
    (rsensors_df.Lat <= UP_LEFT[0]) & (rsensors_df.Lat >= DOWN_LEFT[0])
    & (rsensors_df.Lon >= UP_LEFT[1]) & (rsensors_df.Lon <= UP_RIGHT[1])
)
rsensors_df = rsensors_df[in_bounding_box].copy()  # just keep sensors in bounding box
rsensors_df['coords'] = list(zip(rsensors_df.Lat, rsensors_df.Lon))

# remove all the double entries for sensors (first row per coordinate is kept)
rsensors_df = rsensors_df.drop_duplicates(subset="coords")

# plain lists are much cheaper to walk than DataFrame rows inside the nested loop
sensor_ids = rsensors_df.ID.tolist()
sensor_coords = rsensors_df.coords.tolist()

# get k-NN sensor IDs
# Distances are ranked per grid cell and then discarded, so no
# (real sensor x grid cell) table is kept and coordinates never pass through strings.
kNN_list = []
for grid_coord in zip(vsensors_df.center_lat, vsensors_df.center_lon):
    ranked = sorted(
        ((calc_distance(grid_coord, sensor_coord), sensor_id)
         for sensor_coord, sensor_id in zip(sensor_coords, sensor_ids)),
        key=lambda pair: pair[0],  # sort on distance only; ties keep rsensors_df order
    )
    kNN_list.append([sensor_id for _, sensor_id in ranked[:k]])
vsensors_df['NN_list'] = kNN_list

In [128]:
# add closest epa sensor

# Read historical epa data from s3
bucket = "capstone-air-pollution"
file_name = "EPA/historical_PM25.csv"  # historical
s3 = boto3.client('s3')
obj = s3.get_object(Bucket=bucket, Key=file_name)
epa_df = pd.read_csv(obj['Body'])

#either use site_name or full_aqs_code... confirm that full_aqs_code is a unique, numeric id for station

# station name -> (lat, lon), taken from the first row seen for each SiteName;
# rows without a SiteName are skipped because they cannot be matched to a station
epa_sites = epa_df.dropna(subset=['SiteName']).drop_duplicates(subset='SiteName')
epa_dict = dict(zip(epa_sites.SiteName, zip(epa_sites.Latitude, epa_sites.Longitude)))

# for every virtual sensor keep the name of the station with the smallest distance
# (the first station wins on ties; "" is stored when there are no stations at all)
closest_epa = []
for v_coords in zip(vsensors_df.center_lat, vsensors_df.center_lon):
    nn_station, nn_distance = "", float("inf")
    for station, station_coords in epa_dict.items():
        dist_km = calc_distance(station_coords, v_coords)
        if dist_km < nn_distance:
            nn_station, nn_distance = station, dist_km
    closest_epa.append(nn_station)

vsensors_df['closest_epa'] = closest_epa

# this is very fast, but as a lambda:
# vsensor['closest_epa'] = vsensors_df.apply(lambda x: closest_epa(x), axis=1)
# [bay_and_ocean.contains(pt) for pt in boxes_as_points]

In [129]:
# add closest noaa sensor

# read in all ASOS stations w/ lat lon info
filepath = "./data/maybe_noaa_sensors.csv"
NOAA_df = pd.read_csv(filepath)

# pull call signs and (lat, lon) out of the frame once, instead of doing
# row lookups for every (virtual sensor, station) comparison
noaa_stations = list(zip(NOAA_df['call_sign'], zip(NOAA_df['lat'], NOAA_df['lon'])))

# for every virtual sensor keep the call sign of the station with the smallest distance
# (the first station wins on ties; "" is stored when there are no stations at all)
closest_NOAA = []
for v_coords in zip(vsensors_df.center_lat, vsensors_df.center_lon):
    nn_station, nn_distance = "", float("inf")
    for call_sign, station_coords in noaa_stations:
        dist_km = calc_distance(station_coords, v_coords)
        if dist_km < nn_distance:
            nn_station, nn_distance = call_sign, dist_km
    closest_NOAA.append(nn_station)

vsensors_df['closest_NOAA'] = closest_NOAA


In [130]:
# write the static feature table; index=True also writes the DataFrame index,
# which still carries the row labels of the unfiltered grid
vsensors_df.to_csv(path_or_buf="./data/big_static_data.csv", index=True)

In [132]:
# preview the finished table, including NN_list, closest_epa and closest_NOAA
vsensors_df.head()

Unnamed: 0,min_lat,max_lat,min_lon,max_lon,x,y,center_lat,center_lon,in_water,ndvi,elevation,NN_list,closest_epa,closest_NOAA
178,37.911599,37.915142,-122.683496,-122.679004,0,178,37.913371,-122.68125,False,5662,-1,"[40763, 4793, 39815, 15251, 38851]",San Rafael,KSFO
182,37.925772,37.929315,-122.683496,-122.679004,0,182,37.927544,-122.68125,False,4825,14,"[4793, 15251, 13143, 40763, 4782]",San Rafael,KSFO
183,37.929315,37.932858,-122.683496,-122.679004,0,183,37.931086,-122.68125,False,7641,43,"[4793, 15251, 13143, 4782, 4766]",San Rafael,KSFO
184,37.932858,37.9364,-122.683496,-122.679004,0,184,37.934629,-122.68125,False,7641,72,"[15251, 4793, 4782, 13143, 4766]",San Rafael,KSFO
185,37.9364,37.939943,-122.683496,-122.679004,0,185,37.938172,-122.68125,False,8197,175,"[15251, 4793, 4782, 13143, 4766]",San Rafael,KSFO
