# Use k-NN model to predict worst air pollution locations on the grid

In [12]:
%matplotlib inline

from geopy.distance import distance
import pandas as pd
from time import sleep
import shapely.geometry
import pyproj
import geopandas as gpd
from matplotlib import pyplot as plt
from shapely.geometry import Point
import datetime
from datetime import date, timedelta
from os import path
import pandas as pd
import numpy as np
import statistics
import boto3
import s3fs
import sys
from fastparquet import ParquetFile
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import geopy
from geopy import distance
import gmplot

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 500)

In [13]:
# load in the grid from csv file
boxes = pd.read_csv('data/500m_grid.csv')

In [15]:
# load in the model from a pickle file
from joblib import dump, load
model = load('VirtualSensing/models/kNN_model.joblib') 

In [16]:
# create a features dataframe
X_data_df = boxes.copy(deep=True) 
X_data_df = X_data_df[X_data_df.in_water == False]
X_data_df = X_data_df.drop(columns = ['min_lat', 'max_lat', 'min_lon', 'max_lon', 'x','y','in_water'])
X_data_df.rename(columns={'center_lat': 'lat', 'center_lon': 'lon'}, inplace=True)



In [17]:
# add column for time_delta for kNN model
lats_to_add = list(X_data_df.lat)
lons_to_add = list(X_data_df.lon)
all_sensors = len(X_data_df)
max_time = (pd.Timestamp('2019-09-04 23:50:00') - pd.Timestamp('2019-09-01 00:00:00')) / np.timedelta64(1, 'm')
lat = [lats_to_add]
lon = [lons_to_add]

In [18]:
current_time = 0
times = [[current_time] * all_sensors]
while (current_time <= max_time):
    current_time += 10
    time = [current_time] * all_sensors
    times.append(time)
    lat.append(lats_to_add[:])
    lon.append(lons_to_add[:])


In [19]:
flat_lat = [item for sublist in lat for item in sublist]
flat_lon = [item for sublist in lon for item in sublist]
flat_times = [item for sublist in times for item in sublist]

In [20]:
# make the dataframe 
data = {'lat': flat_lat, 'lon': flat_lon, 'time_delta': flat_times}
X_df = pd.DataFrame(data)

In [31]:
X_df.head()

Unnamed: 0,lat,lon,time_delta,pred_PM2_5,avg
0,37.824436,-122.534739,0,2.921875,2.921875
1,37.827984,-122.534739,0,2.921875,2.921875
2,37.831531,-122.534739,0,2.921875,2.921875
3,37.835079,-122.534739,0,2.921875,2.921875
4,37.838626,-122.534739,0,2.921875,2.921875


In [21]:
X_df.describe()

Unnamed: 0,lat,lon,time_delta
count,2448788.0,2448788.0,2448788.0
mean,37.86556,-122.3379,2880.0
std,0.0898563,0.1114132,1665.653
min,37.70371,-122.5347,0.0
25%,37.7854,-122.4449,1440.0
50%,37.87409,-122.3057,2880.0
75%,37.94496,-122.2383,4320.0
max,38.00869,-122.1844,5760.0


In [22]:
# make predictions
y_pred = model.predict(X_df)

In [23]:
X_df['pred_PM2_5'] = y_pred

In [26]:
X_df['avg'] = y_pred

In [32]:
boxes['avg_PM2_5'] = [0] * len(boxes)

In [33]:
for sensor in range(len(boxes)):
    boxes.avg_PM2_5.iloc[sensor] = X_df[(X_df.lat == boxes.center_lat.iloc[sensor]) & 
         (X_df.lon == boxes.center_lon.iloc[sensor])].pred_PM2_5.mean()

In [43]:
# create a dataframe for mapping
map_df = boxes[boxes.in_water == False]
map_df = map_df.sort_values(by='avg_PM2_5', ascending=False)

In [45]:
# plot top most polluted virtual sensor locations

HOW_MANY = 30 # how many sensors to place

gmap3=gmplot.GoogleMapPlotter(map_df.center_lat.iloc[0], map_df.center_lon.iloc[0], 10, apikey = "AIzaSyA2TdrwntJVu6IuS_3fOY7WLTLvhl3xntk")
gmap3.coloricon = "http://www.googlemapsmarkers.com/v1/%s/"
for sensor in range(HOW_MANY):
    gmap3.marker(map_df.center_lat.iloc[sensor], map_df.center_lon.iloc[sensor], color='cornflowerblue', title=sensor)#, title=map_df.pred_PM2_5)
gmap3.draw("data/grid_pred_map.html") 

In [None]:
%matplotlib inline

from geopy.distance import distance
import pandas as pd
from time import sleep
import shapely.geometry
import pyproj
import geopandas as gpd
from matplotlib import pyplot as plt
from shapely.geometry import Point

# libraries
import datetime
from datetime import date, timedelta
from os import path
import pandas as pd
import numpy as np
import statistics
import boto3
import s3fs
import sys
from fastparquet import ParquetFile
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import geopy
from geopy import distance

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 500)

In [None]:
def makeGrid():
    # Set up projections
    p_ll = pyproj.Proj(init='epsg:4283') # grid in lat/lon
    p_mt = pyproj.Proj(init='epsg:3857') # metric; same as EPSG:900913

    # Create corners of rectangle to be transformed to a grid
    MIN_LAT = 37.701933
    MAX_LAT = 38.008050
    MIN_LON = -122.536985
    MAX_LON = -122.186437
    sw = shapely.geometry.Point((MIN_LON, MIN_LAT))
    ne = shapely.geometry.Point((MAX_LON,MAX_LAT))

    stepsize = 500 # 0.5 km grid step size

    # Project corners to target projection
    s = pyproj.transform(p_ll, p_mt, sw.x, sw.y) # Transform NW point to 3857
    e = pyproj.transform(p_ll, p_mt, ne.x, ne.y) # .. same for SE

    # Iterate over 2D area
    boxes = []
    min_lon = s[0]
    x = 0
    while min_lon < e[0]:
        max_lon = min_lon + stepsize
        min_lat = s[1]
        y = 0
    
        while min_lat < e[1]:
            max_lat = min_lat + stepsize
            b_left = shapely.geometry.Point(pyproj.transform(p_mt, p_ll, min_lon, min_lat))
            t_right = shapely.geometry.Point(pyproj.transform(p_mt, p_ll, max_lon, max_lat))
        
            bound_box = {'min_lat':b_left.y, 'max_lat':t_right.y, 'min_lon':b_left.x, 'max_lon':t_right.x, 'x': x, 'y':y}
        
            boxes.append(bound_box)
            min_lat = max_lat
            y += 1
        min_lon = max_lon
        x += 1
        
    boxes = pd.DataFrame(boxes)

    # find the center of each box
    boxes['center_lat'] = (boxes.min_lat + boxes.max_lat)/2
    boxes['center_lon'] = (boxes.min_lon + boxes.max_lon)/2
    
    base = gpd.read_file("bayarea.json")
    
    # map every box to whether it overlaps with the bay as defined by the shapefile
    boxes['in_water'] = [bay_and_ocean.contains(pt) for pt in boxes_as_points]

    # convert lat/lon to Point objects
    boxes_as_points = boxes.apply(lambda line: Point(line.center_lon, line.center_lat), axis = 1)
    
    return(boxes)

In [None]:
# save grid to csv file
boxes[['min_lat', 'max_lat', 'min_lon', 'max_lon', 'x', 'y', 'center_lat',
       'center_lon', 'in_water']].to_csv("500m_grid.csv", index = False)

In [None]:
boxes.head()

In [None]:
# create feature data at each center point of the grid
created, lat, lon, wind_data, wind_direction, wind_speed, gusts, gust_speed, variable_winds, epa_pm25_value, wkday, 
temperature, humidity, elevation, hour, month, timeofday_afternoon, timeofday_evening, timeofday_morning, timeofday_night,
daytype_Weekday, daytype_Weekend, compass_ERROR, compass_East, compass_Missing, compass_No wind, compass_North, 
compass_South, compass_West




In [None]:
# constants
sys.path.append("./HistoricalData/")
from getData import get_data

UP_LEFT = (38.008050, -122.536985)    
UP_RIGHT = (38.008050, -122.186437)   
DOWN_RIGHT = (37.701933, -122.186437) 
DOWN_LEFT = (37.701933, -122.536985)  
START_DATE = '2018/09/10' 
END_DATE = '2019/09/10'   
START_HOUR = '0'        
END_HOUR = '24'   

In [None]:
# load data into dataframe
data_df = get_data(UP_LEFT, UP_RIGHT, DOWN_RIGHT, DOWN_LEFT, START_DATE, END_DATE, START_HOUR, END_HOUR, 'Monthly')

In [None]:
# add elevation data
elev_df = pd.read_csv('VirtualSensing/sensor_elevations.csv', header='infer', float_precision='high')
elev_df = elev_df.drop(columns='resolution')
data_df = pd.merge(data_df, elev_df)
print("How many elevations are missing?", data_df.elevation.isna().sum())
print("Shape of the new dataframe:", data_df.shape)

In [None]:
# winnow down the features
columns_to_keep = ['created', 'lat', 'lon', 'wind_data', 'wind_direction', 'wind_speed', 'gusts', 'gust_speed', 
                   'variable_winds', 'variable_wind_info', 'epa_pm25_value', 'wkday', 
                   'daytype', 'timeofday', 'wind_compass', 'temperature', 'humidity', 'elevation', 'hour', 'month']
X_data_df = data_df[columns_to_keep]
y_data_df = data_df['2_5um']

In [None]:
X_data_df.wind_data = X_data_df.wind_data.astype(bool)
X_data_df.variable_winds = X_data_df.variable_winds.astype(bool)
X_data_df.gusts = X_data_df.gusts.astype(bool)
X_data_df.daytype = X_data_df.daytype.astype(str)
X_data_df.daytype = X_data_df.daytype.astype('category')
X_data_df.timeofday = X_data_df.timeofday.astype(str)
X_data_df.timeofday = X_data_df.timeofday.astype('category')
X_data_df.wind_compass = X_data_df.wind_compass.astype(str)
X_data_df.wind_compass = X_data_df.wind_compass.astype('category')
X_data_df.wkday = pd.to_numeric(X_data_df.wkday)
X_data_df.wkday = X_data_df.wkday.astype('category')
X_data_df.hour = X_data_df.hour.astype(int)
X_data_df.month = X_data_df.month.astype(int)

In [None]:
# handle variable winds missing values 
vrb_wind_range_readings = 0
mid_ranges = list()

for row in range(len(X_data_df)):
    if X_data_df.variable_winds.iloc[row]:
        info = X_data_df.variable_wind_info.iloc[row]
        if info:
            vrb_wind_range_readings += 1
            first, second = info.split('V')
            mid_range = int((int(first) + int(second)) / 2)
            if (X_data_df.wind_direction.iloc[row] == 'VRB'):
                X_data_df.wind_direction.lloc[row] = mid_range
            mid_ranges.append(mid_range)

replacement = statistics.mode(mid_ranges)
X_data_df = X_data_df.replace('VRB', replacement) # give variable wind the most frequent midpoint variable range
X_data_df = X_data_df.drop(columns = ['variable_wind_info'])

In [None]:
# handle missing wind values with means 
wind_direction_obs = pd.to_numeric(X_data_df.wind_direction.dropna())
wind_direction_avg = wind_direction_obs.mean()
wind_direction_avg = int(wind_direction_avg)
X_data_df.wind_direction = X_data_df.wind_direction.replace(np.nan, wind_direction_avg) # give missing wind direction the mean
X_data_df.wind_direction = X_data_df.wind_direction.replace('', wind_direction_avg)
X_data_df.wind_direction = X_data_df.wind_direction.astype(int)
wind_speed_obs = pd.to_numeric(X_data_df.wind_speed.dropna())
wind_speed_avg = wind_speed_obs.mean()
X_data_df.wind_speed = X_data_df.wind_speed.replace(np.nan, wind_speed_avg) # give missing wind speed the mean
X_data_df.wind_speed = X_data_df.wind_speed.replace('', wind_speed_avg)
X_data_df.gust_speed = X_data_df.gust_speed.replace(np.nan, 0)
X_data_df.gust_speed = X_data_df.gust_speed.replace('', 0)
X_data_df.epa_pm25_value = X_data_df.epa_pm25_value.replace(np.nan, X_data_df.epa_pm25_value.mean())
X_data_df.temperature = X_data_df.temperature.replace(np.nan, X_data_df.temperature.mean())
X_data_df.humidity = X_data_df.humidity.replace(np.nan, X_data_df.humidity.mean())

In [None]:
# one hot encode the categoricals
X_data_df = pd.concat([X_data_df,pd.get_dummies(X_data_df['timeofday'], prefix='timeofday')],axis=1)
X_data_df = pd.concat([X_data_df,pd.get_dummies(X_data_df['daytype'], prefix='daytype')], axis=1)
X_data_df = pd.concat([X_data_df,pd.get_dummies(X_data_df['wind_compass'], prefix='compass')], axis=1)

In [None]:
# remove the original columns that got one hot encoded, type the one hots as bools
X_data_df = X_data_df.drop(columns=['timeofday','daytype','wind_compass'])
X_data_df.timeofday_afternoon = X_data_df.timeofday_afternoon.astype(bool)
X_data_df.timeofday_evening = X_data_df.timeofday_evening.astype(bool)
X_data_df.timeofday_morning = X_data_df.timeofday_morning.astype(bool)
X_data_df.timeofday_night = X_data_df.timeofday_night.astype(bool)
X_data_df.daytype_Weekday = X_data_df.daytype_Weekday.astype(bool)
X_data_df.daytype_Weekend = X_data_df.daytype_Weekend.astype(bool)
X_data_df.compass_ERROR = X_data_df.compass_ERROR.astype(bool)
X_data_df.compass_East = X_data_df.compass_East.astype(bool)
X_data_df.compass_Missing = X_data_df.compass_Missing.astype(bool)
X_data_df['compass_No wind'] = X_data_df['compass_No wind'].astype(bool)
X_data_df.compass_North = X_data_df.compass_North.astype(bool)
X_data_df.compass_South = X_data_df.compass_South.astype(bool)
X_data_df.compass_West = X_data_df.compass_West.astype(bool)

In [None]:
# confirm no NAs at this point
for col in X_data_df.columns:
    testy = X_data_df[col]
    print(col, testy.isna().sum())