# Virtual Sensing - K-Nearest Neighbors Regression

In [None]:
import pandas as pd
import numpy as np 
import datetime as dt
import warnings
from copy import deepcopy
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import sys
sys.path.append("../HistoricalData/")
from getData import get_data

import warnings
# Notebook-wide settings: silence every warning so cell output stays readable,
# and let pandas print up to 500 columns before it starts eliding them.
warnings.filterwarnings(action='ignore')

pd.options.display.max_columns = 500

In [None]:
# constants
from getData import get_data
# Edges of the rectangular query area; the four corners below are built from
# these so each coordinate is written exactly once.
_NORTH_LAT, _SOUTH_LAT = 38.008050, 37.701933
_WEST_LON, _EAST_LON = -122.536985, -122.186437

# Corners as (lat, lon) pairs.
UP_LEFT = (_NORTH_LAT, _WEST_LON)
UP_RIGHT = (_NORTH_LAT, _EAST_LON)
DOWN_RIGHT = (_SOUTH_LAT, _EAST_LON)
DOWN_LEFT = (_SOUTH_LAT, _WEST_LON)

# Date range ('YYYY/MM/DD' strings) and hour range (strings) of the query.
START_DATE, END_DATE = '2018/10/01', '2019/09/02'
START_HOUR, END_HOUR = '0', '24'

In [None]:
# Load the readings for the configured bounding box and date/hour window
# into a dataframe.
data_df = get_data(
    UP_LEFT,
    UP_RIGHT,
    DOWN_RIGHT,
    DOWN_LEFT,
    START_DATE,
    END_DATE,
    START_HOUR,
    END_HOUR,
    'Monthly',  # NOTE(review): meaning of this mode is defined in getData - confirm there
)

In [None]:
# Remove outliers naively: keep only rows whose '2_5um' reading is strictly
# below the 99.5th percentile of that column.
#
# nanpercentile is used instead of percentile because a single NaN reading
# makes np.percentile return NaN, and `x < NaN` is False for every row, which
# would silently empty the dataframe. Rows with a NaN reading are still
# dropped by the comparison itself.
outlier_cutoff = np.nanpercentile(data_df['2_5um'], 99.5)

# .copy() gives an independent frame, so the column assignments made in later
# cells do not operate on a slice of the unfiltered data.
data_df = data_df[data_df['2_5um'] < outlier_cutoff].copy()

In [None]:
# Parse the raw 'created' stamps (laid out as YYYYmmddHHMM) into datetimes.
created_at = pd.to_datetime(data_df['created'], format='%Y%m%d%H%M')
data_df['created'] = created_at

# Turn each timestamp into a plain number: minutes elapsed relative to a fixed
# reference instant (negative for readings taken before it).
reference_instant = pd.Timestamp('2019-09-01 00:00:00')
one_minute = np.timedelta64(1, 'm')
data_df['time_delta'] = (data_df['created'] - reference_instant) / one_minute
data_df.head()

In [None]:
# Features fed to the model; the target is the '2_5um' column.
columns_to_keep = ['time_delta', 'lat', 'lon']

# Split by sensor rather than by row: all readings of a given sensor end up in
# exactly one of train/dev/test, so dev/test scores reflect sensors the model
# has never seen.
#
# random_state is pinned (same seed the earlier row-level split used) so the
# partition, and every score computed from it, is reproducible across runs.
sensor_ids = data_df.sensor_id.unique()
id_train_dev, id_test = train_test_split(sensor_ids, test_size=.2, random_state=42)
# 12.5% of the remaining 80% of sensors -> roughly 70/10/20 train/dev/test.
id_train, id_dev = train_test_split(id_train_dev, test_size=.125, random_state=42)

# number of sensors in each partition
print(len(id_train))
print(len(id_dev))
print(len(id_test))

data_df_train = data_df[data_df.sensor_id.isin(id_train)]
data_df_dev = data_df[data_df.sensor_id.isin(id_dev)]
data_df_test = data_df[data_df.sensor_id.isin(id_test)]

# number of readings (rows, columns) in each partition
print(data_df_train.shape)
print(data_df_dev.shape)
print(data_df_test.shape)

X_train = data_df_train[columns_to_keep]
X_dev = data_df_dev[columns_to_keep]
X_test = data_df_test[columns_to_keep]
y_train = data_df_train['2_5um']
y_dev = data_df_dev['2_5um']
y_test = data_df_test['2_5um']

In [None]:
# old non-stratified way of splitting data into train-dev-test: individual rows are split at random instead of whole sensors

# X_train_and_dev, X_test, y_train_and_dev, y_test = train_test_split(X_data_df, y_data_df, test_size=0.20, random_state=42)
# X_train, X_dev, y_train, y_dev = train_test_split(X_train_and_dev, y_train_and_dev, test_size=0.125, random_state=42)
# print(X_train.shape)
# print(X_dev.shape)
# print(X_test.shape)

In [None]:
# Get scores for a naive model of "guess the mean"

# y_pred = np.empty(y_dev.shape)
# y_pred.fill(np.mean(y_train))
# y_pred.shape

# # Print the mean squared error
# print("Mean squared error: %.2f"
#       % mean_squared_error(y_dev, y_pred))
# # Print the R^2 score (coefficient of determination): 1 is perfect prediction, 0 is equivalent to guessing the expected value each time
# print('Variance score: %.2f' % r2_score(y_dev, y_pred))

In [None]:
# plot log transformed pollution levels out of curiosity

# plt.hist(np.log(data_df['2_5um']), range=(-4,6), bins=17)

In [None]:
# Sweep the number of neighbours over powers of two (1, 2, 4, ..., 2**(max_n - 1))
# and record the dev-set mean squared error and R^2 for each setting.
# mse[i] and r2[i] hold the scores for n_neighbors == 2**i.
mse = []
r2 = []

max_n = 8

for n in [2**i for i in range(max_n)]:
    print('neighbors:' + str(n))

    # fit the data
    regr = KNeighborsRegressor(n_neighbors=n)
    regr.fit(X_train, y_train)

    # make predictions
    y_pred = regr.predict(X_dev)

    # metrics: score once, then reuse the values for both the printout and
    # the history lists instead of recomputing each metric twice
    dev_mse = mean_squared_error(y_dev, y_pred)
    dev_r2 = r2_score(y_dev, y_pred)
    mse.append(dev_mse)
    r2.append(dev_r2)

    print("Mean squared error: %.2f" % dev_mse)
    # R^2 (coefficient of determination): 1 is perfect prediction, 0 is
    # equivalent to guessing the expected value each time. This is not the
    # same metric as sklearn's explained_variance_score, hence the label.
    print('R2 score: %.2f' % dev_r2)
    print('')

print(mse)
print(r2)

In [None]:
# One scatter plot per metric. The x axis is the exponent i of the neighbour
# count 2**i used in the sweep, i.e. log2 of the number of neighbours.
for metric_values, metric_label in ((mse, 'Mean Squared Error'), (r2, 'R2 Score')):
    plt.scatter(range(max_n), metric_values)
    plt.xlabel('log base 2 of neighbors')
    plt.ylabel(metric_label)
    plt.show()