# Virtual Sensing - Linear Regression

In [1]:
# libraries
import datetime
from datetime import date, timedelta
from os import path
import pandas as pd
import numpy as np
import statistics
import boto3
import s3fs
import sys
from fastparquet import ParquetFile
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

import warnings
# Silence every warning category for the remainder of the notebook session.
warnings.simplefilter('ignore')

# Let wide frames display up to 500 columns before pandas truncates them.
pd.options.display.max_columns = 500

In [2]:
# constants
# Extend the import search path so getData can be loaded from ../HistoricalData/.
sys.path.append("../HistoricalData/")
from getData import get_data

# Corner points of the rectangular query region handed to get_data.
# Presumably (latitude, longitude) pairs -- TODO confirm against get_data.
_UPPER, _LOWER = 38.008050, 37.701933
_LEFT, _RIGHT = -122.536985, -122.186437
UP_LEFT = (_UPPER, _LEFT)
UP_RIGHT = (_UPPER, _RIGHT)
DOWN_RIGHT = (_LOWER, _RIGHT)
DOWN_LEFT = (_LOWER, _LEFT)

# Query window; dates and hours are passed to get_data as strings.
START_DATE, END_DATE = '2019/01/01', '2019/09/30'
START_HOUR, END_HOUR = '0', '24'

In [None]:
# load data into dataframe
# Arguments are positional: the four region corners first, then the date/hour window.
region_corners = (UP_LEFT, UP_RIGHT, DOWN_RIGHT, DOWN_LEFT)
time_window = (START_DATE, END_DATE, START_HOUR, END_HOUR)
data_df = get_data(*region_corners, *time_window)

Processing 20190205 failed
Processing 20190402 failed
Processing 20190410 failed
Processing 20190416 failed
Processing 20190506 failed
Processing 20190519 failed


In [None]:
# Optional filter: drop rows that have no wind data. Uncomment the next line to enable it.
# data_df = data_df[data_df.wind_data == 1.0]

In [None]:
# Sanity check on the load: display (rows, columns) of the raw frame.
# should be (726292, 68)
data_df.shape

In [None]:
# winnow down the features
columns_to_keep = ['created', 'lat', 'lon', 'wind_data', 'wind_direction', 'wind_speed', 'gusts', 'gust_speed',
                   'variable_winds', 'variable_wind_info', 'epa_pm25_value', 'wkday',
                   'daytype', 'timeofday', 'wind_compass', 'temperature', 'humidity']
# Take an explicit copy: the feature frame is modified column-by-column in the
# cells that follow, and assigning into a bare slice of data_df triggers
# pandas' SettingWithCopyWarning (and leaves data_df's state ambiguous).
X_data_df = data_df[columns_to_keep].copy()
# '2_5um' is the regression target that the model is fitted against.
y_data_df = data_df['2_5um']

In [None]:
# fix datatyping
# Step 1: display the current dtype of every selected feature; the casts
# themselves are applied in the following cell.
X_data_df.dtypes

In [None]:
# Flag columns become booleans.
# NOTE(review): astype(bool) maps NaN to True -- confirm these flags never contain missing values.
for flag_column in ('wind_data', 'variable_winds', 'gusts'):
    X_data_df[flag_column] = X_data_df[flag_column].astype(bool)

# Label columns are normalised to strings first, then stored as categoricals.
for label_column in ('daytype', 'timeofday', 'wind_compass'):
    X_data_df[label_column] = X_data_df[label_column].astype(str).astype('category')

# Weekday is parsed as a number before being treated as a categorical.
X_data_df['wkday'] = pd.to_numeric(X_data_df['wkday']).astype('category')

In [None]:
# handle variable winds missing values
# For rows flagged as variable wind that carry a "<first>V<second>" range string,
# compute the midpoint of the range. Rows whose wind_direction is the literal
# 'VRB' get their own midpoint; every remaining 'VRB' is replaced by the most
# frequent midpoint afterwards.
vrb_wind_range_readings = 0
mid_ranges = list()

# Pull the columns out once: per-row .iloc lookups on a large frame are very slow.
variable_flags = X_data_df['variable_winds'].to_numpy()
range_infos = X_data_df['variable_wind_info'].to_numpy()
reported_directions = X_data_df['wind_direction'].to_numpy()

# Row positions (and their midpoints) whose direction must be overwritten.
vrb_rows, vrb_midpoints = [], []

for row, (is_variable, info) in enumerate(zip(variable_flags, range_infos)):
    # Missing range info can arrive as NaN/None, which is not a string and
    # cannot be split; an empty string carries no range either.
    if not is_variable or not isinstance(info, str) or not info:
        continue
    vrb_wind_range_readings += 1
    first, second = info.split('V')
    # NOTE(review): plain arithmetic midpoint -- a range that wraps through
    # 360/0 degrees (e.g. '350V030') is presumably not handled; confirm.
    mid_range = int((int(first) + int(second)) / 2)
    if reported_directions[row] == 'VRB':
        vrb_rows.append(row)
        vrb_midpoints.append(mid_range)
    mid_ranges.append(mid_range)

# Write through the frame itself in one positional assignment; the original
# chained form (column.iloc[row] = ...) is not guaranteed to update the frame.
if vrb_rows:
    X_data_df.iloc[vrb_rows, X_data_df.columns.get_loc('wind_direction')] = vrb_midpoints

# statistics.mode raises on an empty list, so only substitute when at least
# one range reading was found.
if mid_ranges:
    replacement = statistics.mode(mid_ranges)
    X_data_df = X_data_df.replace('VRB', replacement) # give variable wind the most frequent midpoint variable range
X_data_df = X_data_df.drop(columns = ['variable_wind_info'])

In [None]:
# Mean of the observed (non-missing) wind directions, truncated to a whole number.
wind_direction_obs = pd.to_numeric(X_data_df['wind_direction'].dropna())
wind_direction_avg = int(wind_direction_obs.mean())

# Missing readings (NaN or empty string) take the mean; the column is then cast to int.
X_data_df['wind_direction'] = (
    X_data_df['wind_direction']
    .replace(np.nan, wind_direction_avg)
    .replace('', wind_direction_avg)
    .astype(int)
)

In [None]:
# Mean of the observed (non-missing) wind speeds.
wind_speed_obs = pd.to_numeric(X_data_df['wind_speed'].dropna())
wind_speed_avg = wind_speed_obs.mean()

# Missing readings (NaN or empty string) take the mean.
X_data_df['wind_speed'] = (
    X_data_df['wind_speed']
    .replace(np.nan, wind_speed_avg)
    .replace('', wind_speed_avg)
)

In [None]:
# A missing gust speed (NaN or empty string) is recorded as 0.
X_data_df['gust_speed'] = (
    X_data_df['gust_speed']
    .replace(np.nan, 0)
    .replace('', 0)
)

In [None]:
# Fill NaNs in each of these numeric columns with that column's own mean.
for numeric_column in ('epa_pm25_value', 'temperature', 'humidity'):
    column_values = X_data_df[numeric_column]
    X_data_df[numeric_column] = column_values.replace(np.nan, column_values.mean())

In [None]:
# one hot encode the categoricals
# (source column, prefix for the generated indicator columns), in output order
dummy_specs = [
    ('timeofday', 'timeofday'),
    ('daytype', 'daytype'),
    ('wind_compass', 'compass'),
]
indicator_frames = [
    pd.get_dummies(X_data_df[source_column], prefix=dummy_prefix)
    for source_column, dummy_prefix in dummy_specs
]
# Append all indicator columns to the right of the existing features in one step.
X_data_df = pd.concat([X_data_df, *indicator_frames], axis=1)

In [None]:
# The original categorical columns are dropped now that indicator columns exist for them.
encoded_sources = ['timeofday', 'daytype', 'wind_compass']
X_data_df = X_data_df.drop(encoded_sources, axis=1)

In [None]:
# Display the dtype of every column in the feature frame at this stage.
X_data_df.dtypes

In [None]:
# Print "<column> <number of missing values>" for every feature column.
for column_name in X_data_df.columns:
    missing_count = X_data_df[column_name].isna().sum()
    print(column_name, missing_count)

In [None]:
# split data into train-dev-test
# First hold out 20% of the rows as the test set.
X_train_and_dev, X_test, y_train_and_dev, y_test = train_test_split(
    X_data_df,
    y_data_df,
    test_size=0.20,
    random_state=42,
)
# Then take 12.5% of the remaining 80% as the dev set (10% of all rows),
# leaving 70% of all rows for training.
X_train, X_dev, y_train, y_dev = train_test_split(
    X_train_and_dev,
    y_train_and_dev,
    test_size=0.125,
    random_state=42,
)

In [None]:
# fit the data
# Ordinary least-squares linear regression trained on the training split only.
regr = linear_model.LinearRegression()
regr.fit(X=X_train, y=y_train)

In [None]:
# make predictions
# Predict the target for the dev split using the fitted regression model.
y_pred = regr.predict(X_dev)

In [None]:
# report the fitted coefficients and the dev-set error metrics
print('Coefficients: \n', regr.coef_)

# Mean squared error between the dev targets and the predictions.
dev_mse = mean_squared_error(y_dev, y_pred)
print("Mean squared error: %.2f" % dev_mse)

# R2 (coefficient of determination): 1 is perfect prediction.
dev_r2 = r2_score(y_dev, y_pred)
print('Variance (R2) score: %.2f' % dev_r2)

In [None]:
# notes