# Virtual Sensing - Linear Regression

In [1]:
# libraries
import datetime
from datetime import date, timedelta
from os import path
import pandas as pd
import numpy as np
import statistics
import boto3
import s3fs
from fastparquet import ParquetFile
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 500)

In [2]:
# constants
from getData import get_data
UP_LEFT = (38.008050, -122.536985)    
UP_RIGHT = (38.008050, -122.186437)   
DOWN_RIGHT = (37.701933, -122.186437) 
DOWN_LEFT = (37.701933, -122.536985)  
START_DATE = '2019/09/01' 
END_DATE = '2019/09/02'   
START_HOUR = '0'        
END_HOUR = '24'   

In [3]:
# load data into dataframe
data_df = get_data(UP_LEFT, UP_RIGHT, DOWN_RIGHT, DOWN_LEFT, START_DATE, END_DATE, START_HOUR, END_HOUR, 'Monthly')

In [4]:
# get rid of rows that have no wind data? uncomment if yes.
# data_df = data_df[data_df.wind_data == 1.0]

In [5]:
# should be (726292, 68)
data_df.shape

(726292, 68)

In [6]:
# winnow down the features
columns_to_keep = ['created', 'lat', 'lon', 'wind_data', 'wind_direction', 'wind_speed', 'gusts', 'gust_speed', 
                   'variable_winds', 'variable_wind_info', 'epa_pm25_value', 'wkday', 
                   'daytype', 'timeofday', 'wind_compass', 'temperature', 'humidity'] 
X_data_df = data_df[columns_to_keep]
y_data_df = data_df['2_5um']

In [7]:
# fix datatyping
X_data_df.dtypes

created                 int64
lat                   float64
lon                   float64
wind_data             float32
wind_direction         object
wind_speed            float64
gusts                 float32
gust_speed            float64
variable_winds        float32
variable_wind_info     object
epa_pm25_value        float64
wkday                  object
daytype                object
timeofday              object
wind_compass           object
temperature           float64
humidity              float64
dtype: object

In [8]:
X_data_df.wind_data = X_data_df.wind_data.astype(bool)
X_data_df.variable_winds = X_data_df.variable_winds.astype(bool)
X_data_df.gusts = X_data_df.gusts.astype(bool)
X_data_df.daytype = X_data_df.daytype.astype(str)
X_data_df.daytype = X_data_df.daytype.astype('category')
X_data_df.timeofday = X_data_df.timeofday.astype(str)
X_data_df.timeofday = X_data_df.timeofday.astype('category')
X_data_df.wind_compass = X_data_df.wind_compass.astype(str)
X_data_df.wind_compass = X_data_df.wind_compass.astype('category')
X_data_df.wkday = pd.to_numeric(X_data_df.wkday)
X_data_df.wkday = X_data_df.wkday.astype('category')

In [9]:
# handle variable winds missing values 
vrb_wind_range_readings = 0
mid_ranges = list()

for row in range(len(X_data_df)):
    if X_data_df.variable_winds.iloc[row]:
        info = X_data_df.variable_wind_info.iloc[row]
        if info:
            vrb_wind_range_readings += 1
            first, second = info.split('V')
            mid_range = int((int(first) + int(second)) / 2)
            if (X_data_df.wind_direction.iloc[row] == 'VRB'):
                X_data_df.wind_direction.lloc[row] = mid_range
            mid_ranges.append(mid_range)

replacement = statistics.mode(mid_ranges)
X_data_df = X_data_df.replace('VRB', replacement) # give variable wind the most frequent midpoint variable range
X_data_df = X_data_df.drop(columns = ['variable_wind_info'])

In [10]:
wind_direction_obs = pd.to_numeric(X_data_df.wind_direction.dropna())
wind_direction_avg = wind_direction_obs.mean()
wind_direction_avg = int(wind_direction_avg)
X_data_df.wind_direction = X_data_df.wind_direction.replace(np.nan, wind_direction_avg) # give missing wind direction the mean
X_data_df.wind_direction = X_data_df.wind_direction.replace('', wind_direction_avg)
X_data_df.wind_direction = X_data_df.wind_direction.astype(int)

In [11]:
wind_speed_obs = pd.to_numeric(X_data_df.wind_speed.dropna())
wind_speed_avg = wind_speed_obs.mean()
X_data_df.wind_speed = X_data_df.wind_speed.replace(np.nan, wind_speed_avg) # give missing wind speed the mean
X_data_df.wind_speed = X_data_df.wind_speed.replace('', wind_speed_avg)

In [12]:
X_data_df.gust_speed = X_data_df.gust_speed.replace(np.nan, 0)
X_data_df.gust_speed = X_data_df.gust_speed.replace('', 0)

In [13]:
X_data_df.epa_pm25_value = X_data_df.epa_pm25_value.replace(np.nan, X_data_df.epa_pm25_value.mean())
X_data_df.temperature = X_data_df.temperature.replace(np.nan, X_data_df.temperature.mean())
X_data_df.humidity = X_data_df.humidity.replace(np.nan, X_data_df.humidity.mean())

In [14]:
# one hot encode the categoricals
X_data_df = pd.concat([X_data_df,pd.get_dummies(X_data_df['timeofday'], prefix='timeofday')],axis=1)
X_data_df = pd.concat([X_data_df,pd.get_dummies(X_data_df['daytype'], prefix='daytype')], axis=1)
X_data_df = pd.concat([X_data_df,pd.get_dummies(X_data_df['wind_compass'], prefix='compass')], axis=1)

In [15]:
X_data_df = X_data_df.drop(columns=['timeofday','daytype','wind_compass'])

In [16]:
X_data_df.dtypes

created                  int64
lat                    float64
lon                    float64
wind_data                 bool
wind_direction           int64
wind_speed             float64
gusts                     bool
gust_speed             float64
variable_winds            bool
epa_pm25_value         float64
wkday                    int64
temperature            float64
humidity               float64
timeofday_afternoon      uint8
timeofday_evening        uint8
timeofday_morning        uint8
timeofday_night          uint8
daytype_Weekday          uint8
daytype_Weekend          uint8
compass_ERROR            uint8
compass_East             uint8
compass_Missing          uint8
compass_No wind          uint8
compass_North            uint8
compass_South            uint8
compass_West             uint8
dtype: object

In [17]:
for col in X_data_df.columns:
    testy = X_data_df[col]
    print(col, testy.isna().sum())

created 0
lat 0
lon 0
wind_data 0
wind_direction 0
wind_speed 0
gusts 0
gust_speed 0
variable_winds 0
epa_pm25_value 0
wkday 0
temperature 0
humidity 0
timeofday_afternoon 0
timeofday_evening 0
timeofday_morning 0
timeofday_night 0
daytype_Weekday 0
daytype_Weekend 0
compass_ERROR 0
compass_East 0
compass_Missing 0
compass_No wind 0
compass_North 0
compass_South 0
compass_West 0


In [18]:
# split data into train-dev-test
X_train_and_dev, X_test, y_train_and_dev, y_test = train_test_split(X_data_df, y_data_df, test_size=0.20, random_state=42)
X_train, X_dev, y_train, y_dev = train_test_split(X_train_and_dev, y_train_and_dev, test_size=0.125, random_state=42)

In [19]:
# fit the data
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [20]:
# make predictions
y_pred = regr.predict(X_dev)

In [21]:
# calculate accuracy
print('Coefficients: \n', regr.coef_)
# The mean squared error
print("Mean squared error: %.2f"
      % mean_squared_error(y_dev, y_pred))
# Explained variance score: 1 is perfect prediction
print('Variance (R2) score: %.2f' % r2_score(y_dev, y_pred))

Coefficients: 
 [ 2.34369968e-06 -5.99445235e+00 -4.99513904e+00  6.57420728e-01
 -1.54406650e-04  4.28689877e-02 -9.02973095e-01  1.99002058e-02
  7.60746937e-01  2.74987889e-01  6.16774883e-02 -7.85051000e-03
  4.77246551e-02 -3.96727566e-01 -5.40584858e-01  3.11733905e-01
  6.25578519e-01 -2.25800383e-01  2.25800383e-01  2.21843585e-02
 -1.45967100e-01  1.37123491e-02  1.30903832e-01  3.39405448e-01
 -1.88922197e-01 -1.71316692e-01]
Mean squared error: 219.23
Variance (R2) score: 0.01


In [22]:
# notes