# Virtual Sensing - Linear Regression

In [None]:
pseudocode:
    libraries
    constants for bounding box
    data_df = getData
    split into test-train
    fit
    predict
    calculate accuracy

In [204]:
# libraries
import datetime
from datetime import date, timedelta
from os import path
import pandas as pd
import numpy as np
import statistics
import boto3
import s3fs
from fastparquet import ParquetFile
from sklearn import linear_model
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 500)

In [6]:
# constants
from getData import get_data
UP_LEFT = (38.008050, -122.536985)    
UP_RIGHT = (38.008050, -122.186437)   
DOWN_RIGHT = (37.701933, -122.186437) 
DOWN_LEFT = (37.701933, -122.536985)  
START_DATE = '2019/09/01' 
END_DATE = '2019/09/02'   
START_HOUR = '0'        
END_HOUR = '24'   

In [59]:
# load data into dataframe
data_df = get_data(UP_LEFT, UP_RIGHT, DOWN_RIGHT, DOWN_LEFT, START_DATE, END_DATE, START_HOUR, END_HOUR, 'Monthly')

In [26]:
# get rid of rows that have no wind data? uncomment if yes.
# data_df = data_df[data_df.wind_data == 1.0]

In [73]:
data_df.shape

(726292, 68)

In [194]:
# winnow down the features
columns_to_keep = ['created', 'lat', 'lon', 'wind_data', 'wind_direction', 'wind_speed', 'gusts', 'gust_speed', 
                   'variable_winds', 'variable_wind_info', 'epa_pm25_unit', 'epa_pm25_value', 'wkday', 
                   'daytype', 'timeofday', 'wind_compass'] 
X_data_df = data_df[columns_to_keep]
y_data_df = ['2_5um']

In [195]:
# fix datatyping
X_data_df.dtypes

created                 int64
lat                   float64
lon                   float64
wind_data             float32
wind_direction         object
wind_speed            float64
gusts                 float32
gust_speed            float64
variable_winds        float32
variable_wind_info     object
epa_pm25_unit          object
epa_pm25_value        float64
wkday                  object
daytype                object
timeofday              object
wind_compass           object
dtype: object

In [196]:
X_data_df.wind_data = X_data_df.wind_data.astype(bool)

In [197]:
X_data_df.variable_winds = X_data_df.variable_winds.astype(bool)

In [205]:
# handle variable winds missing values 
vrb_wind_range_readings = 0
mid_ranges = list()

for row in range(len(X_data_df)):
    if X_data_df.variable_winds.iloc[row]:
        info = X_data_df.variable_wind_info.iloc[row]
        if info:
            vrb_wind_range_readings += 1
            first, second = info.split('V')
            mid_range = int((int(first) + int(second)) / 2)
            if (X_data_df.wind_direction.iloc[row] == 'VRB'):
                X_data_df.wind_direction.lloc[row] = mid_range
            mid_ranges.append(mid_range)

replacement = statistics.mode(mid_ranges)


280


In [187]:
X_data_df = X_data_df.replace('VRB', '999') # give variable wind a 999 value

In [188]:
wind_direction_obs = pd.to_numeric(X_data_df.wind_direction.dropna())
wind_direction_avg = wind_direction_obs.mean()
wind_direction_avg = int(wind_direction_avg)

In [133]:
X_data_df.wind_direction = X_data_df.wind_direction.replace(np.nan, wind_direction_avg) # give missing wind direction the mean

In [None]:
X_data_df.wind_direction = X_data_df.wind_direction.astype(int)

In [122]:
wind_speed_obs = pd.to_numeric(X_data_df.wind_speed.dropna())
wind_speed_avg = wind_speed_obs.mean()

In [124]:
X_data_df.wind_speed = X_data_df.wind_speed.replace(np.nan, wind_speed_avg) # give missing wind speed the mean

In [129]:
X_data_df.gusts = X_data_df.gusts.astype(bool)

In [131]:
X_data_df.gust_speed = X_data_df.gust_speed.replace(np.nan, 0)

In [None]:
X_data_df.dtypes

In [78]:
for col in X_data_df.columns:
    testy = X_data_df[col]
    print(col, testy.isna().sum())

created 0
lat 0
lon 0
wind_data 71681
wind_direction 71681
wind_speed 72093
gusts 71681
gust_speed 667534
variable_winds 71681
variable_wind_info 71681
epa_pm25_unit 24023
epa_pm25_value 24023
wkday 0
daytype 0
timeofday 0
wind_compass 0


In [75]:
# naively dump any rows with missing data
X_data_df.dropna(inplace = True)

In [76]:
X_data_df.shape

(58251, 16)

In [70]:
# split data into train-test
X_train, X_test, y_train, y_test = train_test_split(X_data_df, y_data_df, test_size=0.20, random_state=42)

ValueError: Found input variables with inconsistent numbers of samples: [726292, 1]

In [None]:
# fit the data
regr = linear_model.LinearRegression()
regr.fit(diabetes_X_train, diabetes_y_train)

In [None]:
# make predictions
diabetes_y_pred = regr.predict(diabetes_X_test)

In [None]:
# calculate accuracy

In [None]:
# notes

In [10]:
data_df.shape

(726292, 68)

In [11]:
data_df.columns

Index(['0_3um', '0_5um', '1_0um', '2_5um', '5_0um', '10_0um', 'pm1_0',
       'pm10_0', 'created', 'pm1_0_atm', 'pm2_5_atm', 'pm10_0_atm', 'uptime',
       'rssi', 'temperature', 'humidity', 'pm2_5_cf_1', 'device_loc_typ',
       'is_owner', 'sensor_id', 'sensor_name', 'parent_id', 'lat', 'lon',
       'thingspeak_primary_id', 'thingspeak_primary_id_read_key',
       'thingspeak_secondary_id', 'thingspeak_secondary_id_read_key', 'a_h',
       'high_reading_flag', 'hidden', 'city', 'county', 'zipcode',
       'created_at', 'year', 'month', 'day', 'hour', 'minute', 'wban_number',
       'call_sign', 'call_sign2', 'interval', 'call_sign3', 'zulu_time',
       'report_modifier', 'wind_data', 'wind_direction', 'wind_speed', 'gusts',
       'gust_speed', 'variable_winds', 'variable_wind_info', 'sys_maint_reqd',
       'agency_name', 'aqi', 'category', 'epa_pm25_unit', 'epa_pm25_value',
       'full_aqs_code', 'intl_aqs_code', 'raw_concentration', 'site_name',
       'wkday', 'daytype', 'time

In [58]:
for col in data_df.columns:
    testy = data_df[col]
    print(col, testy.isna().sum())

0_3um 0
0_5um 0
1_0um 0
2_5um 0
5_0um 0
10_0um 0
pm1_0 0
pm10_0 0
created 0
pm1_0_atm 417
pm2_5_atm 417
pm10_0_atm 417
uptime 417
rssi 417
temperature 7864
humidity 7864
pm2_5_cf_1 417
device_loc_typ 0
is_owner 0
sensor_id 0
sensor_name 0
parent_id 0
lat 0
lon 0
thingspeak_primary_id 0
thingspeak_primary_id_read_key 0
thingspeak_secondary_id 0
thingspeak_secondary_id_read_key 0
a_h 652993
high_reading_flag 0
hidden 450765
city 10883
county 5690
zipcode 3265
created_at 0
year 0
month 0
day 0
hour 0
minute 0
wban_number 0
call_sign 0
call_sign2 0
interval 0
call_sign3 0
zulu_time 0
report_modifier 0
wind_data 0
wind_direction 0
wind_speed 0
gusts 0
gust_speed 595441
variable_winds 0
variable_wind_info 0
sys_maint_reqd 0
agency_name 24016
aqi 24016
category 24016
epa_pm25_unit 24016
epa_pm25_value 24016
full_aqs_code 24016
intl_aqs_code 24016
raw_concentration 24016
site_name 24016
wkday 0
daytype 0
timeofday 0
wind_compass 0
