In [None]:
# import libraries
import pandas as pd
import numpy as np 
import warnings
from copy import deepcopy
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import sys
sys.path.append("../HistoricalData/")
from getData import get_data

%matplotlib inline

warnings.filterwarnings('ignore')

In [None]:
# Query parameters for get_data: a rectangular bounding box, a date range and
# a daily hour window. Per the original note, get_data takes the bounding box
# and timeframe and returns cleaned data.

# Bounding-box corners, each as (lat, lon)
UP_LEFT = (38.008050, -122.536985)
UP_RIGHT = (38.008050, -122.186437)
DOWN_RIGHT = (37.701933, -122.186437)
DOWN_LEFT = (37.701933, -122.536985)

# Date range
START_DATE = '2019/09/01'  # date to start taking data
END_DATE = '2019/09/30'    # date to stop taking data

# Hour window applied to EACH day; this lets us control for time-of-day effects
START_HOUR = '0'  # hour each day to start
END_HOUR = '1'    # hour each day to end

color = "red"

bounding_box = (UP_LEFT, UP_RIGHT, DOWN_RIGHT, DOWN_LEFT)
data_df = get_data(*bounding_box, START_DATE, END_DATE, START_HOUR, END_HOUR)

In [None]:
# Keep only the rows that actually have a value in the '2_5um' column.
has_reading = data_df['2_5um'].notna()
data_df = data_df.loc[has_reading]

In [None]:
# Compass sectors as (start, end) wind-direction bounds in degrees.
# NORTH wraps around through 0/360, which is why its start exceeds its end.
NORTH = 316, 45
EAST = 46, 135
SOUTH = 136, 225
WEST = 226, 315

In [None]:
# Add a categorical `wind_compass` column with one label per row:
# North, South, East, West, No wind, Missing or ERROR.
#
# Rows are walked by position (zipping the two columns) instead of using
# `data_df.loc[row]` with `row` from `range(len(data_df))`. `.loc` is
# label-based, so once rows have been filtered out of the frame the labels
# 0..len-1 no longer match the rows that are present: lookups either fail
# or land on a different row, and the resulting labels end up misaligned.


def _compass_label(direction, speed):
    """Return the compass label for one reading.

    direction -- wind direction in degrees; anything `int()` cannot convert
                 (NaN, None, non-numeric text) is reported as 'Missing'.
    speed     -- wind speed; exactly 0 is reported as 'No wind'.

    Returns one of 'Missing', 'No wind', 'North', 'East', 'South', 'West',
    or 'ERROR' for a direction outside 0-360 degrees.
    """
    try:
        degree = int(direction)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures count as missing data; any other problem
        # (e.g. an absent column) is allowed to surface instead of being hidden.
        return 'Missing'
    if speed == 0:
        return 'No wind'
    if degree < 0 or degree > 360:
        # Without this guard the wrap-around North test below would also match
        # every out-of-range value, leaving 'ERROR' unreachable.
        return 'ERROR'
    if degree >= NORTH[0] or degree <= NORTH[1]:
        return 'North'
    if EAST[0] <= degree <= EAST[1]:
        return 'East'
    if SOUTH[0] <= degree <= SOUTH[1]:
        return 'South'
    if WEST[0] <= degree <= WEST[1]:
        return 'West'
    return 'ERROR'


wind_compass = [
    _compass_label(direction, speed)
    for direction, speed in zip(data_df['wind_direction'], data_df['wind_speed'])
]
data_df['wind_compass'] = wind_compass

In [None]:
# Narrow the data down to a single, hard-coded sensor.
SENSOR_ID = '16939'

data_df['sensor_id'].unique()  # not displayed: only a cell's last expression is shown
is_target_sensor = data_df['sensor_id'] == SENSOR_ID
data_df_small = data_df.loc[is_target_sensor]

len(data_df_small)  # number of rows

In [None]:
# Seed NumPy's global RNG; train_test_split falls back to it when no
# random_state is passed, so this keeps the split reproducible.
np.random.seed(0)

# Indicator columns used as regression features, in coefficient order.
FEATURE_COLUMNS = ['West', 'East', 'South', 'North', 'No wind']

# One-hot encode the compass label for regression.
compass_dummies = pd.get_dummies(data_df_small['wind_compass'])
# A single sensor may never report some category; add the missing indicator
# as all zeros so selecting FEATURE_COLUMNS below cannot raise a KeyError.
for feature in FEATURE_COLUMNS:
    if feature not in compass_dummies.columns:
        compass_dummies[feature] = 0

# Drop indicator columns left over from an earlier run of this cell before
# joining; join() raises on overlapping column names, which made the cell
# fail when executed a second time.
data_df_small = (
    data_df_small
    .drop(columns=compass_dummies.columns, errors='ignore')
    .join(compass_dummies)
)

train, test = train_test_split(data_df_small, test_size=0.2)
# Cast explicitly so the feature matrix has a single numeric dtype even when
# zero-filled columns were added next to the get_dummies output.
train_x = train[FEATURE_COLUMNS].to_numpy(dtype=float)
train_y = train['2_5um'].to_numpy()
test_x = test[FEATURE_COLUMNS].to_numpy(dtype=float)
test_y = test['2_5um'].to_numpy()

regr = LinearRegression()
regr.fit(train_x, train_y)

# Make predictions using the testing set
y_predictions = regr.predict(test_x)

# The coefficients, one per entry of FEATURE_COLUMNS
print('Coefficients: \n', regr.coef_)
# The mean squared error on the held-out rows
print("Mean squared error: %.2f"
      % mean_squared_error(test_y, y_predictions))
# r2_score is the coefficient of determination (R^2): 1 is a perfect
# prediction, 0 is equivalent to always predicting the mean of test_y
print('Variance score: %.2f' % r2_score(test_y, y_predictions))

# Plot outputs (currently disabled)

# x = test_x[:,0]
# plt.scatter(x, test_y,  color='black')
# plt.plot(x, y_predictions, color='blue', linewidth=3)
# plt.xticks(())
# plt.yticks(())
# plt.show()