In [50]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, mean_squared_error

from sklearn import metrics

In [27]:
sun_df = pd.read_csv('Solar_Features.csv')
sun_df.columns

Index(['UNIXTime', 'Time', 'Radiation', 'Temperature', 'Pressure', 'Humidity',
       'WindDirection(Degrees)', 'Speed', 'TimeSunRise', 'TimeSunSet', 'Date',
       'Hour', 'Month', 'SecondsAfterSunRise', 'MinutesAfterSunRise',
       'WindDirection_int', 'Hour_7', 'Hour_8', 'Hour_9', 'Hour_10', 'Hour_11',
       'Hour_12', 'Hour_13', 'Hour_14', 'Hour_15', 'Hour_16', 'Hour_17',
       'Hour_18', 'Month_9', 'Month_10', 'Month_11', 'Month_12', 'Speed_2',
       'Speed_3', 'Humidity_2', 'WindDirection_2', 'Pressure_2', 'Hour_2'],
      dtype='object')

In [73]:
# function to get cross validation scores
def cv_scores(model):
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
    print(f'R score mean: {np.mean(scores)}')
    print(f'R score STD: {np.std(scores)}')


# Linear Regression (No Higher-Order Terms)

In [83]:
# explanatory and response variables
X = sun_df[['Temperature', 'Pressure', 'Humidity',
       'WindDirection(Degrees)', 'Speed', 
       'Hour_7', 'Hour_8', 'Hour_9', 'Hour_10', 'Hour_11',
       'Hour_12', 'Hour_13', 'Hour_14', 'Hour_15', 'Hour_16', 'Hour_17',
       'Hour_18', 'Month_9', 'Month_10', 'Month_11', 'Month_12']]
y = sun_df['Radiation']

# split into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state=66)
#, 'WindDirection(Degrees)'

#Instantiate
lm_model = LinearRegression(normalize=True) 

#Fit - why does this break?
lm_model.fit(X_train, y_train) 

# scores
cv_scores(lm_model)

R score mean: 0.7044469965624911
R score STD: 0.009887760063863025


# Linear Regression (With Higher-Order Terms)

In [84]:
# explanatory and response variables
X = sun_df[['Temperature', 'Pressure', 'Humidity',
       'WindDirection(Degrees)', 'Speed',
       'Hour_7', 'Hour_8', 'Hour_9', 'Hour_10', 'Hour_11',
       'Hour_12', 'Hour_13', 'Hour_14', 'Hour_15', 'Hour_16', 'Hour_17',
       'Hour_18', 'Month_9', 'Month_10', 'Month_11', 'Month_12', 'Speed_2',
       'Speed_3', 'Humidity_2', 'WindDirection_2', 'Pressure_2']]
y = sun_df['Radiation']

# split into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state=66)
#, 'WindDirection(Degrees)'

#Instantiate
lm_model = LinearRegression(normalize=True) 

#Fit - why does this break?
lm_model.fit(X_train, y_train) 

# scores
cv_scores(lm_model)

R score mean: 0.7247803014935117
R score STD: 0.009656203014365735


# Ridge Regression

In [76]:
from sklearn.linear_model import Ridge

# Train model with default alpha=1
ridge = Ridge(alpha=1).fit(X_train, y_train)
# get cross val scores
cv_scores(ridge)

R score mean: 0.7248166350733214
R score STD: 0.009668938886184721


### Grid Search

In [88]:
# set up a grid search over parameters
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit

# set up dictionary for grid search 
param_grid = {'alpha':[0.005, 0.05, 0.1, 0.2, 0.3, 0.5, 1.]}
# set up cross-validation shuffles
shuffle_split = ShuffleSplit(test_size=0.3, train_size=0.7, n_splits=25)
# set up search
grid_search = GridSearchCV(estimator=ridge, param_grid=param_grid, cv=shuffle_split, scoring='r2', 
                              return_train_score=True)
# implement search
grid_search.fit(X,y)
# move results into DataFrame
results = pd.DataFrame(grid_search.cv_results_)
#print(results)
print('Best Score: ', grid_search.best_score_)
print('Best Params: ', grid_search.best_params_)


Best Score:  0.7247560227410288
Best Params:  {'alpha': 0.2}


# Lasso Regression

In [79]:
from sklearn.linear_model import Lasso

# Train model with default alpha=1
lasso = Lasso(alpha=1).fit(X_train, y_train)
# get cross val scores
cv_scores(lasso)

R score mean: 0.715844742638601
R score STD: 0.0100963388376416


In [90]:
# set up dictionary for grid search 
param_grid = {'alpha':[0.001, 0.005, 0.05, 0.1, 0.2, 0.3, 0.5, 1.]}
# set up cross-validation shuffles
shuffle_split = ShuffleSplit(test_size=0.3, train_size=0.7, n_splits=25)
# set up search
grid_search = GridSearchCV(estimator=lasso, param_grid=param_grid, cv=shuffle_split, scoring='r2', 
                              return_train_score=True)
# implement search
grid_search.fit(X,y)
# move results into DataFrame
results = pd.DataFrame(grid_search.cv_results_)
#print(results)
print('Best Score: ', grid_search.best_score_)
print('Best Params: ', grid_search.best_params_)

Best Score:  0.724665613412773
Best Params:  {'alpha': 0.005}
