In [1]:
# Normalizing data
from sklearn.preprocessing import StandardScaler

# Dimension Reduction
from sklearn.decomposition import PCA

# Modeling Utils
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Load Data
import pickle

# Data Manipulation
import numpy as np

In [2]:
with open('../data/cleaned.pickle', 'rb') as handle:
    data = pickle.load(handle)
originalX = data.loc[:,data.columns!="KWH"].to_numpy()
y = data.loc[:,data.columns=="KWH"].to_numpy().flatten()
scaler = StandardScaler()
X = scaler.fit_transform(originalX)

## Tried PCA, but lost information while reducing dimensionality
# pca = PCA(n_components=300)
# X = pca.fit_transform(X)

In [3]:
trainX, testX, trainy, testy = train_test_split(X,y,train_size=0.7)

# Linear Regression

In [11]:
from sklearn.linear_model import LinearRegression
model = LinearRegression().fit(trainX, trainy)
pred = model.predict(testX)
mse = mean_squared_error(pred, testy)
print("Linear Regression with Mean Squared Error:", mse)
print("R^2:",model.score(testX, testy))
print("Possible overfitting?")

Linear Regression with Mean Squared Error: 3.1675833505639946e-05
R^2: 0.9999999999994712
Possible overfitting?


# Ridge Regression

In [4]:
from sklearn.linear_model import RidgeCV
model = RidgeCV(alphas = np.linspace(0.01, 50, 50), cv=5).fit(trainX, trainy)
pred = model.predict(testX)
mse = mean_squared_error(pred, testy)
print("Ridge Regression Best alpha:",model.alpha_, "with Mean Squared Error:", mse)
print("Explanability(R^2):", model.score(testX, testy))

Ridge Regression Best alpha: 0.01 with Mean Squared Error: 0.4256219919156357
Explanability(R^2): 0.9999999928949019


# Lasso Regression

In [5]:
from sklearn.linear_model import LassoCV
model = LassoCV(alphas = np.linspace(0.01, 50, 200), cv=10, random_state=0).fit(trainX, trainy)
pred = model.predict(testX)
mse = mean_squared_error(pred, testy)
print("Lasso Regression Best alpha:",model.alpha_, "with Mean Squared Error:", mse)
print("Explanability(R^2):", model.score(testX, testy))
cnt = 0
for i in model.coef_:
    cnt+=1
print("Removed",cnt,"columns in lasso variable selection side effect")

Lasso Regression Best alpha: 0.01 with Mean Squared Error: 19.628470126217238
Explanability(R^2): 0.9999996723331782
Removed 409 columns in lasso variable selection side effect


# Elastic Net

In [6]:
# from sklearn.linear_model import ElasticNetCV
# model = ElasticNetCV(l1_ratio = np.linspace(1, 50, 50),\
#                      alphas = np.linspace(1, 50, 50), cv=5, random_state=0,max_iter = 1000000000000).fit(trainX, trainy)
# pred = model.predict(testX)
# mse = mean_squared_error(pred, testy)
# print("Lasso Regression Best alpha:",model.alpha_, "with Mean Squared Error:", mse)
# print("Explanability(R^2):", model.score(testX, testy))
# cnt = 0
# for i in model.coef_:
#     cnt+=1
# print("Removed",cnt,"columns in lasso variable selection side effect")

# SVR

In [7]:
# from sklearn.svm import SVR
# model = SVR(C = 0.2).fit(trainX, trainy)
# pred = model.predict(testX)
# mse = mean_squared_error(pred, testy)
# print("SVR with Mean Squared Error:", mse)

SVR with Mean Squared Error: 61941376.425311156
