In [23]:
import pandas as pd
import numpy as np

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_percentage_error as mpe
from math import sqrt
from sklearn.metrics import mean_squared_error

In [17]:
no_coll_feat = pd.read_csv('No collinear dataframe.csv')

In [18]:
# Made the naive baseline prediction, using mean absolute error (MAE) as a metric.
mean = no_coll_feat['power_output'].mean()
dummy = np.full(no_coll_feat['power_output'].shape, mean)
mean_absolute_error(no_coll_feat['power_output'], dummy)

839.583720404172

In [4]:
# Set features and target.
x = no_coll_feat.drop('power_output', axis=1)
y = no_coll_feat['power_output']

In [37]:
# Test train & split the data.
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.33, random_state=24)

In [6]:
scaler = MinMaxScaler(feature_range=(0,1))
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [7]:
y_train.shape

(2822,)

In [20]:
model = LinearRegression()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
mean_absolute_error(y_test, predictions)

396.28209047347707

In [26]:
model = LinearRegression()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
rmse = sqrt(mean_squared_error(y_test, predictions))
rmse

513.7766463638076

In [9]:
model = GradientBoostingRegressor()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
mean_absolute_error(y_test, predictions)

315.8417630348616

In [27]:
model = GradientBoostingRegressor()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
rmse = sqrt(mean_squared_error(y_test, predictions))
rmse

453.3140294866342

In [10]:
model = SVR()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
mean_absolute_error(y_test, predictions)

719.8445632664136

In [28]:
model = SVR()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
rmse = sqrt(mean_squared_error(y_test, predictions))
rmse

832.1967945700634

# Hyper-parameter Tuning of GBR model.

In [11]:
model = GradientBoostingRegressor(n_estimators=250,
                                 loss='huber')
model.fit(X_train, y_train)
predictions = model.predict(X_test)
mean_absolute_error(y_test, predictions)

300.31656466571815

In [29]:
model = GradientBoostingRegressor()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
rmse = sqrt(mean_squared_error(y_test, predictions))
rmse

453.3493534347657

In [36]:
#standard deviation for mae
np.std(np.abs(y_test - predictions))

325.35699760366566

In [34]:
import scipy.stats as st
abs_error = np.abs(y_test - predictions)

In [35]:
st.t.interval(0.95, len(abs_error)-1, loc=np.mean(abs_error), scale=st.sem(abs_error))

(298.5840899290443, 332.8221513136231)

### There is a 95% chance of getting an error from 298 to 332.

In [12]:
grid = {'n_estimators':[10,50,100,150,250,300]}
cv = GridSearchCV(model, grid, cv=5, verbose=True)
cv.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


# Evaluating on the test set

In [13]:
%%timeit
model = cv.best_estimator_
predictions = model.predict(X_test)
mean_absolute_error(y_test, predictions)

8.03 ms ± 2.46 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [14]:
mpe(y_test, predictions)

231.35845430353191