In [1]:
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt
import scipy
import statsmodels.api as sm
from scipy.stats import norm
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [2]:
dataset = pd.read_csv('energy.csv')
dataset.describe()

Unnamed: 0,Ambient Temperature,Exhaust Vacuum,Ambient Pressure,Relative Humidity,Net hourly electrical energy outpu
count,9568.0,9568.0,9568.0,9568.0,9568.0
mean,19.651231,54.305804,1013.259078,73.308978,454.365009
std,7.452473,12.707893,5.938784,14.600269,17.066995
min,1.81,25.36,992.89,25.56,420.26
25%,13.51,41.74,1009.1,63.3275,439.75
50%,20.345,52.08,1012.94,74.975,451.55
75%,25.72,66.54,1017.26,84.83,468.43
max,37.11,81.56,1033.3,100.16,495.76


In [3]:
print(dataset.isnull().sum())

Ambient Temperature                   0
Exhaust Vacuum                        0
Ambient Pressure                      0
Relative Humidity                     0
Net hourly electrical energy outpu    0
dtype: int64


In [4]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(dataset.values, i) for i in range(dataset.shape[1])]
vif["features"] = dataset.columns
vif.round(2)

Unnamed: 0,VIF Factor,features
0,87.95,Ambient Temperature
1,84.04,Exhaust Vacuum
2,10249.44,Ambient Pressure
3,42.89,Relative Humidity
4,8110.56,Net hourly electrical energy outpu


In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
x = dataset.iloc[:,1:]
y = dataset.iloc[:,0]
x_train, x_test, y_train, y_test = train_test_split (x, y, test_size = 0.2, random_state=123)
sc_x = StandardScaler()
x_train = sc_x.fit_transform(x_train)
x_test = sc_x.transform(x_test)

In [6]:
import statsmodels.api as sm
X2 = sm.add_constant(x_train)
est = sm.OLS(y_train, X2)
est2 = est.fit()
print(est2.summary())

                             OLS Regression Results                            
Dep. Variable:     Ambient Temperature   R-squared:                       0.940
Model:                             OLS   Adj. R-squared:                  0.940
Method:                  Least Squares   F-statistic:                 2.990e+04
Date:                 Mon, 22 Apr 2019   Prob (F-statistic):               0.00
Time:                         15:13:07   Log-Likelihood:                -15468.
No. Observations:                 7654   AIC:                         3.095e+04
Df Residuals:                     7649   BIC:                         3.098e+04
Df Model:                            4                                         
Covariance Type:             nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         19.6079      0.021    939.26

In [7]:
x_train, x_test, y_train, y_test = train_test_split (x, y, test_size = 0.05, random_state=123)

In [8]:
import statsmodels.api as sm
X2 = sm.add_constant(x_train)
est = sm.OLS(y_train, X2)
est2 = est.fit()
print(est2.summary())

                             OLS Regression Results                            
Dep. Variable:     Ambient Temperature   R-squared:                       0.939
Model:                             OLS   Adj. R-squared:                  0.939
Method:                  Least Squares   F-statistic:                 3.514e+04
Date:                 Mon, 22 Apr 2019   Prob (F-statistic):               0.00
Time:                         15:13:07   Log-Likelihood:                -18427.
No. Observations:                 9089   AIC:                         3.686e+04
Df Residuals:                     9084   BIC:                         3.690e+04
Df Model:                            4                                         
Covariance Type:             nonrobust                                         
                                         coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------

In [9]:
# Fitting Multiple Linear Regression to the Training set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(x_train, y_train)

# Predicting the Test set results
y_pred = regressor.predict(x_test)
coeff_df = pd.DataFrame(regressor.coef_, x.columns, columns=['Coefficient'])  
coeff_df

Unnamed: 0,Coefficient
Exhaust Vacuum,0.06543
Ambient Pressure,-0.072358
Relative Humidity,-0.109556
Net hourly electrical energy outpu,-0.322485


In [10]:
final_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred, 'Diff': y_pred - y_test})  
final_df.head(100)

Unnamed: 0,Actual,Predicted,Diff
6503,23.91,19.273687,-4.636313
1586,28.40,25.649787,-2.750213
7033,13.86,13.833248,-0.026752
1484,8.83,9.314838,0.484838
6938,25.77,23.415823,-2.354177
3029,16.20,15.072526,-1.127474
5469,10.29,8.091723,-2.198277
2197,28.85,29.266583,0.416583
449,29.23,29.932242,0.702242
5652,27.20,27.062016,-0.137984


In [11]:
from sklearn import metrics  
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 1.4504792267272633
Mean Squared Error: 3.4198697029648915
Root Mean Squared Error: 1.8492889722714758
