In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Read the energy CSV directly from the UCI archive URL into a DataFrame.
data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00374/energydata_complete.csv'
energy_data = pd.read_csv(data_url)

In [3]:
# Preview the first five rows of the freshly loaded frame.
energy_data.head(n=5)

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


In [4]:
# Remove the 'date' and 'lights' columns; every remaining column is kept as-is.
# NOTE: this rebinds energy_data, so re-running this cell on the already
# trimmed frame raises KeyError (the columns no longer exist).
columns_to_remove = ['date', 'lights']
energy_data = energy_data.drop(columns=columns_to_remove)

In [5]:
# Preview the first five rows again to confirm 'date' and 'lights' are gone.
energy_data.head(n=5)

Unnamed: 0,Appliances,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,60,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,45.566667,17.166667,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,60,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,45.9925,17.166667,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,50,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,45.89,17.166667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,50,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,45.723333,17.166667,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,60,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,45.53,17.2,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


In [6]:
# Count missing values per column (isna is the same check as isnull).
energy_data.isna().sum()

Appliances     0
T1             0
RH_1           0
T2             0
RH_2           0
T3             0
RH_3           0
T4             0
RH_4           0
T5             0
RH_5           0
T6             0
RH_6           0
T7             0
RH_7           0
T8             0
RH_8           0
T9             0
RH_9           0
T_out          0
Press_mm_hg    0
RH_out         0
Windspeed      0
Visibility     0
Tdewpoint      0
rv1            0
rv2            0
dtype: int64

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
energy_data.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Appliances,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
count,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,...,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0
mean,97.694958,21.686571,40.259739,20.341219,40.42042,22.267611,39.2425,20.855335,39.026904,19.592106,...,19.485828,41.552401,7.411665,755.522602,79.750418,4.039752,38.330834,3.760707,24.988033,24.988033
std,102.524891,1.606066,3.979299,2.192974,4.069813,2.006111,3.254576,2.042884,4.341321,1.844623,...,2.014712,4.151497,5.317409,7.399441,14.901088,2.451221,11.794719,4.194648,14.496634,14.496634
min,10.0,16.79,27.023333,16.1,20.463333,17.2,28.766667,15.1,27.66,15.33,...,14.89,29.166667,-5.0,729.3,24.0,0.0,1.0,-6.6,0.005322,0.005322
25%,50.0,20.76,37.333333,18.79,37.9,20.79,36.9,19.53,35.53,18.2775,...,18.0,38.5,3.666667,750.933333,70.333333,2.0,29.0,0.9,12.497889,12.497889
50%,60.0,21.6,39.656667,20.0,40.5,22.1,38.53,20.666667,38.4,19.39,...,19.39,40.9,6.916667,756.1,83.666667,3.666667,40.0,3.433333,24.897653,24.897653
75%,100.0,22.6,43.066667,21.5,43.26,23.29,41.76,22.1,42.156667,20.619643,...,20.6,44.338095,10.408333,760.933333,91.666667,5.5,40.0,6.566667,37.583769,37.583769
max,1080.0,26.26,63.36,29.856667,56.026667,29.236,50.163333,26.2,51.09,25.795,...,24.5,53.326667,26.1,772.3,100.0,14.0,66.0,15.5,49.99653,49.99653


In [8]:
# Pairwise Pearson correlation between all columns.
energy_data.corr(method='pearson')

Unnamed: 0,Appliances,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
Appliances,1.0,0.055447,0.086031,0.120073,-0.060465,0.08506,0.036292,0.040281,0.016965,0.01976,...,0.01001,-0.051462,0.099155,-0.034885,-0.152282,0.087122,0.00023,0.015353,-0.011145,-0.011145
T1,0.055447,1.0,0.164006,0.836834,-0.002509,0.892402,-0.02855,0.877001,0.097861,0.885247,...,0.844777,0.071756,0.682846,-0.150574,-0.345481,-0.087654,-0.07621,0.571309,-0.006203,-0.006203
RH_1,0.086031,0.164006,1.0,0.269839,0.797535,0.25323,0.844677,0.10618,0.880359,0.205797,...,0.115263,0.764001,0.340767,-0.293957,0.274126,0.204932,-0.021057,0.639106,-0.000699,-0.000699
T2,0.120073,0.836834,0.269839,1.0,-0.16561,0.735245,0.121497,0.762066,0.231563,0.72055,...,0.675535,0.157346,0.792255,-0.133028,-0.505291,0.052495,-0.069721,0.582602,-0.011087,-0.011087
RH_2,-0.060465,-0.002509,0.797535,-0.16561,1.0,0.137319,0.678326,-0.047304,0.721435,0.110409,...,0.054544,0.676467,0.033674,-0.255646,0.584911,0.06919,-0.005368,0.499152,0.006275,0.006275
T3,0.08506,0.892402,0.25323,0.735245,0.137319,1.0,-0.011234,0.852778,0.122737,0.888169,...,0.901324,0.134602,0.699417,-0.189974,-0.281718,-0.100776,-0.10231,0.645886,-0.005194,-0.005194
RH_3,0.036292,-0.02855,0.844677,0.121497,0.678326,-0.011234,1.0,-0.140457,0.898978,-0.050062,...,-0.19527,0.833538,0.118207,-0.233274,0.356192,0.263188,0.017041,0.414387,-0.000477,-0.000477
T4,0.040281,0.877001,0.10618,0.762066,-0.047304,0.852778,-0.140457,1.0,-0.04865,0.871813,...,0.889439,-0.025549,0.663478,-0.075292,-0.388602,-0.185747,-0.104768,0.519471,-0.001815,-0.001815
RH_4,0.016965,0.097861,0.880359,0.231563,0.721435,0.122737,0.898978,-0.04865,1.0,0.091812,...,-0.044518,0.856591,0.293289,-0.250748,0.336813,0.300192,0.002636,0.616509,-0.001787,-0.001787
T5,0.01976,0.885247,0.205797,0.72055,0.110409,0.888169,-0.050062,0.871813,0.091812,1.0,...,0.911055,0.072308,0.651321,-0.170999,-0.273953,-0.145011,-0.084164,0.588362,-0.00549,-0.00549


In [10]:
# Rescale every column of energy_data with MinMaxScaler and wrap the result
# back into a DataFrame that keeps the original column names.
# NOTE(review): the scaler is fitted on all rows and all columns of
# energy_data; the train/test split only happens in a later cell, so test rows
# influence the min/max used for scaling, and the target column is scaled too.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(energy_data)
normalised_data = pd.DataFrame(scaled_values, columns=energy_data.columns)

In [11]:
# Preview the first five rows of the rescaled frame.
normalised_data.head(n=5)

Unnamed: 0,Appliances,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,0.046729,0.32735,0.566187,0.225345,0.684038,0.215188,0.746066,0.351351,0.764262,0.175506,...,0.223032,0.67729,0.37299,0.097674,0.894737,0.5,0.953846,0.538462,0.265449,0.265449
1,0.046729,0.32735,0.541326,0.225345,0.68214,0.215188,0.748871,0.351351,0.782437,0.175506,...,0.2265,0.678532,0.369239,0.1,0.894737,0.47619,0.894872,0.533937,0.372083,0.372083
2,0.037383,0.32735,0.530502,0.225345,0.679445,0.215188,0.755569,0.344745,0.778062,0.175506,...,0.219563,0.676049,0.365488,0.102326,0.894737,0.452381,0.835897,0.529412,0.572848,0.572848
3,0.037383,0.32735,0.52408,0.225345,0.678414,0.215188,0.758685,0.341441,0.770949,0.175506,...,0.219563,0.671909,0.361736,0.104651,0.894737,0.428571,0.776923,0.524887,0.908261,0.908261
4,0.046729,0.32735,0.531419,0.225345,0.676727,0.215188,0.758685,0.341441,0.762697,0.178691,...,0.219563,0.671909,0.357985,0.106977,0.894737,0.404762,0.717949,0.520362,0.201611,0.201611


In [12]:
# Split the normalised data into predictors (x) and target (y).
# The target is selected by label rather than by position so the split does
# not silently break if the column order of the frame ever changes.
# y is kept as a one-column DataFrame (double brackets), matching the shape
# the downstream cells were written against.
x = normalised_data.drop(columns='Appliances')
y = normalised_data[['Appliances']]

In [13]:
# Display the predictor frame (every normalised column except Appliances).
x

Unnamed: 0,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,0.327350,0.566187,0.225345,0.684038,0.215188,0.746066,0.351351,0.764262,0.175506,0.381691,...,0.223032,0.677290,0.372990,0.097674,0.894737,0.500000,0.953846,0.538462,0.265449,0.265449
1,0.327350,0.541326,0.225345,0.682140,0.215188,0.748871,0.351351,0.782437,0.175506,0.381691,...,0.226500,0.678532,0.369239,0.100000,0.894737,0.476190,0.894872,0.533937,0.372083,0.372083
2,0.327350,0.530502,0.225345,0.679445,0.215188,0.755569,0.344745,0.778062,0.175506,0.380037,...,0.219563,0.676049,0.365488,0.102326,0.894737,0.452381,0.835897,0.529412,0.572848,0.572848
3,0.327350,0.524080,0.225345,0.678414,0.215188,0.758685,0.341441,0.770949,0.175506,0.380037,...,0.219563,0.671909,0.361736,0.104651,0.894737,0.428571,0.776923,0.524887,0.908261,0.908261
4,0.327350,0.531419,0.225345,0.676727,0.215188,0.758685,0.341441,0.762697,0.178691,0.380037,...,0.219563,0.671909,0.357985,0.106977,0.894737,0.404762,0.717949,0.520362,0.201611,0.201611
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19730,0.926786,0.537657,0.711655,0.606309,0.830841,0.579374,0.864865,0.765258,0.752031,0.339590,...,0.864724,0.729443,0.891747,0.602326,0.416667,0.238095,0.348718,0.901961,0.861981,0.861981
19731,0.919747,0.536006,0.701769,0.607836,0.825302,0.582178,0.864865,0.765258,0.754897,0.338487,...,0.864724,0.729443,0.887460,0.602326,0.421053,0.250000,0.361538,0.900452,0.985726,0.985726
19732,0.919747,0.538666,0.692651,0.627198,0.818378,0.603988,0.864865,0.771233,0.754897,0.337585,...,0.864724,0.729443,0.883173,0.602326,0.425439,0.261905,0.374359,0.898944,0.583979,0.583979
19733,0.919747,0.549491,0.677054,0.634717,0.805085,0.585294,0.864865,0.773794,0.752031,0.336583,...,0.864724,0.730581,0.878885,0.602326,0.429825,0.273810,0.387179,0.897436,0.126371,0.126371


In [14]:
# Display the target frame (the single normalised Appliances column).
y

Unnamed: 0,Appliances
0,0.046729
1,0.046729
2,0.037383
3,0.037383
4,0.046729
...,...
19730,0.084112
19731,0.074766
19732,0.242991
19733,0.383178


In [16]:
# Hold out 30% of the rows as a test set; random_state fixes the shuffle so
# the split is reproducible. The `tts` alias is reused by later cells.
from sklearn.model_selection import train_test_split as tts

x_train, x_test, y_train, y_test = tts(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

In [17]:
# Fit an ordinary least-squares model on the training split.
from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construction and fitting can be chained.
linear_model = LinearRegression().fit(x_train, y_train)
linear_model

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [18]:
# Predict the target for the held-out test rows with the fitted linear model.
linear_model_pred = linear_model.predict(X=x_test)

In [49]:
# Question 13 - Mean Absolute Error
# Average absolute difference between the test targets and the predictions.
from sklearn.metrics import mean_absolute_error as mae

linear_model_mae = mae(y_true=y_test, y_pred=linear_model_pred)
mae_message = 'Mean Absolute Error of Linear Model is %.2f' % linear_model_mae
print(mae_message)

Mean Absolute Error of Linear Model is 0.05


In [50]:
# QUESTION 16 - COEFFICIENT OF DETERMINATION OF MULTIPLE LINEAR MODEL
# R-squared of the linear model's predictions on the test split.
from sklearn.metrics import r2_score as r2

linear_model_rsq = r2(y_true=y_test, y_pred=linear_model_pred)
rsq_message = 'R-SQUARED of Linear model is %.2f' % linear_model_rsq
print(rsq_message)

R-SQUARED of Linear model is 0.15


In [26]:
# Question 15 - ROOT MEAN SQUARE ERROR
# Square root of the mean squared error on the test split.
from sklearn.metrics import mean_squared_error as mse

linear_model_mse = mse(y_test, linear_model_pred)
linear_model_rmse = np.sqrt(linear_model_mse)
print('Root Mean Squared Error is %f' % linear_model_rmse)

Root Mean Squared Error is 0.087514


In [27]:
# Question 14 - RESIDUAL SUM OF SQUARES
# Sum of squared differences between the test targets and the predictions.
# Both sides are flattened to 1-D arrays first: subtracting an ndarray from a
# DataFrame and summing yields a one-element Series, and formatting that with
# '%f' depends on implicit float(Series) conversion, which recent pandas
# versions deprecate. Flattening also prevents accidental (n,) vs (n, 1)
# broadcasting. The result is stored as a plain Python float.
residuals = np.ravel(y_test) - np.ravel(linear_model_pred)
rss = float(np.sum(np.square(residuals)))
print('RSS is %f' % rss)

RSS is 45.347630


In [28]:
# Question 17 - Linear Model Weights Estimation
# Pair every feature name with its fitted coefficient, sort ascending by
# coefficient, and lay the result out as a two-column table.
coef = linear_model.coef_.flatten()
weights = pd.Series(coef, index=x_train.columns).sort_values()
weights_df = pd.DataFrame(
    {
        'Columns': weights.index,
        'Linear Model Coefficient': weights.values,
    }
)

In [31]:
# Show the coefficient table ordered from most negative to most positive.
weights_df.sort_values('Linear Model Coefficient', ascending=True)

Unnamed: 0,Columns,Linear Model Coefficient
0,RH_2,-0.456698
1,T_out,-0.32186
2,T2,-0.236178
3,T9,-0.189941
4,RH_8,-0.157595
5,RH_out,-0.077671
6,RH_7,-0.044614
7,RH_9,-0.0398
8,T5,-0.015657
9,T1,-0.003281


In [43]:
# model between T2 and T6
# Select the two columns by label instead of by position (3:4 and 11:12) so
# the intent is readable and does not depend on column order. Double brackets
# keep each selection as a one-column DataFrame, as before.
xt = normalised_data[['T2']]
yt = normalised_data[['T6']]

In [66]:
# Question 12 - Linear regression on T2 and T6
# Simple one-feature regression: predict T6 from T2 and report R-squared on
# the held-out 30% of rows.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score as r2

# Columns are picked by label rather than by position (3:4 / 11:12) so the
# cell keeps working if the column order changes. Double brackets keep both
# selections as one-column DataFrames.
# NOTE: this rebinds the notebook-wide x, y and train/test names; later cells
# re-create them before use.
x = normalised_data[['T2']]
y = normalised_data[['T6']]

x_train, x_test, y_train, y_test = tts(x, y, test_size=0.3, random_state=42)

reg = LinearRegression()
reg.fit(x_train, y_train)

pred = reg.predict(x_test)

# R SQUARED
rsq = r2(y_test, pred)
print('R-SQUARED of Linear model of T2 and T6 is %.2f' % rsq)

R-SQUARED of Linear model of T2 and T6 is 0.64


In [64]:
# Question 18 - Ridge Regression
# Fit a ridge model (alpha=0.4) on all predictors and report the RMSE on the
# held-out 30% of rows.
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error as mse

# Re-create the full feature/target split (an earlier cell rebinds x and y to
# single columns). The target is selected by label so the split does not
# depend on Appliances being the first column.
x = normalised_data.drop(columns='Appliances')
y = normalised_data[['Appliances']]

x_train, x_test, y_train, y_test = tts(x, y, test_size=0.3, random_state=42)

ridge = Ridge(alpha=0.4)
ridge.fit(x_train, y_train)

r_pred = ridge.predict(x_test)

# ROOT MEAN SQUARE ERROR of Ridge Regression
ridge_rmse = np.sqrt(mse(y_test, r_pred))
print('Root Mean Squared Error is %.2f' % ridge_rmse)

Root Mean Squared Error is 0.09


In [55]:
# Question 19 - Lasso Regression
# Fit a lasso model (alpha=0.001) on all predictors, print its test
# predictions, and tabulate the fitted weight of every feature.
from sklearn.linear_model import Lasso

# The target is selected by label so the split does not depend on Appliances
# being the first column; y stays a one-column DataFrame.
x = normalised_data.drop(columns='Appliances')
y = normalised_data[['Appliances']]

x_train, x_test, y_train, y_test = tts(x, y, test_size=0.3, random_state=42)

lasso = Lasso(alpha=0.001)
lasso.fit(x_train, y_train)

l_pred = lasso.predict(x_test)
print(l_pred)

# One row per feature, indexed by feature name.
we = pd.DataFrame(lasso.coef_.flatten(), index=x_train.columns, columns=['Lasso Weights'])

[0.07370267 0.08143458 0.07716072 ... 0.07792848 0.09034412 0.08359255]


In [56]:
# Display the per-feature lasso weights table built in the previous cell.
we

Unnamed: 0,Lasso Weights
T1,0.0
RH_1,0.01788
T2,0.0
RH_2,-0.0
T3,0.0
RH_3,0.0
T4,-0.0
RH_4,0.0
T5,-0.0
RH_5,0.0


In [53]:
# QUESTION 20 - ROOT MEAN SQUARE ERROR OF LASSO REGRESSION
# Square root of the mean squared error of the lasso predictions on the test split.
from sklearn.metrics import mean_squared_error as mse

lasso_mse = mse(y_test, l_pred)
lasso_rmse = np.sqrt(lasso_mse)
print('Root Mean Squared Error is %.3f' % lasso_rmse)

Root Mean Squared Error is 0.095
