In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

In [2]:
# Load the energy dataset into a DataFrame.
# The path is relative, so the CSV must sit in the notebook's working directory.
df = pd.read_csv('energydata_complete.csv')

In [3]:
# Preview the first rows to sanity-check that the file loaded as expected
df.head()

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,T6,RH_6,T7,RH_7,T8,RH_8,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,45.566667,17.166667,55.2,7.026667,84.256667,17.2,41.626667,18.2,48.9,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,45.9925,17.166667,55.2,6.833333,84.063333,17.2,41.56,18.2,48.863333,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,45.89,17.166667,55.09,6.56,83.156667,17.2,41.433333,18.2,48.73,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,45.723333,17.166667,55.09,6.433333,83.423333,17.133333,41.29,18.1,48.59,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,45.53,17.2,55.09,6.366667,84.893333,17.2,41.23,18.1,48.59,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


In [4]:
# Dimensions of the dataset as (rows, columns)
df.shape

(19735, 29)

In [5]:
# Column dtypes and non-null counts for every column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19735 entries, 0 to 19734
Data columns (total 29 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   date         19735 non-null  object 
 1   Appliances   19735 non-null  int64  
 2   lights       19735 non-null  int64  
 3   T1           19735 non-null  float64
 4   RH_1         19735 non-null  float64
 5   T2           19735 non-null  float64
 6   RH_2         19735 non-null  float64
 7   T3           19735 non-null  float64
 8   RH_3         19735 non-null  float64
 9   T4           19735 non-null  float64
 10  RH_4         19735 non-null  float64
 11  T5           19735 non-null  float64
 12  RH_5         19735 non-null  float64
 13  T6           19735 non-null  float64
 14  RH_6         19735 non-null  float64
 15  T7           19735 non-null  float64
 16  RH_7         19735 non-null  float64
 17  T8           19735 non-null  float64
 18  RH_8         19735 non-null  float64
 19  T9  

In [6]:
# Count the missing values in each column (all zeros means nothing to impute)
df.isna().sum()

date           0
Appliances     0
lights         0
T1             0
RH_1           0
T2             0
RH_2           0
T3             0
RH_3           0
T4             0
RH_4           0
T5             0
RH_5           0
T6             0
RH_6           0
T7             0
RH_7           0
T8             0
RH_8           0
T9             0
RH_9           0
T_out          0
Press_mm_hg    0
RH_out         0
Windspeed      0
Visibility     0
Tdewpoint      0
rv1            0
rv2            0
dtype: int64

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,T6,RH_6,T7,RH_7,T8,RH_8,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
count,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0
mean,97.694958,3.801875,21.686571,40.259739,20.341219,40.42042,22.267611,39.2425,20.855335,39.026904,19.592106,50.949283,7.910939,54.609083,20.267106,35.3882,22.029107,42.936165,19.485828,41.552401,7.411665,755.522602,79.750418,4.039752,38.330834,3.760707,24.988033,24.988033
std,102.524891,7.935988,1.606066,3.979299,2.192974,4.069813,2.006111,3.254576,2.042884,4.341321,1.844623,9.022034,6.090347,31.149806,2.109993,5.114208,1.956162,5.224361,2.014712,4.151497,5.317409,7.399441,14.901088,2.451221,11.794719,4.194648,14.496634,14.496634
min,10.0,0.0,16.79,27.023333,16.1,20.463333,17.2,28.766667,15.1,27.66,15.33,29.815,-6.065,1.0,15.39,23.2,16.306667,29.6,14.89,29.166667,-5.0,729.3,24.0,0.0,1.0,-6.6,0.005322,0.005322
25%,50.0,0.0,20.76,37.333333,18.79,37.9,20.79,36.9,19.53,35.53,18.2775,45.4,3.626667,30.025,18.7,31.5,20.79,39.066667,18.0,38.5,3.666667,750.933333,70.333333,2.0,29.0,0.9,12.497889,12.497889
50%,60.0,0.0,21.6,39.656667,20.0,40.5,22.1,38.53,20.666667,38.4,19.39,49.09,7.3,55.29,20.033333,34.863333,22.1,42.375,19.39,40.9,6.916667,756.1,83.666667,3.666667,40.0,3.433333,24.897653,24.897653
75%,100.0,0.0,22.6,43.066667,21.5,43.26,23.29,41.76,22.1,42.156667,20.619643,53.663333,11.256,83.226667,21.6,39.0,23.39,46.536,20.6,44.338095,10.408333,760.933333,91.666667,5.5,40.0,6.566667,37.583769,37.583769
max,1080.0,70.0,26.26,63.36,29.856667,56.026667,29.236,50.163333,26.2,51.09,25.795,96.321667,28.29,99.9,26.0,51.4,27.23,58.78,24.5,53.326667,26.1,772.3,100.0,14.0,66.0,15.5,49.99653,49.99653


Q12

In [8]:
# Single-feature regression set-up:
#   predictor x = T2 (temperature in the living room)
#   target    y = T6 (temperature outside the building)
# Bracket indexing is used instead of attribute access so the column lookup can
# never be confused with DataFrame attributes such as `df.T` (the transpose).
y = df['T6']
x = df['T2']
# scikit-learn expects a 2-D feature matrix, hence the (n_samples, 1) reshape
X = x.to_numpy().reshape(-1, 1)

In [9]:
# Hold out 30% of the rows for validation; random_state=42 makes the split repeatable
train_X, val_X, train_y, val_y = train_test_split(
    X, y, test_size=0.3, random_state=42
)
# Build the estimator and fit it on the training portion (fit() returns the estimator)
model_df = LinearRegression().fit(train_X, train_y)
# Predict the target for the held-out rows
predic_val = model_df.predict(val_X)

In [10]:
# Coefficient of determination (R^2) of the single-feature model on the validation split
r_square = r2_score(val_y, predic_val)
# Use the built-in round(): it works whether r2_score hands back a plain Python
# float or a NumPy scalar, whereas a plain float has no .round() method.
round(r_square, 2)

0.64

Q13

In [11]:
# Bring every remaining column onto a common scale with min-max scaling.
# 'date' and 'lights' are removed first and take no part in the model.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so validation rows influence the scaling -- confirm this is intended.
scaler = MinMaxScaler()
df_1 = df.drop(['date', 'lights'], axis=1)
normalised_df = pd.DataFrame(
    data=scaler.fit_transform(df_1),
    columns=df_1.columns,
)
# Separate the scaled target from the scaled predictors
target_var = normalised_df['Appliances']
features_df = normalised_df.drop(columns='Appliances')

In [12]:
# Split the normalised features/target 70-30 into training and validation sets
# (random_state=42 keeps the split reproducible).
# No estimator is created here: the multi-feature model is instantiated and fitted
# in the next cell, and re-binding `model_df` would only replace the fitted
# single-feature model with an unused, unfitted one.
train_X, val_X, train_y, val_y = train_test_split(features_df, target_var, test_size=0.3, random_state=42)

In [13]:
# Ordinary least squares on all normalised features; fit() returns the estimator
model_df1 = LinearRegression().fit(train_X, train_y)
# Predictions for the validation rows, reused by the metric cells below
predicted_values = model_df1.predict(val_X)

In [14]:
# Mean absolute error of the linear model on the validation split, to 2 d.p.
mae = mean_absolute_error(y_true=val_y, y_pred=predicted_values)
round(mae, 2)

0.05

Q14

In [15]:
# Residual sum of squares: add up the squared validation errors, to 2 d.p.
rss = np.square(val_y - predicted_values).sum()
round(rss, 2)

45.35

Q15

In [16]:
# Root mean squared error = square root of the MSE on the validation split, to 3 d.p.
RMSE = np.sqrt(mean_squared_error(y_true=val_y, y_pred=predicted_values))
round(RMSE, 3)

0.088

Q16

In [17]:
# Coefficient of determination (R^2) of the multi-feature model, to 2 d.p.
R2_score = r2_score(y_true=val_y, y_pred=predicted_values)
round(R2_score, 2)

0.15

Q17

In [18]:
def get_weights_df(model, feat, col_name, decimals=None):
    """Return a fitted model's coefficients as a DataFrame sorted by weight.

    Parameters
    ----------
    model : object
        Fitted estimator exposing a one-dimensional ``coef_`` attribute.
    feat : pandas.DataFrame
        Frame whose ``columns`` label the coefficients, in the same order.
    col_name : str
        Header to give the weight column.
    decimals : int or None, default None
        When given, round the weights to this many decimal places. The default
        leaves them untouched: rounding would turn very small coefficients into
        0 and change the outcome of any exact-zero filtering done on the result.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``'Features'`` and ``col_name``, ordered by ascending weight.
    """
    weights = pd.Series(model.coef_, index=feat.columns).sort_values()
    weights_df = weights.reset_index()
    weights_df.columns = ['Features', col_name]
    if decimals is not None:
        # Series.round returns a new Series, so the result must be assigned back
        weights_df[col_name] = weights_df[col_name].round(decimals)
    return weights_df
# Tabulate the coefficients of the multi-feature linear model
linear_model_weights = get_weights_df(
    model=model_df1,
    feat=train_X,
    col_name='Linear_Model_Weight',
)


In [19]:
# Display the linear-model coefficient table (sorted by ascending weight)
linear_model_weights

Unnamed: 0,Features,Linear_Model_Weight
0,RH_2,-0.456698
1,T_out,-0.32186
2,T2,-0.236178
3,T9,-0.189941
4,RH_8,-0.157595
5,RH_out,-0.077671
6,RH_7,-0.044614
7,RH_9,-0.0398
8,T5,-0.015657
9,T1,-0.003281


Q18

In [20]:
# Ridge regression with regularisation strength alpha = 0.4
# Build the estimator and fit it on the training split (fit() returns the estimator)
ridge_reg = Ridge(alpha=0.4).fit(train_X, train_y)
# Predict on the validation split
predic = ridge_reg.predict(val_X)

In [21]:
# Root mean squared error of the ridge model on the validation split, to 3 d.p.
Rmse = np.sqrt(mean_squared_error(y_true=val_y, y_pred=predic))
round(Rmse, 3)

0.088

Q19

In [22]:
# Lasso regression with alpha = 0.001; fit() returns the estimator
lasso = Lasso(alpha=0.001).fit(train_X, train_y)
# Predictions on the validation split
lasso_pred = lasso.predict(val_X)
# Tabulate the fitted coefficients, one row per feature, and display them
lasso_weights_df = get_weights_df(lasso, train_X, col_name='Lasso_weight')
lasso_weights_df

Unnamed: 0,Features,Lasso_weight
0,RH_out,-0.049557
1,RH_8,-0.00011
2,T1,0.0
3,Tdewpoint,0.0
4,Visibility,0.0
5,Press_mm_hg,-0.0
6,T_out,0.0
7,RH_9,-0.0
8,T9,-0.0
9,T8,0.0


In [23]:
# Keep only the features whose Lasso coefficient was not shrunk to exactly zero
non_zero = lasso_weights_df.loc[lasso_weights_df['Lasso_weight'].ne(0)]
non_zero

Unnamed: 0,Features,Lasso_weight
0,RH_out,-0.049557
1,RH_8,-0.00011
24,Windspeed,0.002912
25,RH_1,0.01788


Q20

In [24]:
# Score the Lasso model on the validation split
predic_lasso = lasso.predict(val_X)
# RMSE is the square root of the mean squared error, reported to 3 d.p.
Rmse = np.sqrt(mean_squared_error(y_true=val_y, y_pred=predic_lasso))
round(Rmse, 3)

0.094