### Importing the Needed Libraries and Reading the Dataset

In [60]:
# Importing the required libraries
import pandas as pd
import numpy as np

In [61]:
# Load the energy dataset from its CSV file into a pandas DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs on another machine if the file is placed at the same location.
csv_path = "C:/Users/PROSPERITY/Downloads/energydata.csv"
df = pd.read_csv(csv_path)

### Linear Regression on a Sample of the Dataset

In [62]:
# Keep only the two columns used for the single-feature regression: T2 and T6
sample_columns = ["T2", "T6"]
sample_df = df[sample_columns]
sample_df.head()

Unnamed: 0,T2,T6
0,19.2,7.026667
1,19.2,6.833333
2,19.2,6.56
3,19.2,6.433333
4,19.2,6.366667


In [63]:
# Dividing the sample data into the predictor (T2) and the target (T6);
# each one is reshaped into a single-column 2-D array
X = sample_df["T2"].to_numpy().reshape(-1, 1)
y = sample_df["T6"].to_numpy().reshape(-1, 1)

In [64]:
# Hold out 30% of the sample as a test set; the fixed random_state makes the
# split repeatable between runs
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [65]:
# Fit an ordinary linear regression of T6 on T2 using the training split
from sklearn.linear_model import LinearRegression

regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

In [66]:
# Predict the target for the held-out test predictors and display the result
pred = regressor.predict(X=X_test)
pred

array([[ 2.15578912],
       [10.01116055],
       [ 1.87391554],
       ...,
       [ 4.24758774],
       [ 8.69822311],
       [ 4.9893603 ]])

In [67]:
# Root mean squared error of the single-feature model on the test split:
# the square root of the mean squared error between y_test and pred
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_true=y_test, y_pred=pred)
rmse = np.sqrt(mse)
print('Root Mean Squared Error: {}'.format(round(rmse, 3)))

Root Mean Squared Error: 3.63


### Linear Regression on the Entire Dataset

In [68]:
# Remove the "date" and "lights" columns; df itself is left untouched
columns_to_drop = ["date", "lights"]
energy_df = df.drop(columns=columns_to_drop)
energy_df.head()

Unnamed: 0,Appliances,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,60,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,45.566667,17.166667,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,60,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,45.9925,17.166667,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,50,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,45.89,17.166667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,50,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,45.723333,17.166667,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,60,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,45.53,17.2,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


In [69]:
# Split the frame into features (every column except "Appliances")
# and the label (the "Appliances" column)
y1 = energy_df["Appliances"]
X1 = energy_df.drop(columns="Appliances")
X1.head()

Unnamed: 0,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,45.566667,17.166667,55.2,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,45.9925,17.166667,55.2,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,45.89,17.166667,55.09,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,45.723333,17.166667,55.09,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,45.53,17.2,55.09,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


In [70]:
# 70/30 train/test split of the full feature set, with a fixed seed so the
# split is repeatable
from sklearn.model_selection import train_test_split

X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1,
    y1,
    test_size=0.3,
    random_state=42,
)

In [71]:
# Put the training features and their label back side by side in one frame;
# the scaler further down is fitted on this combined frame
train_parts = [X1_train, y1_train]
train_df = pd.concat(train_parts, axis="columns")
train_df.head()

Unnamed: 0,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,...,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2,Appliances
9129,21.5,35.626667,17.79,40.59,21.7,35.26,20.39,33.863333,19.6,40.425,...,38.26,0.25,766.4,83.0,2.0,65.0,-2.35,36.226675,36.226675,50
2453,19.5,44.56,18.7,44.29,19.823333,44.5,18.1,43.86,17.2,52.0,...,46.163333,3.166667,765.266667,85.333333,2.0,40.0,0.966667,43.199767,43.199767,30
9152,20.79,35.4,16.89,42.03,21.7,36.0,19.7,33.2,19.29,39.9,...,39.0675,-1.566667,766.0,89.333333,1.333333,60.666667,-3.1,24.976055,24.976055,40
12694,22.1,43.26,19.963333,45.5,23.39,39.79,21.1,39.06,20.66,58.054,...,37.4,8.833333,753.366667,81.0,1.666667,26.0,5.733333,16.161125,16.161125,120
16952,24.7,42.36,29.856667,31.79,26.171429,38.59,25.1,39.76,23.166667,60.13,...,44.466667,21.433333,752.1,51.0,2.0,40.0,10.8,17.055346,17.055346,50


In [72]:
# Put the test features and their label back side by side in one frame,
# mirroring the layout of the training frame
test_parts = [X1_test, y1_test]
test_df = pd.concat(test_parts, axis="columns")
test_df.head()

Unnamed: 0,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,...,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2,Appliances
8980,20.89,35.4,17.76,39.163333,20.29,36.9,19.76,34.2,18.6,51.29,...,39.03,1.766667,768.9,88.333333,2.333333,63.0,0.0,25.622221,25.622221,40
2754,21.89,53.1,21.29,45.36,21.633333,49.226667,20.533333,40.966667,17.677778,58.204444,...,44.2,2.7,754.6,90.0,3.0,27.0,1.2,23.474485,23.474485,90
9132,21.39,35.5,17.633333,40.53,21.666667,35.2,20.29,33.76,19.6,40.29,...,38.29,0.2,766.2,83.0,2.0,65.0,-2.4,0.143368,0.143368,50
14359,21.39,41.033333,23.89,34.84,22.033333,36.933333,22.39,35.236,19.633333,43.266667,...,38.56,8.85,767.8,70.833333,5.166667,40.0,3.716667,10.293451,10.293451,50
8875,19.963333,35.126667,16.463333,40.126667,20.0,36.4,19.26,34.966667,17.89,49.0,...,40.7,-0.466667,769.65,93.333333,4.0,48.833333,-1.45,10.754162,10.754162,70


In [73]:
# Data normalization: fit a MinMaxScaler on the training frame and scale it.
# train_df contains the "Appliances" label as well, so the label is scaled
# together with the features.
from sklearn.preprocessing import MinMaxScaler

mmscaler = MinMaxScaler()
mmscaler.fit(train_df)
norm_train = mmscaler.transform(train_df)

# The scaler returns a plain array, so restore the column names
norm_df_train = pd.DataFrame(data=norm_train, columns=train_df.columns)
norm_df_train.head()

Unnamed: 0,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,...,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2,Appliances
0,0.49736,0.283004,0.12285,0.585114,0.373878,0.303474,0.476577,0.239968,0.408027,0.155065,...,0.37638,0.16881,0.863125,0.776316,0.142857,0.984615,0.193182,0.724509,0.724509,0.037383
1,0.286167,0.576864,0.188999,0.693396,0.217957,0.735317,0.27027,0.681016,0.178691,0.330033,...,0.703504,0.262594,0.836758,0.807018,0.142857,0.6,0.343939,0.864018,0.864018,0.018692
2,0.422386,0.275548,0.057427,0.627256,0.373878,0.338059,0.414414,0.210702,0.378404,0.147129,...,0.409803,0.110397,0.853819,0.859649,0.095238,0.917949,0.159091,0.49942,0.49942,0.028037
3,0.560718,0.534101,0.280834,0.728807,0.51429,0.515189,0.540541,0.469242,0.509317,0.421545,...,0.340784,0.444802,0.559907,0.75,0.119048,0.384615,0.560606,0.323062,0.323062,0.102804
4,0.835269,0.504496,1.0,0.327578,0.745383,0.459106,0.900901,0.500126,0.748845,0.452926,...,0.633278,0.849946,0.530438,0.355263,0.142857,0.6,0.790909,0.340952,0.340952,0.037383


In [74]:
# Scale the test frame with the scaler already fitted on the training frame
# (transform only, no re-fitting), then restore the column names
norm_test = mmscaler.transform(test_df)
norm_df_test = pd.DataFrame(data=norm_test, columns=test_df.columns)
norm_df_test.head()

Unnamed: 0,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,...,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2,Appliances
0,0.432946,0.275548,0.120669,0.543362,0.25673,0.380122,0.41982,0.254822,0.31247,0.319301,...,0.408251,0.217578,0.921287,0.846491,0.166667,0.953846,0.3,0.512348,0.512348,0.028037
1,0.538543,0.857785,0.377272,0.72471,0.368339,0.956224,0.489489,0.553364,0.224346,0.423819,...,0.622241,0.247588,0.5886,0.868421,0.214286,0.4,0.354545,0.469379,0.469379,0.074766
2,0.485744,0.278838,0.111461,0.583358,0.371109,0.30067,0.467568,0.235409,0.408027,0.153024,...,0.377621,0.167203,0.858472,0.776316,0.142857,0.984615,0.190909,0.002597,0.002597,0.037383
3,0.485744,0.460855,0.566271,0.416837,0.401573,0.381679,0.656757,0.300529,0.411212,0.19802,...,0.388797,0.445338,0.895696,0.616228,0.369048,0.6,0.468939,0.205668,0.205668,0.037383
4,0.335093,0.266557,0.026411,0.571554,0.232635,0.356753,0.374775,0.288647,0.244625,0.284685,...,0.477373,0.145766,0.938736,0.912281,0.285714,0.735897,0.234091,0.214886,0.214886,0.056075


In [75]:
# Split the normalized training frame back into features and label.
# The label comes from the scaled frame, so errors computed against it are
# in scaled units.
y2_train = norm_df_train["Appliances"]
X2_train = norm_df_train.drop(columns="Appliances")
X2_train.head()

Unnamed: 0,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,0.49736,0.283004,0.12285,0.585114,0.373878,0.303474,0.476577,0.239968,0.408027,0.155065,...,0.475893,0.37638,0.16881,0.863125,0.776316,0.142857,0.984615,0.193182,0.724509,0.724509
1,0.286167,0.576864,0.188999,0.693396,0.217957,0.735317,0.27027,0.681016,0.178691,0.330033,...,0.240375,0.703504,0.262594,0.836758,0.807018,0.142857,0.6,0.343939,0.864018,0.864018
2,0.422386,0.275548,0.057427,0.627256,0.373878,0.338059,0.414414,0.210702,0.378404,0.147129,...,0.468262,0.409803,0.110397,0.853819,0.859649,0.095238,0.917949,0.159091,0.49942,0.49942
3,0.560718,0.534101,0.280834,0.728807,0.51429,0.515189,0.540541,0.469242,0.509317,0.421545,...,0.561915,0.340784,0.444802,0.559907,0.75,0.119048,0.384615,0.560606,0.323062,0.323062
4,0.835269,0.504496,1.0,0.327578,0.745383,0.459106,0.900901,0.500126,0.748845,0.452926,...,0.854318,0.633278,0.849946,0.530438,0.355263,0.142857,0.6,0.790909,0.340952,0.340952


In [76]:
# Split the normalized test frame back into features and (scaled) label
y2_test = norm_df_test["Appliances"]
X2_test = norm_df_test.drop(columns="Appliances")
X2_test.head()

Unnamed: 0,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,0.432946,0.275548,0.120669,0.543362,0.25673,0.380122,0.41982,0.254822,0.31247,0.319301,...,0.457856,0.408251,0.217578,0.921287,0.846491,0.166667,0.953846,0.3,0.512348,0.512348
1,0.538543,0.857785,0.377272,0.72471,0.368339,0.956224,0.489489,0.553364,0.224346,0.423819,...,0.145682,0.622241,0.247588,0.5886,0.868421,0.214286,0.4,0.354545,0.469379,0.469379
2,0.485744,0.278838,0.111461,0.583358,0.371109,0.30067,0.467568,0.235409,0.408027,0.153024,...,0.468262,0.377621,0.167203,0.858472,0.776316,0.142857,0.984615,0.190909,0.002597,0.002597
3,0.485744,0.460855,0.566271,0.416837,0.401573,0.381679,0.656757,0.300529,0.411212,0.19802,...,0.561915,0.388797,0.445338,0.895696,0.616228,0.369048,0.6,0.468939,0.205668,0.205668
4,0.335093,0.266557,0.026411,0.571554,0.232635,0.356753,0.374775,0.288647,0.244625,0.284685,...,0.42768,0.477373,0.145766,0.938736,0.912281,0.285714,0.735897,0.234091,0.214886,0.214886


In [77]:
# Fit a linear regression on all normalized training features
linear_model = LinearRegression()
linear_model.fit(X=X2_train, y=y2_train)

In [78]:
# Predictions of the full linear model on the data it was trained on
pred2_train = linear_model.predict(X=X2_train)
pred2_train

array([0.03736239, 0.08440778, 0.02815806, ..., 0.04623579, 0.08713294,
       0.07302946])

In [79]:
# Mean Absolute Error (MAE) on the training set, measured against the
# scaled label
from sklearn.metrics import mean_absolute_error

mae1 = mean_absolute_error(y_true=y2_train, y_pred=pred2_train)
print("Mean Absolute Error (Training set): {}".format(round(mae1, 3)))

Mean Absolute Error (Training set): 0.05


In [80]:
# Root Mean Squared Error (RMSE) on the training set: square root of the
# mean squared error between the scaled label and the predictions
from sklearn.metrics import mean_squared_error

mse1 = mean_squared_error(y_true=y2_train, y_pred=pred2_train)
rmse1 = np.sqrt(mse1)
print('Root Mean Squared Error (Training set): {}'.format(round(rmse1, 3)))

Root Mean Squared Error (Training set): 0.089


In [81]:
# Predictions of the full linear model on the normalized test features
pred2_test = linear_model.predict(X=X2_test)
pred2_test

array([0.03322207, 0.24411599, 0.03400024, ..., 0.06844707, 0.10032325,
       0.05722198])

In [82]:
# Mean Absolute Error (MAE) on the test set, measured against the scaled label
mae2 = mean_absolute_error(y_true=y2_test, y_pred=pred2_test)
print("Mean Absolute Error (Test set): {}".format(round(mae2, 3)))

Mean Absolute Error (Test set): 0.05


In [83]:
# Root Mean Squared Error (RMSE) on the test set: square root of the mean
# squared error between the scaled label and the predictions
from sklearn.metrics import mean_squared_error

mse2 = mean_squared_error(y_true=y2_test, y_pred=pred2_test)
rmse2 = np.sqrt(mse2)
print('Root Mean Squared Error (Test set): {}'.format(round(rmse2, 3)))

Root Mean Squared Error (Test set): 0.088


### Ridge and Lasso Regression

In [84]:
# Ridge regression with the estimator's default hyperparameters, trained on
# the normalized training data
from sklearn.linear_model import Ridge

ridge = Ridge()
ridge.fit(X=X2_train, y=y2_train)

# Predictions of the ridge model on the normalized test features
ridge_pred = ridge.predict(X=X2_test)

In [85]:
# Root Mean Squared Error (RMSE) of the ridge model on the test set
mse_ridge = mean_squared_error(y_true=y2_test, y_pred=ridge_pred)
rmse_ridge = np.sqrt(mse_ridge)
print('Root Mean Squared Error for Ridge model: {}'.format(round(rmse_ridge, 3)))

Root Mean Squared Error for Ridge model: 0.088


In [86]:
# Lasso regression with alpha = 0.001, trained on the normalized training
# data and used to predict on the normalized test features
from sklearn.linear_model import Lasso

lasso = Lasso(alpha=0.001)
lasso.fit(X=X2_train, y=y2_train)
lasso_pred = lasso.predict(X=X2_test)

# Display the fitted coefficients, one per feature
lasso.coef_

array([ 0.        ,  0.04053617,  0.        , -0.        ,  0.        ,
        0.        , -0.        ,  0.        , -0.        ,  0.        ,
        0.        , -0.        , -0.        , -0.        ,  0.        ,
       -0.01471318, -0.        , -0.        ,  0.        , -0.        ,
       -0.0477116 ,  0.00243033,  0.        , -0.        , -0.        ,
       -0.        ])

In [87]:
# Evaluating the Lasso model: its score on the training and test sets
# (for scikit-learn regressors `score` is the coefficient of determination,
# R^2) and the number of features whose coefficient is non-zero
train_score = lasso.score(X2_train, y2_train)
test_score = lasso.score(X2_test, y2_test)
n_nonzero_coefs = np.count_nonzero(lasso.coef_)

print('Training score for the Lasso regression model: {:.3f}'.format(train_score))
print('Test score for the Lasso regression model: {:.3f}'.format(test_score))
# The feature count is an integer, so print it as one rather than as a
# three-decimal float (previously rendered as e.g. "4.000")
print('Number of features for training: {}'.format(n_nonzero_coefs))

Training score for the Lasso regression model: 0.034
Test score for the Lasso regression model: 0.035
Number of features for training: 4.000


In [88]:
# Root Mean Squared Error (RMSE) of the Lasso model on the test set,
# reported to five decimal places
from sklearn.metrics import mean_squared_error

mse_lasso = mean_squared_error(y_true=y2_test, y_pred=lasso_pred)
rmse_lasso = np.sqrt(mse_lasso)
print('Root Mean Squared Error for Lasso model: {}'.format(round(rmse_lasso, 5)))

Root Mean Squared Error for Lasso model: 0.09317
