In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the energy dataset from the Colab-mounted Google Drive folder
energy_csv_path = "/content/drive/MyDrive/Colab Notebooks/Hamoye-Internship/Stage-B/energydata_complete.csv"
data = pd.read_csv(energy_csv_path)

In [None]:
# Preview the first five rows to confirm the file loaded with the expected columns
data.head()

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


**Fetch the Columns**

In [None]:
# Collect the column labels into a plain Python list and print them
cols = data.columns.tolist()
print(cols)

['date', 'Appliances', 'lights', 'T1', 'RH_1', 'T2', 'RH_2', 'T3', 'RH_3', 'T4', 'RH_4', 'T5', 'RH_5', 'T6', 'RH_6', 'T7', 'RH_7', 'T8', 'RH_8', 'T9', 'RH_9', 'T_out', 'Press_mm_hg', 'RH_out', 'Windspeed', 'Visibility', 'Tdewpoint', 'rv1', 'rv2']


**Check for Missing Values**

In [None]:
# Count missing values per column (the author's run reported no missing data)
data.isna().sum()

**Check correlation of each variable**

In [None]:
# Correlation is only defined for numeric columns. 'date' is read in as text,
# and pandas >= 2.0 raises on non-numeric columns instead of silently dropping
# them, so restrict the frame to numeric dtypes before building the matrix.
# On older pandas this yields the same matrix that data.corr() produced.
plt.figure(figsize=(20,20))
sns.heatmap(data.select_dtypes(include='number').corr(),cbar=True,annot=True,cmap='Reds')

### **Question 12**

In [None]:
# Independent feature: 'T2' copied into an array and given a trailing axis so
# it has the 2-D (n_samples, 1) shape that the estimator's fit() expects.
X = np.array(data['T2'])[:, np.newaxis]
# Dependent output: 'T6'
y = data['T6']

In [None]:
# Import the LinearRegression estimator from scikit-learn
from sklearn.linear_model import LinearRegression

# Create an unfitted LinearRegression estimator with default settings
linear_model = LinearRegression()

In [None]:
# Fit the linear regression of y (T6) on X (T2); fit() returns the estimator,
# which the notebook displays as the cell output
linear_model.fit(X, y)

LinearRegression()

In [None]:
# Predict on the same X the model was fitted on (in-sample predictions)
pred = linear_model.predict(X)

In [None]:
# R-squared metric for evaluating the fit
from sklearn.metrics import r2_score

# Compare the true T6 values against the in-sample predictions
r_square = r2_score(y_true=y, y_pred=pred)

# Show the unrounded score
print(r_square)

0.6418990830855493


### **Data Preparation and model training for Question 13 - 16**

In [None]:
# Build a new frame without the 'date' and 'lights' columns; `data` itself is left untouched
new_data = data.drop(columns=['date', 'lights'])

In [None]:
# Import MinMaxScaler for data normalization
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split, so test-set min/max values influence the training
# features. Kept as-is because the recorded answers depend on this procedure.
scaled_values = scaler.fit_transform(new_data)

# fit_transform returns a bare array; wrap it back into a DataFrame with the original column labels
normalized_data = pd.DataFrame(scaled_values, columns=new_data.columns)

In [None]:
# Features: every normalized column except the target 'Appliances'
x_data = normalized_data.drop(columns=['Appliances'])

# Target: the normalized 'Appliances' column
y_data = normalized_data['Appliances']

In [None]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split reproducible
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.3,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LinearRegression

# Rebind `linear_model` to a fresh, unfitted estimator; this replaces the
# single-feature model fitted for Question 12
linear_model = LinearRegression()

In [None]:
# Fit the model on the training split only
linear_model.fit(x_train, y_train)

LinearRegression()

In [None]:
# Predict on the held-out test split; these predictions feed the Question 13-16 metrics
pred_val = linear_model.predict(x_test)

### **Question 13**

In [None]:
# Calculate the Mean Absolute Error and print it to 2 decimal places

from sklearn.metrics import mean_absolute_error

# scikit-learn metrics take (y_true, y_pred). MAE is symmetric, so the value
# does not change, but this order matches the other metric calls in the notebook.
error = mean_absolute_error(y_test, pred_val)

print(round(error, 2))

0.05


### **Question 14**

In [None]:
# Calculate the Residual Sum of Squares and print it to 2 decimal places
residuals = y_test - pred_val
rss = np.square(residuals).sum()
print(round(rss, 2))

45.35


### **Question 15**

In [None]:
# Calculate the Root Mean Squared Error and print it to 3 decimal places
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_true=y_test, y_pred=pred_val)

# RMSE is the square root of the mean squared error
rmse = np.sqrt(mse)
print(round(rmse, 3))

0.088


### **Question 16**

In [None]:
# Calculate the coefficient of determination (R-squared) on the test split
# and print it to 2 decimal places
from sklearn.metrics import r2_score

r_square = r2_score(y_true=y_test, y_pred=pred_val)

print(round(r_square, 2))

0.15


### **Question 17**

In [None]:
# Create a function to Obtain feature weight

def get_weights_df(model, feat, col_name, decimals=None):
  """Return a fitted model's coefficients as a DataFrame sorted by weight.

  Parameters
  ----------
  model : object
      Fitted estimator exposing a 1-D ``coef_`` attribute.
  feat : pandas.DataFrame
      Frame whose columns label the coefficients, in the same order as
      ``model.coef_``.
  col_name : str
      Name given to the weight column of the result.
  decimals : int, optional
      When given, the weights are rounded to this many decimal places.
      The default (``None``) leaves them unrounded, which is what the
      function effectively returned before: its ``.round(3)`` call
      discarded the result.

  Returns
  -------
  pandas.DataFrame
      Two columns, ``'Features'`` and ``col_name``, ordered from the most
      negative weight to the most positive.
  """
  # Pair each coefficient with its feature name, then sort ascending by weight
  weights = pd.Series(model.coef_, feat.columns).sort_values()
  weights_df = pd.DataFrame(weights).reset_index()
  weights_df.columns = ['Features', col_name]
  if decimals is not None:
    # Series.round returns a new Series, so it must be assigned back to take effect
    weights_df[col_name] = weights_df[col_name].round(decimals)
  return weights_df

In [None]:
# Build the feature-weight table for the linear model; x_train supplies the feature names
linear_model_weights = get_weights_df(linear_model, x_train, 'Linear_Model_Weight')

In [None]:
# Display the weights sorted ascending: the lowest is the first row and the highest the last.
# Per the author's run these are RH_2 (lowest) and RH_1 (highest).
linear_model_weights

Unnamed: 0,Features,Linear_Model_Weight
0,RH_2,-0.456698
1,T_out,-0.32186
2,T2,-0.236178
3,T9,-0.189941
4,RH_8,-0.157595
5,RH_out,-0.077671
6,RH_7,-0.044614
7,RH_9,-0.0398
8,T5,-0.015657
9,T1,-0.003281


### **Question 18**

In [None]:
# Train a ridge regressor so its RMSE can be compared with the plain linear regression model

from sklearn.linear_model import Ridge

# Ridge estimator with regularization strength alpha = 0.4
ridge_reg = Ridge(alpha = 0.4)
# Fit on the training split; the returned estimator is shown as the cell output
ridge_reg.fit(x_train, y_train)

Ridge(alpha=0.4)

In [None]:
# Predict on the held-out test split with the ridge model
ridge_pred = ridge_reg.predict(x_test)

In [None]:
# RMSE of the ridge predictions on the test split, printed to 3 decimal places
ridge_mse = mean_squared_error(y_true=y_test, y_pred=ridge_pred)
ridge_rmse = np.sqrt(ridge_mse)
print(round(ridge_rmse, 3))

# At 3 decimal places this matches the RMSE of the LinearRegression model

0.088


### **Question 19**

Train a lasso regression model with an alpha value of 0.001 and obtain the new feature weights with it.

In [None]:
from sklearn.linear_model import Lasso
# Lasso estimator with alpha = 0.001, as Question 19 specifies
lasso_reg = Lasso(alpha=0.001)
# Fit on the training split; the returned estimator is shown as the cell output
lasso_reg.fit(x_train, y_train)

Lasso(alpha=0.001)

In [None]:
# Predict on the held-out test split with the lasso model
lasso_pred = lasso_reg.predict(x_test)

In [None]:
# Build the feature-weight table for the lasso model
lasso_weights_df = get_weights_df(lasso_reg, x_train, 'Lasso_weight')
lasso_weights_df

# Per the author's run, only 4 weights are non-zero (RH_out, RH_8, Windspeed, RH_1)

Unnamed: 0,Features,Lasso_weight
0,RH_out,-0.049557
1,RH_8,-0.00011
2,T1,0.0
3,Tdewpoint,0.0
4,Visibility,0.0
5,Press_mm_hg,-0.0
6,T_out,0.0
7,RH_9,-0.0
8,T9,-0.0
9,T8,0.0


### **Question 20**

What is the new RMSE with the lasso regression? (in three (3) decimal places)







In [None]:
# RMSE of the lasso predictions on the test split, printed to 3 decimal places
lasso_mse = mean_squared_error(y_true=y_test, y_pred=lasso_pred)
lasso_rmse = np.sqrt(lasso_mse)
print(round(lasso_rmse, 3))

0.094
