In [1]:
import numpy as np
import pandas as pd

# import necessary algorithms
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# to import all the metrics
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

In [2]:
energy_df = pd.read_csv("energydata_complete.csv")

In [3]:
energy_df.head()

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


#### Question 17
From the dataset, fit a linear model on the relationship between the temperature in the living room in Celsius (x = T2) and the temperature outside the building (y = T6). What is the Root Mean Squared error in three D.P?

In [4]:
lin_model = LinearRegression()

In [5]:
# to reshape the series 
x = np.array(energy_df["T2"]).reshape(-1, 1)

# to divide the data into train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, energy_df["T6"], test_size=0.3, random_state=42)
lin_model.fit(x_train, y_train)
predicted_values = lin_model.predict(x_test)


In [6]:
# to get the root mean squared error
rmse = np.sqrt(mean_squared_error(y_test, predicted_values))
round(rmse, 3)

3.63

#### Question 18
Remove the following columns: [“date”, “lights”]. The target variable is “Appliances”. Use a 70-30 train-test set split with a  random state of 42 (for reproducibility). Normalize the dataset using the MinMaxScaler (Hint: Use the MinMaxScaler fit_transform and transform methods on the train and test set respectively). Run a multiple linear regression using the training set. Answer the following questions:

<br>
<br>


What is the Mean Absolute Error (in three decimal places) for the  training set?

In [7]:
energy_df = energy_df.drop(['date','lights'], axis = 1)

In [8]:
target_df = energy_df["Appliances"]

In [9]:
from sklearn.preprocessing import MinMaxScaler

In [10]:
#  performing min_max_scaler normalization
scaler = MinMaxScaler()
norm_data = pd.DataFrame(scaler.fit_transform(energy_df), columns=energy_df.columns)


feature_data = norm_data.drop(["Appliances"], axis = 1)
# target_data = norm_data["Appliances"]

In [11]:
# performing train test split on 
X_train, X_test, y_train, y_test = train_test_split(feature_data, target_df, test_size=0.3, random_state=42)

> ***pls note that the dataset was split on the target_df (that which has not be normalized) for the traget variable because normalizing the target variable to answer the questions were giving answers totally different from what were in the options.***

In [12]:
# initializing the linear regression algorithm to train the data

lin_model = LinearRegression()
lin_model.fit(X_train, y_train)
predicted_values_test = lin_model.predict(X_test)
predicted_values_train = lin_model.predict(X_train)

In [13]:
mae_train = mean_absolute_error(y_train, predicted_values_train)
round(mae_train, 3)

53.742

#### Question 19

What is the Root Mean Squared Error (in three decimal places) for the training set?

In [14]:
rmse_train = np.sqrt(mean_squared_error(y_train, predicted_values_train))
round(rmse_train, 3)

95.216

#### Question 20

What is the Mean Absolute Error (in three decimal places) for test set?

In [15]:
mae_test = mean_absolute_error(y_test, predicted_values_test)
round(mae_test, 3)

53.643

#### Question 21

What is the Root Mean Squared Error (in three decimal places) for test set?

In [16]:
rmse_test = np.sqrt(mean_squared_error(y_test, predicted_values_test))
round(rmse_test, 3)

93.64

#### Question 22

Did the Model above overfit to the training set

> since the smaller the rmse, the better the model, that is, it is not overfitting, it can be noted that rmse for the ***test set (93.64)*** is lower than that of the ***training set (95.216)***, therefore, ***the model did not overfit on the train set*** 

#### Question 23

Train a ridge regression model with default parameters. Is there any change to the root mean squared error (RMSE) when evaluated on the test set?

In [17]:
from sklearn.linear_model import Ridge
ridge = Ridge()
ridge.fit(X_train, y_train)
pred = ridge.predict(X_test)


In [18]:
# Root Mean Square Error on the ridge regression
rmse = np.sqrt(mean_squared_error(y_test, pred))
round(rmse, 3)

93.716

> The ridge had a rmse of 93.716, while the linear regression had a rmse of 93.64, therefore, it can be concluded that ***there were changes in the rmse***

#### Question 24

Train a lasso regression model with default value and obtain the new feature weights with it. How many of the features have non-zero feature weights?

In [19]:
from sklearn.linear_model import Lasso
lasso = Lasso()
lasso.fit(X_train, y_train)
pred_lasso = lasso.predict(X_test)

In [20]:
weights = pd.Series(lasso.coef_, feature_data.columns).sort_values()
weights = pd.DataFrame(weights).reset_index()
weights.columns = ['Features', "Lasso_weight"]
weights  #to print the weights

Unnamed: 0,Features,Lasso_weight
0,RH_out,-52.139924
1,RH_8,-11.499414
2,T1,0.0
3,Tdewpoint,0.0
4,Visibility,0.0
5,Press_mm_hg,-0.0
6,T_out,0.0
7,RH_9,-0.0
8,T9,-0.0
9,T8,0.0


In [21]:
# to get the features with non-zero feature weight

(weights["Lasso_weight"] != 0).sum()

4

#### Question 25

What is the new RMSE with the Lasso Regression on the test set?

In [22]:
# Root Mean Square Error on the lasso regression
rmse = np.sqrt(mean_squared_error(y_test, pred_lasso))
round(rmse, 3)

99.813