In [1]:
#import the libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

In [2]:
# Read the energy dataset from CSV into a DataFrame.
# NOTE: the path is relative, so the file has to sit in the notebook's working
# directory (the recorded run of this cell raised FileNotFoundError).

dataset = pd.read_csv(filepath_or_buffer='energydata_complete.csv')
dataset

FileNotFoundError: [Errno 2] No such file or directory: 'energydata_complete.csv'

In [None]:
# Count the missing values in every column so gaps do not cause errors in later computations
dataset.isna().sum()

In [None]:
# Print a summary of the frame: column names, non-null counts and dtypes
dataset.info()

In [None]:
# Preview the first rows (head() with no argument returns 5 rows, not 10)
dataset.head()

In [None]:
# Map the terse sensor codes to descriptive column names (one mapping per line
# so individual entries are easy to audit), then apply the mapping.

rename = {
    'T1': 'kitchen temperature',
    'RH_1': 'kitchen humidity',
    'T2': 'living room temperature',
    'RH_2': 'living room humidity',
    'T3': 'laundary temperature',
    'RH_3': 'laundary humidity',
    'T4': 'office temperature',
    'RH_4': 'office humidity',
    'T5': 'bathroom temperature',
    'RH_5': 'bathroom humidity',
    'T6': 'outside north temperature',
    'RH_6': 'outside north humidity',
    'T7': 'ironing room temperature',
    'RH_7': 'ironing room humidity',
    'T8': 'teenager room temperature',
    'RH_8': 'teenager room humidity',
    'T9': 'parents room temperature',
    'RH_9': 'parents room humidity',
    'T_out': 'outside station temperature',
    'Press_mm_hg': 'station pressure',
    'RH_out': 'outside station humidity',
    'rv1': 'random variable 1',
    'rv2': 'random variable 2',
}

dataset = dataset.rename(columns=rename)
dataset

In [None]:
# Draw a reproducible 15-row sample of the two temperature columns and
# plot the scatter together with its fitted regression line.
slr_data = dataset[
    ['living room temperature', 'outside north temperature']
].sample(n=15, random_state=2)

sns.regplot(
    data=slr_data,
    x='living room temperature',
    y='outside north temperature',
)


In [None]:
# Fit a single-feature linear regression on the full data:
# predictor = living room temperature, response = outside north temperature.

target_df = dataset[['living room temperature', 'outside north temperature']]
x = target_df.loc[:, ['living room temperature']]
y = target_df.loc[:, ['outside north temperature']]
linear_model = LinearRegression()
linear_model.fit(x, y)
predict_x = linear_model.predict(x)
predict_x

In [None]:
# R-squared of the single-feature model, scored on the same rows it was fitted on

r2 = r2_score(y_true=y, y_pred=predict_x)
round(r2, 2)

In [None]:
# Remove the 'date' and 'lights' columns, as instructed in the quiz
dataset = dataset.drop(['date', 'lights'], axis=1)
dataset

In [None]:
# Normalise every column with MinMaxScaler and rebuild a DataFrame with the
# original column labels. The scaler is fitted on the whole frame here,
# i.e. before the train/test split and with the target column included.
scaler = MinMaxScaler()
new_data = pd.DataFrame(
    data=scaler.fit_transform(X=dataset),
    columns=dataset.columns,
)
new_data

In [None]:
# Separate the predictors from the 'Appliances' target before splitting
train_data = new_data.drop(columns=['Appliances'])
y = new_data['Appliances']
y

In [None]:
# Hold out 30% of the rows as a test set; the fixed random_state keeps the split reproducible

x_train, x_test, y_train, y_test = train_test_split(
    train_data,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Fit a linear regression on the training split and predict the held-out test split
new_model = LinearRegression().fit(x_train, y_train)
predicted_values = new_model.predict(X=x_test)
predicted_values

In [None]:
# Mean Absolute Error of the linear model on the test split
mae = mean_absolute_error(y_true=y_test, y_pred=predicted_values)
round(mae, 2)

In [None]:
# Residual Sum of Squares: square each test-set residual, then add them up
rss = np.square(y_test - predicted_values).sum()
round(rss, 2)

In [None]:
# Root Mean Square Error of the linear model: square root of the test-set MSE
rmse = np.sqrt(
    mean_squared_error(y_true=y_test, y_pred=predicted_values)
)
round(rmse, 3)


In [None]:
# R-squared of the multi-feature linear model on the test split
# (this rebinds the `r2` name used by the earlier single-feature model)
r2 = r2_score(y_true=y_test, y_pred=predicted_values)
round(r2, 2)

In [None]:
#define a function to get weights of features
def get_weights_df(model, feat, col_name):
    """Return a fitted model's feature weights as a two-column DataFrame.

    Parameters
    ----------
    model : fitted estimator
        Must expose a one-dimensional ``coef_`` attribute with one entry per
        column of ``feat``.
    feat : pandas.DataFrame
        Frame whose column labels name the coefficients.
    col_name : str
        Name to give the weight column in the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``['Features', col_name]``, sorted by ascending weight, with
        the weights rounded to 3 decimal places.
    """
    weights = pd.Series(model.coef_, feat.columns).sort_values()
    weights_df = pd.DataFrame(weights).reset_index()
    weights_df.columns = ['Features', col_name]
    # Series.round returns a new Series rather than modifying in place, so the
    # result has to be assigned back for the rounding to take effect.
    weights_df[col_name] = weights_df[col_name].round(3)
    return weights_df

In [None]:
# Fit a ridge regression (alpha = 0.5) on the training split
ridge_reg = Ridge(alpha=0.5)
ridge_reg.fit(X=x_train, y=y_train)

In [None]:
# Fit a lasso regression (alpha = 0.001) on the training split
lasso_reg = Lasso(alpha=0.001)
lasso_reg.fit(X=x_train, y=y_train)



In [None]:
# Build one weight table per fitted model, then join them on the feature name
# so each row shows a feature's weight under all three models.
linear_model_weights = get_weights_df(new_model, x_train, 'Linear_Model_Weight')
ridge_weights_df = get_weights_df(ridge_reg, x_train, 'Ridge_Weight')
lasso_weights_df = get_weights_df(lasso_reg, x_train, 'Lasso_weight')
final_weights = (
    linear_model_weights
    .merge(ridge_weights_df, on='Features')
    .merge(lasso_weights_df, on='Features')
)

In [None]:
# Refit the ridge regression with alpha = 0.4 (replaces the earlier ridge_reg)
ridge_reg = Ridge(alpha=0.4)
ridge_reg.fit(X=x_train, y=y_train)

In [None]:
# RMSE of the ridge model on the test split.
# The predictions must come from ridge_reg itself: reusing `predicted_values`
# (the plain LinearRegression output) would only reproduce the earlier RMSE
# and could never reveal a change.
ridge_predictions = ridge_reg.predict(x_test)
rmse2 = np.sqrt(mean_squared_error(y_test, ridge_predictions))
round(rmse2, 3)

In [None]:
# For each model's weight column, name the feature that carries the largest weight.
# A bare final_weights.max() reduces every column independently, so the
# 'Features' value it reports is just the alphabetically last name and is
# unrelated to the weights shown beside it.
final_weights.set_index('Features').idxmax()


In [None]:
# For each model's weight column, name the feature that carries the smallest weight.
# A bare final_weights.min() reduces every column independently, so the
# 'Features' value it reports is just the alphabetically first name and is
# unrelated to the weights shown beside it.
final_weights.set_index('Features').idxmin()

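The feature reported as having the minimum weight is tdewpoint. Caveat: this was read off `final_weights.min()`, which takes the minimum of each column independently, so the name it shows is the alphabetically first entry of `Features` rather than the feature attached to the smallest weight. The result should be confirmed with `idxmin` on the weight column.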

In [None]:
# Refit the lasso regression using the alpha value given in the quiz (0.001)
lasso_reg = Lasso(alpha=0.001)
lasso_reg.fit(X=x_train, y=y_train)

In [None]:
# Display the merged weight table to inspect which feature weights are non-zero.
# final_weights is not rebuilt in this cell; it shows whatever the earlier
# merge cell computed.
final_weights

In [None]:
# RMSE of the lasso model on the test split.
# The predictions must come from lasso_reg itself: reusing `predicted_values`
# (the plain LinearRegression output) would only reproduce the earlier RMSE,
# making it look as though nothing changed.
lasso_predictions = lasso_reg.predict(x_test)
rmse3 = np.sqrt(mean_squared_error(y_test, lasso_predictions))
round(rmse3, 3)

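The new RMSE did not change. Caveat: this observation was based on an RMSE computed from `predicted_values` (the plain linear-regression predictions), so it necessarily matched the earlier value. It only reflects the Lasso model when the RMSE is computed from `lasso_reg.predict(x_test)`.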