# SIMPLE LINEAR REGRESSION

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the dataset from the Excel workbook (path is relative to the working directory).
df = pd.read_excel('ENB2012_data.xlsx')

In [None]:
# Display the raw frame as loaded.
df

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Summary statistics for each column.
df.describe()

In [None]:
# Map the dataset's terse column codes (X1..X8, Y1, Y2) to descriptive names.
rename_columns = {
    'X1': 'Relative_Compactness',
    'X2': 'Surface_Area',
    'X3': 'Wall_Area',
    'X4': 'Roof_Area',
    'X5': 'Overall_Height',
    'X6': 'Orientation',
    'X7': 'Glazing_Area',
    'X8': 'Glazing_Area_Distribution',
    'Y1': 'Heating_Load',
    'Y2': 'Cooling_load',
}

In [None]:
# Apply the descriptive column names. Reassigning (rather than inplace=True)
# follows the pandas idiom of producing a new frame from each transform.
df = df.rename(columns=rename_columns)

In [None]:
# Display the frame again to confirm the renamed columns.
df

In [None]:
# Largest value in the Relative_Compactness column.
df['Relative_Compactness'].max()

In [None]:
# Largest value in the Cooling_load column.
df['Cooling_load'].max()

In [None]:
# Index label of the row holding the largest Cooling_load value.
df['Cooling_load'].idxmax()

In [None]:
# Draw a reproducible 30-row sample of the two columns used for the
# single-feature regression plot.
simple_linear_reg_df = df[['Relative_Compactness', 'Cooling_load']].sample(
    n=30,
    random_state=2,
)

In [None]:
# Show the sample ordered by Cooling_load (returns a sorted copy; the sample itself is unchanged).
simple_linear_reg_df.sort_values(by='Cooling_load')

In [None]:
import seaborn as sns
from matplotlib import pyplot as plt

In [None]:
# Pearson correlation between every pair of numeric columns in the dataset.
corrmat = df.corr(method='pearson', numeric_only=True)
top_corr_features = corrmat.index
# Boolean mask with the same shape as the correlation matrix: True on and above
# the diagonal, so only the (non-redundant) lower triangle is drawn.
mask = np.triu(np.ones_like(corrmat, dtype=bool))
plt.figure(figsize=(20,20))
# Plot the heat map, annotating each visible cell with its coefficient.
sns.heatmap(corrmat, annot=True, fmt='.2f', mask=mask, cmap='Spectral_r');

In [None]:
# regplot draws a scatter plot of two numeric columns together with a fitted
# linear-regression line; here it is applied to the 30-row sample.
sns.regplot(x='Relative_Compactness',y='Cooling_load',data=simple_linear_reg_df);

# Measuring Regression Performance

In [None]:
# Normalise the dataset: rescale every column (features and targets) with MinMaxScaler.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test rows influence the scaling — consider fitting on the training
# split only.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df)
normalized_df = pd.DataFrame(scaled_values, columns=df.columns)

# Everything except the two load columns is a predictor; Heating_Load is the target.
target_columns = ['Heating_Load', 'Cooling_load']
features_df = normalized_df.drop(columns=target_columns)
label = normalized_df['Heating_Load']

In [None]:
# Display the rescaled frame.
normalized_df

# Splitting into training and testing datasets

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    features_df,
    label,
    test_size=0.3,
    random_state=1,
)

In [None]:
# Ordinary least-squares model with default settings.
linear_model = LinearRegression()

In [None]:
# Fit the model on the training split.
linear_model.fit(x_train,y_train)

In [None]:
# Predict the (normalised) Heating_Load for the held-out rows.
predicted_values = linear_model.predict(x_test)

In [None]:
# Shape of the prediction array.
predicted_values.shape

In [None]:
# Actual target values for the held-out rows, for comparison with the predictions.
y_test

In [None]:
#mean absolute error
from sklearn.metrics import mean_absolute_error

In [None]:
# Mean absolute error between actual and predicted test targets
# (computed on the normalised Heating_Load values).
mae = mean_absolute_error(y_test,predicted_values)
mae

In [None]:
# MAE rounded to 3 decimal places for reporting.
round(mae,3)

In [None]:
# Root mean squared error: square root of the mean squared error on the test split.
from math import sqrt
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_test, predicted_values)
rmse = sqrt(mse)

In [None]:
# Display the unrounded RMSE.
rmse

In [None]:
# RMSE rounded to 3 decimal places for reporting.
round(rmse,3)

In [None]:
# R-squared (coefficient of determination) of the predictions on the test split.
from sklearn.metrics import r2_score
r2score = r2_score(y_test,predicted_values)

In [None]:
# Display the unrounded R-squared score.
r2score

In [None]:
# R-squared rounded to 3 decimal places for reporting.
round(r2score,3)

In [None]:
# Ridge (L2) regularization: linear model with regularization strength alpha=0.5,
# fitted on the same training split as the plain linear model.
from sklearn.linear_model import Ridge
ridge_reg = Ridge(alpha=0.5)
ridge_reg.fit(x_train,y_train)

In [None]:
# Lasso (L1) regularization: linear model with regularization strength alpha=0.001,
# fitted on the same training split as the plain linear model.
from sklearn.linear_model import Lasso
lasso_reg = Lasso(alpha=0.001)
lasso_reg.fit(x_train,y_train)



In [None]:
# Build a tidy table holding the fitted weight (coefficient) of every feature.
def get_weight_df(model,feature,col_name):
    '''
    Return the coefficients of a fitted linear model as a two-column DataFrame.

    Parameters
    ----------
    model : fitted estimator exposing a 1-D ``coef_`` attribute.
    feature : object whose ``columns`` label the entries of ``model.coef_``
        (e.g. the training feature DataFrame).
    col_name : label to give the weights column.

    Returns
    -------
    DataFrame with columns ``['Features', col_name]``, sorted by weight in
    ascending order.
    '''
    weights = pd.Series(model.coef_, feature.columns).sort_values()
    weights_df = pd.DataFrame(weights).reset_index()
    # Use the caller-supplied label for the weights column so tables produced
    # for different models can be told apart (and merged) by column name.
    weights_df.columns = ['Features', col_name]
    return weights_df

In [None]:
# Comparing the effect of L1 & L2 regularization: collect the per-feature
# coefficients of the plain, Ridge and Lasso models, all labelled by x_train's columns.
linear_model_weight = get_weight_df(linear_model,x_train,'Linear_Model_Weight')
ridge_weight = get_weight_df(ridge_reg,x_train,'Ridge_Weight')
lasso_weight = get_weight_df(lasso_reg,x_train,'Lasso_Weight')

In [None]:
# Display the three weight tables together (shown as a tuple of DataFrames).
linear_model_weight, ridge_weight, lasso_weight