In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [4]:
%matplotlib inline

In [5]:
df = pd.read_csv("energydata_complete.csv")

In [7]:
df.sample(20)

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
9518,2016-03-17 19:20:00,110,10,22.823333,38.423333,20.39,38.09,21.79,34.86,22.2,...,20.0,35.26,6.366667,764.233333,68.333333,2.0,35.666667,0.866667,6.002439,6.002439
7034,2016-02-29 13:20:00,220,0,20.356667,33.156667,19.29,32.7,21.856667,38.223333,19.5,...,18.2,39.4975,5.333333,762.5,52.0,6.666667,32.0,-3.833333,2.957497,2.957497
9324,2016-03-16 11:00:00,230,0,20.89,39.9,21.5,38.133333,22.26,38.526667,21.5,...,19.7,38.656667,6.5,764.8,56.0,10.0,40.0,-1.7,26.152242,26.152242
14189,2016-04-19 05:50:00,70,0,20.89,38.59,18.1,42.4,22.2,36.4,21.5,...,20.29,40.56,5.216667,764.283333,94.0,2.0,24.0,4.316667,48.97746,48.97746
2605,2016-01-29 19:10:00,70,10,19.633333,45.0,19.29,44.126667,19.5,43.933333,16.7,...,16.2,45.933333,9.0,762.283333,82.0,10.0,40.0,6.1,34.159885,34.159885
17639,2016-05-13 04:50:00,50,0,24.89,47.59,22.79,49.4,26.6,43.826667,25.0,...,24.29,48.9,12.3,745.866667,96.166667,1.0,36.666667,11.716667,22.465785,22.465785
10731,2016-03-26 05:30:00,50,0,21.79,37.29,18.1,42.79,23.29,37.7,20.166667,...,19.79,42.29,3.35,757.4,87.0,3.5,23.0,1.4,2.281636,2.281636
15564,2016-04-28 19:00:00,200,0,21.2,33.5,19.79,31.926667,23.7,38.06,20.5,...,18.79,29.166667,9.8,756.2,41.0,5.0,40.0,-3.0,48.469124,48.469124
11455,2016-03-31 06:10:00,50,0,21.7,40.2,18.823333,44.326667,22.6,38.29,20.6,...,20.39,43.09,5.983333,753.75,97.5,1.833333,63.333333,5.616667,20.330638,20.330638
4256,2016-02-10 06:20:00,70,0,21.1,41.7,20.5,40.0,22.39,43.09,18.7,...,19.1,46.29,2.5,739.433333,94.0,7.0,40.0,1.6,3.435501,3.435501


In [9]:
df.isnull().sum()

date           0
Appliances     0
lights         0
T1             0
RH_1           0
T2             0
RH_2           0
T3             0
RH_3           0
T4             0
RH_4           0
T5             0
RH_5           0
T6             0
RH_6           0
T7             0
RH_7           0
T8             0
RH_8           0
T9             0
RH_9           0
T_out          0
Press_mm_hg    0
RH_out         0
Windspeed      0
Visibility     0
Tdewpoint      0
rv1            0
rv2            0
dtype: int64

In [10]:
df.describe(include="all").transpose()

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
date,19735,19735.0,2016-04-23 07:40:00,1.0,,,,,,,
Appliances,19735,,,,97.695,102.525,10.0,50.0,60.0,100.0,1080.0
lights,19735,,,,3.80187,7.93599,0.0,0.0,0.0,0.0,70.0
T1,19735,,,,21.6866,1.60607,16.79,20.76,21.6,22.6,26.26
RH_1,19735,,,,40.2597,3.9793,27.0233,37.3333,39.6567,43.0667,63.36
T2,19735,,,,20.3412,2.19297,16.1,18.79,20.0,21.5,29.8567
RH_2,19735,,,,40.4204,4.06981,20.4633,37.9,40.5,43.26,56.0267
T3,19735,,,,22.2676,2.00611,17.2,20.79,22.1,23.29,29.236
RH_3,19735,,,,39.2425,3.25458,28.7667,36.9,38.53,41.76,50.1633
T4,19735,,,,20.8553,2.04288,15.1,19.53,20.6667,22.1,26.2


##### To drop data and lights features as contained in the instruction

In [13]:
df.drop(["date", "lights"], axis=1, inplace=True) ## Drop the date column

##### Normalizing using MinMaxScaler

In [14]:
normalized_df = pd.DataFrame(MinMaxScaler().fit_transform(df), columns = df.columns)

In [70]:
normalized_df.corr()

Unnamed: 0,Appliances,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
Appliances,1.0,0.055447,0.086031,0.120073,-0.060465,0.08506,0.036292,0.040281,0.016965,0.01976,...,0.01001,-0.051462,0.099155,-0.034885,-0.152282,0.087122,0.00023,0.015353,-0.011145,-0.011145
T1,0.055447,1.0,0.164006,0.836834,-0.002509,0.892402,-0.02855,0.877001,0.097861,0.885247,...,0.844777,0.071756,0.682846,-0.150574,-0.345481,-0.087654,-0.07621,0.571309,-0.006203,-0.006203
RH_1,0.086031,0.164006,1.0,0.269839,0.797535,0.25323,0.844677,0.10618,0.880359,0.205797,...,0.115263,0.764001,0.340767,-0.293957,0.274126,0.204932,-0.021057,0.639106,-0.000699,-0.000699
T2,0.120073,0.836834,0.269839,1.0,-0.16561,0.735245,0.121497,0.762066,0.231563,0.72055,...,0.675535,0.157346,0.792255,-0.133028,-0.505291,0.052495,-0.069721,0.582602,-0.011087,-0.011087
RH_2,-0.060465,-0.002509,0.797535,-0.16561,1.0,0.137319,0.678326,-0.047304,0.721435,0.110409,...,0.054544,0.676467,0.033674,-0.255646,0.584911,0.06919,-0.005368,0.499152,0.006275,0.006275
T3,0.08506,0.892402,0.25323,0.735245,0.137319,1.0,-0.011234,0.852778,0.122737,0.888169,...,0.901324,0.134602,0.699417,-0.189974,-0.281718,-0.100776,-0.10231,0.645886,-0.005194,-0.005194
RH_3,0.036292,-0.02855,0.844677,0.121497,0.678326,-0.011234,1.0,-0.140457,0.898978,-0.050062,...,-0.19527,0.833538,0.118207,-0.233274,0.356192,0.263188,0.017041,0.414387,-0.000477,-0.000477
T4,0.040281,0.877001,0.10618,0.762066,-0.047304,0.852778,-0.140457,1.0,-0.04865,0.871813,...,0.889439,-0.025549,0.663478,-0.075292,-0.388602,-0.185747,-0.104768,0.519471,-0.001815,-0.001815
RH_4,0.016965,0.097861,0.880359,0.231563,0.721435,0.122737,0.898978,-0.04865,1.0,0.091812,...,-0.044518,0.856591,0.293289,-0.250748,0.336813,0.300192,0.002636,0.616509,-0.001787,-0.001787
T5,0.01976,0.885247,0.205797,0.72055,0.110409,0.888169,-0.050062,0.871813,0.091812,1.0,...,0.911055,0.072308,0.651321,-0.170999,-0.273953,-0.145011,-0.084164,0.588362,-0.00549,-0.00549


## To Answer the Quiz Questions

#### From the dataset, fit a linear model on the relationship between the temperature in the living room in Celsius (x = T2) and the temperature outside the building (y = T6). What is the R^2 value in two D.P?

In [25]:
Xi = np.array([normalized_df["T2"]]).reshape(-1, 1)
yi = np.array([normalized_df["T6"]]).reshape(-1, 1)


Xi_train, Xi_test, yi_train, yi_test = train_test_split(Xi, yi, test_size=0.3, random_state=42)

In [28]:
lmm = LinearRegression()
lmm.fit(Xi_train, yi_train)


round(metrics.r2_score(yi_test, lmm.predict(Xi_test)), 2)

0.64

#### What is the Mean Absolute Error (in two decimal places)?



In [None]:
y = normalized_df["Appliances"]
X = normalized_df.drop("Appliances", axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [69]:
## LinearRigression Model

reg = LinearRegression()
reg.fit(X_train, y_train)

print("MAE", metrics.mean_absolute_error(y_test, reg.predict(X_test)))

MAE 0.05013362397742954


#### What is the Residual Sum of Squares (in two decimal places)?



In [34]:
print("RSS", round(np.sum(np.square(y_test, predict))), 2)

RSS 93.0 2


##### What is the Root Mean Squared Error (in three decimal places)?

In [35]:
print("RMSE", round(np.sqrt(metrics.mean_squared_error(y_test, predict))), 3)

RMSE 0.0 3


##### What is the Coefficient of Determination (in two decimal places)? 


In [36]:
print("R_2", metrics.r2_score(y_test, predict))

R_2 0.20545493424595573


##### Obtain the feature weights from your linear model above. Which features have the lowest and highest weights respectively?

In [57]:
linearReg_coef = pd.DataFrame(reg.coef_, X_train.columns, columns =["Coef"] ).reset_index()
linearReg_coef.columns = ["features","coef"]

print(linearReg_coef.min())
print(linearReg_coef.max())

features    Press_mm_hg
coef          -0.456698
dtype: object
features         rv2
coef        0.553547
dtype: object


##### Train a ridge regression model with an alpha value of 0.4. Is there any change to the root mean squared error (RMSE) when evaluated on the test set?

In [66]:
def RMSE_ridge_calculate(alpha):
    ### A finction to check if the RMSE with alpha change is the same
    
    RMSE1 = list()
    for alpha in alpha:
        ridge = Ridge(alpha)
        ridge.fit(X_train, y_train)
       
        prediction = ridge.predict(X_test)
        
        RMSE = round(np.sqrt(metrics.mean_squared_error(y_test, prediction)))
        if RMSE not in RMSE1:
            RMSE1.append(RMSE)
            if len(RMSE1)!=1:
                print("YES: Thers's changes")
        else:
            print("NO: No change")
RMSE_ridge_calculate([0, 0.4])

NO: No change


##### Train a lasso regression model with an alpha value of 0.001 and obtain the new feature weights with it. How many of the features have non-zero feature weights?







In [108]:
lasso = Lasso(alpha=0.001)
lasso.fit(X_train, y_train)


lasso_coef = pd.DataFrame(lasso.coef_, X_train.columns, columns =["coef"] ).reset_index()
lasso_coef.columns = ["features","coef"]
lasso_coef

#lasso_coef.query('coef=="0"').count()

Unnamed: 0,features,coef
0,T1,0.0
1,RH_1,0.01788
2,T2,0.0
3,RH_2,-0.0
4,T3,0.0
5,RH_3,0.0
6,T4,-0.0
7,RH_4,0.0
8,T5,-0.0
9,RH_5,0.0


#### What is the new RMSE with the Lasso Regression (in 3 decimal places)?  



In [82]:
print("Lasso RMSE", round(np.sqrt(metrics.mean_squared_error(y_test, lasso.predict(X_test)))))

Lasso RMSE 0.0
