# Assignment 9
## Data driven predictions

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os 

Importing the external files:


In [None]:
# Folder holding the three input CSV files. The EETBS_DATA_DIR environment
# variable overrides it, so the notebook can run on another machine without
# editing this cell; when the variable is unset the original local path is used.
ExternalFilesFolder = os.environ.get(
    "EETBS_DATA_DIR",
    r"C:\Users\Gilberto\Desktop\poli\Energy building systems\Data-driven_Building_simulation_Polimi_EETBS\Data",
)
ConsumptionFileName = "consumption_5545.csv"
TemperatureFileName = "Austin_weather_2014.csv"
IrradianceFileName = "irradiance_2014_gen.csv"

# Full paths of the three input files.
path_consumptionFile = os.path.join(ExternalFilesFolder, ConsumptionFileName)
path_TemperatureFile = os.path.join(ExternalFilesFolder, TemperatureFileName)
path_IrradianceFile = os.path.join(ExternalFilesFolder, IrradianceFileName)

Reading the files and converting their indexes to datetime, so that they can be used as time series:

In [None]:
# Consumption data: comma-separated, first column used as the index,
# which is then converted to datetime.
DF_consumption = pd.read_csv(path_consumptionFile, sep=",", index_col=0)
PreviousIndex_c = DF_consumption.index
NewIndex_c = pd.to_datetime(PreviousIndex_c)
DF_consumption.index = NewIndex_c

# Weather data: semicolon-separated, first column used as the index.
DF_weather = pd.read_csv(path_TemperatureFile, sep=";", index_col=0)
PreviousIndex_w = DF_weather.index
NewIndex_w = pd.to_datetime(PreviousIndex_w)
DF_weather.index = NewIndex_w

# Irradiance data: semicolon-separated, second column (index_col=1) used as the index.
DF_irradianceSource = pd.read_csv(path_IrradianceFile, sep=";", index_col=1)
PreviousIndex_i = DF_irradianceSource.index
NewIndex_i = pd.to_datetime(PreviousIndex_i)
DF_irradianceSource.index = NewIndex_i

Setting up the DataFrames for the data we need: temperature, AC consumption and irradiance (which is correlated with the PV generation):

In [None]:
# Keep only the columns used by the model.
DF_Temperature = DF_weather[["temperature"]]

# Negative "gen" readings are set to zero. clip() returns a new frame, so
# nothing is written into a selection taken from DF_irradianceSource
# (which is what triggers pandas' SettingWithCopyWarning).
DF_irradiance = DF_irradianceSource[["gen"]].clip(lower=0)

# Align the three sources on their datetime index and keep only the
# timestamps for which every value is available.
DF_joined = DF_consumption.join([DF_Temperature, DF_irradiance])
DF_joined.dropna(inplace=True)

Creating a copy of the data that will be modified for the predictions, and shifting the temperature data because its measurements are recorded in a different time zone:

In [None]:
# Work on a copy so DF_joined stays untouched, and give the columns short names.
df_mod = DF_joined.copy()
df_mod.columns = ["AC_consumption", "temperature", "irradiance"]

# Move every temperature reading 5 rows earlier, then discard the rows
# left without a value at the end of the frame.
shifted_temperature = df_mod["temperature"].shift(-5)
df_mod["temperature"] = shifted_temperature
df_mod = df_mod.dropna()

# Plot a min-max normalized version of a few days to see the qualitative behaviour.
df_mod_choosendates = df_mod.loc["2014-08-01":"2014-08-04"]
column_minimum = df_mod_choosendates.min()
column_span = df_mod_choosendates.max() - column_minimum
df_mod_normalized_choosendates = (df_mod_choosendates - column_minimum) / column_span
df_mod_normalized_choosendates.plot()
plt.show()

Creating lagged features and adding them to the DataFrame, so that past values are available as inputs for the predictions:

In [None]:

# Example lag settings (first lag, last lag, step) and an alias of the working frame.
lag_start, lag_end, lag_interval = 1, 6, 1
df = df_mod

def lagfeature(df, column_name, lag_start, lag_end, lag_interval):
    """Add lagged copies of one column to ``df`` and drop incomplete rows.

    For every lag ``i`` in ``range(lag_start, lag_end + 1, lag_interval)`` a
    column named ``"<column_name>-<i>h"`` is added, holding ``column_name``
    shifted down by ``i`` rows. The shift is positional, so the ``h`` suffix
    is only accurate when consecutive rows are one hour apart.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place.
    column_name : str
        Name of the column to lag.
    lag_start, lag_end : int
        First and last lag (both inclusive), in rows.
    lag_interval : int
        Step between consecutive lags.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the lag columns added and every row that
        contains a missing value removed.
    """
    lags = range(lag_start, lag_end + 1, lag_interval)
    for lag in lags:
        new_column_name = column_name + "-" + str(lag) + "h"
        df[new_column_name] = df[column_name].shift(lag)

    # Drop once, after all lag columns exist. Dropping inside the loop would
    # remove the SUM of all lags (e.g. 300 rows for lags 1..24) instead of
    # only the rows that the largest lag leaves without a value.
    if len(lags) > 0:
        df.dropna(inplace=True)
    return df

# (column, first lag, last lag) for every variable that gets lagged copies;
# all of them use a step of 1 between consecutive lags.
lag_specifications = [
    ("irradiance", 3, 6),
    ("temperature", 1, 6),
    ("AC_consumption", 1, 24),
]
for lagged_column, first_lag, last_lag in lag_specifications:
    df_mod = lagfeature(df_mod, lagged_column, first_lag, last_lag, 1)
df_mod.head(24)

Let's add time-related parameters extracted from the indexes converted before:

In [None]:
# Calendar features derived from the datetime index.
df_mod["hour"] = df_mod.index.hour
df_mod["day_of_week"] = df_mod.index.dayofweek
# DatetimeIndex.week was deprecated in pandas 1.1 and removed in 2.0;
# isocalendar().week is its replacement. It is converted to a plain int64
# array so the column has an ordinary integer dtype and is assigned by position.
df_mod["week_of_year"] = df_mod.index.isocalendar().week.astype("int64").to_numpy()
df_mod["month"] = df_mod.index.month

I need to correlate time with consumption, but the hour restarts from 0 every day while the consumption values are continuous.
To avoid this problem I convert "hour" into an angle in radians, so that it can be represented with the sin and cos functions:

In [None]:
# Map the hour of the day onto an angle (24 h = one full turn) and store its
# sine and cosine, so that hour 23 and hour 0 end up close to each other.
hour_angle = df_mod.index.hour*2*np.pi/24
df_mod["sin_hour"] = np.sin(hour_angle)
df_mod["cos_hour"] = np.cos(hour_angle)
df_mod.head()

Adding two more features to improve the predictions:
weekendDetector flags whether the day falls on a weekend, and dayDetector flags the working hours of a day:

In [None]:
def weekendDetector(day):
    """Return 1 when ``day`` is 5 or 6, otherwise 0.

    The notebook feeds it the ``day_of_week`` column, where pandas numbers
    Monday as 0, so 5 and 6 are Saturday and Sunday.
    """
    return 1 if day in (5, 6) else 0
    
# Element-wise weekend flag for every row.
df_mod["weekend"] = df_mod["day_of_week"].map(weekendDetector)


def dayDetector(hour):
    """Return 1 when ``hour`` lies strictly between 9 and 19 (10..18), otherwise 0."""
    return 1 if 9 < hour < 19 else 0
    
# Element-wise working-hours flag for every row.
df_mod["working_time"] = df_mod["hour"].map(dayDetector)

Computing the correlation between every pair of variables:

In [None]:
# Display the pairwise correlation matrix of the columns of df_mod
# (pandas' default method, Pearson).
df_mod.corr()

In [None]:
# Restrict the data to March-September 2014, then separate the quantity to
# predict (AC consumption) from the input features (every other column).
df_mod = df_mod.loc["2014-03-01":"2014-09-30"]
DF_target = df_mod["AC_consumption"]
DF_features = df_mod.drop(columns=["AC_consumption"])

# Part 2:
## Prediction models

From now on, let's use sklearn, which uses statistical methods to progressively improve algorithm's performances in identifying data patterns

In [None]:
from sklearn.model_selection import train_test_split

# Random 80/20 split of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    DF_features,
    DF_target,
    test_size=0.2,
    random_state=41234,
)

Fitting the model on the training data:

In [None]:
from sklearn import linear_model

# Ordinary least-squares model trained on the training part of the split.
linear_reg = linear_model.LinearRegression()
linear_reg.fit(X_train, Y_train)

# Predictions for the held-out rows, stored next to the measured values
# (both indexed by the timestamps of the test rows).
predicted_linearReg_split = linear_reg.predict(X_test)
predicted_DF_linearReg_split = pd.DataFrame(
    {"AC_cons_predicted_linearReg_split": predicted_linearReg_split},
    index=Y_test.index,
).join(Y_test)


Let's qualitatively see if it works

In [None]:
# The test rows come out of train_test_split in random order, so the frame is
# sorted by time first: label-slicing a non-monotonic DatetimeIndex can raise
# a KeyError in recent pandas and, even when it works, plots a zig-zag line.
predicted_DF_linearReg_split_august = (
    predicted_DF_linearReg_split.sort_index().loc["2014-08-01":"2014-08-31"]
)
predicted_DF_linearReg_split_august.plot()
# plt.show() accepts only the keyword argument `block`; the stray positional 1 is removed.
plt.show()

Now let's see how accurate the predictions are:

In [None]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

# scikit-learn metrics take (y_true, y_pred) in that order. MAE and MSE are
# symmetric, but r2_score is not: it normalizes by the variance of its first
# argument, so the measured values must come first.
MAE_linearReg_split = mean_absolute_error(Y_test, predicted_linearReg_split)
MSE_linearReg_split = mean_squared_error(Y_test, predicted_linearReg_split)
R2_linearReg_split = r2_score(Y_test, predicted_linearReg_split)

print ("R2 of the train test split is: " +str(R2_linearReg_split))


Using k-fold cross-validation instead of a single train/test split:

In [None]:
from sklearn.model_selection import cross_val_predict

# 10-fold cross-validation: each row is predicted by a model fitted on the
# folds that do not contain it.
predict_linearReg_CV = cross_val_predict(linear_reg, DF_features, DF_target, cv=10)

# Predictions next to the measured consumption, on the same time index.
predicted_DF_linearReg_CV = pd.DataFrame(
    {"AC_cons_predicted_linearReg_CV": predict_linearReg_CV},
    index=DF_target.index,
).join(DF_target)

Qualitative behaviour:

In [None]:
# Compare predicted and measured consumption over August.
predicted_DF_linearReg_CV_august = predicted_DF_linearReg_CV.loc["2014-08-01":"2014-08-31"]
predicted_DF_linearReg_CV_august.plot()
# plt.show() accepts only the keyword argument `block`; the stray positional 1 is removed.
plt.show()

Prediction accuracy:

In [None]:
# scikit-learn metrics take (y_true, y_pred) in that order; r2_score is not
# symmetric, so the measured values must be the first argument.
MAE_linearReg_CV = mean_absolute_error(DF_target, predict_linearReg_CV)
MSE_linearReg_CV = mean_squared_error(DF_target, predict_linearReg_CV)
R2_linearReg_CV = r2_score(DF_target, predict_linearReg_CV)

print ("R2 of the cross validation model is: " +str(R2_linearReg_CV))

Let's now use Random Forest algorithm:

In [None]:
from sklearn.ensemble import RandomForestRegressor

# A random forest is stochastic; fixing random_state makes the predictions
# and the scores below reproducible from one run to the next. The seed is the
# same one used for train_test_split.
reg_RF = RandomForestRegressor(random_state=41234)  # just to have a shorter name

# 10-fold cross-validated predictions for every row.
predict_RF_CV = cross_val_predict(reg_RF,DF_features,DF_target,cv=10)

# Predictions next to the measured consumption, on the same time index.
predicted_DF_RF_CV=pd.DataFrame(predict_RF_CV, index=DF_target.index, columns=["AC_cons_predicted_RF_CV"])
predicted_DF_RF_CV=predicted_DF_RF_CV.join(DF_target)

RF qualitative behaviour:

In [None]:
# Compare the random-forest predictions with the measured consumption over August.
predicted_DF_RF_CV_august = predicted_DF_RF_CV.loc["2014-08-01":"2014-08-31"]
predicted_DF_RF_CV_august.plot()
plt.show()

Accuracy of the RF algorithm:

In [None]:
# scikit-learn metrics take (y_true, y_pred) in that order; r2_score is not
# symmetric, so the measured values must be the first argument.
MAE_RF_CV = mean_absolute_error(DF_target, predict_RF_CV)
MSE_RF_CV = mean_squared_error(DF_target, predict_RF_CV)
R2_RF_CV = r2_score(DF_target, predict_RF_CV)

print ("R2 of the RF model is: " +str(R2_RF_CV))

Now, let's use Online learning to progressively improve algorithm's performances: 

In [None]:
# Length of the sliding training window.
period_of_training = pd.Timedelta(days=30)

# First and last timestamps available in the data.
FirstTimeStamp_measured, LastTimeStamp_measured = df_mod.index[0], df_mod.index[-1]

# The first prediction is made once a full training window has elapsed.
FirstTimeStamp_toPredict = FirstTimeStamp_measured + period_of_training

# Initial training window and the first timestamp to predict.
training_startTimeStamp = FirstTimeStamp_measured
training_endTimeStamp = timeStamp_toPredict = FirstTimeStamp_toPredict

# Empty result frame, indexed by the timestamps from the end of the first
# training window onwards.
DF_onlineConsumptionPrediction = pd.DataFrame(index=df_mod.index).truncate(
    before=training_endTimeStamp
)

Setting online training:

In [None]:
# Walk-forward ("online") evaluation: for every hour, refit the model on the
# sliding window that ends just before that hour, predict the hour, then move
# the window one hour forward.
one_hour = pd.Timedelta(1, unit="h")

while timeStamp_toPredict < LastTimeStamp_measured:
    # Rows inside the training window. The end bound is exclusive so that the
    # row being predicted is never part of the data the model is fitted on.
    in_training_window = (
        (DF_features.index >= training_startTimeStamp)
        & (DF_features.index < training_endTimeStamp)
    )

    # Hours missing from the data (rows removed by dropna) are skipped rather
    # than raising a KeyError; so are hours whose training window is empty.
    if timeStamp_toPredict in DF_features.index and in_training_window.any():
        DF_feature_train = DF_features.loc[in_training_window]
        DF_target_train = DF_target.loc[in_training_window]

        # Double brackets keep a one-row DataFrame (2-D, with column names),
        # which is the shape predict() expects.
        DF_feature_test = DF_features.loc[[timeStamp_toPredict]]
        DF_target_test = DF_target.loc[timeStamp_toPredict]

        # Fit and predict with the SAME estimator. Fitting reg_RF while
        # predicting with linear_reg would leave the prediction unaffected by
        # the online training.
        linear_reg.fit(DF_feature_train, DF_target_train)
        predicted_Consumption = linear_reg.predict(DF_feature_test)[0]

        DF_onlineConsumptionPrediction.loc[timeStamp_toPredict, "Predicted"] = predicted_Consumption
        DF_onlineConsumptionPrediction.loc[timeStamp_toPredict, "Real"] = DF_target_test

    # Slide the prediction time and both ends of the training window by one hour.
    timeStamp_toPredict = timeStamp_toPredict + one_hour
    training_endTimeStamp = training_endTimeStamp + one_hour
    training_startTimeStamp = training_startTimeStamp + one_hour

# Remove the timestamps for which no prediction was made.
DF_onlineConsumptionPrediction.dropna(inplace=True)

# r2_score takes (y_true, y_pred): measured values first.
R2_score_online_linearReg = r2_score(DF_onlineConsumptionPrediction[["Real"]],
                                     DF_onlineConsumptionPrediction[["Predicted"]])

In [None]:
# Display the timestamp stored at position 1265 of df_mod's index.
# NOTE(review): looks like a leftover inspection cell; the meaning of 1265 is
# not documented anywhere in the notebook - confirm or remove.
df_mod.index[1265]

In [None]:
# Display the last timestamp of df_mod's index.
df_mod.index[-1]