<a href="https://colab.research.google.com/github/bonchevap/aigraz_project/blob/master/Notebooks/Model3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [87]:
! pip install rfpimp



In [0]:
import pandas as pd
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.metrics import *
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn import tree
from sklearn import model_selection
import plotly.express as px
import rfpimp 

**Read load data and temperature data.**

In [89]:
df_temperature=pd.read_csv("MK_Temperature2016-2019C.csv")
df_electricity=pd.read_csv("MK_LoadData2016-2019.csv")
df_electricity.LoadMW.isna().sum()

986

**Using IterativeImputer instead of SimpleImputer to fill the missing values, since it seems wrong to do mean or most frequent values in the case of hourly electricity values.**

In [90]:
#Dealing with missing values
df_missing_loc=df_electricity.loc[df_electricity.LoadMW.isna()].index

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

imp = IterativeImputer(max_iter=10, verbose=0)
imp.fit(df_electricity)
df_electricity_imp = imp.transform(df_electricity)
df_electricity_imp = pd.DataFrame(df_electricity_imp, columns=df_electricity.columns)
df_electricity_imp.LoadMW.isna().sum()

0

In [91]:
#Assign float values 
df_temperature['Year']=df_temperature['Year'].astype(float)
df_temperature['Month']=df_temperature['Month'].astype(float)
df_temperature['Day']=df_temperature['Day'].astype(float)
df_temperature['Hour']=df_temperature['Hour'].astype(float)
df_temperature.head()

Unnamed: 0,Temperature,Year,Month,Day,Hour
0,-8.888889,2016.0,1.0,1.0,0.0
1,-10.0,2016.0,1.0,1.0,1.0
2,-10.0,2016.0,1.0,1.0,2.0
3,-11.111111,2016.0,1.0,1.0,3.0
4,-12.222222,2016.0,1.0,1.0,4.0


**Merging the both dataframes.**

In [0]:
dfinal = df_electricity_imp.merge(df_temperature, how='left', left_on=['year', 'day', 'month', 'hour'], right_on=['Year', 'Day', 'Month', 'Hour'])

**Dealing with missing temperature values.**

In [93]:
#Missing temperature values
nan_rows = dfinal[dfinal['Temperature'].isnull()]
impute_missing=SimpleImputer(missing_values=np.NaN, strategy='median')
dfinal[["Temperature"]]=impute_missing.fit_transform(dfinal[["Temperature"]])
dfinal.head()

Unnamed: 0,LoadMW,dayweek,month,year,day,hour,weekend,Temperature,Year,Month,Day,Hour
0,1187.0,3.0,12.0,2015.0,31.0,23.0,0.0,12.777778,,,,
1,1142.0,4.0,1.0,2016.0,1.0,0.0,0.0,-8.888889,2016.0,1.0,1.0,0.0
2,1059.0,4.0,1.0,2016.0,1.0,1.0,0.0,-10.0,2016.0,1.0,1.0,1.0
3,976.0,4.0,1.0,2016.0,1.0,2.0,0.0,-10.0,2016.0,1.0,1.0,2.0
4,929.0,4.0,1.0,2016.0,1.0,3.0,0.0,-11.111111,2016.0,1.0,1.0,3.0


In [94]:
df_electricity_imp=dfinal
df_electricity_imp.head()

Unnamed: 0,LoadMW,dayweek,month,year,day,hour,weekend,Temperature,Year,Month,Day,Hour
0,1187.0,3.0,12.0,2015.0,31.0,23.0,0.0,12.777778,,,,
1,1142.0,4.0,1.0,2016.0,1.0,0.0,0.0,-8.888889,2016.0,1.0,1.0,0.0
2,1059.0,4.0,1.0,2016.0,1.0,1.0,0.0,-10.0,2016.0,1.0,1.0,1.0
3,976.0,4.0,1.0,2016.0,1.0,2.0,0.0,-10.0,2016.0,1.0,1.0,2.0
4,929.0,4.0,1.0,2016.0,1.0,3.0,0.0,-11.111111,2016.0,1.0,1.0,3.0


In [95]:
df_electricity_imp=df_electricity_imp.drop(["Year", "Month", "Day", "Hour"], axis=1)
df_electricity_imp.head()

Unnamed: 0,LoadMW,dayweek,month,year,day,hour,weekend,Temperature
0,1187.0,3.0,12.0,2015.0,31.0,23.0,0.0,12.777778
1,1142.0,4.0,1.0,2016.0,1.0,0.0,0.0,-8.888889
2,1059.0,4.0,1.0,2016.0,1.0,1.0,0.0,-10.0
3,976.0,4.0,1.0,2016.0,1.0,2.0,0.0,-10.0
4,929.0,4.0,1.0,2016.0,1.0,3.0,0.0,-11.111111


In [0]:
df_electricity_imp=df_electricity_imp.reset_index()

In [0]:
#categorical_columns=["dayweek", "month", "year", "day", "hour", "weekend"]
#categorical_columns=["year", "weekend"]

In [0]:
#for col in categorical_columns:
 #   df_electricity_imp[col] = df_electricity_imp[col].astype('category')

**The modeling part.**

In [97]:
df_electricity_imp = pd.get_dummies(df_electricity_imp, drop_first=True)
df_electricity_imp.head()

Unnamed: 0,index,LoadMW,dayweek,month,year,day,hour,weekend,Temperature
0,0,1187.0,3.0,12.0,2015.0,31.0,23.0,0.0,12.777778
1,1,1142.0,4.0,1.0,2016.0,1.0,0.0,0.0,-8.888889
2,2,1059.0,4.0,1.0,2016.0,1.0,1.0,0.0,-10.0
3,3,976.0,4.0,1.0,2016.0,1.0,2.0,0.0,-10.0
4,4,929.0,4.0,1.0,2016.0,1.0,3.0,0.0,-11.111111


In [0]:
X = df_electricity_imp.drop('LoadMW', axis=1)

In [0]:
y = df_electricity_imp['LoadMW']

In [100]:
X.head()

Unnamed: 0,index,dayweek,month,year,day,hour,weekend,Temperature
0,0,3.0,12.0,2015.0,31.0,23.0,0.0,12.777778
1,1,4.0,1.0,2016.0,1.0,0.0,0.0,-8.888889
2,2,4.0,1.0,2016.0,1.0,1.0,0.0,-10.0
3,3,4.0,1.0,2016.0,1.0,2.0,0.0,-10.0
4,4,4.0,1.0,2016.0,1.0,3.0,0.0,-11.111111


In [101]:
y.head()

0    1187.0
1    1142.0
2    1059.0
3     976.0
4     929.0
Name: LoadMW, dtype: float64

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [103]:
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor(verbose=True)
rfr.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   10.2s finished


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=True, warm_start=False)

In [104]:

y_predrfr = rfr.predict(X_test)
r2_score(y_test, y_predrfr)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.3s finished


0.9421431009935133

In [105]:
from sklearn.model_selection import cross_validate
#rfr = RandomForestRegressor(verbose=True)
scores = cross_validate(rfr, X, y.values.ravel(), cv=3,scoring=('r2'),return_train_score=True)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    9.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    8.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_j

In [106]:
scores

{'fit_time': array([9.10849118, 8.68201542, 8.50724053]),
 'score_time': array([0.11947107, 0.14139843, 0.09344721]),
 'test_score': array([-0.07425893,  0.42342844, -0.03673996]),
 'train_score': array([0.98144646, 0.98471652, 0.99648211])}

In [107]:
rfr_importance = pd.DataFrame()
rfr_importance['column'] = X.columns
rfr_importance['importance'] = rfr.feature_importances_
rfr_importance.sort_values('importance', ascending=False).head(50)

Unnamed: 0,column,importance
5,hour,0.271546
7,Temperature,0.260247
0,index,0.257784
4,day,0.120294
2,month,0.054103
1,dayweek,0.021891
3,year,0.012396
6,weekend,0.001738


In [108]:
px.bar(y=rfr.feature_importances_, x=X.columns, title='Feature importance')