# Electricity Price Prediction

In [27]:

import numpy as np 
import pandas as pd 
import warnings
warnings.filterwarnings("ignore") 


In [28]:
# Load the raw electricity dataset. low_memory=False asks pandas to parse the
# file in one pass rather than in chunks, so each column gets a single
# inferred dtype.
df_elec = pd.read_csv(
    r"Datasets/electricitydata.csv",
    low_memory=False,
)
# Preview the first five rows.
df_elec.head(5)

Unnamed: 0,DateTime,Holiday,HolidayFlag,DayOfWeek,WeekOfYear,Day,Month,Year,PeriodOfDay,ForecastWindProduction,SystemLoadEA,SMPEA,ORKTemperature,ORKWindspeed,CO2Intensity,ActualWindProduction,SystemLoadEP2,SMPEP2
0,01/11/2011 00:00,,0,1,44,1,11,2011,0,315.31,3388.77,49.26,6.0,9.3,600.71,356.0,3159.6,54.32
1,01/11/2011 00:30,,0,1,44,1,11,2011,1,321.8,3196.66,49.26,6.0,11.1,605.42,317.0,2973.01,54.23
2,01/11/2011 01:00,,0,1,44,1,11,2011,2,328.57,3060.71,49.1,5.0,11.1,589.97,311.0,2834.0,54.23
3,01/11/2011 01:30,,0,1,44,1,11,2011,3,335.6,2945.56,48.04,6.0,9.3,585.94,313.0,2725.99,53.47
4,01/11/2011 02:00,,0,1,44,1,11,2011,4,342.9,2849.34,33.75,6.0,11.1,571.52,346.0,2655.64,39.87


In [29]:
# Show column dtypes and non-null counts of the raw frame.
df_elec.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38014 entries, 0 to 38013
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   DateTime                38014 non-null  object
 1   Holiday                 38014 non-null  object
 2   HolidayFlag             38014 non-null  int64 
 3   DayOfWeek               38014 non-null  int64 
 4   WeekOfYear              38014 non-null  int64 
 5   Day                     38014 non-null  int64 
 6   Month                   38014 non-null  int64 
 7   Year                    38014 non-null  int64 
 8   PeriodOfDay             38014 non-null  int64 
 9   ForecastWindProduction  38014 non-null  object
 10  SystemLoadEA            38014 non-null  object
 11  SMPEA                   38014 non-null  object
 12  ORKTemperature          38014 non-null  object
 13  ORKWindspeed            38014 non-null  object
 14  CO2Intensity            38014 non-null  object
 15  Ac

In [30]:
# Keep only the measurement columns used for modelling; the calendar and
# holiday columns of df_elec are left out.
elec_data = df_elec[
    [
        'ForecastWindProduction',
        'SystemLoadEA',
        'SMPEA',
        'ORKTemperature',
        'ORKWindspeed',
        'CO2Intensity',
        'ActualWindProduction',
        'SystemLoadEP2',
        'SMPEP2',
    ]
]

In [31]:
# Per column: does it contain the '?' placeholder anywhere?
elec_data.eq('?').any()

ForecastWindProduction    True
SystemLoadEA              True
SMPEA                     True
ORKTemperature            True
ORKWindspeed              True
CO2Intensity              True
ActualWindProduction      True
SystemLoadEP2             True
SMPEP2                    True
dtype: bool

In [32]:
# Remove every row that has the '?' placeholder in any column.
# One boolean mask replaces the per-column in-place drops: the frame is
# filtered in a single pass and a fresh copy is bound to `elec_data`, instead
# of repeatedly mutating a frame that was sliced out of df_elec.
# The original index labels are kept here; they are reset in a later cell.
elec_data = elec_data[~elec_data.eq('?').any(axis=1)].copy()

In [33]:
# Convert every column to a numeric dtype, then renumber the rows 0..n-1
# (drop=True discards the old index instead of keeping it as a column).
elec_data = elec_data.apply(pd.to_numeric).reset_index(drop=True)

In [34]:
# Confirm the cleaned frame's dtypes and non-null counts.
elec_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37682 entries, 0 to 37681
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   ForecastWindProduction  37682 non-null  float64
 1   SystemLoadEA            37682 non-null  float64
 2   SMPEA                   37682 non-null  float64
 3   ORKTemperature          37682 non-null  float64
 4   ORKWindspeed            37682 non-null  float64
 5   CO2Intensity            37682 non-null  float64
 6   ActualWindProduction    37682 non-null  float64
 7   SystemLoadEP2           37682 non-null  float64
 8   SMPEP2                  37682 non-null  float64
dtypes: float64(9)
memory usage: 2.6 MB


In [35]:
# Cast the target column to 64-bit integers. The fractional part of each
# value is discarded (e.g. 54.32 becomes 54), so this step is lossy.
elec_data['SMPEP2'] = elec_data['SMPEP2'].astype(np.int64)

In [36]:
# Rank columns by the absolute value of their correlation with the target
# SMPEP2, strongest first.
(
    elec_data
    .corrwith(elec_data['SMPEP2'])
    .abs()
    .sort_values(ascending=False)
)

SMPEP2                    1.000000
SMPEA                     0.618230
SystemLoadEP2             0.517028
SystemLoadEA              0.491064
ActualWindProduction      0.083280
ForecastWindProduction    0.079428
CO2Intensity              0.035239
ORKWindspeed              0.035237
ORKTemperature            0.008940
dtype: float64

In [37]:
# Target: the SMPEP2 column. Features: every other column of elec_data.
y = elec_data['SMPEP2']
X = elec_data.drop(columns='SMPEP2')

Machine learning: split the data into training and test sets, then fit and evaluate the models

In [38]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [39]:
from numpy import mean
from numpy import std
from numpy import absolute
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import classification_report, accuracy_score,confusion_matrix

#### Decision Tree Classifier with repeated 10-fold cross-validation (3 repeats)

In [40]:
# Depth-2 decision tree classifier, scored with repeated k-fold CV.
# NOTE(review): y_train holds SMPEP2 cast to integers, so the classifier
# treats each distinct integer value as its own class and the MAE below is
# computed on those class labels. A regressor may be a better fit - confirm.
from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier(max_depth=2, random_state=0)

# 10 folds x 3 repeats, seeded so the fold assignment is reproducible.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# The scorer reports negated MAE; take the absolute value to get MAE back.
scores_E = np.abs(
    cross_val_score(
        dtc,
        x_train,
        y_train,
        scoring='neg_mean_absolute_error',
        cv=cv,
        n_jobs=-1,
    )
)
print('Mean MAE: %.3f (%.3f)' % (scores_E.mean(), scores_E.std()))


Mean MAE: 16.718 (0.532)


In [41]:
# Fit the decision tree on the training split (cross_val_score above does
# not leave `dtc` itself fitted).
dtc.fit(x_train, y_train)


In [42]:
# Score the fitted decision tree on the held-out test split.
dt_pred = dtc.predict(x_test)
dt_acc = accuracy_score(y_test, dt_pred)
clf_report = classification_report(y_test, dt_pred)

print(f"Accuracy Score of Decision Tree is : {dt_acc}")
print(f"Classification Report : \n{clf_report}")


Accuracy Score of Decision Tree is : 0.049621865463712354
Classification Report : 
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.00      0.00      0.00         3
           5       0.00      0.00      0.00         5
           8       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         1
          13       0.00      0.00      0.00         1
          18       0.00      0.00      0.00         5
          19       0.00      0.00      0.00         3
          20       0.00      0.00      0.00         5
          21       0.00      0.00      0.00         1
          22       0.00      0.00      0.00         2
          23       0.00      0.00      0.00        11
          24       0.00      0.00      0.00        11
          25       0.00      0.00      0.00        28
          26       0.00      0.00      0.00        19
          27       0.00      0.00      0.00        3

#### Support Vector Machine with repeated 10-fold cross-validation (3 repeats)

In [43]:
# Support vector classifier with default settings, scored with repeated
# k-fold CV.
from sklearn.svm import SVC

svm = SVC()

# 10 folds x 3 repeats, seeded so the fold assignment is reproducible.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# The scorer reports negated MAE; take the absolute value to get MAE back.
scores_e = np.abs(
    cross_val_score(
        svm,
        x_train,
        y_train,
        scoring='neg_mean_absolute_error',
        cv=cv,
        n_jobs=-1,
    )
)
print('Mean MAE: %.3f (%.3f)' % (scores_e.mean(), scores_e.std()))

Mean MAE: 19.552 (0.588)


In [44]:
# Fit the SVM on the training split and predict the held-out rows;
# fit() returns the estimator itself, so the two calls can be chained.
y_pred_svm = svm.fit(x_train, y_train).predict(x_test)

In [45]:
# Score the SVM predictions against the held-out labels.
acc_svm1 = accuracy_score(y_test, y_pred_svm)
clf_report = classification_report(y_test, y_pred_svm)

print(f"Accuracy Score of SVM is : {acc_svm1}")
print(f"Classification Report : \n{clf_report}")

Accuracy Score of SVM is : 0.04524346556985538
Classification Report : 
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.00      0.00      0.00         3
           5       0.00      0.00      0.00         5
           8       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         1
          13       0.00      0.00      0.00         1
          18       0.00      0.00      0.00         5
          19       0.00      0.00      0.00         3
          20       0.00      0.00      0.00         5
          21       0.00      0.00      0.00         1
          22       0.00      0.00      0.00         2
          23       0.00      0.00      0.00        11
          24       0.00      0.00      0.00        11
          25       0.00      0.00      0.00        28
          26       0.00      0.00      0.00        19
          27       0.00      0.00      0.00        33
         

Let's see how well the decision tree model performs on a few test samples

In [46]:
# Put decision-tree predictions next to the true labels for ten held-out
# rows (positions 50-59 of the test split).
sme_data = x_test.iloc[50:60]
sme_data_label = y_test.iloc[50:60]
sme_predict = dtc.predict(sme_data)
pd.DataFrame(
    {
        'Predict': sme_predict,
        'Label': sme_data_label,
    }
)

Unnamed: 0,Predict,Label
4093,59,188
22310,39,33
8034,59,62
35027,59,49
23685,59,69
268,49,56
35261,39,46
11905,59,78
30903,59,82
608,59,415


In [47]:
# Predict the price class for one hand-entered observation.
# `dtc` was fitted on a DataFrame (x_train), so the sample is passed as a
# one-row DataFrame with the same column names in the same order. A bare
# ndarray would silently rely on positional order and makes scikit-learn
# emit a feature-name warning (currently hidden by the global warnings filter).
features = pd.DataFrame(
    {
        "ForecastWindProduction": [54.10],
        "SystemLoadEA": [4241.05],
        "SMPEA": [49.56],
        "ORKTemperature": [8.0],
        "ORKWindspeed": [13.8],
        "CO2Intensity": [491.32],
        "ActualWindProduction": [54.0],
        "SystemLoadEP2": [4426.84],
    }
)
dtc.predict(features)

array([49])