In [26]:
import pandas as pd
import numpy as np

import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

In [28]:
# Load the prepared training data and use the `datetime` column as the index.
df = pd.read_csv("../data/electricistan/train_prepped.csv").set_index('datetime')
# The index is read as plain text; convert it so date comparisons work.
df.index = pd.to_datetime(df.index)
# Summary of columns, dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 375355 entries, 2012-01-01 00:15:00 to 2022-09-14 23:45:00
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype
---  ------      --------------   -----
 0   power       375355 non-null  int64
 1   hour        375355 non-null  int64
 2   dayofweek   375355 non-null  int64
 3   quarter     375355 non-null  int64
 4   month       375355 non-null  int64
 5   year        375355 non-null  int64
 6   dayofyear   375355 non-null  int64
 7   dayofmonth  375355 non-null  int64
 8   weekofyear  375355 non-null  int64
dtypes: int64(9)
memory usage: 28.6 MB


# Preprocessing

In [44]:
# Splitting Dataset - chronological hold-out: rows before the cut-off are used
# for training, rows from the cut-off onwards for testing.
# Should be improved e.g. through (time-series) cross validation.
SPLIT_DATE = pd.Timestamp('2020-01-01')  # ISO 8601; '01-01-2020' is format-ambiguous

# .copy() gives each split its own data, so later cells can modify `train` /
# `test` without SettingWithCopyWarning and without any doubt about whether
# the change writes through to `df`.
train = df.loc[df.index < SPLIT_DATE].copy()
test = df.loc[df.index >= SPLIT_DATE].copy()

# Sanity check: the two splits partition the full data set.
assert len(train) + len(test) == len(df)

In [45]:
# TODO: split into train and test with a more robust scheme (e.g. time-series cross-validation)

In [52]:
# Show only the first column of the training split (`power`), kept as a DataFrame.
train.iloc[:, [0]]

Unnamed: 0_level_0,power
datetime,Unnamed: 1_level_1
2012-01-01 00:15:00,3767
2012-01-01 00:30:00,3743
2012-01-01 00:45:00,3691
2012-01-01 01:00:00,3645
2012-01-01 01:15:00,3628
...,...
2019-12-31 22:45:00,4401
2019-12-31 23:00:00,4382
2019-12-31 23:15:00,4354
2019-12-31 23:30:00,4317


In [60]:
# Scaling Dataset

# Standardisation - power_std
# `power` is stored as int64. Cast it to float first: writing the scaled float
# values into an integer column relies on silent upcasting, which recent
# pandas versions deprecate. `astype` returns new frames, so this also avoids
# assigning into frames that were sliced from `df`.
# NOTE: this cell is not idempotent - re-running it without re-running the
# split cell re-fits the scaler on already-standardised values.
train = train.astype({'power': 'float64'})
test = test.astype({'power': 'float64'})

# Fit on the training split only and reuse those statistics for the test
# split, so no information from the test period enters the scaler.
sc = StandardScaler()
train['power'] = sc.fit_transform(train[['power']]).ravel()
test['power'] = sc.transform(test[['power']]).ravel()

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [59]:
# Display the training split (all columns) to inspect the result of the scaling step.
train

Unnamed: 0_level_0,power,hour,dayofweek,quarter,month,year,dayofyear,dayofmonth,weekofyear
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2012-01-01 00:15:00,-1.245882,0,6,1,1,2012,1,1,52
2012-01-01 00:30:00,-1.279745,0,6,1,1,2012,1,1,52
2012-01-01 00:45:00,-1.353115,0,6,1,1,2012,1,1,52
2012-01-01 01:00:00,-1.418019,1,6,1,1,2012,1,1,52
2012-01-01 01:15:00,-1.442005,1,6,1,1,2012,1,1,52
...,...,...,...,...,...,...,...,...,...
2019-12-31 22:45:00,-0.351338,22,1,4,12,2019,365,31,1
2019-12-31 23:00:00,-0.378146,23,1,4,12,2019,365,31,1
2019-12-31 23:15:00,-0.417652,23,1,4,12,2019,365,31,1
2019-12-31 23:30:00,-0.469858,23,1,4,12,2019,365,31,1


In [None]:
# Normalization - power_normalized

# Training

In [None]:
# Calendar columns used as model inputs.
FEATURES = ['dayofyear', 'hour', 'dayofweek', 'quarter', 'month', 'year']
# Column the model learns to predict.
TARGET = 'power'

# Split each partition into a feature matrix (X) and a target vector (y).
X_train, y_train = train[FEATURES], train[TARGET]
X_test, y_test = test[FEATURES], test[TARGET]

In [None]:
# Hold out the most recent part of the training period for early stopping.
# Previously the test set was the last eval_set entry, so the number of trees
# was tuned on the very data the model is scored on afterwards, which makes
# the reported test RMSE optimistic.
# Trade-off: the model is no longer fitted on the validation window.
VALID_START = '2019-01-01'
is_valid = X_train.index >= VALID_START
X_fit, y_fit = X_train[~is_valid], y_train[~is_valid]
X_valid, y_valid = X_train[is_valid], y_train[is_valid]

reg = xgb.XGBRegressor(base_score=0.5, booster='gbtree',
                       n_estimators=1500,
                       early_stopping_rounds=50,
                       # 'reg:linear' is the deprecated alias of this objective.
                       objective='reg:squarederror',
                       #max_depth=3,
                       learning_rate=0.01)
# XGBoost uses the LAST entry of eval_set for early stopping; the first entry
# is only reported so training and validation error can be compared.
reg.fit(X_fit, y_fit,
        eval_set=[(X_fit, y_fit), (X_valid, y_valid)],
        verbose=100)

In [None]:
# predict on test
# `assign` returns a new frame instead of adding a column in place, so this is
# safe even when `test` is a slice of another frame (no SettingWithCopyWarning)
# and re-running the cell simply replaces the column.
# The predictions are on the same (standardised) scale as the training target.
test = test.assign(prediction=reg.predict(X_test))

In [None]:
# Root-mean-squared error between actual and predicted `power` on the test split.
# `power` was standardised earlier in the notebook, so the score is expressed
# in standardised units rather than in the original power units.
mse = mean_squared_error(test['power'], test['prediction'])
score = np.sqrt(mse)
print(f'RMSE Score on Test set: {score:0.2f}')

# Predict

In [None]:
# Load the competition test set, indexed by its `datetime` column, and the
# sample submission file that is filled in at the end of the notebook.
testSet = pd.read_csv("../data/electricistan/test.csv").set_index('datetime')
submission = pd.read_csv("../data/electricistan/sample_submission_csv.csv")

# Convert the text index into timestamps so calendar attributes are available.
testSet.index = pd.to_datetime(testSet.index)

In [None]:
# Feature creation
testSet['hour'] = testSet.index.hour
testSet['dayofweek'] = testSet.index.dayofweek
testSet['quarter'] = testSet.index.quarter
testSet['month'] = testSet.index.month
testSet['year'] = testSet.index.year
testSet['dayofyear'] = testSet.index.dayofyear
testSet['dayofmonth'] = testSet.index.day
testSet['weekofyear'] = testSet.index.isocalendar().week

In [None]:
# Keep only the model's input columns, in the order given by FEATURES.
testSet = testSet.loc[:, FEATURES]

In [None]:
# Compare the shapes of the training features and the submission features,
# one per line.
print(X_train.shape, testSet.shape, sep='\n')

In [None]:
# Prediction with XGBoost
# The model was trained on the standardised `power` target (scaled with `sc`),
# so its raw output is in standardised units. Map it back to the original
# power scale before it is written to the submission file.
pred_std = reg.predict(testSet)
pred = sc.inverse_transform(pred_std.reshape(-1, 1)).ravel()

# Write submission

In [None]:
# Creation of the submission CSV: put the predictions into the `power` column,
# print the resulting table, then save it without the row index.
submission = submission.assign(power=pred)
print(submission)
submission.to_csv('../data/electricistan/submission.csv', index=False)