In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf

In [None]:
from sklearn.decomposition import PCA
from sklearn import metrics

In [None]:
from tensorflow import keras
from tensorflow.keras import layers


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import GRU
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D

In [None]:
from sklearn.utils import resample
from matplotlib import pyplot
from numpy import mean
from numpy import std
from numpy import array
from numpy import dstack

In [None]:
from sklearn.metrics import max_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import explained_variance_score
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVR
import os
from datetime import datetime

In [None]:
from google.colab import drive

# Mount Google Drive at /content/gdrive; the data and model paths used below live under this mount point.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
def norm(x):
    """Standardize ``x`` with the ``mean`` and ``std`` entries of the global ``train_stats``.

    ``train_stats`` must already exist when this is called; it is looked up
    at call time, not at definition time.
    """
    centered = x - train_stats['mean']
    return centered / train_stats['std']

In [None]:
def parse(x):
    """Turn a ``'YYYY MM DD HH'`` string (space separated) into a ``datetime``."""
    timestamp_format = '%Y %m %d %H'
    return datetime.strptime(x, timestamp_format)

In [None]:
def tie_dataset(feature, label, timestep = 24, want_period = 1):
    """Build sliding-window samples for sequence models.

    For every position ``i`` the sample is the ``timestep`` rows of
    ``feature`` that precede ``i`` and the target is the ``want_period``
    entries of ``label`` starting at ``i``.

    Args:
        feature: Sliceable sequence of per-step feature rows.
        label: Sliceable sequence of targets, aligned with ``feature``.
        timestep: Number of past rows in each input window.
        want_period: Number of future label entries in each target.

    Returns:
        Tuple ``(windows, targets)`` of NumPy arrays with one entry per sample.
        Both are empty when ``feature`` is shorter than
        ``timestep + want_period``.
    """
    # The last usable start is len(feature) - want_period (its target slice
    # ends exactly at the final label), so the range end must be one past it.
    # The previous bound stopped one step early and silently dropped that sample.
    positions = range(timestep, len(feature) - want_period + 1)

    feature_list = [feature[i - timestep:i] for i in positions]
    label_list = [label[i:i + want_period] for i in positions]
    return np.array(feature_list), np.array(label_list)

In [None]:
def divide_data(dataset, step = 1, length = 12000):
    """Cut ``dataset`` into overlapping windows of ``length`` consecutive rows.

    Window starts are ``0, step, 2*step, ...``; only full-length windows are
    returned.

    Args:
        dataset: Object supporting ``len()`` and positional ``.iloc`` slicing
            (e.g. a pandas DataFrame).
        step: Row offset between the starts of consecutive windows.
        length: Number of rows in every window.

    Returns:
        List of ``dataset.iloc[start:start + length]`` slices, in start order.
        Empty when ``dataset`` has fewer than ``length`` rows.
    """
    # A window may start as late as len(dataset) - length and still be full,
    # so that start has to be inside the range. The previous exclusive bound
    # dropped the window that ends on the last row (e.g. 16572 rows with
    # length 14000 and step 643 gave 4 windows instead of 5).
    last_start = len(dataset) - length
    return [dataset.iloc[start:start + length]
            for start in range(0, last_start + 1, step)]

        


In [None]:
def generate_model_lstm(trainX, trainy):
    """Build, compile and fit a single-layer LSTM regressor.

    The network is ``LSTM(128, tanh) -> Dense(1)``, trained with MSE loss and
    the Adam optimizer for at most 200 epochs. The last 20 % of the training
    data is used for validation (``validation_split=0.2``).

    Args:
        trainX: Training inputs handed to ``model.fit``.
            NOTE(review): presumably (samples, timesteps, features) as built
            by ``tie_dataset`` - confirm against the caller.
        trainy: Training targets handed to ``model.fit``.

    Returns:
        The fitted Keras ``Sequential`` model.
    """
    model = Sequential([
        LSTM(128, activation='tanh'),
        Dense(1),
    ])

    model.compile(loss='mse',
                  optimizer='adam',
                  metrics=['mae', 'mse'])

    # restore_best_weights=True: when early stopping triggers, roll the model
    # back to the epoch with the lowest val_loss. Without it the returned
    # model keeps the weights from `patience` epochs after the best one.
    early_stop = EarlyStopping(monitor='val_loss', patience=20,
                               restore_best_weights=True)
    # The best-so-far model is also written to Drive. The file name is fixed,
    # so every call overwrites the checkpoint of the previous call.
    filename = os.path.join('/content/gdrive/My Drive/Colab Notebooks', 'generation_data_lstm_checkpoint.h5')
    checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='auto')

    model.fit(trainX, trainy, epochs=200, validation_split=0.2, verbose=0,
              callbacks=[early_stop, checkpoint])

    return model




def generate_model_cnn(trainX, trainy):
    """Build, compile and fit a Conv1D + LSTM regressor.

    The network is ``Conv1D(4, kernel_size=1, relu) -> MaxPooling1D(2) ->
    LSTM(64, relu) -> Dense(1)``, trained with MSE loss and the Adam
    optimizer for at most 100 epochs, validating on the last 20 % of the
    training data.

    Args:
        trainX: Training inputs handed to ``model.fit``.
            NOTE(review): presumably (samples, timesteps, features) - confirm.
        trainy: Training targets handed to ``model.fit``.

    Returns:
        The fitted Keras ``Sequential`` model.
    """
    model = Sequential([
        Conv1D(filters=4, kernel_size=1, activation='relu'),
        MaxPooling1D(pool_size=2),  # halves the data size along the pooled axis
        LSTM(64, activation='relu'),
        Dense(1),
    ])

    model.compile(loss='mse',
                  optimizer='adam',
                  metrics=['mae', 'mse'])

    # restore_best_weights=True: when early stopping triggers, roll the model
    # back to the epoch with the lowest val_loss instead of returning the
    # weights from `patience` epochs later.
    early_stop = EarlyStopping(monitor='val_loss', patience=10,
                               restore_best_weights=True)
    # Best-so-far model is also saved to Drive (fixed name, overwritten per call).
    filename = os.path.join('/content/gdrive/My Drive/Colab Notebooks', 'generation_cnnlsmt_checkpoint.h5')
    checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='auto')

    model.fit(trainX, trainy, epochs=100, validation_split=0.2, verbose=0,
              callbacks=[early_stop, checkpoint])

    return model



 
def generate_model_gru(trainX, trainy):
    """Build, compile and fit a single-layer GRU regressor.

    The network is ``GRU(64, tanh) -> Dense(1)``, trained with MSE loss and
    the Adam optimizer for at most 100 epochs, validating on the last 20 % of
    the training data.

    Args:
        trainX: Training inputs handed to ``model.fit``.
            NOTE(review): presumably (samples, timesteps, features) - confirm.
        trainy: Training targets handed to ``model.fit``.

    Returns:
        The fitted Keras ``Sequential`` model.
    """
    model = Sequential([
        GRU(64, activation='tanh'),
        Dense(1),
    ])

    model.compile(loss='mse',
                  optimizer='adam',
                  metrics=['mae', 'mse'])

    # restore_best_weights=True: when early stopping triggers, roll the model
    # back to the epoch with the lowest val_loss instead of returning the
    # weights from `patience` epochs later.
    early_stop = EarlyStopping(monitor='val_loss', patience=10,
                               restore_best_weights=True)
    # Best-so-far model is also saved to Drive (fixed name, overwritten per call).
    filename = os.path.join('/content/gdrive/My Drive/Colab Notebooks', 'generation_gru_checkpoint.h5')
    checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='auto')

    model.fit(trainX, trainy, epochs=100, validation_split=0.2, verbose=0,
              callbacks=[early_stop, checkpoint])

    return model



def generate_model_mlp(trainX, trainy):
    """Build, compile and fit a two-hidden-layer dense regressor.

    The network is ``Dense(64, tanh) -> Dense(64, tanh) -> Dense(1)``,
    trained with MSE loss and the Adam optimizer for at most 100 epochs,
    validating on the last 20 % of the training data.

    Args:
        trainX: Training inputs handed to ``model.fit``.
            NOTE(review): there is no Flatten layer, so a 3-D windowed input
            would yield one output per timestep rather than one per sample -
            confirm the caller passes 2-D (samples, features) data.
        trainy: Training targets handed to ``model.fit``.

    Returns:
        The fitted Keras ``Sequential`` model.
    """
    model = Sequential([
        Dense(64, activation='tanh'),
        Dense(64, activation='tanh'),
        Dense(1),
    ])

    model.compile(loss='mse',
                  optimizer='adam',
                  metrics=['mae', 'mse'])

    # restore_best_weights=True: when early stopping triggers, roll the model
    # back to the epoch with the lowest val_loss instead of returning the
    # weights from `patience` epochs later.
    early_stop = EarlyStopping(monitor='val_loss', patience=10,
                               restore_best_weights=True)
    # Best-so-far model is also saved to Drive (fixed name, overwritten per call).
    filename = os.path.join('/content/gdrive/My Drive/Colab Notebooks', 'generation_mlp_checkpoint.h5')
    checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='auto')

    model.fit(trainX, trainy, epochs=100, validation_split=0.2, verbose=0,
              callbacks=[early_stop, checkpoint])

    return model


def generate_meta_model(trainX, trainy):
    """Build, compile and fit the LSTM meta-learner.

    The network is ``LSTM(64, relu) -> Dense(1)``, trained with MSE loss and
    the Adam optimizer for at most 300 epochs, validating on the last 20 % of
    the training data.

    Args:
        trainX: Training inputs handed to ``model.fit``.
            NOTE(review): presumably stacked base-model outputs shaped
            (samples, timesteps, features) - confirm against the caller.
        trainy: Training targets handed to ``model.fit``.

    Returns:
        The fitted Keras ``Sequential`` model.
    """
    model = Sequential([
        LSTM(64, activation='relu'),
        Dense(1),
    ])

    model.compile(loss='mse',
                  optimizer='adam',
                  metrics=['mae', 'mse'])

    # restore_best_weights=True: when early stopping triggers, roll the model
    # back to the epoch with the lowest val_loss instead of returning the
    # weights from `patience` epochs later.
    early_stop = EarlyStopping(monitor='val_loss', patience=20,
                               restore_best_weights=True)
    # Best-so-far model is also saved to Drive (fixed name, overwritten per call).
    filename = os.path.join('/content/gdrive/My Drive/Colab Notebooks', 'generation_lstm_checkpoint.h5')
    checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='auto')

    model.fit(trainX, trainy, epochs=300, validation_split=0.2, verbose=0,
              callbacks=[early_stop, checkpoint])

    return model



In [None]:
def load_all_models(n_models):
    """Load ``model_1.h5`` .. ``model_<n_models>.h5`` from the Colab Notebooks folder.

    Prints one ``>loaded <path>`` line per file and returns the models as a
    list in file-number order.
    """
    loaded = []
    for member_no in range(n_models):
        # Files are numbered from 1, the loop index from 0.
        filename = '/content/gdrive/My Drive/Colab Notebooks/model_' + str(member_no + 1) + '.h5'
        loaded.append(tf.keras.models.load_model(filename))
        print('>loaded %s' % filename)
    return loaded

In [None]:
# Both hourly CSV files sit in the same Drive folder; note that they are read
# with different encodings (utf8 vs. cp949).
colab_dir = '/content/gdrive/My Drive/Colab Notebooks'
df = pd.read_csv(os.path.join(colab_dir, 'train_hourly.csv'), encoding='utf8')
df_forecast = pd.read_csv(os.path.join(colab_dir, 'test_hourly.csv'), encoding='cp949')

In [None]:
# Stack the two frames row-wise, renumber the rows from 0, and discard the
# old index together with the calendar columns.
calendar_columns = ['year', 'month', 'day', 'hour']
all_train = (
    pd.concat([df, df_forecast], axis=0)
    .reset_index()
    .drop(['index'] + calendar_columns, axis=1)
)
all_train

Unnamed: 0,temperature,humidity,dew_point,sol_rad,cloud,pow_gen
0,4.3,77,0.6,100.000,0,140.631
1,4.3,80,1.1,72.222,0,85.793
2,3.8,80,0.6,55.556,0,32.590
3,3.4,80,0.2,13.889,0,0.154
4,0.0,70,-4.8,0.000,0,7.275
...,...,...,...,...,...,...
16568,15.0,24,-5.0,609.000,0,1079.446
16569,16.0,23,-6.0,563.000,0,1018.955
16570,16.0,24,-5.0,468.000,0,852.790
16571,14.0,29,-4.0,329.000,0,425.755


In [None]:
# Window size in rows; the stride between window starts is a quarter of the
# rows that are left over beyond one full window.
length = 14000
leftover_rows = len(all_train) - length
step = int(leftover_rows / 4)
data_list = divide_data(all_train, step=step, length=length)
data_list

[       temperature  humidity  dew_point  sol_rad  cloud   pow_gen
 0              4.3        77        0.6  100.000      0   140.631
 1              4.3        80        1.1   72.222      0    85.793
 2              3.8        80        0.6   55.556      0    32.590
 3              3.4        80        0.2   13.889      0     0.154
 4              0.0        70       -4.8    0.000      0     7.275
 ...            ...       ...        ...      ...    ...       ...
 13995         32.9        41       17.8  675.000      8  1020.833
 13996         33.0        41       17.9  686.111      6   873.120
 13997         31.5        47       18.8  530.556      3   693.494
 13998         29.7        52       18.7  233.333      6   420.027
 13999         21.2        96       20.5   19.444     10   297.109
 
 [14000 rows x 6 columns],
        temperature  humidity  dew_point  sol_rad  cloud   pow_gen
 643           -5.9        46      -15.6   19.444      0   139.903
 644           -4.5        43    

In [None]:
# Summary statistics with one row per feature (after the transpose); the
# pow_gen target column is left out. norm() reads the 'mean' and 'std' columns.
# NOTE(review): the statistics are computed over all rows of all_train,
# including rows that later end up in the test partitions.
train_stats = all_train.describe().drop(columns='pow_gen').transpose()

In [None]:
# Split every window into a leading 70 % training part and the following
# 30 % test part, separate the pow_gen target, and standardize the features.
train_final_set = []
test_final_set = []
target_set = []
test_target_set = []

for window in data_list:
    train_size = int(len(window) * 0.7)
    test_size = int(len(window) * 0.3)

    train_dataset = window.iloc[:train_size]
    # Original note (translated): hour, power generation, temperature,
    # humidity, dew-point temperature, solar radiation, total cloud cover.
    test_dataset = window.iloc[train_size:train_size + test_size]

    # pop() removes the target column from the slice and hands it back.
    train_labels = train_dataset.pop('pow_gen')
    test_labels = test_dataset.pop('pow_gen')

    train_final_set.append(norm(train_dataset).values)
    test_final_set.append(norm(test_dataset).values)
    target_set.append(train_labels.values)
    test_target_set.append(test_labels.values)

In [None]:
# Sanity-check the windowing on the first split. The labels must be that
# split's own target array (target_set[0]); passing the whole target_set list
# would slice across splits instead of across samples.
trainx, trainla = tie_dataset(train_final_set[0], target_set[0])
trainx.shape

(9775, 24, 5)

In [None]:
# Turn every split into sliding-window inputs/targets, for both partitions.
split_ids = range(len(data_list))
train_windows = [tie_dataset(train_final_set[k], target_set[k]) for k in split_ids]
test_windows = [tie_dataset(test_final_set[k], test_target_set[k]) for k in split_ids]

trainx_set = [inputs for inputs, _ in train_windows]
trainy_set = [targets for _, targets in train_windows]
testx_set = [inputs for inputs, _ in test_windows]
testy_set = [targets for _, targets in test_windows]

# Fit one LSTM per split; these are the ensemble members.
members = [generate_model_lstm(inputs, targets)
           for inputs, targets in zip(trainx_set, trainy_set)]
   


Epoch 00001: val_loss improved from inf to 408008.56250, saving model to /content/gdrive/My Drive/Colab Notebooks/generation_data_lstm_checkpoint.h5

Epoch 00002: val_loss improved from 408008.56250 to 378392.62500, saving model to /content/gdrive/My Drive/Colab Notebooks/generation_data_lstm_checkpoint.h5

Epoch 00003: val_loss improved from 378392.62500 to 351691.75000, saving model to /content/gdrive/My Drive/Colab Notebooks/generation_data_lstm_checkpoint.h5

Epoch 00004: val_loss improved from 351691.75000 to 327750.84375, saving model to /content/gdrive/My Drive/Colab Notebooks/generation_data_lstm_checkpoint.h5

Epoch 00005: val_loss improved from 327750.84375 to 306027.62500, saving model to /content/gdrive/My Drive/Colab Notebooks/generation_data_lstm_checkpoint.h5

Epoch 00006: val_loss improved from 306027.62500 to 286383.96875, saving model to /content/gdrive/My Drive/Colab Notebooks/generation_data_lstm_checkpoint.h5

Epoch 00007: val_loss improved from 286383.96875 to 26

In [None]:
# Show the list of fitted ensemble members.
members

In [None]:
# Persist every member as model_<k>.h5 (k starts at 1) in the Drive folder.
n_splits = len(data_list)
for member_no in range(1, n_splits + 1):
    filename = '/content/gdrive/My Drive/Colab Notebooks/model_' + str(member_no) + '.h5'
    members[member_no - 1].save(filename)
    print('>Saved %s' % filename)

In [None]:
# Read the saved members back from Drive, one model per split.
n_splits = len(data_list)
load_members = load_all_models(n_splits)
print('Loaded %d models' % len(load_members))

In [None]:
# Every member predicts on the same evaluation windows: the test partition of
# the last split. Index -1 is used instead of a hard-coded 4, which raises
# IndexError whenever fewer than five splits exist.
predict_set = [member.predict(testx_set[-1]) for member in load_members]

In [None]:
# Root-mean-squared error of each member against the targets of the last
# split (index -1 rather than a hard-coded 4, which is out of range when
# fewer than five splits exist). The root is taken explicitly because the
# `squared` argument of mean_squared_error is deprecated in recent
# scikit-learn releases.
mse_set = [np.sqrt(mean_squared_error(testy_set[-1], prediction))
           for prediction in predict_set]

In [None]:
# Per-member test scores (root-mean-squared error, despite the variable name).
mse_set

In [None]:
# Matplotlib defaults applied to the figures drawn after this cell.
plot_style = {
    'font.family': 'Times New Roman',
    'font.size': 18,
    'figure.figsize': (12, 5),
    'axes.grid': True,
    'axes.grid.axis': 'y',
}
plt.rcParams.update(plot_style)

In [None]:

# Compare every individual member with the running ensemble, i.e. the plain
# average of the first i members, on the test partition of the last split.
single_mse = list()
ensemble_mse = list()

# Index -1 instead of a hard-coded 4, which is out of range when fewer than
# five splits exist.
eval_x, eval_y = testx_set[-1], testy_set[-1]

# Predict once per member (previously every member was re-predicted on each
# iteration) and clamp negative predictions to zero, as the original did.
member_preds = np.array([np.maximum(model.predict(eval_x), 0)
                         for model in load_members])

for i in range(1, n_splits + 1):
    averaged = np.average(member_preds[:i], axis=0)
    # sqrt(MSE) == RMSE; avoids the deprecated `squared=False` argument.
    ensemble = np.sqrt(mean_squared_error(eval_y, averaged))
    single = np.sqrt(mean_squared_error(eval_y, member_preds[i - 1]))

    print('> %d: Single=%.3f, Ensemble=%.3f' % (i, single, ensemble))
    single_mse.append(single)
    ensemble_mse.append(ensemble)

In [None]:
# plot score vs number of ensemble members
print('RMSE Single Learners %.3f (%.3f)' % (mean(single_mse), std(single_mse)))
print('RMSE Ensemble Learners %.3f (%.3f)' % (mean(ensemble_mse), std(ensemble_mse)))

x_axis = list(range(1, n_splits + 1))
plt.plot(x_axis, single_mse, marker='o', label='Single Model')
plt.plot(x_axis, ensemble_mse, marker='o', label='Ensemble Model (Ours)')
plt.title("Root Mean Squared Error: Single Learner vs Ensemble Learners (Bagging)")
plt.xlabel('Iteration')
# The plotted values are root-mean-squared errors (see the title and the
# printed summary), so the axis is labelled RMSE rather than MSE.
plt.ylabel('RMSE')
plt.legend()
plt.show()