## Generate Training Data for AutoEncoder/Seq2seq Models

    - input: histories for the t-th day; output: histories for the (t+1)-th day
    - the input can be extended to span multiple days
    - similarly, the output can also span multiple days

In [None]:
import datetime
import pandas as pd
import random
import matplotlib.pyplot as plt
import numpy as np
import pickle
import random

In [None]:
# Load the raw readings table from CSV. Later cells read its 'date' column
# and the per-inverter / weather-station 'IN.GMRX.CHAR.*' columns.
inv_df = pd.read_csv('all_inverters.csv')
# Preview the first rows to sanity-check the load.
inv_df.head()

In [None]:
# Inverter IDs to build samples for. The numbering is not contiguous
# (e.g. there is no INV-04 or INV-08), so the IDs are listed explicitly.
inverters = [
    'INV-01', 'INV-02', 'INV-03', 'INV-05', 'INV-06', 'INV-07',
    'INV-09', 'INV-10', 'INV-11', 'INV-13', 'INV-14', 'INV-15',
    'INV-16', 'INV-17', 'INV-18', 'INV-19', 'INV-20', 'INV-21',
    'INV-23', 'INV-24', 'INV-26', 'INV-27', 'INV-28', 'INV-29',
    'INV-31', 'INV-32', 'INV-33', 'INV-34', 'INV-35', 'INV-36',
    'INV-501', 'INV-502', 'INV-503', 'INV-504', 'INV-505', 'INV-506',
    'INV-507', 'INV-508', 'INV-509',
]

In [None]:
TIMESTAMP_COL_NAME = 'date'
min_past_days = 1
remove_night = False

# Rows per window: the data is laid on a 5-minute grid, i.e. 12 rows per hour.
count_row = min_past_days * 24 * 12
# With remove_night only the first half of each window (starting at 06:00) is
# kept; otherwise the whole window is used.
seq_len = count_row // 2 if remove_night else count_row
# Channels emitted per time step, in this order.
signal_cols = ['power', 'temp1', 'rad1']
X, Y = [], []

for inverter in inverters:
    # Source column -> short name. The two 'Ambient Temperature (°C)' columns
    # (WS-20 MW and WS-5 MW) are intentionally left out: they are all nulls.
    rename_map = {
        'IN.GMRX.CHAR.' + inverter + '.Active Power (kW)': 'power',
        'IN.GMRX.CHAR.WS-20 MW.Module Temperature (°C)': 'temp1',
        'IN.GMRX.CHAR.WS-20 MW.POA Irradiance (w/m²)': 'rad1',
        'IN.GMRX.CHAR.WS-5 MW.Module Temperature (°C)': 'temp2',
        'IN.GMRX.CHAR.WS-5 MW.POA Irradiance (w/m²)': 'rad2',
    }
    inv_df_i = inv_df[[TIMESTAMP_COL_NAME] + list(rename_map)].copy()
    inv_df_i = inv_df_i.rename(columns=rename_map)
    inv_df_i[TIMESTAMP_COL_NAME] = pd.to_datetime(inv_df_i[TIMESTAMP_COL_NAME])
    inv_df_i = inv_df_i.sort_values(TIMESTAMP_COL_NAME).reset_index(drop=True)

    # Re-index onto a gap-free 5-minute grid so that a fixed number of rows
    # always spans a fixed amount of time; missing timestamps become NaN rows.
    min_date = inv_df_i[TIMESTAMP_COL_NAME].min()
    max_date = inv_df_i[TIMESTAMP_COL_NAME].max()
    grid = pd.DataFrame({TIMESTAMP_COL_NAME: pd.date_range(
        start=min_date, end=max_date, freq="5min")})
    df_full = grid.merge(inv_df_i, on=TIMESTAMP_COL_NAME, how='left')

    # Derive the clock fields from the full grid (not from the source rows),
    # so rows inserted by the merge also carry a valid hour/minute.
    df_full['hour'] = df_full[TIMESTAMP_COL_NAME].dt.hour
    df_full['minute'] = df_full[TIMESTAMP_COL_NAME].dt.minute

    # Missing power readings at night (before 06:00 or from 19:00) are set to 0.
    is_night = (df_full['hour'] > 18) | (df_full['hour'] < 6)
    df_full.loc[df_full['power'].isna() & is_night, 'power'] = 0

    # A sample may start only at 06:00 sharp, and only if every channel is
    # fully populated in both the preceding and the following window.
    valid_start = (df_full['hour'] == 6) & (df_full['minute'] == 0)
    for col in signal_cols:
        # rolling(...).count() at row i covers rows i-count_row+1 .. i, while
        # the input slice below is rows i-count_row .. i-1. Shifting by one
        # makes the check describe exactly the rows that end up in the input.
        past_count = df_full[col].rolling(count_row).count().shift(1)
        # Rolling over the reversed series counts rows i .. i+count_row-1,
        # which is exactly the output slice.
        future_count = df_full[col][::-1].rolling(count_row).count()[::-1]
        df_full[col + '_count_b'] = past_count
        df_full[col + '_count_f'] = future_count
        valid_start &= (past_count == count_row) & (future_count == count_row)

    df_feature = df_full[valid_start]
    count_data = 0

    # Positions of the valid start rows; used directly with iloc instead of
    # searching df_full for each timestamp again.
    for date_index in np.flatnonzero(valid_start.to_numpy()):
        inp_start = date_index - count_row
        if inp_start < 0:
            # A negative start would wrap around in iloc; never a full window.
            continue
        inp_df = df_full.iloc[inp_start:inp_start + seq_len]
        out_df = df_full.iloc[date_index:date_index + seq_len]
        if inp_df.shape[0] == seq_len and out_df.shape[0] == seq_len:
            X.append(inp_df[signal_cols].values)
            Y.append(out_df[signal_cols].values)
            count_data += 1
    # Per inverter: candidate start rows vs. samples actually emitted.
    print(inverter, df_feature.shape[0], count_data)

if not X:
    raise ValueError("no complete input/output windows were found for any inverter")

# Stack the per-sample (seq_len, 3) arrays along a new leading sample axis.
X_ = np.stack(X, axis=0)
Y_ = np.stack(Y, axis=0)

In [None]:
# Inspect the merged frame left over from the loop above; df_full is rebuilt
# on every iteration, so this shows only the last inverter processed.
df_full

In [None]:
# X_ and Y_ were already stacked at the end of the generation cell, so the
# np.stack calls are not repeated here:
# X_ = np.stack(X, axis=0)
# Y_ = np.stack(Y, axis=0)
# Show the shapes of the stacked input and target arrays.
X_.shape, Y_.shape

In [None]:
# Persist the stacked arrays as a single pickled [X_, Y_] list.
with open('autoencoder-data-1d.pkl', 'wb') as handle:
    pickle.dump([X_, Y_], handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
import random

def plot_sample(example_X, example_Y, num_samples=1, predictions=None):
    """Plot randomly chosen input/target windows, one figure per sample.

    Each figure has three side-by-side panels titled 'Power', 'Temperature'
    and 'Irradiance', showing channels 0, 1 and 2 of the last axis. The input
    window is drawn first and the target window is drawn right after it on
    the x axis.

    Args:
        example_X: array indexed as [sample, step, channel] holding the
            input windows; must have at least three channels.
        example_Y: array indexed the same way holding the target windows.
        num_samples: number of figures to draw; samples are picked at random
            with replacement.
        predictions: optional array indexed like ``example_Y``; when given,
            it is overlaid on the target part of every panel.

    Raises:
        ValueError: if ``example_X`` contains no samples.
    """
    n_examples = len(example_X)
    if n_examples == 0:
        raise ValueError("example_X is empty; nothing to plot")

    len_x, len_y = example_X.shape[1], example_Y.shape[1]
    # x positions for the target window, placed directly after the input.
    shifted_x = list(range(len_x, len_x + len_y))
    titles = ('Power', 'Temperature', 'Irradiance')

    for ii in range(num_samples):
        # randrange excludes the upper bound; randint(0, n) could return n,
        # which is one past the last valid sample index.
        cnum = random.randrange(n_examples)
        f = plt.figure(ii + 1, figsize=(15, 4))
        for channel, title in enumerate(titles):
            ax = f.add_subplot(1, 3, channel + 1)
            ax.plot(example_X[cnum, :, channel])
            ax.plot(shifted_x, example_Y[cnum, :, channel])
            if predictions is not None:
                ax.plot(shifted_x, predictions[cnum, :, channel])
            ax.title.set_text(title)


In [None]:
plot_sample(X_, Y_, 5)

In [None]:
def count_nan(df):
    """Debug helper: print the received object, then stop execution.

    Meant to be passed to ``.apply`` to inspect what a single call receives.
    Raises ``SystemExit`` with the message "HERE" after printing.
    """
    # Imported locally because the file's import cell does not import sys.
    import sys

    print(df)
    sys.exit("HERE")
    
# Scratch check: number of non-null power readings in each trailing window of
# min_past_days * 24 * 12 rows (the alternative was .apply(count_nan)).
df_full['power'].rolling(min_past_days * 24 * 12).count() # apply(count_nan)