# BTAP_ML Phase 3, Task 2 Testing

**Description**    
This notebook tests a suite of candidate Multi-Layer Perceptron (MLP) designs on a preprocessed fold of a train/test/validation set, which is provided by the main btap_ml program. The results of the training process are written to a specified output folder for future analysis. Note that some manual edits to the files used may be needed.

In [None]:
# Adjust these based on where the file is run
import prepare_weather as pw
import preprocessing as pp
import feature_selection as fs
import predict as pred

import csv
import datetime
import json
import logging
import os
import time
from math import sqrt
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras as keras
import typer
from keras import backend as K
from keras import regularizers  # for l2 regularization
from keras.callbacks import EarlyStopping
from keras.layers.core import Dense, Dropout, Flatten
from keras.models import Sequential
from keras_tuner import Hyperband
from matplotlib import pyplot as plt
from sklearn import metrics
from sklearn.preprocessing import RobustScaler
from tensorboard.plugins.hparams import api as hp
from tensorflow.keras import layers

In [None]:
# The root folder path of where the preprocessed data is located
# (when left empty, the two paths below are relative to the working directory)
FILE_PREFIX = ''
# Where the preprocessing.json and feature_selection.json files are located in that root folder path.
# os.path.join is used so FILE_PREFIX works with or without a trailing path separator.
preprocessed_data_file = os.path.join(FILE_PREFIX, 'preprocessing', 'preprocessing.json')
selected_features_file = os.path.join(FILE_PREFIX, 'feature_selection', 'feature_selection.json')
# The random seed to be used when testing
random_seed = 7
# The ability to use parameter tuning (it is recommended to turn this off)
param_search = 'no'
# The output path from the working directory (can be left blank to have the outputs appear within this working directory)
output_path = ''

In [None]:
# Load the preprocessed train/test/validation data, keep only the selected
# features, and scale the inputs and the training target for model training.

# StringIO lets the serialized y_train be handed to pandas as a file-like object
from io import StringIO

# Resets all state generated by Keras.
K.clear_session()
start_time = time.time()
# Set the random seeds
np.random.seed(random_seed)
tf.random.set_seed(random_seed)
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so setting it
# here presumably only affects processes started afterwards — confirm
os.environ['PYTHONHASHSEED'] = str(random_seed)

# Load the training, testing, and validation sets
with open(preprocessed_data_file, 'r', encoding='UTF-8') as preprocessing_file:
    preprocessing_json = json.load(preprocessing_file)
# Load the set of features to be used for training
with open(selected_features_file, 'r', encoding='UTF-8') as feature_selection_file:
    features_json = json.load(feature_selection_file)

# Configure the dataframes using the features specified from the preprocessing
features = preprocessing_json["features"]
selected_features = features_json["features"]
X_train = pd.DataFrame(preprocessing_json["X_train"], columns=features)
X_test = pd.DataFrame(preprocessing_json["X_test"], columns=features)
X_validate = pd.DataFrame(preprocessing_json["X_validate"], columns=features)
# y_train is parsed with pd.read_json, so it is expected to be stored as a JSON string
# (TODO confirm against the preprocessing step). It is wrapped in StringIO because passing
# a literal JSON string directly to read_json is deprecated in recent pandas releases.
y_train = pd.read_json(StringIO(preprocessing_json["y_train"]), orient='values').values.ravel()

# Extract the selected features from feature engineering
# (raises KeyError if a selected feature is not among the preprocessed columns)
X_train = X_train[selected_features]
X_test = X_test[selected_features]
X_validate = X_validate[selected_features]
# Number of input columns, later passed to the model builder
col_length = X_train.shape[1]

# Extract the test and validation data for the target variable
y_test_complete = pd.DataFrame(preprocessing_json["y_test_complete"], columns=['energy', 'datapoint_id', 'Total Energy'])
y_test = pd.DataFrame(preprocessing_json["y_test"], columns=['energy', 'datapoint_id'])
y_validate_complete = pd.DataFrame(preprocessing_json["y_validate_complete"], columns=['energy', 'datapoint_id', 'Total Energy'])
y_validate = pd.DataFrame(preprocessing_json["y_validate"], columns=['energy', 'datapoint_id'])

# Can combine the train/test/val sets and generate k folds
# ...

# Scale the data to be used for training and testing.
# Both scalers are fitted on the training data only; the test and validation
# inputs are transformed with the scaler fitted on X_train.
scalerx = RobustScaler()
scalery = RobustScaler()
X_train = scalerx.fit_transform(X_train)
X_test = scalerx.transform(X_test)
X_validate = scalerx.transform(X_validate)
# The scaler expects a 2D array, so the flattened target becomes a single column
y_train = scalery.fit_transform(y_train.reshape(-1, 1))

In [None]:
# Helper for resuming an interrupted run: counts the entries in the results directory.
# os.listdir("") raises FileNotFoundError, so the directory is named explicitly; "." is the
# working directory, which is where the test loop writes its test_*.json files.
# NOTE(review): the "- 1" presumably discounts one non-result entry in that directory —
# confirm before using the value as NUM_FILES_ALREADY_TESTED.
results_directory = "."
len(os.listdir(results_directory)) - 1

In [None]:
# Grid search over MLP designs: every combination of activation, dropout rate,
# learning rate, batch size and layer layout is trained through pred.create_model
# and its results are dumped to one JSON file per test.

# Only needed to flatten the hyperparameter grid into a single loop
import itertools

# The number of epochs to be run (early stopping is enabled by default)
EPOCHS = 50
# The activation functions to test
ACTIVATIONS = ['relu', 'sigmoid']
# The dropout rates to test
# NOTE(review): -1 is not a valid dropout probability; presumably pred.create_model
# treats it as a sentinel (e.g. dropout disabled) — confirm
DROPOUT_RATES = [0.1, -1, 0.5]
# The learning rates to test
LEARNING_RATES = [0.1, 0.01, 0.001, 0.0001, 0.00001]
# Different batch sizes to test
BATCH_SIZES = [90, 16, 32, 128, 256]
# Layer combinations to test (one entry per hidden layer, giving its size)
LAYERS = [
    [56],
    [112],
    [1000],
    [10000],
    [56, 28],
    [112, 56],
    [1000, 500],
    [10000, 5000],
    [56, 28, 14],
    [112, 56, 28],
    [1000, 500, 250],
    [10000, 5000, 2500],
]
# Number of tests already completed. Tests are numbered from 0; the loop skips
# tests 0 .. NUM_FILES_ALREADY_TESTED - 1 and resumes at test number
# NUM_FILES_ALREADY_TESTED, so the default of 0 runs every test.
# Can set to len(os.listdir("file_output_directory")) - 1
NUM_FILES_ALREADY_TESTED = 0

# Request deterministic TensorFlow behaviour; these settings do not change
# between tests, so they are set once before the loop
os.environ["TF_CUDNN_DETERMINISTIC"] = "true"
os.environ["TF_DETERMINISTIC_OPS"] = "true"
os.environ['PYTHONHASHSEED'] = str(random_seed)

# itertools.product iterates in the same order as nested loops over
# activation -> dropout -> learning rate -> batch size -> layers,
# so the test numbering is stable across runs
test_grid = itertools.product(ACTIVATIONS, DROPOUT_RATES, LEARNING_RATES, BATCH_SIZES, LAYERS)
for count, (activation, dropout_rate, learning_rate, batch_size, layer_design) in enumerate(test_grid):
    # Skip the tests that have already been performed
    if count < NUM_FILES_ALREADY_TESTED:
        continue
    # Resets all state generated by Keras.
    K.clear_session()
    start_time = time.time()
    # Re-seed before every test so each design starts from the same random state
    np.random.seed(random_seed)
    tf.random.set_seed(random_seed)
    # Begin the test
    print("TEST NUMBER", count)
    results_pred, hypermodel = pred.create_model(
        dense_layers=layer_design,
        activation=activation,
        optimizer='adam',
        dropout_rate=dropout_rate,
        length=col_length,
        learning_rate=learning_rate,
        epochs=EPOCHS,
        batch_size=batch_size,
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
        y_test_complete=y_test_complete,
        scalery=scalery,
        X_validate=X_validate,
        y_validate=y_validate,
        y_validate_complete=y_validate_complete,
        output_path=output_path,
        path_elec="",
        path_gas="",
        val_building_path="",
    )
    # Also output all training information within one json file
    # Note that the filename maintains all run information, separated by '_'
    # (the file is written relative to the working directory, not to output_path)
    filename = f"test_{activation}_{learning_rate}_{batch_size}_{layer_design}_{dropout_rate}.json"
    with open(filename, 'w', encoding='utf8') as json_output:
        json.dump(results_pred, json_output)