## Machine learning process: train and test a DNN


### Based on the following example from TensorFlow/Keras

Title: Timeseries classification from scratch  
Author: [hfawaz](https://github.com/hfawaz/)  
Date created: 2020/07/21  
Last modified: 2021/07/16  
Description: Training a timeseries classifier from scratch on the FordA dataset from the UCR/UEA archive.  

This example shows how to do timeseries classification from scratch, starting from raw
CSV timeseries files on disk. We demonstrate the workflow on the FordA dataset from the
[UCR/UEA archive](https://www.cs.ucr.edu/%7Eeamonn/time_series_data_2018/).


## Setup

In [None]:
import tensorflow.keras as keras
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime
import sklearn

In [None]:
# Parameters cell for Papermill
# These are the default run settings. `dataset_name` selects the branch of the
# data-loading cell, `classifier_name` selects the model builder, and `epochs`
# is passed to model.fit.

dataset_name = 'BasicMotions' # FordA FordB ArrowHead BasicMotions SelfRegulationSCP1 SelfRegulationSCP2 df7_20220710_023405 df8_20220721_182341 df9_20220722_065855(etc)
classifier_name = 'cnn' # cnn fcn resnet11
epochs = 10

In [None]:
# Root of the project tree; datasets are read from `{root_dir}datasets/` and
# results are written under `{root_dir}results/`.
# NOTE(review): this is a hard-coded absolute path for one machine - adjust per environment.
root_dir = "/Users/stevemead/LISAandNEA/"

# run_id combines dataset, classifier and start time, so each run gets its own
# results directory and file-name suffix.
timestamp = datetime.today().strftime('%Y%m%d_%H%M%S')
run_id = f'{dataset_name}_{classifier_name}_{timestamp}'
output_directory = f'{root_dir}results/{run_id}/'

In [None]:
import os

def create_directory(directory_path):
    '''Create ``directory_path``, including any missing parent directories.

    Parameters
    ----------
    directory_path : path of the directory to create.

    Returns
    -------
    ``directory_path`` if this call created the directory, or ``None`` if the
    path already exists or the directory could not be created.
    '''
    try:
        # Attempt the creation directly instead of checking for existence
        # first: this avoids a race between the check and the creation.
        os.makedirs(directory_path)
    except OSError:
        # Covers "already exists" (FileExistsError) as well as permission and
        # invalid-path failures, without hiding unrelated programming errors.
        return None
    return directory_path

def get_output_file_name(output_file, file_type):
    '''Return the path of an output file inside this run's results directory.

    The file is named ``{output_file}_{run_id}.{file_type}`` and placed in the
    module-level ``output_directory``.

    ``os.path.join`` is used so that the result has exactly one separator
    whether or not ``output_directory`` ends with a slash.
    '''
    return os.path.join(output_directory, f'{output_file}_{run_id}.{file_type}')

In [None]:
# Create the per-run results directory. The return value (None when the
# directory already exists or cannot be created) is not checked here.
create_directory(output_directory)

## Load the data

In [None]:
def read_ucr_datasets(filename):
    '''Read a whitespace-delimited UCR text file into features and labels.

    Each row holds the class label in the first column followed by the
    timeseries values.

    Parameters
    ----------
    filename : path or file-like object accepted by ``np.loadtxt``.

    Returns
    -------
    x : 2-D float array, one row per timeseries.
    y : 1-D int array of class labels.
    '''
    # ndmin=2 keeps the array two-dimensional even when the file contains a
    # single row; otherwise the column slicing below would fail.
    data = np.loadtxt(filename, ndmin=2)
    y = data[:, 0]
    x = data[:, 1:]
    return x, y.astype(int)

## Visualize the data

Here we visualize one timeseries example for each class in the dataset.

In [None]:
# Load, rescale and preview the dataset selected by `dataset_name`.
# Every branch leaves x_train, y_train, x_test, y_test and `classes` defined.
# x_* are indexed as (instance, sample, variable) by the code that follows.
if dataset_name in ['FordA','FordB','ArrowHead']:
    
    print('Univariate UCR TSC dataset')
    # Load the data

    x_train, y_train = read_ucr_datasets(f'{root_dir}datasets/{dataset_name}/{dataset_name}_TRAIN.txt')
    x_test, y_test = read_ucr_datasets(f'{root_dir}datasets/{dataset_name}/{dataset_name}_TEST.txt')

    # Visualise the data

    print(type(x_train))
    print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)
    
    print(y_train[:100])
    classes = np.unique(np.concatenate((y_train, y_test), axis=0))

    # Plot the first training series of each class on one figure
    plt.figure()
    for c in classes:
        c_x_train = x_train[y_train == c]
        plt.plot(c_x_train[0], label="class " + str(c))
    plt.legend(loc="best")
    plt.show()
    plt.close()

    # Standardise the data (see comments below re: znormalise)
    # (no rescaling is applied in this branch)

    # Reshape the data
    # Add a trailing axis of size 1 so the univariate series has an explicit variable axis

    x_train = x_train.reshape((x_train.shape[0], x_train.shape[1], 1))
    x_test = x_test.reshape((x_test.shape[0], x_test.shape[1], 1))

elif dataset_name in ['BasicMotions','SelfRegulationSCP1','SelfRegulationSCP2']:
    print('Multivariate UEA TSC dataset')
    
    # Load the data
    from sktime.datasets import load_from_tsfile
    
    x_train, y_train = load_from_tsfile(
        f'{root_dir}datasets/{dataset_name}/{dataset_name}_TRAIN.ts', return_data_type="numpy3d")
    x_test, y_test = load_from_tsfile(
        f'{root_dir}datasets/{dataset_name}/{dataset_name}_TEST.ts', return_data_type="numpy3d")

    # Visualise the data
    print(type(x_train))
    print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)
    # Swap the last two axes (shapes are printed before and after)
    # presumably from (instance, variable, sample) to (instance, sample, variable) - TODO confirm
    x_train = np.transpose(x_train, (0,2,1))
    x_test = np.transpose(x_test, (0,2,1))
    print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)
    
    print(y_train[:100])

    # Rescale variables 0-2 and 3-5 as two groups: each group is divided by the
    # largest absolute value found in that group of the TRAINING data, and the
    # same factors are applied to the test data.
    # NOTE(review): only variable indices 0-5 are handled; a dataset with more
    # than six variables would leave the extra ones unscaled - confirm.
    max_xyz, max_aet = 0,0
    for i in range(0,3):
        max_val = abs(x_train[:,:,i]).max()
        if max_val>max_xyz: max_xyz = max_val
    for i in range(3,6):
        max_val = abs(x_train[:,:,i]).max()
        if max_val>max_aet: max_aet = max_val
    x_train[:,:,0:3] = x_train[:,:,0:3]/max_xyz
    x_train[:,:,3:6] = x_train[:,:,3:6]/max_aet
    x_test[:,:,0:3] = x_test[:,:,0:3]/max_xyz
    x_test[:,:,3:6] = x_test[:,:,3:6]/max_aet

    classes = np.unique(np.concatenate((y_train, y_test), axis=0))

    
    # For each class, plot the first training instance: one figure for
    # variables 0-2 and one for variables 3-5
    for c in classes:
        c_x_train = x_train[y_train == c]
        print(c_x_train.shape)
        plt.figure()
        plt.plot(c_x_train[0,:,0:3], label="class " + str(c))
        plt.legend(loc="best")
        plt.show()
        plt.close()
        plt.figure()
        plt.plot(c_x_train[0,:,3:6], label="class " + str(c))
        plt.legend(loc="best")
        plt.show()
        plt.close()
    
    # Standardise the data (see comments below re: znormalise)

else:
    print('Multivariate LISAandNEA TSC dataset')

    # load the data
    # run_id starts with dataset_name, so this is the part of the dataset name
    # before its first underscore (e.g. 'df7' for 'df7_20220710_023405')
    df_name = run_id.split('_')[0]
    print(df_name)
    
    # load y
    # Labels come from the 'type' column of the dataset's definition file
    df_y_file_path = f'{root_dir}datasets/{df_name}.txt'
    df_y = pd.read_csv(df_y_file_path)
    all_y = df_y['type'].to_numpy()
    # y_train = df_y['type'][:2500]
    # y_test = df_y['type'][2500:]
    
    # load x
    signals = 'fluctuations'
    all_x_file_path = f'{root_dir}datasets/{dataset_name}/dataset_{signals}_{dataset_name}.npy'
    # The loaded values are multiplied by 1e5 before any further scaling
    all_x = np.load(all_x_file_path)*1e5
    # x_train = all_x[:2500,:,:]
    # x_test = all_x[2500:,:,:]

    # split the dataset into training and testing, keeping same proportions of each class in both
    # NOTE(review): the result of train_test_split is overwritten by the fixed
    # slices just below, so this call has no effect on the data used. With
    # shuffle=False and no `stratify` argument it also does not appear to
    # balance class proportions - confirm which split is intended.
    from sklearn.model_selection import train_test_split

    x_train, x_test, y_train, y_test = train_test_split(
        all_x, all_y, test_size=1/6, random_state=1702, shuffle=False
    )
    # Effective split: first 2500 instances for training, the rest for testing.
    # NOTE(review): these are slices of all_x, so the in-place rescaling below
    # presumably also modifies all_x - verify if all_x is reused.
    x_train = all_x[:2500,:,:]
    x_test = all_x[2500:,:,:]
    y_train = all_y[:2500]
    y_test = all_y[2500:]

    print(y_train.shape, x_train.shape, y_test.shape, x_test.shape)

    # rescale the data so that the maximum absolute value of each feature is scaled to unit size
    # (done per group of variables 0-2 and 3-5, using factors from the training data only)

    max_xyz, max_aet = 0,0
    for i in range(0,3):
        max_val = abs(x_train[:,:,i]).max()
        if max_val>max_xyz: max_xyz = max_val
    for i in range(3,6):
        max_val = abs(x_train[:,:,i]).max()
        if max_val>max_aet: max_aet = max_val
    x_train[:,:,0:3] = x_train[:,:,0:3]/max_xyz
    x_train[:,:,3:6] = x_train[:,:,3:6]/max_aet
    x_test[:,:,0:3] = x_test[:,:,0:3]/max_xyz
    x_test[:,:,3:6] = x_test[:,:,3:6]/max_aet
    
    # scalers = {}
    # for i in range(x_train.shape[2]):
    #     scalers[i] = sklearn.preprocessing.MaxAbsScaler()
    #     x_train[:,:,i] = scalers[i].fit_transform(x_train[:,:,i])
    #     print(f'Scale of scaler[{i}]:', max(scalers[i].scale_))
    
    # for i in range(x_test.shape[2]):
    #     x_test[:,:,i] = scalers[i].transform(x_test[:,:,i])

    classes = np.unique(np.concatenate((y_train, y_test), axis=0))
    print(classes)
    
    # For each class, plot the first training instance: one figure for
    # variables 0-2 and one for variables 3-5
    for c in classes:
        c_x_train = x_train[y_train == c]
        print(c_x_train.shape)
        plt.figure(figsize=(10,4))
        plt.plot(c_x_train[0,:,0:3], label="class " + str(c))
        plt.legend(loc="best")
        plt.show()
        plt.close()
        plt.figure(figsize=(10,4))
        plt.plot(c_x_train[0,:,3:6], label="class " + str(c))
        plt.legend(loc="best")
        plt.show()
        plt.close()   

In [None]:
# Sanity check: largest value in the training and test arrays after loading/rescaling
x_train.max(), x_test.max()

In [None]:
# Sanity check: shapes and container types of the four arrays produced by the loading cell
print(y_train.shape, x_train.shape, y_test.shape, x_test.shape)
print(type(y_train), type(x_train), type(y_test), type(x_test))

## Prepare the labels (count classes, shuffle, one-hot encode)

In [None]:
# Number of distinct labels across training and test data; used to size the
# output layer of the models.
num_classes = len(np.unique(np.concatenate((y_train, y_test), axis=0)))
# Display the distinct labels themselves
np.unique(np.concatenate((y_train, y_test), axis=0))

In [None]:
# Shuffle the training instances, applying the same permutation to x and y so
# that they stay aligned. The test set is left in its original order.
# NOTE(review): no random seed is set in this cell, so the order differs between runs.
idx = np.random.permutation(len(x_train))
x_train = x_train[idx]
y_train = y_train[idx]

y_train.shape, y_test.shape, y_train[:10]

In [None]:
import sklearn
from sklearn.preprocessing import OneHotEncoder

# Keep a reference to the test labels as loaded, before they are one-hot encoded
y_test_original = y_test

# Only encode when both label arrays are still one-dimensional
if y_train.ndim == 1 and y_test.ndim == 1:
    # transform the labels from integers to one hot vectors
    # The encoder is fitted on train and test labels together so that both
    # share the same set of categories.
    enc = sklearn.preprocessing.OneHotEncoder(categories='auto')
    enc.fit(np.concatenate((y_train, y_test), axis=0).reshape(-1, 1))
    y_train = enc.transform(y_train.reshape(-1, 1)).toarray()
    y_test = enc.transform(y_test.reshape(-1, 1)).toarray()

# y_ datasets have shape (number_of_samples, number_of_classes)
# Class 0 (or -1 in the original value) is the first class. Index 0 along axis=1
# Class 1 is the second class. Index 1 along axis=1
# NOTE(review): `enc` is only defined when the branch above ran.
y_train.shape, y_test.shape, y_train[:10], enc.categories_

In [None]:
# Convert the one-hot test labels back to class indices (position of the
# largest entry along the class axis).
y_true = np.argmax(y_test, axis=1)
print(y_true.shape, y_true)

In [None]:
# Compare the class indices with the labels as originally loaded. A mismatch
# is only printed, not raised.
# NOTE(review): a match is presumably only expected when the original labels
# are already the integers 0..num_classes-1 - confirm.
try:
    np.testing.assert_array_equal(y_true, y_test_original)
    print('y_true is a perfect match for y_test_original')
except AssertionError as e:
    print(e)

## Build a model or 3


In [None]:
def make_cnn_model(input_shape, num_classes):
    """Build a small two-stage 1-D CNN classifier.

    Each stage is a sigmoid Conv1D (kernel size 7, 'valid' padding) followed
    by average pooling with pool size 3; the stages use 6 and 12 filters.
    The flattened features feed a sigmoid Dense layer with ``num_classes``
    units.

    Parameters
    ----------
    input_shape : shape of one input instance, without the batch axis.
    num_classes : number of units in the output layer.

    Returns
    -------
    An uncompiled ``keras.models.Model``.
    """
    inputs = keras.layers.Input(input_shape)

    features = inputs
    for n_filters in (6, 12):
        features = keras.layers.Conv1D(
            filters=n_filters, kernel_size=7, padding='valid', activation='sigmoid'
        )(features)
        features = keras.layers.AveragePooling1D(pool_size=3)(features)

    flattened = keras.layers.Flatten()(features)
    outputs = keras.layers.Dense(units=num_classes, activation='sigmoid')(flattened)

    return keras.models.Model(inputs=inputs, outputs=outputs)

In [None]:
def make_fcn_model(input_shape, num_classes):
    """Build a fully convolutional 1-D classifier.

    Three Conv1D -> BatchNormalization -> ReLU stages (128, 256 and 128
    filters with kernel sizes 8, 5 and 3, 'same' padding) are followed by
    global average pooling and a softmax Dense layer.

    Parameters
    ----------
    input_shape : shape of one input instance, without the batch axis.
    num_classes : number of units in the softmax output layer.

    Returns
    -------
    An uncompiled ``keras.models.Model``.
    """
    inputs = keras.layers.Input(input_shape)

    features = inputs
    for n_filters, kernel_width in ((128, 8), (256, 5), (128, 3)):
        features = keras.layers.Conv1D(
            filters=n_filters, kernel_size=kernel_width, padding="same"
        )(features)
        features = keras.layers.BatchNormalization()(features)
        features = keras.layers.Activation('relu')(features)

    pooled = keras.layers.GlobalAveragePooling1D()(features)
    class_scores = keras.layers.Dense(num_classes, activation="softmax")(pooled)

    return keras.models.Model(inputs=inputs, outputs=class_scores)

In [None]:
def make_resnet11_model(input_shape, num_classes):
    """Build a 1-D residual network made of three residual blocks.

    Each block stacks three Conv1D + BatchNormalization stages (kernel sizes
    8, 5, 3; ReLU after the first two), adds a batch-normalised shortcut and
    applies a final ReLU. Blocks use 64, 128 and 128 filters. The first two
    blocks pass the shortcut through a 1x1 convolution to match the channel
    count; the third does not need to. Global average pooling and a softmax
    Dense layer produce the output.

    Parameters
    ----------
    input_shape : shape of one input instance, without the batch axis.
    num_classes : number of units in the softmax output layer.

    Returns
    -------
    An uncompiled ``keras.models.Model``.
    """
    base_filters = 64

    def residual_block(block_input, n_filters, project_shortcut):
        """Return the output tensor of one residual block."""
        main_path = block_input
        for kernel_width, with_relu in ((8, True), (5, True), (3, False)):
            main_path = keras.layers.Conv1D(
                filters=n_filters, kernel_size=kernel_width, padding='same'
            )(main_path)
            main_path = keras.layers.BatchNormalization()(main_path)
            if with_relu:
                main_path = keras.layers.Activation('relu')(main_path)

        skip_path = block_input
        if project_shortcut:
            # expand channels for the sum
            skip_path = keras.layers.Conv1D(
                filters=n_filters, kernel_size=1, padding='same'
            )(skip_path)
        skip_path = keras.layers.BatchNormalization()(skip_path)

        summed = keras.layers.add([skip_path, main_path])
        return keras.layers.Activation('relu')(summed)

    inputs = keras.layers.Input(input_shape)

    block_1 = residual_block(inputs, base_filters, project_shortcut=True)
    block_2 = residual_block(block_1, base_filters * 2, project_shortcut=True)
    # no need to expand channels because they are equal
    block_3 = residual_block(block_2, base_filters * 2, project_shortcut=False)

    pooled = keras.layers.GlobalAveragePooling1D()(block_3)
    class_scores = keras.layers.Dense(num_classes, activation="softmax")(pooled)

    return keras.models.Model(inputs=inputs, outputs=class_scores)

In [None]:
# Output files for this run (all placed in the run's results directory)
training_log_file = get_output_file_name('training_log','csv')
init_model_file = get_output_file_name('init_model','h5')
best_model_file = get_output_file_name('best_model','h5')
last_model_file = get_output_file_name('last_model','h5')

# Overwrite best_model_file whenever the validation loss improves
model_checkpoint = keras.callbacks.ModelCheckpoint(
    best_model_file, save_best_only=True, monitor="val_loss")

# Halve the learning rate after 50 epochs without val_loss improvement, down to 1e-4
reduce_lr = keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss", factor=0.5, patience=50, min_lr=0.0001)

# NOTE(review): early_stopping is defined here but is not added to the
# `callbacks` list built in the model-selection cell - confirm whether that is intended.
early_stopping = keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=50, verbose=1)

# Write per-epoch metrics to a fresh CSV file (append=False)
csv_logger = keras.callbacks.CSVLogger(
    filename=training_log_file, separator=",", append=False)

In [None]:
class CustomSaver(keras.callbacks.Callback):
    '''Keras callback that saves a snapshot of the model every ``save_every`` epochs.'''

    # Number of epochs between two snapshots
    save_every = 50

    def on_epoch_end(self, epoch, logs=None):
        '''Save the model when ``epoch + 1`` is a multiple of ``save_every``.

        ``epoch`` is the zero-based index passed by Keras; the file name keeps
        that zero-based index (the snapshot after the 50th epoch is
        ``mid_model_0049``). ``logs`` is accepted for API compatibility and
        is not used.
        '''
        if (epoch + 1) % self.save_every == 0:  # save once every N epochs
            self.model.save(get_output_file_name(f'mid_model_{epoch:04d}','h5'))

In [None]:
# Periodic-snapshot callback instance, added to the `callbacks` list below
saver = CustomSaver()

In [None]:
# Build and compile the model selected by `classifier_name`, and set the
# training hyper-parameters that depend on it.

# Shape of one instance (everything after the instance axis)
input_shape = x_train.shape[1:]
callbacks = [saver, model_checkpoint, csv_logger]
# Whether the training history will contain a learning-rate column; set to
# False for the cnn, which trains without the reduce_lr callback
lr_logged = True
validation_split = 0.2
# epochs is now a papermill parameter
batch_size = 16

if classifier_name == 'cnn':
    model = make_cnn_model(
        input_shape=input_shape, num_classes=num_classes)
    model.compile(
        loss='mean_squared_error', 
        optimizer=keras.optimizers.Adam(),
        metrics=['accuracy']
    )
    lr_logged = False
elif classifier_name == 'fcn':
    callbacks.append(reduce_lr)
    model = make_fcn_model(
        input_shape=input_shape, num_classes=num_classes)
    model.compile(
        optimizer=keras.optimizers.Adam(),
        loss="categorical_crossentropy",
        metrics=["accuracy"],
    )
    # A tenth of the training set, capped at 16
    batch_size = int(min(x_train.shape[0]/10, 16))
elif classifier_name == 'resnet11':
    callbacks.append(reduce_lr)
    model = make_resnet11_model(input_shape=input_shape, num_classes=num_classes)
    model.compile(
        loss='categorical_crossentropy', 
        optimizer=keras.optimizers.Adam(),
        metrics=['accuracy']
    )
    # epochs = 1500
    # A tenth of the training set, capped at 64
    batch_size = int(min(x_train.shape[0]/10, 64))
else:
    # Stop here: continuing would either fail on the undefined `model` below
    # or, in an interactive session, silently reuse a model from an earlier run.
    raise ValueError(f'Model type {classifier_name} unsupported')

# Keep the freshly initialised weights alongside the run's other outputs
model.save_weights(init_model_file)

In [None]:
# Save a top-to-bottom diagram of the model, with layer shapes, to the results directory
model_fig = get_output_file_name('model','png')
keras.utils.plot_model(model, show_shapes=True, rankdir='TB', to_file=model_fig)

In [None]:
# Print the layer-by-layer summary of the selected model
model.summary()

In [None]:
# Record the configuration of this run as a one-row CSV file
df_about_file = get_output_file_name('df_about','csv')

df_about = pd.DataFrame(
    {
        'epochs': epochs,
        'batch_size': batch_size,
        'validation_split': validation_split,
        'run_id': run_id,
        'dataset_name': dataset_name,
        'classifier_name': classifier_name,
        'timestamp': timestamp,
        'lr_logged': lr_logged,
        # total training instances, then split into training and validation datasets
        'num_instances_train': x_train.shape[0],
        # total testing instance, not used for training or validation
        'num_instances_test': x_test.shape[0],
        'num_samples': x_train.shape[1],
        'num_variables': x_train.shape[2],
        'num_classes': num_classes,
    },
    index=[0],
)

df_about.to_csv(df_about_file, index=False)

## Train the model

In [None]:
from time import time

# Wall-clock time of the fit call, in seconds, is kept in fit_duration
fit_start_time = time()

# The last `validation_split` fraction of the (already shuffled) training data
# is held out by Keras for validation; progress output is silenced (verbose=0)
# and per-epoch metrics go to the CSV logger callback instead.
history = model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=callbacks,
    validation_split=validation_split,
    verbose=0
)

fit_duration = time() - fit_start_time
# Save the model as it is after the final epoch (the best-val_loss model is
# saved separately by the ModelCheckpoint callback)
model.save(last_model_file)

In [None]:
from datetime import timedelta

# Report the training time both as H:MM:SS and as whole seconds
print(f'Trained {classifier_name} on {dataset_name} in {str(timedelta(seconds=int(fit_duration)))} ({int(fit_duration)} seconds)')

In [None]:
# Persist the per-epoch training history (one row per epoch, one column per metric)
history_file = get_output_file_name('history','csv')

history_df = pd.DataFrame(history.history)
history_df.to_csv(history_file, index=False)

In [None]:
# Record the metrics of the epoch with the lowest validation loss as a one-row CSV file
df_best_model_file = get_output_file_name('df_best_model','csv')

# Zero-based index of the best epoch and its row of metrics
index_best_model = history_df['val_loss'].idxmin()
row_best_model = history_df.loc[index_best_model]

df_best_model = pd.DataFrame(
    {
        'best_model_train_loss': row_best_model['loss'],
        'best_model_val_loss': row_best_model['val_loss'],
        'best_model_train_acc': row_best_model['accuracy'],
        'best_model_val_acc': row_best_model['val_accuracy'],
        # The learning rate is only present in the history when it was logged;
        # otherwise the column stays at 0.0
        'best_model_learning_rate': row_best_model['lr'] if lr_logged else 0.0,
        'best_model_nb_epoch': index_best_model,
    },
    index=[0],
)

df_best_model.to_csv(df_best_model_file, index=False)

## Plot the model's training and validation loss and accuracy

In [None]:
def plot_history(epochs, history, metric, bpe=None, zoomed=None, save_figure_as=None, show=False):
    '''Plot the training and validation history of one metric over the epochs.

    Parameters
    ----------
    epochs : number of epochs to plot. If the recorded history is shorter or
        longer than this, only the first ``min(epochs, len(history))`` points
        are drawn, so the x and y data always have the same length.
    history : mapping with per-epoch sequences under ``metric`` and
        ``'val_' + metric`` (for example ``History.history``).
    metric : name of the metric, e.g. ``'loss'`` or ``'accuracy'``.
    bpe : optional 1-based epoch number to mark with a vertical dashed line
        (best performing epoch).
    zoomed : optional ``(xmin, xmax)`` limits for the x axis.
    save_figure_as : optional base name; when given the figure is saved as a
        PNG in the run's results directory.
    show : whether to display the figure.
    '''
    train_values = history[metric]
    val_values = history['val_' + metric]
    # Guard against a history whose length differs from `epochs`
    n_points = min(int(epochs), len(train_values), len(val_values))
    xs = np.arange(1, n_points + 1, 1)

    fig, axs = plt.subplots(nrows=1,
                            ncols=1,
                            squeeze=False,
                            figsize=(6, 4))
    ax = axs[0, 0]
    ax.plot(xs, train_values[:n_points], label=f'Training {metric}', zorder=2)
    ax.plot(xs, val_values[:n_points], label=f'Validation {metric}', zorder=1)
    if bpe is not None:
        ax.axvline(x=bpe, ymin=0.0, ymax=0.95, color='purple', ls='--', label='Best performing epoch', zorder=0)
    ax.set(xlabel='Epoch', ylabel=metric.capitalize())
    if metric == 'loss':
        ax.legend(loc='upper right')
    else:
        # Non-loss metrics are drawn on a fixed 0..1.05 scale
        ax.legend(loc='lower right')
        ax.set(ylim=(0, 1.05))
    if zoomed is not None:
        ax.set(xlim=zoomed)
    fig.tight_layout()
    # Save before showing, as the other plotting helpers in this file do, so
    # the saved file does not depend on how the backend treats shown figures.
    if save_figure_as is not None:
        plot_history_file = get_output_file_name(save_figure_as,'png')
        fig.patch.set_alpha(1)
        fig.savefig(plot_history_file)
    if show:
        plt.show()
    plt.close(fig)

# Save the loss and accuracy curves without displaying them
plot_history(epochs, history.history, metric='loss', save_figure_as='epochs_loss')
# plot_history(200, history.history, metric='loss', zoomed=(0-5,200+5), save_figure_as='epochs_loss_zoom200')
plot_history(epochs, history.history, metric='accuracy', save_figure_as='epochs_accuracy')
# plot_history(200, history.history, metric='accuracy', zoomed=(0-5,200+5), save_figure_as='epochs_accuracy_zoom200')

# Same curves with the best epoch marked; index_best_model is zero-based, the
# x axis starts at 1, hence the +1. These are saved and displayed.
plot_history(epochs, history.history, metric='loss', bpe=index_best_model+1, save_figure_as='epochs_loss_bpe', show=True)
plot_history(epochs, history.history, metric='accuracy', bpe=index_best_model+1, save_figure_as='epochs_accuracy_bpe', show=True)


## Evaluate model on test data

In [None]:
# Replace the in-memory model (state after the last epoch) with the checkpoint
# that had the lowest validation loss, then evaluate it on the held-out test data.
model = keras.models.load_model(best_model_file)

evaluation = model.evaluate(x_test, y_test)

In [None]:
# Store the evaluation results as a one-row CSV file, one column per metric name
df_metrics_eval_file = get_output_file_name('df_metrics_eval','csv')

df_metrics_eval = pd.DataFrame(data=np.zeros((1, len(model.metrics_names)), dtype=float), index=[0],
                       columns=model.metrics_names)

# evaluation[i] is paired with model.metrics_names[i]
for i, metric in enumerate(model.metrics_names):
    df_metrics_eval[metric] = evaluation[i]
    print(f'Test {metric}: {evaluation[i]}')

df_metrics_eval.to_csv(df_metrics_eval_file, index=False)

## Make some predictions on test data

In [None]:
# Per-class scores for every test instance
y_pred = model.predict(x_test)

# raw probabilities to chosen class (highest probability)
# After this line y_pred holds class indices, not scores
y_pred = np.argmax(y_pred,axis=1) 

In [None]:
# Sanity check: shapes of the test inputs, one-hot labels and predicted class indices
x_test.shape, y_test.shape, y_pred.shape, y_pred

In [None]:
# Store precision, accuracy, recall and the training duration as a one-row CSV file
df_metrics_test_file = get_output_file_name('df_metrics_test','csv')

df_metrics_test = pd.DataFrame(data=np.zeros((1, 4), dtype=float), index=[0],
                       columns=['precision', 'accuracy', 'recall', 'duration'])

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# Precision and recall are macro-averaged (unweighted mean over classes);
# both y_true and y_pred are class indices here.
df_metrics_test['precision'] = precision_score(y_true, y_pred, average='macro')
df_metrics_test['accuracy'] = accuracy_score(y_true, y_pred)
df_metrics_test['recall'] = recall_score(y_true, y_pred, average='macro')
# Training time in seconds, measured around model.fit
df_metrics_test['duration'] = fit_duration

df_metrics_test.to_csv(df_metrics_test_file, index=False)

## Plot the ROC curve and confusion matrix

In [None]:
from sklearn.metrics import roc_curve, auc

# Draw a ROC curve. pred - the predictions, y - the expected output.
def plot_roc(pred, y, save_figure_as=None):
    """Plot the ROC curve of ``pred`` against the true labels ``y``.

    The area under the curve is shown in the legend and a dashed diagonal is
    drawn for reference. When ``save_figure_as`` is given, the figure is saved
    as a PNG in the run's results directory before it is displayed.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y, pred)
    area_under_curve = auc(false_pos_rate, true_pos_rate)
    curve_label = 'ROC curve (area = %0.2f)' % area_under_curve

    fig = plt.figure()
    plt.plot(false_pos_rate, true_pos_rate, label=curve_label)
    # Reference diagonal
    plt.plot([0, 1], [0, 1], 'k--')
    plt.title('Receiver Operating Characteristic (ROC)')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.legend(loc="lower right")
    plt.tight_layout()

    if save_figure_as is not None:
        fig.patch.set_alpha(1)
        fig.savefig(get_output_file_name(save_figure_as,'png'))
    plt.show()

# Draw a confusion matrix as a colour-coded image.
# cm is the confusion matrix, names are the names of the classes.
def plot_confusion_matrix_original(cm, names, title='Confusion matrix', 
                            cmap=plt.cm.Blues):
    """Draw ``cm`` as an image on the current figure, with class-name ticks.

    The figure is neither shown nor saved by this function.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    # One tick per class on both axes
    class_positions = np.arange(len(names))
    plt.xticks(class_positions, names, rotation=45)
    plt.yticks(class_positions, names)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
if num_classes == 2: # univariate only
    # A ROC curve needs continuous scores: the arg-maxed class indices in
    # y_pred would reduce it to a single operating point. Use the model's
    # score for the class at index 1, the positive class in y_true.
    positive_class_scores = model.predict(x_test)[:, 1]
    plot_roc(positive_class_scores, y_true, save_figure_as='roc')

In [None]:
# Report whether every prediction matches the ground truth; on a mismatch the
# assertion message is printed instead of being raised.
try:
    np.testing.assert_array_equal(y_true, y_pred)
    print('Predictions are a perfect match for ground truth!')
except AssertionError as e:
    print(e)

In [None]:
from sklearn.metrics import confusion_matrix

# Compute confusion matrix
# Both arguments are class indices (true labels first, predictions second)
cm = confusion_matrix(y_true, y_pred)

In [None]:
# Save the raw confusion-matrix counts as CSV (no index column)
df_cm_file = get_output_file_name('df_cm','csv')

df_cm = pd.DataFrame(data=cm)

df_cm.to_csv(df_cm_file, index=False)

In [None]:
# Normalize the confusion matrix by row (i.e by the number of samples
# in each class)
# NOTE(review): a row whose sum is zero would divide by zero here and yield
# NaN - confirm this cannot happen for the datasets in use.
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

In [None]:
# Save the row-normalised confusion matrix as CSV (no index column)
df_cm_norm_file = get_output_file_name('df_cm_norm','csv')

df_cm_norm = pd.DataFrame(data=cm_normalized)

df_cm_norm.to_csv(df_cm_norm_file, index=False)

In [None]:
def plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True,
                          save_figure_as=None):
    """
    given a sklearn confusion matrix (cm), make a nice plot

    Arguments
    ---------
    cm:           confusion matrix from sklearn.metrics.confusion_matrix

    target_names: given classification classes such as [0, 1, 2]
                  the class names, for example: ['high', 'medium', 'low']
                  (None leaves the default numeric ticks)

    title:        accepted for backward compatibility; the title is
                  currently not drawn on the figure

    cmap:         the gradient of the values displayed from matplotlib.pyplot.cm
                  see http://matplotlib.org/examples/color/colormaps_reference.html
                  plt.get_cmap('jet') or plt.cm.Blues
                  (defaults to 'Blues')

    normalize:    If False, plot the raw numbers
                  If True, plot the proportions (each row divided by its sum;
                  a matrix that is already row-normalised is left unchanged,
                  and rows that sum to zero are left as zeros)

    save_figure_as: optional base name; when given the figure is saved as a
                  PNG in the run's results directory before being shown

    Usage
    -----
    plot_confusion_matrix(cm           = cm,                  # confusion matrix created by
                                                              # sklearn.metrics.confusion_matrix
                          normalize    = True,                # show proportions
                          target_names = y_labels_vals,       # list of names of the classes
                          title        = best_estimator_name) # title of graph

    Citation
    --------
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    """
    import matplotlib.pyplot as plt
    import numpy as np
    import itertools

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    cm = np.asarray(cm)
    if normalize:
        # Normalise BEFORE drawing so that the colours and the printed
        # numbers describe the same values.
        row_totals = cm.sum(axis=1, keepdims=True)
        # Avoid 0/0 for classes without any true samples
        row_totals[row_totals == 0] = 1
        cm = cm.astype('float') / row_totals

    fig = plt.figure(figsize=(4.5, 4))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    # Cells above the threshold get white text so it stays readable on the
    # darker colours
    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    cell_format = "{:0.4f}" if normalize else "{:,}"
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cell_format.format(cm[i, j]),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    if save_figure_as is not None:
        cm_file = get_output_file_name(save_figure_as,'png')
        fig.patch.set_alpha(1)
        fig.savefig(cm_file)
    plt.show()

In [None]:
# Print and plot the raw counts; tick labels are the original class labels
# known to the one-hot encoder
print('Confusion matrix')
print(cm)
plot_confusion_matrix(cm, enc.categories_[0], 
        normalize=False, save_figure_as='cm')

In [None]:
# Print and plot the row-normalised matrix. `normalize` is left at its default
# (True), so the function normalises the already-normalised matrix once more.
print('Normalized confusion matrix')
print(cm_normalized)
plot_confusion_matrix(cm_normalized, enc.categories_[0], 
        save_figure_as='cm_norm')
# plot_confusion_matrix_original(cm_normalized, enc.categories_[0])
# plot_confusion_matrix_original(cm_normalized, enc.categories_[0])

In [None]:
# Save the testing rows out of the file used to define the dataset, together
# with the true and predicted class of each row.
# `df_y` is only loaded for the LISAandNEA datasets, so skip the export for
# the UCR/UEA datasets instead of failing on an undefined name.
if 'df_y' in globals():
    # The test split is the tail of the dataset, so take as many rows as
    # there are test labels. Copy so the new columns are added to an
    # independent frame rather than to a slice of df_y.
    df_y_test = df_y.tail(len(y_true)).copy()

    # Append the true and predicted values
    # Integer values correspond to classes in alphabetical order: 
    #   0 = glitch
    #   1 = gwburst
    #   2 = nea

    df_y_test['y_true'] = y_true
    df_y_test['y_pred'] = y_pred

    # Save the testing data for posterity
    df_y_test_file = get_output_file_name('df_y_test','csv')
    df_y_test.to_csv(df_y_test_file, index=False)
else:
    print(f'No label definition file was loaded for {dataset_name}; skipping df_y_test export')
