# **Exploring Rocket Losses**

When training the Rocket architecture, we noticed that the accuracy reached very high levels while the loss remained far larger than such an accuracy would suggest. We explore why in this notebook.

## **Initialisation**

In [1]:
pip install sktime==0.32.1



In [86]:
import os
import numpy as np
import pickle
import itertools
from tqdm import tqdm
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sktime.transformations.panel.rocket import MiniRocketMultivariate
from keras.models import Model
from keras.optimizers import Adam
from keras.regularizers import L1, L2
from keras.losses import CategoricalCrossentropy
from keras.layers import Input, Layer, Conv1D, MaxPool1D, ReLU, BatchNormalization, LayerNormalization, Dropout, Add, Dense, GlobalMaxPooling1D, Bidirectional, GRU, Activation

In [3]:
# You MUST run this cell before reading in any data from Google Drive: it mounts the
# drive and changes the working directory that the relative paths below depend on.
from google.colab import files
from google.colab import drive
import pandas as pd
drive.mount('/content/drive', force_remount=True)
# Relative paths used later ('../data/...', '../sys_configs.ipynb') resolve against this folder.
os.chdir('/content/drive/My Drive/Colab Notebooks/Thesis/experiments')

# Runs the shared configuration notebook.
# NOTE(review): its contents are not visible here - confirm which names it defines.
%run ../sys_configs.ipynb

Mounted at /content/drive


In [4]:
def _load_split(path):
    """Read one data split stored as two consecutive arrays in a single file.

    Parameters
    ----------
    path : str
        File to open. The first array saved in it is returned as the inputs and
        the second one as the labels.

    Returns
    -------
    tuple
        ``(x, y)`` where ``y`` has been cast to ``np.int64``.
    """
    with open(path, 'rb') as handle:
        inputs = np.load(handle)
        labels = np.load(handle).astype(np.int64)  # second array in the same file
    return inputs, labels


x_train, y_train = _load_split('../data/train.npy')
x_val, y_val = _load_split('../data/val.npy')
x_test, y_test = _load_split('../data/test.npy')

# Per-sample shape of the training inputs (every axis after the first).
sz, dim = x_train.shape[1:]

# Distinct label values seen in the training split.
classes = np.unique(y_train)

# Number of samples in each split.
N_train, N_val, N_test = len(y_train), len(y_val), len(y_test)

In [5]:
# Convert the integer labels to one-hot float32 tensors.
# The depth is derived from the training labels instead of the hard-coded 31 so that
# it always agrees with the width of the softmax output layer.
# NOTE(review): assumes labels are integers in [0, num_classes) - confirm for new data.
num_classes = len(classes)

train_labels_tf = tf.one_hot(y_train, num_classes, dtype=tf.float32)
val_labels_tf = tf.one_hot(y_val, num_classes, dtype=tf.float32)
test_labels_tf = tf.one_hot(y_test, num_classes, dtype=tf.float32)

#### **Prepare Rocket transformation train & validation datasets**

In [6]:
# MiniRocket expects N x D x T, so swap the last two axes of every split.
x_train_, x_val_, x_test_ = (
    np.transpose(split, (0, 2, 1)) for split in (x_train, x_val, x_test)
)

# Fit the MiniRocket kernels on the training split only, then reuse the fitted
# transformer for all three splits.
minirocket_multi = MiniRocketMultivariate(num_kernels=10000, max_dilations_per_kernel=32)
minirocket_multi.fit(x_train_)

train_rocket_np, val_rocket_np, test_rocket_np = (
    minirocket_multi.transform(split).to_numpy()
    for split in (x_train_, x_val_, x_test_)
)

# float32 tensor copies of the transformed features for the Keras models.
train_rocket_tf, val_rocket_tf, test_rocket_tf = (
    tf.convert_to_tensor(features, dtype=tf.float32)
    for features in (train_rocket_np, val_rocket_np, test_rocket_np)
)

In [7]:
# Batch size shared by the three pipelines.
BATCH_SIZE = 64

# Training pipeline: pair features with one-hot labels, shuffle, then batch.
# The shuffle buffer spans the whole training set; a buffer smaller than the dataset
# (previously 500) only shuffles locally and can leave batches correlated with the
# on-disk ordering.
train_rocket_ds = (
    tf.data.Dataset.from_tensor_slices((train_rocket_tf, train_labels_tf))
    .shuffle(buffer_size=N_train)
    .batch(BATCH_SIZE)
)

# Validation and test pipelines are deliberately left unshuffled so that predictions
# stay aligned row-for-row with val_labels_tf / test_labels_tf.
# Every element has the same fixed shape, so plain batch() is sufficient (no padding needed).
val_rocket_ds = tf.data.Dataset.from_tensor_slices((val_rocket_tf, val_labels_tf)).batch(BATCH_SIZE)
test_rocket_ds = tf.data.Dataset.from_tensor_slices((test_rocket_tf, test_labels_tf)).batch(BATCH_SIZE)

In [8]:
# Number of classes: count of distinct label values in the training split.
C = np.unique(y_train).size

## **Experiment: Rocket transformation with *sktime* implementation**

In [9]:
def RocketSktime(shape):
    """Build a single-layer softmax classifier on top of the Rocket features.

    Parameters
    ----------
    shape : tuple
        Shape of one input sample, without the batch axis.

    Returns
    -------
    Model
        Uncompiled Keras model mapping the input directly to ``C`` class probabilities.
    """
    features = Input(shape=shape)
    class_probs = Dense(units=C, activation="softmax")(features)
    return Model(inputs=features, outputs=class_probs)

In [10]:
# Build a separate instance purely to display the architecture.
# The input width is read from the transformed features rather than hard-coded (9996),
# so it stays correct if the MiniRocket kernel count is changed.
rocketsktime = RocketSktime(shape=train_rocket_np.shape[1:])
rocketsktime.summary()

In [11]:
# Input width is taken from the transformed features instead of the hard-coded 9996.
rocketsktime_model = RocketSktime(shape=train_rocket_np.shape[1:])

# NOTE(review): learning_rate=1.0 is three orders of magnitude larger than the 1e-3 used
# in Experiments 2 and 3 - it may contribute to the very large loss values logged
# below; confirm it is intentional.
rocketsktime_model.compile(
    optimizer=Adam(learning_rate=1.0, beta_1=0.99, beta_2=0.999, epsilon=1e-08),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
history_sktime = rocketsktime_model.fit(
    train_rocket_ds,
    validation_data=val_rocket_ds,
    epochs=50,
    verbose=1,
)

Epoch 1/50
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 22ms/step - accuracy: 0.0443 - loss: 16338.2207 - val_accuracy: 0.0704 - val_loss: 10820.7227
Epoch 2/50
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.1075 - loss: 10465.8789 - val_accuracy: 0.1473 - val_loss: 8340.1836
Epoch 3/50
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.1904 - loss: 6839.5840 - val_accuracy: 0.2227 - val_loss: 4946.8486
Epoch 4/50
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.2573 - loss: 4339.7529 - val_accuracy: 0.3032 - val_loss: 2546.4197
Epoch 5/50
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.3286 - loss: 2433.9175 - val_accuracy: 0.3843 - val_loss: 1802.1234
Epoch 6/50
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.4012 - loss: 1419.4506 - val_accuracy: 0.5036 - val_loss: 1095.

## **Extract predictions & targets to CSV**

Make predictions based on the validation data, then compute the sample-wise categorical cross entropy loss which will be useful for observing which samples contribute the most to the overall loss.

In [12]:
# Make predictions based on the validation data.
# val_rocket_ds is built without shuffling (only the training pipeline is shuffled),
# so the prediction rows line up with val_labels_tf.
# NOTE(review): Keras `predict` returns a NumPy array, so the `_tf` suffix is
# presumably historical - confirm before relying on tensor-only methods.
val_preds_tf = rocketsktime_model.predict(val_rocket_ds)

[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step


In [13]:
# Compute the sample-wise categorical cross entropy loss.
# reduction=None disables averaging so one loss value is returned per validation sample.
cce = CategoricalCrossentropy(reduction=None)
samplewise_loss = cce(val_labels_tf, val_preds_tf).numpy().reshape(-1, 1)

# Give the column an explicit name; an unnamed column would be exported to the CSV
# with the uninformative header "0".
categorical_crossentropy = pd.DataFrame(samplewise_loss, columns=["Categorical_Crossentropy"])

Get the index of the target and prediction and concatenate to their respective datasets. Then concatenate both datasets and the categorical cross entropy loss column and export to CSV for offline analysis.

In [14]:
# Class index of the target and of the prediction for every validation sample,
# each kept as a single-column 2-D array so it can be stacked next to the 31 class columns.
val_labels_np = tf.argmax(val_labels_tf, axis=1).numpy()[:, np.newaxis]
val_preds_np = tf.argmax(val_preds_tf, axis=1).numpy()[:, np.newaxis]

In [15]:
# Append the class-index column to the right of the per-class columns.
val_labels = pd.DataFrame(
    np.concatenate((val_labels_tf.numpy(), val_labels_np.reshape(-1, 1)), axis=1)
)
val_preds = pd.DataFrame(
    np.concatenate((val_preds_tf, val_preds_np.reshape(-1, 1)), axis=1)
)
print(f"The shape of val_labels is {val_labels.shape} and the shape of val_preds is {val_preds.shape}")

The shape of val_labels is (1392, 32) and the shape of val_preds is (1392, 32)


In [16]:
# Label the columns positionally instead of calling add_prefix on the current names:
# add_prefix is not idempotent, so re-running the cell would yield "Labels_Labels_0", etc.
# Assigning the full list of names gives the same result however often the cell is run.
val_labels = val_labels.set_axis(
    [f"Labels_{i}" for i in range(val_labels.shape[1])], axis=1
)
val_preds = val_preds.set_axis(
    [f"Predictions_{i}" for i in range(val_preds.shape[1])], axis=1
)

In [17]:
# Place targets, predictions and the per-sample loss side by side (one row per sample).
val_labels_and_preds = pd.concat(
    [val_labels, val_preds, categorical_crossentropy],
    axis="columns",
)

The dataset has:
- Predicted classes: 31 classes + 1 final prediction column
- Target classes: 31 classes + 1 target column
- Categorical cross entropy loss

Therefore there are 65 columns overall.

In [None]:
# Export the combined targets / predictions / per-sample loss table for offline analysis.
# The relative file name resolves against the current working directory.
val_labels_and_preds.to_csv("rocket_analysis.csv", index = False)

Exploring the dataset extracted earlier, it is immediately obvious that the model is learning a perfect mathematical fit on the training data, i.e. that every sample is correctly classified with 100% probability. This is because there are (many) more features than samples. This is why the Rocket transform was failing when even minor regularisation penalties were applied.

Despite this, the model generalises fairly well to validation data, but either perfectly classifies or perfectly misclassifies (all probability mass is focussed on a single, wrong class) each sample. When the sample is misclassified, the categorical cross entropy loss takes a large, consistent value.

Almost every misclassified sample produced the same value in the categorical cross entropy loss.

## **Experiment 2: Refine model**

In this experiment, we attempt to learn a condensed representation of the Rocket transform using an extra dense layer.

In [25]:
def RocketSktime2(shape):
    """Build a softmax classifier with one 64-unit sigmoid hidden layer.

    Parameters
    ----------
    shape : tuple
        Shape of one input sample, without the batch axis.

    Returns
    -------
    Model
        Uncompiled Keras model: input -> Dense(64, sigmoid) -> Dense(C, softmax).
    """
    features = Input(shape=shape)
    condensed = Dense(units=64, activation="sigmoid")(features)
    class_probs = Dense(units=C, activation="softmax")(condensed)
    return Model(inputs=features, outputs=class_probs)

In [26]:
# Build a separate instance purely to display the architecture.
# The input width is read from the transformed features rather than hard-coded (9996).
rocketsktime2 = RocketSktime2(shape=train_rocket_np.shape[1:])
rocketsktime2.summary()

In [30]:
# Input width is taken from the transformed features instead of the hard-coded 9996.
rocketsktime_model2 = RocketSktime2(shape=train_rocket_np.shape[1:])
rocketsktime_model2.compile(
    optimizer=Adam(learning_rate=1e-3, beta_1=0.99, beta_2=0.999, epsilon=1e-08),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
history_sktime2 = rocketsktime_model2.fit(
    train_rocket_ds,
    validation_data=val_rocket_ds,
    epochs=100,
    verbose=1,
)

Epoch 1/100
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 18ms/step - accuracy: 0.0372 - loss: 3.5112 - val_accuracy: 0.0654 - val_loss: 3.4114
Epoch 2/100
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.0499 - loss: 3.4101 - val_accuracy: 0.0496 - val_loss: 3.4068
Epoch 3/100
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.0482 - loss: 3.4269 - val_accuracy: 0.0496 - val_loss: 3.4068
Epoch 4/100
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.0543 - loss: 3.4199 - val_accuracy: 0.0438 - val_loss: 3.4038
Epoch 5/100
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 26ms/step - accuracy: 0.0502 - loss: 3.4155 - val_accuracy: 0.0654 - val_loss: 3.4041
Epoch 6/100
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 26ms/step - accuracy: 0.0496 - loss: 3.4139 - val_accuracy: 0.0654 - val_loss: 3.4040
Epoch 7/100
[1m68/68[0m [

### **Experiment 3: Add Batch Normalisation**

We surmise that the output features of Mini-Rocket may not be normalised, leading to variations in scale across different features. We implement Batch Normalisation to learn the appropriate feature scaling.

In [32]:
def RocketSktime3(shape):
    """Build the hidden-layer classifier with batch normalisation applied to the input.

    Parameters
    ----------
    shape : tuple
        Shape of one input sample, without the batch axis.

    Returns
    -------
    Model
        Uncompiled Keras model:
        input -> BatchNormalization -> Dense(64, sigmoid) -> Dense(C, softmax).
    """
    features = Input(shape=shape)
    normalised = BatchNormalization()(features)
    condensed = Dense(units=64, activation="sigmoid")(normalised)
    class_probs = Dense(units=C, activation="softmax")(condensed)
    return Model(inputs=features, outputs=class_probs)

In [33]:
# Build a separate instance purely to display the architecture.
# The input width is read from the transformed features rather than hard-coded (9996).
rocketsktime3 = RocketSktime3(shape=train_rocket_np.shape[1:])
rocketsktime3.summary()

In [34]:
# Input width is taken from the transformed features instead of the hard-coded 9996.
rocketsktime_model3 = RocketSktime3(shape=train_rocket_np.shape[1:])
rocketsktime_model3.compile(
    optimizer=Adam(learning_rate=1e-3, beta_1=0.99, beta_2=0.999, epsilon=1e-08),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
history_sktime3 = rocketsktime_model3.fit(
    train_rocket_ds,
    validation_data=val_rocket_ds,
    epochs=50,
    verbose=1,
)

Epoch 1/50
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 24ms/step - accuracy: 0.3675 - loss: 2.5345 - val_accuracy: 0.2751 - val_loss: 2.6789
Epoch 2/50
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step - accuracy: 0.7613 - loss: 1.4384 - val_accuracy: 0.5072 - val_loss: 1.8656
Epoch 3/50
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.8711 - loss: 0.9825 - val_accuracy: 0.6300 - val_loss: 1.3489
Epoch 4/50
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step - accuracy: 0.9234 - loss: 0.6931 - val_accuracy: 0.7069 - val_loss: 1.0458
Epoch 5/50
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 37ms/step - accuracy: 0.9560 - loss: 0.4911 - val_accuracy: 0.7112 - val_loss: 0.8964
Epoch 6/50
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 22ms/step - accuracy: 0.9729 - loss: 0.3595 - val_accuracy: 0.7191 - val_loss: 0.8168
Epoch 7/50
[1m68/68[0m [32m━━━━

This experiment worked! The train fit is perfect, and the validation accuracy is very competitive with the best approaches considered so far.

### **Experiment 4: Adding L2 penalisation**

In [87]:
def RocketSktime4(shape):
    """Build the batch-normalised classifier with an L2 penalty on the hidden layer.

    Parameters
    ----------
    shape : tuple
        Shape of one input sample, without the batch axis.

    Returns
    -------
    Model
        Uncompiled Keras model:
        input -> BatchNormalization -> Dense(64, sigmoid, L2(0.0001) on the kernel)
        -> Dense(C, softmax).
    """
    features = Input(shape=shape)
    normalised = BatchNormalization()(features)
    hidden = Dense(
        units=64,
        activation="sigmoid",
        kernel_regularizer=L2(0.0001),
    )
    condensed = hidden(normalised)
    class_probs = Dense(units=C, activation="softmax")(condensed)
    return Model(inputs=features, outputs=class_probs)

In [88]:
# Build a separate instance purely to display the architecture.
# The input width is read from the transformed features rather than hard-coded (9996).
rocketsktime4 = RocketSktime4(shape=train_rocket_np.shape[1:])
rocketsktime4.summary()

In [89]:
# Input width is taken from the transformed features instead of the hard-coded 9996.
rocketsktime_model4 = RocketSktime4(shape=train_rocket_np.shape[1:])

# NOTE(review): learning_rate is 1e-2 here but 1e-3 in the unpenalised Experiment 3, so
# the comparison between the two mixes the effect of the L2 penalty with a tenfold
# change in step size - confirm this is intended.
rocketsktime_model4.compile(
    optimizer=Adam(learning_rate=1e-2, beta_1=0.99, beta_2=0.999, epsilon=1e-08),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
history_sktime4 = rocketsktime_model4.fit(
    train_rocket_ds,
    validation_data=val_rocket_ds,
    epochs=50,
    verbose=1,
)

Epoch 1/50
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 41ms/step - accuracy: 0.2678 - loss: 2.8853 - val_accuracy: 0.1523 - val_loss: 3.8741
Epoch 2/50
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 23ms/step - accuracy: 0.5528 - loss: 2.2999 - val_accuracy: 0.3549 - val_loss: 3.0640
Epoch 3/50
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 23ms/step - accuracy: 0.6430 - loss: 2.0244 - val_accuracy: 0.4188 - val_loss: 2.6608
Epoch 4/50
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 23ms/step - accuracy: 0.7191 - loss: 1.7011 - val_accuracy: 0.5589 - val_loss: 2.0124
Epoch 5/50
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 23ms/step - accuracy: 0.7570 - loss: 1.4398 - val_accuracy: 0.5582 - val_loss: 1.9254
Epoch 6/50
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 28ms/step - accuracy: 0.7393 - loss: 1.4153 - val_accuracy: 0.5826 - val_loss: 1.7758
Epoch 7/50
[1m68/68[0m [32m━━━━

This model was slightly weaker than the unpenalised model. Let's try L1 penalisation.

### **Experiment 5: L1 Penalisation**

In [82]:
def RocketSktime5(shape):
    """Build the batch-normalised classifier with an L1 penalty on the hidden layer.

    Parameters
    ----------
    shape : tuple
        Shape of one input sample, without the batch axis.

    Returns
    -------
    Model
        Uncompiled Keras model:
        input -> BatchNormalization -> Dense(64, sigmoid, L1(0.0001) on the kernel)
        -> Dense(C, softmax).
    """
    features = Input(shape=shape)
    normalised = BatchNormalization()(features)
    hidden = Dense(
        units=64,
        activation="sigmoid",
        kernel_regularizer=L1(0.0001),
    )
    condensed = hidden(normalised)
    class_probs = Dense(units=C, activation="softmax")(condensed)
    return Model(inputs=features, outputs=class_probs)

In [83]:
# Build a separate instance purely to display the architecture.
# The input width is read from the transformed features rather than hard-coded (9996).
rocketsktime5 = RocketSktime5(shape=train_rocket_np.shape[1:])
rocketsktime5.summary()

In [84]:
# Input width is taken from the transformed features instead of the hard-coded 9996.
rocketsktime_model5 = RocketSktime5(shape=train_rocket_np.shape[1:])

# NOTE(review): learning_rate is 1e-2 here but 1e-3 in the unpenalised Experiment 3, so
# the comparison between the two mixes the effect of the L1 penalty with a tenfold
# change in step size - confirm this is intended.
rocketsktime_model5.compile(
    optimizer=Adam(learning_rate=1e-2, beta_1=0.99, beta_2=0.999, epsilon=1e-08),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
history_sktime5 = rocketsktime_model5.fit(
    train_rocket_ds,
    validation_data=val_rocket_ds,
    epochs=50,
    verbose=1,
)

Epoch 1/50
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 25ms/step - accuracy: 0.2573 - loss: 5.0849 - val_accuracy: 0.1516 - val_loss: 6.0094
Epoch 2/50
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 31ms/step - accuracy: 0.5660 - loss: 4.1885 - val_accuracy: 0.3944 - val_loss: 4.0423
Epoch 3/50
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 38ms/step - accuracy: 0.6362 - loss: 3.3333 - val_accuracy: 0.4318 - val_loss: 3.7995
Epoch 4/50
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 23ms/step - accuracy: 0.6985 - loss: 3.0187 - val_accuracy: 0.6078 - val_loss: 3.2517
Epoch 5/50
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 23ms/step - accuracy: 0.7595 - loss: 2.7937 - val_accuracy: 0.5970 - val_loss: 3.1691
Epoch 6/50
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 26ms/step - accuracy: 0.7936 - loss: 2.5987 - val_accuracy: 0.6293 - val_loss: 3.0374
Epoch 7/50
[1m68/68[0m [32m━━━━

Again, the model does not generalise quite as well as the unregularised model. It is sensible to conclude that there is useful information across all random components.