In [None]:
########################################################################################################################
# Filename: threshold_learning_demo.ipynb
#
# Purpose: Demonstrate the use of the 'threshold_learning' library for multi-label
#          classification tasks.
#
# Author(s): Bobby (Robert) Lumpkin
#
# Library Dependencies: numpy, pandas, tensorflow, bpmll, threshold_learning
########################################################################################################################

# Threshold Learning Demonstration for Multi-Label Classification

In [2]:
## Import necessary modules
from scipy.io import arff
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn import metrics
from bpmll import bp_mll_loss

## Load and Prep the Data

Let's start by loading in our data. We'll be using the `Yeast` dataset which can be found <a href = "http://www.uco.es/kdis/mllresources/">here</a>. As the description from the link states: "this dataset contains micro-array expressions and phylogenetic profiles for 2417 yeast genes. Each gene is annotated with a subset of 14 functional categories (e.g. Metabolism, energy, etc.) of the top level of the functional catalogue.". The training set is loaded as a pandas datarame, below.

In [3]:
## Load the 'Yeast' dataset
data_train = arff.loadarff('Yeast/Yeast-train.arff')
df_train = pd.DataFrame(data_train[0])

df_train.head()

Unnamed: 0,Att1,Att2,Att3,Att4,Att5,Att6,Att7,Att8,Att9,Att10,...,Class5,Class6,Class7,Class8,Class9,Class10,Class11,Class12,Class13,Class14
0,0.0937,0.139771,0.062774,0.007698,0.083873,-0.119156,0.073305,0.00551,0.027523,0.043477,...,b'0',b'0',b'0',b'0',b'0',b'0',b'0',b'0',b'0',b'0'
1,-0.022711,-0.050504,-0.035691,-0.065434,-0.084316,-0.37856,0.038212,0.08577,0.182613,-0.055544,...,b'0',b'0',b'1',b'1',b'0',b'0',b'0',b'1',b'1',b'0'
2,-0.090407,0.021198,0.208712,0.102752,0.119315,0.041729,-0.021728,0.019603,-0.063853,-0.053756,...,b'0',b'0',b'0',b'0',b'0',b'0',b'0',b'1',b'1',b'0'
3,-0.085235,0.00954,-0.013228,0.094063,-0.013592,-0.030719,-0.116062,-0.131674,-0.165448,-0.123053,...,b'0',b'0',b'0',b'0',b'0',b'0',b'0',b'1',b'1',b'1'
4,-0.088765,-0.026743,0.002075,-0.043819,-0.005465,0.004306,-0.055865,-0.071484,-0.159025,-0.111348,...,b'0',b'0',b'0',b'0',b'0',b'0',b'0',b'0',b'0',b'0'


### Conversion to Floats & Numpy Arrays 
The label indicators are loaded as bytes literals. We'll convert them to floats and generate numpy arrays of covariate and label values, to be used for training later.

In [5]:
## Convert the class labels into floats
label_names = []
for name in df_train.columns:
    if "Class" in name:
        label_names.append(name)
df_train[label_names] = df_train[label_names].astype("float")
df_train.head()

Unnamed: 0,Att1,Att2,Att3,Att4,Att5,Att6,Att7,Att8,Att9,Att10,...,Class5,Class6,Class7,Class8,Class9,Class10,Class11,Class12,Class13,Class14
0,0.0937,0.139771,0.062774,0.007698,0.083873,-0.119156,0.073305,0.00551,0.027523,0.043477,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.022711,-0.050504,-0.035691,-0.065434,-0.084316,-0.37856,0.038212,0.08577,0.182613,-0.055544,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
2,-0.090407,0.021198,0.208712,0.102752,0.119315,0.041729,-0.021728,0.019603,-0.063853,-0.053756,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,-0.085235,0.00954,-0.013228,0.094063,-0.013592,-0.030719,-0.116062,-0.131674,-0.165448,-0.123053,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
4,-0.088765,-0.026743,0.002075,-0.043819,-0.005465,0.004306,-0.055865,-0.071484,-0.159025,-0.111348,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
X_train = df_train[np.setdiff1d(df_train.columns, label_names)].to_numpy()
Y_train = df_train[label_names].to_numpy()

## Define and Train a Network Using Cross Entropy Loss 

We'll start with a two-layered network, utilizing a standard cross-entropy loss function. This is in contrast to the novel multi-label loss function: "BP-MLL" discussed in Zhang & Zhou (2006). We will fit an identical network architecture using BP-MLL later on, for comparison. Note, we're using relu and sigmoid activations, dropout regularization, and Adagrad optimization. This is due to intuitions from Nam et al. (2014) where results for networks using 'standard' losses performed as well, or better than BP-MLL when using similar designs.

In [116]:
## Start with standard cross-entropy loss (bpmll used later)
tf.random.set_seed(123)

model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(128, activation = 'relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(14, activation = 'sigmoid')
])

optim_func = tf.keras.optimizers.Adagrad(
    learning_rate = 0.0001, initial_accumulator_value = 0.1, epsilon = 1e-07,
    name = 'Adagrad')

model.compile(optimizer = optim_func,
              loss = 'categorical_crossentropy',
              )

In [117]:
model.fit(X_train, Y_train, epochs = 500)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x1eb07702220>

## Compare Test Set Performance Using Constant and Learned Threshold Functions

Next, let's compare how test-set performance is affected when a constant threshold function is swapped out for a learned threshold function. After loading the test data and generating logit predictions, we'll apply a constant threshold function ($t(x) \equiv 0.5$) to generate binary predictions. Using hamming loss as a metric, we'll then evaluate how our model performed.

In [None]:
## Load the test data
data_test = arff.loadarff('Yeast/Yeast-test.arff')
df_test = pd.DataFrame(data_test[0])
df_test[label_names] = df_test[label_names].astype("float")
X_test = df_test[np.setdiff1d(df_test.columns, label_names)].to_numpy()
Y_test = df_test[label_names].to_numpy()

df_test.head()

In [118]:
## Using a constant 0.5 threshold function, get the hamming loss for the trained network on the test set
predictions = model.predict(X_test)
predictions_binary = model.predict(X_test)
for i in range(Y_test.shape[0]):
    for j in range(Y_test.shape[1]):
        if predictions_binary[i, j] > 0.5:
            predictions_binary[i, j] = 1
        else:
            predictions_binary[i, j] = 0

# Get the hamming loss
metrics.hamming_loss(Y_test, predictions_binary)

0.5559277145972893

A hamming loss of $\approx 0.56$ doesn't surpass random guessing (where guessing occurs at the level of individual labels). Next, we'll utilize the `predict_test_labels_binary()` function from the `threshold_learning` library to generate binary predictions from a learned threshold function. We find that the hamming loss can be significantly reduced.

In [119]:
## Learn a Threshold Function
Y_train_pred = model.predict(X_train)
Y_test_pred = model.predict(X_test)
t_range = (0, 1)
from threshold_learning import predict_test_labels_binary

test_labels_binary, threshold_function = predict_test_labels_binary(Y_train_pred, Y_train, Y_test_pred, t_range)
metrics.hamming_loss(Y_test, test_labels_binary)

0.2862595419847328

## Define and Train a Network Using BP-MLL Loss

Using the same architecture as in the previous example, we'll now learn a network using the BP-MLL loss function as described in Zhang & Zhao (2006). This loss function aims to increase model performance by minimizing pairwise errors. Namely, let $c_j^i$ denote the output of the label $j$ node of the  network for instance $i$. Furthermore, let $Y_i$ denote the label set of instance $i$ and $\overline{Y}_i$ denote it's complement, in the set of possible labels. Then, the BP-MLL loss is given by:

$$
    E = \sum_{i = 1}^m E_i = \sum_{i = 1}^m \frac{1}{|Y_i| |\overline{Y}_i|} \sum_{(k,l) \in Y_i \times \overline{Y}_i} \exp(-(c_k^i - c_l^i))
$$

so that the $i^{th}$ error term is severely penalized if $c_k^i$ is much smaller than $c_l^i$. 

In [120]:
## Start with bp-mll loss
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(128, activation = 'relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(14, activation = 'sigmoid')
])

optim_func = tf.keras.optimizers.Adagrad(
    learning_rate = 0.0001, initial_accumulator_value = 0.1, epsilon = 1e-07,
    name = 'Adagrad')

model.compile(optimizer = optim_func,
              loss = bp_mll_loss,
              )

In [121]:
model.fit(X_train, Y_train, epochs = 500)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x1eb07a9dee0>

## Compare Test Set Performance Using Constant and Learned Threshold Functions

Again, we'll compute the hamming loss on our test data using both a constant and learned threshold function.

In [122]:
## Using a constant 0.5 threshold function, get the hamming loss for the trained network on the test set
predictions = model.predict(X_test)
predictions_binary = model.predict(X_test)
for i in range(Y_test.shape[0]):
    for j in range(Y_test.shape[1]):
        if predictions_binary[i, j] > 0.5:
            predictions_binary[i, j] = 1
        else:
            predictions_binary[i, j] = 0

# Get the hamming loss
metrics.hamming_loss(Y_test, predictions_binary)

0.3599470322480137

In [123]:
## Learn a Threshold Function
Y_train_pred = model.predict(X_train)
Y_test_pred = model.predict(X_test)
t_range = (0, 1)
from threshold_learning import predict_test_labels_binary

test_labels_binary, threshold_function = predict_test_labels_binary(Y_train_pred, Y_train, Y_test_pred, t_range)
metrics.hamming_loss(Y_test, test_labels_binary)

0.3110297554136158

## Conclusion

We see here that performance with BP-MLL vastly improves upon cross-entropy, when using a constant threshold function. Furthermore, similar to the cross-entropy loss model, performance improves with a learned threshold function. That said, it's important to note that we were able to reach a lower hamming loss on test data using cross-entropy loss and a learned threshold function than we were with either of the BP-MLL methods. This mirrors the results found in Nam et al. (2014) and is, in fact, a desirable outcome, for the following reason.

Since computing the BP-MLL loss involves pairwise computations, obtaining error terms is more expensive than utilizing cross-entropy or MSE loss. This scales poorly with the number of labels, and can lead to significantly larger training times. Furthermore, the surface for the BP-MLL loss has plateaus in which gradient descent can be very slow in comparison with the cross-entropy loss function. 


## References

* Jinseok Nam, Jungi Kim, Eneldo Loza Menc´ıa, Iryna Gurevych, and
Johannes F¨urnkranz. Large-scale multi-label text classification —
revisiting neural networks. In Toon Calders, Floriana Esposito, Eyke
H¨ullermeier, and Rosa Meo, editors, Machine Learning and Knowledge
Discovery in Databases, pages 437–452, Berlin, Heidelberg, 2014.
Springer Berlin Heidelberg. ISBN 978-3-662-44851-9.

* Min-Ling Zhang and Zhi-Hua Zhou. Ml-knn: A lazy learning approach to
multi-label learning. Pattern Recognition, 40(7):2038–2048, 2007. doi:
10.1016/j.patcog.2006.12.019.

* Min-Ling Zhang and Zhi-Hua Zhou. Multilabel neural networks with
applications to functional genomics and text categorization. IEEE
Transactions on Knowledge and Data Engineering, 18(10):1338–1351,
2006. doi: doi:10.1109/TKDE.2006.162.
