In [None]:
########################################################################################################################
# Filename: BPMLL_paragraph_classification.ipynb
#
# Purpose: Multi-label Text-categorization, using neural networks, for paragraph-level
#          data as part of STAT 6500 final project.

# Author(s): Bobby (Robert) Lumpkin, Yue Li
#
# Library Dependencies: numpy, pandas, tensorflow, bpmll
########################################################################################################################

# Text Categorization With Neural Networks 

In [1]:
import numpy as np
import pandas as pd
import os
import random
import tensorflow as tf
from bpmll import bp_mll_loss
from sklearn.model_selection import train_test_split
from sklearn import metrics
import sys
sys.path.append('../ThresholdFunctionLearning')    ## Append path to the ThresholdFunctionLearning directory to the interpreters
                                                   ## search path
from threshold_learning import predict_test_labels_binary    ## Import the 'predict_test_labels_binary()' function from the 
                                                             ## threshold_learning library

## Load & Pre-Process the Data

In [2]:
## Load 'content_paragraphs_ready.csv' into a pandas dataframe
data_filepath = "..\..\dataset\content_paragraphs_ready.csv"
paragraph_data = pd.read_csv(data_filepath)
paragraph_data.head()
#os.getcwd()

Unnamed: 0,para_id,full_text,threats/impacts,responses/actions,severity,susceptibility,self-efficacy,external-efficacy,response efficacy,public health,...,prosper,preview,moor,coverag,glow,profil,clash,incumb,frequent,unfound
0,214236,MURPHY: Again Martha we are defacto staying at...,1,1,0,1,1,0,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,214232,"GOV. PHIL MURPHY, (D-NJ): Yes. Good to be back...",1,1,1,1,1,0,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,214266,"BEAUMONT (ON SCREEN UPPER LEFT - ""FRIDAY MARCH...",0,1,0,0,0,1,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,214246,"But in the meantime, my message to Louisiana i...",1,1,1,0,1,0,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,214238,"MURPHY: Yeah listen, we had gotten another shi...",0,1,0,0,0,1,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
## Keep only 'doc_id', the label columns, and the tf-idf columns
tfidf_colnames = list(paragraph_data.columns[25:])
label_columns = list(paragraph_data.columns[2:15])
cols_toKeep = ['doc_id']
cols_toKeep.extend(tfidf_colnames)
cols_toKeep.extend(label_columns)
paragraph_data = paragraph_data[cols_toKeep]
paragraph_data.head()

Unnamed: 0,doc_id,murphi,martha,defacto,stay,home,state,million,us,you�,...,susceptibility,self-efficacy,external-efficacy,response efficacy,public health,economy,education,political evaluation,racial conflict,international ralations/foreign policies
0,text1,1.684247,1.348455,2.161368,3.118616,3.016311,0.91833,1.207125,1.383217,1.763428,...,1,1,0,1,1,0,0,0,0,0
1,text2,1.684247,1.348455,0.0,0.0,0.0,3.67332,0.0,0.0,1.763428,...,1,1,0,1,1,0,0,0,0,0
2,text3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,1,0,0,0,0,0
3,text4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1,0,1,1,0,0,0,0,0
4,text5,1.684247,1.348455,0.0,0.0,0.0,0.0,0.0,1.383217,0.0,...,0,0,1,0,1,0,0,0,0,0


In [4]:
## Define the X and Y train and test matrices
X = paragraph_data[tfidf_colnames].to_numpy().astype(float)
Y = paragraph_data[label_columns].to_numpy().astype(float)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.33, random_state = 321)

In [5]:
## These are the rows that don't have any associated labels
Y_gz = Y > 0
no_labels_id = np.where(~Y_gz.any(axis=1))[0]
no_labels_id

array([ 38,  58,  61,  67,  83, 132, 139, 142, 143, 150, 151, 166, 167,
       180, 214, 221, 240, 279, 282, 283, 284, 285, 286, 287, 288, 289],
      dtype=int64)

## Define and Train a BP-MLL Network (Training Instances That Have Atleast One Label)

In [6]:
## Since some instances don't have any labels, remove them from the dataset for use with BP-MLL
atleast_one_label_ids = paragraph_data.index
atleast_one_label_ids = atleast_one_label_ids.difference(no_labels_id)
X_hasLabel = X[atleast_one_label_ids, ]
Y_hasLabel = Y[atleast_one_label_ids, ]
X_train_hasLabel, X_test_hasLabel, Y_train_hasLabel, Y_test_hasLabel = train_test_split(X_hasLabel, Y_hasLabel, 
                                                                                        test_size = 0.33, random_state = 321)

In [7]:
Y_test_hasLabel

array([[1., 0., 1., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 1., 0., 0.],
       ...,
       [1., 1., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 1., ..., 0., 0., 0.]])

In [7]:
## Compute the proportion of labels that are equal to one in the test_hasLabel set
## This tells us what the hamming loss would be, if we constantly predicted 0
## We can compare our observed hamming loss against this, since we have sparse labels
prop_one_bpmll = np.sum(Y_test_hasLabel == 1) / (Y_test_hasLabel.shape[0] * Y_test_hasLabel.shape[1])
prop_one_bpmll

0.25524475524475526

In [8]:
## Start by defining and compiling the bp-mll loss network 
num_labels = len(label_columns)

model_bpmll = tf.keras.models.Sequential([
    tf.keras.layers.Dense(32, activation = 'relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(num_labels, activation = 'sigmoid')
])

optim_func_bpmll = tf.keras.optimizers.Adagrad(
    learning_rate = 0.001, initial_accumulator_value = 0.1, epsilon = 1e-07,
    name = 'Adagrad')

model_bpmll.compile(optimizer = optim_func_bpmll,
              loss = bp_mll_loss,
              )

In [9]:
tf.random.set_seed(123)
model_bpmll.fit(X_train_hasLabel, Y_train_hasLabel, epochs = 500)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x21106cfd1f0>

### Compare Performance of BP-MLL Loss Using Different Threshold Function Approaches 

In [10]:
## Using a constant 0.5 threshold function, get the hamming loss for the trained network on the test set
predictions = model_bpmll.predict(X_test_hasLabel)
predictions_binary = model_bpmll.predict(X_test_hasLabel)
for i in range(Y_test_hasLabel.shape[0]):
    for j in range(Y_test_hasLabel.shape[1]):
        if predictions_binary[i, j] > 0.5:
            predictions_binary[i, j] = 1
        else:
            predictions_binary[i, j] = 0

# Get the hamming loss
metrics.hamming_loss(Y_test_hasLabel, predictions_binary)

0.28583916083916083

In [11]:
## Learn a Threshold Function
Y_train_pred = model_bpmll.predict(X_train_hasLabel)
Y_test_pred = model_bpmll.predict(X_test_hasLabel)
t_range = (0, 1)

test_labels_binary, threshold_function = predict_test_labels_binary(Y_train_pred, Y_train_hasLabel, Y_test_pred, t_range)
metrics.hamming_loss(Y_test_hasLabel, test_labels_binary)

0.1853146853146853

### Deeper Architectures

In [12]:
## Add an additional hidden layer
model_bpmll_deep = tf.keras.models.Sequential([
    tf.keras.layers.Dense(32, activation = 'relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(16, activation = 'relu'),
    tf.keras.layers.Dense(num_labels, activation = 'sigmoid')
])

optim_func_bpmll = tf.keras.optimizers.Adagrad(
    learning_rate = 0.001, initial_accumulator_value = 0.1, epsilon = 1e-07,
    name = 'Adagrad')

model_bpmll_deep.compile(optimizer = optim_func_bpmll,
              loss = bp_mll_loss,
              )

model_bpmll_deep.fit(X_train_hasLabel, Y_train_hasLabel, epochs = 500)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x211088a71f0>

In [13]:
## Using a constant 0.5 threshold function, get the hamming loss for the trained network on the test set
predictions = model_bpmll_deep.predict(X_test_hasLabel)
predictions_binary = model_bpmll_deep.predict(X_test_hasLabel)
for i in range(Y_test_hasLabel.shape[0]):
    for j in range(Y_test_hasLabel.shape[1]):
        if predictions_binary[i, j] > 0.5:
            predictions_binary[i, j] = 1
        else:
            predictions_binary[i, j] = 0

# Get the hamming loss
metrics.hamming_loss(Y_test_hasLabel, predictions_binary)

0.2736013986013986

In [14]:
## Learn a Threshold Function
Y_train_pred = model_bpmll_deep.predict(X_train_hasLabel)
Y_test_pred = model_bpmll_deep.predict(X_test_hasLabel)
t_range = (0, 1)

test_labels_binary, threshold_function = predict_test_labels_binary(Y_train_pred, Y_train_hasLabel, Y_test_pred, t_range)
metrics.hamming_loss(Y_test_hasLabel, test_labels_binary)

0.2097902097902098

## Define and Train a Cross-Entropy Loss Network (Training Instances That Have Atleast One Label)

In [15]:
## Start with standard cross-entropy loss (bpmll used later)

model_ce_hasLabel = tf.keras.models.Sequential([
    tf.keras.layers.Dense(32, activation = 'relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(num_labels, activation = 'sigmoid')
])

optim_func_ce = tf.keras.optimizers.Adagrad(
    learning_rate = 0.001, initial_accumulator_value = 0.1, epsilon = 1e-07,
    name = 'Adagrad')

model_ce_hasLabel.compile(optimizer = optim_func_ce,
              loss = 'categorical_crossentropy',
              )

In [16]:
model_ce_hasLabel.fit(X_train_hasLabel, Y_train_hasLabel, epochs = 500)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x21108e4c5e0>

### Compare Performance of CE Loss Using Different Threshold Function Approaches

In [17]:
## Using a constant 0.5 threshold function, get the hamming loss for the trained network on the test set
predictions = model_ce_hasLabel.predict(X_test_hasLabel)
predictions_binary = model_ce_hasLabel.predict(X_test_hasLabel)
for i in range(Y_test_hasLabel.shape[0]):
    for j in range(Y_test_hasLabel.shape[1]):
        if predictions_binary[i, j] > 0.5:
            predictions_binary[i, j] = 1
        else:
            predictions_binary[i, j] = 0

# Get the hamming loss
metrics.hamming_loss(Y_test_hasLabel, predictions_binary)

0.5629370629370629

In [18]:
## Learn a Threshold Function
Y_train_pred = model_ce_hasLabel.predict(X_train_hasLabel)
Y_test_pred = model_ce_hasLabel.predict(X_test_hasLabel)
t_range = (0, 1)

test_labels_binary, threshold_function = predict_test_labels_binary(Y_train_pred, Y_train_hasLabel, Y_test_pred, t_range)
metrics.hamming_loss(Y_test_hasLabel, test_labels_binary)

0.23601398601398602

### Deeper Architectures

In [19]:
## Add an additional hidden layer
model_ce_deep = tf.keras.models.Sequential([
    tf.keras.layers.Dense(32, activation = 'relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(16, activation = 'relu'),
    tf.keras.layers.Dense(num_labels, activation = 'sigmoid')
])

optim_func_ce = tf.keras.optimizers.Adagrad(
    learning_rate = 0.001, initial_accumulator_value = 0.1, epsilon = 1e-07,
    name = 'Adagrad')

model_ce_deep.compile(optimizer = optim_func_ce,
              loss = 'categorical_crossentropy',
              )

model_ce_deep.fit(X_train_hasLabel, Y_train_hasLabel, epochs = 500)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x21106b56b80>

In [20]:
## Using a constant 0.5 threshold function, get the hamming loss for the trained network on the test set
predictions = model_ce_hasLabel.predict(X_test_hasLabel)
predictions_binary = model_ce_hasLabel.predict(X_test_hasLabel)
for i in range(Y_test_hasLabel.shape[0]):
    for j in range(Y_test_hasLabel.shape[1]):
        if predictions_binary[i, j] > 0.5:
            predictions_binary[i, j] = 1
        else:
            predictions_binary[i, j] = 0

# Get the hamming loss
metrics.hamming_loss(Y_test_hasLabel, predictions_binary)

0.5629370629370629

In [21]:
## Learn a Threshold Function
Y_train_pred = model_ce_deep.predict(X_train_hasLabel)
Y_test_pred = model_ce_deep.predict(X_test_hasLabel)
t_range = (0, 1)

test_labels_binary, threshold_function = predict_test_labels_binary(Y_train_pred, Y_train_hasLabel, Y_test_pred, t_range)
metrics.hamming_loss(Y_test_hasLabel, test_labels_binary)

0.5340909090909091

## Define and Train a Cross-Entropy Loss Network (All Training Instances)

In [22]:
## Compute the proportion of labels that are equal to one in the test set
## This tells us what the hamming loss would be, if we constantly predicted 0
## We can compare our observed hamming loss against this, since we have sparse labels
prop_one = np.sum(Y_test == 1) / (Y_test.shape[0] * Y_test.shape[1])
prop_one

0.23798076923076922

In [34]:
## Start with standard cross-entropy loss (bpmll used later)
random.seed(123)
model_ce = tf.keras.models.Sequential([
    tf.keras.layers.Dense(32, activation = 'relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(num_labels, activation = 'sigmoid')
])

optim_func_ce = tf.keras.optimizers.Adagrad(
    learning_rate = 0.0001, initial_accumulator_value = 0.1, epsilon = 1e-07,
    name = 'Adagrad')

model_ce.compile(optimizer = optim_func_ce,
              loss = 'categorical_crossentropy',
              )

In [35]:
tf.random.set_seed(321)
model_ce.fit(X_train, Y_train, epochs = 1000)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

<tensorflow.python.keras.callbacks.History at 0x2110bbe88e0>

### Compare Performance of CE Loss Using Different Threshold Function Approaches

In [36]:
## Using a constant 0.5 threshold function, get the hamming loss for the trained network on the test set
predictions = model_ce.predict(X_test)
predictions_binary = model_ce.predict(X_test)
for i in range(Y_test.shape[0]):
    for j in range(Y_test.shape[1]):
        if predictions_binary[i, j] > 0.5:
            predictions_binary[i, j] = 1
        else:
            predictions_binary[i, j] = 0

# Get the hamming loss
metrics.hamming_loss(Y_test, predictions_binary)

0.4110576923076923

In [37]:
## Learn a Threshold Function
Y_train_pred = model_ce.predict(X_train)
Y_test_pred = model_ce.predict(X_test)
t_range = (0, 1)

test_labels_binary, threshold_function = predict_test_labels_binary(Y_train_pred, Y_train, Y_test_pred, t_range)
metrics.hamming_loss(Y_test, test_labels_binary)

0.265224358974359

In [24]:
## Compare predicted test labels for instance against true test labels in next cell
test_labels_binary[4,]

array([0., 1., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0.])

In [40]:
## Compare true test labels for instance against predicted test labels in previous cell
Y_test[4,]

array([1., 1., 0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0.])

### Deeper Architectures

In [38]:
## Standard cross-entropy loss 
random.seed(123)
model_ce_all_deep = tf.keras.models.Sequential([
    tf.keras.layers.Dense(64, activation = 'relu'),
    #tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(32, activation = 'relu'),
    #tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Dense(32, activation = 'relu'),
    #tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Dense(16, activation = 'relu'),
    tf.keras.layers.Dense(num_labels, activation = 'sigmoid')
])

optim_func_ce = tf.keras.optimizers.Adagrad(
    learning_rate = 0.0001, initial_accumulator_value = 0.1, epsilon = 1e-07,
    name = 'Adagrad')

model_ce_all_deep.compile(optimizer = optim_func_ce,
              loss = 'categorical_crossentropy',
              )

In [39]:
tf.random.set_seed(321)
model_ce_all_deep.fit(X_train, Y_train, epochs = 1000)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

<tensorflow.python.keras.callbacks.History at 0x211089c1790>

In [40]:
## Using a constant 0.5 threshold function, get the hamming loss for the trained network on the test set
predictions = model_ce_all_deep.predict(X_test)
predictions_binary = model_ce_all_deep.predict(X_test)
for i in range(Y_test.shape[0]):
    for j in range(Y_test.shape[1]):
        if predictions_binary[i, j] > 0.5:
            predictions_binary[i, j] = 1
        else:
            predictions_binary[i, j] = 0

# Get the hamming loss
metrics.hamming_loss(Y_test, predictions_binary)

0.5625

In [41]:
## Learn a Threshold Function
Y_train_pred = model_ce_all_deep.predict(X_train)
Y_test_pred = model_ce_all_deep.predict(X_test)
t_range = (0, 1)

test_labels_binary, threshold_function = predict_test_labels_binary(Y_train_pred, Y_train, Y_test_pred, t_range)
metrics.hamming_loss(Y_test, test_labels_binary)

0.20192307692307693

## Generate Hamming Loss Combinations of Different Hyperparameters Across All Models (Using Learned Threshold Function)