In [None]:
########################################################################################################################
# Filename: BPMLL_paragraph_classification.ipynb
#
# Purpose: Multi-label Text-categorization, using neural networks, for paragraph-level
#          data as part of STAT 6500 final project.

# Author(s): Bobby (Robert) Lumpkin, Yue Li
#
# Library Dependencies: numpy, pandas, tensorflow, bpmll, random, json, sys, threshold_learning
########################################################################################################################

# Text Categorization With Neural Networks 

In [6]:
import numpy as np
import pandas as pd
import os
import random
import json
import tensorflow as tf
from bpmll import bp_mll_loss
from sklearn.model_selection import train_test_split
from sklearn import metrics
import sys
sys.path.append('../ThresholdFunctionLearning')    ## Append path to the ThresholdFunctionLearning directory to the interpreters
                                                   ## search path
from threshold_learning import predict_test_labels_binary    ## Import the 'predict_test_labels_binary()' function from the 
                                                             ## threshold_learning library

## Load & Pre-Process the Data

In [2]:
## Load 'content_paragraphs_ready.csv' into a pandas dataframe
data_filepath = "..\..\dataset\content_paragraphs_ready.csv"
paragraph_data = pd.read_csv(data_filepath)
paragraph_data.head()
#os.getcwd()

Unnamed: 0,para_id,full_text,threats/impacts,responses/actions,severity,susceptibility,self-efficacy,external-efficacy,response efficacy,public health,...,prosper,preview,moor,coverag,glow,profil,clash,incumb,frequent,unfound
0,214236,MURPHY: Again Martha we are defacto staying at...,1,1,0,1,1,0,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,214232,"GOV. PHIL MURPHY, (D-NJ): Yes. Good to be back...",1,1,1,1,1,0,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,214266,"BEAUMONT (ON SCREEN UPPER LEFT - ""FRIDAY MARCH...",0,1,0,0,0,1,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,214246,"But in the meantime, my message to Louisiana i...",1,1,1,0,1,0,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,214238,"MURPHY: Yeah listen, we had gotten another shi...",0,1,0,0,0,1,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
## Keep only 'doc_id', the label columns, and the tf-idf columns
tfidf_colnames = list(paragraph_data.columns[25:])
label_columns = list(paragraph_data.columns[2:15])
cols_toKeep = ['doc_id']
cols_toKeep.extend(tfidf_colnames)
cols_toKeep.extend(label_columns)
paragraph_data = paragraph_data[cols_toKeep]
paragraph_data.head()

Unnamed: 0,doc_id,murphi,martha,defacto,stay,home,state,million,us,you�,...,susceptibility,self-efficacy,external-efficacy,response efficacy,public health,economy,education,political evaluation,racial conflict,international ralations/foreign policies
0,text1,1.684247,1.348455,2.161368,3.118616,3.016311,0.91833,1.207125,1.383217,1.763428,...,1,1,0,1,1,0,0,0,0,0
1,text2,1.684247,1.348455,0.0,0.0,0.0,3.67332,0.0,0.0,1.763428,...,1,1,0,1,1,0,0,0,0,0
2,text3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,1,0,0,0,0,0
3,text4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1,0,1,1,0,0,0,0,0
4,text5,1.684247,1.348455,0.0,0.0,0.0,0.0,0.0,1.383217,0.0,...,0,0,1,0,1,0,0,0,0,0


In [7]:
## Define the X and Y train and test matrices
X = paragraph_data[tfidf_colnames].to_numpy().astype(float)
Y = paragraph_data[label_columns].to_numpy().astype(float)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.33, random_state = 321)

In [8]:
## Export train and test data to json file
tfidf_trainTest_data = {'X_train' : X_train.tolist(), 
                 'X_test' : X_test.tolist(),
                 'Y_train' : Y_train.tolist(), 
                 'Y_test' : Y_test.tolist()}

tfidf_trainTest_json = json.dumps(tfidf_trainTest_data)
with open("tfidf_trainTest_data.json", "w") as outfile: 
    json.dump(tfidf_trainTest_data, outfile)

In [9]:
## These are the rows that don't have any associated labels
Y_gz = Y > 0
no_labels_id = np.where(~Y_gz.any(axis=1))[0]
no_labels_id

array([ 38,  58,  61,  67,  83, 132, 139, 142, 143, 150, 151, 166, 167,
       180, 214, 221, 240, 279, 282, 283, 284, 285, 286, 287, 288, 289],
      dtype=int64)

## Define and Train a BP-MLL Network (Training Instances That Have Atleast One Label)

In [11]:
## Since some instances don't have any labels, remove them from the dataset for use with BP-MLL
atleast_one_label_ids = paragraph_data.index
atleast_one_label_ids = atleast_one_label_ids.difference(no_labels_id)
X_hasLabel = X[atleast_one_label_ids, ]
Y_hasLabel = Y[atleast_one_label_ids, ]
X_train_hasLabel, X_test_hasLabel, Y_train_hasLabel, Y_test_hasLabel = train_test_split(X_hasLabel, Y_hasLabel, 
                                                                                        test_size = 0.33, random_state = 321)

In [7]:
Y_test_hasLabel

array([[1., 0., 1., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 1., 0., 0.],
       ...,
       [1., 1., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 1., ..., 0., 0., 0.]])

In [12]:
## Export reduced train and test data to json file
tfidf_trainTest_data_reduced = {'X_train_hasLabel' : X_train_hasLabel.tolist(), 
                 'X_test_hasLabel' : X_test_hasLabel.tolist(),
                 'Y_train_hasLabel' : Y_train_hasLabel.tolist(), 
                 'Y_test_hasLabel' : Y_test_hasLabel.tolist()}

tfidf_trainTest_data_reduced_json = json.dumps(tfidf_trainTest_data_reduced)
with open("tfidf_trainTest_data_reduced.json", "w") as outfile: 
    json.dump(tfidf_trainTest_data_reduced, outfile)

In [13]:
## Compute the proportion of labels that are equal to one in the test_hasLabel set
## This tells us what the hamming loss would be, if we constantly predicted 0
## We can compare our observed hamming loss against this, since we have sparse labels
prop_one_bpmll = np.sum(Y_test_hasLabel == 1) / (Y_test_hasLabel.shape[0] * Y_test_hasLabel.shape[1])
prop_one_bpmll

0.25524475524475526

In [14]:
## Start by defining and compiling the bp-mll loss network 
num_labels = len(label_columns)

model_bpmll = tf.keras.models.Sequential([
    tf.keras.layers.Dense(32, activation = 'relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(num_labels, activation = 'sigmoid')
])

optim_func_bpmll = tf.keras.optimizers.Adagrad(
    learning_rate = 0.001, initial_accumulator_value = 0.1, epsilon = 1e-07,
    name = 'Adagrad')

model_bpmll.compile(optimizer = optim_func_bpmll,
              loss = bp_mll_loss,
              )

In [15]:
tf.random.set_seed(123)
model_bpmll.fit(X_train_hasLabel, Y_train_hasLabel, epochs = 500)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x18d8866f400>

### Compare Performance of BP-MLL Loss Using Different Threshold Function Approaches 

In [16]:
## Using a constant 0.5 threshold function, get the hamming loss for the trained network on the test set
predictions = model_bpmll.predict(X_test_hasLabel)
predictions_binary = model_bpmll.predict(X_test_hasLabel)
for i in range(Y_test_hasLabel.shape[0]):
    for j in range(Y_test_hasLabel.shape[1]):
        if predictions_binary[i, j] > 0.5:
            predictions_binary[i, j] = 1
        else:
            predictions_binary[i, j] = 0

# Get the hamming loss
metrics.hamming_loss(Y_test_hasLabel, predictions_binary)

0.4117132867132867

In [17]:
## Learn a Threshold Function
Y_train_pred = model_bpmll.predict(X_train_hasLabel)
Y_test_pred = model_bpmll.predict(X_test_hasLabel)
t_range = (0, 1)

test_labels_binary, threshold_function = predict_test_labels_binary(Y_train_pred, Y_train_hasLabel, Y_test_pred, t_range)
metrics.hamming_loss(Y_test_hasLabel, test_labels_binary)

0.2491258741258741

### Deeper Architectures

In [18]:
## Add an additional hidden layer
model_bpmll_deep = tf.keras.models.Sequential([
    tf.keras.layers.Dense(32, activation = 'relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(16, activation = 'relu'),
    tf.keras.layers.Dense(num_labels, activation = 'sigmoid')
])

optim_func_bpmll = tf.keras.optimizers.Adagrad(
    learning_rate = 0.001, initial_accumulator_value = 0.1, epsilon = 1e-07,
    name = 'Adagrad')

model_bpmll_deep.compile(optimizer = optim_func_bpmll,
              loss = bp_mll_loss,
              )

model_bpmll_deep.fit(X_train_hasLabel, Y_train_hasLabel, epochs = 500)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x18d89dd0b50>

In [19]:
## Using a constant 0.5 threshold function, get the hamming loss for the trained network on the test set
predictions = model_bpmll_deep.predict(X_test_hasLabel)
predictions_binary = model_bpmll_deep.predict(X_test_hasLabel)
for i in range(Y_test_hasLabel.shape[0]):
    for j in range(Y_test_hasLabel.shape[1]):
        if predictions_binary[i, j] > 0.5:
            predictions_binary[i, j] = 1
        else:
            predictions_binary[i, j] = 0

# Get the hamming loss
metrics.hamming_loss(Y_test_hasLabel, predictions_binary)

0.40996503496503495

In [20]:
## Learn a Threshold Function
Y_train_pred = model_bpmll_deep.predict(X_train_hasLabel)
Y_test_pred = model_bpmll_deep.predict(X_test_hasLabel)
t_range = (0, 1)

test_labels_binary, threshold_function = predict_test_labels_binary(Y_train_pred, Y_train_hasLabel, Y_test_pred, t_range)
metrics.hamming_loss(Y_test_hasLabel, test_labels_binary)

0.2867132867132867

## Define and Train a Cross-Entropy Loss Network (Training Instances That Have Atleast One Label)

In [21]:
## Start with standard cross-entropy loss (bpmll used later)

model_ce_hasLabel = tf.keras.models.Sequential([
    tf.keras.layers.Dense(32, activation = 'relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(num_labels, activation = 'sigmoid')
])

optim_func_ce = tf.keras.optimizers.Adagrad(
    learning_rate = 0.001, initial_accumulator_value = 0.1, epsilon = 1e-07,
    name = 'Adagrad')

model_ce_hasLabel.compile(optimizer = optim_func_ce,
              loss = 'categorical_crossentropy',
              )

In [22]:
model_ce_hasLabel.fit(X_train_hasLabel, Y_train_hasLabel, epochs = 500)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x18d8b932e80>

### Compare Performance of CE Loss Using Different Threshold Function Approaches

In [23]:
## Using a constant 0.5 threshold function, get the hamming loss for the trained network on the test set
predictions = model_ce_hasLabel.predict(X_test_hasLabel)
predictions_binary = model_ce_hasLabel.predict(X_test_hasLabel)
for i in range(Y_test_hasLabel.shape[0]):
    for j in range(Y_test_hasLabel.shape[1]):
        if predictions_binary[i, j] > 0.5:
            predictions_binary[i, j] = 1
        else:
            predictions_binary[i, j] = 0

# Get the hamming loss
metrics.hamming_loss(Y_test_hasLabel, predictions_binary)

0.3924825174825175

In [24]:
## Learn a Threshold Function
Y_train_pred = model_ce_hasLabel.predict(X_train_hasLabel)
Y_test_pred = model_ce_hasLabel.predict(X_test_hasLabel)
t_range = (0, 1)

test_labels_binary, threshold_function = predict_test_labels_binary(Y_train_pred, Y_train_hasLabel, Y_test_pred, t_range)
metrics.hamming_loss(Y_test_hasLabel, test_labels_binary)

0.21241258741258742

### Deeper Architectures

In [25]:
## Add an additional hidden layer
model_ce_deep = tf.keras.models.Sequential([
    tf.keras.layers.Dense(32, activation = 'relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(16, activation = 'relu'),
    tf.keras.layers.Dense(num_labels, activation = 'sigmoid')
])

optim_func_ce = tf.keras.optimizers.Adagrad(
    learning_rate = 0.001, initial_accumulator_value = 0.1, epsilon = 1e-07,
    name = 'Adagrad')

model_ce_deep.compile(optimizer = optim_func_ce,
              loss = 'categorical_crossentropy',
              )

model_ce_deep.fit(X_train_hasLabel, Y_train_hasLabel, epochs = 500)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x18d88563c10>

In [26]:
## Using a constant 0.5 threshold function, get the hamming loss for the trained network on the test set
predictions = model_ce_hasLabel.predict(X_test_hasLabel)
predictions_binary = model_ce_hasLabel.predict(X_test_hasLabel)
for i in range(Y_test_hasLabel.shape[0]):
    for j in range(Y_test_hasLabel.shape[1]):
        if predictions_binary[i, j] > 0.5:
            predictions_binary[i, j] = 1
        else:
            predictions_binary[i, j] = 0

# Get the hamming loss
metrics.hamming_loss(Y_test_hasLabel, predictions_binary)

0.3924825174825175

In [27]:
## Learn a Threshold Function
Y_train_pred = model_ce_deep.predict(X_train_hasLabel)
Y_test_pred = model_ce_deep.predict(X_test_hasLabel)
t_range = (0, 1)

test_labels_binary, threshold_function = predict_test_labels_binary(Y_train_pred, Y_train_hasLabel, Y_test_pred, t_range)
metrics.hamming_loss(Y_test_hasLabel, test_labels_binary)

0.2666083916083916

## Define and Train a Cross-Entropy Loss Network (All Training Instances)

In [28]:
## Compute the proportion of labels that are equal to one in the test set
## This tells us what the hamming loss would be, if we constantly predicted 0
## We can compare our observed hamming loss against this, since we have sparse labels
prop_one = np.sum(Y_test == 1) / (Y_test.shape[0] * Y_test.shape[1])
prop_one

0.23798076923076922

In [29]:
## Start with standard cross-entropy loss (bpmll used later)
random.seed(123)
model_ce = tf.keras.models.Sequential([
    tf.keras.layers.Dense(32, activation = 'relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(num_labels, activation = 'sigmoid')
])

optim_func_ce = tf.keras.optimizers.Adagrad(
    learning_rate = 0.0001, initial_accumulator_value = 0.1, epsilon = 1e-07,
    name = 'Adagrad')

model_ce.compile(optimizer = optim_func_ce,
              loss = 'categorical_crossentropy',
              )

In [35]:
tf.random.set_seed(321)
model_ce.fit(X_train, Y_train, epochs = 500)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

<tensorflow.python.keras.callbacks.History at 0x2110bbe88e0>

### Compare Performance of CE Loss Using Different Threshold Function Approaches

In [36]:
## Using a constant 0.5 threshold function, get the hamming loss for the trained network on the test set
predictions = model_ce.predict(X_test)
predictions_binary = model_ce.predict(X_test)
for i in range(Y_test.shape[0]):
    for j in range(Y_test.shape[1]):
        if predictions_binary[i, j] > 0.5:
            predictions_binary[i, j] = 1
        else:
            predictions_binary[i, j] = 0

# Get the hamming loss
metrics.hamming_loss(Y_test, predictions_binary)

0.4110576923076923

In [37]:
## Learn a Threshold Function
Y_train_pred = model_ce.predict(X_train)
Y_test_pred = model_ce.predict(X_test)
t_range = (0, 1)

test_labels_binary, threshold_function = predict_test_labels_binary(Y_train_pred, Y_train, Y_test_pred, t_range)
metrics.hamming_loss(Y_test, test_labels_binary)

0.265224358974359

In [24]:
## Compare predicted test labels for instance against true test labels in next cell
test_labels_binary[4,]

array([0., 1., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0.])

In [40]:
## Compare true test labels for instance against predicted test labels in previous cell
Y_test[4,]

array([1., 1., 0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0.])

### Deeper Architectures

In [38]:
## Standard cross-entropy loss 
random.seed(123)
model_ce_all_deep = tf.keras.models.Sequential([
    tf.keras.layers.Dense(64, activation = 'relu'),
    #tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(32, activation = 'relu'),
    #tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Dense(32, activation = 'relu'),
    #tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Dense(16, activation = 'relu'),
    tf.keras.layers.Dense(num_labels, activation = 'sigmoid')
])

optim_func_ce = tf.keras.optimizers.Adagrad(
    learning_rate = 0.0001, initial_accumulator_value = 0.1, epsilon = 1e-07,
    name = 'Adagrad')

model_ce_all_deep.compile(optimizer = optim_func_ce,
              loss = 'categorical_crossentropy',
              )

In [39]:
tf.random.set_seed(321)
model_ce_all_deep.fit(X_train, Y_train, epochs = 1000)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

<tensorflow.python.keras.callbacks.History at 0x211089c1790>

In [40]:
## Using a constant 0.5 threshold function, get the hamming loss for the trained network on the test set
predictions = model_ce_all_deep.predict(X_test)
predictions_binary = model_ce_all_deep.predict(X_test)
for i in range(Y_test.shape[0]):
    for j in range(Y_test.shape[1]):
        if predictions_binary[i, j] > 0.5:
            predictions_binary[i, j] = 1
        else:
            predictions_binary[i, j] = 0

# Get the hamming loss
metrics.hamming_loss(Y_test, predictions_binary)

0.5625

In [41]:
## Learn a Threshold Function
Y_train_pred = model_ce_all_deep.predict(X_train)
Y_test_pred = model_ce_all_deep.predict(X_test)
t_range = (0, 1)

test_labels_binary, threshold_function = predict_test_labels_binary(Y_train_pred, Y_train, Y_test_pred, t_range)
metrics.hamming_loss(Y_test, test_labels_binary)

0.20192307692307693

## Generate Hamming Loss for Combinations of Different Hyperparameters Across All Models (Using Learned Threshold Function)