In [None]:
########################################################################################################################
# Filename: RNN_Paragraph_Classification.ipynb
#
# Purpose: Multi-label Text-categorization, using recurrent neural networks, for paragraph-level
#          data as part of STAT 6500 final project.

# Author(s): Bobby (Robert) Lumpkin
#
# Library Dependencies: numpy, pandas, tensorflow, bpmll
########################################################################################################################

# Paragraph Classification Using RNNs

In [14]:
import numpy as np
import json
import pandas as pd
import os
import random
import tensorflow as tf
import tensorflow_addons as tfa
from bpmll import bp_mll_loss
import sklearn_json as skljson
from sklearn.model_selection import train_test_split
from sklearn import metrics
import sys
sys.path.append('../ThresholdFunctionLearning')    ## Append path to the ThresholdFunctionLearning directory to the interpreters
                                                   ## search path
from threshold_learning import predict_test_labels_binary    ## Import the 'predict_test_labels_binary()' function from the 
                                                             ## threshold_learning library

In [15]:
## Load 'content_paragraphs_ready.csv' into a pandas dataframe
data_filepath = "..\..\dataset\content_paragraphs_ready.csv"
paragraph_data = pd.read_csv(data_filepath)
paragraph_data.head()

Unnamed: 0,para_id,full_text,threats/impacts,responses/actions,severity,susceptibility,self-efficacy,external-efficacy,response efficacy,public health,...,prosper,preview,moor,coverag,glow,profil,clash,incumb,frequent,unfound
0,214236,MURPHY: Again Martha we are defacto staying at...,1,1,0,1,1,0,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,214232,"GOV. PHIL MURPHY, (D-NJ): Yes. Good to be back...",1,1,1,1,1,0,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,214266,"BEAUMONT (ON SCREEN UPPER LEFT - ""FRIDAY MARCH...",0,1,0,0,0,1,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,214246,"But in the meantime, my message to Louisiana i...",1,1,1,0,1,0,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,214238,"MURPHY: Yeah listen, we had gotten another shi...",0,1,0,0,0,1,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
## Keep only the paragraph id, paragraph text, and labels
to_keep = paragraph_data.columns[0:15]
#to_keep
paragraph_data = paragraph_data[to_keep]
paragraph_data.head()

Unnamed: 0,para_id,full_text,threats/impacts,responses/actions,severity,susceptibility,self-efficacy,external-efficacy,response efficacy,public health,economy,education,political evaluation,racial conflict,international ralations/foreign policies
0,214236,MURPHY: Again Martha we are defacto staying at...,1,1,0,1,1,0,1,1,0,0,0,0,0
1,214232,"GOV. PHIL MURPHY, (D-NJ): Yes. Good to be back...",1,1,1,1,1,0,1,1,0,0,0,0,0
2,214266,"BEAUMONT (ON SCREEN UPPER LEFT - ""FRIDAY MARCH...",0,1,0,0,0,1,0,1,0,0,0,0,0
3,214246,"But in the meantime, my message to Louisiana i...",1,1,1,0,1,0,1,1,0,0,0,0,0
4,214238,"MURPHY: Yeah listen, we had gotten another shi...",0,1,0,0,0,1,0,1,0,0,0,0,0


In [17]:
# Preprocessing
import re
import string

def remove_URL(text):
    url = re.compile(r"https?://\S+|www\.\S+")
    return url.sub(r"", text)

# https://stackoverflow.com/questions/34293875/how-to-remove-punctuation-marks-from-a-string-in-python-3-x-using-translate/34294022
def remove_punct(text):
    translator = str.maketrans("", "", string.punctuation)
    return text.translate(translator)

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [18]:
pattern = re.compile(r"https?://(\S+|www)\.\S+")
for t in paragraph_data.full_text:
    matches = pattern.findall(t)
    for match in matches:
        print(t)
        print(match)
        print(pattern.sub(r"", t))
    if len(matches) > 0:
        break

In [19]:
paragraph_data["full_text"] = paragraph_data.full_text.map(remove_URL) # map(lambda x: remove_URL(x))
paragraph_data["full_text"] = paragraph_data.full_text.map(remove_punct)

In [20]:
# remove stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Stop Words: A stop word is a commonly used word (such as “the”, “a”, “an”, “in”) that a search engine
# has been programmed to ignore, both when indexing entries for searching and when retrieving them 
# as the result of a search query.
stop = set(stopwords.words("english"))

# https://stackoverflow.com/questions/5486337/how-to-remove-stop-words-using-nltk-or-python
def remove_stopwords(text):
    filtered_words = [word.lower() for word in text.split() if word.lower() not in stop]
    return " ".join(filtered_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rober\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
paragraph_data["full_text"] = paragraph_data.full_text.map(remove_stopwords)

In [22]:
from collections import Counter

# Count unique words
def counter_word(text_col):
    count = Counter()
    for text in text_col.values:
        for word in text.split():
            count[word] += 1
    return count

counter = counter_word(paragraph_data.full_text)

In [23]:
num_unique_words = len(counter)
counter.most_common(5)

[('said', 69), ('people', 61), ('trump', 49), ('new', 48), ('tests', 47)]

In [24]:
num_unique_words

2711

In [25]:
## Define the X and Y train and test matrices
covariate_cols = ['full_text']
label_cols = paragraph_data.columns.difference(['para_id'] + covariate_cols)

X = paragraph_data.full_text.to_numpy()
Y = paragraph_data[label_cols].to_numpy().astype(float)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.33, random_state = 321)

In [12]:
X_train.shape

(194,)

In [26]:
# Tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
random.seed(123)

# vectorize a text corpus by turning each text into a sequence of integers
tokenizer = Tokenizer(num_words = num_unique_words)
tokenizer.fit_on_texts(X_train) # fit only to training
word_index = tokenizer.word_index

In [27]:
train_sequences = tokenizer.texts_to_sequences(X_train)
test_sequences = tokenizer.texts_to_sequences(X_test)

In [28]:
print(X_train[0])
print(train_sequences[0])

johnson asked 70 local savannah religious leaders keep worship centers closed none leaders said would reopen johnson told religious leaders understood financial burden religious institutions closed said reach god without going building”
[478, 72, 829, 830, 831, 316, 178, 114, 832, 228, 229, 833, 178, 1, 23, 230, 478, 57, 316, 178, 834, 144, 835, 316, 836, 229, 1, 479, 837, 179, 3, 838]


In [29]:
max_par_length = 0
for par in train_sequences:
    if len(par) > max_par_length:
        max_par_length = len(par)
        
max_par_length

86

In [30]:
# Pad the sequences to have the same length
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Max number of words in a sequence
max_length = 100

train_padded = pad_sequences(train_sequences, maxlen = max_length, padding = "post", truncating = "post")
test_padded = pad_sequences(test_sequences, maxlen = max_length, padding = "post", truncating = "post")
train_padded.shape, test_padded.shape

((194, 100), (96, 100))

In [254]:
## Export train and test data to json file
RNN_data_dict = {'train_padded' : train_padded.tolist(), 
                 'test_padded' : test_padded.tolist(),
                 'Y_train' : Y_train.tolist(), 
                 'Y_test' : Y_test.tolist()}

RNN_data_dict_json = json.dumps(RNN_data_dict)
with open("RNN_data_dict.json", "w") as outfile: 
    json.dump(RNN_data_dict_json, outfile)

In [31]:
# Check reversing the indices
# flip (key, value)
reverse_word_index = dict([(idx, word) for (word, idx) in word_index.items()])

def decode(sequence):
    return " ".join([reverse_word_index.get(idx, "?") for idx in sequence])

In [32]:
decoded_text = decode(train_sequences[0])

print(train_sequences[0])
print(decoded_text)

[478, 72, 829, 830, 831, 316, 178, 114, 832, 228, 229, 833, 178, 1, 23, 230, 478, 57, 316, 178, 834, 144, 835, 316, 836, 229, 1, 479, 837, 179, 3, 838]
johnson asked 70 local savannah religious leaders keep worship centers closed none leaders said would reopen johnson told religious leaders understood financial burden religious institutions closed said reach god without going building”


## Train a Simple RNN

In [246]:
## Define the simple RNN architecture
num_labels = len(label_cols)

model_simpleRNN = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(num_unique_words, 32, input_length = max_length),
    tf.keras.layers.SimpleRNN(16, return_sequences = False, return_state = False),
    #tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(num_labels, activation = 'sigmoid')
])

optim = tf.keras.optimizers.Adam(lr=0.0001)

metric = tfa.metrics.HammingLoss(mode = 'multilabel', threshold = 0.5)
model_simpleRNN.compile(loss = 'categorical_crossentropy', optimizer = optim, metrics = metric)

In [247]:
tf.random.set_seed(123)
model_simpleRNN.fit(train_padded, Y_train, epochs = 100, validation_data = (test_padded, Y_test), verbose=2)

Epoch 1/100
7/7 - 2s - loss: 7.6402 - hamming_loss: 0.4508 - val_loss: 7.8881 - val_hamming_loss: 0.4423
Epoch 2/100
7/7 - 0s - loss: 7.6057 - hamming_loss: 0.4374 - val_loss: 7.8699 - val_hamming_loss: 0.4503
Epoch 3/100
7/7 - 0s - loss: 7.5747 - hamming_loss: 0.4302 - val_loss: 7.8509 - val_hamming_loss: 0.4551
Epoch 4/100
7/7 - 0s - loss: 7.5452 - hamming_loss: 0.4266 - val_loss: 7.8327 - val_hamming_loss: 0.4503
Epoch 5/100
7/7 - 0s - loss: 7.5174 - hamming_loss: 0.4207 - val_loss: 7.8150 - val_hamming_loss: 0.4407
Epoch 6/100
7/7 - 0s - loss: 7.4897 - hamming_loss: 0.4092 - val_loss: 7.7966 - val_hamming_loss: 0.4327
Epoch 7/100
7/7 - 0s - loss: 7.4609 - hamming_loss: 0.4033 - val_loss: 7.7773 - val_hamming_loss: 0.4359
Epoch 8/100
7/7 - 0s - loss: 7.4315 - hamming_loss: 0.3973 - val_loss: 7.7573 - val_hamming_loss: 0.4303
Epoch 9/100
7/7 - 0s - loss: 7.4005 - hamming_loss: 0.3894 - val_loss: 7.7378 - val_hamming_loss: 0.4239
Epoch 10/100
7/7 - 0s - loss: 7.3696 - hamming_loss: 0.

<tensorflow.python.keras.callbacks.History at 0x20e1f626730>

In [248]:
## Learn a Threshold Function
Y_train_pred = model_simpleRNN.predict(train_padded)
Y_test_pred = model_simpleRNN.predict(test_padded)
t_range = (0, 1)

test_labels_binary, threshold_function = predict_test_labels_binary(Y_train_pred, Y_train, Y_test_pred, t_range)
metrics.hamming_loss(Y_test, test_labels_binary)

0.23557692307692307

In [24]:
Y_test[0,]

array([0., 0., 0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0.])

In [25]:
test_labels_binary[0,]

array([0., 0., 0., 0., 1., 0., 0., 1., 1., 0., 0., 0., 1.])

## Train an LSTM model

In [40]:
## Define the LSTM RNN architecture
num_labels = len(label_cols)

model_LSTM = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(num_unique_words, 32, input_length = max_length),
    tf.keras.layers.LSTM(16, return_sequences = False, return_state = False),
    #tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(num_labels, activation = 'sigmoid')
])

optim = tf.keras.optimizers.Adam(lr=0.001)
#optim_func_LSTM = tf.keras.optimizers.Adagrad(
#    learning_rate = 0.001, initial_accumulator_value = 0.1, epsilon = 1e-07,
#    name = 'Adagrad')

metric = tfa.metrics.HammingLoss(mode = 'multilabel', threshold = 0.5)
model_LSTM.compile(loss = 'categorical_crossentropy', optimizer = optim, metrics = metric)

In [41]:
tf.random.set_seed(123)
model_LSTM.fit(train_padded, Y_train, epochs = 100, validation_data = (test_padded, Y_test), verbose=2)

Epoch 1/100
7/7 - 2s - loss: 7.6843 - hamming_loss: 0.5381 - val_loss: 7.9091 - val_hamming_loss: 0.4639
Epoch 2/100
7/7 - 0s - loss: 7.6235 - hamming_loss: 0.5052 - val_loss: 7.8395 - val_hamming_loss: 0.5393
Epoch 3/100
7/7 - 0s - loss: 7.5496 - hamming_loss: 0.5551 - val_loss: 7.7410 - val_hamming_loss: 0.5393
Epoch 4/100
7/7 - 0s - loss: 7.4440 - hamming_loss: 0.5551 - val_loss: 7.5849 - val_hamming_loss: 0.5393
Epoch 5/100
7/7 - 0s - loss: 7.2851 - hamming_loss: 0.5551 - val_loss: 7.3702 - val_hamming_loss: 0.5777
Epoch 6/100
7/7 - 0s - loss: 7.0946 - hamming_loss: 0.5440 - val_loss: 7.1429 - val_hamming_loss: 0.5024
Epoch 7/100
7/7 - 0s - loss: 6.9162 - hamming_loss: 0.5186 - val_loss: 6.9666 - val_hamming_loss: 0.5024
Epoch 8/100
7/7 - 0s - loss: 6.7968 - hamming_loss: 0.5186 - val_loss: 6.8954 - val_hamming_loss: 0.5024
Epoch 9/100
7/7 - 0s - loss: 6.7634 - hamming_loss: 0.5186 - val_loss: 6.8984 - val_hamming_loss: 0.5024
Epoch 10/100
7/7 - 0s - loss: 6.7741 - hamming_loss: 0.

<tensorflow.python.keras.callbacks.History at 0x27102ea6df0>

In [42]:
## Learn a Threshold Function
Y_train_pred = model_LSTM.predict(train_padded)
Y_test_pred = model_LSTM.predict(test_padded)
t_range = (0, 1)

test_labels_binary, threshold_function = predict_test_labels_binary(Y_train_pred, Y_train, Y_test_pred, t_range)
metrics.hamming_loss(Y_test, test_labels_binary)

0.23798076923076922

In [30]:
Y_test[0,]

array([0., 0., 0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0.])

In [31]:
test_labels_binary[0,]

array([0., 0., 0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0.])

In [38]:
## save the model weights 
model_LSTM_filepath = 'Models/Cross_Entropy/LSTM_weights'
model_LSTM.save_weights(model_LSTM_filepath)

In [36]:
## save the learned threshold function
threshold_filepath = 'Models/Cross_Entropy/threshold_LSTM.json'
skljson.to_json(threshold_function, threshold_filepath)

## Train a Bidirectional LSTM

In [233]:
## Define the LSTM RNN architecture
num_labels = len(label_cols)

model_biLSTM = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(num_unique_words, 32, input_length = max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16, return_sequences = False, return_state = False)),
    #tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(num_labels, activation = 'sigmoid')
])

#optim = tf.keras.optimizers.Adam(lr=0.0001)
optim = tf.keras.optimizers.Adagrad(
    learning_rate = 0.001, initial_accumulator_value = 0.1, epsilon = 1e-07,
    name = 'Adagrad')

#optim = tf.keras.optimizers.RMSprop(learning_rate=0.001, rho=0.9, momentum = 0.8, epsilon=1e-07,)

metric = tfa.metrics.HammingLoss(mode = 'multilabel', threshold = 0.5)
model_biLSTM.compile(loss = 'categorical_crossentropy', optimizer = optim, metrics = metric)

In [234]:
tf.random.set_seed(123)
model_biLSTM.fit(train_padded, Y_train, epochs = 100, validation_data = (test_padded, Y_test), verbose=2)

Epoch 1/100
7/7 - 5s - loss: 7.6579 - hamming_loss: 0.5492 - val_loss: 7.9161 - val_hamming_loss: 0.5481
Epoch 2/100
7/7 - 0s - loss: 7.6446 - hamming_loss: 0.5420 - val_loss: 7.9055 - val_hamming_loss: 0.5553
Epoch 3/100
7/7 - 0s - loss: 7.6339 - hamming_loss: 0.5424 - val_loss: 7.8966 - val_hamming_loss: 0.5521
Epoch 4/100
7/7 - 0s - loss: 7.6247 - hamming_loss: 0.5385 - val_loss: 7.8892 - val_hamming_loss: 0.5497
Epoch 5/100
7/7 - 0s - loss: 7.6172 - hamming_loss: 0.5385 - val_loss: 7.8823 - val_hamming_loss: 0.5441
Epoch 6/100
7/7 - 0s - loss: 7.6100 - hamming_loss: 0.5317 - val_loss: 7.8756 - val_hamming_loss: 0.5361
Epoch 7/100
7/7 - 0s - loss: 7.6031 - hamming_loss: 0.5254 - val_loss: 7.8687 - val_hamming_loss: 0.5312
Epoch 8/100
7/7 - 0s - loss: 7.5961 - hamming_loss: 0.5202 - val_loss: 7.8634 - val_hamming_loss: 0.5264
Epoch 9/100
7/7 - 0s - loss: 7.5904 - hamming_loss: 0.5163 - val_loss: 7.8577 - val_hamming_loss: 0.5248
Epoch 10/100
7/7 - 0s - loss: 7.5844 - hamming_loss: 0.

<tensorflow.python.keras.callbacks.History at 0x20da7de4fa0>

In [47]:
## Using a constant 0.5 threshold function, get the hamming loss for the trained network on the test set
predictions = model_biLSTM.predict(test_padded)
predictions_binary = model_biLSTM.predict(test_padded)
for i in range(Y_test.shape[0]):
    for j in range(Y_test.shape[1]):
        if predictions_binary[i, j] > 0.5:
            predictions_binary[i, j] = 1
        else:
            predictions_binary[i, j] = 0

# Get the hamming loss
metrics.hamming_loss(Y_test, predictions_binary)

0.41746794871794873

In [48]:
## Learn a Threshold Function
Y_train_pred = model_biLSTM.predict(train_padded)
Y_test_pred = model_biLSTM.predict(test_padded)
t_range = (0, 1)

test_labels_binary, threshold_function = predict_test_labels_binary(Y_train_pred, Y_train, Y_test_pred, t_range)
metrics.hamming_loss(Y_test, test_labels_binary)

0.2155448717948718

In [37]:
Y_test[0,]

array([0., 0., 0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0.])

In [38]:
test_labels_binary[0,]

array([0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.])

## Train With BPMLL Loss

In [42]:
## Reduce the data to cases with at least one label
## These are the rows that don't have any associated labels
Y_gz = Y > 0
no_labels_id = np.where(~Y_gz.any(axis=1))[0]
#no_labels_id

atleast_one_label_ids = paragraph_data.index
atleast_one_label_ids = atleast_one_label_ids.difference(no_labels_id)
X_hasLabel = X[atleast_one_label_ids, ]
Y_hasLabel = Y[atleast_one_label_ids, ]
X_train_hasLabel, X_test_hasLabel, Y_train_hasLabel, Y_test_hasLabel = train_test_split(X_hasLabel, Y_hasLabel, 
                                                                                        test_size = 0.33, random_state = 321)

In [62]:
counter_hasLabel = counter_word(paragraph_data.iloc[atleast_one_label_ids].full_text)
num_unique_words = len(counter)

## vectorize a text corpus by turning each text into a sequence of integers
tokenizer_hasLabel = Tokenizer(num_words = num_unique_words)
tokenizer_hasLabel.fit_on_texts(X_train_hasLabel) # fit only to training
word_index_hasLabel = tokenizer_hasLabel.word_index

train_sequences_hasLabel = tokenizer_hasLabel.texts_to_sequences(X_train_hasLabel)
test_sequences_hasLabel = tokenizer_hasLabel.texts_to_sequences(X_test_hasLabel)

In [68]:
max_par_length_hasLabel = 0
for par in train_sequences_hasLabel:
    if len(par) > max_par_length_hasLabel:
        max_par_length_hasLabel = len(par)
        
max_par_length_hasLabel

86

In [69]:
# Max number of words in a sequence
max_length_hasLabel = 100

train_padded_hasLabel = pad_sequences(train_sequences_hasLabel, maxlen = max_length_hasLabel, padding = "post", truncating = "post")
test_padded_hasLabel = pad_sequences(test_sequences_hasLabel, maxlen = max_length_hasLabel, padding = "post", truncating = "post")
train_padded_hasLabel.shape, test_padded_hasLabel.shape

((176, 100), (88, 100))

In [255]:
## Export reduced train and test data to json file
RNN_data_dict_reduced = {'train_padded_hasLabel' : train_padded_hasLabel.tolist(), 
                 'test_padded_hasLabel' : test_padded_hasLabel.tolist(),
                 'Y_train_hasLabel' : Y_train_hasLabel.tolist(), 
                 'Y_test_hasLabel' : Y_test_hasLabel.tolist()}

RNN_data_dict_reduced_json = json.dumps(RNN_data_dict_reduced)
with open("RNN_data_dict_reduced.json", "w") as outfile: 
    json.dump(RNN_data_dict_reduced_json, outfile)

In [70]:
# Check reversing the indices
# flip (key, value)
reverse_word_index_hasLabel = dict([(idx, word) for (word, idx) in word_index_hasLabel.items()])

def decode_hasLabel(sequence):
    return " ".join([reverse_word_index_hasLabel.get(idx, "?") for idx in sequence])

decoded_text_hasLabel = decode_hasLabel(train_sequences_hasLabel[0])

print(train_sequences_hasLabel[0])
print(decoded_text_hasLabel)

[293, 166, 83, 43, 770, 167, 294, 771, 772, 773, 33, 774, 31, 6, 1, 775, 295, 776, 452, 777, 107, 168, 453, 33, 50, 454, 108]
invoked defense production act compel general motors accept perform prioritize federal contracts ventilators trump said invocation dpa demonstrate clearly hesitate use full authority federal government combat crisis


### Simple RNN

In [236]:
## Define the simple RNN architecture
model_simpleRNN_bpmll = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(num_unique_words, 32, input_length = max_length),
    tf.keras.layers.SimpleRNN(16, return_sequences = False, return_state = False),
    #tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(num_labels, activation = 'sigmoid')
])

optim_bpmll = tf.keras.optimizers.Adam(lr=0.0001)

metric = tfa.metrics.HammingLoss(mode = 'multilabel', threshold = 0.5)
model_simpleRNN_bpmll.compile(loss = bp_mll_loss, optimizer = optim_bpmll, metrics = metric)

In [237]:
tf.random.set_seed(123)
model_simpleRNN_bpmll.fit(train_padded_hasLabel, Y_train_hasLabel, epochs = 100, 
                          validation_data = (test_padded_hasLabel, Y_test_hasLabel), verbose=2)

Epoch 1/100
6/6 - 2s - loss: 1.0189 - hamming_loss: 0.5520 - val_loss: 1.0122 - val_hamming_loss: 0.5472
Epoch 2/100
6/6 - 0s - loss: 1.0136 - hamming_loss: 0.5240 - val_loss: 1.0076 - val_hamming_loss: 0.5236
Epoch 3/100
6/6 - 0s - loss: 1.0072 - hamming_loss: 0.5022 - val_loss: 1.0032 - val_hamming_loss: 0.5052
Epoch 4/100
6/6 - 0s - loss: 1.0001 - hamming_loss: 0.4768 - val_loss: 0.9987 - val_hamming_loss: 0.4851
Epoch 5/100
6/6 - 0s - loss: 0.9924 - hamming_loss: 0.4327 - val_loss: 0.9936 - val_hamming_loss: 0.4545
Epoch 6/100
6/6 - 0s - loss: 0.9851 - hamming_loss: 0.4248 - val_loss: 0.9884 - val_hamming_loss: 0.4563
Epoch 7/100
6/6 - 0s - loss: 0.9794 - hamming_loss: 0.4222 - val_loss: 0.9844 - val_hamming_loss: 0.4353
Epoch 8/100
6/6 - 0s - loss: 0.9752 - hamming_loss: 0.3995 - val_loss: 0.9816 - val_hamming_loss: 0.4205
Epoch 9/100
6/6 - 0s - loss: 0.9713 - hamming_loss: 0.3737 - val_loss: 0.9794 - val_hamming_loss: 0.4047
Epoch 10/100
6/6 - 0s - loss: 0.9675 - hamming_loss: 0.

<tensorflow.python.keras.callbacks.History at 0x20e16500550>

In [75]:
## Learn a Threshold Function
Y_train_pred = model_simpleRNN_bpmll.predict(train_padded_hasLabel)
Y_test_pred = model_simpleRNN_bpmll.predict(test_padded_hasLabel)
t_range = (0, 1)

test_labels_binary, threshold_function = predict_test_labels_binary(Y_train_pred, Y_train_hasLabel, Y_test_pred, t_range)
metrics.hamming_loss(Y_test_hasLabel, test_labels_binary)

0.2902097902097902

### Train an LSTM

In [238]:
## Define the LSTM RNN architecture
model_LSTM_bpmll = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(num_unique_words, 32, input_length = max_length),
    tf.keras.layers.LSTM(16, return_sequences = False, return_state = False),
    #tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(num_labels, activation = 'sigmoid')
])

optim_bpmll = tf.keras.optimizers.Adam(lr=0.0001)
#optim_func_LSTM = tf.keras.optimizers.Adagrad(
#    learning_rate = 0.001, initial_accumulator_value = 0.1, epsilon = 1e-07,
#    name = 'Adagrad')

metric = tfa.metrics.HammingLoss(mode = 'multilabel', threshold = 0.5)
model_LSTM_bpmll.compile(loss = bp_mll_loss, optimizer = optim_bpmll, metrics = metric)

In [239]:
tf.random.set_seed(123)
model_LSTM_bpmll.fit(train_padded_hasLabel, Y_train_hasLabel, epochs = 100, 
                     validation_data = (test_padded_hasLabel, Y_test_hasLabel), verbose=2)

Epoch 1/100
6/6 - 4s - loss: 1.0040 - hamming_loss: 0.6106 - val_loss: 1.0029 - val_hamming_loss: 0.5997
Epoch 2/100
6/6 - 0s - loss: 1.0033 - hamming_loss: 0.6106 - val_loss: 1.0022 - val_hamming_loss: 0.5997
Epoch 3/100
6/6 - 0s - loss: 1.0026 - hamming_loss: 0.6106 - val_loss: 1.0015 - val_hamming_loss: 0.5997
Epoch 4/100
6/6 - 0s - loss: 1.0018 - hamming_loss: 0.5800 - val_loss: 1.0008 - val_hamming_loss: 0.4895
Epoch 5/100
6/6 - 0s - loss: 1.0011 - hamming_loss: 0.4996 - val_loss: 1.0002 - val_hamming_loss: 0.4895
Epoch 6/100
6/6 - 0s - loss: 1.0004 - hamming_loss: 0.4419 - val_loss: 0.9995 - val_hamming_loss: 0.3829
Epoch 7/100
6/6 - 0s - loss: 0.9997 - hamming_loss: 0.3894 - val_loss: 0.9988 - val_hamming_loss: 0.3829
Epoch 8/100
6/6 - 0s - loss: 0.9990 - hamming_loss: 0.3894 - val_loss: 0.9981 - val_hamming_loss: 0.3829
Epoch 9/100
6/6 - 0s - loss: 0.9983 - hamming_loss: 0.3894 - val_loss: 0.9974 - val_hamming_loss: 0.3829
Epoch 10/100
6/6 - 0s - loss: 0.9975 - hamming_loss: 0.

<tensorflow.python.keras.callbacks.History at 0x20e17233520>

In [79]:
## Learn a Threshold Function
Y_train_pred = model_LSTM_bpmll.predict(train_padded_hasLabel)
Y_test_pred = model_LSTM_bpmll.predict(test_padded_hasLabel)
t_range = (0, 1)

test_labels_binary, threshold_function = predict_test_labels_binary(Y_train_pred, Y_train_hasLabel, Y_test_pred, t_range)
metrics.hamming_loss(Y_test_hasLabel, test_labels_binary)

0.18006993006993008

### Train a Bidirectional LSTM

In [217]:
## Define the bidirectional LSTM RNN architecture
model_biLSTM_bpmll = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(num_unique_words, 32, input_length = max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16, return_sequences = False, return_state = False)),
    #tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(num_labels, activation = 'sigmoid')
])

optim_bpmll = tf.keras.optimizers.Adam(lr=0.001)
#optim_bpmll = tf.keras.optimizers.Adagrad(
#    learning_rate = 0.1, initial_accumulator_value = 0.1, epsilon = 1e-07,
#    name = 'Adagrad')

#optim = tf.keras.optimizers.RMSprop(learning_rate=0.001, rho=0.9, momentum = 0.8, epsilon=1e-07,)

metric = tfa.metrics.HammingLoss(mode = 'multilabel', threshold = 0.5)
model_biLSTM_bpmll.compile(loss = bp_mll_loss, optimizer = optim_bpmll, metrics = metric)

In [218]:
tf.random.set_seed(123)
model_biLSTM_bpmll.fit(train_padded_hasLabel, Y_train_hasLabel, epochs = 100, 
                       validation_data = (test_padded_hasLabel, Y_test_hasLabel), verbose=2)

Epoch 1/100
6/6 - 4s - loss: 0.9963 - hamming_loss: 0.4738 - val_loss: 0.9916 - val_hamming_loss: 0.3759
Epoch 2/100
6/6 - 0s - loss: 0.9869 - hamming_loss: 0.3431 - val_loss: 0.9820 - val_hamming_loss: 0.3549
Epoch 3/100
6/6 - 0s - loss: 0.9746 - hamming_loss: 0.3475 - val_loss: 0.9679 - val_hamming_loss: 0.3584
Epoch 4/100
6/6 - 0s - loss: 0.9561 - hamming_loss: 0.3479 - val_loss: 0.9450 - val_hamming_loss: 0.3514
Epoch 5/100
6/6 - 0s - loss: 0.9259 - hamming_loss: 0.3230 - val_loss: 0.9044 - val_hamming_loss: 0.3042
Epoch 6/100
6/6 - 0s - loss: 0.8743 - hamming_loss: 0.3046 - val_loss: 0.8490 - val_hamming_loss: 0.3007
Epoch 7/100
6/6 - 0s - loss: 0.8240 - hamming_loss: 0.2985 - val_loss: 0.8097 - val_hamming_loss: 0.3007
Epoch 8/100
6/6 - 0s - loss: 0.7903 - hamming_loss: 0.2985 - val_loss: 0.7843 - val_hamming_loss: 0.3007
Epoch 9/100
6/6 - 0s - loss: 0.7678 - hamming_loss: 0.2985 - val_loss: 0.7658 - val_hamming_loss: 0.3007
Epoch 10/100
6/6 - 0s - loss: 0.7521 - hamming_loss: 0.

<tensorflow.python.keras.callbacks.History at 0x20db3ec8af0>

In [210]:
## OPTIONAL CELL FOR IF WE HAVE PRE-TRAINED WEIGHTS
## Restore the weights
model_biLSTM_bpmll.load_weights('Models/BPMLL/biLSTM_bpmll_weights_manyEpochs')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x20dafbad640>

In [211]:
## Using a constant 0.5 threshold function, get the hamming loss for the trained network on the test set
predictions = model_biLSTM_bpmll.predict(test_padded_hasLabel)
predictions_binary = model_biLSTM_bpmll.predict(test_padded_hasLabel)
for i in range(Y_test_hasLabel.shape[0]):
    for j in range(Y_test_hasLabel.shape[1]):
        if predictions_binary[i, j] > 0.5:
            predictions_binary[i, j] = 1
        else:
            predictions_binary[i, j] = 0

# Get the hamming loss
metrics.hamming_loss(Y_test_hasLabel, predictions_binary)

0.15034965034965034

In [201]:
## Learn a Threshold Function
Y_train_pred = model_biLSTM_bpmll.predict(train_padded_hasLabel)
Y_test_pred = model_biLSTM_bpmll.predict(test_padded_hasLabel)
t_range = (0, 1)

test_labels_binary, threshold_function = predict_test_labels_binary(Y_train_pred, Y_train_hasLabel, Y_test_pred, t_range)
metrics.hamming_loss(Y_test_hasLabel, test_labels_binary)

0.20716783216783216

In [182]:
## save the model weights (no need to save learned threshold, in this case)
model_biLSTM_bpmll_filepath = 'Models/BPMLL/biLSTM_bpmll_weights_manyEpochs'
model_biLSTM_bpmll.save_weights(model_biLSTM_bpmll_filepath)