In [None]:
########################################################################################################################
# Filename: RNN_Paragraph_Classification.ipynb
#
# Purpose: Multi-label Text-categorization, using recurrent neural networks, for paragraph-level
#          data as part of STAT 6500 final project.

# Author(s): Bobby (Robert) Lumpkin
#
# Library Dependencies: numpy, pandas, tensorflow, bpmll
########################################################################################################################

# Paragraph Classification Using RNNs

In [1]:
import numpy as np
import json
import pandas as pd
import os
import random
import tensorflow as tf
import tensorflow_addons as tfa
from bpmll import bp_mll_loss
import sklearn_json as skljson
from sklearn.model_selection import train_test_split
from sklearn import metrics
import sys
sys.path.append('../ThresholdFunctionLearning')    ## Append path to the ThresholdFunctionLearning directory to the interpreters
                                                   ## search path
from threshold_learning import predict_test_labels_binary    ## Import the 'predict_test_labels_binary()' function from the 
                                                             ## threshold_learning library

In [2]:
## Load 'content_paragraphs_ready.csv' into a pandas dataframe
data_filepath = "..\..\dataset\content_paragraphs_ready.csv"
paragraph_data = pd.read_csv(data_filepath)
paragraph_data.head()

Unnamed: 0,para_id,full_text,threats/impacts,responses/actions,severity,susceptibility,self-efficacy,external-efficacy,response efficacy,public health,...,prosper,preview,moor,coverag,glow,profil,clash,incumb,frequent,unfound
0,214236,MURPHY: Again Martha we are defacto staying at...,1,1,0,1,1,0,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,214232,"GOV. PHIL MURPHY, (D-NJ): Yes. Good to be back...",1,1,1,1,1,0,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,214266,"BEAUMONT (ON SCREEN UPPER LEFT - ""FRIDAY MARCH...",0,1,0,0,0,1,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,214246,"But in the meantime, my message to Louisiana i...",1,1,1,0,1,0,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,214238,"MURPHY: Yeah listen, we had gotten another shi...",0,1,0,0,0,1,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
## Keep only the paragraph id, paragraph text, and labels
to_keep = paragraph_data.columns[0:15]
#to_keep
paragraph_data = paragraph_data[to_keep]
paragraph_data.head()

Unnamed: 0,para_id,full_text,threats/impacts,responses/actions,severity,susceptibility,self-efficacy,external-efficacy,response efficacy,public health,economy,education,political evaluation,racial conflict,international ralations/foreign policies
0,214236,MURPHY: Again Martha we are defacto staying at...,1,1,0,1,1,0,1,1,0,0,0,0,0
1,214232,"GOV. PHIL MURPHY, (D-NJ): Yes. Good to be back...",1,1,1,1,1,0,1,1,0,0,0,0,0
2,214266,"BEAUMONT (ON SCREEN UPPER LEFT - ""FRIDAY MARCH...",0,1,0,0,0,1,0,1,0,0,0,0,0
3,214246,"But in the meantime, my message to Louisiana i...",1,1,1,0,1,0,1,1,0,0,0,0,0
4,214238,"MURPHY: Yeah listen, we had gotten another shi...",0,1,0,0,0,1,0,1,0,0,0,0,0


In [4]:
# Preprocessing
import re
import string

def remove_URL(text):
    url = re.compile(r"https?://\S+|www\.\S+")
    return url.sub(r"", text)

# https://stackoverflow.com/questions/34293875/how-to-remove-punctuation-marks-from-a-string-in-python-3-x-using-translate/34294022
def remove_punct(text):
    translator = str.maketrans("", "", string.punctuation)
    return text.translate(translator)

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [5]:
pattern = re.compile(r"https?://(\S+|www)\.\S+")
for t in paragraph_data.full_text:
    matches = pattern.findall(t)
    for match in matches:
        print(t)
        print(match)
        print(pattern.sub(r"", t))
    if len(matches) > 0:
        break

In [6]:
paragraph_data["full_text"] = paragraph_data.full_text.map(remove_URL) # map(lambda x: remove_URL(x))
paragraph_data["full_text"] = paragraph_data.full_text.map(remove_punct)

In [7]:
# remove stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Stop Words: A stop word is a commonly used word (such as “the”, “a”, “an”, “in”) that a search engine
# has been programmed to ignore, both when indexing entries for searching and when retrieving them 
# as the result of a search query.
stop = set(stopwords.words("english"))

# https://stackoverflow.com/questions/5486337/how-to-remove-stop-words-using-nltk-or-python
def remove_stopwords(text):
    filtered_words = [word.lower() for word in text.split() if word.lower() not in stop]
    return " ".join(filtered_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rober\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
paragraph_data["full_text"] = paragraph_data.full_text.map(remove_stopwords)

In [9]:
from collections import Counter

# Count unique words
def counter_word(text_col):
    count = Counter()
    for text in text_col.values:
        for word in text.split():
            count[word] += 1
    return count

counter = counter_word(paragraph_data.full_text)

In [10]:
num_unique_words = len(counter)
counter.most_common(5)

[('said', 69), ('people', 61), ('trump', 49), ('new', 48), ('tests', 47)]

In [11]:
## Define the X and Y train and test matrices
covariate_cols = ['full_text']
label_cols = paragraph_data.columns.difference(['para_id'] + covariate_cols)

X = paragraph_data.full_text.to_numpy()
Y = paragraph_data[label_cols].to_numpy().astype(float)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.33, random_state = 321)

In [12]:
X_train.shape

(194,)

In [13]:
# Tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
random.seed(123)

# vectorize a text corpus by turning each text into a sequence of integers
tokenizer = Tokenizer(num_words = num_unique_words)
tokenizer.fit_on_texts(X_train) # fit only to training
word_index = tokenizer.word_index

In [14]:
train_sequences = tokenizer.texts_to_sequences(X_train)
test_sequences = tokenizer.texts_to_sequences(X_test)

In [15]:
print(X_train[0])
print(train_sequences[0])

johnson asked 70 local savannah religious leaders keep worship centers closed none leaders said would reopen johnson told religious leaders understood financial burden religious institutions closed said reach god without going building”
[478, 72, 829, 830, 831, 316, 178, 114, 832, 228, 229, 833, 178, 1, 23, 230, 478, 57, 316, 178, 834, 144, 835, 316, 836, 229, 1, 479, 837, 179, 3, 838]


In [16]:
max_par_length = 0
for par in train_sequences:
    if len(par) > max_par_length:
        max_par_length = len(par)
        
max_par_length

86

In [17]:
# Pad the sequences to have the same length
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Max number of words in a sequence
max_length = 100

train_padded = pad_sequences(train_sequences, maxlen = max_length, padding = "post", truncating = "post")
test_padded = pad_sequences(test_sequences, maxlen = max_length, padding = "post", truncating = "post")
train_padded.shape, test_padded.shape

((194, 100), (96, 100))

In [18]:
# Check reversing the indices
# flip (key, value)
reverse_word_index = dict([(idx, word) for (word, idx) in word_index.items()])

def decode(sequence):
    return " ".join([reverse_word_index.get(idx, "?") for idx in sequence])

In [19]:
decoded_text = decode(train_sequences[0])

print(train_sequences[0])
print(decoded_text)

[478, 72, 829, 830, 831, 316, 178, 114, 832, 228, 229, 833, 178, 1, 23, 230, 478, 57, 316, 178, 834, 144, 835, 316, 836, 229, 1, 479, 837, 179, 3, 838]
johnson asked 70 local savannah religious leaders keep worship centers closed none leaders said would reopen johnson told religious leaders understood financial burden religious institutions closed said reach god without going building”


## Train a Simple RNN

In [20]:
## Define the simple RNN architecture
num_labels = len(label_cols)

model_simpleRNN = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(num_unique_words, 32, input_length = max_length),
    tf.keras.layers.SimpleRNN(16, return_sequences = False, return_state = False),
    #tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(num_labels, activation = 'sigmoid')
])

optim = tf.keras.optimizers.Adam(lr=0.0001)

#metrics = tfa.metrics.hamming_loss_fn(mode = 'multi-label')
model_simpleRNN.compile(loss = 'categorical_crossentropy', optimizer = optim)

In [21]:
tf.random.set_seed(123)
model_simpleRNN.fit(train_padded, Y_train, epochs = 100, validation_data = (test_padded, Y_test), verbose=2)

Epoch 1/100
7/7 - 3s - loss: 7.6985 - val_loss: 7.9493
Epoch 2/100
7/7 - 0s - loss: 7.6808 - val_loss: 7.9473
Epoch 3/100
7/7 - 0s - loss: 7.6505 - val_loss: 7.9211
Epoch 4/100
7/7 - 0s - loss: 7.6168 - val_loss: 7.9098
Epoch 5/100
7/7 - 0s - loss: 7.5955 - val_loss: 7.9053
Epoch 6/100
7/7 - 0s - loss: 7.5760 - val_loss: 7.9000
Epoch 7/100
7/7 - 0s - loss: 7.5562 - val_loss: 7.8980
Epoch 8/100
7/7 - 0s - loss: 7.5386 - val_loss: 7.8973
Epoch 9/100
7/7 - 0s - loss: 7.5143 - val_loss: 7.8934
Epoch 10/100
7/7 - 0s - loss: 7.4868 - val_loss: 7.8929
Epoch 11/100
7/7 - 0s - loss: 7.4556 - val_loss: 7.8921
Epoch 12/100
7/7 - 0s - loss: 7.4189 - val_loss: 7.8920
Epoch 13/100
7/7 - 0s - loss: 7.3838 - val_loss: 7.8970
Epoch 14/100
7/7 - 0s - loss: 7.3515 - val_loss: 7.9027
Epoch 15/100
7/7 - 0s - loss: 7.3189 - val_loss: 7.9085
Epoch 16/100
7/7 - 0s - loss: 7.2790 - val_loss: 7.9105
Epoch 17/100
7/7 - 0s - loss: 7.2497 - val_loss: 7.9119
Epoch 18/100
7/7 - 0s - loss: 7.2189 - val_loss: 7.9157
E

<tensorflow.python.keras.callbacks.History at 0x20876ba81f0>

In [22]:
## Using a constant 0.5 threshold function, get the hamming loss for the trained network on the test set
predictions = model_simpleRNN.predict(test_padded)
predictions_binary = model_simpleRNN.predict(test_padded)
for i in range(Y_test.shape[0]):
    for j in range(Y_test.shape[1]):
        if predictions_binary[i, j] > 0.5:
            predictions_binary[i, j] = 1
        else:
            predictions_binary[i, j] = 0

# Get the hamming loss
metrics.hamming_loss(Y_test, predictions_binary)

0.4567307692307692

In [23]:
## Learn a Threshold Function
Y_train_pred = model_simpleRNN.predict(train_padded)
Y_test_pred = model_simpleRNN.predict(test_padded)
t_range = (0, 1)

test_labels_binary, threshold_function = predict_test_labels_binary(Y_train_pred, Y_train, Y_test_pred, t_range)
metrics.hamming_loss(Y_test, test_labels_binary)

0.3125

In [24]:
Y_test[0,]

array([0., 0., 0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0.])

In [25]:
test_labels_binary[0,]

array([0., 0., 0., 0., 1., 0., 0., 1., 1., 0., 0., 0., 1.])

## Train an LSTM model

In [20]:
## Define the LSTM RNN architecture
num_labels = len(label_cols)

model_LSTM = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(num_unique_words, 32, input_length = max_length),
    tf.keras.layers.LSTM(16, return_sequences = False, return_state = False),
    #tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(num_labels, activation = 'sigmoid')
])

optim = tf.keras.optimizers.Adam(lr=0.0001)
#optim_func_LSTM = tf.keras.optimizers.Adagrad(
#    learning_rate = 0.001, initial_accumulator_value = 0.1, epsilon = 1e-07,
#    name = 'Adagrad')

#metrics = tfa.metrics.hamming_loss_fn(mode = 'multi-label')
model_LSTM.compile(loss = 'categorical_crossentropy', optimizer = optim)

In [21]:
tf.random.set_seed(123)
model_LSTM.fit(train_padded, Y_train, epochs = 50, validation_data = (test_padded, Y_test), verbose=2)

Epoch 1/50
7/7 - 4s - loss: 7.6639 - val_loss: 7.9256
Epoch 2/50
7/7 - 0s - loss: 7.6582 - val_loss: 7.9200
Epoch 3/50
7/7 - 0s - loss: 7.6526 - val_loss: 7.9144
Epoch 4/50
7/7 - 0s - loss: 7.6473 - val_loss: 7.9089
Epoch 5/50
7/7 - 0s - loss: 7.6418 - val_loss: 7.9034
Epoch 6/50
7/7 - 0s - loss: 7.6366 - val_loss: 7.8979
Epoch 7/50
7/7 - 0s - loss: 7.6312 - val_loss: 7.8922
Epoch 8/50
7/7 - 0s - loss: 7.6256 - val_loss: 7.8863
Epoch 9/50
7/7 - 0s - loss: 7.6198 - val_loss: 7.8807
Epoch 10/50
7/7 - 0s - loss: 7.6143 - val_loss: 7.8748
Epoch 11/50
7/7 - 0s - loss: 7.6085 - val_loss: 7.8689
Epoch 12/50
7/7 - 0s - loss: 7.6025 - val_loss: 7.8628
Epoch 13/50
7/7 - 0s - loss: 7.5963 - val_loss: 7.8561
Epoch 14/50
7/7 - 0s - loss: 7.5899 - val_loss: 7.8491
Epoch 15/50
7/7 - 0s - loss: 7.5832 - val_loss: 7.8423
Epoch 16/50
7/7 - 0s - loss: 7.5766 - val_loss: 7.8353
Epoch 17/50
7/7 - 0s - loss: 7.5692 - val_loss: 7.8270
Epoch 18/50
7/7 - 0s - loss: 7.5614 - val_loss: 7.8183
Epoch 19/50
7/7 - 0

<tensorflow.python.keras.callbacks.History at 0x20dcc8436a0>

In [28]:
## Using a constant 0.5 threshold function, get the hamming loss for the trained network on the test set
predictions = model_LSTM.predict(test_padded)
predictions_binary = model_LSTM.predict(test_padded)
for i in range(Y_test.shape[0]):
    for j in range(Y_test.shape[1]):
        if predictions_binary[i, j] > 0.5:
            predictions_binary[i, j] = 1
        else:
            predictions_binary[i, j] = 0

# Get the hamming loss
metrics.hamming_loss(Y_test, predictions_binary)

0.5024038461538461

In [29]:
## Learn a Threshold Function
Y_train_pred = model_LSTM.predict(train_padded)
Y_test_pred = model_LSTM.predict(test_padded)
t_range = (0, 1)

test_labels_binary, threshold_function = predict_test_labels_binary(Y_train_pred, Y_train, Y_test_pred, t_range)
metrics.hamming_loss(Y_test, test_labels_binary)

0.17708333333333334

In [30]:
Y_test[0,]

array([0., 0., 0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0.])

In [31]:
test_labels_binary[0,]

array([0., 0., 0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0.])

In [38]:
## save the model weights 
model_LSTM_filepath = 'Models/LSTM_weights'
model_LSTM.save_weights(model_LSTM_filepath)

In [36]:
## save the learned threshold function
threshold_filepath = 'Models/threshold_LSTM.json'
skljson.to_json(threshold_function, threshold_filepath)

## Train a Bidirectional LSTM

In [45]:
## Define the LSTM RNN architecture
num_labels = len(label_cols)

model_biLSTM = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(num_unique_words, 32, input_length = max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16, return_sequences = False, return_state = False)),
    #tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(num_labels, activation = 'sigmoid')
])

#optim = tf.keras.optimizers.Adam(lr=0.0001)
optim = tf.keras.optimizers.Adagrad(
    learning_rate = 0.001, initial_accumulator_value = 0.1, epsilon = 1e-07,
    name = 'Adagrad')

#optim = tf.keras.optimizers.RMSprop(learning_rate=0.001, rho=0.9, momentum = 0.8, epsilon=1e-07,)

#metrics = tfa.metrics.hamming_loss_fn(mode = 'multi-label')
model_biLSTM.compile(loss = 'categorical_crossentropy', optimizer = optim)

In [46]:
tf.random.set_seed(123)
model_biLSTM.fit(train_padded, Y_train, epochs = 100, validation_data = (test_padded, Y_test), verbose=2)

Epoch 1/100
7/7 - 6s - loss: 7.6579 - val_loss: 7.9161
Epoch 2/100
7/7 - 0s - loss: 7.6446 - val_loss: 7.9055
Epoch 3/100
7/7 - 0s - loss: 7.6339 - val_loss: 7.8966
Epoch 4/100
7/7 - 0s - loss: 7.6247 - val_loss: 7.8892
Epoch 5/100
7/7 - 0s - loss: 7.6172 - val_loss: 7.8823
Epoch 6/100
7/7 - 0s - loss: 7.6100 - val_loss: 7.8756
Epoch 7/100
7/7 - 0s - loss: 7.6031 - val_loss: 7.8687
Epoch 8/100
7/7 - 0s - loss: 7.5961 - val_loss: 7.8634
Epoch 9/100
7/7 - 0s - loss: 7.5904 - val_loss: 7.8577
Epoch 10/100
7/7 - 0s - loss: 7.5844 - val_loss: 7.8522
Epoch 11/100
7/7 - 0s - loss: 7.5784 - val_loss: 7.8471
Epoch 12/100
7/7 - 0s - loss: 7.5730 - val_loss: 7.8414
Epoch 13/100
7/7 - 0s - loss: 7.5671 - val_loss: 7.8359
Epoch 14/100
7/7 - 0s - loss: 7.5616 - val_loss: 7.8317
Epoch 15/100
7/7 - 0s - loss: 7.5571 - val_loss: 7.8273
Epoch 16/100
7/7 - 0s - loss: 7.5522 - val_loss: 7.8217
Epoch 17/100
7/7 - 0s - loss: 7.5465 - val_loss: 7.8170
Epoch 18/100
7/7 - 0s - loss: 7.5416 - val_loss: 7.8121
E

<tensorflow.python.keras.callbacks.History at 0x20809c26880>

In [47]:
## Using a constant 0.5 threshold function, get the hamming loss for the trained network on the test set
predictions = model_biLSTM.predict(test_padded)
predictions_binary = model_biLSTM.predict(test_padded)
for i in range(Y_test.shape[0]):
    for j in range(Y_test.shape[1]):
        if predictions_binary[i, j] > 0.5:
            predictions_binary[i, j] = 1
        else:
            predictions_binary[i, j] = 0

# Get the hamming loss
metrics.hamming_loss(Y_test, predictions_binary)

0.41746794871794873

In [48]:
## Learn a Threshold Function
Y_train_pred = model_biLSTM.predict(train_padded)
Y_test_pred = model_biLSTM.predict(test_padded)
t_range = (0, 1)

test_labels_binary, threshold_function = predict_test_labels_binary(Y_train_pred, Y_train, Y_test_pred, t_range)
metrics.hamming_loss(Y_test, test_labels_binary)

0.2155448717948718

In [37]:
Y_test[0,]

array([0., 0., 0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0.])

In [38]:
test_labels_binary[0,]

array([0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.])