In [1]:
import numpy as np
import pandas as pd
import os
import random
import tensorflow as tf
import tensorflow_addons as tfa
from bpmll import bp_mll_loss
from sklearn.model_selection import train_test_split
from sklearn import metrics
import sys
sys.path.append('../ThresholdFunctionLearning')    ## Append path to the ThresholdFunctionLearning directory to the interpreters
                                                   ## search path
from threshold_learning import predict_test_labels_binary    ## Import the 'predict_test_labels_binary()' function from the 
                                                             ## threshold_learning library

In [4]:
## Define the LSTM RNN architecture
## Embedding -> single LSTM layer -> sigmoid Dense layer with one unit per label.
num_labels = 13            # number of units in the sigmoid output layer
max_length = 100           # input_length given to the Embedding layer
num_unique_words = 2711    # input_dim of the Embedding layer
                           # NOTE(review): hard-coded; presumably the vocabulary size
                           # the saved weights were trained with -- confirm

model_LSTM = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(num_unique_words, 32, input_length = max_length),
    tf.keras.layers.LSTM(16, return_sequences = False, return_state = False),
    #tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(num_labels, activation = 'sigmoid')
])

## `learning_rate` is the supported keyword argument; `lr` is a deprecated alias.
optim = tf.keras.optimizers.Adam(learning_rate = 0.0001)
#optim_func_LSTM = tf.keras.optimizers.Adagrad(
#    learning_rate = 0.001, initial_accumulator_value = 0.1, epsilon = 1e-07,
#    name = 'Adagrad')

#metrics = tfa.metrics.hamming_loss_fn(mode = 'multi-label')
## NOTE(review): the loss only matters if this model is trained further; the visible
## cells only call predict(). 'categorical_crossentropy' paired with independent
## sigmoid outputs looks unintended for a multi-label problem -- confirm which loss
## the saved weights were actually trained with.
model_LSTM.compile(loss = 'categorical_crossentropy', optimizer = optim)

In [11]:
# Restore the weights
# Loads saved weights from the './Models/Models' path (relative to the current
# working directory) into model_LSTM.
# NOTE(review): presumably this checkpoint was produced by a model with the same
# layer sizes as the architecture defined above -- confirm.
model_LSTM.load_weights('./Models/Models')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x254939949d0>

In [12]:
## Load 'content_paragraphs_ready.csv' into a pandas dataframe
## The path is assembled with os.path.join so that it works on every platform and
## contains no backslash sequences that Python would try to read as string escapes.
data_filepath = os.path.join("..", "..", "dataset", "content_paragraphs_ready.csv")
paragraph_data = pd.read_csv(data_filepath)
paragraph_data.head()

Unnamed: 0,para_id,full_text,threats/impacts,responses/actions,severity,susceptibility,self-efficacy,external-efficacy,response efficacy,public health,...,prosper,preview,moor,coverag,glow,profil,clash,incumb,frequent,unfound
0,214236,MURPHY: Again Martha we are defacto staying at...,1,1,0,1,1,0,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,214232,"GOV. PHIL MURPHY, (D-NJ): Yes. Good to be back...",1,1,1,1,1,0,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,214266,"BEAUMONT (ON SCREEN UPPER LEFT - ""FRIDAY MARCH...",0,1,0,0,0,1,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,214246,"But in the meantime, my message to Louisiana i...",1,1,1,0,1,0,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,214238,"MURPHY: Yeah listen, we had gotten another shi...",0,1,0,0,0,1,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
## Keep only the paragraph id, paragraph text, and labels
## (the first 15 columns of the loaded frame).
to_keep = paragraph_data.columns[0:15]
## .copy() gives an independent frame, so the later assignments to the
## "full_text" column do not raise SettingWithCopyWarning.
paragraph_data = paragraph_data[to_keep].copy()
paragraph_data.head()

Unnamed: 0,para_id,full_text,threats/impacts,responses/actions,severity,susceptibility,self-efficacy,external-efficacy,response efficacy,public health,economy,education,political evaluation,racial conflict,international ralations/foreign policies
0,214236,MURPHY: Again Martha we are defacto staying at...,1,1,0,1,1,0,1,1,0,0,0,0,0
1,214232,"GOV. PHIL MURPHY, (D-NJ): Yes. Good to be back...",1,1,1,1,1,0,1,1,0,0,0,0,0
2,214266,"BEAUMONT (ON SCREEN UPPER LEFT - ""FRIDAY MARCH...",0,1,0,0,0,1,0,1,0,0,0,0,0
3,214246,"But in the meantime, my message to Louisiana i...",1,1,1,0,1,0,1,1,0,0,0,0,0
4,214238,"MURPHY: Yeah listen, we had gotten another shi...",0,1,0,0,0,1,0,1,0,0,0,0,0


In [14]:
# Preprocessing
import re
import string

def remove_URL(text):
    """Return `text` with every URL-like substring deleted.

    A URL is either ``http://`` / ``https://`` followed by a run of
    non-whitespace characters, or ``www.`` followed by a run of
    non-whitespace characters. Surrounding whitespace is left in place.
    """
    return re.sub(r"https?://\S+|www\.\S+", "", text)

# https://stackoverflow.com/questions/34293875/how-to-remove-punctuation-marks-from-a-string-in-python-3-x-using-translate/34294022
def remove_punct(text):
    """Return `text` with every character in `string.punctuation` removed."""
    # Mapping a character to None in a translation table deletes it.
    deletions = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletions)

# Display the set of characters that remove_punct strips out
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [15]:
# Sanity check for URL removal: find the first paragraph that contains a URL and
# show the paragraph, the URLs found in it, and the paragraph with them stripped.
# The expression is the same one remove_URL uses and has no capturing group, so
# findall returns whole URLs and the preview matches what remove_URL will do.
pattern = re.compile(r"https?://\S+|www\.\S+")
for t in paragraph_data.full_text:
    matches = pattern.findall(t)
    if matches:
        print(t)
        print(matches)
        print(pattern.sub(r"", t))
        break    # one example is enough

In [16]:
# Clean every paragraph: strip URLs first, then strip punctuation
paragraph_data["full_text"] = (
    paragraph_data["full_text"]
    .map(remove_URL)
    .map(remove_punct)
)

In [17]:
# remove stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Stop words: commonly used words (such as "the", "a", "an", "in") that a search
# engine has been programmed to ignore, both when indexing entries for searching
# and when retrieving them as the result of a search query.
# The NLTK English list is stored as a set; remove_stopwords tests membership in it.
stop = set(stopwords.words("english"))

# https://stackoverflow.com/questions/5486337/how-to-remove-stop-words-using-nltk-or-python
def remove_stopwords(text):
    """Lower-case `text`, drop words found in `stop`, and re-join with single spaces.

    Words are obtained by splitting on whitespace; a word is dropped when its
    lower-cased form is a member of the module-level `stop` set.
    """
    kept = []
    for token in text.split():
        lowered = token.lower()
        if lowered not in stop:
            kept.append(lowered)
    return " ".join(kept)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rober\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
# Lower-case every paragraph and remove its stop words (overwrites "full_text" in place)
paragraph_data["full_text"] = paragraph_data.full_text.map(remove_stopwords)

In [19]:
from collections import Counter

# Count unique words
def counter_word(text_col):
    """Tally how often each whitespace-separated word occurs in `text_col`.

    `text_col` must expose a `.values` iterable of strings (e.g. a pandas
    Series). Returns a `collections.Counter` mapping word -> occurrence count.
    """
    tally = Counter()
    for entry in text_col.values:
        tally.update(entry.split())
    return tally

# Word frequencies over the cleaned text of every paragraph
counter = counter_word(paragraph_data.full_text)

In [20]:
# Number of distinct words in the cleaned corpus. This overwrites the hard-coded
# value of num_unique_words that the model-definition cell used for the Embedding
# layer, and is the value later passed to Tokenizer(num_words = ...).
# NOTE(review): the two values are not checked against each other -- confirm they agree.
num_unique_words = len(counter)
# Show the five most frequent words
counter.most_common(5)

[('said', 69), ('people', 61), ('trump', 49), ('new', 48), ('tests', 47)]

In [21]:
## Define the X and Y train and test matrices
covariate_cols = ['full_text']
# Every column other than the paragraph id and the text is treated as a label.
# NOTE(review): Index.difference presumably returns the names sorted rather than in
# their original frame order -- confirm the resulting column order of Y matches the
# label order the saved model was trained with.
label_cols = paragraph_data.columns.difference(['para_id'] + covariate_cols)

# X: array of cleaned paragraph strings; Y: label matrix cast to float
X = paragraph_data.full_text.to_numpy()
Y = paragraph_data[label_cols].to_numpy().astype(float)

# Hold out 33% of the paragraphs for testing; random_state fixes the split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.33, random_state = 321)

In [22]:
# Tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
# Seeds Python's built-in `random` module only.
# NOTE(review): nothing in this cell visibly draws from `random` -- confirm the seed is needed.
random.seed(123)

# vectorize a text corpus by turning each text into a sequence of integers
tokenizer = Tokenizer(num_words = num_unique_words)
tokenizer.fit_on_texts(X_train) # fit only to training
# word -> integer index mapping learned from the training texts
word_index = tokenizer.word_index

In [23]:
# Convert each paragraph into its sequence of integer word indices, using the
# tokenizer that was fitted on the training texts only
train_sequences = tokenizer.texts_to_sequences(X_train)
test_sequences = tokenizer.texts_to_sequences(X_test)

In [24]:
# Length of the longest tokenized training paragraph (0 when there are none)
max_par_length = max((len(par) for par in train_sequences), default = 0)

max_par_length

86

In [25]:
# Pad the sequences to have the same length
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Max number of words in a sequence
max_length = 100

# Train and test sequences are padded/truncated at the end with identical settings
train_padded, test_padded = (
    pad_sequences(seqs, maxlen = max_length, padding = "post", truncating = "post")
    for seqs in (train_sequences, test_sequences)
)
train_padded.shape, test_padded.shape

((194, 100), (96, 100))

In [26]:
# Check reversing the indices
# Invert word_index (word -> idx) into an idx -> word lookup
reverse_word_index = {idx: word for word, idx in word_index.items()}

def decode(sequence):
    """Turn a sequence of word indices back into a space-separated string.

    Indices missing from `reverse_word_index` are rendered as "?".
    """
    words = (reverse_word_index.get(idx, "?") for idx in sequence)
    return " ".join(words)

In [28]:
## Using a constant 0.5 threshold function, get the hamming loss for the trained network on the test set
predictions = model_LSTM.predict(test_padded)
# Threshold the predicted scores in one vectorized step, reusing `predictions`
# instead of running a second forward pass and an element-by-element loop.
# Scores strictly greater than 0.5 become 1, everything else becomes 0.
predictions_binary = (predictions > 0.5).astype(predictions.dtype)

# Get the hamming loss
metrics.hamming_loss(Y_test, predictions_binary)

0.5024038461538461

In [29]:
## Learn a Threshold Function
# Model outputs on the training and test sets
Y_train_pred = model_LSTM.predict(train_padded)
Y_test_pred = model_LSTM.predict(test_padded)
# NOTE(review): presumably the (low, high) range of candidate thresholds searched
# by predict_test_labels_binary -- confirm against threshold_learning.
t_range = (0, 1)

# predict_test_labels_binary is imported from threshold_learning; it receives the
# training predictions, the true training labels, the test predictions and t_range,
# and returns two values: binary test labels and the learned threshold function.
test_labels_binary, threshold_function = predict_test_labels_binary(Y_train_pred, Y_train, Y_test_pred, t_range)
# Hamming loss of the thresholded test labels against the true test labels
metrics.hamming_loss(Y_test, test_labels_binary)

0.17708333333333334