# Description
This notebook contains
- implementation and training of 6-way classification using a slightly deeper parallel Bi-LSTM for the condition SJ (statement + justification)

## Model description
### 6 way - model_sj_6
- tokenize the texts for the statements and the justification
- a frozen (non-trainable) embedding layer initialised with GloVe embeddings for each input, i.e. one for the statements and one for the justifications
- two parallel Bi-LSTM layers, one for the statements and one for the justifications
- a concatenate layer to merge the result of the two Bi-LSTM layers
- two dense layers ending with a softmax activation with 6 output units

## Results
### 6 way - model_sj_6
- Val accuracy = 24.38%
- Test accuracy = 21.94%

## Weights file
### 6 way - model_sj_6
- model_d_sj_6_weights_1.h5


In [1]:
import numpy as np
import matplotlib.pyplot as plt
import os
import re
import pandas as pd

In [3]:
# All three dataset splits live in the same directory, given relative to the notebook.
dataset_dir = "dataset"

train_data_file, test_data_file, val_data_file = (
    os.path.join(dataset_dir, split_file)
    for split_file in ("train2.tsv", "test2.tsv", "val2.tsv")
)

In [4]:
# Column names for the header-less tsv files, taken from the readme.md of the
# LIAR-PLUS github repo.
# link to repo - https://github.com/Tariq60/LIAR-PLUS

col_names = [
    "id",
    "label",
    "statement",
    "subject",
    "speaker",
    "speaker_job",
    "state_info",
    "party",
    "barely_true",
    "false",
    "half_true",
    "mostly_true",
    "pants_on_fire",
    "context",
    "ex_just",
]

In [5]:
def _read_split(tsv_path):
    """Read one header-less, tab-separated split and label its columns with `col_names`."""
    return pd.read_csv(tsv_path, sep="\t", header=None, names=col_names)


train_data = _read_split(train_data_file)
test_data = _read_split(test_data_file)
val_data = _read_split(val_data_file)

In [6]:
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.layers import Dense, LSTM, Embedding, Input, Bidirectional
from keras.initializers import Constant
from keras.utils import to_categorical

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [7]:
# GloVe embeddings are used, as mentioned in the paper. The approach follows the
# keras documentation on using pretrained word embeddings.
# link to reference - https://keras.io/examples/pretrained_word_embeddings/
# link to download glove embeddings - https://nlp.stanford.edu/projects/glove/

embeddings_dim = 100     # number of columns of the embedding matrix
max_no_of_words = 20000  # vocabulary cap handed to the tokenizer
max_len_seq = 1000       # length every token sequence is padded to
glove_file = os.path.join("glove", "glove.6B.100d.txt")

In [9]:
# Parse the GloVe text file into a {word: vector} dictionary. Every non-empty
# line holds a word followed by its whitespace-separated coefficients.
embeddings_index = {}
# The encoding is passed explicitly: without it open() falls back to the
# platform default, which can fail to decode the file on some systems.
with open(glove_file, encoding="utf-8") as f:
    for line in f:
        values = line.split()
        if not values:
            # skip blank lines instead of failing on values[0]
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [10]:
def _column_texts(frame, column):
    """Return `frame[column]` as a list of texts, with missing values turned into ""."""
    # fillna is the direct way to blank out missing values; the earlier
    # replace(np.nan, "", regex=True) used a regex flag on a non-string value.
    return list(frame[column].fillna(""))


train_stm_texts = _column_texts(train_data, "statement")
train_just_texts = _column_texts(train_data, "ex_just")

# One shared vocabulary, fitted on the training statements and justifications only.
tokenizer = Tokenizer(num_words=max_no_of_words)
tokenizer.fit_on_texts(train_stm_texts)
tokenizer.fit_on_texts(train_just_texts)

# Texts -> lists of word indices.
train_stm_sequences = tokenizer.texts_to_sequences(train_stm_texts)
train_just_sequences = tokenizer.texts_to_sequences(train_just_texts)

val_stm_sequences = tokenizer.texts_to_sequences(_column_texts(val_data, "statement"))
val_just_sequences = tokenizer.texts_to_sequences(_column_texts(val_data, "ex_just"))

test_stm_sequences = tokenizer.texts_to_sequences(_column_texts(test_data, "statement"))
test_just_sequences = tokenizer.texts_to_sequences(_column_texts(test_data, "ex_just"))

# Pad every sequence to max_len_seq. pad_sequences already returns a numpy
# array, so no extra np.array(...) wrapper is needed.
train_stm_seq = pad_sequences(train_stm_sequences, maxlen=max_len_seq)
train_just_seq = pad_sequences(train_just_sequences, maxlen=max_len_seq)

val_stm_seq = pad_sequences(val_stm_sequences, maxlen=max_len_seq)
val_just_seq = pad_sequences(val_just_sequences, maxlen=max_len_seq)

test_stm_seq = pad_sequences(test_stm_sequences, maxlen=max_len_seq)
test_just_seq = pad_sequences(test_just_sequences, maxlen=max_len_seq)

In [11]:
# Row r of the matrix holds the GloVe vector of the word whose tokenizer index is r.
# One extra row is added because tokenizer indices start from 1.
num_words = 1 + min(max_no_of_words, len(tokenizer.word_index))
embedding_matrix = np.zeros((num_words, embeddings_dim))
for token, row in tokenizer.word_index.items():
    glove_vector = embeddings_index.get(token)  # None when the word is not in GloVe
    # Rows beyond the matrix are skipped; words without a vector keep their all-zero row.
    if row < num_words and glove_vector is not None:
        embedding_matrix[row] = glove_vector

In [12]:
# Label string -> class id, and the inverse lookup used to print predictions.
six_val = {"pants-fire":0, "false":1, "barely-true":2, "half-true":3, "mostly-true":4, "true":5}
rev_six_val = {class_id: label for label, class_id in six_val.items()}


def _encode_labels(labels):
    """Map label strings to class ids; an unknown label raises KeyError."""
    return np.array([six_val[label] for label in labels])


# num_classes is passed explicitly so that every split is one-hot encoded with
# the same number of columns, even if a split does not contain the highest class id.
train_sj_6 = _encode_labels(train_data["label"])
train_cat_sj_6 = to_categorical(train_sj_6, num_classes=len(six_val))

val_sj_6 = _encode_labels(val_data["label"])
val_cat_sj_6 = to_categorical(val_sj_6, num_classes=len(six_val))

test_sj_6 = _encode_labels(test_data["label"])
test_cat_sj_6 = to_categorical(test_sj_6, num_classes=len(six_val))

In [13]:
from keras.layers import concatenate


def _bilstm_branch():
    """Build one text branch: int32 input -> frozen pretrained embedding -> Bi-LSTM(64).

    Returns a pair (input tensor, Bi-LSTM output tensor).
    """
    branch_inp = Input(shape=(max_len_seq,), dtype="int32")
    # NOTE(review): the embedding is created without mask_zero, so the LSTM also
    # steps over padded positions - consider whether masking is wanted.
    embedded = Embedding(
        num_words,
        embeddings_dim,
        embeddings_initializer=Constant(embedding_matrix),
        input_length=max_len_seq,
        trainable=False,
    )(branch_inp)
    return branch_inp, Bidirectional(LSTM(64))(embedded)


# Two parallel branches: the first for statements, the second for justifications.
stm_inp_6, stm_x_6 = _bilstm_branch()
just_inp_6, just_x_6 = _bilstm_branch()

# Merge both branch outputs, then a tanh hidden layer and a 6-unit softmax output.
x_sj_6 = concatenate([stm_x_6, just_x_6])
x_sj_6 = Dense(64, activation="tanh")(x_sj_6)
c_sj_6 = Dense(6, activation="softmax")(x_sj_6)

model_sj_6 = Model(inputs=[stm_inp_6, just_inp_6], outputs=c_sj_6)
model_sj_6.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

Instructions for updating:
Colocations handled automatically by placer.


In [14]:
# Train on the (statement, justification) pairs; the validation split is passed
# as validation_data. The call stays the last expression so the History object is shown.
model_sj_6.fit(
    [train_stm_seq, train_just_seq],
    train_cat_sj_6,
    batch_size=32,
    epochs=10,
    verbose=1,
    validation_data=([val_stm_seq, val_just_seq], val_cat_sj_6),
)

Instructions for updating:
Use tf.cast instead.
Train on 10240 samples, validate on 1284 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x12cdfdd30>

In [18]:
# evaluate() output is indexed at [1] for the accuracy (the only metric the model was compiled with).
test_inputs = [test_stm_seq, test_just_seq]
test_scores = model_sj_6.evaluate(test_inputs, test_cat_sj_6)
print("test accuracy = {}".format(test_scores[1]))

# Show the predicted labels of the first ten test rows next to the true labels.
pred_prob_sj_6 = model_sj_6.predict(test_inputs)
first_ten_class_ids = np.argmax(pred_prob_sj_6[:10], axis=1)
print([rev_six_val[class_id] for class_id in first_ten_class_ids])
print(test_data["label"].head(10))

test accuracy = 0.21941594317872973
['pants-fire', 'false', 'barely-true', 'false', 'false', 'barely-true', 'barely-true', 'false', 'false', 'barely-true']
0           true
1          false
2          false
3      half-true
4     pants-fire
5           true
6           true
7    barely-true
8           true
9    barely-true
Name: label, dtype: object


In [19]:
# Write the weights of model_sj_6 to the weights file named in the notebook header.
# The path is relative, so the file lands in the current working directory.
model_sj_6.save_weights("model_d_sj_6_weights_1.h5")