# Notebook initialization
- Fix random seeds for reproducible results. Full determinism cannot be achieved in Keras because some GPU operations are intrinsically non-deterministic (https://keras.io/getting_started/faq/#how-can-i-obtain-reproducible-results-using-keras-during-development)

- Enable Weights & Biases, a platform for experiment logging (keep it disabled if no login is available).

In [39]:
#@title { form-width: "31%" }
import os
import random
import math
import numpy as np
import tensorflow as tf

# Fix every random seed we control so that runs are as repeatable as possible.
seed_value = 42 #@param {type:"integer"}

# NOTE(review): PYTHONHASHSEED is read when the interpreter starts, so setting
# it here presumably only affects child processes, not the hash randomization
# of this already-running kernel -- confirm whether that is sufficient.
os.environ['PYTHONHASHSEED']=str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)

tf.compat.v1.set_random_seed(seed_value)

# Limit TensorFlow to one thread both inside an op and across ops, then
# install the resulting session as the Keras backend session.
session_conf = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=session_conf)
tf.compat.v1.keras.backend.set_session(sess)

# Weights & Biases API init (optional experiment logging).
# wandb is only installed and imported when the flag below is enabled, so
# every later use of `wandb` must be guarded by ENABLE_WANDB.
ENABLE_WANDB = False        #@param {type:"boolean"}
if ENABLE_WANDB:
    !pip install wandb
    !wandb login wandb_api_token
    import wandb

# Dataset preparation

- The dataset is downloaded by the provided script  
- Sentences are preprocessed  
- The dataset is converted to word embeddings  
- A data iterator is defined for usage by the model

In [40]:
#@title Download { form-width: "31%" }

import os
import requests
import zipfile

def save_response_content(response, destination):
    """
    Stream the body of ``response`` into the file ``destination``.

    The body is read through ``response.iter_content`` in 32 KiB pieces and
    written in binary mode; empty pieces (keep-alive chunks) are skipped.
    """
    chunk_size = 32768

    with open(destination, "wb") as out_file:
        # filter(None, ...) drops the empty keep-alive chunks
        out_file.writelines(filter(None, response.iter_content(chunk_size)))

def download_data(data_path):
    """
    Download the FEVER data archive into ``data_path`` and extract it there.

    The directory is created when missing. If ``fever_data.zip`` is already
    present in ``data_path`` nothing is downloaded or extracted.

    Raises ``requests.HTTPError`` when the server answers with an error
    status, instead of silently saving the error page as the archive.
    """
    toy_data_path = os.path.join(data_path, 'fever_data.zip')
    toy_data_url_id = "1wArZhF9_SHW17WKNGeLmX-QTYw9Zscl1"
    toy_url = "https://docs.google.com/uc?export=download"

    os.makedirs(data_path, exist_ok=True)

    if os.path.exists(toy_data_path):
        return

    print("Downloading FEVER data splits...")
    with requests.Session() as current_session:
        response = current_session.get(toy_url,
                                       params={'id': toy_data_url_id},
                                       stream=True)
        response.raise_for_status()
        # The body is streamed, so it must be consumed while the session
        # (and its underlying connection) is still open.
        save_response_content(response, toy_data_path)
    print("Download completed!")

    print("Extracting dataset...")
    with zipfile.ZipFile(toy_data_path) as loaded_zip:
        loaded_zip.extractall(data_path)
    print("Extraction completed!")

# Fetch the data into ./dataset (download_data skips the download and the
# extraction when the archive file already exists).
download_data('dataset')

## Preprocessing

Light preprocessing is done to each sentence:
- the dataset comes from English Wikipedia and contains IPA phonetic alphabet symbols: we replace every character that is not an ASCII letter or digit with a space to filter them out;  
- bracket symbols: `["-LRB-", "-LSB-", "-RRB-", "-RSB-"]` are extensively used, we remove them as stopwords;  
- every word is lowercased so as to match the selected GloVe word embedding representations;  
- SUPPORT/REFUTES labels are converted to binary.

Every claim and evidence sentence is processed in the same way.  


In [41]:
#@title { form-width: "31%" }
import pandas as pd
# implement preprocessor pipeline, show its effect on a single sentence

import re
from functools import reduce

# Matches every single character that is NOT an ASCII letter or digit.
GOOD_SYMBOLS_RE = re.compile('[^0-9a-zA-Z]')
# Bracket placeholder tokens that are dropped from every sentence.
STOPWORDS = ["-LRB-", "-LSB-", "-RRB-", "-RSB-"]

def lower(text):
    """
    Returns a lower-cased copy of the given text.
    Example:
    Input: 'I really like New York city'
    Output: 'i really like new york city'
    """

    lowered = text.lower()
    return lowered

def filter_out_uncommon_symbols(text):
    """
    Replaces with a single space every character matched by
    GOOD_SYMBOLS_RE, i.e. every character that is not an ASCII
    letter or digit. The text length is therefore unchanged.
    """

    cleaned = re.sub(GOOD_SYMBOLS_RE, ' ', text)
    return cleaned

def remove_stopwords(text):
    """
    Drops every whitespace-separated token listed in STOPWORDS and
    joins the remaining tokens with single spaces.
    """
    kept_tokens = []
    for token in text.split():
        if token in STOPWORDS:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Default cleaning steps, applied in this order by text_prepare.
# Stopword removal comes first because the stopword tokens are matched
# verbatim (upper case, with dashes) and the later steps would alter them.
PREPROCESSING_PIPELINE = [
                          remove_stopwords,
                          filter_out_uncommon_symbols,
                          lower
                          ]

# Anchor method

def text_prepare(text, filter_methods=None):
    """
    Runs the text through a sequence of pre-processing functions,
    feeding the output of each one into the next.
    When filter_methods is None, PREPROCESSING_PIPELINE is used.
    Note that the order is important here!
    """

    if filter_methods is None:
        filter_methods = PREPROCESSING_PIPELINE

    processed = text
    for step in filter_methods:
        processed = step(processed)
    return processed

# Folder holding the extracted CSV splits.
dir = "dataset/"
# Column names of the CSV files (not referenced by the code visible here).
names = ["N", "Claim",	"Evidence",	"ID",	"Label"]

# create pandas dataframes
df_train = pd.read_csv(dir+"train_pairs.csv")
df_val = pd.read_csv(dir+"val_pairs.csv")
df_test = pd.read_csv(dir+"test_pairs.csv")

# converts string label to binary class: 'SUPPORTS' -> 1, anything else -> 0
df_train['Label'] = df_train['Label'].map(lambda x: 1 if x=='SUPPORTS' else 0)
df_val['Label'] = df_val['Label'].map(lambda x: 1 if x=='SUPPORTS' else 0)
df_test['Label'] = df_test['Label'].map(lambda x: 1 if x=='SUPPORTS' else 0)

# Keep copies that still hold the raw sentence strings (binary labels already
# applied); the BERT section builds its sequences from these.
df_train_strings = df_train.copy()
df_val_strings = df_val.copy()
df_test_strings = df_test.copy()

# Drop the first column of each split, in place.
df_train.drop(df_train.columns[0], axis='columns', inplace=True)
df_val.drop(df_val.columns[0], axis='columns', inplace=True)
df_test.drop(df_test.columns[0], axis='columns', inplace=True)

# Show the effect of the preprocessing pipeline on one evidence sentence.
evidence = df_train["Evidence"][0]
print('Original evidence text:')
print(evidence,end='\n\n')
print('Uncommon symbols and stopwords removal:')
print(text_prepare(evidence),end='\n\n')

# apply the text_prepare pipeline to each sentence and split it into tokens
# drops the first token of each evidence, that is: the sentence pair id
df_train["Claim"] = df_train["Claim"].map(lambda s: text_prepare(s).split())
df_train["Evidence"] = df_train["Evidence"].map(lambda s: text_prepare(s).split()[1:])
df_val["Claim"] = df_val["Claim"].map(lambda s: text_prepare(s).split())
df_val["Evidence"] = df_val["Evidence"].map(lambda s: text_prepare(s).split()[1:])
df_test["Claim"] = df_test["Claim"].map(lambda s: text_prepare(s).split())
df_test["Evidence"] = df_test["Evidence"].map(lambda s: text_prepare(s).split()[1:])

# Last expression of the cell: displays the tokenized training frame.
df_train

Original evidence text:
2	Hemsworth has also appeared in the science fiction action film Star Trek -LRB- 2009 -RRB- , the thriller adventure A Perfect Getaway -LRB- 2009 -RRB- , the horror comedy The Cabin in the Woods -LRB- 2012 -RRB- , the dark-fantasy action film Snow White and the Huntsman -LRB- 2012 -RRB- , the war film Red Dawn -LRB- 2012 -RRB- , and the biographical sports drama film Rush -LRB- 2013 -RRB- .	Star Trek	Star Trek (film)	A Perfect Getaway	A Perfect Getaway	The Cabin in the Woods	The Cabin in the Woods	Snow White and the Huntsman	Snow White and the Huntsman	Red Dawn	Red Dawn (2012 film)	Rush	Rush (2013 film)

Uncommon symbols and stopwords removal:
2 hemsworth has also appeared in the science fiction action film star trek 2009   the thriller adventure a perfect getaway 2009   the horror comedy the cabin in the woods 2012   the dark fantasy action film snow white and the huntsman 2012   the war film red dawn 2012   and the biographical sports drama film rush 2013   st

Unnamed: 0,Claim,Evidence,ID,Label
0,"[chris, hemsworth, appeared, in, a, perfect, g...","[hemsworth, has, also, appeared, in, the, scie...",3,1
1,"[roald, dahl, is, a, writer]","[roald, dahl, langpron, ro, ld, d, l, u, l, d,...",7,1
2,"[roald, dahl, is, a, governor]","[roald, dahl, langpron, ro, ld, d, l, u, l, d,...",8,0
3,"[ireland, has, relatively, low, lying, mountains]","[the, island, s, geography, comprises, relativ...",9,1
4,"[ireland, does, not, have, relatively, low, ly...","[the, island, s, geography, comprises, relativ...",10,0
...,...,...,...,...
121735,"[april, was, the, month, anderson, silva, was,...","[anderson, da, silva, de, s, siwv, born, april...",229440,1
121736,"[anderson, silva, is, an, american, brazilian,...","[anderson, da, silva, de, s, siwv, born, april...",229443,0
121737,"[anderson, silva, is, incapable, of, being, a,...","[anderson, da, silva, de, s, siwv, born, april...",229444,0
121738,"[anderson, silva, was, born, on, the, month, o...","[anderson, da, silva, de, s, siwv, born, april...",229445,1


## Embedding and vocabularies
Here we download the pre-trained GloVe word embeddings of the selected size.  
They are then used to incrementally build the Train, Validation and Test vocabularies, respectively v2, v3 and v4.  
The corresponding `embedding_matrix_v2`, `embedding_matrix_v3` and `embedding_matrix_v4` will be passed to the train, validation and test models to keep indices consistent.   

The code proceeds as follows:

repeat 1-6 for each one of train, val, test different vocabularies
1. build separate vocabularies
2. merge with previous vocabulary with consistent indices
3. parse txt data to categorical (for embedding layer)
4. build inverse vocabulary
5. embed OOV terms with random vectors
6. concatenate pad and unk vectors to embedding

In [42]:
#@title { form-width: "31%" }

import gensim
import gensim.downloader as gloader
from collections import OrderedDict, Counter
from itertools import chain
from functools import reduce

from tqdm import tqdm

def build_unique_voc(words, special_tokens=None):
    """
    Builds a vocabulary from an iterable of token lists.

    Indices are assigned first to ``special_tokens`` (in the given order)
    and then to the remaining distinct tokens in sorted order, so the
    mapping is identical across runs regardless of hash randomization.
    A special token that also occurs in ``words`` keeps its special index
    only (it is not assigned a second one).

    Returns the pair ``(w2i, i2w)``: token -> index and index -> token,
    both as OrderedDict.
    """
    special_tokens = [] if special_tokens is None else list(special_tokens)
    reserved = set(special_tokens)

    wordset = set()
    for wordlist in words:
        wordset.update(wordlist)

    # key=str keeps the order deterministic and avoids comparing
    # tokens of different types directly.
    ordered_words = sorted(wordset - reserved, key=str)

    w2i = OrderedDict()
    i2w = OrderedDict()
    for i, w in enumerate(chain(special_tokens, ordered_words)):
        w2i[w] = i
        i2w[i] = w
    return w2i, i2w
    
def text_to_categorical(df, w2i):
    """
    Replaces, in place, the tokens of the 'Claim' and 'Evidence' columns
    with their indices in w2i; tokens missing from w2i get the UNK index.
    Nothing is returned.
    """
    def encode(tokens):
        return [w2i.get(token, w2i[UNK]) for token in tokens]

    for column in ('Claim', 'Evidence'):
        df[column] = df[column].map(encode)

def load_embedding_model(embedding_dimension=50):
    """
    Loads the "glove-wiki-gigaword-<dimension>" model through the gensim
    downloader and returns it. A ValueError raised by the downloader is
    reported on stdout and re-raised.
    """
    model_name = "glove-wiki-gigaword-{}".format(embedding_dimension)
    try:
        return gloader.load(model_name)
    except ValueError as err:
        print("Error downloading GloVe")
        raise err

def merge_voc(old_voc, add_voc):
    """
    Extends a vocabulary with the words of another one.

    Existing words keep their index; each word of add_voc that is missing
    from old_voc is appended with the next free index, counting up from
    len(old_voc). Returns (merged vocabulary, list of the newly added words).
    old_voc itself is not modified.
    """
    merged = old_voc.copy()
    new_words = [word for word in add_voc.keys() if word not in old_voc]
    for index, word in enumerate(new_words, start=len(old_voc)):
        merged[word] = index
    return merged, new_words

def embedd_OOV_terms(embedding_model, oov_terms, co_occurrence_matrix, w2i, i2w, rnd_OOV = False):
    """
    Adds an embedding vector to embedding_model for every OOV word.

    With rnd_OOV=True each word gets a uniform random vector
    (np.random.rand) and co_occurrence_matrix is not used.
    Otherwise the vector is the count-weighted average of the embeddings
    of the word's co-occurring neighbours that are not OOV themselves,
    read from the rows of co_occurrence_matrix[w2i[word]] (each row must
    expose .data and .indices). A word with no usable neighbour falls
    back to a random vector instead of producing NaNs.

    The vector size is taken from embedding_model.vector_size when that
    attribute exists, else from the module-level embedding_dimension.
    The model is modified in place through embedding_model.add and is
    also returned.
    """
    dim = getattr(embedding_model, "vector_size", None)
    if dim is None:
        dim = embedding_dimension

    # set membership instead of scanning the list for every neighbour
    oov_set = set(oov_terms)

    for word in oov_terms:
        oov_vec = None
        if not rnd_OOV:
            weighted_sum = np.zeros(dim)
            weights_acc = 0
            for count_row in co_occurrence_matrix[w2i[word]]:
                for count, index in zip(count_row.data, count_row.indices):
                    neighbour = i2w[index]
                    if neighbour not in oov_set:
                        weights_acc += count
                        weighted_sum += count*embedding_model[neighbour]
            if weights_acc:
                oov_vec = weighted_sum/weights_acc
        if oov_vec is None:
            # random mode, or no in-vocabulary neighbour to average
            oov_vec = np.random.rand(dim)
        embedding_model.add(word, oov_vec)

    return embedding_model

# download pretrained GloVe embedding
embedding_dimension = 50 #@param [50, 100, 300] {type:"raw"}
print("Downloading Glove embedding with dimension:", embedding_dimension)
print("Be ",int(np.sqrt((embedding_dimension//50 - 1)))*"very ","patient :)",sep='')
embedding_model = load_embedding_model(embedding_dimension)

# allows "PAD" to have index zero, crucial for consistency
PAD, UNK = '<pad>', '<unk>'
pad_unk_vec = np.zeros((2, embedding_dimension)) # row 0: pad (all zeros)
pad_unk_vec[1] = np.random.rand(embedding_dimension) # row 1: unk (random)
# v1: PAD and UNK first, then the GloVe words in the model's own order, so
# that index i of v1 is row i of concatenate((pad_unk_vec, model vectors)).
# NOTE(review): `.vocab` (here) and `.add` (in embedd_OOV_terms) look like the
# gensim 3.x KeyedVectors API -- confirm the installed gensim version.
v1 = {e:i for i,e in enumerate(chain([PAD, UNK], embedding_model.vocab.keys()))}


# build vocabularies and embedding matrices
#
# Each stage extends the previous vocabulary with the words of one more
# split, converts that split to indices and snapshots the embedding matrix.
# pd.concat replaces Series.append, which was removed in pandas 2.0.
# embedd_OOV_terms adds the new vectors to embedding_model in place and
# returns it, so `embeding_model` (name kept as in the original) is the
# same object as `embedding_model`.

# v2 (glove + train)
print("Building train vocabulary...")
w2i_train, _ = build_unique_voc(pd.concat([df_train["Claim"], df_train["Evidence"]]))
v2, oov1 = merge_voc(v1, w2i_train)
print(f"Found {len(oov1)} oov words: {len(oov1)/len(w2i_train):.2%}")
text_to_categorical(df_train, v2)
inv2 = {v:k for k,v in v2.items()}

embeding_model = embedd_OOV_terms(embedding_model, oov1, None, v2, inv2, rnd_OOV=True)
embedding_matrix_v2 = np.concatenate((pad_unk_vec, embedding_model.vectors.copy()))

# v3 (glove + train + val)
print("Building validation vocabulary...")
w2i_val, _ = build_unique_voc(pd.concat([df_val["Claim"], df_val["Evidence"]]))
v3, oov2 = merge_voc(v2, w2i_val)
print(f"Found {len(oov2)} oov words: {len(oov2)/len(w2i_val):.2%}")
text_to_categorical(df_val, v3)
inv3 = {v:k for k,v in v3.items()}

embeding_model = embedd_OOV_terms(embedding_model, oov2, None, v3, inv3, rnd_OOV=True)
embedding_matrix_v3 = np.concatenate((pad_unk_vec, embedding_model.vectors.copy()))

# v4 (glove + train + val + test)
print("Building test vocabulary...")
w2i_test, _ = build_unique_voc(pd.concat([df_test["Claim"], df_test["Evidence"]]))
v4, oov3 = merge_voc(v3, w2i_test)
print(f"Found {len(oov3)} oov words: {len(oov3)/len(w2i_test):.2%}")
text_to_categorical(df_test, v4)
inv4 = {v:k for k,v in v4.items()}

embeding_model = embedd_OOV_terms(embedding_model, oov3, None, v4, inv4, rnd_OOV=True)
embedding_matrix_v4 = np.concatenate((pad_unk_vec, embedding_model.vectors.copy()))


Downloading Glove embedding with dimension: 50
Be patient :)
Building train vocabulary...
Found 2311 oov words: 7.15%
Building validation vocabulary...
Found 193 oov words: 2.17%
Building test vocabulary...
Found 235 oov words: 2.37%


## Data iterators
A data iterator provides batches of `(claim, evidence), label` to the model.  
We build train and validation ones.

In [43]:
# @title { form-width: "30%" }
# helper class to iterate the data
class DataIterator:
    """
    Iterates over a DataFrame in shuffled batches of (claim, evidence, label).

    The frame must have 'Claim' and 'Evidence' columns holding sequences of
    token indices and a 'Label' column. Every sentence is truncated or
    right-padded with zeros to ``sequence_len``.

    The iterator does not rewind by itself: ``__iter__`` returns the object
    as is, so call ``shuffle()`` before each pass to reshuffle, re-batch and
    restart from the first batch.
    """

    def __init__(self, df, sequence_len, batch_size):
        self.X = df[['Claim', 'Evidence']]
        self.Y = df['Label'].to_numpy()
        assert self.X.shape[0] == self.Y.shape[0]
        self.num_sentence = self.X.shape[0]
        self.batch_size = batch_size
        # pad sentences; columns are walked directly instead of indexing
        # every row positionally, which is both slow and deprecated on a
        # label-indexed Series
        self.data_list = []
        rows = zip(self.X['Claim'], self.X['Evidence'], self.Y)
        for claim, evidence, label in rows:
            c = self._pad(claim, sequence_len)
            e = self._pad(evidence, sequence_len)
            self.data_list.append((c, e, label))
        self.shuffle()

    @staticmethod
    def _pad(tokens, sequence_len):
        """Return a float array of length sequence_len: tokens, then zeros."""
        padded = np.zeros((sequence_len))
        length = min(len(tokens), sequence_len)
        padded[:length] = tokens[:length]
        return padded

    def shuffle(self):
        """Shuffle the samples, rebuild the batches and rewind the iterator."""
        self.current = 0
        random.shuffle(self.data_list)
        # batch the data; the last batch may be smaller than batch_size
        self.batches_c = []
        self.batches_e = []
        self.batches_y = []
        for start in range(0, self.num_sentence, self.batch_size):
            chunk = self.data_list[start:start + self.batch_size]
            self.batches_c.append(np.array([c for c, _, _ in chunk]))
            self.batches_e.append(np.array([e for _, e, _ in chunk]))
            self.batches_y.append(np.array([y for _, _, y in chunk]))

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next (claim, evidence, label) batch as float32 tensors."""
        if self.current >= len(self.batches_c):
            raise StopIteration
        claim = tf.cast(self.batches_c[self.current], tf.float32)
        evidence = tf.cast(self.batches_e[self.current], tf.float32)
        y = tf.cast(self.batches_y[self.current], tf.float32)
        self.current += 1
        return claim, evidence, y
 
# hyperparameters

# Every sentence is padded or truncated to this many tokens.
sequence_len = 216 # marks the longest sentence token count
batch_size =   128 #@param {type:"integer"}

# Padding and the first shuffle happen inside the DataIterator constructor.
print("Building Data Iterators")
train_data = DataIterator(df_train, sequence_len, batch_size)
val_data = DataIterator(df_val, sequence_len, batch_size)
print("Built")

Building Data Iterators
Built


# Model
We define a `BaseModel` class that implements common attributes and methods for all possible models,  
`sentence_embedder` and `embedding_merger` required arguments allow for model customization:  
the required embedding and merging strategies are implemented as `keras.layers` and passed to the constructor at instantiation time, to allow for building each feasible combination.  


In [44]:
#@title Model definition { form-width: "31%" }

from sklearn.metrics import f1_score
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (Input, LayerNormalization, LSTM, Dense, 
                                     Bidirectional, Embedding, 
                                     concatenate, Add, Average, Flatten, Dropout)

################################################################################
class BaseModel(tf.keras.Model):
    """
    Claim/evidence classifier with pluggable sentence embedding and merging.

    Both inputs go through the same word-embedding layer and the same
    sentence embedder; the two sentence embeddings are combined by the
    merger and classified by a 2-unit softmax layer.

    Arguments:
        batch_size, sequence_len: shape of the two index inputs, used for
            the Input layers that `summary` relies on.
        embedding_matrix: (vocabulary size, embedding dim) initial weights
            of the word-embedding layer.
        sentence_embedder: layer mapping word embeddings to one vector
            per sentence.
        embedding_merger: layer called as merger(claim_vec, evidence_vec).
        train_word_emb: whether the word-embedding weights are trainable.
    """

    def __init__(self, batch_size, sequence_len, embedding_matrix,
                 sentence_embedder, embedding_merger, train_word_emb = True,**kwargs):

        super().__init__(**kwargs)

        word_voc = embedding_matrix.shape[0]
        embedding_dim = embedding_matrix.shape[1]

        # Stored under its own name: assigning to `self.trainable` would set
        # the Keras model-level flag and freeze every layer, not only the
        # word embedding, when train_word_emb is False.
        self.train_word_emb = train_word_emb

        self.input_layer_claim = Input(batch_input_shape=(batch_size, sequence_len),
                                       name="claim_input")
        self.input_layer_evidence = Input(batch_input_shape=(batch_size, sequence_len),
                                          name="evidence_input")

        # mask_zero=True: index 0 is treated as padding
        self.word_embedding = Embedding(word_voc, embedding_dim,
                            weights=[embedding_matrix], trainable=self.train_word_emb,
                            mask_zero=True, name="word_embedding")

        self.sentence_embedder = sentence_embedder
        self.embedding_merger = embedding_merger

        self.classifier = Dense(2, activation="softmax", name="classifier")


    def call(self, c, e):
        """Return the class probabilities for claim indices c and evidence indices e."""
        claim_word_embedding = self.word_embedding(c)
        evidence_word_embedding = self.word_embedding(e)

        claim_sentence_embedding = self.sentence_embedder(claim_word_embedding)
        evidence_sentence_embedding = self.sentence_embedder(evidence_word_embedding)
        merged_embedding = self.embedding_merger(claim_sentence_embedding,
                                            evidence_sentence_embedding)

        output = self.classifier(merged_embedding)
        return output

    def loss_function(self, y, predictions):
        """Mean sparse categorical cross-entropy of probabilities vs. integer labels."""
        sce = tf.keras.losses.sparse_categorical_crossentropy(
                y, predictions, from_logits=False)
        return tf.reduce_mean(sce)

    def update_metrics(self, y, predictions, loss_obj, acc_obj, f1_obj):
        """
        Feed one batch into the three metric objects: the batch loss,
        the accuracy of the arg-max predictions and the batch macro-F1
        (computed with sklearn, so y must be an eager tensor).
        """
        # loss
        current_loss = self.loss_function(y, predictions)
        loss_obj.update_state(current_loss)

        pred_np = np.argmax(predictions, axis=-1)
        # accuracy
        acc_obj.update_state(y, pred_np)
        # f1 macro
        y_np = y.numpy().astype(np.int32)
        f1 = f1_score(y_np.flatten(), pred_np.flatten(),
                        average='macro', zero_division=0,
                        labels=[0,1])
        f1_obj.update_state(f1)

    # consistently copies embedding weights from one model to another
    def copy_weights_from(self, other):
        """
        Copy the weights of `other` into this model, layer by layer.

        Layers are paired by position. For the word embedding, the rows of
        `other` replace the leading rows of this model's matrix and this
        model's remaining rows are kept, so a model built on a larger
        vocabulary can receive the weights of a smaller one.
        """
        for layer, other_layer in zip(self.layers, other.layers):
            if layer.name == "word_embedding":
                from_weights = other_layer.get_weights()[0]
                new_weights = layer.get_weights()[0][from_weights.shape[0]:]
                layer.set_weights([np.vstack((from_weights, new_weights))])
            else:
                layer.set_weights(other_layer.get_weights())

    def summary(self):
        """Print the summary of a functional model wrapping `call` on the stored inputs."""
        model = Model(inputs=[self.input_layer_claim, self.input_layer_evidence],
                      outputs=self.call(self.input_layer_claim, self.input_layer_evidence))
        model.summary()

################################################################################
########################  CUSTOM SENTENCE EMBEDDERS  ###########################
################################################################################

class BOVEmbeddeing(tf.keras.layers.Layer):
    """Bag-of-vectors sentence embedder: the mean of its input along axis 1."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def call(self, x):
        # No mask is applied here: every position along axis 1 is averaged.
        averaged = tf.reduce_mean(x, axis=1)
        return averaged

class MLPEmbedding(tf.keras.layers.Layer):
    """Sentence embedder that flattens its input and applies one Dense layer."""

    def __init__(self, embedding_dim, **kwargs):
        super().__init__(**kwargs)
        self.d1 = Dense(embedding_dim)  # linear projection, no activation
        self.flatten = Flatten()

    def call(self, x):
        return self.d1(self.flatten(x))

class RNNEmbedding(tf.keras.layers.Layer):
    """
    Bidirectional-LSTM sentence embedder.

    With average_all_outputs=True the LSTM returns the whole output
    sequence and its mean along axis 1 is the embedding; otherwise the
    output of the bidirectional LSTM (return_sequences=False) is returned
    unchanged.
    """

    def __init__(self, latent_dim, average_all_outputs, **kwargs):
        super().__init__(**kwargs)
        self.average_all_outputs = average_all_outputs
        recurrent = LSTM(latent_dim, return_state=False,
                         return_sequences=self.average_all_outputs)
        self.lstm = Bidirectional(recurrent)

    def call(self, x):
        encoded = self.lstm(x)
        if not self.average_all_outputs:
            return encoded
        return tf.reduce_mean(encoded, axis=1)

################################################################################
#########################  CUSTOM EMBEDDING MERGERS  ###########################
################################################################################

class Merge(tf.keras.layers.Layer):
    """
    Base class of the embedding mergers.

    Subclasses implement `compute(a, b)`. When `cosine` is true, the
    per-sample cosine similarity of a and b is appended as one extra
    feature to the result of `compute`. Inputs are expected to be
    (batch, features) tensors, as required by the Dot((1, 1)) layer.
    """

    def __init__(self, cosine, **kwargs):
        super().__init__(**kwargs)
        self.cosine = cosine
        self.dot = tf.keras.layers.Dot((1, 1))

    def compute(self, a, b):
        """Combine the two sentence embeddings; must be overridden."""
        raise NotImplementedError("Merge subclasses must implement compute(a, b)")

    def call(self, a, b):
        computed = self.compute(a, b)
        if not self.cosine:
            return computed
        # Norms are taken per sample (axis=1). Without `axis`, tf.norm
        # reduces over the whole batch, which is not a cosine similarity.
        norms = (tf.norm(a, axis=1, keepdims=True)
                 * tf.norm(b, axis=1, keepdims=True))
        cs = self.dot([a, b])/(norms+1e-9)  # epsilon guards all-zero vectors
        return concatenate([computed, cs])

class ConcatMerge(Merge):
    """Merger that concatenates the two sentence embeddings."""

    def __init__(self, cosine, **kwargs):
        super().__init__(cosine, **kwargs)

    def compute(self, a, b):
        joined = concatenate([a, b])
        return joined

class SumMerge(Merge):
    """Merger that adds the two sentence embeddings element-wise."""

    def __init__(self, cosine, **kwargs):
        super().__init__(cosine, **kwargs)
        self.add_layer = Add()

    def compute(self, a, b):
        summed = self.add_layer([a, b])
        return summed

class AvgMerge(Merge):
    """Merger that averages the two sentence embeddings element-wise."""

    def __init__(self, cosine, **kwargs):
        super().__init__(cosine, **kwargs)
        self.avg_layer = Average()

    def compute(self, a, b):
        averaged = self.avg_layer([a, b])
        return averaged


## Training
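We train the selected model with the Adam optimizer and stop early when the validation accuracy stops improving.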

In [None]:
#@title { form-width: "31%" }

# Number of units of each LSTM direction in the RNN sentence embedders.
RNN_latent_dim = 64#@param {type:"integer"}

# One instance of every available sentence embedder ...
RNNlast = RNNEmbedding(latent_dim=RNN_latent_dim, average_all_outputs=False, name="RNNEmbedding-last")
RNNavg = RNNEmbedding(latent_dim=RNN_latent_dim, average_all_outputs=True, name="RNNEmbedding-avg")
MLP = MLPEmbedding(embedding_dim=50, name="MLPEmbedding")
BOV = BOVEmbeddeing(name="BOVEmbedding")

# ... and of every merger, with and without the extra cosine feature.
concat_merge = ConcatMerge(cosine=False, name="ConcatMerge")
concat_merge_cosine = ConcatMerge(cosine=True, name="ConcatMerge-cos")

sum_merge = SumMerge(cosine=False, name="SumMerge")
sum_merge_cosine = SumMerge(cosine=True, name="SumMerge-cos")

avg_merge = AvgMerge(cosine=False, name="AvgMerge")
avg_merge_cosine = AvgMerge(cosine=True, name="AvgMerge-cos")

################################################################################

# The combination actually trained below (selected through the Colab form).
sentence_embedder = RNNlast #@param ['RNNlast', 'RNNavg', 'MLP', 'BOV'] {type:"raw"}
embedding_merger = concat_merge #@param ['concat_merge', 'concat_merge_cosine','sum_merge','sum_merge_cosine','avg_merge','avg_merge_cosine'] {type:"raw"}

# Upper bound on epochs; early stopping in the training loop may end sooner.
max_epochs = 100
learning_rate = 5e-4 #@param {type:"number"}


@tf.function
def train_step_graph_fn(optimizer, model, c, e, y):
    """
    Runs one optimization step on the batch (c, e, y): forward pass,
    loss from model.loss_function, gradients w.r.t. the model's trainable
    variables, optimizer update. Returns the predictions of the forward pass.
    """
    with tf.GradientTape() as recorder:
        predictions = model(c, e)
        batch_loss = model.loss_function(y, predictions)
    variables = model.trainable_variables
    grads = recorder.gradient(batch_loss, variables)
    optimizer.apply_gradients(zip(grads, variables))
    return predictions

@tf.function
def val_step_graph_fn(model, c, e):
    """Forward pass only: returns the model's predictions for the batch (c, e)."""
    return model(c, e)

# Three models that differ only in the size of their embedding matrix
# (train / train+val / train+val+test vocabularies).
# NOTE(review): the same `sentence_embedder` and `embedding_merger` objects
# are passed to all three constructors and BaseModel stores them as is, so
# these layers (and their weights) are shared between the models; only the
# word embedding and the classifier are per-model. In particular best_model
# does not hold its own snapshot of the sentence embedder -- confirm this is
# intended.
# train model
model_name = f"{sentence_embedder.name}__{embedding_merger.name}"
model = BaseModel(batch_size, sequence_len, embedding_matrix_v2, 
                  sentence_embedder, embedding_merger, name=model_name)
model.summary()
# val model
val_model = BaseModel(batch_size, sequence_len, embedding_matrix_v3,
                        sentence_embedder, embedding_merger, name=model_name)
# best model
best_model = BaseModel(batch_size, sequence_len, embedding_matrix_v4,
                        sentence_embedder, embedding_merger, name=model_name)
# build the graph model: call val_model and best_model once on a real batch
# so their layers exist before copy_weights_from is used on them
train_data.shuffle()
c, e, _ = train_data.__next__()
val_model(c, e)
best_model(c, e)

# Start the Weights & Biases run and record the hyperparameters.
if ENABLE_WANDB:
    wandb_experiment_name = f"{model.name}_b{batch_size}_lr{learning_rate:.0e}"
    # The run must exist before wandb.config can be written to.
    wandb.init(project="NLP04", name=wandb_experiment_name)
    wandb.config.batch_size = batch_size
    # RNN_latent_dim is the variable defined in this cell; `latent_dim`
    # was never defined.
    wandb.config.latent_dim = RNN_latent_dim
    wandb.config.learning_rate = learning_rate
    # key spelling kept as in the original code
    wandb.config.emdedding_dim = embedding_dimension
    wandb.config.model = model.name

# metrics: running means of the per-batch loss and macro-F1, plus accuracy,
# kept separately for training and validation
train_loss_obj = tf.keras.metrics.Mean(name='train_loss')
train_f1_obj = tf.keras.metrics.Mean(name='train_f1')
train_acc_obj = tf.keras.metrics.Accuracy(name='train_accuracy')
val_loss_obj = tf.keras.metrics.Mean(name='val_loss')
val_f1_obj = tf.keras.metrics.Mean(name='val_f1')
val_acc_obj = tf.keras.metrics.Accuracy(name='val_accuracy')

# optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# train loop: early-stopping state
max_val_acc = -1          # best validation accuracy seen so far
not_improving = 0         # consecutive validations without improvement
max_iter_not_improv = 3   # stop after this many non-improving validations

# One iteration per epoch: train on every batch, validate, then apply early
# stopping on the validation accuracy.
for epoch in range(max_epochs):
    train_loss_obj.reset_states()
    train_f1_obj.reset_states()
    train_acc_obj.reset_states()

    # shuffle() also rewinds the iterator, which does not reset by itself
    train_data.shuffle()
    for c, e, y in train_data:
        predictions = train_step_graph_fn(optimizer, model, c, e, y)
        model.update_metrics(y, predictions, train_loss_obj, train_acc_obj, train_f1_obj)

    print("{}.  \t[TRAINING]\t  loss: {}  \t accuracy: {} \t f1-macro: {}".format(epoch, 
                                                      train_loss_obj.result(),
                                                      train_acc_obj.result(),
                                                      train_f1_obj.result()))
    if ENABLE_WANDB:
        wandb.log({
            'train_loss': train_loss_obj.result(),
            'train_accuracy': train_acc_obj.result(),
            'train_f1': train_f1_obj.result()
        }, step=epoch)

    # validation (currently run after every epoch)
    if True:# epoch%5 == 4:
        val_loss_obj.reset_states()
        val_acc_obj.reset_states()
        val_f1_obj.reset_states()

        # bring the validation model up to date with the trained weights
        val_model.copy_weights_from(model)
        val_data.shuffle()
        for c, e, y in val_data:
            predictions = val_step_graph_fn(val_model, c, e)
            val_model.update_metrics(y, predictions, val_loss_obj, val_acc_obj, val_f1_obj)

        print("     \t[VALIDATION]\t   loss: {}  \t  accuracy: {} \t  f1-macro: {}".format(
                                                      val_loss_obj.result(),
                                                      val_acc_obj.result(),
                                                      val_f1_obj.result()))
        if ENABLE_WANDB:
            wandb.log({
                'val_loss': val_loss_obj.result(),
                'val_accuracy': val_acc_obj.result(),
                'val_f1': val_f1_obj.result()
            }, step=epoch)

        # early stopping: keep the weights of the best validation accuracy
        # in best_model and give up after max_iter_not_improv consecutive
        # validations without improvement
        if val_acc_obj.result() > max_val_acc:
            best_model.copy_weights_from(val_model)
            max_val_acc = val_acc_obj.result()
            not_improving = 0
        else:
            not_improving += 1
            print("VALIDATION ACCURACY NOT IMPROVING, STRIKE:", not_improving,"!!")
            if not_improving >= max_iter_not_improv:
                print("Validation accuracy not improving for", max_iter_not_improv,
                      "successive computations.")
                print("YOU ARE OUT !!!")
                # max_val_acc holds a metric result (tensor) by now
                print("Best model accuracy:", max_val_acc.numpy())
                break

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
claim_input (InputLayer)        [(128, 216)]         0                                            
__________________________________________________________________________________________________
evidence_input (InputLayer)     [(128, 216)]         0                                            
__________________________________________________________________________________________________
word_embedding (Embedding)      (128, 216, 50)       20115650    claim_input[0][0]                
                                                                 evidence_input[0][0]             
__________________________________________________________________________________________________
RNNEmbedding-last (RNNEmbedding (128, 128)           58880       word_embedding[0][0]         

## Evaluation on test

In [46]:
#@title { form-width: "31%" }

# Evaluate the best model (by validation accuracy) on the test split.
test_loss_obj = tf.keras.metrics.Mean(name='test_loss')
test_f1_obj = tf.keras.metrics.Mean(name='test_f1')
test_acc_obj = tf.keras.metrics.Accuracy(name='test_accuracy')

test_data = DataIterator(df_test, sequence_len, batch_size)

# shuffle() rewinds the iterator before the pass over the test batches
test_data.shuffle()
for c, e, y in test_data:
    predictions = val_step_graph_fn(best_model, c, e)
    best_model.update_metrics(y, predictions, test_loss_obj, test_acc_obj, test_f1_obj)

test_log = "\nTEST loss: {}  \t accuracy: {} \t f1-macro: {}\n".format(
                                                test_loss_obj.result(),
                                                test_acc_obj.result(),
                                                test_f1_obj.result())
if ENABLE_WANDB:
    # closing tag fixed: the block was previously left open with a second <pre>
    wandb.log({"Test": wandb.Html(
        "<pre>"+test_log+"</pre>", inject=False)})
print(test_log)


TEST loss: 0.555887758731842  	 accuracy: 0.7259702086448669 	 f1-macro: 0.7153576612472534



# BERT

We play with transfer learning via a pre-trained BERT model to explore the state-of-the-art in sentence embedding.  
We select a simple functional model and implement the data iterator as a versatile `keras.utils.Sequence`

In [51]:
!pip install -q -U sentence-transformers
from keras.utils import Sequence
from sentence_transformers import SentenceTransformer

#Semantic Textual Similarity trained model
# Loaded once at module level; BERTSequence.__getitem__ uses it to encode
# every batch of claims and evidences.
bert_model = SentenceTransformer('distilbert-base-nli-mean-tokens')

class BERTSequence(Sequence):
    """
    Keras Sequence that yields ((claim embeddings, evidence embeddings), labels)
    batches, encoding the 'Claim' and 'Evidence' columns of df with the
    module-level bert_model each time a batch is requested.
    """

    def __init__(self, batch_size, df):
        self.batch_size = batch_size
        self.df = df
        self.claims, self.evidences, self.labels = df["Claim"], df["Evidence"], df["Label"]

    def __len__(self):
        # number of batches; the last one may be smaller than batch_size
        n_rows = len(self.df)
        return math.ceil(n_rows/self.batch_size)

    def __getitem__(self, idx):
        window = slice(idx * self.batch_size, (idx + 1) * self.batch_size)
        processed_c = bert_model.encode(list(self.claims[window]))
        processed_e = bert_model.encode(list(self.evidences[window]))
        batch_y = self.labels[window].to_numpy()
        return (processed_c, processed_e), batch_y

# NOTE: this rebinds the global batch_size used by the earlier cells.
batch_size = 512
# Built from the *_strings frames, which still hold the raw sentences.
train_sequence = BERTSequence(batch_size, df_train_strings)
val_sequence = BERTSequence(batch_size, df_val_strings)

In [54]:
# BERT classification model


# Linear softmax classifier on top of the concatenated BERT sentence
# embeddings. `concat_merge` is the ConcatMerge(cosine=False) layer created
# in the training cell.
input_c = Input(shape=(768,)) # BERT sentence embedded claims
input_e = Input(shape=(768,)) # BERT sentence embedded evidences
merged = concat_merge(input_c, input_e)
output = Dense(2, activation="softmax")(merged)

model = tf.keras.Model(inputs=(input_c, input_e), outputs=output, name="BERT")
model.summary()

learning_rate = 5e-4

model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
    metrics=["accuracy"]
)

# batch_size is not passed: the Sequence already produces whole batches,
# and Keras does not accept the argument for this input type.
history = model.fit(train_sequence,
                    epochs=2, verbose=True, shuffle=True,
                    validation_data=val_sequence)

Model: "BERT"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_15 (InputLayer)           [(None, 768)]        0                                            
__________________________________________________________________________________________________
input_16 (InputLayer)           [(None, 768)]        0                                            
__________________________________________________________________________________________________
ConcatMerge (ConcatMerge)       multiple             0           input_15[0][0]                   
                                                                 input_16[0][0]                   
__________________________________________________________________________________________________
dense_26 (Dense)                (None, 2)            3074        ConcatMerge[5][0]             

In [55]:
# Evaluate the BERT classifier on the test split; evaluate() returns the loss
# followed by the compiled metrics (here: accuracy).
test_sequence = BERTSequence(batch_size, df_test_strings)
model.evaluate(test_sequence)



[0.5887011289596558, 0.7160940170288086]