In [1]:
import numpy as np
import tensorflow as tf
import keras
import os

# Ask TensorFlow (through its XLA flags environment variable) to enable XLA devices.
# NOTE(review): this assignment runs after `import tensorflow`; confirm the flag is
# still honoured at that point, or move it above the import.
os.environ['TF_XLA_FLAGS'] = '--tf_xla_enable_xla_devices'

# Show the TensorFlow and Keras versions, one per line.
print(tf.__version__, keras.__version__, sep='\n')

2.4.1
2.4.0


Using TensorFlow backend.


In [2]:
# Some utils
_NO_SECOND_ARG = object()  # sentinel: tells max(x) apart from max(x, <any value>)


def max(a, b=_NO_SECOND_ARG):
    """Return the larger of ``a`` and ``b``, or the largest item of ``a``.

    This helper replaces the built-in ``max`` for the whole notebook, so it has to
    support the single-iterable form as well (another cell calls ``max(scores)``).

    @input  - a : first value, or an iterable when ``b`` is omitted
    @input  - b : second value (optional)
    @return - ``a`` if ``a > b`` else ``b``; with one argument, the largest item of ``a``
    @raises - ValueError when called with a single empty iterable
    """
    if b is _NO_SECOND_ARG:
        # Local import: inside this notebook the bare name `max` is this function.
        import builtins
        return builtins.max(a)
    return a if a > b else b

In [3]:
import pandas as pd

# CSV loader
def read_csv(file, encoding):
    """Read a CSV file into a DataFrame with every missing cell set to "".

    @input  - file : path (or file-like object) handed to ``pd.read_csv``
    @input  - encoding : text encoding of the file
    @return - DataFrame without NaN cells
    """
    frame = pd.read_csv(file, encoding=encoding)
    return frame.fillna(value="")

# Read covid BERT documents
# The path is relative to the notebook's working directory; missing cells come back
# as empty strings because read_csv fills them.
faq_covidbert = read_csv("data/faqs/faq_covidbert.csv", "utf8")
# Preview the first two rows.
faq_covidbert.head(2)

Unnamed: 0,question,answer,answer_html,link,name,source,category,country,region,city,lang,last_update
0,What is a novel coronavirus?,A novel coronavirus is a new coronavirus that ...,<p>A novel coronavirus is a new coronavirus th...,\nhttps://www.cdc.gov/coronavirus/2019-ncov/fa...,Frequently Asked Questions,Center for Disease Control and Prevention (CDC),Coronavirus Disease 2019 Basics,USA,,,en,2020/03/17
1,Why is the disease being called coronavirus di...,"On February 11, 2020 the World Health Organiza...","<p>On February 11, 2020 the World Health Organ...",\nhttps://www.cdc.gov/coronavirus/2019-ncov/fa...,Frequently Asked Questions,Center for Disease Control and Prevention (CDC),Coronavirus Disease 2019 Basics,USA,,,en,2020/03/17


In [4]:
import nltk

nltk.download('averaged_perceptron_tagger')

from nltk.tag.perceptron import PerceptronTagger

# Preprocess the text with POS Tagger
# @input - sentence : string
# @return - tuples : the tagger's output for the whitespace-split tokens.
def pos_tagger(sentence):
    """Tag the whitespace-separated tokens of ``sentence`` with NLTK's PerceptronTagger.

    The tagger object is built on the first call and cached on the function itself.
    Constructing a new ``PerceptronTagger`` for every sentence repeated the same
    setup work for each of the answers tagged later in the notebook.
    """
    tagger = getattr(pos_tagger, '_tagger', None)
    if tagger is None:
        tagger = PerceptronTagger()
        pos_tagger._tagger = tagger
    # Tokens are produced by str.split(), so punctuation stays attached to words.
    return tagger.tag(sentence.split())

# Test tagger for a sentence.
# Smoke test: tag the 'answer' text of the first FAQ row.
tagged_answer0 = pos_tagger(faq_covidbert.iloc[0]['answer'])

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/wizard/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [5]:
# POS-tag every entry of the FAQ 'answer' column (one pos_tagger call per row).
tagged_answers = [pos_tagger(s) for s in faq_covidbert['answer']]

In [6]:
# Load GloVe embeddings
import numpy as np

# Location of the GloVe text file, relative to the notebook's working directory.
GLOVE_PATH = 'glove/glove.6B.200d.txt'


def load_glove(glove_path):
    """Load GloVe vectors from a text file into a ``{word: vector}`` dictionary.

    Each non-blank line is split on whitespace: the first field is the word and the
    remaining fields are parsed as float32 components.

    @input  - glove_path : path of the UTF-8 encoded GloVe text file
    @return - dict mapping each word to a 1-D float32 numpy array
    @raises - OSError if the file cannot be opened, ValueError if a component is
              not a valid number
    """
    embeddings_index = {}
    # The context manager closes the file even when a line fails to parse.
    with open(glove_path, encoding="utf8") as glove_file:
        for line in glove_file:
            values = line.split()
            if not values:
                # Blank line (e.g. a trailing newline): nothing to index.
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    print('Found {} word vectors.'.format(len(embeddings_index)))
    return embeddings_index

# Load the GloVe vectors once; later cells look words up in this dictionary.
embeddings = load_glove(GLOVE_PATH)

Found 400000 word vectors.


In [7]:
import nltk
from nltk import word_tokenize
nltk.download('punkt')

def tokenize_sentence(sentence):
    """Tokenize ``sentence`` with NLTK's ``word_tokenize`` and lower-case each token.

    @input  - sentence : string
    @return - list of lower-cased token strings
    """
    return [token.lower() for token in word_tokenize(sentence)]

# Quick check of the tokenizer: punctuation becomes its own token and all tokens
# are lower-cased (see the recorded output below).
tokens = tokenize_sentence("Hello World, Dude")
tokens

[nltk_data] Downloading package punkt to /home/wizard/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['hello', 'world', ',', 'dude']

In [8]:
# JSON data processor
# Read JSON file and isolate the knowledge base, questions, and answers.
# @input - json file path : string
# @output - list of questions and its corresponding answers.
import json
def read_json(file_path):
    """Read a question-answering JSON file and collect its question/answer pairs.

    The file must contain a top-level ``data`` list whose entries have
    ``paragraphs``; each paragraph has ``qas`` entries with a ``question`` and a
    list of ``answers``. Every answer visited is expected to carry a ``text`` field.
    Summary statistics are printed as a side effect.

    @input  - file_path : path of the UTF-8 encoded JSON file
    @return - (data, qa_tuples): the raw ``data`` list, and a list of
              ``{'question': ..., 'answers': ...}`` dicts in file order
    @raises - OSError / json.JSONDecodeError / KeyError on unreadable or malformed input
    """
    # The context manager releases the file handle even if parsing fails.
    with open(file_path, encoding='utf8') as f:
        data = json.load(f)['data']

    qa_tuples = []
    max_possible_answer = 0  # largest number of answers attached to one question
    max_word_length = 0      # largest token count among all answer texts
    for datum in data:
        for paragraph in datum['paragraphs']:
            for qa in paragraph['qas']:
                answers = qa['answers']
                qa_tuples.append({'question': qa['question'], 'answers': answers})
                max_possible_answer = max(max_possible_answer, len(answers))
                for answer in answers:
                    token_count = len(word_tokenize(answer['text']))
                    max_word_length = max(max_word_length, token_count)

    print('{} entry(s) of knowledge base.'.format(len(data)))
    print('{} question(s)'.format(len(qa_tuples)))
    print('max possible answer for a question is {}'.format(max_possible_answer))
    print('max word length in sentence is {}'.format(max_word_length))
    return data, qa_tuples

faq_data, qa_tuples = read_json('data/question-answering/COVID-QA.json')

147 entry(s) of knowledge base.
2019 question(s)
max possible answer for a question is 1
max word length in sentence is 157


In [9]:
VOCAB_SIZE = 200      # number of rows in the matrix = maximum tokens kept per text
DIMENSION_SIZE = 200  # number of columns; has to equal the width of the vectors in `embeddings`

# Create vector representation of question.
# @input - question : string
# @input - embeddings : dict mapping a token to its embedding vector
# @output - vector representation of question.
def create_embedding_matrix(question, embeddings):
    """Build a fixed-size ``(VOCAB_SIZE, DIMENSION_SIZE)`` matrix for ``question``.

    Row ``i`` holds the embedding of the ``i``-th token. Rows stay all-zero for
    positions past the end of the text and for tokens that have no entry in
    ``embeddings``. Tokens beyond the first ``VOCAB_SIZE`` are dropped.

    @raises - ValueError if ``embeddings`` or ``question`` is empty
    """
    if not embeddings:
        raise ValueError('please provide embeddings')
    if not question:
        raise ValueError('please provide question')

    embedding_matrix = np.zeros((VOCAB_SIZE, DIMENSION_SIZE))
    tokens = tokenize_sentence(question)
    # enumerate() gives every occurrence its own row; looking the row up with
    # tokens.index(token) sent repeated tokens back to their first position.
    # The slice keeps long texts from indexing past the last row.
    for position, token in enumerate(tokens[:VOCAB_SIZE]):
        embedding_vector = embeddings.get(token)
        if embedding_vector is not None:
            embedding_matrix[position] = embedding_vector
        # Unknown tokens keep their zero row instead of receiving None.

    return embedding_matrix

# Demo: embed a single question with the GloVe dictionary that an earlier cell
# already loaded into `embeddings`. Re-reading the GloVe file here would only
# repeat that load.
question = "What are you doing?"
embedding_matrix = create_embedding_matrix(question, embeddings)
embedding_matrix

Found 400000 word vectors.


array([[ 0.39396   ,  0.44185001, -0.0042279 , ...,  0.47576001,
         0.20977999, -0.11687   ],
       [ 0.036749  ,  0.19893999, -0.093035  , ..., -0.013302  ,
        -0.0039236 ,  0.71275997],
       [ 0.85395002,  0.57146001, -0.023652  , ...,  0.31083   ,
        -0.22303   ,  0.20370001],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [10]:
# Cosine similarity
# @input - a, b : matrix
# @output - similarity score
from sklearn.metrics.pairwise import cosine_similarity
def calculate_cosine_similarity(a, b):
    """Return scikit-learn's ``cosine_similarity(a, b)`` result unchanged.

    @input  - a, b : matrix
    @output - similarity score(s) as returned by scikit-learn
    """
    return cosine_similarity(a, b)

In [11]:
# Calculate sentence similarity between a matrix and a set of matrices
# @input - a : matrix, bs : set of matrix
# @output - index of which matrix having best similarity score
def calculate_sentence_similarity(a, bs):
    """Return the index of the matrix in ``bs`` that is most similar to ``a``.

    Each candidate is scored by the mean of the values returned by
    ``calculate_cosine_similarity(a, b)``; ties go to the earliest candidate.
    NOTE(review): averaging the pairwise similarities is one possible way of
    reducing them to a single score — confirm it is the intended measure.

    @raises - ValueError if ``bs`` is empty
    """
    # One scalar per candidate, so the scores can be ranked.
    scores = [float(np.mean(calculate_cosine_similarity(a, b))) for b in bs]
    if not scores:
        raise ValueError('bs must contain at least one matrix')
    # np.argmax avoids the notebook-level `max`, which is a two-argument helper.
    return int(np.argmax(scores))

In [12]:
# Prepare training data
# Embed every question and the first answer of every pair ...
X_train = [create_embedding_matrix(qa['question'], embeddings) for qa in qa_tuples]
Y_train = [create_embedding_matrix(qa['answers'][0]['text'], embeddings) for qa in qa_tuples]

# ... then keep the leading 80% as the training portion.
training_fraction = int(0.8 * len(X_train))
X_train, Y_train = X_train[:training_fraction], Y_train[:training_fraction]

In [13]:
# Prepare validation data
# By the time this cell runs, X_train / Y_train contain only the first
# `training_fraction` examples, so slicing them from `training_fraction` onwards
# always gives empty lists. The held-out examples are rebuilt from `qa_tuples`.
holdout_tuples = qa_tuples[training_fraction:]

# The first half of the held-out examples is used for validation.
test_fraction = int(0.5 * len(holdout_tuples))
X_validation = [create_embedding_matrix(qa['question'], embeddings)
                for qa in holdout_tuples[:test_fraction]]
Y_validation = [create_embedding_matrix(qa['answers'][0]['text'], embeddings)
                for qa in holdout_tuples[:test_fraction]]

In [14]:
# Prepare test data
# Re-slicing X_validation would only duplicate (part of) the validation set.
# The test examples are the pairs that come after both the training portion and
# the validation portion in `qa_tuples`, so the three sets never overlap.
test_tuples = qa_tuples[training_fraction + test_fraction:]
X_test = [create_embedding_matrix(qa['question'], embeddings) for qa in test_tuples]
Y_test = [create_embedding_matrix(qa['answers'][0]['text'], embeddings) for qa in test_tuples]

In [16]:
# Creating model.

from keras import layers

INPUT_DIMENSION = 200 # passed as Embedding input_dim and as the Conv1D filter count (matches the 200-d GloVe vectors).
OUTPUT_DIMENSION = 200 # width of the Embedding layer output (200, not 50).
KERNEL_SIZE = 5 # Conv1D kernel size.

# Model architecture
# Embedding -> two (Conv1D -> MaxPooling1D -> Dense(128)) stages -> LSTM(128) -> Dense(10).
# NOTE(review): a Keras Embedding layer looks up integer token ids, while the data
# prepared earlier in this notebook are float GloVe matrices — confirm whether this
# layer belongs here.
model = keras.Sequential()
model.add(layers.Embedding(input_dim=INPUT_DIMENSION, output_dim=OUTPUT_DIMENSION))
model.add(layers.Convolution1D(filters=INPUT_DIMENSION, kernel_size=KERNEL_SIZE))
model.add(layers.MaxPooling1D(pool_size=2, strides=None, padding="valid"))
model.add(layers.Dense(128))
model.add(layers.Convolution1D(filters=INPUT_DIMENSION, kernel_size=KERNEL_SIZE))
model.add(layers.MaxPooling1D(pool_size=2, strides=None, padding="valid"))
model.add(layers.Dense(128))
model.add(layers.LSTM(128))
model.add(layers.Dense(10))
model.summary()
# NOTE(review): the recorded training run below fails inside this loss with a
# labels/logits shape mismatch, so the 10-unit head and sparse categorical loss do
# not fit the Y matrices as prepared — the target format needs to be decided.
model.compile(
    loss="sparse_categorical_crossentropy", optimizer="rmsprop", metrics=["acc"]
)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 200)         40000     
_________________________________________________________________
conv1d (Conv1D)              (None, None, 200)         200200    
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, None, 200)         0         
_________________________________________________________________
dense (Dense)                (None, None, 128)         25728     
_________________________________________________________________
conv1d_1 (Conv1D)            (None, None, 200)         128200    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, None, 200)         0         
_________________________________________________________________
dense_1 (Dense)              (None, None, 128)        

In [26]:
# Model training
# Callback
from tensorflow.keras.callbacks import ModelCheckpoint

# Save only the best model seen so far. The model is compiled with metrics=["acc"],
# so Keras logs validation accuracy under the key 'val_acc'; monitoring
# 'val_accuracy' would never match a logged value and no checkpoint would be saved.
callback = ModelCheckpoint(filepath='models/12042021',
                           monitor='val_acc',
                           mode='max',
                           save_best_only=True)
model.fit(X_train, Y_train, batch_size=10, epochs=10, validation_data=(X_validation, Y_validation), verbose=True, callbacks=[callback])

Epoch 1/10


ValueError: in user code:

    /home/wizard/Reference/lectures/IF6082 NLP/virtualenv/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py:805 train_function  *
        return step_function(self, iterator)
    /home/wizard/Reference/lectures/IF6082 NLP/virtualenv/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py:795 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /home/wizard/Reference/lectures/IF6082 NLP/virtualenv/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py:1259 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /home/wizard/Reference/lectures/IF6082 NLP/virtualenv/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py:2730 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /home/wizard/Reference/lectures/IF6082 NLP/virtualenv/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py:3417 _call_for_each_replica
        return fn(*args, **kwargs)
    /home/wizard/Reference/lectures/IF6082 NLP/virtualenv/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py:788 run_step  **
        outputs = model.train_step(data)
    /home/wizard/Reference/lectures/IF6082 NLP/virtualenv/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py:756 train_step
        y, y_pred, sample_weight, regularization_losses=self.losses)
    /home/wizard/Reference/lectures/IF6082 NLP/virtualenv/lib/python3.6/site-packages/tensorflow/python/keras/engine/compile_utils.py:203 __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    /home/wizard/Reference/lectures/IF6082 NLP/virtualenv/lib/python3.6/site-packages/tensorflow/python/keras/losses.py:152 __call__
        losses = call_fn(y_true, y_pred)
    /home/wizard/Reference/lectures/IF6082 NLP/virtualenv/lib/python3.6/site-packages/tensorflow/python/keras/losses.py:256 call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    /home/wizard/Reference/lectures/IF6082 NLP/virtualenv/lib/python3.6/site-packages/tensorflow/python/util/dispatch.py:201 wrapper
        return target(*args, **kwargs)
    /home/wizard/Reference/lectures/IF6082 NLP/virtualenv/lib/python3.6/site-packages/tensorflow/python/keras/losses.py:1569 sparse_categorical_crossentropy
        y_true, y_pred, from_logits=from_logits, axis=axis)
    /home/wizard/Reference/lectures/IF6082 NLP/virtualenv/lib/python3.6/site-packages/tensorflow/python/util/dispatch.py:201 wrapper
        return target(*args, **kwargs)
    /home/wizard/Reference/lectures/IF6082 NLP/virtualenv/lib/python3.6/site-packages/tensorflow/python/keras/backend.py:4941 sparse_categorical_crossentropy
        labels=target, logits=output)
    /home/wizard/Reference/lectures/IF6082 NLP/virtualenv/lib/python3.6/site-packages/tensorflow/python/util/dispatch.py:201 wrapper
        return target(*args, **kwargs)
    /home/wizard/Reference/lectures/IF6082 NLP/virtualenv/lib/python3.6/site-packages/tensorflow/python/ops/nn_ops.py:4241 sparse_softmax_cross_entropy_with_logits_v2
        labels=labels, logits=logits, name=name)
    /home/wizard/Reference/lectures/IF6082 NLP/virtualenv/lib/python3.6/site-packages/tensorflow/python/util/dispatch.py:201 wrapper
        return target(*args, **kwargs)
    /home/wizard/Reference/lectures/IF6082 NLP/virtualenv/lib/python3.6/site-packages/tensorflow/python/ops/nn_ops.py:4156 sparse_softmax_cross_entropy_with_logits
        logits.get_shape()))

    ValueError: Shape mismatch: The shape of labels (received (2000,)) should equal the shape of logits except for the last dimension (received (10, 10)).


In [57]:
# Exploration with ALBERT
from transformers import AlbertTokenizer, AlbertForQuestionAnswering
import torch

# Load the pretrained 'albert-base-v2' tokenizer and question-answering model.
# The recorded run of this cell stops at the tokenizer with an ImportError because
# the SentencePiece library is not installed in the environment.
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
# NOTE(review): this rebinds `model`, the name used for the Keras network in the
# cells above; re-running those cells afterwards would pick up the ALBERT model.
model = AlbertForQuestionAnswering.from_pretrained('albert-base-v2')

# Toy question/context pair, encoded together as PyTorch tensors.
# NOTE(review): `question` is also rebound here (it held the embedding demo text).
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
inputs = tokenizer(question, text, return_tensors='pt')
# Hand-written span labels for this single example.
# presumably token indices into the encoded input — TODO confirm they match the answer span
start_positions = torch.tensor([1])
end_positions = torch.tensor([3])

# Passing the labels makes the output carry a loss next to the start/end logits.
outputs = model(**inputs, start_positions=start_positions, end_positions=end_positions)
loss = outputs.loss
start_scores = outputs.start_logits
end_scores = outputs.end_logits

ImportError: 
AlbertTokenizer requires the SentencePiece library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
that match your environment.
