In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import os

# Hand the `--tf_xla_enable_xla_devices` flag to TensorFlow through the
# TF_XLA_FLAGS environment variable.
# NOTE(review): this assignment runs after `import tensorflow` in this cell —
# confirm the flag is still honoured at that point, or move it above the import.
os.environ['TF_XLA_FLAGS'] = '--tf_xla_enable_xla_devices'

In [6]:
import pandas as pd

def read_csv(file, encoding):
    """Load a CSV into a DataFrame with every missing value replaced by "".

    Args:
        file: path or file-like object handed straight to ``pd.read_csv``.
        encoding: text encoding forwarded to ``pd.read_csv``.

    Returns:
        The parsed DataFrame, with empty strings in place of missing cells.
    """
    frame = pd.read_csv(file, encoding=encoding)
    return frame.fillna(value="")

# Load the COVID-BERT FAQ table and preview its first two rows.
faq_covidbert = read_csv("data/faqs/faq_covidbert.csv", encoding="utf8")
faq_covidbert.head(n=2)

Unnamed: 0,question,answer,answer_html,link,name,source,category,country,region,city,lang,last_update
0,What is a novel coronavirus?,A novel coronavirus is a new coronavirus that ...,<p>A novel coronavirus is a new coronavirus th...,\nhttps://www.cdc.gov/coronavirus/2019-ncov/fa...,Frequently Asked Questions,Center for Disease Control and Prevention (CDC),Coronavirus Disease 2019 Basics,USA,,,en,2020/03/17
1,Why is the disease being called coronavirus di...,"On February 11, 2020 the World Health Organiza...","<p>On February 11, 2020 the World Health Organ...",\nhttps://www.cdc.gov/coronavirus/2019-ncov/fa...,Frequently Asked Questions,Center for Disease Control and Prevention (CDC),Coronavirus Disease 2019 Basics,USA,,,en,2020/03/17


In [13]:
import nltk

# Fetch the 'averaged_perceptron_tagger' NLTK resource; the log captured below
# reports it as already up to date. Presumably this is the model data the
# PerceptronTagger imported next relies on — verify if the resource name changes.
nltk.download('averaged_perceptron_tagger')

from nltk.tag.perceptron import PerceptronTagger

# Lazily created default tagger, shared by every pos_tagger() call so the
# pretrained tagger is constructed once per kernel session instead of once
# per sentence.
_DEFAULT_POS_TAGGER = None

# Preprocess the text with POS Tagger
# @input - sentence : string
# @input - tagger : optional object exposing .tag(list_of_tokens); defaults to
#                   a shared PerceptronTagger instance.
# @return - tuples : array of tuples.
def pos_tagger(sentence, tagger=None):
    """Split ``sentence`` on whitespace and tag the resulting tokens.

    Args:
        sentence: text to tag; it is tokenised with ``str.split()``.
        tagger: optional tagger whose ``tag(tokens)`` method is used. When
            omitted, one ``PerceptronTagger`` is created on first use and
            reused by all later calls.

    Returns:
        Whatever ``tagger.tag`` returns for the whitespace-split tokens.
    """
    global _DEFAULT_POS_TAGGER
    if tagger is None:
        if _DEFAULT_POS_TAGGER is None:
            # Constructed only once; the original rebuilt it on every call.
            _DEFAULT_POS_TAGGER = PerceptronTagger()
        tagger = _DEFAULT_POS_TAGGER
    return tagger.tag(sentence.split())

# Smoke-test the tagger on the answer text of the first FAQ row.
tagged_answer0 = pos_tagger(faq_covidbert['answer'].iloc[0])

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\lmanw\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [16]:
# Tag every answer in the FAQ table, one tagged list per row.
tagged_answers = list(map(pos_tagger, faq_covidbert['answer']))

In [None]:
# Document store and retriever
# Documents live in an Elasticsearch-backed store; the retriever is bound to
# that store so it knows where to search.
# NOTE(review): the original import path was `haystack.retrieve.elasticsearch`.
# The Haystack release that provides `haystack.database` names the subpackage
# `haystack.retriever` — confirm against the installed version.
from haystack.database.elasticsearch import ElasticsearchDocumentStore
from haystack.retriever.elasticsearch import ElasticsearchRetriever

# Build the store first (the original assigned an ElasticsearchRetriever here
# and never used ElasticsearchDocumentStore). Library defaults are used for the
# connection — presumably a local Elasticsearch instance; verify.
document_store = ElasticsearchDocumentStore()
retriever = ElasticsearchRetriever(document_store=document_store)

In [7]:
# Load GloVe embeddings
def load_glove(path='glove/glove.6B.100d.txt'):
    """Parse a GloVe text file into a ``{word: vector}`` dictionary.

    Every non-empty line is split on whitespace: the first token is the word
    and the remaining tokens are converted to a float32 NumPy array.

    Args:
        path: location of the GloVe text file; defaults to the file this
            notebook originally hard-coded.

    Returns:
        dict mapping each word to its float32 vector. (The original built
        this dictionary but discarded it, returning None.)
    """
    embeddings_index = {}
    # `with` closes the handle even when a line fails to parse.
    with open(path, encoding="utf8") as glove_file:
        for line in glove_file:
            values = line.split()
            if not values:
                # A blank line has no word token; skip it rather than raise.
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    print('Found {} word vectors.'.format(len(embeddings_index)))
    return embeddings_index

# Bind the result so later cells can use the vectors; the assignment also keeps
# the notebook from echoing the whole dictionary as cell output.
embeddings_index = load_glove()

Found 400000 word vectors.


In [11]:
# Hyper-parameters for the exploratory network.
INPUT_DIMENSION = 100  # described as the GloVe embedding width (100 dimensions)
OUTPUT_DIMENSION = 50  # width of the vectors produced by the Embedding layer
KERNEL_SIZE = 5        # window length of each 1-D convolution

# NOTE(review): INPUT_DIMENSION is passed both as the Embedding `input_dim` and
# as the Conv1D filter count. Keras documents `input_dim` as the vocabulary
# size, not a vector width — confirm this is the intended configuration.
model = keras.Sequential([
    layers.Embedding(input_dim=INPUT_DIMENSION, output_dim=OUTPUT_DIMENSION),
    # First conv / pool / dense stage.
    layers.Convolution1D(filters=INPUT_DIMENSION, kernel_size=KERNEL_SIZE),
    layers.MaxPooling1D(pool_size=2, strides=None, padding="valid"),
    layers.Dense(128),
    # Second conv / pool / dense stage.
    layers.Convolution1D(filters=INPUT_DIMENSION, kernel_size=KERNEL_SIZE),
    layers.MaxPooling1D(pool_size=2, strides=None, padding="valid"),
    layers.Dense(128),
    # Recurrent layer followed by the 10-unit output layer.
    layers.LSTM(128),
    layers.Dense(10),
])
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, None, 50)          5000      
_________________________________________________________________
conv1d_3 (Conv1D)            (None, None, 100)         25100     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, None, 100)         0         
_________________________________________________________________
dense_3 (Dense)              (None, None, 128)         12928     
_________________________________________________________________
conv1d_4 (Conv1D)            (None, None, 100)         64100     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, None, 100)         0         
_________________________________________________________________
dense_4 (Dense)              (None, None, 128)        

In [21]:
# Exploration with ALBERT
from transformers import AlbertTokenizer, AlbertForQuestionAnswering
import torch

# Load the pretrained ALBERT tokenizer and question-answering model.
# NOTE(review): this rebinds `model`, which an earlier cell uses for the Keras network.
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertForQuestionAnswering.from_pretrained('albert-base-v2')

# Toy question / context pair.
question = "Who was Jim Henson?"
text = "Jim Henson was a nice puppet"

# Target start / end positions supplied to the model so it returns a loss.
start_positions = torch.tensor([1])
end_positions = torch.tensor([3])

# Encode question and context together as PyTorch tensors and run one forward pass.
inputs = tokenizer(question, text, return_tensors='pt')
outputs = model(
    **inputs,
    start_positions=start_positions,
    end_positions=end_positions,
)

# Unpack the loss and the per-token start / end logits.
loss, start_scores, end_scores = outputs.loss, outputs.start_logits, outputs.end_logits

ImportError: 
AlbertForQuestionAnswering requires the PyTorch library but it was not found in your environment. Checkout the instructions on the
installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
