<a href="https://colab.research.google.com/github/rvignav/aigents-java-nlp/blob/master/src/test/resources/Baseline_QA/Baseline_QA_RoBERTa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install --quiet transformers sentence-transformers nltk pyter3

[K     |████████████████████████████████| 2.5MB 31.6MB/s 
[K     |████████████████████████████████| 92kB 10.1MB/s 
[K     |████████████████████████████████| 901kB 34.2MB/s 
[K     |████████████████████████████████| 3.3MB 35.2MB/s 
[K     |████████████████████████████████| 1.2MB 46.1MB/s 
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone


In [2]:
import json
from pathlib import Path

def read_squad(path):
    path = Path(path)
    with open(path, 'rb') as f:
        squad_dict = json.load(f)

    contexts = []
    questions = []
    answers = []
    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                for answer in qa['answers']:
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)

    return contexts, questions, answers

train_contexts, train_questions, train_answers = read_squad('/content/drive/MyDrive/squad/train-v2.0.json')
val_contexts, val_questions, val_answers = read_squad('/content/drive/MyDrive/squad/dev-v2.0.json')

In [3]:
def add_end_idx(answers, contexts):
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)
        answer['answer_end'] = end_idx

add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

In [4]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('deepset/roberta-base-squad2')

train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=79.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=571.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898822.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=772.0, style=ProgressStyle(description_…




In [5]:
def add_token_positions(encodings, answers):
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
        end_positions.append(encodings.char_to_token(i, answers[i]['answer_end'] - 1))

        # if start position is None, the answer passage has been truncated
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        if end_positions[-1] is None:
            end_positions[-1] = tokenizer.model_max_length

    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

In [6]:
import torch

class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

In [7]:
from transformers import AutoModelForQuestionAnswering
model = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=496313727.0, style=ProgressStyle(descri…




In [8]:
from torch.utils.data import DataLoader
from transformers import AdamW

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

optim = AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):
    print("Epoch: ", epoch+1)
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        loss = outputs[0]
        loss.backward()
        optim.step()

Epoch:  1
Epoch:  2
Epoch:  3


In [9]:
model.eval()

RobertaForQuestionAnswering(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm):

In [10]:
def wer_score(hyp, ref, print_matrix=False):
    import numpy as np
    N = len(hyp)
    M = len(ref)
    L = np.zeros((N,M))
    for i in range(0, N):
        for j in range(0, M):
            if min(i,j) == 0:
                L[i,j] = max(i,j)
            else:
                deletion = L[i-1,j] + 1
                insertion = L[i,j-1] + 1
                sub = 1 if hyp[i] != ref[j] else 0
                substitution = L[i-1,j-1] + sub
                L[i,j] = min(deletion, min(insertion, substitution))
    if print_matrix:
        print("WER matrix ({}x{}): ".format(N, M))
        print(L)
    return int(L[N-1, M-1])

def metrics(fname):
    # BLEU
    from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
    scores = []
    f = open("/content/drive/MyDrive/squad/poc_english.txt", "r")
    f2 = open(fname, "r")
    lines = f.readlines()
    cand = f2.readlines()
    for i in range(len(cand)):
        line = lines[i]
        candidate = []
        l = cand[i].lower().strip('\n')[1:len(cand[i])-2].split(", ")
        for item in l:
            item = item.strip('.').split(" ")
            candidate.append(item)
        arr = line.strip('.\n').split(" ")
        for i in range(len(arr)):
            arr[i] = arr[i].lower()
        reference = [arr]
        for c in candidate:
            # print(reference, c, ': ', sentence_bleu(reference, c, weights=(1,0)))
            scores.append(sentence_bleu(reference, c, weights=(1,0)))

    print("BLEU: " + str(sum(scores)/(1.0*len(scores))))

    # Word2Vec Cosine Similarity
    import torch
    import torch.nn.functional as F
    from sentence_transformers import SentenceTransformer
    import nltk
    nltk.download('punkt')
    from nltk import tokenize
    def similarity(par1, par2):
        transformer = SentenceTransformer('roberta-base-nli-stsb-mean-tokens')
        transformer.eval()
        par1 = tokenize.sent_tokenize(par1)
        vec1 = torch.Tensor(transformer.encode(par1))
        vec1 = vec1.mean(0)
        par2 = tokenize.sent_tokenize(par2)
        vec2 = torch.Tensor(transformer.encode(par2))
        vec2 = vec2.mean(0)
        cos_sim = F.cosine_similarity(vec1, vec2, dim=0)
        return cos_sim.item()

    scores = []
    f = open("/content/drive/MyDrive/squad/poc_english.txt", "r")
    f2 = open(fname, "r")
    lines = f.readlines()
    cand = f2.readlines()
    for i in range(len(cand)):
        line = lines[i]
        candidate = []
        l = cand[i].lower().strip('\n')[1:len(cand[i])-2].split(", ")
        for item in l:
            item = item.strip('.').split(" ")
            candidate.append(item)
        arr = line.strip('.\n').split(" ")
        if (len(arr) == 1):
            continue
        for i in range(len(arr)):
            arr[i] = arr[i].lower()
        reference = arr
        for c in candidate:
            scores.append(similarity(" ".join(reference), " ".join(c)))
    print("Word2Vec Cosine Similarity: " + str(sum(scores)/(1.0*len(scores))))

    # WER
    scores = []
    f = open("/content/drive/MyDrive/squad/poc_english.txt", "r")
    f2 = open(fname, "r")
    lines = f.readlines()
    cand = f2.readlines()
    for i in range(len(cand)):
        line = lines[i]
        candidate = []
        l = cand[i].lower().strip('\n')[1:len(cand[i])-2].split(", ")
        for item in l:
            item = item.strip('.').split(" ")
            candidate.append(item)
        arr = line.strip('.\n').split(" ")
        if (len(arr) == 1):
            continue
        for i in range(len(arr)):
            arr[i] = arr[i].lower()
        reference = arr
        for c in candidate:
            scores.append(wer_score(c, reference))
    print("WER: " + str(sum(scores)/(1.0*len(scores))))

    # TER
    import pyter

    scores = []
    f = open("/content/drive/MyDrive/squad/poc_english.txt", "r")
    f2 = open(fname, "r")
    lines = f.readlines()
    cand = f2.readlines()
    for i in range(len(cand)):
        line = lines[i]
        candidate = []
        l = cand[i].lower().strip('\n')[1:len(cand[i])-2].split(", ")
        for item in l:
            item = item.strip('.').split(" ")
            candidate.append(item)
        arr = line.strip('.\n').split(" ")
        if (len(arr) == 1):
            continue
        for i in range(len(arr)):
            arr[i] = arr[i].lower()
        reference = arr
        for c in candidate:
            scores.append(pyter.ter(reference, c))
    print("TER: " + str(sum(scores)/(1.0*len(scores))))

def run(modelname, model, tokenizer):
    # model = AutoModelForQuestionAnswering.from_pretrained(modelname)
    # tokenizer = AutoTokenizer.from_pretrained(modelname)

    from transformers import pipeline
    nlp = pipeline('question-answering', model=model, tokenizer=tokenizer)

    rel_and_food = "A mom is a human. A dad is a human. A mom is a parent. A dad is a parent. A son is a child. A daughter is a child. A son is a human. A daughter is a human. A mom likes cake. A daughter likes cake. A son likes sausage. A dad likes sausage. Cake is a food. Sausage is a food. Mom is a human now. Dad is a human now. Mom is a parent now. Dad is a parent now. Son is a child now. Daughter is a child now. Son is a human now. Daughter is a human now. Mom likes cake now. Daughter likes cake now. Son likes sausage now. Dad likes sausage now. Cake is a food now. Sausage is a food now. Mom was a daughter before. Dad was a son before. Mom was not a parent before. Dad was not a parent before. Mom liked cake before. Dad liked sausage before. Cake was a food before. Sausage was a food before."
    prof = "Mom is on the board of directors. Dad is on the board of directors. Son is on the board of directors. Daughter is on the board of directors. Mom writes with chalk on the board. Dad writes with chalk on the board. Son writes with chalk on the board. Daughter writes with chalk on the board. Dad wants Mom to be on the board of directors. Mom wants Dad to be on the board of directors. Dad wants his son to be on the board of directors. Mom wants her daughter to be on the board of directors. Mom writes to Dad with chalk on the board. Dad writes to Mom with chalk on the board. Son writes to Dad with chalk on the board. Daughter writes to Mom with chalk on the board."
    tools_and_pos = "Mom has a hammer. Mom has a saw. Dad has a hammer. Dad has a saw. Mom has a telescope. Mom has binoculars. Dad has a telescope. Dad has binoculars. Mom saw Dad with a hammer. Mom saw Dad with a saw. Dad saw Mom with a hammer. Dad saw Mom with a saw. Saw is a tool. Hammer is a tool. Binoculars are a tool. A telescope is a tool. Mom sawed the wood with a saw. Dad sawed the wood with a saw. Son sawed the wood with a saw. Daughter sawed the wood with a saw. Mom knocked the wood with a hammer. Dad knocked the wood with a hammer. Son knocked the wood with a hammer. Daughter knocked the wood with a hammer. Mom saw Dad with binoculars. Mom saw Dad with a telescope. Dad saw Mom with binoculars. Dad saw Mom with a telescope."

    f = open("/content/drive/MyDrive/squad/poc_english_queries.txt", "r")
    f2name = modelname.split("/")[1] + ".txt"
    f2 = open(f2name, "w")
    
    for line in f:
        parts = line.split(" ")
        context = ""
        if "relationships" in parts[0]:
            context = rel_and_food
        elif "tools" in parts[0]:
            context = tools_and_pos
        else:
            context = prof
        question = ""
        for i in range(len(parts)-1):
            question = question + parts[i+1].rstrip() + " "
        question = question[0:len(question)-1] + "?"
        f2.write(nlp({'question': question, 'context': context })['answer'].replace(".",",") + "\n")
    
    f2.close()

    print(f2name)
    metrics(f2name)
    print('\n')

In [11]:
run('deepset/roberta-base-squad2', model, tokenizer)

roberta-base-squad2.txt
BLEU: 0.76735405740744
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=748.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=3975.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=688.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=122.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456356.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=229.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=498661169.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=52.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=52.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=239.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355881.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=334.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=798293.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=190.0, style=ProgressStyle(description_…


Word2Vec Cosine Similarity: 0.7992178608973821
WER: 0.15
TER: 0.24488095238095225


