Details of data -

Passages - Saved only the passages for the queries that have top 10 answers
Train set - 15k queries with well-formed answers
Test set - 3k queries with well-formed answers

Basic Setup

In [2]:
# This mounts your Google Drive to the Colab VM.
from google.colab import drive
drive.mount('/content/drive')

# TODO: Enter the foldername in your Drive where you have saved the unzipped
# assignment folder, e.g. 'cs231n/assignments/assignment1/'
FOLDERNAME = 'MasterCourses/compsci685/Project/'
assert FOLDERNAME is not None, "[!] Enter the foldername."

# Now that we've mounted your Drive, this ensures that
# the Python interpreter of the Colab VM can load
# python files from within it.
import sys
sys.path.append('/content/drive/My Drive/{}'.format(FOLDERNAME))

import torch

# Confirm that the GPU is detected

assert torch.cuda.is_available()

# Get the GPU device name.
device_name = torch.cuda.get_device_name()
n_gpu = torch.cuda.device_count()
print(f"Found device: {device_name}, n_gpu: {n_gpu}")

!pip install transformers

%cd /content/drive/My\ Drive/$FOLDERNAME/

import locale
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding

!pip install -r rouge/requirements.txt
!pip install rouge-score

Mounted at /content/drive
Found device: Tesla T4, n_gpu: 1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m53.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m110.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hu

Utility - Function to clean up cuda space

In [8]:
import gc

def clear_cache(model):
  torch.cuda.empty_cache()
  gc.collect()
  del model

Util functions - Function to compute rouge score

In [7]:
from rouge_score import rouge_scorer

def compute_rouge_score(wellFormedAnswers, output):
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

    rouge_precision_scores = []
    rouge_recall_scores = []
    rouge_f1_scores = []

    for ind in range(len(output)):
        score = scorer.score(wellFormedAnswers[ind], output[ind])
        rouge_precision_scores.append(score['rougeL'].precision)
        rouge_recall_scores.append(score['rougeL'].recall)
        rouge_f1_scores.append(score['rougeL'].fmeasure)
    
    return(rouge_precision_scores, rouge_recall_scores, rouge_f1_scores) 

Util functions - Functions to save a file for lowest, highest and mid results for rouge-L (for human evaluation)




In [9]:
import numpy as np
import math
import json

def human_eval_rouge_l(rouge_precision_scores, rouge_recall_scores, rouge_f1_scores, passages, filename, wellFormedAnswers, query_passages_answer_list):
  num_scores = len(rouge_precision_scores)
  mid = math.ceil((num_scores - 1) / 2)

  prec_scores_arg_sorted = np.argsort(rouge_precision_scores)
  recall_scores_arg_sorted = np.argsort(rouge_recall_scores)
  rouge_f1_scores_arg_sorted = np.argsort(rouge_f1_scores)

  human_eval_query_indices = {}

  human_eval_query_indices['precision'] = {}
  human_eval_query_indices['recall'] = {}
  human_eval_query_indices['f1'] = {}


  human_eval_query_indices['precision']['lowest'] = [{
      'query': query_passages_answer_list[ind]['query'],
      'generated_answer': test_output[ind],
      'well_formed_answer': wellFormedAnswers[ind],
      'retrieved_passages': passages[ind]
  } for ind in prec_scores_arg_sorted[0:5]]

  human_eval_query_indices['precision']['mid'] = [{
      'query': query_passages_answer_list[ind]['query'],
      'generated_answer': test_output[ind],
      'well_formed_answer': wellFormedAnswers[ind],
      'retrieved_passages': passages[ind]
  } for ind in prec_scores_arg_sorted[(mid - 2): (mid + 3)]]

  human_eval_query_indices['precision']['highest'] = [{
      'query': query_passages_answer_list[ind]['query'],
      'generated_answer': test_output[ind],
      'well_formed_answer': wellFormedAnswers[ind],
      'retrieved_passages': passages[ind]
  } for ind in prec_scores_arg_sorted[(num_scores-5): num_scores]]


  human_eval_query_indices['recall']['lowest'] = [{
      'query': query_passages_answer_list[ind]['query'],
      'generated_answer': test_output[ind],
      'well_formed_answer': wellFormedAnswers[ind],
      'retrieved_passages': passages[ind]
  } for ind in recall_scores_arg_sorted[0:5]]

  human_eval_query_indices['recall']['mid'] = [{
      'query': query_passages_answer_list[ind]['query'],
      'generated_answer': test_output[ind],
      'well_formed_answer': wellFormedAnswers[ind],
      'retrieved_passages': passages[ind]
  } for ind in recall_scores_arg_sorted[(mid - 2): (mid + 3)]]

  human_eval_query_indices['recall']['highest'] = [{
      'query': query_passages_answer_list[ind]['query'],
      'generated_answer': test_output[ind],
      'well_formed_answer': wellFormedAnswers[ind],
      'retrieved_passages': passages[ind]
  } for ind in recall_scores_arg_sorted[(num_scores-5): num_scores]]


  human_eval_query_indices['f1']['lowest'] = [{
      'query': query_passages_answer_list[ind]['query'],
      'generated_answer': test_output[ind],
      'well_formed_answer': wellFormedAnswers[ind],
      'retrieved_passages': passages[ind]
  } for ind in rouge_f1_scores_arg_sorted[0:5]]

  human_eval_query_indices['f1']['mid'] = [{
      'query': query_passages_answer_list[ind]['query'],
      'generated_answer': test_output[ind],
      'well_formed_answer': wellFormedAnswers[ind],
      'retrieved_passages': passages[ind]
  } for ind in rouge_f1_scores_arg_sorted[(mid - 2): (mid + 3)]]

  human_eval_query_indices['f1']['highest'] = [{
      'query': query_passages_answer_list[ind]['query'],
      'generated_answer': test_output[ind],
      'well_formed_answer': wellFormedAnswers[ind],
      'retrieved_passages': passages[ind]
  } for ind in rouge_f1_scores_arg_sorted[(num_scores-5): num_scores]]

  # Serializing json
  json_object = json.dumps(human_eval_query_indices, indent=4)
  
  # Writing to vlad_bm25_output_analysis.json
  with open(filename, "w") as outfile:
      outfile.write(json_object)

Util functions - Functions to save a file for lowest, highest and mid results for rouge-L (for human evaluation)

In [10]:
def find_average_rouge_save_human_eval(wellFormedAnswers, output, passages, filename, query_passages_answer_list):

  rouge_score = compute_rouge_score(wellFormedAnswers, output)

  rouge_precision_scores = rouge_score[0]
  rouge_recall_scores = rouge_score[1]
  rouge_f1_scores = rouge_score[2]

  num_scores = len(rouge_precision_scores)

  avg_precision = sum(rouge_precision_scores) / num_scores
  avg_recall = sum(rouge_recall_scores) / num_scores
  avg_f1 = sum(rouge_f1_scores) / num_scores

  print(avg_precision, avg_recall, avg_f1)

  human_eval_rouge_l(rouge_precision_scores, rouge_recall_scores, rouge_f1_scores, passages, filename, wellFormedAnswers, query_passages_answer_list)

Initializing variables used over and over

In [11]:
import pickle

with open("dataset/train_query_passages_answer_list", "rb") as fp:   # Unpickling
    train_query_passages_answer_list = pickle.load(fp)

with open("dataset/val_query_passages_answer_list", "rb") as fp:   # Unpickling
    val_query_passages_answer_list = pickle.load(fp)

with open("dataset/test_query_passages_answer_list", "rb") as fp:   # Unpickling
    test_query_passages_answer_list = pickle.load(fp)

with open("dataset/passages", "rb") as fp:   # Unpickling
    passages = pickle.load(fp)

with open("retriever/bm25/bm25_results/test_okapi_bm_25_top10", "rb") as fp:   # Unpickling
    test_okapi_bm_25_top10 = pickle.load(fp)

with open("retriever/dpr/results/query_marcob256_passage_marcob256_top10", "rb") as fp:   # Unpickling
    test_dpr_top10 = pickle.load(fp)

with open("retriever/random_passages_list", "rb") as fp:   # Unpickling
    random_passages_list = pickle.load(fp)

test_bm_25_passages_list = []
for ind in range(len(test_okapi_bm_25_top10)):
  test_bm_25_passages_list.append(list(passages[i] for i in test_okapi_bm_25_top10[ind]))

test_dpr_passages_list = []
for ind in range(len(test_dpr_top10)):
  test_dpr_passages_list.append(list(passages[i] for i in test_dpr_top10[ind]))

test_queries = [test_query_passages_answer['query'] for test_query_passages_answer in test_query_passages_answer_list]

test_wellFormedAnswers = [test_query_passages_answer['wellFormedAnswer'] for test_query_passages_answer in test_query_passages_answer_list]

test_correct_passages_list = [test_bm_25_passages['passages'] for test_bm_25_passages in test_query_passages_answer_list]

Obtaining first query and relevant BM25 passages

In [None]:
import pickle

zeroth_query_passages_answer = test_query_passages_answer_list[0]
print(zeroth_query_passages_answer['query'])
print(zeroth_query_passages_answer['wellFormedAnswer'])

zeroth_bm_25_passages = test_bm_25_passages_list[0]
print(zeroth_bm_25_passages)

albany mn population
The population of Albany, Minnesota is 2,662. 
['City of Albany, MN Zip Codes. City of Albany, MN Demographic Information. * Demographic data is based on information taken from the 2000 Census. City of Albany, MN covers 1 Area Code. City of Albany, MN covers 1 Zip Code. 15 Cities within 15 Miles of the City of Albany, MN.', 'The population density in New Albany is 143% higher than Ohio. The median age in New Albany is 1% higher than Ohio. In New Albany 83.52% of the population is Caucasian. In New Albany 1.84% of the population is African American. In New Albany 7.31% of the population is Asian.', 'Recent posts about Albany, Minnesota on our local forum with over 2,000,000 registered users. Albany is mentioned 87 times on our forum: Latest news from Albany, MN collected exclusively by city-data.com from local newspapers, TV, and radio stations. Ancestries: German (55.6%), Irish (10.0%), Polish (5.9%), Norwegian (5.4%), Swedish (2.8%), United States (2.6%).', 'Alban

Trying this out with bert base model

In [None]:
import torch

from transformers import AutoTokenizer, AutoModel, AutoModelForSeq2SeqLM

model_name = "facebook/bart-base"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.to(device)

# it all starts with a question/query
query = zeroth_query_passages_answer['query']

# given the question above suppose these documents below were found in some document store 
documents = zeroth_bm_25_passages

# concatenate question and support documents into BART input
conditioned_doc = "<P> " + " <P> ".join([d for d in documents])
query_and_docs = "question: {} context: {}".format(query, conditioned_doc)

model_input = tokenizer(query_and_docs, truncation=True, padding=True, return_tensors="pt")

# greedy decoding
generated_answers_encoded = model.generate(input_ids=model_input["input_ids"].to(device))

tokenizer.batch_decode(generated_answers_encoded, skip_special_tokens=True,clean_up_tokenization_spaces=True)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/558M [00:00<?, ?B/s]



['question: albany mn population context: <P> City of Albany,']

In [None]:
clear_cache(model)

bart-base gives weird answer. Let us try the other model fine-tuned on ELI5 for this format

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForSeq2SeqLM

model_name = "vblagoje/bart_lfqa"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.to(device)

# it all starts with a question/query
query = zeroth_query_passages_answer['query']

# given the question above suppose these documents below were found in some document store 
documents = zeroth_bm_25_passages

# concatenate question and support documents into BART input
conditioned_doc = "<P> " + " <P> ".join([d for d in documents])
query_and_docs = "question: {} context: {}".format(query, conditioned_doc)

model_input = tokenizer(query_and_docs, truncation=True, padding=True, return_tensors="pt")

# Using greedy decoding
generated_answers_encoded = model.generate(input_ids=model_input["input_ids"].to(device))
tokenizer.batch_decode(generated_answers_encoded, skip_special_tokens=True,clean_up_tokenization_spaces=True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]



['I live in Albany, MN. I live in the city of Albany, MN. I']

Now replace documents with some random text and check if similar answer is generated.

In [None]:
# given the question above suppose these documents below were found in some document store 
documents = ["when the skin is completely wet. The body continuously loses water by...",
             "at greater pressures. There is an ambiguity, however, as to the meaning of the terms 'heating' and 'cooling'...",
             "are not in a relation of thermal equilibrium, heat will flow from the hotter to the colder, by whatever pathway...",
             "air condition and moving along a line of constant enthalpy toward a state of higher humidity. A simple example ...",            
             "Thermal contact conductance In physics, thermal contact conductance is the study of heat conduction between solid ..."]

# concatenate question and support documents into BART input
conditioned_doc = "<P> " + " <P> ".join([d for d in documents])
query_and_docs = "question: {} context: {}".format(query, conditioned_doc)

model_input = tokenizer(query_and_docs, truncation=True, padding=True, return_tensors="pt")

generated_answers_encoded = model.generate(input_ids=model_input["input_ids"].to(device))
                                           
tokenizer.batch_decode(generated_answers_encoded, skip_special_tokens=True,clean_up_tokenization_spaces=True)

["I'm not sure if this is a good question, but I'll try to answer it"]

Answers are different. We see in the first case that information is actually picked from the BM25 passages retrieved.

Now, lets pass all the test bm25 examples through this model and observe

In [None]:
queries_and_docs = []

for ind in range(len(test_query_passages_answer_list)):
  queries_and_docs.append("question: {} context: {}".format(test_queries[ind], test_bm_25_passages_list[ind]))

In [None]:
print(queries_and_docs[0])

question: albany mn population context: ['City of Albany, MN Zip Codes. City of Albany, MN Demographic Information. * Demographic data is based on information taken from the 2000 Census. City of Albany, MN covers 1 Area Code. City of Albany, MN covers 1 Zip Code. 15 Cities within 15 Miles of the City of Albany, MN.', 'The population density in New Albany is 143% higher than Ohio. The median age in New Albany is 1% higher than Ohio. In New Albany 83.52% of the population is Caucasian. In New Albany 1.84% of the population is African American. In New Albany 7.31% of the population is Asian.', 'Recent posts about Albany, Minnesota on our local forum with over 2,000,000 registered users. Albany is mentioned 87 times on our forum: Latest news from Albany, MN collected exclusively by city-data.com from local newspapers, TV, and radio stations. Ancestries: German (55.6%), Irish (10.0%), Polish (5.9%), Norwegian (5.4%), Swedish (2.8%), United States (2.6%).', 'Albany, Ohio - Basic Facts. The V

In [None]:
test_output = []

if False:
  for i in range(int(len(queries_and_docs)/5)):
      print(i)
      model_input = tokenizer(queries_and_docs[(5*i):(5*i+5)], truncation=True, padding=True, return_tensors="pt")
      generated_answers_encoded = model.generate(input_ids=model_input["input_ids"].to(device), max_length=256, do_sample=False, early_stopping=True, num_beams=8)

      test_output.extend(tokenizer.batch_decode(generated_answers_encoded, skip_special_tokens=True,clean_up_tokenization_spaces=True))

Save output from pretrained bart_lfqa model

In [None]:
if False:
  with open("generator/generator_results/vlad_generator/bm25_top10_passages/bm25_vlad_beam_answers", "wb") as fp:   # Unpickling
        pickle.dump(test_output, fp)

Next, lets evaluate the ROUGE-L score for these outputs and the well-formed answers.

In [None]:
with open("generator/generator_results/vlad_generator/bm25_top10_passages/bm25_vlad_beam_answers", "rb") as fp:   # Unpickling
    test_output = pickle.load(fp)

find_average_rouge_save_human_eval(test_wellFormedAnswers, test_output, test_bm_25_passages_list, 'generator/generator_results/vlad_generator/bm25_top10_passages/bm25_vlad_beam_analysis.json', test_query_passages_answer_list)

0.15020265121241716 0.5032841851962083 0.21033006965476783


Now, lets try to run the same model with dpr results

In [None]:
queries_and_docs = []

if False:
    for ind in range(len(test_query_passages_answer_list)):
      queries_and_docs.append("question: {} context: {}".format(test_queries[ind], test_dpr_passages_list[ind]))

    test_output = []

    for i in range(int(len(queries_and_docs)/5)):
        print(i)
        model_input = tokenizer(queries_and_docs[(5*i):(5*i+5)], truncation=True, padding=True, return_tensors="pt")
        generated_answers_encoded = model.generate(input_ids=model_input["input_ids"].to(device), max_length=256, do_sample=False, early_stopping=True, num_beams=8)
        test_output.extend(tokenizer.batch_decode(generated_answers_encoded, skip_special_tokens=True,clean_up_tokenization_spaces=True))
        
    with open("generator/generator_results/vlad_generator/dpr_top10_passages/dpr_vlad_beam_answers", "wb") as fp:   # pickling
        pickle.dump(test_output, fp)

In [None]:
with open("generator/generator_results/vlad_generator/dpr_top10_passages/dpr_vlad_beam_answers", "rb") as fp:   # Unpickling
    test_output = pickle.load(fp)

find_average_rouge_save_human_eval(test_wellFormedAnswers, test_output, test_dpr_passages_list, 'generator/generator_results/vlad_generator/dpr_top10_passages/dpr_vlad_beam_analysis.json', test_query_passages_answer_list)

0.15145248382855286 0.5367253732194298 0.21604219710151806


Now replace with random documents as context and find ROUGE-L for this model

In [None]:
import random
  
queries_and_docs = []

if False:

    for ind in range(len(test_query_passages_answer_list)):
      queries_and_docs.append("question: {} context: {}".format(test_queries[ind], random_passages_list[ind]))

    test_output = []

    for i in range(int(len(queries_and_docs)/5)):
        print(i)
        model_input = tokenizer(queries_and_docs[(5*i):(5*i+5)], truncation=True, padding=True, return_tensors="pt")
        generated_answers_encoded = model.generate(input_ids=model_input["input_ids"].to(device), max_length=256, do_sample=False, early_stopping=True, num_beams=8)
        test_output.extend(tokenizer.batch_decode(generated_answers_encoded, skip_special_tokens=True,clean_up_tokenization_spaces=True))
        
    with open("generator/generator_results/vlad_generator/random_passages/random_vlad_beam_answers", "wb") as fp:   # pickling
        pickle.dump(test_output, fp)

In [None]:
with open("generator/generator_results/vlad_generator/random_passages/random_vlad_beam_answers", "rb") as fp:   # Unpickling
    test_output = pickle.load(fp)

find_average_rouge_save_human_eval(test_wellFormedAnswers, test_output, random_passages_list, "generator/generator_results/vlad_generator/random_passages/random_vlad_beam_analysis.json", test_query_passages_answer_list)

0.09373083827289931 0.36526152767757153 0.13498635883782625


We see that rouge-L is smaller for random docs than the retrieved BM25 docs and DPR docs. Hence, model is working as expected.

Now, check rouge-L for 100% correct retrieval. This is the best the given generator can do.




In [None]:
import random

if False:
  queries_and_docs = []

  random.seed(0)

  for ind in range(len(query_details_passages)):
    query = query_details_passages[ind]['query']
    correct_passages = query_details_passages[ind]['passages']

    queries_and_docs.append("question: {} context: {}".format(query, correct_passages))


  test_output = []

  for i in range(int(len(queries_and_docs)/5)):
      print(i)
      model_input = tokenizer(queries_and_docs[(5*i):(5*i+5)], truncation=True, padding=True, return_tensors="pt")
      generated_answers_encoded = model.generate(input_ids=model_input["input_ids"].to(device), max_length=256)
      test_output.extend(tokenizer.batch_decode(generated_answers_encoded, skip_special_tokens=True,clean_up_tokenization_spaces=True))

  with open("generator/generator_results/vlad_generator/correct_passages/greedy/correct_vlad_greedy_answers", "wb") as fp:   # pickling
      pickle.dump(test_output, fp)

In [22]:
with open("generator/generator_results/vlad_generator/correct_passages/beam/correct_vlad_beam_answers", "rb") as fp:   # Unpickling
    test_output = pickle.load(fp)

find_average_rouge_save_human_eval(test_wellFormedAnswers, test_output, test_correct_passages_list, "generator/generator_results/vlad_generator/correct_passages/beam/correct_vlad_beam_analysis.json", test_query_passages_answer_list)

0.1584504368388216 0.5376603241304027 0.22338830818959646


In [25]:
for ind in range(len(test_wellFormedAnswers)):
  if ("There are after 3 days" in test_wellFormedAnswers[ind]):
    print(ind)

print(test_output[11])

11
It depends on what you mean by insane. If you mean insane in the sense that you can't be reasoned with, then yes. If you mean insane in the sense that you can't be reasoned with, then no. If you mean insane in the sense that you can't be reasoned with, then yes.


In the best case above, we see that for all correct passages provided, the 

---

ROUGE-L scores are lower. Upon human evaluation, we find that ROUGE-L is actually corresponding to the correct human judgement.

The answers that seem to be bad are -
(i) Keep repeating non-sensical sentences. - Update decoding algorithm and check
(ii) Hallucinate from ELI5 (particularly if answer is not directly obtainable from the context.) - Perhaps start and fine tune a new small generator

Why does greedy decoding algorithm repeat sentences? - Common problem with greedy [https://towardsdatascience.com/the-three-decoding-methods-for-nlp-23ca59cb1e9d].

Let's try beam search decoding for correct passages

In [None]:
import random


if False:

    queries_and_docs = []

    random.seed(0)

    for ind in range(len(query_details_passages)):
      query = query_details_passages[ind]['query']
      correct_passages = query_details_passages[ind]['passages']

      queries_and_docs.append("question: {} context: {}".format(query, correct_passages))


    test_output = []

    for i in range(int(len(queries_and_docs)/5)):
        print(i)
        model_input = tokenizer(queries_and_docs[(5*i):(5*i+5)], truncation=True, padding=True, return_tensors="pt")
        generated_answers_encoded = model.generate(input_ids=model_input["input_ids"].to(device), max_length=256, do_sample=False, early_stopping=True, num_beams=8)
        test_output.extend(tokenizer.batch_decode(generated_answers_encoded, skip_special_tokens=True, clean_up_tokenization_spaces=True))

        with open("generator/generator_results/vlad_generator/correct_passages/beam/correct_vlad_beam_answers", "wb") as fp:   # pickling
          pickle.dump(test_output, fp)

In [None]:
with open("generator/generator_results/vlad_generator/correct_passages/beam/correct_vlad_beam_answers", "rb") as fp:   # Unpickling
  test_output = pickle.load(fp)

find_average_rouge_save_human_eval(test_wellFormedAnswers, test_output, test_correct_passages_list, "generator/generator_results/vlad_generator/correct_passages/beam/correct_vlad_beam_analysis.json", test_query_passages_answer_list)

0.1584504368388216 0.5376603241304027 0.22338830818959646


Here beam is working better than greedy for correct passages. By default, the model is returning long answers, beam search forcing it to return shorter answers

Hence use a t5 small model as generator and finetune it on our training data.

In [None]:
from transformers import AutoTokenizer

if False:

  model_checkpoint = "t5-small"
  tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, max_target_length = 256, model_max_length=2000)

Creating datasets

In [None]:
def preprocess_data(query_passage_details):
  inputs = []
  labels = []

  for ind in range(len(query_passage_details)):
    query = query_passage_details[ind]['query']
    passages = query_passage_details[ind]['passages']
    conditioned_doc = "<P> " + " <P> ".join([passage for passage in passages])
    query_and_docs = "question: {} context: {}".format(query, conditioned_doc)
    inputs.append(query_and_docs)
    labels.append(query_passage_details[ind]['wellFormedAnswer'])

  return (inputs, labels)

In [None]:
import pickle

if False:
    with open("dataset/train_query_passages_answers_list", "rb") as fp:   # Unpickling
        train_query_passages_answers_list = pickle.load(fp)

    train_inputs_labels = preprocess_data(train_query_passages_answers_list)
    train_input_encodings = tokenizer(train_inputs_labels[0], padding=True, truncation=True)
    train_label_encodings = tokenizer(train_inputs_labels[1], padding=True, truncation=True)

In [None]:
import pickle

if False:
    with open("dataset/val_query_passages_answers_list", "rb") as fp:   # Unpickling
        val_query_passages_answers_list = pickle.load(fp)

    val_inputs_labels = preprocess_data(val_query_passages_answers_list)
    val_input_encodings = tokenizer(val_inputs_labels[0], padding=True, truncation=True)
    val_label_encodings = tokenizer(val_inputs_labels[1], padding=True, truncation=True)

In [None]:
import pickle

if False:
    with open("dataset/test_query_passages_answers_list", "rb") as fp:   # Unpickling
        test_query_passages_answers_list = pickle.load(fp)

    test_inputs_labels = preprocess_data(test_query_passages_answers_list)
    test_input_encodings = tokenizer(test_inputs_labels[0], padding=True)
    test_label_encodings = tokenizer(test_inputs_labels[1], padding=True)

In [None]:
import torch

class QADataset(torch.utils.data.Dataset):
    def __init__(self, input_encodings, label_encodings):
        self.input_encodings = input_encodings
        self.label_encodings = label_encodings

    def __getitem__(self, idx):
        item = {
            'input_ids':  torch.tensor(self.input_encodings['input_ids'][idx]),
            'attention_mask': torch.tensor(self.input_encodings['attention_mask'][idx]),
            'labels': torch.tensor(self.label_encodings['input_ids'][idx])
        }
        return item

    def __len__(self):
        return len(self.label_encodings['input_ids'])

In [None]:
if False:
    train_dataset = QADataset(train_input_encodings, train_label_encodings)
    val_dataset = QADataset(val_input_encodings, val_label_encodings)
    test_dataset = QADataset(test_input_encodings, test_label_encodings)

In [None]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [None]:
if False:
    batch_size = 3 # No processing happens for more than this size
    model_name = "t5-small-marco"
    model_dir = f"Models/{model_name}"

    # hyperparameters = [{
    #     'lr': 4e-5,
    #     'epochs': 1,
    #     'decay': 0.01
    # }] # Not being used. Just test if training is working

    args = Seq2SeqTrainingArguments(
        model_dir,
        evaluation_strategy="steps",
        eval_steps=100,
        logging_strategy="steps",
        logging_steps=100,
        save_strategy="steps",
        save_steps=200,
        learning_rate=4e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        weight_decay=0.01,
        num_train_epochs=1,
        predict_with_generate=True,
        fp16=True,
        load_best_model_at_end=True,
        metric_for_best_model="rougel",
        resume_from_checkpoint="t5-base-medium-title-generation/checkpoint-800"
    )

In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer)

In [None]:
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    f1_list = []

    # Compute ROUGE scores
    for ind in range(len(decoded_preds)):
        score = scorer.score(decoded_labels[ind], decoded_preds[ind])
        f1_list.append(score['rougeL'].fmeasure)
    
    f1 = sum(f1_list) / len(f1_list)
    return {"rougel": f1, "length_preds": len(f1_list)}

In [None]:
# Function that returns an untrained model to be trained

def model_init():
    return AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

if False:

    trainer = Seq2SeqTrainer(
        model_init=model_init,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

In [None]:
if False:
    trainer.train()
    trainer.save_model("generator/finetuned_models/t5-small-marco/best_model")

Remember this model is saved in location above -- Check out its performance.

In [None]:
clear_cache(model)

In [None]:
model_checkpoint = f"generator/finetuned_models/t5-small-marco/best_model"

new_t5 = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, max_target_length = 256, model_max_length=2000)

In [None]:
import pickle

queries_and_docs = []

for ind in range(len(test_query_passages_answer_list)):
  queries_and_docs.append("question: {} context: {}".format(test_queries[ind], test_correct_passages_list[ind]))

In [None]:
model_input = tokenizer(queries_and_docs[0], truncation=True, padding=True, return_tensors="pt")
generated_answers_encoded = new_t5.generate(input_ids=model_input["input_ids"].to(device))
print(tokenizer.batch_decode(generated_answers_encoded, skip_special_tokens=True,clean_up_tokenization_spaces=True, max_length=256))
print(test_wellFormedAnswers[0])


['The population of albany, Minnesota is 2,662 people.']
The population of Albany, Minnesota is 2,662. 


Seems to be workig well. Ru for etire test set.

In [None]:
if False:

    queries_and_docs = []

    random.seed(0)

    for ind in range(len(query_details_passages)):
      query = query_details_passages[ind]['query']
      correct_passages = query_details_passages[ind]['passages']

      queries_and_docs.append("question: {} context: {}".format(query, correct_passages))


    test_output = []

    for i in range(int(len(queries_and_docs)/5)):
        print(i)
        model_input = tokenizer(queries_and_docs[(5*i):(5*i+5)], truncation=True, padding=True, return_tensors="pt")
        generated_answers_encoded = new_t5.generate(input_ids=model_input["input_ids"].to(device), max_length=256, do_sample=False, early_stopping=True, num_beams=8)
        test_output.extend(tokenizer.batch_decode(generated_answers_encoded, skip_special_tokens=True,clean_up_tokenization_spaces=True))

In [None]:
import pickle

if False:
    with open("generator/generator_results/t5_generator/correct_passages/greedy/correct_t5_greedy_answers", "wb") as fp:   # Unpickling
        pickle.dump(test_output, fp)

In [12]:
import pickle
with open("generator/generator_results/t5_generator/correct_passages/greedy/correct_t5_greedy_answers", "rb") as fp:   # Unpickling
    test_output = pickle.load(fp)

find_average_rouge_save_human_eval(test_wellFormedAnswers, test_output, test_correct_passages_list, "generator/generator_results/t5_generator/correct_passages/greedy/correct_t5_greedy_analysis.json", test_query_passages_answer_list)

0.5398611826569814 0.5419433090341561 0.512867106286264


Also, check what t5-small does to check if your model actually achieves somethig.

In [None]:
model_checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, max_target_length = 256, model_max_length=2000)
t5_small = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to(device)

input = "question: {} context: {}".format(test_queries[0], test_correct_passages_list[0])

model_input = tokenizer(input, truncation=True, padding=True, return_tensors="pt")
generated_answers_encoded = t5_small.generate(input_ids=model_input["input_ids"].to(device))
print(tokenizer.batch_decode(generated_answers_encoded, skip_special_tokens=True, clean_up_tokenization_spaces=True, max_length=256))
print(test_wellFormedAnswers[0])

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

['as per 2017 US Census estimate, has a community population of 2,662 people']
The population of Albany, Minnesota is 2,662. 


In [None]:
clear_cache(t5_small)

So we see that training helps. This model has achieved pretty good results with correct passages - 0.51 rouge score - Hece our comparison metric is 0.5398611826569814 0.5419433090341561 0.512867106286264. Geerator is good now. Lets also try the same geerator with beam search decoding.

In [None]:
if False:
  test_output = []

  for i in range(int(len(queries_and_docs)/5)):
      print(i)
      model_input = tokenizer(queries_and_docs[(5*i):(5*i+5)], truncation=True, padding=True, return_tensors="pt")
      generated_answers_encoded = new_t5.generate(input_ids=model_input["input_ids"].to(device), max_length=256, do_sample=False, early_stopping=True, num_beams=8)
      test_output.extend(tokenizer.batch_decode(generated_answers_encoded, skip_special_tokens=True,clean_up_tokenization_spaces=True))

In [None]:
import pickle

if False:
  with open("generator/generator_results/t5_generator/correct_passages/beam/correct_t5_beam_answers", "wb") as fp:   # Unpickling
      pickle.dump(test_output, fp)

In [None]:
with open("generator/generator_results/t5_generator/correct_passages/beam/correct_t5_beam_answers", "rb") as fp:   # Unpickling
    test_output = pickle.load(fp)

find_average_rouge_save_human_eval(test_wellFormedAnswers, test_output, test_correct_passages_list, "generator/generator_results/t5_generator/correct_passages/beam/correct_t5_beam_analysis.json", test_query_passages_answer_list)

0.5193452158417611 0.5422452700886792 0.5067625466398057


Rouge F1 decreases a little. So, keepig the decoder constat at greedy.




Finding numbers for the same for bm25, dpr and random.


In [None]:
queries_and_docs = []

for ind in range(len(test_query_passages_answer_list)):
  queries_and_docs.append("question: {} context: {}".format(test_queries[ind], test_bm_25_passages_list[ind]))

test_output = []

for i in range(int(len(queries_and_docs)/5)):
    print(i)
    model_input = tokenizer(queries_and_docs[(5*i):(5*i+5)], truncation=True, padding=True, return_tensors="pt")
    generated_answers_encoded = new_t5.generate(input_ids=model_input["input_ids"].to(device), max_length=256, do_sample=False, early_stopping=True, num_beams=8)
    test_output.extend(tokenizer.batch_decode(generated_answers_encoded, skip_special_tokens=True,clean_up_tokenization_spaces=True))

with open("generator/generator_results/t5_generator/bm25_top10_passages/bm25_t5_beam_answers", "wb") as fp:   # Unpickling
    pickle.dump(test_output, fp)

In [None]:
with open("generator/generator_results/t5_generator/bm25_top10_passages/bm25_t5_beam_answers", "rb") as fp:   # Unpickling
    test_output = pickle.load(fp)

find_average_rouge_save_human_eval(test_wellFormedAnswers, test_output, test_bm_25_passages_list, "generator/generator_results/t5_generator/bm25_top10_passages/bm25_t5_beam_analysis.json", test_query_passages_answer_list)

0.45445431845862755 0.476637614577084 0.4408452929768457


Now working on dpr passages

In [None]:
import random


if False:

    queries_and_docs = []

    for ind in range(len(test_query_passages_answer_list)):
      queries_and_docs.append("question: {} context: {}".format(test_queries[ind], test_dpr_passages_list[ind]))

    test_output = []

    for i in range(int(len(queries_and_docs)/5)):
        print(i)
        model_input = tokenizer(queries_and_docs[(5*i):(5*i+5)], truncation=True, padding=True, return_tensors="pt")
        generated_answers_encoded = new_t5.generate(input_ids=model_input["input_ids"].to(device), max_length=256, do_sample=False, early_stopping=True, num_beams=8)
        test_output.extend(tokenizer.batch_decode(generated_answers_encoded, skip_special_tokens=True,clean_up_tokenization_spaces=True))

    with open("generator/generator_results/t5_generator/dpr_top10_passages/dpr_t5_beam_answers", "wb") as fp:   # pickling
      pickle.dump(test_output, fp)

In [None]:
with open("generator/generator_results/t5_generator/dpr_top10_passages/dpr_t5_beam_answers", "rb") as fp:   # pickling
    test_output = pickle.load(fp)

find_average_rouge_save_human_eval(test_wellFormedAnswers, test_output, test_dpr_passages_list, "generator/generator_results/t5_generator/dpr_top10_passages/dpr_t5_beam_analysis.json", test_query_passages_answer_list)

0.5002062468783519 0.513547558070691 0.47993025933401773


Now working on random passages

In [None]:
import random

if False:
    queries_and_docs = []

    for ind in range(len(test_query_passages_answer_list)):
      queries_and_docs.append("question: {} context: {}".format(test_queries[ind], random_passages_list[ind]))

    test_output = []

    for i in range(int(len(queries_and_docs)/5)):
        print(i)
        model_input = tokenizer(queries_and_docs[(5*i):(5*i+5)], truncation=True, padding=True, return_tensors="pt")
        generated_answers_encoded = new_t5.generate(input_ids=model_input["input_ids"].to(device), max_length=256, do_sample=False, early_stopping=True, num_beams=8)
        test_output.extend(tokenizer.batch_decode(generated_answers_encoded, skip_special_tokens=True,clean_up_tokenization_spaces=True))

    with open("generator/generator_results/t5_generator/random_passages/random_t5_beam_answers", "wb") as fp:   # pickling
      pickle.dump(test_output, fp)

In [None]:
with open("generator/generator_results/t5_generator/random_passages/random_t5_beam_answers", "rb") as fp:   # pickling
    test_output = pickle.load(fp)

find_average_rouge_save_human_eval(test_wellFormedAnswers, test_output, random_passages_list, "generator/generator_results/t5_generator/random_passages/random_t5_beam_analysis.json", test_query_passages_answer_list)

0.3404436502603323 0.3161419398080305 0.30356307248627595


Do human evaluation. The focus on making retrieval better.