<a href="https://colab.research.google.com/github/bronte-baer/Folder-Structure-Conventions/blob/master/code/inference/BART_pt_quac_inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Inference: BART trained on SQuAD for QuAC

In [None]:
import os
import re
import numpy as np
import pandas as pd
import json

from matplotlib import pyplot as plt
from pprint import pprint

from google.colab import data_table
data_table.enable_dataframe_formatter()

In [None]:
# Authenticate and mount Google Drive inside the Colab runtime.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


### Preprocess QuAC data

**Do not run this section.**

(You do not need to run it again: the CSV files containing the training and validation pairs have already been created.)

In [None]:
# File locations and constants for the QuAC preprocessing step.

dataset_name = "quac"
dataset_root = "/content/drive/MyDrive/w266 NLP Final Project/Data/"
dataset_folder = dataset_root + dataset_name + "/"

# Source JSON files read by the preprocessing cells.
training_file_source = dataset_folder + dataset_name + "_train.squad.json"
validation_file_source = dataset_folder + dataset_name + "_val.squad.json"

# CSV files that the preprocessing cells write the input/target pairs to.
training_file = f"{dataset_folder}bart_train_pairs.csv"
validation_file = f"{dataset_folder}bart_valid_pairs.csv"

In [None]:
# Read the training source JSON and keep only its 'data' column.
train_quac = pd.read_json(training_file_source)['data']

In [None]:
# Remove the trailing " CANNOTANSWER" marker from each context.
# str.rstrip(" CANNOTANSWER") treats its argument as a *set of characters*, so it would also
# eat legitimate trailing text made of those letters (e.g. "... joined NASA" -> "... joined").
# Anchoring the exact suffix at the end of the string removes only the marker itself.
training_context = [
    re.sub(r" CANNOTANSWER\Z", "", sample['paragraphs'][0]['context'])
    for sample in train_quac
]

In [None]:
# Take the text of the first answer to the first question of each sample.
# The inner generator pulls out the answers list (or passes a falsy sample through unchanged);
# the outer comprehension then picks the first answer's text. A falsy answers value is kept
# as-is so that those rows can be filtered out later.
training_answers = [
    answers[0]['text'] if answers else answers
    for answers in (
        sample['paragraphs'][0]['qas'][0]['answers'] if sample else sample
        for sample in train_quac
    )
]

In [None]:
# First question of the first paragraph of every training sample.
training_questions = [
    entry['paragraphs'][0]['qas'][0]['question']
    for entry in train_quac
]

In [None]:
# Build the validation lists the same way as the training ones.
validation_quac = pd.read_json(validation_file_source)
validation_quac = validation_quac['data']

# Remove only the exact trailing " CANNOTANSWER" marker. str.rstrip(" CANNOTANSWER") strips a
# *set of characters* and would also eat real trailing text such as " NASA".
validation_context = [
    re.sub(r" CANNOTANSWER\Z", "", sample['paragraphs'][0]['context'])
    for sample in validation_quac
]

# First answer text per sample; falsy values are passed through so they can be filtered later.
validation_answers = [sample['paragraphs'][0]['qas'][0]['answers'] if sample else sample for sample in validation_quac]
validation_answers = [sample[0]['text'] if sample else sample for sample in validation_answers]

validation_questions = [sample['paragraphs'][0]['qas'][0]['question'] for sample in validation_quac]

In [None]:
# Zip answers, contexts and questions into (answer, context, question) tuples,
# keeping only the samples whose answer is truthy (i.e. dropping the no-answer rows).
training_triples = [
    triple
    for triple in zip(training_answers, training_context, training_questions)
    if triple[0]
]
validation_triples = [
    triple
    for triple in zip(validation_answers, validation_context, validation_questions)
    if triple[0]
]

In [None]:
# Model input is "<answer> </s> <context>"; the target to generate is the question.
# Each triple is (answer, context, question).
training_orig = [f"{triple[0]} </s> {triple[1]}" for triple in training_triples]
training_target = [triple[2] for triple in training_triples]

validation_orig = [f"{triple[0]} </s> {triple[1]}" for triple in validation_triples]
validation_target = [triple[2] for triple in validation_triples]

In [None]:
# Assemble the training pairs into a two-column frame and display it.
training_df = pd.DataFrame({
    'orig': training_orig,
    'target': training_target,
})
training_df

In [None]:
# Assemble the validation pairs into a two-column frame and display it.
validation_df = pd.DataFrame({
    'orig': validation_orig,
    'target': validation_target,
})
validation_df

In [None]:
# Write each split to its own CSV so that later steps can load one part at a time.
# The frame index is written as well (pandas default), so it comes back as an unnamed first column.
training_df.to_csv(path_or_buf=training_file)
validation_df.to_csv(path_or_buf=validation_file)

## Try inference with QuAC

In [None]:
# Run configuration: adjust for each model and validation dataset combination.

# Model to load and the dataset whose validation pairs are used.
model_name = "bart_base_pt.squad"
validation_dataset_name = "quac"

# Tokenization / generation settings.
max_length = 512  # 1024 for the long model, 512 otherwise
batch_size = 150  # 150 is the norm; dial back when needed

# Optional sub-range of validation rows to process.
# start_sample=None means start at 0; end_sample=None means run to the end of the set.
start_sample = end_sample = None

# Output handling.
save_predictions = True
save_mode = 'a'  # 'w' for write, 'a' for append

### Generate predictions

In [None]:
!pip install -q transformers

[K     |████████████████████████████████| 5.5 MB 4.1 MB/s 
[K     |████████████████████████████████| 163 kB 95.4 MB/s 
[K     |████████████████████████████████| 7.6 MB 52.9 MB/s 
[?25h

In [None]:
!pip install -q sentencepiece

[?25l[K     |▎                               | 10 kB 30.2 MB/s eta 0:00:01[K     |▌                               | 20 kB 4.9 MB/s eta 0:00:01[K     |▊                               | 30 kB 7.1 MB/s eta 0:00:01[K     |█                               | 40 kB 3.3 MB/s eta 0:00:01[K     |█▎                              | 51 kB 3.4 MB/s eta 0:00:01[K     |█▌                              | 61 kB 4.1 MB/s eta 0:00:01[K     |█▉                              | 71 kB 4.3 MB/s eta 0:00:01[K     |██                              | 81 kB 4.5 MB/s eta 0:00:01[K     |██▎                             | 92 kB 5.1 MB/s eta 0:00:01[K     |██▋                             | 102 kB 4.0 MB/s eta 0:00:01[K     |██▉                             | 112 kB 4.0 MB/s eta 0:00:01[K     |███                             | 122 kB 4.0 MB/s eta 0:00:01[K     |███▍                            | 133 kB 4.0 MB/s eta 0:00:01[K     |███▋                            | 143 kB 4.0 MB/s eta 0:00:01[K    

In [None]:
import torch
from transformers import BartTokenizer, BartForConditionalGeneration

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Locations of data, models and prediction outputs on the mounted Drive.

project_root = "/content/drive/MyDrive/w266 NLP Final Project/"
dataset_root = f"{project_root}Data/"
model_root = f"{project_root}Models/"
prediction_folder = f"{project_root}Predictions/"

# Name passed to BartTokenizer.from_pretrained.
tokenizer = "facebook/bart-base"

# Folder passed to BartForConditionalGeneration.from_pretrained.
model_folder = f"{model_root}{model_name}"

# The per-dataset pattern already produces the QuAC path when the name is "quac",
# so one expression covers both the QuAC and the non-QuAC case.
validation_data_file = f"{dataset_root}{validation_dataset_name}/bart_valid_pairs.csv"

prediction_file = f"{prediction_folder}predictions.{model_name}.{validation_dataset_name}.csv"

In [None]:
# Load the validation pairs and preview the first two input/target rows.
validation_df = pd.read_csv(validation_data_file)
validation_df[['orig', 'target']].head(2)

Unnamed: 0,orig,target
0,"In May 1983, she married Nikos Karvelas, a com...",what happened in 1983?
1,in November she gave birth to her daughter Sof...,did they have any children?


In [None]:
# Number of validation rows.
len(validation_df)

5868

In [None]:
# Download the tokenizer by name and load the fine-tuned model from model_folder.
# NOTE: the line that would move the model to the GPU is commented out below, so as written
# the model stays on whatever device from_pretrained places it on.

bart_tokenizer = BartTokenizer.from_pretrained(tokenizer)
bart_model = BartForConditionalGeneration.from_pretrained(model_folder)
# bart_model.to(torch.device('cuda:0'))
# pass

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

In [None]:
# Generate one predicted question per validation row in [start_sample, end_sample), in batches.
predictions = []

# Resolve the open-ended defaults from the configuration cell.
if start_sample is None:
  start_sample = 0

if end_sample is None:
  end_sample = validation_df.shape[0]

# Clamp to the data size so that an over-long end_sample never yields an empty batch
# (the tokenizer cannot encode an empty list).
end_sample = min(end_sample, validation_df.shape[0])

# Send the input tensors to whichever device the model lives on, so this cell works
# unchanged whether or not the model has been moved to the GPU.
model_device = bart_model.device

print(f"Generating predictions from {start_sample} to {end_sample}:")
for start in range(start_sample, end_sample, batch_size):
  to = min(end_sample, start + batch_size)
  batch_texts = validation_df['orig'].iloc[start:to].to_list()
  inputs = bart_tokenizer(batch_texts, return_tensors='pt', max_length=max_length, truncation=True, padding=True)
  # Batches are padded, so pass the attention mask explicitly to keep the padding
  # positions from being attended to.
  output_ids = bart_model.generate(
      inputs['input_ids'].to(model_device),
      attention_mask=inputs['attention_mask'].to(model_device),
      max_length=max_length,
  )
  prediction_batch = bart_tokenizer.batch_decode(output_ids, skip_special_tokens=True)
  predictions.extend(prediction_batch)
  # Progress: running row count, with a line break at each multiple of 1000.
  print(f"{to} ", end="")
  if to % 1000 == 0: print()
print("Predictions generated.")

Generating predictions from 0 to 5868:
150 300 450 600 750 900 1050 1200 1350 1500 1650 1800 1950 2100 2250 2400 2550 2700 2850 3000 
3150 3300 3450 3600 3750 3900 4050 4200 4350 4500 4650 4800 4950 5100 5250 5400 5550 5700 5850 5868 Predictions generated.


In [None]:
# Disabled: per-sample (unbatched) variant of prediction generation, kept for reference.
# It tokenizes and generates one 'orig' string at a time and then stores the results
# in validation_df['prediction'].
# predictions = []
# for input_text in validation_df['orig']:
#   inputs = bart_tokenizer(input_text, return_tensors='pt')
#   output_ids = bart_model.generate(inputs['input_ids'])
#   prediction = "".join([bart_tokenizer.decode(out_ids, skip_special_tokens=True, 
#                                             clean_up_tokenization_spaces=False) for out_ids in output_ids])
#   predictions.append(prediction)

# validation_df['prediction'] = predictions

In [None]:
# Build the results frame for the rows that were just predicted.
# Every column must come from the same [start_sample:end_sample] window. Assigning
# validation_df['target'] directly would align on the index and always pick the *first*
# len(df) targets, which is wrong whenever start_sample is not 0.
orig_slice = validation_df['orig'][start_sample:end_sample]
target_slice = validation_df['target'][start_sample:end_sample]

df = pd.DataFrame()
# 'orig' has the form "<answer> </s> <context>"; the split pieces keep the spaces that
# surrounded the separator.
df['context'] = [text.split('</s>')[1] for text in orig_slice]
df['answer'] = [text.split('</s>')[0] for text in orig_slice]
# df['context'] = [str.split('context: ')[1] for str in validation_df['orig'][start_sample:end_sample]]
# df['answer'] =  [str.split('context: ')[0][26: ] for str in validation_df['orig'][start_sample:end_sample]]
df['target'] = target_slice.to_list()
df['prediction'] = predictions

In [None]:
# Put the columns in presentation order, then preview the first two rows.
column_order = ['context', 'answer', 'target', 'prediction']
df = df[column_order]

df.head(2)

Unnamed: 0,context,answer,target,prediction
0,"In May 1983, she married Nikos Karvelas, a co...","In May 1983, she married Nikos Karvelas, a com...",what happened in 1983?,What was the name of Nikos Karvelas?
1,"In May 1983, she married Nikos Karvelas, a co...",in November she gave birth to her daughter Sof...,did they have any children?,What was the name of Vissi's daughter?


In [None]:
# Number of rows in the results frame.
df.shape[0]

5868

Add new rows to the predictions CSV

(Only needed when predictions are generated in sample subsets; skip this otherwise.)

In [None]:
# Disabled: convert the results frame into a list of row lists for manual CSV appending.
# Create list of lists for rows in dataframe
# new_data = df.values.tolist()

# new_data

In [None]:
# Disabled: append the rows in new_data to the predictions CSV using the csv module.
# import csv

# # Add new rows to csv

# with open(prediction_file, 'a') as file:
#     writer = csv.writer(file)
#     # writer.writerow(student_header)
#     # Use writerows() not writerow()
#     writer.writerows(new_data)

In [None]:
# Disabled: explicit close of the handle named `file` used by the manual CSV append.
# Close file
# file.close()

Save predictions

In [None]:
# Write the results frame to the predictions CSV using the configured save_mode.
if save_predictions:
  # When appending to a file that already has content, skip the header row so the column
  # names are not repeated in the middle of the CSV. A new or empty file still gets one.
  appending_to_existing = (
      save_mode == 'a'
      and os.path.exists(prediction_file)
      and os.path.getsize(prediction_file) > 0
  )
  df.to_csv(prediction_file, mode=save_mode, header=not appending_to_existing)

Unused code

In [None]:
# Disabled: variant of the batched generation loop that moves the input ids to the GPU
# with .cuda() before calling generate().
# predictions = []

# if start_sample is None:
#   start_sample = 0

# if end_sample is None:
#   end_sample = validation_df.shape[0]

# print(f"Generating predictions from {start_sample} to {end_sample}:")
# for start in range (start_sample, end_sample, batch_size):
#   to = min([end_sample, start + batch_size])
#   inputs = bart_tokenizer(validation_df['orig'][start:to].to_list(), return_tensors='pt', max_length=max_length, truncation=True, padding=True)
#   output_ids = bart_model.generate(inputs['input_ids'].cuda(), max_length=max_length)
#   prediction_batch = bart_tokenizer.batch_decode(output_ids, skip_special_tokens=True)
#   predictions.extend(prediction_batch)
#   print (f"{to} ", end="")
#   if to%1000 == 0: print()
# print("Predictions generated.")

Generating predictions from 10000 to 10570:


In [None]:
# Disabled: variant of the results-frame construction that splits 'orig' on the
# 'context: ' marker instead of '</s>'.
# df=pd.DataFrame()
# df['context'] = [str.split('context: ')[1] for str in validation_df['orig'][start_sample:end_sample]]
# df['answer'] =  [str.split('context: ')[0][26: ] for str in validation_df['orig'][start_sample:end_sample]]
# df['target'] = validation_df['target']
# df['prediction'] = predictions