In [1]:
!pip install -q -U transformers
!pip install -q -U datasets
!pip install -q -U evaluate
!pip install -q -U tokenizers

In [2]:
import re
import random
import numpy as np
from scipy.special import softmax

import torch
import transformers
import evaluate
import json
from datasets import Dataset, load_dataset

# For pre-trained T5 model
from transformers import T5Tokenizer, T5ForConditionalGeneration  # this won't import twice, just noting here what's for each model

# For all T5 models
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

# For BLEURT (to load a trained model for evaluation)
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# For style classifier model (also for evaluating the seq2seq model output)
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import TrainingArguments, Trainer

In [3]:
# Mount Google drive.
from google.colab import drive
drive.mount('/content/gdrive')

# Go to working directory for the final project.
%cd /content/gdrive/My Drive/DS266/project

train_file = 'data/v1-3/train.jsonl'
dev_file = 'data/v1-3/dev.jsonl'
test_file = 'data/v1-3/test.jsonl'

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/My Drive/DS266/project


In [4]:
def load_data(file_path):
    """Read a JSONL file and flatten it into (document, question, response) triplets.

    Every non-blank line is parsed as a JSON object with a "document" string
    and a "questions" list. Each question has a "question_text" and a
    "responses" list whose items carry a "response_text".

    Args:
        file_path: Path of the JSONL file to read.

    Returns:
        A list of (document, question_text, response_text) tuples, one per
        response, in the order they appear in the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record is missing one of the expected keys.
    """
    document_question_response = []

    # Open the path we were given. The earlier version always read the global
    # `test_file`, so loading the train/dev splits silently returned test data.
    with open(file_path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            # Skip blank lines instead of unconditionally dropping the last
            # line, which lost a record when the file had no trailing newline.
            if not line:
                continue

            data = json.loads(line)
            document = data["document"]
            for question in data["questions"]:
                question_text = question["question_text"]
                for response in question["responses"]:
                    document_question_response.append(
                        (document, question_text, response["response_text"])
                    )

    return document_question_response

In [5]:
test_triplets = load_data(test_file)

In [6]:
# Look at some examples
for _ in range(3):
    sample = random.choice(test_triplets)
    print(f"Document: {sample[0][:50]}", "\n")
    print(f"Question: {sample[1]}", "\n")
    print(f"Response: {sample[2]}", "\n")

Document: DELAY IN TRANSIT
 
 
 By F. L. WALLACE
 
 Illustra 

Question: Who is Murra Foray and how is she significant to the story? 

Response: Murra Foray is the first counselor of the travelers aid bureau, she looks enigmatic and dangerous. At first, she is doubtful about Cassel’s destination to Tunney 21 and his occupation as a sales engineer. She points out that there are a thousand races, how is Cassal able to have special knowledge of all those different types of customers. 

Later she tells Cassal that his ship has already left in the morning. And no one is sure when the next ship will be coming to Godolph. Murra suggests five years if lucky. Star hopping would also take that much of time since he has only covered one third of the whole distance. Then later Murra realizes that someone has already boarded the ship under Cassal’s name, using Cassal’s identification. Now the stalker’s motive of stealing his wallet becomes clear. Then Murra suggests that he donate to the bureau so 

In [7]:
random.shuffle(test_triplets)

In [8]:
def make_dataset(triplets, seed=None):
    """Turn (document, question, response) triplets into a shuffled Dataset.

    Args:
        triplets: Iterable of 3-tuples (document, question, response).
        seed: Optional seed forwarded to `Dataset.shuffle`. The default
            (None) keeps the original unseeded shuffle; pass an integer to
            make this shuffle repeatable for a given input order.

    Returns:
        A `Dataset` with the columns "documents", "questions" and
        "responses", with its rows shuffled.
    """
    triplets = list(triplets)

    if triplets:
        documents, questions, responses = (list(column) for column in zip(*triplets))
    else:
        # zip(*[]) yields nothing, so the three-way unpacking above would
        # raise ValueError; build an empty dataset with the same columns.
        documents, questions, responses = [], [], []

    dataset = Dataset.from_dict({"documents": documents, "questions": questions, "responses": responses})
    return dataset.shuffle(seed=seed)

# Make the test data set.
test_dataset = make_dataset(test_triplets)


In [10]:
print("First Document: ", test_dataset["documents"][0][:50], "\n")
print("First Question: ", test_dataset["questions"][0], "\n")
print("First Response: ", test_dataset["responses"][0], "\n")

First Document:  



Produced by Greg Weeks, Mary Meehan and the On 

First Question:  Who is Magnan, and what is his role in and relevance to the story? 

First Response:  Magnan is the Terrestrial Ambassador to the Fustians. He is the figurehead of their influence on the Fustian planet, and works closely with Retief, the Terrestrial diplomat who uncovers a plot against the Terrestrials through the course of the story. He is the man who tries to convince Retief to sponsor the Youth Group SCARS in the beginning of the story, and we encounter him at the banquet near the end of the story. As the figurehead, he is responsible for announcing the role of the Terrestrials in funding the Youth Group, which creates an opportunity for Retief to announce the Grocian plot to everyone. Ambassador Magnan eventually joins Retief and Whonk as they leave the event to stop the criminals, but he is thrown into an alley by Whonk and doesn't have an opportunity to help directly. After the issue is dealt w

In [11]:
from transformers import pipeline, AutoTokenizer, BartForConditionalGeneration

# Load the baseline SQuALITY model checkpoint from Pagnoni et al., 2021.
socratic_checkpoint_name = "Salesforce/squality-socratic-books-30M"
socratic_tokenizer = AutoTokenizer.from_pretrained(socratic_checkpoint_name)
socratic_model = BartForConditionalGeneration.from_pretrained(socratic_checkpoint_name)
#pipeline("summarization", model=socratic_checkpoint_name, tokenizer=socratic_checkpoint_name)


In [12]:
MAX_SEQUENCE_LENGTH = 1024

def make_question_document_pairs(batch_triplets):
    """Format each example as a single "<ask&answer> question <qsep> document" string.

    Args:
        batch_triplets: Mapping-like object whose "documents" and "questions"
            entries are parallel sequences of strings.

    Returns:
        A list with one formatted string per (document, question) pair. As
        with `zip`, the result is as long as the shorter of the two columns.
    """
    docs = batch_triplets["documents"]
    asks = batch_triplets["questions"]
    return [f"<ask&answer> {ask} <qsep> {doc}" for doc, ask in zip(docs, asks)]

def preprocess_socratic_batch(batch_triplets, tokenizer):
    """Tokenize a batch of examples into model inputs and target ids.

    The inputs are the question/document strings produced by
    `make_question_document_pairs`; the targets are the raw "responses".
    Both sides are truncated and padded to MAX_SEQUENCE_LENGTH.

    Args:
        batch_triplets: Batch with "documents", "questions" and "responses"
            columns.
        tokenizer: Tokenizer exposing `batch_encode_plus`.

    Returns:
        A dict with "input_ids" (encoded inputs) and "label_ids" (encoded
        responses). Padding positions in "label_ids" are left as the
        tokenizer produced them; no masking is applied here.
    """
    # Both sides share exactly the same encoding options.
    encode_options = dict(
        max_length=MAX_SEQUENCE_LENGTH,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )

    source_texts = make_question_document_pairs(batch_triplets)
    source_encoding = tokenizer.batch_encode_plus(source_texts, **encode_options)
    target_encoding = tokenizer.batch_encode_plus(batch_triplets["responses"], **encode_options)

    return {
        'input_ids': source_encoding['input_ids'],
        'label_ids': target_encoding['input_ids'],
    }

In [13]:
test_encoded = test_dataset.map(
    preprocess_socratic_batch,
    batched=True,
    fn_kwargs={
      'tokenizer': socratic_tokenizer
})

Map:   0%|          | 0/1040 [00:00<?, ? examples/s]

In [14]:
from transformers import logging
logging.set_verbosity(logging.INFO)

def generate_output(model, tokenizer, batch_triplets, batch_size, **kwargs):
    """Generate one decoded output string per example, in mini-batches.

    Args:
        model: Model exposing `generate` (moved to the GPU when one is
            available).
        tokenizer: Tokenizer used to encode the inputs and decode the outputs.
        batch_triplets: Object with "documents" and "questions" columns, as
            expected by `make_question_document_pairs`.
        batch_size: Number of inputs encoded and generated per step; must be
            a positive integer.
        **kwargs: Extra keyword arguments forwarded to `model.generate`.

    Returns:
        A list of decoded strings, one per input, in input order.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    input_sentences = make_question_document_pairs(batch_triplets)

    # Move the model once, not on every batch. Without a GPU, keep the model
    # on whatever device it already lives on instead of crashing.
    if torch.cuda.is_available():
        model = model.cuda()
    device = model.device

    all_outputs = []

    for start_i in range(0, len(input_sentences), batch_size):
        inputs_encoded = tokenizer(
            input_sentences[start_i:start_i + batch_size],
            max_length=MAX_SEQUENCE_LENGTH,
            padding=True,
            truncation=True,
            return_tensors='pt').to(device)

        # Inputs are padded to the longest item in the batch, so pass the
        # tokenizer's attention mask explicitly rather than relying on
        # generate() to infer it. A caller-supplied mask still wins.
        generation_args = dict(kwargs)
        attention_mask = inputs_encoded.get("attention_mask")
        if attention_mask is not None:
            generation_args.setdefault("attention_mask", attention_mask)

        output_ids = model.generate(inputs_encoded['input_ids'], **generation_args)

        generated_sentences = tokenizer.batch_decode(output_ids,
                                                     skip_special_tokens=True,
                                                     clean_up_tokenization_spaces=False)
        all_outputs.extend(generated_sentences)

    return all_outputs

In [18]:
# Check the outputs from the model.

# Decoding below samples (do_sample=True), so seed the RNGs first; otherwise
# every run of this long cell produces different text and different scores.
# NOTE(review): GPU kernels may still introduce small run-to-run differences.
GENERATION_SEED = 42
transformers.set_seed(GENERATION_SEED)

generate_kwargs = {
    "num_beams": 3,
    "do_sample": True,
    "no_repeat_ngram_size": 3,
    "max_length": 512
}
# 16 is the generation batch size.
samples = generate_output(socratic_model, socratic_tokenizer, test_dataset, 16, **generate_kwargs)

In [16]:
print(random.choice(samples))

The members of the space ship Explorer hunt animals to test them for contagion. The people of the ship are in airtight spacesuits and doctors are in green spacesuits. They hunt animals that look like humans to test for the animals’ diseases. They wait while their doctors, in air


In [20]:
# Samples Last Run: 1h 37m 39s
# Save to reimport later.

import os
import pickle

samples_file_path = "outputs/sample_output-test.jsonl-socratic-squality.pkl"

# Serialize BEFORE opening the file. Opening with "wb" truncates it, so if
# `samples` is undefined (fresh kernel, generation cell skipped) the NameError
# must be raised here, while the previously saved outputs are still intact.
serialized_samples = pickle.dumps(samples)

# Create the output directory on first use.
os.makedirs(os.path.dirname(samples_file_path), exist_ok=True)

with open(samples_file_path, "wb") as file:  # "wb" for write binary
    file.write(serialized_samples)

In [28]:
# Reload the saved generations if `samples` is undefined or empty.
#
# NOTE(review): the scores computed further down compare `samples` with
# test_dataset["responses"] position by position. The shuffles that order
# test_dataset are unseeded, so samples reloaded in a new session are only
# aligned with the references if the row order is the same as when they were
# generated -- confirm before trusting scores computed from a reloaded file.

import pickle

# Same path the save cell writes to; repeated so this cell also runs on a
# fresh kernel where the save cell was skipped.
samples_file_path = "outputs/sample_output-test.jsonl-socratic-squality.pkl"

try:
    need_reload = len(samples) == 0
except NameError:
    need_reload = True

if need_reload:
    # pickle.load can execute arbitrary code; only load files written by this
    # notebook.
    with open(samples_file_path, "rb") as file:  # "rb" for read binary
        samples = pickle.load(file)

print(len(samples))

1040


In [30]:
# Calculate ROUGE scores

!pip install -q rouge_score

rouge = evaluate.load('rouge')
predictions = samples
references = test_dataset["responses"]
rouge_results = rouge.compute(predictions=predictions,
                        references=references)

print("ROUGE: ", rouge_results)

Downloading builder script: 0.00B [00:00, ?B/s]

ROUGE:  {'rouge1': np.float64(0.3109285772757844), 'rouge2': np.float64(0.0625451922802101), 'rougeL': np.float64(0.15850135492874273), 'rougeLsum': np.float64(0.17814044477983487)}


In [31]:
# Calculate BLEU scores

bleu = evaluate.load("bleu")
bleu_results = bleu.compute(predictions=predictions, references=references)
print("BLEU: ", bleu_results)

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

BLEU:  {'bleu': 0.036026649487998386, 'precisions': [0.3197619834278181, 0.07399894305980903, 0.01583924272017338, 0.0044947830771417145], 'brevity_penalty': 1.0, 'length_ratio': 1.1670388907341023, 'translation_length': 305693, 'reference_length': 261939}
