In [2]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig

In [3]:
huggingface_dataset_name = "knkarthick/dialogsum"
dataset = load_dataset(huggingface_dataset_name)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 12460/12460 [00:00<00:00, 44204.94 examples/s]
Generating validation split: 100%|██████████| 500/500 [00:00<00:00, 16718.63 examples/s]
Generating test split: 100%|██████████| 1500/1500 [00:00<00:00, 34870.01 examples/s]


In [7]:
example_indices = [40, 200]

dash_line = "-".join(" " for c in range(100))
for element, index in enumerate(example_indices):
    print(dash_line)
    print(f"Example {index}: ")
    print(dash_line)
    print("INPUT DIALOGUE: ")
    print(dataset["test"][index]["dialogue"])
    print(dash_line)
    print("BASELINE HUMAN SUMMARY:")
    print(dataset["test"][index]["summary"])
    print(dash_line)

 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Example 40: 
 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
INPUT DIALOGUE: 
#Person1#: What time is it, Tom?
#Person2#: Just a minute. It's ten to nine by my watch.
#Person1#: Is it? I had no idea it was so late. I must be off now.
#Person2#: What's the hurry?
#Person1#: I must catch the nine-thirty train.
#Person2#: You've plenty of time yet. The railway station is very close. It won't take more than twenty minutes to get there.
 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
BASELINE HUM

In [8]:
model_name = "google/flan-t5-base"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

In [12]:
sentence = "What time is it, Tom?"

sentence_encoded = tokenizer(sentence, return_tensors="pt")
sentence_decoded = tokenizer.decode(
    sentence_encoded["input_ids"][0], 
    skip_special_token=True,
    )

print("ENCODED SENTENCE: ")
print(sentence_encoded["input_ids"][0])
print("DECODED SENTENCE:")
print(sentence_decoded)

ENCODED SENTENCE: 
tensor([ 363,   97,   19,   34,    6, 3059,   58,    1])
DECODED SENTENCE:
What time is it, Tom?</s>


In [13]:
for i, index in enumerate(example_indices):
    dialogue = dataset["test"][index]["dialogue"]
    summary = dataset["test"][index]["summary"]

    inputs = tokenizer(dialogue, return_tensors="pt")
    output = tokenizer.decode(
        model.generate(
            inputs["input_ids"],
            max_new_tokens=50,
        )[0],
        skip_special_tokens=True
    )

    print(dash_line)
    print(f"Example {i}")
    print(dash_line)
    print(f"INPUT PROMPT: \n{dialogue}")
    print(dash_line)
    print(f"BASELINE HUMAN SUMMARY:\n{summary}")
    print(dash_line)
    print(f"MODEL GENERATION - WITHOUT MODEL ENGINEERING:\n{output}\n")

 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Example 0
 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
INPUT PROMPT: 
#Person1#: What time is it, Tom?
#Person2#: Just a minute. It's ten to nine by my watch.
#Person1#: Is it? I had no idea it was so late. I must be off now.
#Person2#: What's the hurry?
#Person1#: I must catch the nine-thirty train.
#Person2#: You've plenty of time yet. The railway station is very close. It won't take more than twenty minutes to get there.
 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
BASELINE HUMAN SU