In [1]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM 
from transformers import AutoTokenizer 
from transformers import GenerationConfig 


In [2]:
#TASK: Text summarization.
#Dataset loading 

dataset_name = 'knkarthick/dialogsum'
dataset = load_dataset(dataset_name)

In [3]:
dataset['train']

Dataset({
    features: ['id', 'dialogue', 'summary', 'topic'],
    num_rows: 12460
})

In [4]:
dataset['validation']

Dataset({
    features: ['id', 'dialogue', 'summary', 'topic'],
    num_rows: 500
})

In [5]:
dataset['test']

Dataset({
    features: ['id', 'dialogue', 'summary', 'topic'],
    num_rows: 1500
})

In [6]:
dash_line = '-'.join(' ' for x in range(50)) #Create 50 spaces joined by '-' character.
dash_line

example_idxs = [0, 2]
for i, idx in enumerate(example_idxs):
    print(f'Example {1+i}')
    print('Dialogue \n')
    print(dash_line)
    print(dataset['test'][idx]['dialogue'])
    print(dash_line)
    print('Summary \n')
    print(dash_line)
    print(dataset['test'][idx]['summary'])
    print(dash_line)

Example 1
Dialogue 

 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
#Person1#: Ms. Dawson, I need you to take a dictation for me.
#Person2#: Yes, sir...
#Person1#: This should go out as an intra-office memorandum to all employees by this afternoon. Are you ready?
#Person2#: Yes, sir. Go ahead.
#Person1#: Attention all staff... Effective immediately, all office communications are restricted to email correspondence and official memos. The use of Instant Message programs by employees during working hours is strictly prohibited.
#Person2#: Sir, does this apply to intra-office communications only? Or will it also restrict external communications?
#Person1#: It should apply to all communications, not only in this office between employees, but also any outside communications.
#Person2#: But sir, many employees use Instant Messaging to communicate with their clients.
#Person1#: They will just have to change their communication methods. I don

In [7]:
checkpoint = 'google/flan-t5-base'
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)


In [8]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint, usefast=True)


In [9]:
sentence = 'Hello, how are you?'
tokenized_sentence = tokenizer(sentence, return_tensors='pt')
print(tokenized_sentence['input_ids'][0])
tokenizer.decode(tokenized_sentence['input_ids'][0], skip_special_tokens=True)

tensor([8774,    6,  149,   33,   25,   58,    1])


'Hello, how are you?'

In [10]:
sentence = 'Hello world'
tokenizer.tokenize(sentence)
print(sentence)
batch = dataset['test'][:4]
encoded_sentence = tokenizer(batch['summary'], return_tensors='pt', truncation=True, padding=True) 
# For handling a batch returning tensors give errors
# Truncation and padding solve this problem
# One possible solution to handle batches is to use the DataCollator or tokenizer itself, which handles by itself the 
# padding and the conversion to tensors. Used later for model fine tuning in lazy-programmer course

print(encoded_sentence['input_ids'][0]) # See the first sentence
decoded_sentence = tokenizer.decode(encoded_sentence['input_ids'][2], skip_special_tokens=True)
print(decoded_sentence) # See the first decoded sentence.

Hello world
tensor([  283,     7,     5, 31676,  1691,  1713,   345, 13515,   536,  4663,
           12,  1431,     3,     9, 22986,    12,  3261,   334,  3490,    24,
           79,    43,    12,   483,     8,  1901,  1573,    11,   225,    59,
          169, 18882,  6598,  5855,  7595,     5,     1,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0])
Ms. Dawson takes a dictation for #Person1# about prohibiting the use of Instant Message programs in the office. They argue about its reasonability but #Person1# still insists.


In [11]:
example_idxs = [40,41]

for i, idx in enumerate(example_idxs):
    tokenized_dialogue = tokenizer(dataset['test'][idx]['dialogue'], return_tensors='pt')
    #print(tokenized_dialogue['input_ids'][0])
    decoded_summary = tokenizer.decode(
        model.generate(tokenized_dialogue['input_ids'], max_new_tokens=50)[0],
        skip_special_tokens=True)  
    # This handles a batch of data (tokenized_dialogue['input_ids'])
    # This also returns a tuple and we have to take the [0]th element to obtain the summary to decode
    

    print('Dialogue: ', dataset['test'][idx]['dialogue'], '\n')
    print('Decoded summary: ',decoded_summary, '\n')
    print('Target summary: ', dataset['test'][idx]['summary'], '\n')

Dialogue:  #Person1#: What time is it, Tom?
#Person2#: Just a minute. It's ten to nine by my watch.
#Person1#: Is it? I had no idea it was so late. I must be off now.
#Person2#: What's the hurry?
#Person1#: I must catch the nine-thirty train.
#Person2#: You've plenty of time yet. The railway station is very close. It won't take more than twenty minutes to get there. 

Decoded summary:  Person1: It's ten to nine. 

Target summary:  #Person1# is in a hurry to catch a train. Tom tells #Person1# there is plenty of time. 

Dialogue:  #Person1#: What time is it, Tom?
#Person2#: Just a minute. It's ten to nine by my watch.
#Person1#: Is it? I had no idea it was so late. I must be off now.
#Person2#: What's the hurry?
#Person1#: I must catch the nine-thirty train.
#Person2#: You've plenty of time yet. The railway station is very close. It won't take more than twenty minutes to get there. 

Decoded summary:  Person1: It's ten to nine. 

Target summary:  #Person1# is rushing to catch a train but

In [12]:
example_idxs = [40,41]

for i, idx in enumerate(example_idxs):
    dialogue = dataset['test'][idx]['dialogue']
    summary = dataset['test'][idx]['summary']
    prompt = f"""
Summarize the following conversation.
{dialogue}

Summary:
    """
    tokenized_dialogue = tokenizer(prompt, return_tensors='pt')
    m_summary = tokenizer.decode(
        model.generate(tokenized_dialogue['input_ids'], max_new_tokens=50)[0],
        skip_special_tokens = True
        )
    print('Prompt: ', prompt, '\n')
    
    print('Target summary: ', summary, '\n')

    print('Model summary: ', m_summary, '\n')


Prompt:  
Summarize the following conversation.
#Person1#: What time is it, Tom?
#Person2#: Just a minute. It's ten to nine by my watch.
#Person1#: Is it? I had no idea it was so late. I must be off now.
#Person2#: What's the hurry?
#Person1#: I must catch the nine-thirty train.
#Person2#: You've plenty of time yet. The railway station is very close. It won't take more than twenty minutes to get there.

Summary:
     

Target summary:  #Person1# is in a hurry to catch a train. Tom tells #Person1# there is plenty of time. 

Model summary:  The train is about to leave. 

Prompt:  
Summarize the following conversation.
#Person1#: What time is it, Tom?
#Person2#: Just a minute. It's ten to nine by my watch.
#Person1#: Is it? I had no idea it was so late. I must be off now.
#Person2#: What's the hurry?
#Person1#: I must catch the nine-thirty train.
#Person2#: You've plenty of time yet. The railway station is very close. It won't take more than twenty minutes to get there.

Summary:
     

T

In [13]:
example_idxs = [40,41]

for i, idx in enumerate(example_idxs):
    dialogue = dataset['test'][idx]['dialogue']
    summary = dataset['test'][idx]['summary']
    prompt = f"""
Dialogue:
{dialogue}

What is going on?
    """
    tokenized_dialogue = tokenizer(prompt, return_tensors='pt')
    m_summary = tokenizer.decode(
        model.generate(tokenized_dialogue['input_ids'], max_new_tokens=50)[0],
        skip_special_tokens = True
        )
    print('Prompt: ', prompt, '\n')
    
    print('Target summary: ', summary, '\n')

    print('Model summary: ', m_summary, '\n')

Prompt:  
Dialogue:
#Person1#: What time is it, Tom?
#Person2#: Just a minute. It's ten to nine by my watch.
#Person1#: Is it? I had no idea it was so late. I must be off now.
#Person2#: What's the hurry?
#Person1#: I must catch the nine-thirty train.
#Person2#: You've plenty of time yet. The railway station is very close. It won't take more than twenty minutes to get there.

What is going on?
     

Target summary:  #Person1# is in a hurry to catch a train. Tom tells #Person1# there is plenty of time. 

Model summary:  Tom is late for the train. 

Prompt:  
Dialogue:
#Person1#: What time is it, Tom?
#Person2#: Just a minute. It's ten to nine by my watch.
#Person1#: Is it? I had no idea it was so late. I must be off now.
#Person2#: What's the hurry?
#Person1#: I must catch the nine-thirty train.
#Person2#: You've plenty of time yet. The railway station is very close. It won't take more than twenty minutes to get there.

What is going on?
     

Target summary:  #Person1# is rushing to 

In [14]:
def make_prompt(all_indices, index):
    prompt = ''
    for i, idx in enumerate(all_indices):
        dialogue = dataset['test'][idx]['dialogue']
        summary = dataset['test'][idx]['summary']
        prompt+= f"""
                Dialogue: {dialogue}

                What's going on? {summary}
        """
    selected_dialogue = dataset['test'][index]['dialogue']
    prompt+=f"""
                Dialogue: {selected_dialogue}

                What's going on?
        """
    
    return prompt

In [15]:
#ONE SHOT EXAMPLE 

example_idxs = [45]
selected_index = 46
one_shot_prompt = make_prompt(example_idxs, selected_index)
print(one_shot_prompt)


                Dialogue: #Person1#: Would you like to go to the party tonight?
#Person2#: Whose party?
#Person1#: Ruojia's. Don't you know that? Ruojia has got married.
#Person2#: What! Is she really? I can't believe it!
#Person1#: Yes. Yesterday.
#Person2#: Good gracious. That's incredible! I feel so happy for her!
#Person1#: Yes, me too.
#Person2#: But how do you know that?
#Person1#: I saw the news from her twitter. And she sent an email about it.
#Person2#: What? I didn't receive it!
#Person1#: Maybe you should check your email.
#Person2#: Oh yes, I find it. Tonight at her home. Will you bring something?
#Person1#: Yes, a pair of wineglasses and a card to wish her happy marriage.
#Person2#: I will buy a tea set.

                What's going on? #Person1# tells #Person2# that Ruojia is married and will have a party tonight. #Person2#'s surprised to know that. They will bring their gifts to bless her.
        
                Dialogue: #Person1#: Would you like to go to the party 

In [16]:
tokenized_input = tokenizer(one_shot_prompt, return_tensors='pt')
generated_summary = tokenizer.decode(
    model.generate(tokenized_input['input_ids'], max_new_tokens=50)[0],
      skip_special_tokens=True)
print('Generated_summary: ', generated_summary)
print('Target_summary: ', dataset['test'][selected_index]['summary'])

Token indices sequence length is longer than the specified maximum sequence length for this model (549 > 512). Running this sequence through the model will result in indexing errors


Generated_summary:  #Person1 tells #Person2# that Ruojia is married and will have a party tonight. #Person1 tells #Person2# that Ruojia is married and will have a party tonight.
Target_summary:  #Person2# is surprised to know from #Person1# that Ruojia is married. Then #Person2# finds Ruojia has sent an email about it. They will go to Ruojia's party and give their presents to her.


In [18]:
example_idxs = [45, 46, 47]
selected_index = 48
few_shot_prompt = make_prompt(example_idxs, selected_index)
print(few_shot_prompt)


                Dialogue: #Person1#: Would you like to go to the party tonight?
#Person2#: Whose party?
#Person1#: Ruojia's. Don't you know that? Ruojia has got married.
#Person2#: What! Is she really? I can't believe it!
#Person1#: Yes. Yesterday.
#Person2#: Good gracious. That's incredible! I feel so happy for her!
#Person1#: Yes, me too.
#Person2#: But how do you know that?
#Person1#: I saw the news from her twitter. And she sent an email about it.
#Person2#: What? I didn't receive it!
#Person1#: Maybe you should check your email.
#Person2#: Oh yes, I find it. Tonight at her home. Will you bring something?
#Person1#: Yes, a pair of wineglasses and a card to wish her happy marriage.
#Person2#: I will buy a tea set.

                What's going on? #Person1# tells #Person2# that Ruojia is married and will have a party tonight. #Person2#'s surprised to know that. They will bring their gifts to bless her.
        
                Dialogue: #Person1#: Would you like to go to the party 

In [19]:
tokenized_input = tokenizer(few_shot_prompt, return_tensors='pt')
generated_summary = tokenizer.decode(
    model.generate(tokenized_input['input_ids'], max_new_tokens=50)[0],
      skip_special_tokens=True)
print('Generated_summary: ', generated_summary)
print('Target_summary: ', dataset['test'][selected_index]['summary'])

Generated_summary:  #Person1: You're a good friend.
Target_summary:  #Person2# at first thinks #Person1#'s behaviour cruel but finally joins #Person1#.


In [23]:
generation_config = GenerationConfig(max_new_tokens=70, do_sample=True, temperature=0.2, top_p=0.5)

tokenized_input = tokenizer(few_shot_prompt, return_tensors='pt')
generated_summary = tokenizer.decode(
    model.generate(tokenized_input['input_ids'], generation_config=generation_config)[0],
      skip_special_tokens=True)
print('Generated_summary: ', generated_summary)
print('Target_summary: ', dataset['test'][selected_index]['summary'])


Generated_summary:  #Person1: You're a good friend.
Target_summary:  #Person2# at first thinks #Person1#'s behaviour cruel but finally joins #Person1#.
