In [None]:
# Licensing Information:  You are free to use or extend this project for
# educational purposes provided that (1) you do not distribute or publish
# solutions, (2) you retain this notice, and (3) you provide clear
# attribution to The Georgia Institute o/f Technology, includin/g a link to https://aritter.github.io/CS-7650/

# Attribution Information: 
# This Project was developed at the Georgia Institute of Technology by Ashutosh Baheti (ashutosh.baheti@cc.gatech.edu), 
# borrowing  from the Neural Machine Translation Project (Project 2) 
# of the UC Berkeley NLP course https://cal-cs288.github.io/sp20/

# Project #3: Neural Chatbot

Neural Dialog Model are Sequence-to-Sequence (Seq2Seq) models that produce conversational response given the dialog history. State-of-the-art dialog models are trained on millions of multi-turn conversations. However, in this assignment we will narrow our scope to single turn conversations to make the problem easier.  

In this assignment you will implement,
1. Seq2Seq encoder-decoder model
2. Seq2Seq model with attention mechanism
3. Greedy and Beam search decoding algorithms  

First import libraries required for the implementation

In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import numpy as np
import csv
import random
import re
import os
import unicodedata
import codecs
from io import open
import itertools
import math
import pickle
import statistics

from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import tqdm
import nltk
from google.colab import files

Then we implement some standard util functions that will be useful in the rest of the code.

In [None]:
# General util functions
def make_dir_if_not_exists(directory):
	if not os.path.exists(directory):
		logging.info("Creating new directory: {}".format(directory))
		os.makedirs(directory)

def print_list(l, K=None):
	# If K is given then only print first K
	for i, e in enumerate(l):
		if i == K:
			break
		print(e)
	print()

def remove_multiple_spaces(string):
	return re.sub(r'\s+', ' ', string).strip()

def save_in_pickle(save_object, save_file):
	with open(save_file, "wb") as pickle_out:
		pickle.dump(save_object, pickle_out)

def load_from_pickle(pickle_file):
	with open(pickle_file, "rb") as pickle_in:
		return pickle.load(pickle_in)

def save_in_txt(list_of_strings, save_file):
	with open(save_file, "w") as writer:
		for line in list_of_strings:
			line = line.strip()
			writer.write(f"{line}\n")

def load_from_txt(txt_file):
	with open(txt_file, "r") as reader:
		all_lines = list()
		for line in reader:
			line = line.strip()
			all_lines.append(line)
		return all_lines

Finally we will check if GPU is available and set the device accordingly.

Tip: While debugging use `CPU` and change the runtime type to `GPU` when you are ready to train your models to efficiently use free Colab GPU

In [None]:
print(torch.cuda.is_available())
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")
print("Using device:", device)

True
Using device: cuda


## Dataset

For the dataset we will be using a small sample of single turn input and response pairs from [Cornell Movie Dialog Corpus](https://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html). We filter conversational pairs with sentences > 10 tokens. To reduce your work, we have already created a sample of tokenized, lowercased single turn conversations from Cornell Movie Dialog Corpus. The preprocessed dataset sample is stored in pickle format and can be downloaded from [this link](https://drive.google.com/file/d/1qYdSlDJ89AvgozK3V5tik8Op93zPbG6e/view?usp=sharing). Please download the `processed_CMDC.pkl` file from the link and upload it in colab.

In [None]:
# Loading the pre-processed conversational exchanges (source-target pairs) from pickle data files
all_conversations = load_from_pickle("processed_CMDC.pkl")
# Extract 100 conversations from the end for evaluation and keep the rest for training
eval_conversations = all_conversations[-100:]
all_conversations = all_conversations[:-100]

# Logging data stats
print(f"Number of Training Conversation Pairs = {len(all_conversations)}")
print(f"Number of Evaluation Conversation Pairs = {len(eval_conversations)}")

Number of Training Conversation Pairs = 53065
Number of Evaluation Conversation Pairs = 100


Let's print a couple of conversations to check if they are loaded properly.

In [None]:
print_list(all_conversations, 5)

('there .', 'where ?')
('you have my word . as a gentleman', 'you re sweet .')
('hi .', 'looks like things worked out tonight huh ?')
('have fun tonight ?', 'tons')
('well no . . .', 'then that s all you had to say .')



## Vocabulary

The words in the sentences need to be converted into integer tokens so that the neural model can operate on them. For this purpose, we will create a vocabulary which will convert the input strings into model recognizable integer tokens.

In [None]:
pad_word = "<pad>"
bos_word = "<s>"
eos_word = "</s>"
unk_word = "<unk>"
pad_id = 0
bos_id = 1
eos_id = 2
unk_id = 3
    
def normalize_sentence(s):
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    s = re.sub(r"\s+", r" ", s).strip()
    return s

class Vocabulary:
    def __init__(self):
        self.word_to_id = {pad_word: pad_id, bos_word: bos_id, eos_word:eos_id, unk_word: unk_id}
        self.word_count = {}
        self.id_to_word = {pad_id: pad_word, bos_id: bos_word, eos_id: eos_word, unk_id: unk_word}
        self.num_words = 4
    
    def get_ids_from_sentence(self, sentence):
        sentence = normalize_sentence(sentence)
        sent_ids = [bos_id] + [self.word_to_id[word] if word in self.word_to_id \
                               else unk_id for word in sentence.split()] + \
                               [eos_id]
        return sent_ids
    
    def tokenized_sentence(self, sentence):
        sent_ids = self.get_ids_from_sentence(sentence)
        return [self.id_to_word[word_id] for word_id in sent_ids]

    def decode_sentence_from_ids(self, sent_ids):
        words = list()
        for i, word_id in enumerate(sent_ids):
            if word_id in [bos_id, eos_id, pad_id]:
                # Skip these words
                continue
            else:
                words.append(self.id_to_word[word_id])
        return ' '.join(words)

    def add_words_from_sentence(self, sentence):
        sentence = normalize_sentence(sentence)
        for word in sentence.split():
            if word not in self.word_to_id:
                # add this word to the vocabulary
                self.word_to_id[word] = self.num_words
                self.id_to_word[self.num_words] = word
                self.word_count[word] = 1
                self.num_words += 1
            else:
                # update the word count
                self.word_count[word] += 1

vocab = Vocabulary()
for src, tgt in all_conversations:
    vocab.add_words_from_sentence(src)
    vocab.add_words_from_sentence(tgt)
print(f"Total words in the vocabulary = {vocab.num_words}")

Total words in the vocabulary = 7727


Let's print top 30 vocab words:

In [None]:
print_list(sorted(vocab.word_count.items(), key=lambda item: item[1], reverse=True), 30)

('.', 84255)
('?', 36822)
('you', 25093)
('i', 18946)
('what', 10765)
('s', 10089)
('it', 9668)
('!', 8872)
('the', 8011)
('t', 7411)
('to', 6929)
('a', 6582)
('that', 5992)
('no', 4931)
('me', 4839)
('do', 4745)
('is', 4434)
('don', 3577)
('are', 3503)
('he', 3413)
('yes', 3384)
('m', 3382)
('not', 3252)
('we', 3252)
('know', 3171)
('re', 2965)
('your', 2809)
('this', 2726)
('yeah', 2708)
('in', 2678)



Print a couple of sentences to verify that the vocabulary is working as intended.

In [None]:
for src, tgt in all_conversations[:3]:
    sentence = tgt
    word_tokens = vocab.tokenized_sentence(sentence)
    # Automatically adds bos_id and eos_id before and after sentence ids respectively
    word_ids = vocab.get_ids_from_sentence(sentence)
    print(sentence)
    print(word_tokens)
    print(word_ids)
    print(vocab.decode_sentence_from_ids(word_ids))
    print()

word = "the"
word_id = vocab.word_to_id[word]
print(f"Word = {word}")
print(f"Word ID = {word_id}")
print(f"Word decoded from ID = {vocab.decode_sentence_from_ids([word_id])}")

where ?
['<s>', 'where', '?', '</s>']
[1, 6, 7, 2]
where ?

you re sweet .
['<s>', 'you', 're', 'sweet', '.', '</s>']
[1, 8, 15, 16, 5, 2]
you re sweet .

looks like things worked out tonight huh ?
['<s>', 'looks', 'like', 'things', 'worked', 'out', 'tonight', 'huh', '?', '</s>']
[1, 18, 19, 20, 21, 22, 23, 24, 7, 2]
looks like things worked out tonight huh ?

Word = the
Word ID = 47
Word decoded from ID = the


## Dataset Prepration (5 points)

We will use built-in dataset utilities, `torch.utils.data.Dataset` and `torch.utils.data.DataLoader`, to get batched data readily useful for training.

In [None]:
class SingleTurnMovieDialog_dataset(Dataset):
    """Single-Turn version of Cornell Movie Dialog Cropus dataset."""

    def __init__(self, conversations, vocab, device):
        """
        Args:
            conversations: list of tuple (src_string, tgt_string) 
                         - src_string: String of the source sentence
                         - tgt_string: String of the target sentence
            vocab: Vocabulary object that contains the mapping of 
                    words to indices
            device: cpu or cuda
        """
        self.conversations = conversations
        self.vocab = vocab
        self.device = device

        def encode(src, tgt):
            src_ids = self.vocab.get_ids_from_sentence(src)
            tgt_ids = self.vocab.get_ids_from_sentence(tgt)
            return (src_ids, tgt_ids)

        # We will pre-tokenize the conversations and save in id lists for later use
        self.tokenized_conversations = [encode(src, tgt) for src, tgt in self.conversations]
        
    def __len__(self):
        return len(self.conversations)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        return {"conv_ids":self.tokenized_conversations[idx], "conv":self.conversations[idx]}

def collate_fn(data):
    """Creates mini-batch tensors from the list of tuples (src_seq, trg_seq).
    We should build a custom collate_fn rather than using default collate_fn,
    because merging sequences (including padding) is not supported in default.
    Seqeuences are padded to the maximum length of mini-batch sequences (dynamic padding).
    Args:
        data: list of dicts {"conv_ids":(src_ids, tgt_ids), "conv":(src_str, trg_str)}.
            - src_ids: list of src piece ids; variable length.
            - tgt_ids: list of tgt piece ids; variable length.
            - src_str: String of src
            - tgt_str: String of tgt
    Returns: dict { "conv_ids":     (src_ids, tgt_ids), 
                    "conv":         (src_str, tgt_str), 
                    "conv_tensors": (src_seqs, tgt_seqs)}
            src_seqs: torch tensor of shape (src_padded_length, batch_size).
            trg_seqs: torch tensor of shape (tgt_padded_length, batch_size).
            src_padded_length = length of the longest src sequence from src_ids
            tgt_padded_length = length of the longest tgt sequence from tgt_ids
    """
    # Sort conv_ids based on decreasing order of the src_lengths.
    # This is required for efficient GPU computations.
    src_ids = [torch.LongTensor(e["conv_ids"][0]) for e in data]
    tgt_ids = [torch.LongTensor(e["conv_ids"][1]) for e in data]
    src_str = [e["conv"][0] for e in data]
    tgt_str = [e["conv"][1] for e in data]
    data = list(zip(src_ids, tgt_ids, src_str, tgt_str))
    data.sort(key=lambda x: len(x[0]), reverse=True)
    src_ids, tgt_ids, src_str, tgt_str = zip(*data)


    # Pad the src_ids and tgt_ids using token pad_id to create src_seqs and tgt_seqs
    
    # Implementation tip: You can use the nn.utils.rnn.pad_sequence utility
    # function to combine a list of variable-length sequences with padding.
    
    # YOUR CODE HERE
    ...
    
    src_seqs = pad_sequence(src_ids, padding_value= pad_id )
    tgt_seqs = pad_sequence(tgt_ids, padding_value= pad_id )


    return {"conv_ids":(src_ids, tgt_ids), "conv":(src_str, tgt_str), "conv_tensors":(src_seqs.to(device), tgt_seqs.to(device))}

In [None]:
# Create the DataLoader for all_conversations
dataset = SingleTurnMovieDialog_dataset(all_conversations, vocab, device)

batch_size = 5

data_loader = DataLoader(dataset=dataset, batch_size=batch_size, 
                               shuffle=True, collate_fn=collate_fn)

Let's test a batch of data to make sure everything is working as intended

In [None]:
# Test one batch of training data
first_batch = next(iter(data_loader))
print(f"Testing first training batch of size {len(first_batch['conv'][0])}")
print(f"List of source strings:")
print_list(first_batch["conv"][0])
print(f"Tokenized source ids:")
print_list(first_batch["conv_ids"][0])
print(f"Padded source ids as tensor (shape {first_batch['conv_tensors'][0].size()}):")
print(first_batch["conv_tensors"][0])

Testing first training batch of size 5
List of source strings:
you girls are in some hot water .
you didn t walk out here .
it was you juliet it was you
why we sittin here mama ?
yeah .

Tokenized source ids:
tensor([   1,    8, 1730,  191,   83,  401, 1343,  409,    5,    2])
tensor([   1,    8,  208,  103, 1332,   22,  143,    5,    2])
tensor([   1,   68,   89,    8, 6309,   68,   89,    8,    2])
tensor([   1,   87,  197, 5729,  143, 2936,    7,    2])
tensor([  1, 179,   5,   2])

Padded source ids as tensor (shape torch.Size([10, 5])):
tensor([[   1,    1,    1,    1,    1],
        [   8,    8,   68,   87,  179],
        [1730,  208,   89,  197,    5],
        [ 191,  103,    8, 5729,    2],
        [  83, 1332, 6309,  143,    0],
        [ 401,   22,   68, 2936,    0],
        [1343,  143,   89,    7,    0],
        [ 409,    5,    8,    2,    0],
        [   5,    2,    2,    0,    0],
        [   2,    0,    0,    0,    0]], device='cuda:0')


## Baseline Seq2Seq model (25 points)

With the training `Dataset` and `DataLoader` ready, we can implement our Seq2Seq baseline model. 

The model will consist of
1. Shared embedding layer between encoder and decoder that converts the input sequence of word ids to dense embedding representations
2. Bidirectional GRU encoder that encodes the embedded source sequence into hidden representation
3. GRU decoder that predicts target sequence using final encoder hidden representation

In [None]:
class Seq2seqBaseline(nn.Module):
    def __init__(self, vocab, emb_dim = 300, hidden_dim = 300, num_layers = 2, dropout=0.1):
        super().__init__()

        # Initialize your model's parameters here. To get started, we suggest
        # setting all embedding and hidden dimensions to 300, using encoder and
        # decoder GRUs with 2 layers, and using a dropout rate of 0.1.

        # Implementation tip: To create a bidirectional GRU, you don't need to
        # create two GRU networks. Instead use nn.GRU(..., bidirectional=True).
        
        self.num_words = num_words = vocab.num_words
        self.emb_dim = emb_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        # YOUR CODE HERE
        ...
       # self.droupout = droupout
        #print(vocab)
        self.vocab = vocab
        self.embedding_layer = nn.Embedding(self.num_words, self.emb_dim)
        self.encode_layer = nn.GRU(self.emb_dim, self.hidden_dim, num_layers = self.num_layers, dropout=dropout,bidirectional=True )
        self.decode_layer = nn.GRU(self.emb_dim, self.hidden_dim, num_layers = self.num_layers, dropout=dropout,bidirectional=False )

        self.linear_layer = nn.Linear(self.hidden_dim, self.num_words)


    def encode(self, source):
        """Encode the source batch using a bidirectional GRU encoder.

        Args:
            source: An integer tensor with shape (max_src_sequence_length,
                batch_size) containing subword indices for the source sentences.

        Returns:
            A tuple with three elements:
                encoder_output: The output hidden representation of the encoder 
                    with shape (max_src_sequence_length, batch_size, hidden_size).
                    Can be obtained by adding the hidden representations of both 
                    directions of the encoder bidirectional GRU. 
                encoder_mask: A boolean tensor with shape (max_src_sequence_length,
                    batch_size) indicating which encoder outputs correspond to padding
                    tokens. Its elements should be True at positions corresponding to
                    padding tokens and False elsewhere.
                encoder_hidden: The final hidden states of the bidirectional GRU 
                    (after a suitable projection) that will be used to initialize 
                    the decoder. This should be a tensor h_n with shape 
                    (num_layers, batch_size, hidden_size). Note that the hidden 
                    state returned by the bi-GRU cannot be used directly. Its 
                    initial dimension is twice the required size because it 
                    contains state from two directions.

        The first two return values are not required for the baseline model and will
        only be used later in the attention model. If desired, they can be replaced
        with None for the initial implementation.
        """

        # Implementation tip: consider using packed sequences to more easily work
        # with the variable-length sequences represented by the source tensor.
        # See https://pytorch.org/docs/stable/nn.html#torch.nn.utils.rnn.PackedSequence.

        # https://stackoverflow.com/questions/51030782/why-do-we-pack-the-sequences-in-pytorch

        # Implementation tip: there are many simple ways to combine the forward
        # and backward portions of the final hidden state, e.g. addition, averaging,
        # or a linear transformation of the appropriate size. Any of these
        # should let you reach the required performance.

        # Compute a tensor containing the length of each source sequence.
        source_lengths = torch.sum(source != pad_id, axis=0).cpu()

        # YOUR CODE HERE
        ...
        max_src_sequence_length = source.shape[0]
        batch_size = source.shape[1]
        #print(max_src_sequence_length)
        #print(batch_size)
        encoder_mask = source == pad_id
        embeddings = self.embedding_layer(source)
       # print(self.encode_layer(embeddings))
        encoder_output, h_n= self.encode_layer(embeddings)
        encoder_output = encoder_output.view( max_src_sequence_length, batch_size, 2, self.hidden_dim)
        encoder_output = torch.mean(encoder_output, dim=2)

        h_n = h_n.view(self.num_layers,2, batch_size, self.hidden_dim)
        h_n = torch.mean(h_n, dim=1)

        # print(encoder_output.size())
        # print(h_n.size())

        return encoder_output, encoder_mask, h_n 
    

    def decode(self, decoder_input, last_hidden, encoder_output, encoder_mask):
        """Run the decoder GRU for one decoding step from the last hidden state.

        The third and fourth arguments are not used in the baseline model, but are
        included for compatibility with the attention model in the next section.

        Args:
            decoder_input: An integer tensor with shape (1, batch_size) containing 
                the subword indices for the current decoder input.
            last_hidden: A pair of tensors h_{t-1} representing the last hidden
                state of the decoder, each with shape (num_layers, batch_size,
                hidden_size). For the first decoding step the last_hidden will be 
                encoder's final hidden representation.
            encoder_output: The output of the encoder with shape
                (max_src_sequence_length, batch_size, hidden_size).
            encoder_mask: The output mask from the encoder with shape
                (max_src_sequence_length, batch_size). Encoder outputs at positions
                with a True value correspond to padding tokens and should be ignored.

        Returns:
            A tuple with three elements:
                logits: A tensor with shape (batch_size,
                    vocab_size) containing unnormalized scores for the next-word
                    predictions at each position.
                decoder_hidden: tensor h_n with the same shape as last_hidden 
                    representing the updated decoder state after processing the 
                    decoder input.
                attention_weights: This will be implemented later in the attention
                    model, but in order to maintain compatible type signatures, we also
                    include it here. This can be None or any other placeholder value.
        """
        max_src_sequence_length= encoder_output.shape[0]
        batch_size =encoder_output.shape[1]
        hidden_size =encoder_output.shape[2]

        # These arguments are not used in the baseline model.
        del encoder_output
        del encoder_mask

        # YOUR CODE HERE
        ...
        attention_weights = None
       # print("decoder input")
       # print(decoder_input.shape)
        embeddings = self.embedding_layer(decoder_input)
        output, decoder_hidden = self.decode_layer(embeddings, last_hidden)
        # print(output.size())
        logits = self.linear_layer(output)
       # print('logits size')
       # print(logits.size())
        # print(decoder_hidden.size())

        return logits, decoder_hidden, attention_weights

        
    def compute_loss(self, source, target):
        """Run the model on the source and compute the loss on the target.

        Args:
            source: An integer tensor with shape (max_source_sequence_length,
                batch_size) containing subword indices for the source sentences.
            target: An integer tensor with shape (max_target_sequence_length,
                batch_size) containing subword indices for the target sentences.

        Returns:
            A scalar float tensor representing cross-entropy loss on the current batch
            divided by the number of target tokens in the batch.
            Many of the target tokens will be pad tokens. You should mask the loss 
            from these tokens using appropriate mask on the target tokens loss.
        """

        # Implementation tip: don't feed the target tensor directly to the decoder.
        # To see why, note that for a target sequence like <s> A B C </s>, you would
        # want to run the decoder on the prefix <s> A B C and have it predict the
        # suffix A B C </s>.

        # You may run self.encode() on the source only once and decode the target 
        # one step at a time.

        # YOUR CODE HERE
        ...
        encoder_output, encoder_mask, encoder_hidden = self.encode(source)
        logits, deocoder_hidden, attention = self.decode(target[0:-1, :], encoder_hidden, encoder_output, encoder_mask)
        target_labels = target[1:, :].view(-1)
        logits = logits.view(-1, self.num_words)
        return F.cross_entropy(logits, target_labels, reduction='mean')

We provide a training loop for training the model. You are welcome to modify the training loop by adjusting the learning rate or changing optmization settings.

**Important:** During our testing we found that training the encoder and decoder with different learning rates is crucial for getting good performance over the small dialog corpus. Specifically, the decoder parameter learning rate should be 5 times the encoder parameter learning rate. Hence, add the encoder parameter variable names in the `encoder_parameter_names` as a list. For example, if encoder is using `self.embedding_layer` and `self.encoder_gru` layer then the `encoder_parameter_names` should be `['embedding_layer', 'encoder_gru']` 

In [None]:
def train(model, data_loader, num_epochs, model_file, learning_rate=0.0001):
    """Train the model for given number of epochs and save the trained model in 
    the final model_file.
    """

    decoder_learning_ratio = 5.0
    
    encoder_parameter_names = ['embedding_layer', 'encoder_gru']
                               
    encoder_named_params = list(filter(lambda kv: any( key in kv[0] for key in encoder_parameter_names), model.named_parameters()))
    decoder_named_params = list(filter(lambda kv: not any( key in kv[0] for key in encoder_parameter_names), model.named_parameters()))
    encoder_params = [e[1] for e in encoder_named_params]
    decoder_params = [e[1] for e in decoder_named_params]
    optimizer = torch.optim.AdamW([{'params': encoder_params},
                {'params': decoder_params, 'lr': learning_rate * decoder_learning_ratio}], lr=learning_rate)
    
    clip = 50.0
    for epoch in tqdm.notebook.trange(num_epochs, desc="training", unit="epoch"):
        # print(f"Total training instances = {len(train_dataset)}")
        # print(f"train_data_loader = {len(train_data_loader)} {1180 > len(train_data_loader)/20}")
        with tqdm.notebook.tqdm(
                data_loader,
                desc="epoch {}".format(epoch + 1),
                unit="batch",
                total=len(data_loader)) as batch_iterator:
            model.train()
            total_loss = 0.0
            for i, batch_data in enumerate(batch_iterator, start=1):
                source, target = batch_data["conv_tensors"]
                optimizer.zero_grad()
                loss = model.compute_loss(source, target)
                total_loss += loss.item()
                loss.backward()
                # Gradient clipping before taking the step
                _ = nn.utils.clip_grad_norm_(model.parameters(), clip)
                optimizer.step()

                batch_iterator.set_postfix(mean_loss=total_loss / i, current_loss=loss.item())
    # Save the model after training         
    torch.save(model.state_dict(), model_file)

We can now train the baseline model.

A correct implementation should get a average train loss of < 3.00  
The code will automatically save and download the model at the end of training.

In [None]:
# You are welcome to adjust these parameters based on your model implementation.

#num 10, batch 32, learning_rate=0.00007, standard other hyperparams works ok, 


num_epochs = 10
batch_size = 32
# Reloading the data_loader to increase batch_size
data_loader = DataLoader(dataset=dataset, batch_size=batch_size, 
                               shuffle=True, collate_fn=collate_fn)

baseline_model = Seq2seqBaseline(vocab).to(device)
train(baseline_model, data_loader, num_epochs, "baseline_model.pt" )#, learning_rate=0.00007)
# Download the trained model to local for future use
files.download('baseline_model.pt')

HBox(children=(FloatProgress(value=0.0, description='training', max=10.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, description='epoch 1', max=1659.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='epoch 2', max=1659.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='epoch 3', max=1659.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='epoch 4', max=1659.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='epoch 5', max=1659.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='epoch 6', max=1659.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='epoch 7', max=1659.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='epoch 8', max=1659.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='epoch 9', max=1659.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='epoch 10', max=1659.0, style=ProgressStyle(description_wi…





<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Reload the model from the model file. 
# Useful when you have already trained and saved the model
baseline_model = Seq2seqBaseline(vocab).to(device)
baseline_model.load_state_dict(torch.load("baseline_model.pt", map_location=device))

<All keys matched successfully>

## Greedy Search (10 points)

For evaluation, we also need to be able to generate entire strings from the model. We'll first define a greedy inference procedure here. Later on, we'll implement beam search.


In [None]:
def predict_greedy(model, sentence, max_length=100):
    """Make predictions for the given input using greedy inference.
    
    Args:
        model: A sequence-to-sequence model.
        sentence: A input string.
        max_length: The maximum length at which to truncate outputs in order to
            avoid non-terminating inference.
    
    Returns:
        Model's predicted greedy response for the input, represented as string.
    """

    # You should make only one call to model.encode() at the start of the function, 
    # and make only one call to model.decode() per inference step.
    model.eval()
   # print(sentence)

    model.cpu()
    encoded_sentence = model.vocab.get_ids_from_sentence(sentence)
    #print(encoded_sentence)
    sentence_tensor = torch.Tensor(encoded_sentence)
    sentence_tensor.cpu()
    #sentence_tensor = torch.reshape(sentence_tensor, (1, len(encoded_sentence))).int()
    sentence_tensor = sentence_tensor.unsqueeze(1)
    #print(sentence_tensor.shape)
    encoder_output, encoder_mask,  last_hidden = model.encode(sentence_tensor.int())
    decode_input = sentence_tensor.new_ones(1,1).int() * bos_id
    decode_hidden = last_hidden
    response = []
    for i in range(max_length):
      logits, hidden, attention_weights = model.decode(decode_input, decode_hidden,encoder_output, encoder_mask)
      logits = logits.view(-1, model.num_words)
      logits = logits.squeeze(0)
      #print(logits)
      best_word_encoded = logits.argmax().item()
      #vals, idx = torch.max(logits,0)
      #best_word_encoded = vals.argmax().item()
      #print(best_word_encoded)
      decode_input = sentence_tensor.new_ones(1, 1).int() * best_word_encoded
      response.append(best_word_encoded)
      decode_hidden = hidden
      if best_word_encoded == eos_id:
        pass
    decoded_response = model.vocab.decode_sentence_from_ids(response)
    return decoded_response

Let's chat interactively with our trained baseline Seq2Seq dialog model and save the generated conversations for submission (please make sure to keep the conversations in your submission ["PG-13"](https://en.wikipedia.org/wiki/Motion_Picture_Association_film_rating_system)). We will reuse the conversational inputs while testing Seq2Seq + Attention model.

Note: enter "q" or "quit" to end the interactive chat

In [None]:
def chat_with_model(model, mode="greedy"):
    if mode == "beam":
        predict_f = predict_beam
    else:
        predict_f = predict_greedy
    chat_log = list()
    input_sentence = ''
    while(1):
        # Get input sentence
        input_sentence = input('Input > ')
        # Check if it is quit case
        if input_sentence == 'q' or input_sentence == 'quit': break
        
        generation = predict_f(model, input_sentence)
        if mode == "beam":
            generation = generation[0]
        print('Greedy Response:', generation)
        print()
        chat_log.append((input_sentence, generation))
    return chat_log

In [None]:
baseline_chat = chat_with_model(baseline_model)

Input > hello.
Greedy Response: hello .

Input > How are you?
Greedy Response: no .

Input > who are you?
Greedy Response: i m a lawyer .

Input > what is your name?
Greedy Response: my name is .

Input > where do you eat?
Greedy Response: in the garage .

Input > Which way should I go?
Greedy Response: oh yes .

Input > What sport do you like?
Greedy Response: no .

Input > what time is it?
Greedy Response: eight thirty .

Input > ok goodbye.
Greedy Response: it s okay .

Input > goodbye
Greedy Response: i ll be there .

Input > q


\## Seq2Seq + Attention Model (15 points)

Next, we extend the baseline model to include an attention mechanism in the decoder. With attention mechanism, the model doesn't need to encode the input into a fixed dimensional hidden representation. Rather, it creates a new context vector for each turn that is a weighted sum of encoder hidden representation. 

Your implementation can use any attention mechanism to get weight distribution over the source words. One simple way to include attention in decoder goes as follows (reminder: the decoder processed one token at a time),
1. Process the current decoder_input through embedding layer and decoder GRU layer.
2. Use the current decoder token representation, $d$ of shape $(1 * b * h)$ and encoder representation, $e_1, \dots, e_n$ or shape $(n * b * h)$, where $n$ is max_src_length after padding) to compute attention score matrix of shape $(b * n)$. There are multiple options to compute this score matrix. A few of such options are available in [the table provided in this blog](https://lilianweng.github.io/lil-log/2018/06/24/attention-attention.html#a-family-of-attention-mechanisms)
3. Normalize the attention scores $(b * n)$ so that they sum up to $1.0$ by taking a `softmax` over the second dimention. 

After computing the normalized attention distribution, take a weighted sum of the encoder outputs to obtain the attention context $c = \sum_i w_i e_i$, and add this to the decoder output $d$ to obtain the final representation to be passed to the vocabulary projection layer (you may need another linear layer to make the sizes match before adding $c$ and $d$).

In [None]:
class Seq2seqAttention(Seq2seqBaseline):
    def __init__(self, vocab):
        super().__init__(vocab)

        # Initialize any additional parameters needed for this model that are not
        # already included in the baseline model.
        
        # YOUR CODE HERE
        self.attention_1 = nn.Linear(self.hidden_dim, self.hidden_dim, bias=True)

        self.attention_2 = nn.Linear(self.hidden_dim, self.hidden_dim, bias=True)

      

    def decode(self, decoder_input, last_hidden, encoder_output, encoder_mask):
        """Run the decoder GRU for one decoding step from the last hidden state.

        The third and fourth arguments are not used in the baseline model, but are
        included for compatibility with the attention model in the next section.

        Args:
            decoder_input: An integer tensor with shape (1, batch_size) containing 
                the subword indices for the current decoder input.
            last_hidden: A pair of tensors h_{t-1} representing the last hidden
                state of the decoder, each with shape (num_layers, batch_size,
                hidden_size). For the first decoding step the last_hidden will be 
                encoder's final hidden representation.
            encoder_output: The output of the encoder with shape
                (max_src_sequence_length, batch_size, hidden_size).
            encoder_mask: The output mask from the encoder with shape
                (max_src_sequence_length, batch_size). Encoder outputs at positions
                with a True value correspond to padding tokens and should be ignored.

        Returns:
            A tuple with three elements:
                logits: A tensor with shape (batch_size,
                    vocab_size) containing unnormalized scores for the next-word
                    predictions at each position.
                decoder_hidden: tensor h_n with the same shape as last_hidden 
                    representing the updated decoder state after processing the 
                    decoder input.
                attention_weights: A tensor with shape (batch_size, 
                    max_src_sequence_length) representing the normalized
                    attention weights. This should sum to 1 along the last dimension.
        """

        # YOUR CODE HERE
        ...
        max_src_sequence_length= encoder_output.shape[0]
        batch_size =encoder_output.shape[1]
        hidden_size =encoder_output.shape[2]
        decoder_shape = decoder_input.shape[0]
        # print("batch size and hidden size")
        # print(batch_size)
        # print(hidden_size)
        # print('encoder output size')
        # print(encoder_output.shape)
        embeddings = self.embedding_layer(decoder_input)
        decoder_output, decoder_hidden = self.decode_layer(embeddings, last_hidden)
        curr_decoder_token = self.attention_1(decoder_output).view(decoder_shape, 1, batch_size, self.hidden_dim)
        encoder_representation = encoder_output.view(1, max_src_sequence_length, batch_size, self.hidden_dim)
        # print("decoder_token shape")
        # print(curr_decoder_token.shape)
        e = self.attention_2(encoder_output).view(1, max_src_sequence_length, batch_size, self.hidden_dim)
        # print('encoder representation shape')
        # print(encoder_representation.shape)
        product = curr_decoder_token * encoder_representation
        attention_weights = torch.sum(product, dim=3)
        # print('attention weights shape')
        # print(attention_weights.shape)

        softmax = torch.nn.Softmax(dim=1)
        norm_attention_weights = softmax(attention_weights)


        attention_context = torch.sum(norm_attention_weights.view(decoder_shape, max_src_sequence_length, batch_size, 1) * e,dim=1)


        attention_weights = torch.transpose(attention_weights, 1, 2)
    
        logits = self.linear_layer(decoder_output + attention_context)

 
        return logits, decoder_hidden, attention_weights


We can now train the attention model.

A correct implementation should also get an average train loss of < 3.00  
The code will automatically save and download the model at the end of training.

It may happen that the baseline model achieves lower loss than attention model. This is because our dataset is very small and the attention model may be over parameterized for our toy dataset. Regardless, we would consider this as acceptable submission if the attention model generated responses look comparable to the baseline model.

In [None]:
# You are welcome to adjust these parameters based on your model implementation.
num_epochs = 8
batch_size = 64

data_loader = DataLoader(dataset=dataset, batch_size=batch_size, 
                               shuffle=True, collate_fn=collate_fn)

attention_model = Seq2seqAttention(vocab).to(device)
train(attention_model, data_loader, num_epochs, "attention_model.pt")
# Download the trained model to local for future use
files.download('attention_model.pt')

HBox(children=(FloatProgress(value=0.0, description='training', max=8.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, description='epoch 1', max=830.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='epoch 2', max=830.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='epoch 3', max=830.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='epoch 4', max=830.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='epoch 5', max=830.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='epoch 6', max=830.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='epoch 7', max=830.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='epoch 8', max=830.0, style=ProgressStyle(description_widt…





<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Reload the model from the model file. 
# Useful when you have already trained and saved the model
attention_model = Seq2seqAttention(vocab).to(device)
attention_model.load_state_dict(torch.load("attention_model.pt", map_location=device))

<All keys matched successfully>

Let's test the attention model on the same inputs as baseline model.

In [None]:
def test_conversations_with_model(model, conversational_inputs = None, include_beam = False):
    # Some predefined conversational inputs. 
    # You may append more inputs at the end of the list, if you want to.
    basic_conversational_inputs = [
                                    "hello.",
                                    "please share you bank account number with me",
                                    "i have never met someone more annoying that you",
                                    "i like pizza. what do you like?",
                                    "give me coffee, or i'll hate you",
                                    "i'm so bored. give some suggestions",
                                    "stop running or you'll fall hard",
                                    "what is your favorite sport?",
                                    "do you believe in a miracle?",
                                    "which sport team do you like?"
    ]
    if not conversational_inputs:
        conversational_inputs = basic_conversational_inputs
    for input in conversational_inputs:
        print(f"Input > {input}")
        generation = predict_greedy(model, input)
        print('Greedy Response:', generation)
        if include_beam:
            # Also print the beam search responses from models
            generations = predict_beam(model, input)
            print('Beam Responses:')
            print_list(generations)
        print()

In [None]:
baseline_chat_inputs = [inp for inp, gen in baseline_chat]
attention_chat = test_conversations_with_model(attention_model, baseline_chat_inputs)

Input > hello.
Greedy Response: hello .

Input > How are you?
Greedy Response: i m fine .

Input > who are you?
Greedy Response: i m martha to be my father .

Input > what is your name?
Greedy Response: i m pulling you re my wife .

Input > where do you eat?
Greedy Response: i don t know .

Input > Which way should I go?
Greedy Response: i m not hungry yet .

Input > What sport do you like?
Greedy Response: i m sorry i m guilty .

Input > what time is it?
Greedy Response: eight thirty .

Input > ok goodbye.
Greedy Response: you ll never do this .

Input > goodbye
Greedy Response: we ll see .



## Beam Search (20 points)

Similar to greedy search, beam search generates one token at a time. However, rather than keeping only the single best hypothesis, we instead keep the top $k$ candidates at each time step. This is accomplished by computing the set of next-token extensions for each item on the beam and finding the top $k$ across all candidates according to total log-probability.

Candidates that are finished should be extracted in a final list of `generations` and removed from the beam. This strategy is useful for doing re-ranking the beam candidates using alternate scorers (example, Maximum Mutual Information Objective from [Li et. al. 2015](https://arxiv.org/pdf/1510.03055.pdf)). For this assignment, you will re-rank the beam generations as follows,  
$final\_score_i = \frac{score_i}{|generation_i|^\alpha}$, where $\alpha \in [0.5, 2]$.  
Terminate the search process once you have $k$ items in the `generations` list.

In [226]:
def predict_beam(model, sentence, k=5, max_length=100):
    """Make predictions for the given inputs using beam search.
    
    Args:
        model: A sequence-to-sequence model.
        sentence: An input sentence, represented as string.
        k: The size of the beam.
        max_length: The maximum length at which to truncate outputs in order to
            avoid non-terminating inference.
    
    Returns:
        A list of k beam predictions. Each element in the list should be a string
        corresponding to one of the top k predictions for the corresponding input,
        sorted in descending order by its final score.
    """

    # Implementation tip: once an eos_token has been generated for any beam, 
    # remove its subsequent predictions from that beam by adding a small negative 
    # number like -1e9 to the appropriate logits. This will ensure that the 
    # candidates are removed from the beam, as its probability will be very close
    # to 0. Using this method, uou will be able to reuse the beam of an already 
    # finished candidate

    # Implementation tip: while you are encouraged to keep your tensor dimensions
    # constant for simplicity (aside from the sequence length), some special care
    # will need to be taken on the first iteration to ensure that your beam
    # doesn't fill up with k identical copies of the same candidate.
    
    # You are welcome to tweak alpha
    alpha = 0.7
    model.eval()
    
    # YOUR CODE HERE
    ...
    model.cpu()

    # print("sentence and k")
    # print(sentence)
    # print(k)
    encoded_sentence = model.vocab.get_ids_from_sentence(sentence)
    #print(encoded_sentence)
    sentence_tensor = torch.Tensor(encoded_sentence)
    sentence_tensor.cpu()
    #sentence_tensor = torch.reshape(sentence_tensor, (1, len(encoded_sentence))).int()
    sentence_tensor = sentence_tensor.unsqueeze(1)
    #print(sentence_tensor.shape)
    encoder_output, encoder_mask,  last_hidden = model.encode(sentence_tensor.int())
    decode_input = sentence_tensor.new_ones(1,1).int() * bos_id
    decode_hidden = last_hidden
    generations = []
    decode_input_list = []



    logits, hidden, attention_weights = model.decode(decode_input, decode_hidden,encoder_output, encoder_mask)
    logits = logits.view(-1, model.num_words)
    logits = logits.squeeze(0)

    log_probs = F.log_softmax(logits)
   # print(log_probs)
    values, indecies = torch.topk(log_probs, k)
   # print(values)
   # print(indecies)
    topk_list = []

    for j in range(k):
      topk_list.append(([indecies[j]], values[j], hidden))
    decode_input_list = topk_list 
    # print('initial decode input list')
    # print(decode_input_list)

    for i in range(max_length):
    #  print(i)
    #  print(decode_input_list)
      topk_list = []
      for tupl in decode_input_list:
        prev_id_list, prev_value, decode_hidden = tupl

        decode_input = prev_id_list[-1]
    #    print("decode input int")
    #    print(decode_input.item())
        decode_input = sentence_tensor.new_ones(1,1).int() * decode_input.item()

        logits, hidden, attention_weights = model.decode(decode_input, decode_hidden,encoder_output, encoder_mask)
        logits = logits.view(-1, model.num_words)
        logits = logits.squeeze(0)
        log_probs = F.log_softmax(logits)
        values, indecies = torch.topk(log_probs, k)
        #print(values)
        #print(indecies)

        for j in range(len(values)):
          prev_final_idx = prev_id_list[-1]
          if prev_final_idx != indecies[j]:
            #print('current value')
            #print(prev_value + values[j])
            topk_list.append((prev_id_list + [indecies[j]],prev_value+ values[j], hidden))

      #overall top k

      # print('all current sentences ')
      # print(topk_list)
      #remove duplicates
      # sorted_topk = sorted(topk_list, reverse=True, key = lambda x: x[1])
      # seen = []
      # decode_input_list = [(a, b) for a, b in sorted_topk if not (a in seen or seen.append(a))]
      # decode_input_list = decode_input_list[:k]
      # print("sorted sentences list and length")
      # print(sorted(topk_list, reverse=True, key = lambda x: x[1]))
      # print(len(topk_list))
      sorted_topk= sorted(topk_list, reverse=True, key = lambda x: x[1])[:k]
      # print('sorted list only top k')
      # print(sorted_topk)
      # print(len(sorted_topk))
      # print("check sentences so far")
      # for lst in sorted_topk:
      #   list_of_ids1 = [id.item() for id in lst[0]]
      #   print(model.vocab.decode_sentence_from_ids(list_of_ids1))
      # # print('overall top k')
      # # print(decode_input_list)

      final_decode_input = []
      for i in range(len(sorted_topk)):

        sentence_ids = sorted_topk[i][0]
        score = sorted_topk[i][1]
        final_sentence_id = sentence_ids[-1]
        prev_hidden = sorted_topk[i][2]
        # print(sentence_ids)
        if final_sentence_id == eos_id:
          generations.append((sentence_ids, score))
          k = k-1
        else:
          final_decode_input.append((sentence_ids, score, prev_hidden))
        # if k == 0:
        #   pass


      decode_input_list = final_decode_input

      # print("current generations")
      # print(generations)
      # print('current decode-input-list')
      # print(decode_input_list)
    # if k >0:
    #   for i in range(k):
    #     generations.append(decode_input_list[i])

 
    # print('generations and size of k')
    # print(generations)
    # print(k)
    final_generations_and_list =[]
    for g in generations:
      list_of_ids, unnormed_score = g

      gen_length = len(list_of_ids)

      final_score = unnormed_score/ (gen_length**alpha)
     # print(final_score)
     # print(list_of_ids)
      list_of_ids = [id.item() for id in list_of_ids]
      #print(list_of_ids)
      final_sentence = model.vocab.decode_sentence_from_ids(list_of_ids) #" ".join(model.vocab.decode_sentence_from_ids(list_of_ids))
     # print(final_sentence)
      final_generations_and_list.append((final_sentence, final_score))
    # print("final generations after sorting")
    #print(final_generations_and_list)
    sorted_generations = sorted(final_generations_and_list, reverse=True, key = lambda x: x[1])
  
    # print(sorted_generations)
    return [i[0] for i in sorted_generations]


Now let's test both baseline and attention models on some predefined inputs and compare their greedy and beam responses side by side.

In [227]:
test_conversations_with_model(baseline_model, include_beam=True)

Input > hello.
Greedy Response: hello .
Beam Responses:
hello .
hello mrs . brigman .
hello ?
i thought you d do .
i thought you d be happy .


Input > please share you bank account number with me




Greedy Response: i m going home .
Beam Responses:
i don t .
i can t .
i don t think so .
i m not .
i don t want to talk .


Input > i have never met someone more annoying that you
Greedy Response: me !
Beam Responses:
me ?
me !
what ?
me neither !
me too !


Input > i like pizza. what do you like?
Greedy Response: i don t know .
Beam Responses:
i don t know .
i d like to talk to him .
i d like to talk to you .
i don t remember .
i don t know . india .


Input > give me coffee, or i'll hate you
Greedy Response: you re not gonna hurt me .
Beam Responses:
you can t !
you can t .
sacha !
sacha i m not alone .
sacha i m gonna kill you !


Input > i'm so bored. give some suggestions
Greedy Response: what is it ?
Beam Responses:
what is it ?
yes .
yeah .
yes you are .
what s that ?


Input > stop running or you'll fall hard
Greedy Response: what ?
Beam Responses:
what do you mean ?
what ?
that s what i m saying .
what s that ?
i don t know .


Input > what is your favorite sport?
Greedy Respo

In [213]:
test_conversations_with_model(attention_model, include_beam=True)

Input > hello.
Greedy Response: hello .




Beam Responses:
who is this ?
hello .
who is it ?
who s this ?
who ?


Input > please share you bank account number with me
Greedy Response: you re welcome .
Beam Responses:
what ?
you re welcome .
i m sorry .
you re not .
i m sorry . i m okay .


Input > i have never met someone more annoying that you
Greedy Response: no .
Beam Responses:
what do you mean ?
no you re not .
no .
oh god .
oh no .


Input > i like pizza. what do you like?
Greedy Response: i don t know .
Beam Responses:
what do you mean ?
i don t know .
i don t know how much i am .
i don t know how much do you ?
i don t know how much i m .


Input > give me coffee, or i'll hate you
Greedy Response: no you re not .
Beam Responses:
no you re not .
no i m not .
no .
no you re not !
no i m not talking about you .


Input > i'm so bored. give some suggestions
Greedy Response: oh god i m gonna go
Beam Responses:
what do you mean ?
what do you want to do ?
what do you want ?
what do you do ?
what do you want for free ?


Input >

## Automatic Evaluation (5 points)

Automatic evaluation of chatbots is an active research area. For this assignment we are going to use 3 very simple evaluation metrics.
1. Average Length of the Responses
2. Distinct1 = proportion of unique unigrams / total unigrams
3. Distinct2 = proportion of unique bigrams / total bigrams  
You will evaluate your baseline and attention models by running the cells below.

In [240]:
# Evaluate diversity of the models
def evaluate_diversity(model, mode="greedy"):
    """Evaluates the model's greedy or beam responses on eval_conversations
    
    Args:
        model: A sequence-to-sequence model.
        mode: "greedy" or "beam"
    
    Returns: avg_length, distinct1, distinct2
        avg_length: average length of the model responses
        distinct1: proportion of unique unigrams / total unigrams
        distinct2: proportion of unique bigrams / total bigrams
    """
    if mode == "beam":
        predict_f = predict_beam
    else:
        predict_f = predict_greedy
    generations = list()
    for src, tgt in eval_conversations:
        generation = predict_f(model, src)
        if mode == "beam":
            generation = generation[0]
        generations.append(generation)
    # Calculate average length, distinct unigrams and bigrams from generations
    
    # YOUR CODE HERE
    unique_unigrams = 0
    unique_bigrams = 0
    total_unigrams =0
    total_bigrams =0 
    total_len = 0 
    total_generations = len(generations)
    bigrams = []
    unigrams = []
    for text in generations:

      text_as_list = text.split()
      unigrams.extend(text_as_list)
      bigrams_i = [(text_as_list[i-1], text_as_list[i]) for i in range(1, len(text_as_list))]
      bigrams.extend(bigrams_i)
      total_len += len(text_as_list)


    total_unigrams = len(unigrams)
    # print(unigrams)
    unigram_set = set(unigrams)
    # print(unigram_set)


    total_bigrams = len(bigrams)
    # print(bigrams)
    # print(len(bigrams))
    # print(set(bigrams))
    # print(len(set(bigrams)))
    unique_bigrams = len(set(bigrams))
    unique_unigrams = len(set(unigrams))
      

    return total_len/total_generations, unique_unigrams/total_unigrams, unique_bigrams/total_bigrams






    

In [241]:
print(f"Baseline Model evaluation:")
avg_length, distinct1, distinct2 = evaluate_diversity(baseline_model)
print(f"Greedy decoding:")
print(f"Avg Response Length = {avg_length}")
print(f"Distinct1 = {distinct1}")
print(f"Distinct2 = {distinct2}")
avg_length, distinct1, distinct2 = evaluate_diversity(baseline_model, mode="beam")
print(f"Beam decoding:")
print(f"Avg Response Length = {avg_length}")
print(f"Distinct1 = {distinct1}")
print(f"Distinct2 = {distinct2}")

Baseline Model evaluation:
Greedy decoding:
Avg Response Length = 4.18
Distinct1 = 0.23684210526315788
Distinct2 = 0.550314465408805




Beam decoding:
Avg Response Length = 4.06
Distinct1 = 0.1921182266009852
Distinct2 = 0.4117647058823529


In [242]:
print(f"Attention Model evaluation:")
avg_length, distinct1, distinct2 = evaluate_diversity(attention_model)
print(f"Greedy decoding:")
print(f"Avg Response Length = {avg_length}")
print(f"Distinct1 = {distinct1}")
print(f"Distinct2 = {distinct2}")
avg_length, distinct1, distinct2 = evaluate_diversity(attention_model, mode="beam")
print(f"Beam decoding:")
print(f"Avg Response Length = {avg_length}")
print(f"Distinct1 = {distinct1}")
print(f"Distinct2 = {distinct2}")

Attention Model evaluation:
Greedy decoding:
Avg Response Length = 4.76
Distinct1 = 0.21008403361344538
Distinct2 = 0.4787234042553192




Beam decoding:
Avg Response Length = 4.09
Distinct1 = 0.19315403422982885
Distinct2 = 0.42394822006472493


## What to turn in?

When you are done, make sure to run all the cells in your solution (including your conversation with the chatbot), and submit your notebook `Assignment 3 Final Release Version.ipynb` to Gradescope.
