# Language Models: Auto-Complete

In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
# Import required libraries
import math
import numpy as np
import pandas as pd
import nltk
from utils import (split_to_sentences, tokenize_sentences,
                   train_eval_test_split, create_vocab, replace_oov_words_by_unk)

### Load Corpus

We will use twitter data.

In [3]:
with open('./en_US.twitter.txt', 'r', encoding='utf8') as f:
    data = f.read()

print('Some letters from the test corpus:\n', data[:300])
print()
print('Unformatted lettets from the text corpus:')
data[:300]


Some letters from the test corpus:
 How are you? Btw thanks for the RT. You gonna be in DC anytime soon? Love to see you. Been way, way too long.
When you meet someone special... you'll know. Your heart will beat more rapidly and you'll smile for no reason.
they've decided its more fun if I don't.
So Tired D; Played Lazer Tag & Ran A 

Unformatted lettets from the text corpus:


"How are you? Btw thanks for the RT. You gonna be in DC anytime soon? Love to see you. Been way, way too long.\nWhen you meet someone special... you'll know. Your heart will beat more rapidly and you'll smile for no reason.\nthey've decided its more fun if I don't.\nSo Tired D; Played Lazer Tag & Ran A "

### Pre-process the data

Preprocess data with the following steps:

1. Split data into sentences using "\n" as the delimiter.
2. Split each sentence into tokens. Note that in this assignment we use "token" and "words" interchangeably.
3. Assign sentences into train or test sets.
4. Find tokens that appear at least N times in the training data.
5. Replace tokens that appear less than N times by `<unk>`

In [4]:
sentences = split_to_sentences(data)
print("First 5 sentences in the corpus:\n")
for i in range(5):
    print(f'{i+1}. {sentences[i]}')

First 5 sentences in the corpus:

1. How are you? Btw thanks for the RT. You gonna be in DC anytime soon? Love to see you. Been way, way too long.
2. When you meet someone special... you'll know. Your heart will beat more rapidly and you'll smile for no reason.
3. they've decided its more fun if I don't.
4. So Tired D; Played Lazer Tag & Ran A LOT D; Ughh Going To Sleep Like In 5 Minutes ;)
5. Words from a complete stranger! Made my birthday even better :)


In [5]:
tokenized_sentences = tokenize_sentences(sentences)
print("First 2 sentences in the corpus:\n")
for i in range(2):
    print(f'{i+1}. {tokenized_sentences[i]}\n')

First 2 sentences in the corpus:

1. ['how', 'are', 'you', '?', 'btw', 'thanks', 'for', 'the', 'rt', '.', 'you', 'gon', 'na', 'be', 'in', 'dc', 'anytime', 'soon', '?', 'love', 'to', 'see', 'you', '.', 'been', 'way', ',', 'way', 'too', 'long', '.']

2. ['when', 'you', 'meet', 'someone', 'special', '...', 'you', "'ll", 'know', '.', 'your', 'heart', 'will', 'beat', 'more', 'rapidly', 'and', 'you', "'ll", 'smile', 'for', 'no', 'reason', '.']



### Split tokenized corpus data into train and test sets

In [6]:
train_data, test_data, _ = train_eval_test_split(tokenized_sentences) 
print('Length of training data:', len(train_data))
print('Length of test data:', len(test_data))

Length of training data: 38368
Length of test data: 9592


### Create Vocabulary
Consider tokens that appear at least N (freq) times in the training data.


In [7]:

vocab = create_vocab(train_data)

# add end token - </s> and oov token - <unk> to the vocabulary
# start token - <s> is not needed since it should not appear as the next word
vocab = vocab + ["</s>", "<unk>"]

In [8]:
print('Length of vocabulary considering words occured at least 2times in the training set:', len(vocab))
print('Some words in the vocabulary:', vocab[:5])

Length of vocabulary considering words occured at least 2times in the training set: 14796
Some words in the vocabulary: ['freak', 'suspended', 'themselves', 'lifetime', 'advanced']


### Handling Out of Vocabulary Words 
The words that appear `freq` times or more are in the closed vocabulary. 
- All other words are regarded as `unknown`.
- Replace words not in the closed vocabulary with the token `<unk>`.
- Process accordingly for both train and test data.

In [9]:
processed_train_data = replace_oov_words_by_unk(train_data, vocab, unknown_token='<unk>')
processed_test_data = replace_oov_words_by_unk(test_data, vocab, unknown_token='<unk>')

In [10]:
print('Example of unknown characters in the train dataset:\n',processed_train_data[200])
print()
print('Example of unknown characters in the test dataset:\n',processed_test_data[210])

Example of unknown characters in the train dataset:
 ['i', 'think', 'everybody', 'in', 'their', 'right', 'mind', 'does', '.', 'haha', '#', '<unk>', '&', 'brooke', '<unk>', 'told', 'me', 'about', 'it', '.']

Example of unknown characters in the test dataset:
 ['or', '<unk>', 'up', "'", ':', 'p']


### Build N-gram Model

The conditional probability for the word at position 't' in the sentence, given that the words preceding it are $w_{t-1}, w_{t-2} \cdots w_{t-n+1}$ is:

$$ P(w_t | w_{t-n+1} \dots w_{t-1}) \tag{1}$$

We estimate this probability using,
$$ \hat{P}(w_t | w_{t-n+1} \dots w_{t-1}) = \frac{C(w_{t-n+1} \dots w_{t-1}, w_t)}{C(w_{t-n+1} \dots w_{t-1})} \tag{2} $$

Implementation:
1. Create a function to compute n-gram and (n-1)-gram counts: ${C(w_{t-n+1} \dots w_{t-1}, w_t)}$ and 
${C(w_{t-n+1} \dots w_{t-1})}$ respectively.
2. Using the above two counts we estimate the probability of a word.
3. To handle cases where both counts become zero, resulting in division by zero case, we introduce smoothing techniques.

### Generate n-gram and (n-1)-gram counts

In [12]:
def count_ngrams(dataset, N):
    """
    Returns a dictionary where the keys are the ngrams and the values are their 
    corresponding counts/occurences in the input dataset.

    Params:
    -----------
    dataset: list
        List of sentences tokenized to words.
    N: int
        The number of words to consider for ngrams. N = 2 for bigram, N = 3 for trigram and so on.
    
    Returns:
    -----------
    ngrams_count: dict
        A dictionary where the keys are the ngrams and the values are their 
        corresponding counts/occurences in the input dataset.
    """
    # Initialize a dictionary which stores the count of ngrams
    ngrams_count = {}
    for sent in dataset:
        # Prepend and append start and end tag respectively to the sentences as per ngram.
        sent_with_tag = ['<s>'] * (N-1) + sent + ['</s>']
        
        for i in range(len(sent_with_tag) - N + 1):
            ngram = tuple(sent_with_tag[i:i+N])
            ngrams_count[ngram] = ngrams_count.get(ngram, 0) + 1 
    return ngrams_count

In [13]:
# Capture n-gram counts upto 5grams.
unigram_count = count_ngrams(processed_train_data, N=1)
bigrams_count = count_ngrams(processed_train_data, N=2)
trigrams_count = count_ngrams(processed_train_data, N=3)
quadrigrams_count = count_ngrams(processed_train_data, N=4)
pentagrams_count = count_ngrams(processed_train_data, N=5)

In [14]:
print('First five entries in the unigram count dictionary: ')
print(list(unigram_count.items())[:5])
print()
print('First five entries in the bigram count dictionary: ')
print(list(bigrams_count.items())[:5])
print()
print('First five entries in the trigram count dictionary: ')
print(list(trigrams_count.items())[:5])

First five entries in the unigram count dictionary: 
[(('some',), 978), (('of',), 5850), (('the',), 15247), (('highlights',), 8), (('from',), 1365)]

First five entries in the bigram count dictionary: 
[(('<s>', 'some'), 59), (('some', 'of'), 63), (('of', 'the'), 923), (('the', 'highlights'), 2), (('highlights', 'from'), 1)]

First five entries in the trigram count dictionary: 
[(('<s>', '<s>', 'some'), 59), (('<s>', 'some', 'of'), 9), (('some', 'of', 'the'), 23), (('of', 'the', 'highlights'), 1), (('the', 'highlights', 'from'), 1)]


### Smoothing Techniques:
We will use below smoothing techniques to handle scenarios due to some count of ngrams going to zero.

### add-k smoothing
To handle zero counts while estimating probabilities of n-grams we use  add k-smoothing.
- K-smoothing adds a positive constant $k$ to each numerator and $k \times |V|$ in the denominator, where $|V|$ is the number of words in the vocabulary.

$$ \hat{P}(w_t | w_{t-n+1} \dots w_{t-1}) = \frac{C(w_{t-n+1} \dots w_{t-1}, w_t) + k}{C(w_{t-n+1} \dots w_{t-1}) + k|V|} $$


### Unigram Prior smoothing
$$ \hat{P}(w_t | w_{t-n+1} \dots w_{t-1}) = \frac{C(w_{t-n+1} \dots w_{t-1}, w_t) + mP(w_t)}{C(w_{t-n+1} \dots w_{t-1}) + m} $$

In [15]:
def get_ngram_word_probability(word,
                               n_minus_1_gram,
                               ngrams_count,
                               n_minus_1_grams_count,
                               unigram_count,
                               vocab=vocab, 
                               k=2e-6, 
                               m=0.5):
    """
    Estimates the ngram probability of the word given (n-1)-gram previous sequence of words.
    
    Params:
    -----------
    word: str
        The word for which the n-gram probability needs to be computed given n-1 grams.
    n_minus_1_gram: tuple
        Tuple of words used to model (n-1)-gram to predict the next most probable word.
    ngrams_count: dict
        A dictionary where the keys are the ngrams and the values are their 
        corresponding counts/occurences in the training dataset.
    n_minus_1_grams_count:  dict
        A dictionary where the keys are the n-1 grams and the values are their 
        corresponding counts/occurences in the training dataset.
    unigram_count: dict
        A dictionary where the keys are the unigrams and the values are their 
        corresponding counts/occurences in the training dataset.
    vocab: list
        The vocabulary (list of words) being used.
    k: float
        The add-k smoothing parameter.
    m: float
        The unigram prior smoothing parameter.
    
    Returns:
    -----------
    ngram_word_probability: float
        The ngram probabilities of all words in the vocabulary 
        computed using the previous (n-1) sequence of words.
    """
    V = len(vocab)
    
    ngram = n_minus_1_gram + (word,)
        
        # SMOOTHING TECHNIQUES.

        # NO SMOOTHING APPLIED:
        # if n_minus_1_grams_count.get(n_minus_1_gram, 0) > 0:
        #     ngram_probabilities[word] = ngrams_count.get(ngram, 0) / n_minus_1_grams_count.get(n_minus_1_gram, 0)
        # else:
        #     ngram_probabilities[word] = 0

        # ADD-K SMOOTHING:
        # ngram_probabilities[word] = (ngrams_count.get(ngram, 0) + k) / (n_minus_1_grams_count.get(n_minus_1_gram, 0) + k * V)

        # UNIGRAM PRIOR SMOOTHING:
    numerator = ngrams_count.get(ngram, 0) + m * (unigram_count.get(word, unigram_count[('<unk>',)]) / V)
    denominator = n_minus_1_grams_count.get(n_minus_1_gram, 0) + m
    ngram_word_probability = numerator / denominator

    return ngram_word_probability


### The Auto-complete Function



In [29]:
def auto_complete(previous_words, vocab=vocab):
    """
    Auto-complete a sequence of words by a word based on n-gram model.

    Params:
    ----------
    previous_words: str
        A sequence of words.
    vocab: list
        The vocabulary to use.

    Returns:
    ----------
    suggested_words: list
        List of tuples containing the most probable word to occur 
        following the given sequence of words and its probability.
    """
    # Preprocess input sequence of words into tokens and handle oov characters.
    tokenized_words = tokenize_sentences([previous_words])
    processed_words = replace_oov_words_by_unk(tokenized_words, vocab=vocab)

    # Store the n-grams to use in a list.
    ngrams_count_list = [unigram_count, bigrams_count, trigrams_count, quadrigrams_count, pentagrams_count]

    suggested_words = []
    for i in range(len(ngrams_count_list)-1):
        ngrams_count = ngrams_count_list[i+1]
        n_minus_1_grams_count = ngrams_count_list[i]

        # Get the value of n-gram being currently used.
        N = len(list(ngrams_count.keys())[0])

        # Generate (n-1)-gram tuple which will be used to predict 
        # the next most probable word from the vocabulary.
        n_minus_1_gram = tuple(processed_words[0][-N+1:])

        # For every word in the vocabulary get its n-gram probability 
        # using the previous (n-1)-gram word sequence and store in a dictionary.
        ngram_probabilities = {}
        for word in vocab:
            # Get the most probable word after the (n-1)-gram sequence of words.
            ngram_probabilities[word] = get_ngram_word_probability(word,
                                                                n_minus_1_gram=n_minus_1_gram,
                                                                ngrams_count=ngrams_count, 
                                                                n_minus_1_grams_count=n_minus_1_grams_count, 
                                                                unigram_count=unigram_count)

        next_word = sorted(ngram_probabilities.items(), key=lambda item: item[1], reverse=True)[0]
        suggested_words.append(next_word)
    return suggested_words


In [33]:
# Use trigram model to suggest next word.
# Here we find w such that, P(w | are very) is maximum.
auto_complete("today is a beautiful")

[('day', 0.14130675267935106),
 ('day', 0.42733057127129764),
 ('thing', 0.6789488559561552),
 ('freak', 1.4684374155177076)]

### Model Evaluation - Perplexity

Perplexity score on the test set comprising a list of sentences based on an n-gram model is given by: 

$$ PP(W) =\sqrt[m]{ \prod_{i=1}^m \prod_{j=1}^{|s_i|}  \frac{1}{P(w_j^i \ | \ w_{j -N+ 1}^i \cdots \ w_{j-1}^i)} }$$

- where $W$ is the set of $m$ sentences. $W = (s_1 s_2 \cdots s_m)$
- $s_i$ is the i-th sentence.
- $|s_i|$ is the length of the i-th sentence.
- $w_j^i$ is the j-th word of the i-th sentence.

**NOTE:**
If the list of sentences are concatenated to form a single list of words, then Perplexity is given by,
$$ PP(W) =\sqrt[m]{ \prod_{i=1}^m \frac{1}{P(w_i \ | \ w_{i -N+ 1} \cdots \ w_{i-1})} }$$

- where $W$ is the set of $m$ words. $W = (w_1 w_2 \cdots w_m)$

While concatenating, start tags `<s>` are not added in between sentences. `<s>` is added (n-1) times in the beginning.

The higher the probabilities are, the lower the perplexity will be. The more the n-grams tell us about the sentence, the lower the perplexity score will be. 

**NOTE:**
To prevent underflow, we use log formula,
$$ PP(W) = -\frac{1}{m} \sum_{i=1}^m \log_2 P(w_i \ | \ w_{i -N+ 1} \cdots \ w_{i-1}) $$

Implementation Strategy:<br>
<img src='./perplexity.png' width=1000px>

In [35]:
def calculate_perplexity(sentences, 
                         ngrams_count, 
                         n_minus_1_grams_count,
                         unigram_count):
    """
    Computes the perplexity of a n-gram model.
    
    Params:
    ----------
    sentences: list
        List of sentences containing list of word tokens.
    ngrams_count: dict
        A dictionary where the keys are the ngrams and the values are their 
        corresponding counts/occurences in the training dataset.
    n_minus_1_grams_count:  dict
        A dictionary where the keys are the n-1 grams and the values are their 
        corresponding counts/occurences in the training dataset.
    unigram_count: dict
        A dictionary where the keys are the unigrams and the values are their 
        corresponding counts/occurences in the training dataset.
    
    Returns:
    ----------
    log_perplexity: float
        The log perplexity of the n-gram model.
    
    """
    
    # Get the value of n-gram being currently used.
    N = len(list(ngrams_count.keys())[0])

    # Append start tags to the first sentence in the list.
    sentences[0] = ['<s>'] * (N-1) + sentences[0] 

    # Store all the concatenated sentences in a list.
    flat_sentences = []     
    for sent in sentences:
        flat_sentences = flat_sentences + sent

    # # Consider only the words in the dataset and no added tags (<s>).
    m = len(flat_sentences) - (N-1)
    sum_ = 0
    for i in range(N-1, m+N-1):
        word = flat_sentences[i]
        n_minus_1_gram = tuple(flat_sentences[i-N:i])
        sum_ +=  math.log(get_ngram_word_probability(word=word,
                                                    n_minus_1_gram=n_minus_1_gram,
                                                    ngrams_count=ngrams_count,
                                                    n_minus_1_grams_count=n_minus_1_grams_count,
                                                    unigram_count=unigram_count),2)
    log_perplexity = -1 / m * sum_
    return log_perplexity
        

In [None]:
log_perplexity =  calculate_perplexity(sentences=processed_test_data.copy(), 
                                       ngrams_count=ngrams_count, 
                                       n_minus_1_grams_count=n_minus_1_grams_count,
                                       unigram_count=unigram_count)

In [None]:
print(f'Log perplexity of the n-gram model is {log_perplexity:.4f}')
print(f'Perplexity of the n-gram model is {2 ** log_perplexity:.4f}')

Log perplexity of the n-gram model is -0.5543
Perplexity of the n-gram model is 0.6810
