In [15]:
import math
import random
import numpy as np
import pandas as pd
from collections import Counter
import nltk

## split and tokenize sentences

In [4]:
with open("../data/en_US.twitter.txt", "r") as f:
    data = f.read()

In [5]:
def split_to_sentences(data):
    sentences = data.split('\n')
    sentences = [s.strip() for s in sentences]
    sentences = [s for s in sentences if len(s) > 0]
    return sentences

In [6]:
def tokenize_sentences(sentences):
    tokenized_sentences = []
    for sentence in sentences:
        sentence = sentence.lower()
        tokenized = nltk.word_tokenize(sentence)
        tokenized_sentences.append(tokenized)
    return tokenized_sentences

In [7]:
def get_tokenized_data(data):
    sentences = split_to_sentences(data)
    tokenized_sentences = tokenize_sentences(sentences)
    return tokenized_sentences

In [8]:
tokenized_data = get_tokenized_data(data)
random.seed(87)
random.shuffle(tokenized_data)

train_size = int(len(tokenized_data) * 0.8)
train_data = tokenized_data[0:train_size]
test_data = tokenized_data[train_size:]

In [9]:
print("{} data are split into {} train and {} test set".format(
    len(tokenized_data), len(train_data), len(test_data)))

47961 data are split into 38368 train and 9593 test set


## count words and replace word with low frequency with unknown

In [17]:
def count_words(tokenized_sentences):
    flat_list = [item for sublist in tokenized_sentences for item in sublist]
    word_counts = Counter(flat_list)
    return word_counts

In [18]:
tokenized_sentences = [['sky', 'is', 'blue', '.'],
                       ['leaves', 'are', 'green', '.'],
                       ['roses', 'are', 'red', '.']]
count_words(tokenized_sentences)

Counter({'sky': 1,
         'is': 1,
         'blue': 1,
         '.': 3,
         'leaves': 1,
         'are': 2,
         'green': 1,
         'roses': 1,
         'red': 1})

If your model is performing autocomplete, but encounters a word that it never saw during training, it won't have an input word to help it determine the next word to suggest. The model will not be able to predict the next word because there are no counts for the current word.

* This 'new' word is called an 'unknown word', or out of vocabulary (OOV) words.
* The percentage of unknown words in the test set is called the OOV rate.


In [19]:
def get_words_with_nplus_frequency(tokenized_sentences, count_threshold):
    closed_vocab = []
    word_counts = count_words(tokenized_sentences)
    closed_vocab = [word for word, cnt in word_counts.items() if cnt >= count_threshold]    
    return closed_vocab

In [21]:
#test
tokenized_sentences = [['sky', 'is', 'blue', '.'],
                       ['leaves', 'are', 'green', '.'],
                       ['roses', 'are', 'red', '.']]
tmp_closed_vocab = get_words_with_nplus_frequency(tokenized_sentences, count_threshold=2)
print(f"Closed vocabulary:")
print(tmp_closed_vocab)

Closed vocabulary:
['.', 'are']


In [28]:
def replace_oov_words_by_unk(tokenized_sentences, vocabulary, unknown_token="<unk>"):
    
    vocabulary = set(vocabulary)
    replaced_tokenized_sentences = []
    for sentence in tokenized_sentences:
        replaced_sentence = [token if token in vocabulary else unknown_token for token in sentence ]
        replaced_tokenized_sentences.append(replaced_sentence)
        
    return replaced_tokenized_sentences

In [30]:
#test
tokenized_sentences = [["dogs", "run"], ["cats", "sleep"]]
vocabulary = ["dogs", "sleep"]
tmp_replaced_tokenized_sentences = replace_oov_words_by_unk(tokenized_sentences, vocabulary)
print(f"Original sentence:")
print(tokenized_sentences)
print(f"tokenized_sentences with less frequent words converted to '<unk>':")
print(tmp_replaced_tokenized_sentences)

Original sentence:
[['dogs', 'run'], ['cats', 'sleep']]
tokenized_sentences with less frequent words converted to '<unk>':
[['dogs', '<unk>'], ['<unk>', 'sleep']]


## preproccess train and test and replace low frwquencies with unknown

In [31]:
def preprocess_data(train_data, test_data, count_threshold):
    vocabulary = get_words_with_nplus_frequency(train_data,count_threshold)
    train_data_replaced = replace_oov_words_by_unk(train_data,vocabulary)
    test_data_replaced = replace_oov_words_by_unk(test_data,vocabulary)
    return train_data_replaced, test_data_replaced, vocabulary

In [32]:

minimum_freq = 2
train_data_processed, test_data_processed, vocabulary = preprocess_data(train_data, 
                                                                        test_data, 
                                                                        minimum_freq)

## n-grams
When computing the counts for n-grams, prepare the sentence beforehand by prepending $n-1$ starting markers $"<s>"$ to indicate the beginning of the sentence.
    
Also prepare the sentence for counting by appending an end token $"<e>"$ so that the model can predict when to finish a sentence.

In [40]:
def count_n_grams(data, n, start_token='<s>', end_token = '<e>'):
    n_grams = {}
    for sentence in data:
        sentence = [start_token] * n+ sentence + [end_token]
        sentence = tuple(sentence)
        m = len(sentence) if n==1 else len(sentence)-1
        for i in range(m): 
            n_gram = sentence[i:i+n]
            if n_gram in n_grams.keys(): 
                n_grams[n_gram] += 1
            else:
                n_grams[n_gram] = 1

    return n_grams

In [41]:
sentences = [['i', 'like', 'a', 'cat'],
             ['this', 'dog', 'is', 'like', 'a', 'cat']]
print("Uni-gram:")
print(count_n_grams(sentences, 1))
print("Bi-gram:")
print(count_n_grams(sentences, 2))


Uni-gram:
{('<s>',): 2, ('i',): 1, ('like',): 2, ('a',): 2, ('cat',): 2, ('<e>',): 2, ('this',): 1, ('dog',): 1, ('is',): 1}
Bi-gram:
{('<s>', '<s>'): 2, ('<s>', 'i'): 1, ('i', 'like'): 1, ('like', 'a'): 2, ('a', 'cat'): 2, ('cat', '<e>'): 2, ('<s>', 'this'): 1, ('this', 'dog'): 1, ('dog', 'is'): 1, ('is', 'like'): 1}


### estimated probability of a word given the previous n-gram using k-smoothing
$$ \hat{P}(w_t | w_{t-1}\dots w_{t-n}) = \frac{C(w_{t-1}\dots w_{t-n}, w_n) + k}{C(w_{t-1}\dots w_{t-n}) + k|V|} \tag{3} $$

In [43]:
def estimate_probability(word, previous_n_gram,n_gram_counts, n_plus1_gram_counts, vocabulary_size, k=1.0):
    previous_n_gram = tuple(previous_n_gram)
    count_previous_n_gram = n_gram_counts[previous_n_gram] if previous_n_gram in n_gram_counts.keys() else 0
    denominator = count_previous_n_gram + k * vocabulary_size
    
    n_plus1_gram = previous_n_gram + (word,)
    count_n_plus1_gram = n_plus1_gram_counts[n_plus1_gram] if n_plus1_gram in n_plus1_gram_counts.keys() else 0
    numerator = count_n_plus1_gram + k
    probability = numerator / denominator
    return probability

In [45]:
#test
sentences = [['i', 'like', 'a', 'cat'],
             ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))

unigram_counts = count_n_grams(sentences, 1)

bigram_counts = count_n_grams(sentences, 2)

tmp_prob = estimate_probability("cat", "a", unigram_counts, bigram_counts, len(unique_words), k=1)

print(f"The estimated probability of word 'cat' given the previous n-gram 'a' is: {tmp_prob:.4f}")

The estimated probability of word 'cat' given the previous n-gram 'a' is: 0.3333


Compute probability of all words after a n-gram.

In [46]:
def estimate_probabilities(previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary, k=1.0):


    previous_n_gram = tuple(previous_n_gram)
    
    # add <e> <unk> to the vocabulary
    # <s> is not needed since it should not appear as the next word
    vocabulary = vocabulary + ["<e>", "<unk>"]
    vocabulary_size = len(vocabulary)
    
    probabilities = {}
    for word in vocabulary:
        probability = estimate_probability(word, previous_n_gram, 
                                           n_gram_counts, n_plus1_gram_counts, 
                                           vocabulary_size, k=k)
        probabilities[word] = probability

    return probabilities


## Perplexity
Perplexity is used as an evaluation metric of your language model.
To calculate the the perplexity score of the test set on an n-gram model, use:
$$ PP(W) =\sqrt[N]{ \prod_{t=n+1}^N \frac{1}{P(w_t | w_{t-n} \cdots w_{t-1})} } \tag{4}$$
* where $N$ is the length of the sentence.
* $n$ is the number of words in the n-gram (e.g. 2 for a bigram).
* In math, the numbering starts at one and not zero.

The more the n-grams tell us about the sentence, the lower the perplexity score will be

In [47]:
def calculate_perplexity(sentence, n_gram_counts, n_plus1_gram_counts, vocabulary_size, k=1.0):

    # length of previous words
    n = len(list(n_gram_counts.keys())[0]) 
    
    # prepend <s> and append <e>
    sentence = ["<s>"] * n + sentence + ["<e>"]
    sentence = tuple(sentence)
    
    # length of sentence (after adding <s> and <e> tokens)
    N = len(sentence)
    product_pi = 1.0
    for t in range(n, N): 
        n_gram = sentence[t-n:t]
        word = sentence[t]
        probability = estimate_probability(word,n_gram, n_gram_counts, n_plus1_gram_counts, len(unique_words), k=1)
        product_pi += np.log(1 / probability)

    log_perplexity = product_pi**(1/float(N))

    return log_perplexity

## auto-complete system

In [48]:
def suggest_a_word(previous_tokens, n_gram_counts, n_plus1_gram_counts, vocabulary, k=1.0, start_with=None):
    n = len(list(n_gram_counts.keys())[0]) 

    previous_n_gram = previous_tokens[-n:]
    probabilities = estimate_probabilities(previous_n_gram,
                                           n_gram_counts, n_plus1_gram_counts,
                                           vocabulary, k=k)
    
    suggestion = None
    max_prob = 0
    for word, prob in probabilities.items(): 
        if prob > max_prob: 
            suggestion = word
            max_prob = prob

    return suggestion, max_prob

In [51]:
#test
sentences = [['i', 'like', 'a', 'cat'],
             ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))

unigram_counts = count_n_grams(sentences, 1)
bigram_counts = count_n_grams(sentences, 2)

previous_tokens = ["i", "like"]
tmp_suggest1 = suggest_a_word(previous_tokens, unigram_counts, bigram_counts, unique_words, k=1.0)
print(f"The previous words are 'i like',\n\tand the suggested word is `{tmp_suggest1[0]}` with a probability of {tmp_suggest1[1]:.4f}")

The previous words are 'i like',
	and the suggested word is `a` with a probability of 0.2727



## Suggest multiple words using n-grams of varying length

In [52]:
def get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0, start_with=None):
    model_counts = len(n_gram_counts_list)
    suggestions = []
    for i in range(model_counts-1):
        n_gram_counts = n_gram_counts_list[i]
        n_plus1_gram_counts = n_gram_counts_list[i+1]
        
        suggestion = suggest_a_word(previous_tokens, n_gram_counts,
                                    n_plus1_gram_counts, vocabulary,
                                    k=k, start_with=start_with)
        suggestions.append(suggestion)
    return suggestions

In [53]:

n_gram_counts_list = []
for n in range(1, 6):
    print("Computing n-gram counts with n =", n, "...")
    n_model_counts = count_n_grams(train_data_processed, n)
    n_gram_counts_list.append(n_model_counts)

Computing n-gram counts with n = 1 ...
Computing n-gram counts with n = 2 ...
Computing n-gram counts with n = 3 ...
Computing n-gram counts with n = 4 ...
Computing n-gram counts with n = 5 ...


In [54]:
previous_tokens = ["i", "want", "to", "go"]
tmp_suggest5 = get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0)

print(f"The previous words are {previous_tokens}, the suggestions are:")
display(tmp_suggest5)

The previous words are ['i', 'want', 'to', 'go'], the suggestions are:


[('to', 0.014051961029228078),
 ('to', 0.004697942168993581),
 ('to', 0.0009424436216762033),
 ('to', 0.0004044489383215369)]

## make automate sentences

In [64]:
previous_tokens = ["i", "want"]
n_gram_counts_list = []
for n in range(1, 8):
    n_model_counts = count_n_grams(train_data_processed, n)
    n_gram_counts_list.append(n_model_counts)
for i in range(1,10):
    tmp_suggest5 = get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0)
    selected_word = max(tmp_suggest5)[0]
    previous_tokens = previous_tokens + [selected_word]
    
print(' '.join(previous_tokens))

i want to be in the same thing i said i


In [69]:
previous_tokens = ["he", "love"]
for i in range(1,10):
    tmp_suggest5 = get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=3.0)
    selected_word = max(tmp_suggest5)[0]
    previous_tokens = previous_tokens + [selected_word]

print(' '.join(previous_tokens))

he love you i i i i i i i i


In [71]:
previous_tokens = ["hey", "how", "are", "you"]

for i in range(1,11):
    tmp_suggest5 = get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0)
    selected_word = max(tmp_suggest5)[0]
    previous_tokens = previous_tokens + [selected_word]

print(' '.join(previous_tokens))

hey how are you ? i miss you guys so much for the rt


## test set

In [None]:
for sentence in test_data_processed[0:10]:
    cntr = 0
    if len(sentence)>2:
        previous_tokens.append(sentence[0])
        for index in range(1,len(sentence)-1):
            tmp_suggest5 = get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0)
            selected_word = max(tmp_suggest5)[0]
            previous_tokens = previous_tokens + [sentence[index]]
            if selected_word == sentence[index]:
                cntr +=1
                
        score = cntr/(len(sentence)-1)
        print(sentence,":",score," ",cntr)
    else:
        continue

['that', 'picture', 'i', 'just', 'seen', 'whoa', 'dere', '!', '!', '>', '>', '>', '>', '>', '>', '>'] : 0.13333333333333333   2
['better', 'than', '``', 'misplaced', '<unk>', 'marks', "''", 'rt', ':', 'lately', 'i', 'find', 'that', 'i', 'am', 'adding', 'too', 'many', 'exclamation', 'points', 'where', 'they', 'do', "n't", 'belong', '!'] : 0.2   5
['producers', '!', 'send', 'me', 'some', 'beats', 'to', 'my', 'email', '....', '@'] : 0.0   0
['yeah', ',', 'the', 'beach', 'house', 'is', 'pretty', '<unk>', 'got', 'the', 'vinyl', 'which', 'came', 'with', 'a', 'download', 'card', '(', 'nice', 'touch', ')', ')', '<unk>', 'hate', 'lp', "'s", 'w/o', 'download', '.'] : 0.0   0
['pittsburgh', 'is', 'trending', '....', 'represent', '.'] : 0.0   0
['full', 'house', 'for', '<unk>', 'of', '<unk>', '!'] : 0.0   0
