# Week 02: N-Gram Language Models

**1. Write out the equation for trigram probability estimation (modifying Eq. 3.11 from SLP Chapter 3). Now write out all the non-zero trigram probabilities for the I am Sam corpus in Chapter 3 on page 4.**

$$P(w_i | w_{i-2}, w_{i-1}) = \frac{c(w_{i-2}, w_{i-1},w_{i})}{c(w_{i-2}, w_{i-1})}$$

NOTE: The non-zero trigram probabilities were written out by hand (see the handwritten notes on the iPad).






**2. Write a program to compute unsmoothed n-grams. Use the Dr. Seuss corpus to test this.**

In [62]:
import re
from collections import Counter

In [126]:
def pre_process_corpus(corpus):
    """Split raw corpus text into a flat list of tokens.

    A token is either a sentence-boundary marker (``<s>`` or ``</s>``) or a
    maximal run of ASCII letters and digits. Every other character
    (whitespace, punctuation, apostrophes, ...) acts as a separator and is
    dropped, so ``"how's"`` becomes ``['how', 's']``.

    Args:
        corpus: The raw text, which may contain ``<s>`` / ``</s>`` markers.

    Returns:
        The list of tokens in their order of appearance.
    """
    # The boundary markers are explicit alternatives. Writing them inside a
    # character class would merely whitelist the single characters
    # '<', 's', '>', '/', '(', ')' and '|', so inputs such as "(Sam)" or
    # "a|b" would wrongly survive as one token.
    return re.findall(r'</?s>|[A-Za-z0-9]+', corpus)

# The "I am Sam" toy corpus, with explicit sentence-boundary markers.
corpus = '''
<s> I am Sam </s>
<s> Sam I am </s>
<s> I do not like green eggs and ham </s>
'''
# Show the token list, then the number of tokens (echoed as the cell result).
print(pre_process_corpus(corpus))
len(pre_process_corpus(corpus))


['<s>', 'I', 'am', 'Sam', '</s>', '<s>', 'Sam', 'I', 'am', '</s>', '<s>', 'I', 'do', 'not', 'like', 'green', 'eggs', 'and', 'ham', '</s>']


20

In [114]:
def find_consecutive_tuples(lst, n = 3):
    """Count every window of ``n`` consecutive items in ``lst``.

    Args:
        lst: The sequence to slide over (e.g. a list of tokens).
        n: The window size, i.e. the order of the n-gram.

    Returns:
        A ``Counter`` mapping each window (as a tuple) to the number of
        times it occurs. With ``n = 0`` the empty tuple is counted
        ``len(lst) + 1`` times; when ``n`` exceeds ``len(lst)`` the
        counter is empty.
    """
    window_total = len(lst) - n + 1
    return Counter(
        tuple(lst[start:start + n]) for start in range(window_total)
    )

# Example usage: a 6-item list yields three overlapping 4-tuples, each seen once
my_list = [1, 2, 3, 4, 5, 6]
n = 4  # Size of each tuple
find_consecutive_tuples(my_list, n)

Counter({(1, 2, 3, 4): 1, (2, 3, 4, 5): 1, (3, 4, 5, 6): 1})

In [65]:
def cond_prob(sentence, counts_n, counts_n_minus_1):
    """Return the relative-frequency estimate P(last item | preceding items).

    Args:
        sentence: The n-gram as a tuple; its last item is the predicted word
            and the items before it form the context.
        counts_n: Mapping from n-gram tuples to their counts.
        counts_n_minus_1: Mapping from (n-1)-gram tuples to their counts.

    Returns:
        count(sentence) / count(context).
    """
    context = sentence[:-1]
    numerator = counts_n[sentence]
    denominator = counts_n_minus_1[context]
    return numerator / denominator

In [124]:
def n_grams(corpus, n = 2):
    """Estimate unsmoothed n-gram probabilities from ``corpus``.

    The corpus is tokenized into one continuous token stream, so n-grams
    spanning a sentence boundary (e.g. ``('</s>', '<s>')``) are counted too.
    The token list and every estimated probability are printed as a side
    effect; an n-gram that occurs several times is printed once per
    occurrence.

    Args:
        corpus: The raw training text.
        n: The order of the model (1 = unigram, 2 = bigram, ...).

    Returns:
        A dict mapping each n-gram tuple seen in the corpus to
        count(n-gram) / count(its (n-1)-token context).
    """
    # NOTE: This implementation only looks at the n-grams that occur in the
    # corpus. A better implementation would use the vocabulary and consider
    # every possible ordered n-tuple of words.
    tokens = pre_process_corpus(corpus)

    ngram_counts = find_consecutive_tuples(tokens, n)
    context_counts = find_consecutive_tuples(tokens, n - 1)

    # Unigram case: the 0-length window is counted len(tokens) + 1 times, so
    # drop one to make the denominator equal to the number of tokens.
    if n == 1:
        context_counts[()] -= 1

    print(tokens)

    probabilities = {}
    for start in range(len(tokens) - n + 1):
        ngram = tuple(tokens[start:start + n])
        estimate = cond_prob(ngram, ngram_counts, context_counts)
        print(f"P({ngram[-1]}|{ngram[:-1]}) =", estimate)
        probabilities[ngram] = estimate

    return probabilities

In [67]:
# Unigram: n-gram with n=1 (the context is the empty tuple)
corpus = '''
<s> I am Sam </s>
<s> Sam I am </s>
<s> I do not like green eggs and ham </s>
'''
n_grams(corpus, n=1)

P(<s>|()) = 0.15
P(I|()) = 0.15
P(am|()) = 0.1
P(Sam|()) = 0.1
P(</s>|()) = 0.15
P(<s>|()) = 0.15
P(Sam|()) = 0.1
P(I|()) = 0.15
P(am|()) = 0.1
P(</s>|()) = 0.15
P(<s>|()) = 0.15
P(I|()) = 0.15
P(do|()) = 0.05
P(not|()) = 0.05
P(like|()) = 0.05
P(green|()) = 0.05
P(eggs|()) = 0.05
P(and|()) = 0.05
P(ham|()) = 0.05
P(</s>|()) = 0.15


In [68]:
# Book example: bigram, i.e. n-gram with n=2
# NOTE: the corpus is read as one continuous token stream, so bigrams that
# cross a sentence boundary, such as (</s>, <s>), are estimated as well.
corpus = '''
<s> I am Sam </s>
<s> Sam I am </s>
<s> I do not like green eggs and ham </s>
'''
n_grams(corpus, n=2)

P(I|('<s>',)) = 0.6666666666666666
P(am|('I',)) = 0.6666666666666666
P(Sam|('am',)) = 0.5
P(</s>|('Sam',)) = 0.5
P(<s>|('</s>',)) = 0.6666666666666666
P(Sam|('<s>',)) = 0.3333333333333333
P(I|('Sam',)) = 0.5
P(am|('I',)) = 0.6666666666666666
P(</s>|('am',)) = 0.5
P(<s>|('</s>',)) = 0.6666666666666666
P(I|('<s>',)) = 0.6666666666666666
P(do|('I',)) = 0.3333333333333333
P(not|('do',)) = 1.0
P(like|('not',)) = 1.0
P(green|('like',)) = 1.0
P(eggs|('green',)) = 1.0
P(and|('eggs',)) = 1.0
P(ham|('and',)) = 1.0
P(</s>|('ham',)) = 1.0


In [60]:
# Trigram: n-gram with n=3
# Each sentence is padded with two <s> markers so that its first word
# has a full two-token context.
corpus = '''
<s> <s> I am Sam </s>
<s> <s> Sam I am </s>
<s> <s> I do not like green eggs and ham </s>
'''
n_grams(corpus, n=3)

Counter({('<s>', '<s>'): 3, ('<s>', 'I'): 2, ('I', 'am'): 2, ('</s>', '<s>'): 2, ('am', 'Sam'): 1, ('Sam', '</s>'): 1, ('<s>', 'Sam'): 1, ('Sam', 'I'): 1, ('am', '</s>'): 1, ('I', 'do'): 1, ('do', 'not'): 1, ('not', 'like'): 1, ('like', 'green'): 1, ('green', 'eggs'): 1, ('eggs', 'and'): 1, ('and', 'ham'): 1, ('ham', '</s>'): 1})
P(I|('<s>', '<s>')) = 0.6666666666666666
P(am|('<s>', 'I')) = 0.5
P(Sam|('I', 'am')) = 0.5
P(</s>|('am', 'Sam')) = 1.0
P(<s>|('Sam', '</s>')) = 1.0
P(<s>|('</s>', '<s>')) = 1.0
P(Sam|('<s>', '<s>')) = 0.3333333333333333
P(I|('<s>', 'Sam')) = 1.0
P(am|('Sam', 'I')) = 1.0
P(</s>|('I', 'am')) = 0.5
P(<s>|('am', '</s>')) = 1.0
P(<s>|('</s>', '<s>')) = 1.0
P(I|('<s>', '<s>')) = 0.6666666666666666
P(do|('<s>', 'I')) = 0.5
P(not|('I', 'do')) = 1.0
P(like|('do', 'not')) = 1.0
P(green|('not', 'like')) = 1.0
P(eggs|('like', 'green')) = 1.0
P(and|('green', 'eggs')) = 1.0
P(ham|('eggs', 'and')) = 1.0
P(</s>|('and', 'ham')) = 1.0


In [98]:
# NOTE: Probability of each sentence must be 1 in the bigram case
def probabilities_of_sentence(corpus, n = 2):
    """Print the n-gram model probability of every sentence in ``corpus``.

    The model is trained on the whole corpus with ``n_grams`` (which prints
    its own output). Each ``<s> ... </s>`` span is then scored as the product
    of the probabilities of its n-grams.

    Args:
        corpus: The raw text; each sentence is delimited by ``<s>`` and
            ``</s>`` and must not span several lines.
        n: The order of the n-gram model.

    Returns:
        None. The tokens and the probability of each sentence are printed.
    """
    # Non-greedy match: several "<s> ... </s>" sentences written on the same
    # line stay separate. A greedy ".*" would run to the last "</s>" of the
    # line and merge them into one long "sentence".
    sentences = re.findall(r'<s>.*?</s>', corpus)
    probabilities = n_grams(corpus, n)

    for sentence in sentences:
        tokens = pre_process_corpus(sentence)
        cum_prob = 1
        # An n-gram occurring `count` times contributes its probability
        # `count` times to the product.
        for tuple_n, count in find_consecutive_tuples(tokens, n).items():
            cum_prob *= probabilities[tuple_n] ** count

        print(tokens)
        print("P(sentence) = ", cum_prob)

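# NOTE: The sentence probabilities do not come out as 1 (nor do they sum to 1): the model
# also gives probability mass to other word sequences it can generate. More generally,
# n-grams are an insufficient model of language because they don't capture long-distance dependencies.

# But in some tasks this model does a decent job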

# Probability of each training sentence under the bigram model (default n = 2)
corpus = '''
<s> I am Sam </s>
<s> Sam I am </s>
<s> I do not like green eggs and ham </s>
'''

probabilities_of_sentence(corpus)

P(I|('<s>',)) = 0.6666666666666666
P(am|('I',)) = 0.6666666666666666
P(Sam|('am',)) = 0.5
P(</s>|('Sam',)) = 0.5
P(<s>|('</s>',)) = 0.6666666666666666
P(Sam|('<s>',)) = 0.3333333333333333
P(I|('Sam',)) = 0.5
P(am|('I',)) = 0.6666666666666666
P(</s>|('am',)) = 0.5
P(<s>|('</s>',)) = 0.6666666666666666
P(I|('<s>',)) = 0.6666666666666666
P(do|('I',)) = 0.3333333333333333
P(not|('do',)) = 1.0
P(like|('not',)) = 1.0
P(green|('like',)) = 1.0
P(eggs|('green',)) = 1.0
P(and|('eggs',)) = 1.0
P(ham|('and',)) = 1.0
P(</s>|('ham',)) = 1.0
['<s>', 'I', 'am', 'Sam', '</s>']
P(sentence) =  0.1111111111111111
['<s>', 'Sam', 'I', 'am', '</s>']
P(sentence) =  0.05555555555555555
['<s>', 'I', 'do', 'not', 'like', 'green', 'eggs', 'and', 'ham', '</s>']
P(sentence) =  0.2222222222222222


In [101]:
# Probability of each training sentence under a 4-gram model
# NOTE(review): the sentences carry only two <s> pads; a 4-gram model would
# usually be padded with three — confirm whether this is intended.
corpus = '''
<s> <s> I am Sam </s>
<s> <s> Sam I am </s>
<s> <s> I do not like green eggs and ham </s>
'''

probabilities_of_sentence(corpus, n = 4)

P(am|('<s>', '<s>', 'I')) = 0.5
P(Sam|('<s>', 'I', 'am')) = 1.0
P(</s>|('I', 'am', 'Sam')) = 1.0
P(<s>|('am', 'Sam', '</s>')) = 1.0
P(<s>|('Sam', '</s>', '<s>')) = 1.0
P(Sam|('</s>', '<s>', '<s>')) = 0.5
P(I|('<s>', '<s>', 'Sam')) = 1.0
P(am|('<s>', 'Sam', 'I')) = 1.0
P(</s>|('Sam', 'I', 'am')) = 1.0
P(<s>|('I', 'am', '</s>')) = 1.0
P(<s>|('am', '</s>', '<s>')) = 1.0
P(I|('</s>', '<s>', '<s>')) = 0.5
P(do|('<s>', '<s>', 'I')) = 0.5
P(not|('<s>', 'I', 'do')) = 1.0
P(like|('I', 'do', 'not')) = 1.0
P(green|('do', 'not', 'like')) = 1.0
P(eggs|('not', 'like', 'green')) = 1.0
P(and|('like', 'green', 'eggs')) = 1.0
P(ham|('green', 'eggs', 'and')) = 1.0
P(</s>|('eggs', 'and', 'ham')) = 1.0
['<s>', '<s>', 'I', 'am', 'Sam', '</s>']
P(sentence) =  0.5
['<s>', '<s>', 'Sam', 'I', 'am', '</s>']
P(sentence) =  1.0
['<s>', '<s>', 'I', 'do', 'not', 'like', 'green', 'eggs', 'and', 'ham', '</s>']
P(sentence) =  0.5


**3. Run your n-gram program on two different small corpora of your choice (you might use email text or newsgroups). Now compare the statistics of the two corpora. What are the differences in the most common unigrams between the two? How about interesting differences in bigrams?**


In [103]:
# corpus1: formal, technical prose.
corpus1 = '''
<s> The study of artificial intelligence has made significant progress in recent years </s>
<s> Advanced algorithms and computing power have enabled machines to learn complex patterns and make decisions based on data </s>
<s> The application of machine learning in various fields such as healthcare, finance, and transportation has demonstrated its potential to revolutionize industries </s>
'''

# corpus2: informal, conversational text. The tokenizer drops punctuation and
# splits contractions at the apostrophe (e.g. "how's" -> "how", "s").
corpus2 = '''
<s> Hey, how's it going? I was thinking about catching a movie later </s>
<s> Yeah, that sounds great! I've been wanting to see the new superhero film </s>
<s> It's supposed to be really good. Let's grab some dinner before the movie </s>
<s> Maybe pizza or burgers? </s>

'''

In [105]:
# Bigram probabilities estimated from corpus1
probs = n_grams(corpus1, n = 2)

P(The|('<s>',)) = 0.6666666666666666
P(study|('The',)) = 0.5
P(of|('study',)) = 1.0
P(artificial|('of',)) = 0.5
P(intelligence|('artificial',)) = 1.0
P(has|('intelligence',)) = 1.0
P(made|('has',)) = 0.5
P(significant|('made',)) = 1.0
P(progress|('significant',)) = 1.0
P(in|('progress',)) = 1.0
P(recent|('in',)) = 0.5
P(years|('recent',)) = 1.0
P(</s>|('years',)) = 1.0
P(<s>|('</s>',)) = 0.6666666666666666
P(Advanced|('<s>',)) = 0.3333333333333333
P(algorithms|('Advanced',)) = 1.0
P(and|('algorithms',)) = 1.0
P(computing|('and',)) = 0.3333333333333333
P(power|('computing',)) = 1.0
P(have|('power',)) = 1.0
P(enabled|('have',)) = 1.0
P(machines|('enabled',)) = 1.0
P(to|('machines',)) = 1.0
P(learn|('to',)) = 0.5
P(complex|('learn',)) = 1.0
P(patterns|('complex',)) = 1.0
P(and|('patterns',)) = 1.0
P(make|('and',)) = 0.3333333333333333
P(decisions|('make',)) = 1.0
P(based|('decisions',)) = 1.0
P(on|('based',)) = 1.0
P(data|('on',)) = 1.0
P(</s>|('data',)) = 1.0
P(<s>|('</s>',)) = 0.6666666

In [107]:
# Bigram probabilities estimated from corpus2
probs = n_grams(corpus2, n = 2)

P(Hey|('<s>',)) = 0.25
P(how|('Hey',)) = 1.0
P(s|('how',)) = 1.0
P(it|('s',)) = 0.3333333333333333
P(going|('it',)) = 1.0
P(I|('going',)) = 1.0
P(was|('I',)) = 0.5
P(thinking|('was',)) = 1.0
P(about|('thinking',)) = 1.0
P(catching|('about',)) = 1.0
P(a|('catching',)) = 1.0
P(movie|('a',)) = 1.0
P(later|('movie',)) = 0.5
P(</s>|('later',)) = 1.0
P(<s>|('</s>',)) = 0.75
P(Yeah|('<s>',)) = 0.25
P(that|('Yeah',)) = 1.0
P(sounds|('that',)) = 1.0
P(great|('sounds',)) = 1.0
P(I|('great',)) = 1.0
P(ve|('I',)) = 0.5
P(been|('ve',)) = 1.0
P(wanting|('been',)) = 1.0
P(to|('wanting',)) = 1.0
P(see|('to',)) = 0.5
P(the|('see',)) = 1.0
P(new|('the',)) = 0.5
P(superhero|('new',)) = 1.0
P(film|('superhero',)) = 1.0
P(</s>|('film',)) = 1.0
P(<s>|('</s>',)) = 0.75
P(It|('<s>',)) = 0.25
P(s|('It',)) = 1.0
P(supposed|('s',)) = 0.3333333333333333
P(to|('supposed',)) = 1.0
P(be|('to',)) = 0.5
P(really|('be',)) = 1.0
P(good|('really',)) = 1.0
P(Let|('good',)) = 1.0
P(s|('Let',)) = 1.0
P(grab|('s',)) = 0.3333

**4. Write a definition of perplexity in language modeling.**

**Perplexity** is a metric for measuring the performance of different natural language models in an intrinsic way, or in other words, without taking into account any particular application.

The metric is the inverse probability of the test set, normalized by the number of words (N):

$$PP(W) = P(w_1 w_2 \dots w_N)^{-\frac{1}{N}}$$

It's helpful for quickly prototyping models (and choosing between them), but a better perplexity doesn't guarantee a better extrinsic result in a specific application.

It can also be understood in terms of information theory: perplexity is 2 raised to the cross-entropy of the model on the test data.

**5. Add an option to your program to compute the perplexity of a test set.**



In [137]:
def preplexity(test_corpus, probabilities, n = 2):
    """Compute the perplexity of ``test_corpus`` under a trained n-gram model.

    Unlike ``probabilities_of_sentence``, the cumulative probability P(W) is
    computed over the whole test corpus, not sentence by sentence. P(W) is
    printed as a side effect.

    Args:
        test_corpus: The raw test text.
        probabilities: Mapping from n-gram tuples to probabilities, as
            returned by ``n_grams`` on the training corpus.
        n: The order of the n-gram model used to build ``probabilities``.

    Returns:
        P(W) ** (-1 / N), where N is the number of tokens in the test
        corpus, or ``float('inf')`` when P(W) is zero.

    Raises:
        ZeroDivisionError: If the test corpus contains no tokens.
    """
    # NOTE: the probabilities were already estimated by n_grams; testing only
    # needs to look up the probability of each n-gram of the test corpus.
    test_corpus = pre_process_corpus(test_corpus)

    counts_n = find_consecutive_tuples(test_corpus, n)

    cum_prob = 1

    for tuple_n, count in counts_n.items():
        if tuple_n in probabilities:
            cum_prob *= probabilities[tuple_n] ** count
        else:
            # An n-gram never seen in training has probability 0 in an
            # unsmoothed model, which makes the whole corpus impossible.
            cum_prob *= 0
            break

    # NOTE: here W is the whole corpus
    print("P(W) =", cum_prob)

    # A zero probability means infinite perplexity. Evaluating
    # 0 ** (negative power) would raise ZeroDivisionError instead.
    # NOTE: for very long corpora the product can underflow to 0.0 as well.
    if cum_prob == 0:
        return float("inf")

    return cum_prob ** (-1 / len(test_corpus))

In [138]:
corpus_train = '''
<s> I am Sam </s>
<s> Sam I am </s>
<s> I do not like green eggs and ham </s>
'''

# NOTE: Testing on the training corpus gives an artificially low perplexity
probabilities = n_grams(corpus_train, n = 2)
print("Preplexity: ", preplexity(corpus_train, probabilities, n = 2))

['<s>', 'I', 'am', 'Sam', '</s>', '<s>', 'Sam', 'I', 'am', '</s>', '<s>', 'I', 'do', 'not', 'like', 'green', 'eggs', 'and', 'ham', '</s>']
P(I|('<s>',)) = 0.6666666666666666
P(am|('I',)) = 0.6666666666666666
P(Sam|('am',)) = 0.5
P(</s>|('Sam',)) = 0.5
P(<s>|('</s>',)) = 0.6666666666666666
P(Sam|('<s>',)) = 0.3333333333333333
P(I|('Sam',)) = 0.5
P(am|('I',)) = 0.6666666666666666
P(</s>|('am',)) = 0.5
P(<s>|('</s>',)) = 0.6666666666666666
P(I|('<s>',)) = 0.6666666666666666
P(do|('I',)) = 0.3333333333333333
P(not|('do',)) = 1.0
P(like|('not',)) = 1.0
P(green|('like',)) = 1.0
P(eggs|('green',)) = 1.0
P(and|('eggs',)) = 1.0
P(ham|('and',)) = 1.0
P(</s>|('ham',)) = 1.0
P(W) = 0.0006096631611034902
Preplexity:  1.4479231182334018


In [139]:
# Example with digits: each of the 10 digits appears exactly once, so every
# unigram has probability 0.1 and the perplexity equals 10.

corpus_train = '''0 1 2 3 4 5 6 7 8 9'''

# NOTE: Testing on the training corpus gives an artificially low perplexity
probabilities = n_grams(corpus_train, n = 1)
print("Preplexity: ", preplexity(corpus_train, probabilities, n = 1))

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
P(0|()) = 0.1
P(1|()) = 0.1
P(2|()) = 0.1
P(3|()) = 0.1
P(4|()) = 0.1
P(5|()) = 0.1
P(6|()) = 0.1
P(7|()) = 0.1
P(8|()) = 0.1
P(9|()) = 0.1
P(W) = 1.0000000000000006e-10
Preplexity:  10.0


In [140]:
# Example with digits, using a heavily skewed training distribution

# Training set: 91 zeros followed by one each of the digits 1-9 (100 tokens)
corpus_train = " 0" * 91 + " 1 2 3 4 5 6 7 8 9"

print(corpus_train)

corpus_test = "0 0 0 0 0 3 0 0 0 0"

# NOTE: The perplexity is low because the training corpus is mostly zeros, so
# the model gives zero a high probability, and the test set is mostly zeros too.

# NOTE: This example helps to understand perplexity as the weighted average
# branching factor of a language. Normally the branching factor is the number
# of possible next words that can follow any word; here it would be 10, because
# we can only choose among the digits 0 to 9. But since perplexity is based on
# the probabilities, it gives more weight to the frequent zeros in the test
# set, and the perplexity is consequently lower than 10.
probabilities = n_grams(corpus_train, n = 1)
print("Preplexity: ", preplexity(corpus_test, probabilities, n = 1))

 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 3 4 5 6 7 8 9
['0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
P(0|()) = 0.91
P(0|()) = 0.91
P(0|()) = 0.91
P(0|()) = 0.91
P(0|()) = 0.91
P(0|()) = 0.91
P(0|()) = 0.91
P(0|()) = 0.91
P(0|()) = 0.91
P(0|()) = 0.91
P(0|()) = 0.91
P(0|()) = 0.91
P(0|()) = 0.91
P(0|()) = 0.91
P(0|()) = 0.91
P(0|()) = 0.91
P(0|()) = 0.91
P(0|()) = 0.91
P(0|()) = 0.91
P(0|()) = 0.9

**6. You are given a training set of 100 numbers that consists of 91 zeros and 1 each of
the other digits 1-9. Now we see the following test set: 0 0 0 0 0 3 0 0 0 0. What is
the unigram perplexity?**

NOTE: See the handwritten derivation on the iPad; the computation below reproduces it.

In [142]:
# Unigram perplexity of the test set "0 0 0 0 0 3 0 0 0 0": nine zeros with
# P(0) = 0.91 and one "3" with P(3) = 0.01, normalized over N = 10 tokens.
((0.91 ** 9) * 0.01) ** (-1/10)

1.7252925496828495

# Questions to Ask

1. The objective of the n-gram language model is to estimate all the possible n-gram conditional probabilities, right?
    - In a bigram model, for example, are we only interested in the consecutive pairs seen in the corpus, or in any pair in general? I think any pair in general, because we have to build the table for all the pairs.
    - Then the process would be: based on the vocabulary, build all the pairs and assign a probability to each pair, taking the order into account, because (I, am) is different from (am, I).
    - NOTE: Then, if a pair doesn't appear in the given corpus, it will have probability 0, right? Or not?
    - A consequence of this model is that if a word is not present in the training corpus, then it has no assigned probability.

2. What happens when the test corpus has a word (or pair) that is not in the training corpus? Does the probability P(corpus) become zero and the perplexity go to infinity?



# Takeaways

- By trying two corpora on two different topics, like emails and newspapers, you get an intuition of what you should expect in different problems.