# Exercise 05: Language Models

### Task 1: Creating a Language Model

Try to implement the pen-and-paper Task 2 a)–c) in Python.

In [1]:
import nltk
from nltk.util import ngrams
from nltk.lm.preprocessing import pad_both_ends
from collections import Counter
from pprint import pprint

# toy corpus: every inner list is one already-tokenized sentence
text = [
    ["ain't", "no", "sunshine"],
    ["when", "she's", "gone"],
    ["it's", "not", "warm"],
    ["when", "she's", "away"],
    ["ain't", "no", "sunshine"],
    ["when", "she's", "gone"],
]

# wrap each sentence in the <s> ... </s> boundary symbols
text_pad = []
for sentence in text:
    text_pad.append(list(pad_both_ends(sentence, n=2)))
print(text_pad)


# a)
# n-grams are extracted sentence by sentence, so no bigram spans two sentences
unigram_counts = Counter(
    gram for sentence in text_pad for gram in ngrams(sentence, 1)
)
bigram_counts = Counter(
    gram for sentence in text_pad for gram in ngrams(sentence, 2)
)

print()
pprint(unigram_counts)
print()
pprint(bigram_counts)

# number of words (sum of all unigram counts, boundary symbols included)
N = sum(unigram_counts.values())
print(N)

[['<s>', "ain't", 'no', 'sunshine', '</s>'], ['<s>', 'when', "she's", 'gone', '</s>'], ['<s>', "it's", 'not', 'warm', '</s>'], ['<s>', 'when', "she's", 'away', '</s>'], ['<s>', "ain't", 'no', 'sunshine', '</s>'], ['<s>', 'when', "she's", 'gone', '</s>']]

Counter({('<s>',): 6,
         ('</s>',): 6,
         ('when',): 3,
         ("she's",): 3,
         ("ain't",): 2,
         ('no',): 2,
         ('sunshine',): 2,
         ('gone',): 2,
         ("it's",): 1,
         ('not',): 1,
         ('warm',): 1,
         ('away',): 1})

Counter({('<s>', 'when'): 3,
         ('when', "she's"): 3,
         ('<s>', "ain't"): 2,
         ("ain't", 'no'): 2,
         ('no', 'sunshine'): 2,
         ('sunshine', '</s>'): 2,
         ("she's", 'gone'): 2,
         ('gone', '</s>'): 2,
         ('<s>', "it's"): 1,
         ("it's", 'not'): 1,
         ('not', 'warm'): 1,
         ('warm', '</s>'): 1,
         ("she's", 'away'): 1,
         ('away', '</s>'): 1})
30


In [2]:
# compute the probabilities

# unigram estimate: count of the token divided by the total token count N
unigram_probs = {
    unigram: count / N
    for unigram, count in unigram_counts.items()
}
pprint(unigram_probs)
print()

# bigram estimate: count(w1, w2) divided by count(w1);
# unigram keys are 1-tuples, hence the (first_word,) lookup
bigram_probs = {
    bigram: count / unigram_counts[(bigram[0],)]
    for bigram, count in bigram_counts.items()
}
pprint(bigram_probs)

{('</s>',): 0.2,
 ('<s>',): 0.2,
 ("ain't",): 0.06666666666666667,
 ('away',): 0.03333333333333333,
 ('gone',): 0.06666666666666667,
 ("it's",): 0.03333333333333333,
 ('no',): 0.06666666666666667,
 ('not',): 0.03333333333333333,
 ("she's",): 0.1,
 ('sunshine',): 0.06666666666666667,
 ('warm',): 0.03333333333333333,
 ('when',): 0.1}

{('<s>', "ain't"): 0.3333333333333333,
 ('<s>', "it's"): 0.16666666666666666,
 ('<s>', 'when'): 0.5,
 ("ain't", 'no'): 1.0,
 ('away', '</s>'): 1.0,
 ('gone', '</s>'): 1.0,
 ("it's", 'not'): 1.0,
 ('no', 'sunshine'): 1.0,
 ('not', 'warm'): 1.0,
 ("she's", 'away'): 0.3333333333333333,
 ("she's", 'gone'): 0.6666666666666666,
 ('sunshine', '</s>'): 1.0,
 ('warm', '</s>'): 1.0,
 ('when', "she's"): 1.0}


In [6]:
# b) take a look first...
# both rankings, most frequent first, separated by one blank line
print(unigram_counts.most_common(), bigram_counts.most_common(), sep="\n\n")

[(('<s>',), 6), (('</s>',), 6), (('when',), 3), (("she's",), 3), (("ain't",), 2), (('no',), 2), (('sunshine',), 2), (('gone',), 2), (("it's",), 1), (('not',), 1), (('warm',), 1), (('away',), 1)]

[(('<s>', 'when'), 3), (('when', "she's"), 3), (('<s>', "ain't"), 2), (("ain't", 'no'), 2), (('no', 'sunshine'), 2), (('sunshine', '</s>'), 2), (("she's", 'gone'), 2), (('gone', '</s>'), 2), (('<s>', "it's"), 1), (("it's", 'not'), 1), (('not', 'warm'), 1), (('warm', '</s>'), 1), (("she's", 'away'), 1), (('away', '</s>'), 1)]


In [7]:
# b) adhere to task instructions....
# unigrams: the boundary symbols <s> / </s> are left out of the ranking
uni_candidates = {
    gram: count
    for gram, count in unigram_counts.items()
    if gram[0] not in ['<s>', '</s>']
}
max_uni = max(uni_candidates.values())
print({gram: count for gram, count in uni_candidates.items() if count == max_uni})

# bigrams: drop every pair that starts with <s> or ends with </s>
bi_candidates = {
    gram: count
    for gram, count in bigram_counts.items()
    if not (gram[0] == '<s>' or gram[1] == '</s>')
}
max_bi = max(bi_candidates.values())

print({gram: count for gram, count in bi_candidates.items() if count == max_bi})

{('when',): 3, ("she's",): 3}
{('when', "she's"): 3}


In [8]:
# c)
def get_prob(ngrams, probs):
    """Multiply the probabilities of all n-grams of a sequence.

    Args:
        ngrams: iterable of n-gram keys (tuples in this notebook).
        probs: mapping from n-gram key to its probability.

    Returns:
        The product of the looked-up probabilities. An n-gram missing from
        ``probs`` contributes a factor of 0; an empty ``ngrams`` yields 1.
    """
    # look up every factor first; unknown n-grams fall back to 0
    factors = [probs.get(gram, 0) for gram in ngrams]

    product = 1
    for factor in factors:
        product = product * factor
    return product

In [9]:
s1 = ["ain't", "no", "warm"]
# add start and end token
s1_pad = [*pad_both_ends(s1, n=2)]
# unigrams and bigrams are both taken from the padded sentence
s1_unigrams, s1_bigrams = (list(ngrams(s1_pad, order)) for order in (1, 2))
print(s1_unigrams, s1_bigrams, sep="\n")


# sentence score under the unigram model, then under the bigram model
print(
    get_prob(s1_unigrams, unigram_probs),
    get_prob(s1_bigrams, bigram_probs),
    sep="\n",
)

[('<s>',), ("ain't",), ('no',), ('warm',), ('</s>',)]
[('<s>', "ain't"), ("ain't", 'no'), ('no', 'warm'), ('warm', '</s>')]
5.925925925925927e-06
0.0


In [10]:
s2 = ["she's", "not", "gone"]
# add start and end token
s2_pad = [*pad_both_ends(s2, n=2)]
# unigrams and bigrams are both taken from the padded sentence
s2_unigrams, s2_bigrams = (list(ngrams(s2_pad, order)) for order in (1, 2))
print(s2_unigrams, s2_bigrams, sep="\n")

# sentence score under the unigram model, then under the bigram model
print(
    get_prob(s2_unigrams, unigram_probs),
    get_prob(s2_bigrams, bigram_probs),
    sep="\n",
)

[('<s>',), ("she's",), ('not',), ('gone',), ('</s>',)]
[('<s>', "she's"), ("she's", 'not'), ('not', 'gone'), ('gone', '</s>')]
8.88888888888889e-06
0.0


In [11]:
s3 = ["ain't", "no", "sunshine"]
# add start and end token
s3_pad = [*pad_both_ends(s3, n=2)]
s3_bigrams = [*ngrams(s3_pad, 2)]
print(s3_bigrams)

# sentence score under the bigram model
s3_bigram_prob = get_prob(s3_bigrams, bigram_probs)
print(s3_bigram_prob)

[('<s>', "ain't"), ("ain't", 'no'), ('no', 'sunshine'), ('sunshine', '</s>')]
0.3333333333333333
