# Language Model Demo

Based on this demo: http://nlpforhackers.io/language-models/

### Import modules and data

In [None]:
import random
from nltk import bigrams, trigrams
from nltk.corpus import reuters, movie_reviews, shakespeare
from nltk.tokenize import sent_tokenize, word_tokenize
from collections import Counter, defaultdict

In [None]:
# Select the corpus to model: reuters, movie_reviews or shakespeare
corpus = reuters


if corpus == shakespeare:
    # This corpus is read through corpus.xml(), so collect the text fragments of
    # every file into one string and tokenize that string ourselves.
    file_texts = []
    for fileid in corpus.fileids():
        file_texts.append(''.join(corpus.xml(fileid).itertext()))
    shakespeare_text = ''.join(file_texts)
    words = word_tokenize(shakespeare_text)
    sents = [word_tokenize(sent) for sent in sent_tokenize(shakespeare_text)]
else:
    # For the other corpora, take the word and sentence lists from the reader.
    words = corpus.words()
    sents = corpus.sents()

# Normalise case so that e.g. "The" and "the" are counted as the same token
words = [token.lower() for token in words]
sents = [[token.lower() for token in sentence] for sentence in sents]

### Unigram language model

In this section, we will construct a language model based on unigrams (words).

In [None]:
# Exercise 1.
# "words" is a list containing all the words in the corpus (including multiples of the same word)
# Make a Counter from the list of words and call it "unigram_counts" (remember, this is easy to do!)
# Get the total number of words and assign it to "total_count"

##### YOUR CODE STARTS HERE #####



##### YOUR CODE ENDS HERE #####

print "Total number of words in corpus: ", total_count

# Print 10 most common words
print ""
for (word, count) in unigram_counts.most_common(n=10):
    print word, count

In [None]:
# Exercise 2.
# Compute the probability of each word (unigram) from the counts.
# Create a Counter called "unigram_probs" that holds the probability of each word.
# Hint: use the variables unigram_counts and total_count, that you just created.
# Hint: remember about integer division!

##### YOUR CODE STARTS HERE #####




##### YOUR CODE ENDS HERE #####

# Check the probabilities add up to 1
print "Probabilities sum to: ", sum(unigram_probs.values())

In [None]:
# Look up the unigram probability of the word "the"; swap in other words to compare.
print(unigram_probs['the'])

In [None]:
# Generate 100 words of language using unigram model

text = [] # will be a list of generated words

for _ in range(100):
    r = random.random() # get a random value in [0,1]
    
    # Find the word whose interval contains r
    accumulator = .0
    for word, freq in unigram_probs.iteritems():
        accumulator += freq
        if accumulator >= r:
            text.append(word)
            break

print ' '.join(text)

In [None]:
# Exercise 3. 
# Calculate the probability of the text we generated. Note the variable "text" is a list of words.
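# Hint: we want to take each of the words in the list "text",
# look up its probability in unigram_probs, and multiply all the probabilities together.
# Note: the product of 100 small probabilities can be too small for a float to represent;
# if the result prints as 0.0, add up the logarithms of the probabilities instead.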

##### YOUR CODE STARTS HERE #####




##### YOUR CODE ENDS HERE #####

### Bigram language model

In [None]:
# Tally the occurrences of every bigram in the corpus.

# Nested mapping: bigram_counts[w1][w2] is the number of times w2 directly follows w1.
# Pairs that were never seen read as 0.
bigram_counts = defaultdict(lambda: defaultdict(int))

for sentence in sents:
    # Padding on both sides adds None as the token before the first word and
    # after the last word, so sentence boundaries are counted as well.
    padded_pairs = bigrams(sentence, pad_left=True, pad_right=True)
    for first, second in padded_pairs:
        bigram_counts[first][second] += 1

In [None]:
# Show the number of times "the" directly follows "of". Try other words after "of" too.
print(bigram_counts['of']['the'])

In [None]:
# Transform the bigram counts to bigram probabilities

# bigram_probs maps w1 to a Counter mapping w2 to P(w2 | w1).
# The default factory is Counter so that a context that never occurred behaves
# like the stored entries: unseen pairs read as 0 and .most_common() still works.
bigram_probs = defaultdict(Counter)
for w1, followers in bigram_counts.iteritems():
    # Float total avoids integer division. A dedicated name is used so that the
    # corpus-wide "total_count" from the unigram section is not overwritten.
    context_total = float(sum(followers.itervalues()))
    bigram_probs[w1] = Counter({w2: c / context_total for w2, c in followers.iteritems()})

In [None]:
# Show the probability that the word after 'of' is 'the'
print(bigram_probs['of']['the'])

In [None]:
# Print the top ten tokens most likely to follow 'fair', along with their probabilities
prob_dist = bigram_probs['fair']
for word,prob in prob_dist.most_common(10):
    print word,prob

In [None]:
# Generate text with bigram model

text = [None]
sentence_finished = False

# Generate words until a None is generated
while not sentence_finished:
    r = random.random()
    accumulator = .0
    latest_token = text[-1]
    prob_dist = bigram_probs[latest_token] # prob dist of what token comes next
        
    for word,p in prob_dist.iteritems():
        accumulator += p 
        if accumulator >= r:
            text.append(word)
            break

    if text[-1] == None:
        sentence_finished = True

print ' '.join([t for t in text if t])

In [None]:
# Probability of the text generated above under the bigram model:
# the product of the probabilities of all of its consecutive token pairs.

text_prob = 1.0

# Pair every token with its successor (the boundary Nones in "text" take part too)
for w1, w2 in zip(text, text[1:]):
    text_prob *= bigram_probs[w1][w2]

print(text_prob)

### Trigram language model

In [None]:
# Tally the occurrences of every trigram in the corpus.

# Nested mapping: trigram_counts[(w1, w2)][w3] is the number of times w3 directly
# follows the pair (w1, w2). Combinations that were never seen read as 0.
trigram_counts = defaultdict(lambda: defaultdict(int))

for sentence in sents:
    # Padding on both sides adds None tokens before the first word and after
    # the last word, so sentence boundaries are counted as well.
    padded_triples = trigrams(sentence, pad_left=True, pad_right=True)
    for first, second, third in padded_triples:
        context = (first, second)
        trigram_counts[context][third] += 1

In [None]:
# Show the number of times "not" directly follows "i am" (tokens are lowercased).
# Try some other trigrams.
print(trigram_counts[('i', 'am')]['not'])

In [None]:
# Transform the trigram counts to trigram probabilities

# trigram_probs maps (w1, w2) to a Counter mapping w3 to P(w3 | w1, w2).
# The default factory is Counter so that a context that never occurred behaves
# like the stored entries: unseen combinations read as 0 and .most_common() still works.
trigram_probs = defaultdict(Counter)
for w1_w2, followers in trigram_counts.iteritems():
    # Float total avoids integer division. A dedicated name is used so that the
    # corpus-wide "total_count" from the unigram section is not overwritten.
    context_total = float(sum(followers.itervalues()))
    trigram_probs[w1_w2] = Counter({w3: c / context_total for w3, c in followers.iteritems()})

In [None]:
# Show the probability that the word after 'i am' is 'not'. Try some other combinations.
print(trigram_probs[('i', 'am')]['not'])

In [None]:
# Print the top ten tokens most likely to follow 'i am', along with their probabilities
prob_dist = trigram_probs[('i', 'am')]
for word,prob in prob_dist.most_common(10):
    print word,prob

In [None]:
# Generate text with trigram model

text = [None, None]

sentence_finished = False

# Generate words until two consecutive Nones are generated
while not sentence_finished:
    r = random.random()
    accumulator = .0
    latest_bigram = tuple(text[-2:])
    prob_dist = trigram_probs[latest_bigram] # prob dist of what token comes next
    
    for word,p in prob_dist.iteritems():
        accumulator += p 
        if accumulator >= r:
            text.append(word)
            break

    if text[-2:] == [None, None]:
        sentence_finished = True

print ' '.join([t for t in text if t])

In [None]:
# Probability of the text generated above under the trigram model:
# the product of the probabilities of all of its consecutive token triples.

text_prob = 1.0

# Slide a window of three tokens over "text" (the boundary Nones take part too)
for w1, w2, w3 in zip(text, text[1:], text[2:]):
    text_prob *= trigram_probs[(w1, w2)][w3]

print(text_prob)
