In [0]:
import io, sys, math, re
from collections import defaultdict
import numpy as np

In [0]:
# dataloader

def load_data(filename):
    fin = io.open(filename, 'r', encoding='utf-8')
    data = []
    vocab = defaultdict(lambda:0)
    for line in fin:
        sentence = line.split()
        data.append(sentence)
        for word in sentence:
            vocab[word] += 1
    return data, vocab

In [0]:
def remove_rare_words(data, vocab, mincount=10):
    ## FILL CODE
    # replace words in data that are not in the vocab 
    # or have a count that is below mincount
    return [[w if vocab[w] >= mincount else '<unk>' for w in l_w] for l_w in data]

In [0]:
# LOAD DATA

train_data, vocab = load_data("train.txt")
## FILL CODE 
# If you have a Out of Vocabulary error (OOV) 
# call the function "remove_rare_words" to replace 
# rare words with <unk> in the dataset
train_data = remove_rare_words(train_data, vocab)

print("load validation set")
valid_data, _ = load_data("valid.txt")
## FILL CODE 
# If you have a Out of Vocabulary error (OOV) 
# call the function "remove_rare_words" to replace 
# OOV with <unk> in the dataset
valid_data = remove_rare_words(valid_data, vocab)

load validation set


In [0]:
def build_ngram(data, n):
    total_number_words = 0
    counts = defaultdict(lambda: defaultdict(lambda: 0.0))
    
    for sentence in data:
        for i in range(len(sentence)):
            temp = sentence[i:n+i]
            for j in range(len(temp)):
                counts[tuple(temp[:j])][temp[j]] += 1

    prob  = defaultdict(lambda: defaultdict(lambda: 0.0))
    for p in counts:
        s = sum(counts[p].values())
        for w in counts[p]:
            prob[p][w] = 1.0 * counts[p][w] / s

    return prob

In [0]:
# RUN TO BUILD NGRAM MODEL

n = 2
print("build ngram model with n = ", n)
model = build_ngram(train_data, n)

('build ngram model with n = ', 2)


In [0]:
def get_prob(model, context, w):
    ## FILL CODE
    # code a recursive function over 
    # smaller and smaller context
    # to compute the backoff model
    # Bonus: You can also code an interpolation model this way
    return model[tuple(context)][w] if model[tuple(context)][w] != 0 else 0.4 * get_prob(model, context[1:], w)

def perplexity(model, data, n):
    score = count = 0
    for l_w in data:
        context = l_w[:n-1]
        for w in l_w[n-1:]:
            score += np.log(get_prob(model, context, w))
            if context:
                context.pop(0)
                context.append(w)
        count += len(l_w)
    return - score/count

In [0]:
# COMPUTE PERPLEXITY ON VALIDATION SET

print("The perplexity is", perplexity(model, valid_data, n))

('The perplexity is', 2.867815629914644)


In [0]:
def get_proba_distrib(model, context):
    ## FILL CODE
    # code a recursive function over context
    # to find the longest available ngram
    probs = model[tuple(context)]
    return probs  if len(probs) > 0 else get_proba_distrib(model, context[1:])

def generate(model):
    sentence = ["<s>"]
    ## FILL CODE
    # generate a sentence. A sentence starts with a <s> and ends with a </s>
    # Possiblly a use function is:
    #   np.random.choice(x, 1, p = y)
    # where x is a list of things to sample from
    # and y is a list of probability (of the same length as x)
    while True :
        word_dict = get_proba_distrib(model, sentence)
        p = np.random.choice(list(word_dict.keys()), 1, p = list(word_dict.values()))[0]
        sentence.append(str(p))
        if p == "</s>": break
    return sentence

In [0]:
# GENERATE A SENTENCE FROM THE MODEL

print("Generated sentence: ",generate(model))

('Generated sentence: ', ['<s>', 'she', "didn't", 'know', 'that', 'mary', 'out', 'of', 'other', '<unk>', 'even', 'so', 'that', 'they', 'were', '<unk>', 'i', 'asked', 'the', 'park', 'is', 'a', 'little', '<unk>', 'of', '<unk>', '</s>'])
