In [0]:
import io, sys, math, re
from collections import defaultdict
import numpy as np

In [0]:
# dataloader

def load_data(filename):
    fin = io.open(filename, 'r', encoding='utf-8')
    data = []
    vocab = defaultdict(lambda:0)
    for line in fin:
        sentence = line.split()
        data.append(sentence)
        for word in sentence:
            vocab[word] += 1
    return data, vocab

In [0]:
def remove_rare_words(data, vocab, mincount=10):
    ## FILL CODE
    # replace words in data that are not in the vocab 
    # or have a count that is below mincount
    return [[w if vocab[w] >= mincount else '<unk>' for w in l_w] for l_w in data]

In [0]:
# LOAD DATA

train_data, vocab = load_data("train2.txt")
## FILL CODE 
# If you have a Out of Vocabulary error (OOV) 
# call the function "remove_rare_words" to replace 
# rare words with <unk> in the dataset
train_data = remove_rare_words(train_data, vocab)

print("load validation set")
valid_data, _ = load_data("valid2.txt")
## FILL CODE 
# If you have a Out of Vocabulary error (OOV) 
# call the function "remove_rare_words" to replace 
# OOV with <unk> in the dataset
valid_data = remove_rare_words(valid_data, vocab)

load validation set


In [0]:
# Function to build a bigram model

def build_bigram(data):
    unigram_counts = defaultdict(lambda:0)
    bigram_counts  = defaultdict(lambda: defaultdict(lambda: 0.0))
    total_number_words = 0

    ## FILL CODE
    # Store the unigram and bigram counts as well as the total 
    # number of words in the dataset
    for l_w in data:
        p = l_w[0]
        unigram_counts[p] += 1
        total_number_words += len(l_w)
        for w in l_w[1:]:
            unigram_counts[w] += 1
            bigram_counts[p][w] += 1
            p = w
            
    unigram_prob = defaultdict(lambda:0)
    bigram_prob = defaultdict(lambda: defaultdict(lambda: 0.0))
    
    for w in unigram_counts:
        unigram_prob[w] = 1.0 * unigram_counts[w] / total_number_words
        
    for p in bigram_counts:
        s = sum(bigram_counts[p].values())
        for w in bigram_counts[p]:
            bigram_prob[p][w] = 1.0 * bigram_counts[p][w] / s
    ## FILL CODE
    # Build unigram and bigram probabilities from counts
    

    return {'bigram': bigram_prob, 'unigram': unigram_prob}

In [0]:
# RUN TO BUILD BIGRAM MODEL

print("build bigram model")
model = build_bigram(train_data)

build bigram model


In [0]:
def get_prob(model, w1, w2):
    # Should return the probability of the bigram (w1w2) if it exists
    # Else it return the probility of unigram (w2) multiply by 0.4
    assert model["unigram"][w2] != 0, "Out of Vocabulary word!"
    bigram, unigram = model['bigram'], model['unigram']
    return bigram[w1][w2] if bigram[w1][w2] != 0 else 0.4 * unigram[w2]

def perplexity(model, data):
    # follow the formula in the slides
    # call the function get_prob to get P(w2 | w1)
    score = count = 0
    for l_w in data:
        p = l_w[0]
        for w in l_w[1:]:
            score += np.log(get_prob(model, p, w))
            p = w
        count += len(l_w)
    return - score/count

In [0]:
# COMPUTE PERPLEXITY ON VALIDATION SET

print("The perplexity is", perplexity(model, valid_data))

('The perplexity is', 3.5774067797000098)


In [0]:
def generate(model):
    sentence = ["<s>"]
    # generate a sentence. A sentence starts with a <s> and ends with a </s>
    # Possiblly a use function is:
    #   np.random.choice(x, 1, p = y)
    # where x is a list of things to sample from
    # and y is a list of probability (of the same length as x)
    p, bigram = sentence[0], model['bigram']
    while True :
        p = np.random.choice(list(bigram[p].keys()), 1, p = list(bigram[p].values()))[0]
        sentence.append(p)
        if p == "</s>": break
    return sentence

In [0]:
# GENERATE A SENTENCE FROM THE MODEL

print("Generated sentence: ",generate(model))

('Generated sentence: ', ['<s>', u'the', u'train', u'.', u'</s>'])
