In [5]:
# Run the repository's download script through the shell.
# NOTE(review): presumably this fetches the data/*.txt files that are read
# further down -- confirm in download_data.sh.
!./download_data.sh

In [6]:
import random
from math import log, exp, pow
from collections import Counter

In [7]:
def read_data(file_name, encoding="utf-8"):
    """Read a corpus file and return it as one long list of tokens.

    The whole file is read into memory and split on any run of whitespace
    (spaces, tabs, newlines), so line boundaries are not preserved.

    Args:
        file_name: Path of the text file to read.
        encoding: Text encoding used to decode the file. It is passed
            explicitly so the result does not depend on the platform's
            default locale encoding.

    Returns:
        A list of whitespace-delimited tokens in file order. The list is
        empty when the file is empty or contains only whitespace.
    """
    with open(file_name, "r", encoding=encoding) as in_file:
        return in_file.read().split()

In [8]:
def ngrams(sequence, n=2):
    """Generate space-joined n-grams from an iterable of string tokens.

    Args:
        sequence: Any iterable of strings. Lists and tuples are used as-is;
            other iterables (generators, iterators, ...) are materialized
            into a list first so they can be windowed.
        n: Size of each n-gram.

    Returns:
        A list with one string per window of ``n`` consecutive tokens, in
        order, with the tokens joined by a single space. The list is empty
        when ``n`` is less than 1 or larger than the number of tokens.
    """
    if n < 1:
        return []
    # Slicing needs a real sequence; one-shot iterables are copied once here
    # instead of failing with a TypeError.
    tokens = sequence if isinstance(sequence, (list, tuple)) else list(sequence)
    # One small slice per window avoids building n shifted copies of the
    # whole corpus.
    return [' '.join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

## Probability of a word

When we don't have a context word, the probability of a word $w$ is the number of times it occurs normalized by the cumulative sum of the frequencies of all the words in our vocabulary $V$.

$$ P(w) = \frac{Count(w)}{\sum_{w_i \in V} Count(w_i)}$$

The probability of a word $w_i$ given the previous word $w_{i-1}$, $P(w_i | w_{i-1})$, is the number of times the word $w_i$ appears after the word $w_{i-1}$ normalized by the total number of times any word appears after word $w_{i-1}$. This is given by:

$$ P(w_i | w_{i - 1}) = \frac{Count(w_{i-1}, w_i)}{\sum_{w \in V} Count(w_{i-1}, w)}$$

The denominator is equivalent to the count of the word $w_{i - 1}$ because every time the word appears it is the first word of a bigram (the only exception is the very last word of the corpus). This means the probability can be calculated with:

$$ P(w_i | w_{i - 1}) = \frac{Count(w_{i-1}, w_i)}{Count(w_{i-1})} $$

## Smoothing

A problem with using counts is that our counts are sparse. There are many valid bigrams that we never see (and therefore have a count of $0$). This means their probability is $0$, and if any bigram in a sequence has a probability of $0$ then the probability of the whole sequence is $0$. We need a way to assign some probability to unseen bigrams.

Here we use Laplace (or add-one) smoothing. We add one to all counts so that bigrams with a count of $0$ no longer have a zero probability. Adding to the counts alone means that we no longer have a valid probability distribution: we added one extra observation for every word in the vocabulary, so we need to add $|V|$ (the size of our vocabulary) extra observations to our denominator. This means probabilities are calculated like so:

$$ P(w_i | w_{i - 1}) = \frac{Count(w_{i-1}, w_i) + 1}{Count(w_{i-1}) + |V|} $$

## Probability of a Sentence

Probabilities of sequences are calculated with the chain rule of probability

$$ P(W) = P(w_1, w_2, ... , w_n) = \prod_{k=1}^{n} P(w_k | w_{1}, w_{2}, ..., w_{k-1})$$

Using a bigram language model we approximate the probability of a sequence with

$$ P(W) = P(w_1, w_2, ... , w_n) \approx \prod_{k=1}^{n} P(w_k | w_{k-1})$$

This calculation is done in log space to prevent underflows.

$$ \log P(W) = \log P(w_1, w_2, ... , w_n) \approx \sum_{k=1}^{n} \log P(w_k | w_{k-1}) \\
    P(W) = e^{\log P(W)}$$
    
## Perplexity

Perplexity is the standard evaluation metric of language models. Perplexity is defined as the inverse probability of the test set (normalized by the number of words in the test set $N$).

$$ Perplexity(W) = P(w_1, w_2, ... , w_N)^{\frac{-1}{N}}$$

Again, to avoid underflow this is calculated in log space

$$ \log Perplexity(W) = \frac{-1}{N} * \log P(W) \\
   Perplexity(W) = e^{\log Perplexity(W)}$$

In [9]:
class LanguageModel:
    """A bigram language model with Laplace (add-one) smoothing.

    Attributes:
        uni: Counter mapping each training token to its frequency.
        bi: Counter mapping each space-joined bigram ("w1 w2") to its
            frequency.
        vocab_size: Number of distinct training tokens (|V|).
        uni_sum: Total number of training tokens.
    """

    def __init__(self, train_corpus):
        """Count the unigrams and bigrams of the training corpus.

        Args:
            train_corpus: Iterable of string tokens, treated as one long
                sequence.
        """
        # A one-shot iterable would be exhausted by the unigram count and
        # leave nothing for the bigram count, so materialize it first.
        if not isinstance(train_corpus, (list, tuple)):
            train_corpus = list(train_corpus)
        self.uni = Counter(train_corpus)
        self.bi = Counter(ngrams(train_corpus))
        self.vocab_size = len(self.uni)
        self.uni_sum = sum(self.uni.values())

    def generate_probability(self, word, context=None):
        """Return the Laplace-smoothed probability of ``word``.

        Args:
            word: The token whose probability is wanted.
            context: The preceding token, or None for the unigram
                probability.

        Returns:
            ``(Count(word) + 1) / (N + |V|)`` when ``context`` is None,
            otherwise ``(Count(context, word) + 1) / (Count(context) + |V|)``.
            Unseen words, bigrams and contexts count as 0.

        Raises:
            ZeroDivisionError: If the model was trained on an empty corpus.
        """
        if context is None:
            return (self.uni[word] + 1) / (self.uni_sum + self.vocab_size)
        # Bigrams are keyed by their space-joined tokens, matching `ngrams`.
        return (self.bi[context + ' ' + word] + 1) / (self.uni[context] + self.vocab_size)

    def _generate_distribution(self, context):
        """Return ``(word, probability)`` pairs for every vocabulary word given the context."""
        return [(word, self.generate_probability(word, context)) for word in self.uni]

    def generate_word(self, context=None):
        """Return the most probable next word given the context.

        Ties are resolved in favour of the word that appears first in the
        vocabulary's iteration order.

        Raises:
            ValueError: If the vocabulary is empty.
        """
        return max(self.uni, key=lambda word: self.generate_probability(word, context))

    def sample_word(self, context=None):
        """Sample the next word from the distribution given the context.

        Raises:
            ValueError: If the vocabulary is empty.
        """
        dist = self._generate_distribution(context)
        if not dist:
            raise ValueError("cannot sample a word from an empty vocabulary")
        # The smoothed probabilities over the vocabulary do not always sum to
        # exactly 1 (the final training token starts one bigram fewer than its
        # unigram count, and floats round), so scale the draw by the actual
        # total instead of handing the missing mass to the last word.
        total = sum(prob for _, prob in dist)
        remaining = random.random() * total
        for word, prob in dist:
            remaining -= prob
            if remaining <= 0:
                return word
        # Only reachable through floating point rounding.
        return dist[-1][0]

    def _log_probability_of_sequence(self, sequence):
        """Return the log probability of a sequence under the bigram assumption.

        The first word is scored with its unigram probability and every later
        word with its probability given the previous word. Summing logs
        instead of multiplying probabilities prevents underflow.
        """
        sequence_score = 0
        context = None
        for word in sequence:
            sequence_score += log(self.generate_probability(word, context))
            context = word
        return sequence_score

    def probability_of_sequence(self, sequence):
        """Return the probability of a sequence (the exponentiated log probability)."""
        return exp(self._log_probability_of_sequence(sequence))

    def perplexity(self, test_sequence):
        """Return the perplexity of a sequence.

        Computed in log space as ``exp(-1/N * log P(W))`` to prevent
        underflow, where ``N`` is ``len(test_sequence)``.

        Raises:
            ZeroDivisionError: If ``test_sequence`` is empty.
        """
        test_set_log_prob = self._log_probability_of_sequence(test_sequence)
        return exp(-1/len(test_sequence) * test_set_log_prob)

In [10]:
# Load each split as one flat list of whitespace-separated tokens.
# The paths are relative to the notebook's working directory.
train_data = read_data("data/train.txt")
dev_data = read_data("data/dev.txt")
test_data = read_data("data/test.txt")

In [11]:
# Build the bigram model; all counts come from the training split only.
lm = LanguageModel(train_data)

#### Probability of a word

In [12]:
# Each entry pairs an output template with the (word[, context]) arguments
# passed to `generate_probability`; a missing context means a unigram lookup.
probability_queries = [
    ('The probability of the word "the" is: {}', ("the",)),
    ('The probability of the word "tucan" is: {}', ("tucan",)),
    ('The probability of the word "<unk>" given "the" is: {}', ("<unk>", "the")),
    ('The probability of the word "said" given "the" is: {}', ("said", "the")),
    ('The probability of the word "said" given "he" is: {}', ("said", "he")),
]
for message, query in probability_queries:
    print(message.format(lm.generate_probability(*query)))

The probability of the word "the" is: 0.056568098761030396
The probability of the word "tucan" is: 1.1141812995810679e-06
The probability of the word "<unk>" given "the" is: 0.06365087462357452
The probability of the word "said" given "the" is: 1.645575869275453e-05
The probability of the word "said" given "he" is: 0.039909030885481624


#### Probability of a sequence

Pulling a random sentence from the dev set we calculate the probability of that sequence. The language model should assign the actual sentence a higher probability than a shuffled version of the sentence.

In [13]:
# Take a random window of the dev set: 10-21 tokens long (randint is
# inclusive on both ends), starting at a random offset between 50 and 65.
sentence_length = random.randint(10, 21)
start_index = random.randint(50, 65)
# Slicing copies the tokens, so the in-place shuffle below leaves dev_data intact.
sentence = dev_data[start_index:start_index + sentence_length]
sentence_string = ' '.join(sentence)

ordered_probability = lm.probability_of_sequence(sentence)
print('Probability of "{}": {}'.format(sentence_string, ordered_probability))

# From here on `sentence` holds the shuffled tokens.
random.shuffle(sentence)
shuffled_string = ' '.join(sentence)
shuffled_probability = lm.probability_of_sequence(sentence)
print('Probability of "{}": {}'.format(shuffled_string, shuffled_probability))
# NOTE(review): no random seed is set in this cell, so the window and the
# shuffle differ between runs; this check is expected to hold but is not
# guaranteed for every draw.
assert ordered_probability > shuffled_probability

Probability of "issues and the new syndicated reality show hard copy records viewers ' opinions for possible airing on the next day 's": 1.5723075859672447e-70
Probability of "viewers show reality records possible airing for on and hard issues day the syndicated 's ' opinions copy next the new": 2.801304066479877e-82


In [14]:
# Perplexity of the whole test split, scored as one long token sequence.
ppl = lm.perplexity(test_data)
print("The test set perplexity is {}".format(ppl))

The test set perplexity is 960.9778236067062


The preprocessed PTB has several words prepended to the start of train.txt so that the
vocabulary of the train and test sets match. This is the only time those words appear, so using
one as the context should generate the next one.

In [15]:
# Most probable next word after "aer"; the bare expression is the cell's displayed output.
lm.generate_word("aer")

'banknote'

Generating the next word with
$$\DeclareMathOperator*{\argmax}{argmax} \argmax_{w \in V} P(w | w_{i-1})$$
looks bad with this dataset because the most probable word is "the", and the most probable word given "the" is "&lt;unk&gt;". The most probable word given "&lt;unk&gt;" is "&lt;unk&gt;", which means generated sentences all end up as "the &lt;unk&gt; &lt;unk&gt; &lt;unk&gt; ..."

To fix this we could never generate "&lt;unk&gt;" and take the second most probable word whenever the most probable one is "&lt;unk&gt;", or we can sample from the distribution of words.

In [16]:
# Greedy choice: always the single most probable successor of "the".
most_probable_word = lm.generate_word("the")
print('The most probable word given "the" is: {}'.format(most_probable_word))
# Random choice: drawn from the smoothed next-word distribution.
sampled_word = lm.sample_word("the")
print('Sampling from the distribution given "the" is: {}'.format(sampled_word))

The most probable word given "the" is: <unk>
Sampling from the distribution given "the" is: very
