In [2]:
from nltk.corpus import brown
from nltk.tokenize import word_tokenize

In [13]:
## Run the following code only once to download the Brown corpus and the Punkt tokenizer to the local system
# import nltk
# nltk.download('brown')
# nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/amold/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [6]:
# Read the Brown corpus as one flat sequence of word tokens
corpus = brown.words()

# Fold every token to lower case; the distinct folded tokens form the vocabulary
lower_case_corpus = [token.lower() for token in corpus]
vocab = set(lower_case_corpus)

# Show a small sample of each (set order is arbitrary, so the vocab sample varies between runs)
print('CORPUS EXAMPLE: {}\n\n'.format(lower_case_corpus[:30]))
print('VOCAB EXAMPLE: {}'.format(list(vocab)[:10]))

CORPUS EXAMPLE: ['the', 'fulton', 'county', 'grand', 'jury', 'said', 'friday', 'an', 'investigation', 'of', "atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.', 'the', 'jury', 'further', 'said', 'in']


VOCAB EXAMPLE: ['spice-laden', 'titer', 'preach', 'potowomut', 'tartary', 'incompleteness', 'flute', 'bit-like', 'carloading', 'notre']


In [7]:
# Corpus size in tokens, and the number of distinct lower-cased tokens
print('Total words in Corpus: {}'.format(len(lower_case_corpus)))
print('Vocab of the Corpus: {}'.format(len(vocab)))

Total words in Corpus: 1161192
Vocab of the Corpus: 49815


In [8]:
bigram_counts = {}
trigram_counts = {}

# Slide a three-token window over the corpus. The window start stops two
# tokens before the end, so the very last bigram (which has no following
# word) is not counted; each bigram count therefore equals the sum of the
# counts of the trigrams that start with it.
for start in range(len(lower_case_corpus) - 2):
    first = lower_case_corpus[start]
    second = lower_case_corpus[start + 1]
    third = lower_case_corpus[start + 2]

    # Tally the leading bigram of the window
    pair = (first, second)
    bigram_counts[pair] = bigram_counts.get(pair, 0) + 1

    # Tally the full window as a trigram
    triple = (first, second, third)
    trigram_counts[triple] = trigram_counts.get(triple, 0) + 1

print("Example, count for bigram ('the', 'king') is: {}".format(bigram_counts[('the', 'king')]))

Example, count for bigram ('the', 'king') is: 51


In [14]:
# Takes a sentence as input and suggests possible words that come after it
def suggest_next_word(input_, bigram_counts, trigram_counts, vocab, tokenizer=None, top_n=3):
    """Suggest the most probable next words for a sentence using a trigram model.

    The last two tokens of the lower-cased input form the context (w1, w2).
    Every candidate ``w`` in ``vocab`` is scored with the maximum-likelihood
    estimate ``count(w1, w2, w) / count(w1, w2)``.

    Args:
        input_: The sentence (string) to continue.
        bigram_counts: Mapping from ``(w1, w2)`` tuples to their counts.
        trigram_counts: Mapping from ``(w1, w2, w3)`` tuples to their counts.
        vocab: Iterable of candidate next words.
        tokenizer: Optional callable that turns a string into a list of tokens.
            Defaults to ``word_tokenize``. Pass a custom one when the input
            should be tokenized differently from the default.
        top_n: Maximum number of suggestions to return (default 3).

    Returns:
        A list of at most ``top_n`` ``(word, probability)`` tuples sorted by
        descending probability. The list is empty when the input has fewer
        than two tokens or when the context bigram has a zero/missing count,
        because no probability can be estimated in those cases.
    """
    if tokenizer is None:
        tokenizer = word_tokenize

    # Consider the last bigram of the sentence as the conditioning context
    tokenized_input = tokenizer(input_.lower())
    if len(tokenized_input) < 2:
        # Not enough context to form a bigram
        return []
    context = tuple(tokenized_input[-2:])

    # The context count is the same for every candidate, so look it up once.
    context_count = bigram_counts.get(context, 0)
    if context_count == 0:
        # Unseen context: avoid dividing by zero, there is nothing to suggest
        return []

    # Conditional probability of each vocab word following the context
    vocab_probabilities = {
        vocab_word: trigram_counts.get(context + (vocab_word,), 0) / context_count
        for vocab_word in vocab
    }

    # Sort by probability in descending order and keep the top candidates
    ranked = sorted(vocab_probabilities.items(), key=lambda item: item[1], reverse=True)
    return ranked[:top_n]

In [15]:
# Next-word candidates for the prompt; only its last two tokens are used as context
suggest_next_word('I am the king', bigram_counts, trigram_counts, vocab)

[('james', 0.17647058823529413),
 ('of', 0.1568627450980392),
 ('arthur', 0.11764705882352941)]

In [28]:
# Takes a starting sentence as input and keeps appending the top-probability word until a stopping condition is met
def generate_sequence(input_, bigram_counts, trigram_counts, vocab, min_prob=0.01, max_chars=100):
    """Greedily extend a sentence one word at a time with the n-gram model.

    At each step the highest-probability suggestion from
    ``suggest_next_word`` is appended to the sentence. Generation stops
    after appending a word when any of the following holds:

    * the appended word's probability is below ``min_prob``,
    * the appended word is ``'.'``,
    * the sentence is longer than ``max_chars`` characters.

    It also stops, without appending anything, when no suggestion is
    available.

    Args:
        input_: Starting sentence (string); it is lower-cased before use.
        bigram_counts: Mapping from bigram tuples to counts.
        trigram_counts: Mapping from trigram tuples to counts.
        vocab: Iterable of candidate words.
        min_prob: Probability threshold below which generation ends
            (default 0.01).
        max_chars: Sentence length limit in characters, not words
            (default 100).

    Returns:
        The lower-cased input followed by the generated words, separated by
        single spaces.
    """
    sentence = input_.lower()
    while True:
        # Fetch the most probable word based on the n-gram model
        next_word_suggestions = suggest_next_word(sentence, bigram_counts, trigram_counts, vocab)
        if not next_word_suggestions:
            # Nothing to append (e.g. empty vocab); stop instead of indexing an empty list
            break
        next_word, next_word_prob = next_word_suggestions[0]

        # Append the top-probability word; note this happens before the stop
        # checks, so the word that triggers a stop is still part of the output
        sentence = sentence + ' ' + next_word

        # Conditions that end word generation
        if next_word_prob < min_prob or next_word == '.' or len(sentence) > max_chars:
            break
    return sentence

In [29]:
# Greedily extend the prompt word by word until one of the generator's stop conditions is hit
generate_sequence('I am the king', bigram_counts, trigram_counts, vocab)

'i am the king james version has little effect on the other hand , the first time in the world .'