In [1]:
import numpy as np
from nltk.corpus import brown

In [2]:
# Materialize the Brown corpus tokens as a plain list so they can be
# indexed by position (NGrams scans this sequence with words[i]).
words = list(brown.words())

In [3]:
# Number of tokens loaded from the corpus.
len(words)

1161192

In [4]:
# Seed text for generation; NGrams splits it on whitespace to obtain the
# initial context tokens.
start_sent = "I am planning to"

In [5]:
class NGrams:
    """Back-off n-gram next-word suggester over a fixed token corpus.

    The instance keeps a growing list of context tokens (seeded from
    ``sentence``) and proposes continuations by scanning ``words`` for
    occurrences of the most recent tokens.

    Parameters
    ----------
    words : sequence of str
        Corpus tokens, in order. Must support ``len()`` and integer indexing.
    sentence : str
        Seed text; it is split on whitespace to form the initial tokens.
    """

    def __init__(self, words, sentence):
        self.words = words
        self.sentence = sentence
        self.tokens = sentence.split()
        # Result of the most recent model call; defined up front so reading
        # it before any prediction does not raise AttributeError.
        self.next_words = []

    def get_tokens(self):
        """Return the current list of context tokens (the live list, not a copy)."""
        return self.tokens

    def add_tokens(self, value):
        """Append ``value`` to the context tokens and return the updated list."""
        self.tokens.append(value)
        return self.tokens

    def _followers(self, context_size):
        """Collect every corpus word that follows the last ``context_size`` tokens.

        Returns an empty list when fewer than ``context_size`` tokens are
        available, so callers can back off to a shorter context instead of
        failing on an out-of-range index.
        """
        if context_size < 1 or len(self.tokens) < context_size:
            return []
        context = self.tokens[-context_size:]
        corpus = self.words
        head = context[0]
        followers = []
        # Stop early enough that corpus[start + context_size] always exists.
        for start in range(len(corpus) - context_size):
            # Check the first token on its own: it rejects most positions
            # without touching the rest of the context.
            if corpus[start] != head:
                continue
            if all(corpus[start + offset] == context[offset]
                   for offset in range(1, context_size)):
                followers.append(corpus[start + context_size])
        return followers

    def unigram_model(self):
        """Draw three corpus words at random, ignoring the context.

        Returns the value produced by ``np.random.choice`` (not a list of
        ``(word, probability)`` pairs like the other models' ranked output).
        """
        # Use the corpus given to the constructor rather than a global name.
        self.next_words = np.random.choice(self.words, size=3)
        return self.next_words

    def bigram_model(self):
        """Return all corpus words that follow the last context token."""
        self.next_words = self._followers(1)
        return self.next_words

    def trigram_model(self):
        """Return all corpus words that follow the last two context tokens."""
        self.next_words = self._followers(2)
        return self.next_words

    def fourgram_model(self):
        """Return all corpus words that follow the last three context tokens."""
        self.next_words = self._followers(3)
        return self.next_words

    def get_top_3_next_words(self, next_words):
        """Rank candidate words by relative frequency and keep the top three.

        Parameters
        ----------
        next_words : sequence of str
            Candidate continuations, with repeats.

        Returns
        -------
        list of (word, probability)
            At most three pairs, with the probability rounded to two decimals.
            Pairs are ordered by probability, then by word, both descending.
        """
        counts = {}
        for word in next_words:
            counts[word] = counts.get(word, 0) + 1

        total = len(next_words)
        scored = [(word, np.round(count / total, 2))
                  for word, count in counts.items()]
        # Sorting happens on the rounded probability, so words whose rounded
        # values tie are ordered by the word itself (descending).
        return sorted(scored, key=lambda pair: (pair[1], pair[0]), reverse=True)[:3]

    def model_selection(self):
        """Suggest next words using the longest context that has any match.

        Tries the fourgram, trigram and bigram models in that order, prints
        the name of the first one that yields candidates and returns its top
        three ``(word, probability)`` pairs. If none match, prints
        ``unigram-model`` and returns the random unigram draw.
        """
        backoff_chain = (
            ("fourgram-model", self.fourgram_model),
            ("trigram-model", self.trigram_model),
            ("bigram-model", self.bigram_model),
        )
        for label, model in backoff_chain:
            # Each model is a full corpus scan; run it once and reuse the result.
            candidates = model()
            if len(candidates) > 0:
                top_words = self.get_top_3_next_words(candidates)
                print(label)
                return top_words

        top_words = self.unigram_model()
        print("unigram-model")
        return top_words
        

In [6]:
# Build the predictor over the loaded corpus, seeded with the start sentence.
model = NGrams(words=words, sentence=start_sent)

In [7]:
# Interactive generation: for up to 30 rounds, show the model's suggestions,
# then read the next word from the user and append it to the context.
for i in range(30):
    # model_selection() prints which n-gram order it used and returns its picks.
    values = model.model_selection()
    print(values)
    # Blocks until the user types a line; the text is appended verbatim as
    # the next token (it is not checked against the suggestions).
    value = input()
    model.add_tokens(value)

trigram-model
[('use', 0.11), ('tour', 0.11), ('shelter', 0.11)]
use
fourgram-model
[('the', 1.0)]
the
fourgram-model
[('U.S.', 0.1), ('Standard', 0.1), ('word', 0.05)]
Standard
fourgram-model
[('Deduction', 1.0)]
Deduction
fourgram-model
[('or', 1.0)]
or
fourgram-model
[('Tax', 0.67), ('the', 0.33)]
the
fourgram-model
[('Tax', 1.0)]
Tax
fourgram-model
[('Table', 1.0)]
Table
fourgram-model
[(',', 1.0)]
,
fourgram-model
[('and', 1.0)]
and
fourgram-model
[('later', 1.0)]
later
fourgram-model
[(',', 0.5), ('go', 0.12), ('found', 0.12)]
go
fourgram-model
[('hungry', 1.0)]
hungry
fourgram-model
[('?', 1.0)]
?
fourgram-model
[('?', 1.0)]
?
fourgram-model
[('The', 1.0)]
The
fourgram-model
[('man', 0.04), ('answer', 0.04), ('voice', 0.02)]
voice
fourgram-model
[('sank', 0.33), ('issued', 0.33), ('had', 0.33)]
had
fourgram-model
[('music', 1.0)]
music
fourgram-model
[('in', 1.0)]
in
fourgram-model
[('it', 1.0)]
it
fourgram-model
[('.', 1.0)]
.
fourgram-model
[('He', 0.27), ('Time', 0.07), ('The

In [8]:
# Show the accumulated token list: the seed tokens plus every word entered.
print(model.get_tokens())

['I', 'am', 'planning', 'to', 'use', 'the', 'Standard', 'Deduction', 'or', 'the', 'Tax', 'Table', ',', 'and', 'later', 'go', 'hungry', '?', '?', 'The', 'voice', 'had', 'music', 'in', 'it', '.', 'The', 'sequence', 'may', 'involve', 'a', 'sharp', 'contrast', ':']


In [9]:
# Render the tokens as one space-separated string (punctuation tokens are
# therefore surrounded by spaces as well).
print(" ".join(model.get_tokens()))

I am planning to use the Standard Deduction or the Tax Table , and later go hungry ? ? The voice had music in it . The sequence may involve a sharp contrast :
