In [5]:
import nltk
from nltk.corpus import brown
from nltk.tokenize import RegexpTokenizer

# Fetch the NLTK data packages (NLTK reports "already up-to-date" when cached)
nltk.download('brown')
nltk.download('punkt')

# Build the token stream: join the Brown words into one string, keep only
# runs of word characters (which discards punctuation), then lowercase.
tokenizer = RegexpTokenizer(r'\w+')
text = brown.words()
tokens = [word.lower() for word in tokenizer.tokenize(" ".join(text))]


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
def generate_ngrams(tokens, n):
    """Return every contiguous window of ``n`` tokens, in order.

    Args:
        tokens: Sequence of tokens (must support ``len`` and slicing).
        n: Window size; must be at least 1.

    Returns:
        A list holding ``len(tokens) - n + 1`` slices of ``tokens``, each of
        length ``n``. The list is empty when there are fewer than ``n`` tokens.

    Raises:
        ValueError: If ``n`` is less than 1.
    """
    if n < 1:
        # n == 0 would yield len(tokens) + 1 empty windows, and a negative n
        # would yield windows of inconsistent length; neither is an n-gram.
        raise ValueError(f"n must be a positive integer, got {n!r}")
    return [tokens[i:i + n] for i in range(len(tokens) - n + 1)]


In [7]:
# Slide a 3-token window over the corpus and preview the first five windows
trigrams = generate_ngrams(tokens, n=3)
print(trigrams[:5])


[['the', 'fulton', 'county'], ['fulton', 'county', 'grand'], ['county', 'grand', 'jury'], ['grand', 'jury', 'said'], ['jury', 'said', 'friday']]


In [8]:
from collections import Counter

def count_ngram_frequencies(ngrams):
    """Count how often each n-gram occurs.

    Each n-gram (an iterable of strings) is turned into a single
    space-separated string, which becomes the key in the returned Counter.
    """
    counts = Counter()
    for gram in ngrams:
        counts[" ".join(gram)] += 1
    return counts


**Example**

In [9]:
# Tally every trigram and list the five most frequent ones
trigram_freqs = count_ngram_frequencies(ngrams=trigrams)
print(trigram_freqs.most_common(5))


[('one of the', 404), ('the united states', 340), ('as well as', 238), ('some of the', 179), ('out of the', 176)]


In [10]:
def predict_next_word(tokens, input_seq, n=3, k=5):
    """Suggest the most frequent continuations of a phrase using an n-gram model.

    The last ``n - 1`` words of ``input_seq`` form the context; every position
    in ``tokens`` where that context occurs contributes the following token
    as a candidate.

    Args:
        tokens: Sequence of corpus tokens to search.
        input_seq: Phrase typed by the user. It is split into ``\\w+`` runs and
            lowercased, the same normalisation applied to the corpus tokens,
            so case and punctuation in the phrase do not prevent a match.
        n: Order of the model (context length is ``n - 1``); must be >= 1.
            With ``n=1`` the context is empty and overall token frequencies
            are returned.
        k: Maximum number of predictions to return.

    Returns:
        A list of ``(word, count)`` pairs, most frequent first. Empty when the
        phrase has fewer than ``n - 1`` words or the context never occurs.

    Raises:
        ValueError: If ``n`` is less than 1.
    """
    import re

    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    context_len = n - 1
    words = [w.lower() for w in re.findall(r'\w+', input_seq)]
    if len(words) < context_len:
        # Not enough words to fill the context, so nothing can match.
        return []
    # Slice from an explicit start index: words[-0:] would return the whole
    # list instead of an empty context when n == 1.
    prefix = words[len(words) - context_len:]

    # Scan the corpus windows directly rather than materialising every n-gram.
    freq = Counter()
    for i in range(len(tokens) - n + 1):
        if list(tokens[i:i + context_len]) == prefix:
            freq[tokens[i + context_len]] += 1
    return freq.most_common(k)


In [11]:
# Expected top prediction: ('states', 340)
predict_next_word(tokens, input_seq="the united", n=3, k=5)


[('states', 340), ('nations', 44), ('irish', 1), ('arab', 1), ('steel', 1)]

In [13]:
# Interactive demo: keep asking for a phrase until the user types "exit".
while True:
    try:
        user_input = input("Enter a phrase: ")
    except (EOFError, KeyboardInterrupt):
        # stdin was closed or the prompt was interrupted: end the session
        # instead of surfacing a traceback.
        break
    # Accept the sentinel regardless of case or surrounding whitespace
    # ("Exit", "exit ") so a small typo does not keep the loop alive.
    if user_input.strip().lower() == "exit":
        break
    predictions = predict_next_word(tokens, user_input, n=3, k=5)
    print("Predictions:", predictions)

Enter a phrase: as well
Predictions: [('as', 238), ('for', 7), ('to', 4), ('the', 4), ('that', 3)]
Enter a phrase: one of
Predictions: [('the', 404), ('his', 42), ('those', 34), ('these', 33), ('them', 31)]
Enter a phrase: part of
Predictions: [('the', 144), ('a', 19), ('his', 17), ('it', 14), ('this', 11)]
Enter a phrase: the united
Predictions: [('states', 340), ('nations', 44), ('irish', 1), ('arab', 1), ('steel', 1)]
Enter a phrase: exit
