In [1]:
!pip install nltk




In [None]:
# Advanced N-Gram Auto-Complete System

import nltk
import re
from collections import defaultdict, Counter

# Fetch the "punkt_tab" tokenizer data at import time (network side effect).
# NOTE(review): presumably this is the resource nltk.word_tokenize needs in
# preprocess() below -- confirm against the installed NLTK version.
nltk.download("punkt_tab")

# ----------------------------
# Text Preprocessing
# ----------------------------

def preprocess(text):
    """Normalize *text* and split it into tokens.

    The text is lowercased, every character that is not ``a``-``z`` or
    whitespace is deleted (so punctuation and digits vanish without leaving
    a gap), and the cleaned string is handed to ``nltk.word_tokenize``.

    Returns whatever ``nltk.word_tokenize`` produces for the cleaned string.
    """
    cleaned = re.sub(r"[^a-z\s]", "", text.lower())
    return nltk.word_tokenize(cleaned)


# ----------------------------
# Build N-gram Model
# ----------------------------

class NGramAutoComplete:
    """Next-word predictor that backs off from trigrams to bigrams to unigrams.

    Attributes:
        n: Highest n-gram order to build. 1 builds unigrams only, 2 adds
            bigrams, 3 or more adds trigrams (orders above 3 are not
            implemented and behave like 3).
        tokens: Token list obtained by running ``preprocess`` on the corpus.
        unigrams: ``Counter`` mapping word -> occurrence count.
        bigrams: ``defaultdict(Counter)`` mapping previous word -> counts of
            the word that followed it.
        trigrams: ``defaultdict(Counter)`` mapping (word, word) tuple ->
            counts of the word that followed that pair.
    """

    def __init__(self, corpus, n=3):
        """Tokenize *corpus* and build the count tables up to order *n*."""
        self.n = n
        self.tokens = preprocess(corpus)

        self.unigrams = Counter()
        self.bigrams = defaultdict(Counter)
        self.trigrams = defaultdict(Counter)

        self.build_model()

    def build_model(self):
        """Accumulate n-gram counts from ``self.tokens`` into the tables.

        Counts are added to whatever the tables already hold, so calling
        this twice on the same tokens doubles every count. Tables for
        orders above ``self.n`` are left untouched, which makes
        ``predict_next`` fall back to the lower-order tables.
        """
        tokens = self.tokens

        # Unigrams are always built: they are the final fallback level.
        self.unigrams.update(tokens)

        if self.n >= 2:
            for previous, following in zip(tokens, tokens[1:]):
                self.bigrams[previous][following] += 1

        if self.n >= 3:
            for first, second, following in zip(tokens, tokens[1:], tokens[2:]):
                self.trigrams[(first, second)][following] += 1

    # ----------------------------
    # Predict Next Word
    # ----------------------------

    def predict_next(self, text, top_k=5):
        """Suggest likely next words for *text*.

        The input goes through ``preprocess`` first. The last two words are
        looked up in the trigram table; if that context has no counts, the
        last word is looked up in the bigram table; if that fails too (or
        the input yields no words), overall unigram frequencies are used.

        Args:
            text: Raw text typed so far.
            top_k: Maximum number of suggestions to return.

        Returns:
            A ``(label, predictions)`` tuple as produced by ``get_top``,
            where label is ``"Trigram"``, ``"Bigram"`` or ``"Unigram"``.
        """
        words = preprocess(text)

        # .get() avoids inserting keys into the defaultdicts, and the
        # truthiness test skips contexts that exist but hold no counts.
        if len(words) >= 2:
            preds = self.trigrams.get((words[-2], words[-1]))
            if preds:
                return self.get_top(preds, top_k, "Trigram")

        if words:
            preds = self.bigrams.get(words[-1])
            if preds:
                return self.get_top(preds, top_k, "Bigram")

        return self.get_top(self.unigrams, top_k, "Unigram")

    # ----------------------------
    # Get Top Predictions
    # ----------------------------

    def get_top(self, counter, k, model):
        """Return the *k* most common entries of *counter* with probabilities.

        Args:
            counter: ``Counter`` of candidate word -> count.
            k: Maximum number of entries, passed to ``Counter.most_common``.
            model: Label returned unchanged as the first tuple element.

        Returns:
            ``(model, results)`` where results is a list of
            ``(word, probability)`` pairs. Each probability is the word's
            count divided by the sum of all counts in *counter* (not only
            the top *k*), rounded to 4 decimal places.
        """
        total = sum(counter.values())

        # Nothing to normalize by: report no predictions instead of dividing by zero.
        if not total:
            return model, []

        results = [
            (word, round(count / total, 4))
            for word, count in counter.most_common(k)
        ]
        return model, results


# ----------------------------
# Sample Training Corpus
# ----------------------------

# Small built-in demo corpus: six English sentences about NLP.
# It is the only text the model below is built from.
corpus = """
Natural language processing is a field of artificial intelligence.
It focuses on the interaction between computers and human language.
Language models are important in machine translation and speech recognition.
Auto complete systems help users type faster.
N gram models are widely used in NLP.
Deep learning models improve language understanding.
"""

# ----------------------------
# Train Model
# ----------------------------

# Counts are built inside the constructor; n is left at its default of 3.
model = NGramAutoComplete(corpus)


# ----------------------------
# Interactive Prediction
# ----------------------------

# Prompt repeatedly; typing "exit" in any letter case ends the session.
while True:
    text = input("\nEnter text (or 'exit'): ")
    if text.lower() == "exit":
        break

    level, predictions = model.predict_next(text)

    # A single print call emits the model line and the heading on two lines.
    print(f"\nUsing {level} Model", "Predictions:", sep="\n")

    # One suggestion per line together with its estimated probability.
    for word, prob in predictions:
        print(f"  {word}  (P={prob})")


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.



Enter text (or 'exit'): natural language

Using Trigram Model
Predictions:
  processing  (P=1.0)

Enter text (or 'exit'): natural language

Using Trigram Model
Predictions:
  processing  (P=1.0)

Enter text (or 'exit'): hello

Using Unigram Model
Predictions:
  language  (P=0.08)
  models  (P=0.06)
  and  (P=0.04)
  are  (P=0.04)
  in  (P=0.04)
