<a href="https://colab.research.google.com/github/Prafull009/NLP/blob/main/NLPAssign7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **N-Gram Auto-Completion: Implementation 1**

In [58]:
import nltk
import random
import re
from collections import defaultdict
from nltk.tokenize import word_tokenize
from nltk.util import ngrams, trigrams
from nltk.corpus import reuters, gutenberg

In [59]:
# Download the NLTK resources requested by this notebook: the 'punkt' and
# 'punkt_tab' packages and the 'reuters' and 'gutenberg' corpora.
# NOTE(review): 'reuters' is imported above but not used by any cell visible
# here -- confirm whether it is still needed.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('reuters')
nltk.download('gutenberg')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [63]:
# Build the training corpus in one step: take the raw Gutenberg text,
# lowercase it, and delete every character that is not a letter a-z or
# whitespace (so punctuation and digits are removed, not replaced by spaces).
corpus = re.sub(r"[^a-z\s]", "", gutenberg.raw().lower())

In [64]:
# Split the cleaned corpus string into a flat list of word tokens.
tokens = word_tokenize(corpus)

In [65]:
# Order of the model. Every n-gram is split into a context (its first n-1
# tokens) and the token that follows that context.
n = 2  # Change to 3 for trigrams, 4 for 4-grams, etc.
ngrams_list = list(ngrams(tokens, n))

# Create frequency dictionary: context -> {next word -> count}.
# The context is stored as a single space-joined string so that it works for
# any n (for n = 2 it is simply the previous word) and matches lookups that
# build their key with " ".join(<last n-1 words>).
ngram_freq = defaultdict(lambda: defaultdict(int))

for gram in ngrams_list:
    context, next_word = " ".join(gram[:-1]), gram[-1]
    ngram_freq[context][next_word] += 1

# Convert counts to probabilities: for each context, divide every follower
# count by the total number of times that context was seen.
ngram_prob = {}
for context, followers in ngram_freq.items():
    total_count = sum(followers.values())
    ngram_prob[context] = {word: count / total_count for word, count in followers.items()}

In [66]:
def predict_next_word(prev_word, top_k=3):
    """Return the most probable followers of ``prev_word``.

    Args:
        prev_word: Key to look up in the module-level ``ngram_prob`` table.
        top_k: Maximum number of candidate words to return.

    Returns:
        A list with up to ``top_k`` words ordered from most to least probable,
        or ``["No prediction available"]`` when ``prev_word`` is not a key of
        ``ngram_prob``.
    """
    # Guard clause: unknown context -> sentinel list.
    if prev_word not in ngram_prob:
        return ["No prediction available"]

    followers = ngram_prob[prev_word]
    # Rank the candidate words by their probability, highest first.
    ranked_words = sorted(followers, key=followers.get, reverse=True)
    return ranked_words[:top_k]

In [67]:
# Demo: look up the most likely followers of a single word and print them.
input_text = "economics"
predictions = predict_next_word(input_text)
print(f"Predictions for '{input_text}': {predictions}")

Predictions for 'economics': ['at']


In [68]:
def predict_next_word(input_text, top_k=1):
    """Predict the single most probable word to follow ``input_text``.

    The input is cleaned and tokenized the same way as the training corpus
    (lowercased, everything except a-z and whitespace removed, then
    ``word_tokenize``), and its last ``n - 1`` words are used as the lookup
    key into the module-level ``ngram_prob`` table.

    Note that this definition replaces the earlier list-returning
    ``predict_next_word`` defined in a previous cell.

    Args:
        input_text: Free text typed so far.
        top_k: Accepted for backward compatibility; only the single best
            word is returned.

    Returns:
        The most probable next word as a string, or the sentinel string
        ``"No prediction available"`` when the input has too few words or the
        context was never seen in the corpus.
    """
    no_prediction = "No prediction available"

    # Mirror the corpus preprocessing so that input tokens match model keys.
    cleaned = re.sub(r"[^a-z\s]", "", input_text.lower())
    words = word_tokenize(cleaned)

    context_size = n - 1
    if len(words) < context_size:
        return no_prediction

    # Explicit start index instead of words[-context_size:], which would
    # return the whole list when context_size is 0.
    input_seq = " ".join(words[len(words) - context_size:])

    candidates = ngram_prob.get(input_seq)
    if not candidates:
        return no_prediction

    # Only the best word is needed, so take the maximum instead of sorting.
    return max(candidates, key=candidates.get)

def generate_text(seed_text, num_predictions=5):
    """Extend ``seed_text`` one predicted word at a time.

    Args:
        seed_text: Starting text.
        num_predictions: Maximum number of words to append.

    Returns:
        The seed followed by the predicted words, separated by single spaces.
        Generation stops early as soon as ``predict_next_word`` returns the
        sentinel ``"No prediction available"``.
    """
    pieces = [seed_text]

    remaining = num_predictions
    while remaining > 0:
        remaining -= 1
        # Each prediction is made from the full text generated so far.
        candidate = predict_next_word(" ".join(pieces))
        if candidate == "No prediction available":
            break
        pieces.append(candidate)

    return " ".join(pieces)

# Example usage: start from a one-word seed and append up to six predicted
# words, then print the resulting sequence.
seed_text = "economic"
generated_sequence = generate_text(seed_text, num_predictions=6)

print("Generated Text:", generated_sequence)

Generated Text: economic reasons for the lord and the


# **N-Gram Auto-Completion: Implementation 2**

In [82]:
import random
from collections import defaultdict
import nltk
from nltk.corpus import gutenberg
import string

# Download the NLTK resources requested by this cell: the 'punkt_tab' package
# and the 'gutenberg' corpus read further down.
nltk.download('punkt_tab')
nltk.download('gutenberg')

class MyNGramModel:
    """Word-level n-gram model that auto-completes text by weighted sampling.

    Training counts which word follows each context of ``n - 1`` words. The
    counts are turned into temperature-scaled scores stored in ``self.model``
    (``context tuple -> {next word -> score}``). Prediction samples the next
    word from those scores, down-weighting words that were already generated.
    """

    def __init__(self, n=3, diversity_factor=0.6, temperature=1.2):
        """
        n: Size of the n-gram (e.g., 2 for bigram, 3 for trigram)
        diversity_factor: Reduces probability of repeated words
        temperature: Higher values make predictions more random

        Raises ValueError if n < 1 or temperature <= 0 (the latter would
        otherwise surface later as a division by zero or invalid weights).
        """
        if n < 1:
            raise ValueError("n must be at least 1")
        if temperature <= 0:
            raise ValueError("temperature must be positive")

        self.n = n
        self.diversity_factor = diversity_factor
        self.temperature = temperature
        # Raw follower counts, kept separately from the scores so that
        # train() can be called more than once without corrupting the model.
        self._counts = defaultdict(lambda: defaultdict(int))
        self.model = defaultdict(lambda: defaultdict(lambda: 0))

    @staticmethod
    def _tokenize(text):
        """Lowercase and tokenize ``text``, dropping punctuation-only tokens.

        A token is dropped when every one of its characters is in
        ``string.punctuation``, which also covers multi-character tokens
        such as ``--`` or ``...``.
        """
        return [
            token
            for token in nltk.word_tokenize(text.lower())
            if not all(char in string.punctuation for char in token)
        ]

    def _context(self, tokens):
        """Return the last ``n - 1`` tokens as a tuple (empty when n == 1)."""
        # Explicit start index: tokens[-0:] would return the whole list.
        return tuple(tokens[len(tokens) - (self.n - 1):])

    def train(self, text):
        """Add ``text`` to the model and rebuild the scores.

        Counts accumulate across calls. If ``text`` has fewer than ``n``
        usable tokens, a message is printed and the model is left unchanged.
        """
        tokens = self._tokenize(text)

        if len(tokens) < self.n:
            print("Not enough words to generate n-grams.")
            return

        # Count n-gram occurrences
        context_size = self.n - 1
        for i in range(len(tokens) - context_size):
            context = tuple(tokens[i:i + context_size])
            self._counts[context][tokens[i + context_size]] += 1

        # Convert counts to probabilities (apply temperature scaling).
        # Scores are always rebuilt from the raw counts, never from earlier
        # scores; they are renormalized at prediction time.
        exponent = 1 / self.temperature
        self.model.clear()
        for context, followers in self._counts.items():
            total_count = float(sum(followers.values()))
            scores = self.model[context]
            for word, count in followers.items():
                scores[word] = (count / total_count) ** exponent

    def predict_next(self, prefix, previous_words=None):
        """
        Predicts the next word using temperature-scaled probabilities.

        prefix: Text typed so far; tokenized exactly like the training text.
        previous_words: Collection of words already generated; each of them
            has its score multiplied by ``diversity_factor``. May be None.

        Returns the sampled word, or None when the prefix has fewer than
        ``n - 1`` tokens, the context is unknown, or all weights are zero.
        """
        prefix_tokens = self._tokenize(prefix)
        if len(prefix_tokens) < (self.n - 1):
            return None

        # .get() so that an unknown context does not create an empty entry.
        next_word_scores = self.model.get(self._context(prefix_tokens), {})
        if not next_word_scores:
            return None

        # Apply diversity factor (reduce probability of repeated words)
        seen = previous_words if previous_words is not None else ()
        weights = {
            word: score * (self.diversity_factor if word in seen else 1)
            for word, score in next_word_scores.items()
        }

        # random.choices raises on an all-zero weight total, so bail out first.
        if sum(weights.values()) == 0:
            return None

        # Weighted random choice; choices() accepts relative weights, so no
        # explicit normalization is required.
        words = list(weights)
        return random.choices(words, [weights[word] for word in words])[0]

    def autocomplete(self, prefix, num_suggestions=10):
        """
        Generates a sequence of words with better randomness and reduced repetition.

        Returns a list of progressively longer phrases, each one extending the
        previous phrase by one sampled word. Generation stops after
        ``num_suggestions`` words, when no next word can be predicted, or when
        more than three words have been generated and the next word repeats
        one of the last three.
        """
        suggestions = []
        generated_words = []   # ordered, for the repetition window
        previous_words = set() # unordered, for the diversity penalty
        for _ in range(num_suggestions):
            next_word = self.predict_next(prefix, previous_words)
            if not next_word:
                break

            # Stop early if next word is repetitive. The comparison is made
            # against the generated words, not the phrases in `suggestions`.
            if len(generated_words) > 3 and next_word in generated_words[-3:]:
                break

            prefix = f"{prefix} {next_word}"
            suggestions.append(prefix)
            generated_words.append(next_word)
            previous_words.add(next_word)

        return suggestions

# Train the N-gram model with its default settings on the raw text of
# 'austen-emma.txt' from the Gutenberg corpus.
corpus_text = gutenberg.raw('austen-emma.txt')
ngram_model = MyNGramModel()
ngram_model.train(corpus_text)

# Read a starting phrase from the user (surrounding whitespace removed) and
# generate the auto-completion suggestions for it.
prefix = input("Enter a prefix for auto-completion: ").strip()
suggestions = ngram_model.autocomplete(prefix)

# Show the header, then one suggestion per line.
print("\nAuto-complete Suggestions:")
for suggestion in suggestions:
    print(suggestion)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


Enter a prefix for auto-completion: He was a

Auto-complete Suggestions:
He was a strange
He was a strange thing
He was a strange thing love
He was a strange thing love is
He was a strange thing love is he
He was a strange thing love is he to
He was a strange thing love is he to be
He was a strange thing love is he to be collected
He was a strange thing love is he to be collected she
He was a strange thing love is he to be collected she was
