<a href="https://colab.research.google.com/github/akshhj/nlp-lab/blob/main/NLP_N_GRAM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
from collections import defaultdict, Counter
import random

class NGramModel:
    """Count-based n-gram language model used for next-word prediction.

    Training stores, for every context of ``n - 1`` tokens, how often each
    word followed it. Prediction converts those counts into relative
    frequencies.
    """

    def __init__(self, n):
        """Create an untrained model.

        Args:
            n: Order of the n-gram (1 for unigram, 2 for bigram, 3 for trigram).

        Raises:
            ValueError: If ``n`` is smaller than 1.
        """
        if n < 1:
            raise ValueError(f"n must be at least 1, got {n!r}")
        self.n = n  # Order of n-gram (2 for bigram, 3 for trigram)
        self.ngram_counts = defaultdict(Counter)  # context tuple -> Counter of next words
        self.context_counts = Counter()  # context tuple -> number of occurrences
        self.vocab = set()  # every token seen in training, including markers

    def preprocess(self, text):
        """Lowercase ``text``, drop non-alphanumeric characters and tokenize.

        Args:
            text: Raw sentence or phrase.

        Returns:
            The whitespace-separated tokens, preceded by ``n - 1`` ``"<s>"``
            start markers so that the first real word has a full context.
        """
        text = text.lower()
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
        tokens = text.split()
        return ["<s>"] * (self.n - 1) + tokens

    def train(self, corpus):
        """Accumulate n-gram counts from an iterable of sentence strings.

        Each sentence is preprocessed and terminated with ``"</s>"``. Calling
        this method again adds to the existing counts.

        Args:
            corpus: Iterable of raw sentence strings.
        """
        for sentence in corpus:
            tokens = self.preprocess(sentence) + ["</s>"]
            self.vocab.update(tokens)
            for i in range(len(tokens) - self.n + 1):
                context = tuple(tokens[i:i + self.n - 1])
                word = tokens[i + self.n - 1]
                self.ngram_counts[context][word] += 1
                self.context_counts[context] += 1

    def predict(self, context, top_k=3):
        """Return the most likely next words after ``context``.

        The last ``n - 1`` tokens of the preprocessed input form the lookup
        context; shorter inputs are completed by the ``"<s>"`` padding that
        ``preprocess`` adds. When that context was never seen in training,
        the overall distribution of words across all contexts is used
        instead.

        Args:
            context: Raw word or phrase typed by the user.
            top_k: Maximum number of candidates to return.

        Returns:
            A list of ``(word, probability)`` tuples sorted by descending
            probability. ``"</s>"`` may appear as a candidate. The list is
            empty when the model has not been trained.
        """
        tokens = self.preprocess(context)
        # Slice from an explicit start index: ``tokens[-(n - 1):]`` would be
        # ``tokens[-0:]`` (the whole list) for a unigram model.
        history = tuple(tokens[len(tokens) - (self.n - 1):])

        candidates = self.ngram_counts.get(history)
        if not candidates:
            # Unseen context: back off to the counts pooled over all contexts.
            candidates = Counter()
            for counts in self.ngram_counts.values():
                candidates.update(counts)

        return self._top_probabilities(candidates, top_k)

    @staticmethod
    def _top_probabilities(counts, top_k):
        """Turn a word->count mapping into the ``top_k`` (word, probability) pairs."""
        total = sum(counts.values())
        if total == 0:
            return []
        ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
        return [(word, count / total) for word, count in ranked[:top_k]]

# Small in-line sample sentences labelled as "Brown" and "WSJ" data; each
# list is the training corpus for one of the demo models below.
brown_corpus = [
    "The quick brown fox jumps over the lazy dog.",
    "A journey of a thousand miles begins with a single step.",
    "The dog barked at the stranger.",
    "She sells sea shells by the seashore.",
]

wsj_corpus = [
    "Stocks rose on Monday as investors showed optimism.",
    "The Federal Reserve signaled a potential rate cut.",
    "The market responded positively to the news.",
    "Investors reacted strongly to economic reports.",
]

# Bigram model trained on the Brown samples.
bigram_model_brown = NGramModel(2)
bigram_model_brown.train(brown_corpus)

# Trigram model trained on the WSJ samples.
trigram_model_wsj = NGramModel(3)
trigram_model_wsj.train(wsj_corpus)

# Ask once for a word or sentence, then show each model's predictions for it.
user_sentence = input("Enter a word or sentence for prediction: ")

for _heading, _model in (
    ("\nBigram Model (Brown Corpus) Predictions:", bigram_model_brown),
    ("\nTrigram Model (WSJ Corpus) Predictions:", trigram_model_wsj),
):
    print(_heading)
    print(_model.predict(user_sentence))

Enter a word or sentence for prediction: united

Bigram Model (Brown Corpus) Predictions:
[('the', 5), ('</s>', 4), ('a', 3)]

Trigram Model (WSJ Corpus) Predictions:
[('</s>', 4), ('the', 3), ('investors', 2)]


In [None]:
import nltk
from nltk.corpus import brown, treebank
from collections import Counter

# Fetch the NLTK corpus packages this cell reads from.
for _package in ("brown", "treebank"):
    nltk.download(_package)

# Preprocess the corpus
def preprocess_corpus(corpus):
    """Return a list of sentences with every word lowercased.

    Args:
        corpus: Iterable of sentences, each an iterable of word strings.

    Returns:
        A new list of lists; word order and sentence order are preserved.
    """
    lowered = []
    for sent in corpus:
        lowered.append([token.lower() for token in sent])
    return lowered

# Lowercased sentence lists for the Brown corpus and the treebank (WSJ) sample.
brown_sentences, wsj_sentences = (
    preprocess_corpus(reader.sents()) for reader in (brown, treebank)
)

# Function to build n-gram models
def build_ngram_model(sentences, n=2):
    """Count n-grams and their (n-1)-token contexts over tokenized sentences.

    Every sentence is padded with ``n - 1`` ``'<s>'`` markers at the start
    and a single ``'</s>'`` marker at the end before counting.

    Args:
        sentences: Iterable of sentences, each a list of tokens.
        n: Order of the n-gram (2 for bigram, 3 for trigram).

    Returns:
        A ``(ngram_counts, context_counts)`` pair of ``Counter`` objects
        keyed by token tuples of length ``n`` and ``n - 1`` respectively.
    """
    grams = Counter()
    histories = Counter()

    for words in sentences:
        padded = ['<s>'] * (n - 1) + words + ['</s>']
        window_count = len(padded) - n + 1
        for start in range(window_count):
            stop = start + n
            grams[tuple(padded[start:stop])] += 1
            histories[tuple(padded[start:stop - 1])] += 1

    return grams, histories

# Bigram (n=2) and trigram (n=3) counts, first for Brown and then for WSJ.
(
    (bigram_counts_brown, bigram_contexts_brown),
    (trigram_counts_brown, trigram_contexts_brown),
) = (build_ngram_model(brown_sentences, n=order) for order in (2, 3))

(
    (bigram_counts_wsj, bigram_contexts_wsj),
    (trigram_counts_wsj, trigram_contexts_wsj),
) = (build_ngram_model(wsj_sentences, n=order) for order in (2, 3))

# Function for next word prediction
def next_word_prediction(ngram_counts, context_counts, context, top_n=3):
    """Rank the words that followed ``context`` by relative frequency.

    Args:
        ngram_counts: Mapping of n-gram tuples to their counts.
        context_counts: Accepted for interface compatibility; not read here.
        context: Sequence of tokens; compared against every n-gram minus its
            last token.
        top_n: Maximum number of candidates to return.

    Returns:
        A list of ``(word, probability)`` tuples in descending probability
        order, or an empty list when no n-gram starts with ``context``.
    """
    history = tuple(context)

    # Scan all n-grams and keep the final token of those matching the history.
    followers = {}
    for gram, freq in ngram_counts.items():
        if gram[:-1] == history:
            followers[gram[-1]] = freq

    if not followers:
        return []

    denominator = sum(followers.values())
    ranked = sorted(followers.items(), key=lambda pair: pair[1], reverse=True)
    best = ranked[:top_n]

    return [(token, freq / denominator) for token, freq in best]

# Interactive prompt: keep reading phrases until the user types "exit".
while True:
    user_input = input("\nEnter a word or phrase (or type 'exit' to quit): ").lower().strip()
    if user_input == "exit":
        break

    words = user_input.split()

    # Bigram prediction uses the last word typed as its context.
    if words:
        bigram_context = tuple(words[-1:])

        bigram_predictions_brown = next_word_prediction(bigram_counts_brown, bigram_contexts_brown, bigram_context)
        bigram_predictions_wsj = next_word_prediction(bigram_counts_wsj, bigram_contexts_wsj, bigram_context)

        print("\nBigram Model Predictions:")
        for corpus_heading, ranked in (
            ("Brown Corpus:", bigram_predictions_brown),
            ("\nWSJ Corpus:", bigram_predictions_wsj),
        ):
            print(corpus_heading)
            for word, prob in ranked:
                print(f"{word}: {prob:.4f}")

    # Trigram prediction needs two words of context, so it is skipped for
    # single-word input.
    if len(words) > 1:
        trigram_context = tuple(words[-2:])

        trigram_predictions_brown = next_word_prediction(trigram_counts_brown, trigram_contexts_brown, trigram_context)
        trigram_predictions_wsj = next_word_prediction(trigram_counts_wsj, trigram_contexts_wsj, trigram_context)

        print("\nTrigram Model Predictions:")
        for corpus_heading, ranked in (
            ("Brown Corpus:", trigram_predictions_brown),
            ("\nWSJ Corpus:", trigram_predictions_wsj),
        ):
            print(corpus_heading)
            for word, prob in ranked:
                print(f"{word}: {prob:.4f}")

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.



Enter a word or phrase (or type 'exit' to quit): the united

Bigram Model Predictions:
Brown Corpus:
states: 0.8133
nations: 0.1017
states': 0.0083

WSJ Corpus:
states: 0.2632
illuminating: 0.1579
kingdom: 0.1579

Trigram Model Predictions:
Brown Corpus:
states: 0.8550
nations: 0.1069
states': 0.0102

WSJ Corpus:
states: 0.6250
steelworkers: 0.1250
nations: 0.1250

Enter a word or phrase (or type 'exit' to quit): family

Bigram Model Predictions:
Brown Corpus:
.: 0.1118
,: 0.0816
of: 0.0574

WSJ Corpus:
,: 0.1667
members: 0.0833
were: 0.0417

Enter a word or phrase (or type 'exit' to quit): exit
