In [3]:
import nltk
from collections import Counter, defaultdict
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
import math

# Download necessary NLTK data
nltk.download('punkt')

class NGramLanguageModel:
    def __init__(self, n):
        self.n = n  # The "n" in N-gram
        self.ngram_counts = defaultdict(Counter)
        self.context_counts = Counter()
        self.vocab = set()
        
    def train(self, corpus):
        """
        Train the N-gram model on a given text corpus.
        """
        tokens = word_tokenize(corpus.lower())
        self.vocab.update(tokens)
        
        # Add start and end tokens for padding
        padded_tokens = ["<s>"] * (self.n - 1) + tokens + ["</s>"]
        
        # Generate N-grams
        ngrams_generated = list(ngrams(padded_tokens, self.n))
        
        for ngram in ngrams_generated:
            context = ngram[:-1]
            word = ngram[-1]
            self.ngram_counts[context][word] += 1
            self.context_counts[context] += 1
    
    def predict_next(self, context):
        """
        Predict the next word given a context.
        """
        context = tuple(context)
        if context in self.ngram_counts:
            return self.ngram_counts[context].most_common(1)[0][0]
        else:
            return None  # No prediction available
    
    def calculate_probability(self, sentence):
    """
    Calculate the probability of a sentence using the N-gram model.
    Applies Laplace smoothing to handle unseen n-grams.
    """
    tokens = word_tokenize(sentence.lower())
    padded_tokens = ["<s>"] * (self.n - 1) + tokens + ["</s>"]
    ngrams_generated = list(ngrams(padded_tokens, self.n))

    probability = 0
    vocab_size = len(self.vocab)  # Vocabulary size for smoothing

    for ngram in ngrams_generated:
        context = ngram[:-1]
        word = ngram[-1]
        count_context = self.context_counts[context]
        count_ngram = self.ngram_counts[context][word]

        # Apply Add-1 (Laplace) smoothing
        smoothed_probability = (count_ngram + 1) / (count_context + vocab_size)
        probability += math.log(smoothed_probability)

    return math.exp(probability)  # Convert log probability back

    
    def generate_sentence(self, max_words=15):
        """
        Generate a sentence using the N-gram model.
        """
        sentence = ["<s>"] * (self.n - 1)
        for _ in range(max_words):
            context = tuple(sentence[-(self.n - 1):])
            next_word = self.predict_next(context)
            if next_word == "</s>" or next_word is None:
                break
            sentence.append(next_word)
        return ' '.join(sentence[(self.n - 1):])

# Example usage
if __name__ == "__main__":
    # Example corpus
    corpus = "This is a simple example. This example is for building an n-gram language model."
    
    # Train a trigram model (3-gram)
    ngram_model = NGramLanguageModel(n=3)
    ngram_model.train(corpus)
    
    # Predict the next word
    context = ["this", "is"]
    print("Next word prediction:", ngram_model.predict_next(context))
    
    # Calculate sentence probability
    sentence = "This is a simple example"
    print("Sentence probability:", ngram_model.calculate_probability(sentence))
    
    # Generate a sentence
    print("Generated sentence:", ngram_model.generate_sentence())


IndentationError: expected an indented block after function definition on line 46 (2747007855.py, line 47)