In [12]:
import nltk
from nltk.corpus import gutenberg
from nltk.util import ngrams
from collections import defaultdict, Counter
import random
import math

nltk.download('gutenberg')
nltk.download('punkt_tab')

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [13]:
raw_text = gutenberg.raw('austen-emma.txt')
# raw_text = gutenberg.raw('carroll-alice.txt')
tokens = nltk.word_tokenize(raw_text)
tokens = [word.lower() for word in tokens if word.isalpha()]

train_size = int(len(tokens) * 0.8)
train_tokens = tokens[:train_size]
test_tokens = tokens[train_size:]

vocab = set(train_tokens)
V = len(vocab)

In [14]:
def train_model(tokens, n):
    model = defaultdict(Counter)
    if n == 1:
        model['unigram'] = Counter(tokens)
        total_count = float(sum(model['unigram'].values()))
        for word in model['unigram']:
            model['unigram'][word] /= total_count
        return model

    ngrams_list = list(ngrams(tokens, n))
    context_list = list(ngrams(tokens, n-1))

    ngram_counts = Counter(ngrams_list)
    context_counts = Counter(context_list)

    for ngram, count in ngram_counts.items():
        context = ngram[:-1]
        word = ngram[-1]
        model[context][word] = (count + 1) / (context_counts[context] + V)

    for context in context_counts:
        unseen_words = V - len(model[context])
        default_prob = 1 / (context_counts[context] + V)
        model[context]['<UNK>'] = default_prob * unseen_words

    return model

def generate_text(model, n, start_word, num_words=10):
    if n == 1:
        words = list(model['unigram'].keys())
        probs = list(model['unigram'].values())
        return ' '.join(random.choices(words, weights=probs, k=num_words))

    text = [start_word]
    current_context = [start_word]

    for _ in range(num_words):
        context_tuple = tuple(current_context[-(n-1):])

        if context_tuple in model:
            next_word_probs = model[context_tuple]
            words = list(next_word_probs.keys())
            probs = list(next_word_probs.values())

            total_prob = sum(probs)
            if total_prob > 0:
                probs = [p / total_prob for p in probs]
            else:
                next_word = random.choice(list(vocab))
                text.append(next_word)
                current_context.append(next_word)
                continue

            next_word = random.choices(words, weights=probs, k=1)[0]
            if next_word == '<UNK>':
                next_word = random.choice(list(vocab))

        else:
            next_word = random.choice(list(vocab))

        text.append(next_word)
        current_context.append(next_word)

    return ' '.join(text)


def calculate_perplexity(model, n, test_tokens):
    log_prob_sum = 0
    test_ngram_count = 0

    if n == 1:
        test_ngram_list = test_tokens
        unigram_probs = model['unigram']
        for token in test_ngram_list:
            prob = unigram_probs.get(token, 1/V)
            if prob > 0:
                log_prob_sum += math.log(prob)
        test_ngram_count = len(test_ngram_list)
    else:
        test_ngram_list = list(ngrams(test_tokens, n))
        for ngram in test_ngram_list:
            context = ngram[:-1]
            word = ngram[-1]

            if context in model and word in model[context]:
                prob = model[context][word]
            elif context in model and '<UNK>' in model[context]:
                 prob = model[context]['<UNK>']
            else:
                prob = 1 / V

            if prob > 0:
                log_prob_sum += math.log(prob)

        test_ngram_count = len(test_ngram_list)

    if test_ngram_count == 0:
        return float('inf')

    perplexity = math.exp(-log_prob_sum / test_ngram_count)
    return perplexity


In [15]:
for n_val in range(1, 5):
    print(f"N-Gram Model: n={n_val}")

    ngram_model = train_model(train_tokens, n_val)

    start_word = random.choice(train_tokens)
    generated_sequence = generate_text(ngram_model, n_val, start_word, num_words=10)
    print(f"Generated Text: {generated_sequence}")

    perplexity = calculate_perplexity(ngram_model, n_val, test_tokens)
    print(f"Perplexity: {perplexity:.4f}\n")

N-Gram Model: n=1
Generated Text: he of to equal one or and did something passing
Perplexity: 478.0174

N-Gram Model: n=2
Generated Text: difference bearing many establish touch fixing informs doing channel simple knowing
Perplexity: 84.4412

N-Gram Model: n=3
Generated Text: of lot cultivation giving doubtfully want satisfied allay spinet stands commandingly
Perplexity: 98.9957

N-Gram Model: n=4
Generated Text: wished seriously perry cousin william objecting naming wrong equal specimen severity
Perplexity: 1245.2867

