In [3]:
import os
import re
import string
import random
from collections import Counter
from math import log, exp
import requests
import tarfile

In [2]:
def load_imdb_unsup_sentences(folder_path):
    """
    Loads text files from the IMDB 'unsup' (unsupervised) folder.
    split text by newline, strips text, and returns a list of raw lines.
    replace <br /> tags with special token <nl> token.
    """

    all_sentences = []

    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            with open(
                os.path.join(folder_path, filename), "r", encoding="utf-8"
            ) as file:
                for line in file:
                    line = line.strip().replace("<br />", " <nl> ")
                    all_sentences.append(line)

    return all_sentences


def remove_punctuation(text):
    """
    Removes punctuation from the text,
    but keeps <nl> tokens intact.
    """
    text = re.sub(
        r"(?<!<nl>)[{}]+(?!<nl>)".format(re.escape(string.punctuation)), "", text
    )

    return text


def build_vocabulary(sentences):
    """
    lower each sentence,
    Splits each sentence on whitespace, removes punctuation,
    and builds a set of unique tokens (vocabulary).
    """
    vocab = set()

    for sentence in sentences:
        sentence = sentence.lower()
        sentence = remove_punctuation(sentence)
        tokens = sentence.split()
        vocab.update(tokens)

    return vocab


def tokenize(sentences, vocab, unknown="<UNK>"):
    """
    lower each sentence,
    Splits each sentence on whitespace, removes punctuation,
    and replaces tokens not in the vocabulary with unknown token.
    Returns the list of tokenized sentences.
    """
    tokenized_sentences = []

    for sentence in sentences:
        sentence = sentence.lower()
        sentence = remove_punctuation(sentence)
        tokens = sentence.split()
        tokenized_sentence = [token if token in vocab else unknown for token in tokens]
        tokenized_sentences.append(tokenized_sentence)

    return tokenized_sentences


def split_data(sentences, test_split=0.1):
    """
    shuffle the sentences
    split them into train and test sets (first 1-test_split of the data is the training)
    return the train and test sets
    """
    random.shuffle(sentences)
    split_index = int(len(sentences) * (1 - test_split))
    train_sentences = sentences[:split_index]
    test_sentences = sentences[split_index:]

    return train_sentences, test_sentences


def pad_sentence(tokens, n):
    """
    Pads a list of tokens with <s> at the start (n-1 times)
    and </s> at the end (once).
    For example, if n=3, you add 2 <s> tokens at the start.
    """
    padded = ["<s>"] * (n - 1) + tokens + ["</s>"]
    return padded


def build_ngram_counts(tokenized_sentences, n):
    """
    Builds n-gram counts and (n-1)-gram counts from the given tokenized sentences.
    Each sentence is padded with <s> and </s>.

    Args:
        tokenized_sentences: list of lists, where each sub-list is a tokenized sentence.
        n: the order of the n-gram (e.g., 2 for bigrams, 3 for trigrams).

    Returns:
        ngram_counts: Counter of n-grams (tuples of length n).
        context_counts: Counter of (n-1)-gram contexts.
    """
    ngram_counts = Counter()
    context_counts = Counter()

    for sentence in tokenized_sentences:
        padded_sentence = pad_sentence(sentence, n)
        for i in range(len(padded_sentence) - n + 1):
            ngram = tuple(padded_sentence[i : i + n])
            context = tuple(padded_sentence[i : i + n - 1])
            ngram_counts[ngram] += 1
            context_counts[context] += 1

    return ngram_counts, context_counts


def laplace_probability(ngram, ngram_counts, context_counts, vocab_size, alpha=1.0):
    """
    Computes the probability of an n-gram using Laplace (add-alpha) smoothing.

    P(w_i | w_{i-(n-1)}, ..., w_{i-1}) =
        (count(ngram) + alpha) / (count(context) + alpha * vocab_size)

    Args:
        ngram: tuple of tokens representing the n-gram
        ngram_counts: Counter of n-grams
        context_counts: Counter of (n-1)-gram contexts
        vocab_size: size of the vocabulary
        alpha: smoothing parameter (1.0 = add-1 smoothing)

    Returns:
        Probability of the given n-gram.
    """
    ngram_count = ngram_counts[ngram]
    context = ngram[:-1]
    context_count = context_counts[context]
    prob = (ngram_count + alpha) / (context_count + alpha * vocab_size)
    return prob


def predict_next_token(
    context_tokens, ngram_counts, context_counts, vocab, n=2, alpha=1.0, top_k=5
):
    """
    Given a list of context tokens, predict the next token using the n-gram model.
    Returns the top_k predictions as (token, probability).
    """
    context = tuple(context_tokens[-(n - 1) :])
    candidates = []

    for token in vocab:
        ngram = context + (token,)
        prob = laplace_probability(
            ngram, ngram_counts, context_counts, len(vocab), alpha
        )
        candidates.append((token, prob))

    candidates.sort(key=lambda x: x[1], reverse=True)
    return candidates[:top_k]


def generate_text_with_limit(
    start_tokens, ngram_counts, context_counts, vocab, n=2, alpha=1.0, max_length=20
):
    """
    Generates text from an n-gram model until it sees </s>
    or reaches a maximum total length (max_length).

    Args:
        start_tokens (list): initial context to begin generation
        ngram_counts (Counter): trained n-gram counts
        context_counts (Counter): trained (n-1)-gram counts
        vocab (set): the model vocabulary
        n (int): n-gram order, 2 for bigram, 3 for trigram, etc.
        alpha (float): Laplace smoothing parameter
        max_length (int): maximum number of tokens to generate (including start_tokens)

    Returns:
        A list of tokens representing the generated sequence.
    """
    generated = start_tokens[:]

    while len(generated) < max_length:
        context_tokens = generated[-(n - 1) :]
        next_token_candidates = predict_next_token(
            context_tokens,
            ngram_counts,
            context_counts,
            vocab,
            n,
            alpha,
            top_k=10,  # Increased top_k
        )

        if not next_token_candidates:
            break

        # Use weighted random choice based on probabilities
        total_prob = sum(prob for _, prob in next_token_candidates)
        rand_val = random.random() * total_prob
        cumulative = 0

        for token, prob in next_token_candidates:
            cumulative += prob
            if cumulative > rand_val:
                next_token = token
                break
        else:
            next_token = next_token_candidates[0][0]

        if next_token == "</s>":
            break

        generated.append(next_token)

    return generated


def calculate_perplexity(
    tokenized_sentences, ngram_counts, context_counts, vocab_size, n=2, alpha=1.0
):
    """
    Calculates the perplexity of an n-gram model (with Laplace smoothing)
    on a list of tokenized sentences.

    Args:
        tokenized_sentences: List of lists of tokens.
        ngram_counts: Counter of n-grams.
        context_counts: Counter of (n-1)-grams.
        vocab_size: Size of the vocabulary.
        n: n-gram order.
        alpha: Laplace smoothing parameter.

    Returns:
        A float representing the perplexity on the given dataset.
    """
    log_prob_sum = 0.0
    total_tokens = 0

    for sentence in tokenized_sentences:
        padded_sentence = pad_sentence(sentence, n)
        for i in range(len(padded_sentence) - n + 1):
            ngram = tuple(padded_sentence[i : i + n])
            prob = laplace_probability(
                ngram, ngram_counts, context_counts, vocab_size, alpha
            )
            log_prob_sum += log(prob)
            total_tokens += 1

    perplexity = exp(-log_prob_sum / total_tokens)
    return perplexity

In [4]:
# Download the IMDB dataset
url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
output_path = "/content/aclImdb_v1.tar.gz"

# Download the file
response = requests.get(url, stream=True)
if response.status_code == 200:
    with open(output_path, 'wb') as f:
        f.write(response.content)
    print("Download complete!")
else:
    raise Exception("Failed to download the dataset")

# Extract the tar.gz file
with tarfile.open(output_path, "r:gz") as tar:
    tar.extractall(path="/content")
print("Extraction complete!")

# Set the path to the unsupervised folder
imdb_folder = "/content/aclImdb/train/unsup"

Download complete!
Extraction complete!


In [6]:
sentences = load_imdb_unsup_sentences(imdb_folder)

assert len(sentences) == 50000, "Expected 50,000 sentences from the unsup folder."

random.seed(42)

train_sentences, test_sentences = split_data(sentences)


assert len(train_sentences) == 45000, "Expected 45,000 sentences for training."
assert len(test_sentences) == 5000, "Expected 5,000 sentences for testing."

In [7]:
vocab = build_vocabulary(train_sentences)
tokenized_sentences = tokenize(train_sentences, vocab)

assert (
    len(tokenized_sentences) == 45000
), "Expected tokenized sentences count to match raw sentences."

example = "I love Natural language processing, and i want to be a great engineer."
assert (
    len(example) == 70
), "Example sentence length (in characters) does not match the expected 70."

example_tokens = tokenize([example], vocab)[0]
assert (
    len(example_tokens) == 13
), "Token count for the example sentence does not match the expected 13."


## Building N-gram Models
### For alpha = 0.5

In [10]:
n = 2
alpha = 0.5
ngram_counts, context_counts = build_ngram_counts(tokenized_sentences, n=n)

context = ["i", "love"]
generated_seq = generate_text_with_limit(
    start_tokens=context,
    ngram_counts=ngram_counts,
    context_counts=context_counts,
    vocab=vocab,
    n=n,
    alpha=alpha,
    max_length=128,
)

print("Generated Sequence:", generated_seq)

test_tockenized_sentences = tokenize(test_sentences, vocab)
print(
    f"Preplexity: {calculate_perplexity(test_tockenized_sentences, ngram_counts, context_counts, len(vocab), n, alpha)}"
)

Generated Sequence: ['i', 'love', 'with', 'a', 'movie', 'the', 'movie', 'is', 'the', 'film', 'i', 'saw', 'this', 'is', 'just', 'a', 'very', 'well', 'and', 'i', 'was', 'not', 'to', 'watch', 'a', 'bit', 'too', 'long', 'as', 'a', 'lot', 'of', 'the', 'first', 'one', 'to', 'get', 'to', 'the', 'plot', 'is', 'the', 'first', 'of', 'his', 'father', 'has', 'been', 'more', 'to', 'the', 'most', 'of', 'the', 'movie', 'and', 'the', 'first', 'two', 'of', 'the', 'only', 'one', 'thing', 'but', 'it', 'nl', 'nl', 'nl', 'nl', 'nl', 'nl', 'nl', 'nl', 'nl', 'nl', 'and', 'the', 'plot', 'is', 'a', 'movie', 'that', 'is', 'a', 'great', 'as', 'he', 'has', 'been', 'better', 'than', 'any', 'other', 'than', 'anything', 'but', 'it', 'and', 'he', 'has', 'to', 'be', 'one', 'of', 'a', 'great', 'film', 'was', 'so', 'i', 'dont', 'like', 'this', 'film', 'i', 'saw', 'in', 'the', 'movie', 'nl', 'nl', 'nl', 'nl', 'it', 'to', 'have', 'been']
Preplexity: 2385.220921090936


#### Observations:

  * **Coherence**:
Starts reasonably with “i love with a movie the movie is the film,” which is somewhat grammatical but awkward. It then drifts into a mix of semi-coherent phrases (“i saw this is just a very well”) and fragmented ideas (“the plot is the first of his father has been”).
  * **Repetition**: Repeats common words like “the” (14 times), “a” (8 times), “movie” (5 times), and “nl” (10 times in a row). No single word dominates excessively.
  * **Special Tokens**: The `<nl>` token appears frequently, especially in a long stretch, suggesting the model learned it as a common continuation in the IMDB data (reviews often have line breaks).

#### Interpretation:
With `n=2`, the context is just the last token (e.g., "love" → "with"), making it prone to generic transitions. `alpha=0.5` (light smoothing) favors frequent bigrams, leading to a mix of movie-related words (“movie,” “film,” “plot”) and connectors (“the,” “a”), but it struggles to maintain long-term coherence.

In [12]:
n = 3
alpha = 0.5
ngram_counts, context_counts = build_ngram_counts(tokenized_sentences, n=n)

context = ["i", "love"]
generated_seq = generate_text_with_limit(
    start_tokens=context,
    ngram_counts=ngram_counts,
    context_counts=context_counts,
    vocab=vocab,
    n=n,
    alpha=alpha,
    max_length=128,
)

print("Generated Sequence:", generated_seq)

test_tockenized_sentences = tokenize(test_sentences, vocab)
print(
    f"Preplexity: {calculate_perplexity(test_tockenized_sentences, ngram_counts, context_counts, len(vocab), n, alpha)}"
)

Generated Sequence: ['i', 'love', 'the', 'movie', 'is', 'a', 'great', 'film', 'but', 'i', 'think', 'this', 'film', 'has', 'to', 'be', 'an', 'action', 'movie', 'in', 'fact', 'this', 'film', 'is', 'not', 'the', 'best', 'part', 'of', 'the', 'most', 'part', 'the', 'film', 'and', 'it', 'was', 'a', 'bit', 'of', 'fun', 'but', 'he', 'was', 'so', 'awful', 'its', 'actually', 'a', 'scene', 'that', 'was', 'a', 'very', 'young', 'kids', 'might', 'enjoy', 'it', 'for', 'me', 'is', 'the', 'only', 'thing', 'the', 'story', 'is', 'about', 'a', 'young', 'age', 'i', 'was', 'a', 'kid', 'and', 'the', 'acting', 'is', 'pretty', 'much', 'just', 'seeing', 'daffy', 'ghostsand', 'lifesent', 'comebackcare', 'shadowcat', 'lifesent', 'handto', 'imrie', 'cinémavérité', 'lifesent', 'sepiatoned', 'cinémavérité', 'cinémavérité', 'jot', 'jot', 'jot', 'cinémavérité', 'comebackcare', 'imrie', 'sepiatoned', 'jot', 'handto', 'lifesent', 'sepiatoned', 'comebackcare', 'sepiatoned', 'phones', 'comebackcare', 'handto', 'ghostsand'

#### Observations:

* **Coherence**: Much better initially: “i love the movie is a great film but i think this film has to be an action movie” reads like a natural review snippet. It stays coherent through “kids might enjoy it for me is the only thing,” then degrades into gibberish with rare tokens like “ghostsand,” “cinémavérité,” and “imrie.”
* **Repetition**: Less repetition of common words (“the” 8 times, “film” 4 times), but a late stretch repeats obscure tokens (“cinémavérité,” “jot,” “shadowcat”) multiple times.
* **Special Tokens**: No `<nl>`, but rare words dominate the end, possibly low-frequency tokens from the training data.

#### Interpretation:
`n=3` uses two-token contexts (e.g., "i love" → "the"), capturing more structure than bigrams. `alpha=0.5` keeps probabilities tied to counts, so early coherence reflects frequent trigrams, but sparsity later (unseen contexts) lets smoothing favor rare tokens.

In [13]:
n = 4
alpha = 0.5
ngram_counts, context_counts = build_ngram_counts(tokenized_sentences, n=n)

context = ["i", "love"]
generated_seq = generate_text_with_limit(
    start_tokens=context,
    ngram_counts=ngram_counts,
    context_counts=context_counts,
    vocab=vocab,
    n=n,
    alpha=alpha,
    max_length=128,
)

print("Generated Sequence:", generated_seq)

test_tockenized_sentences = tokenize(test_sentences, vocab)
print(
    f"Preplexity: {calculate_perplexity(test_tockenized_sentences, ngram_counts, context_counts, len(vocab), n, alpha)}"
)

Generated Sequence: ['i', 'love', 'imrie', 'imrie', 'phones', 'comebackcare', 'jot', 'imrie', 'imrie', 'lifesent', 'sepiatoned', 'jot', 'shadowcat', 'comebackcare', 'cinémavérité', 'shadowcat', 'comebackcare', 'jot', 'ghostsand', 'imrie', 'handto', 'handto', 'sepiatoned', 'imrie', 'handto', 'imrie', 'phones', 'handto', 'cinémavérité', 'imrie', 'ghostsand', 'cinémavérité', 'shadowcat', 'imrie', 'handto', 'shadowcat', 'comebackcare', 'lifesent', 'lifesent', 'ghostsand', 'handto', 'imrie', 'handto', 'cinémavérité', 'jot', 'handto', 'cinémavérité', 'handto', 'jot', 'ghostsand', 'handto', 'shadowcat', 'imrie', 'handto', 'jot', 'cinémavérité', 'shadowcat', 'lifesent', 'lifesent', 'ghostsand', 'cinémavérité', 'cinémavérité', 'shadowcat', 'jot', 'lifesent', 'shadowcat', 'phones', 'cinémavérité', 'jot', 'jot', 'ghostsand', 'handto', 'phones', 'handto', 'ghostsand', 'handto', 'ghostsand', 'lifesent', 'jot', 'handto', 'imrie', 'imrie', 'imrie', 'cinémavérité', 'shadowcat', 'sepiatoned', 'shadowca

#### Observations:

* **Coherence**: Starts with “i love” then immediately becomes incoherent with “imrie imrie phones.” It’s a jumble of rare tokens like “imrie,” “cinémavérité,” “shadowcat,” and “jot.”
* **Repetition**: High repetition of specific rare words: “imrie” (12 times), “handto” (13 times), “cinémavérité” (11 times), “shadowcat” (11 times), “jot” (10 times).
* **Special Tokens**: No `<nl>`, just a flood of obscure tokens.

#### Interpretation:
`n=4` uses three-token contexts (e.g., "i love imrie" → "imrie"), but the training data likely has sparse four-grams. With `alpha=0.5`, unseen contexts get low probabilities, and smoothing amplifies rare tokens that appear in a few contexts, causing a feedback loop of repetition.

### For alpha = 1


In [14]:
n = 2
alpha = 1
ngram_counts, context_counts = build_ngram_counts(tokenized_sentences, n=n)

context = ["i", "love"]
generated_seq = generate_text_with_limit(
    start_tokens=context,
    ngram_counts=ngram_counts,
    context_counts=context_counts,
    vocab=vocab,
    n=n,
    alpha=alpha,
    max_length=128,
)

print("Generated Sequence:", generated_seq)

test_tockenized_sentences = tokenize(test_sentences, vocab)
print(
    f"Preplexity: {calculate_perplexity(test_tockenized_sentences, ngram_counts, context_counts, len(vocab), n, alpha)}"
)

Generated Sequence: ['i', 'love', 'with', 'an', 'interesting', 'and', 'i', 'am', 'not', 'a', 'film', 'nl', 'in', 'a', 'bit', 'as', 'well', 'it', 'was', 'the', 'first', 'one', 'of', 'a', 'great', 'job', 'of', 'the', 'most', 'people', 'who', 'has', 'an', 'excellent', 'nl', 'the', 'movie', 'with', 'the', 'film', 'and', 'the', 'movie', 'the', 'movie', 'and', 'the', 'movie', 'that', 'he', 'is', 'just', 'as', 'well', 'nl', 'nl', 'nl', 'nl', 'nl', 'i', 'can', 'make', 'up', 'in', 'the', 'story', 'about', 'the', 'first', 'saw', 'the', 'story', 'and', 'he', 'can', 'be', 'a', 'great', 'film', 'the', 'story', 'line', 'of', 'the', 'best', 'performance', 'as', 'a', 'few', 'years', 'ago', 'and', 'the', 'movie', 'i', 'was', 'one', 'is', 'an', 'interesting', 'and', 'the', 'same', 'time', 'with', 'a', 'movie', 'with', 'an', 'actor', 'he', 'did', 'not', 'the', 'only', 'thing', 'that', 'was', 'so', 'i', 'was', 'a', 'good', 'and', 'the', 'film', 'is', 'that']
Preplexity: 3473.804355257891


#### Observations:

* **Coherence**: Starts with “i love with an interesting and i am not a film,” which is odd but somewhat readable. Phrases like “the movie with the film and the movie” and “he can be a great film the story line” show partial coherence but lack fluency.
* **Repetition**: “the” (15 times), “movie” (6 times), “nl” (5 times), “a” (8 times). No extreme single-word repetition.
* **Special Tokens**: <nl> appears in bursts, reflecting its frequency in reviews.

#### Interpretation:
`alpha=1.0` (standard Laplace smoothing) balances counts and smoothing more than `0.5`, producing slightly more varied bigrams. Still, the short context limits coherence.

In [11]:
n = 3
alpha = 1
ngram_counts, context_counts = build_ngram_counts(tokenized_sentences, n=n)

context = ["i", "love"]
generated_seq = generate_text_with_limit(
    start_tokens=context,
    ngram_counts=ngram_counts,
    context_counts=context_counts,
    vocab=vocab,
    n=n,
    alpha=alpha,
    max_length=128,
)

print("Generated Sequence:", generated_seq)

test_tockenized_sentences = tokenize(test_sentences, vocab)
print(
    f"Preplexity: {calculate_perplexity(test_tockenized_sentences, ngram_counts, context_counts, len(vocab), n, alpha)}"
)

Generated Sequence: ['i', 'love', 'this', 'show', 'is', 'the', 'best', 'thing', 'about', 'this', 'movie', 'is', 'not', 'the', 'case', 'with', 'so', 'much', 'for', 'the', 'rest', 'of', 'the', 'movie', 'was', 'very', 'well', 'but', 'the', 'film', 'and', 'a', 'half', 'of', 'the', 'movie', 'was', 'a', 'good', 'job', 'with', 'the', 'characters', 'are', 'all', 'the', 'way', 'they', 'stop', 'being', 'funny', 'as', 'the', 'film', 'that', 'doesnt', 'matter', 'how', 'much', 'of', 'an', 'old', 'fashioned', 'horror', 'mmovie', 'ghostsand', 'ghostsand', 'handto', 'imrie', 'cinémavérité', 'ghostsand', 'shadowcat', 'jot', 'comebackcare', 'lifesent', 'imrie', 'lifesent', 'lifesent', 'handto', 'handto', 'shadowcat', 'ghostsand', 'comebackcare', 'jot', 'lifesent', 'comebackcare', 'ghostsand', 'lifesent', 'imrie', 'sepiatoned', 'lifesent', 'shadowcat', 'ghostsand', 'jot', 'ghostsand', 'ghostsand', 'lifesent', 'shadowcat', 'handto', 'sepiatoned', 'shadowcat', 'cinémavérité', 'cinémavérité', 'imrie', 'ghos

#### Observations:

* **Coherence**: Strong start: “i love this show is the best thing about this movie is not the case with so much for the rest of the movie” feels like a review. It holds up through “old fashioned horror mmovie,” then collapses into rare tokens.
* **Repetition**: “the” (11 times), “movie” (3 times), later “ghostsand” (6 times), “imrie” (5 times).
* **Special Tokens**: No <nl>, but rare tokens take over later.

#### Interpretation:
`alpha=1.0` with `n=3` improves early coherence by smoothing more, but sparsity in trigrams still leads to rare-token dominance later.

In [15]:
n = 4
alpha = 1
ngram_counts, context_counts = build_ngram_counts(tokenized_sentences, n=n)

context = ["i", "love"]
generated_seq = generate_text_with_limit(
    start_tokens=context,
    ngram_counts=ngram_counts,
    context_counts=context_counts,
    vocab=vocab,
    n=n,
    alpha=alpha,
    max_length=128,
)

print("Generated Sequence:", generated_seq)

test_tockenized_sentences = tokenize(test_sentences, vocab)
print(
    f"Preplexity: {calculate_perplexity(test_tockenized_sentences, ngram_counts, context_counts, len(vocab), n, alpha)}"
)

Generated Sequence: ['i', 'love', 'handto', 'phones', 'shadowcat', 'comebackcare', 'jot', 'comebackcare', 'lifesent', 'lifesent', 'phones', 'ghostsand', 'comebackcare', 'lifesent', 'jot', 'lifesent', 'lifesent', 'handto', 'sepiatoned', 'jot', 'ghostsand', 'shadowcat', 'ghostsand', 'jot', 'shadowcat', 'lifesent', 'jot', 'jot', 'shadowcat', 'comebackcare', 'shadowcat', 'phones', 'handto', 'jot', 'phones', 'lifesent', 'handto', 'lifesent', 'sepiatoned', 'ghostsand', 'jot', 'lifesent', 'shadowcat', 'jot', 'handto', 'ghostsand', 'sepiatoned', 'cinémavérité', 'lifesent', 'lifesent', 'lifesent', 'shadowcat', 'jot', 'imrie', 'imrie', 'comebackcare', 'jot', 'jot', 'ghostsand', 'lifesent', 'shadowcat', 'cinémavérité', 'cinémavérité', 'sepiatoned', 'shadowcat', 'shadowcat', 'comebackcare', 'comebackcare', 'cinémavérité', 'sepiatoned', 'sepiatoned', 'jot', 'shadowcat', 'comebackcare', 'lifesent', 'shadowcat', 'imrie', 'phones', 'comebackcare', 'imrie', 'comebackcare', 'shadowcat', 'shadowcat', 'gh

#### Observations:

* **Coherence**: Starts “i love handto phones” and stays incoherent, dominated by rare tokens.
* **Repetition**: “lifesent” (12 times), “jot” (11 times), “shadowcat” (10 times), “cinémavérité” (8 times).
* **Special Tokens**: No <nl>, just rare-token chaos.

#### Interpretation:
 `alpha=1.0` smooths more, but `n=4` sparsity makes it worse, amplifying rare tokens in a repetitive loop.

### General Notes
#### 1. Trend with `n`:
Perplexity increases dramatically as `n` grows (2 → 3 → 4) for both alpha values. This reflects data sparsity—higher-order n-grams (trigrams, four-grams) have fewer occurrences, making the model less predictive on test data.
#### 2. Trend with `alpha`:
 For each n, `alpha=1.0` yields higher perplexity than alpha=`0.5`. Lighter smoothing (`0.5`) relies more on counts, fitting the training data better, while `1.0` smooths more, increasing probabilities for unseen n-grams and raising perplexity.
#### 3. Best Model:
 `n=2`, `alpha=0.5` has the lowest perplexity (2385.22), suggesting bigrams with light smoothing generalize best to the test set.