## Prepare Data

In [1]:
import nltk
from nltk.tokenize import word_tokenize

def split_to_sentences(data):
    """Z
    Split data by linebreak "\n" and clean the sentences.
    """
    sentences = data.split("\n")
    sentences = [s.strip() for s in sentences if s.strip()]
    return sentences

def count_words(tokenized_sentences):
    """
    Manually count the frequency of each word in the tokenized sentences.
    """
    word_counts = {}
    for sentence in tokenized_sentences:
        for token in sentence:
            if token not in word_counts:
                word_counts[token] = 1
            else:
                word_counts[token] += 1
    return word_counts

def get_words_with_nplus_frequency(tokenized_sentences, count_threshold):
    """
    Filter words that appear at least 'count_threshold' times.
    """
    closed_vocab = []
    word_counts = count_words(tokenized_sentences)
    for word, cnt in word_counts.items():
        if cnt >= count_threshold:
            closed_vocab.append(word)
    return closed_vocab

def prepare_data(infile, ngram_size=2):
    with open(infile, 'r', encoding='utf-8') as file:
        text = file.read()
    
    sentences = split_to_sentences(text)
    print("Number of sentences:", len(sentences))  
    
    tokenized_sentences = [word_tokenize(sentence.lower()) for sentence in sentences]
    print("Sample tokenized sentences:", tokenized_sentences[:3])  

    vocabulary = get_words_with_nplus_frequency(tokenized_sentences, 3)
    print("Vocabulary sample:", list(vocabulary)[:10]) 

    processed_sentences = []
    for sentence in tokenized_sentences:
        processed_sentence = [token if token in vocabulary else '<UNK>' for token in sentence]
        start_tokens = ['<s>'] * (ngram_size - 1)
        end_tokens = ['<e>'] * (ngram_size - 1)
        processed_sentence = start_tokens + processed_sentence + end_tokens
        processed_sentences.append(processed_sentence)

    print("Processed sentences sample:", processed_sentences[:1])  
    return processed_sentences


In [3]:
processed_text = prepare_data('/kaggle/input/us-twitter/en_US.twitter.txt', ngram_size=2)
 

Number of sentences: 47961
Sample tokenized sentences: [['how', 'are', 'you', '?', 'btw', 'thanks', 'for', 'the', 'rt', '.', 'you', 'gon', 'na', 'be', 'in', 'dc', 'anytime', 'soon', '?', 'love', 'to', 'see', 'you', '.', 'been', 'way', ',', 'way', 'too', 'long', '.'], ['when', 'you', 'meet', 'someone', 'special', '...', 'you', "'ll", 'know', '.', 'your', 'heart', 'will', 'beat', 'more', 'rapidly', 'and', 'you', "'ll", 'smile', 'for', 'no', 'reason', '.'], ['they', "'ve", 'decided', 'its', 'more', 'fun', 'if', 'i', 'do', "n't", '.']]
Vocabulary sample: ['how', 'are', 'you', '?', 'btw', 'thanks', 'for', 'the', 'rt', '.']
Processed sentences sample: [['<s>', 'how', 'are', 'you', '?', 'btw', 'thanks', 'for', 'the', 'rt', '.', 'you', 'gon', 'na', 'be', 'in', 'dc', 'anytime', 'soon', '?', 'love', 'to', 'see', 'you', '.', 'been', 'way', ',', 'way', 'too', 'long', '.', '<e>']]


In [4]:
print(processed_text[:100]) 

[['<s>', 'how', 'are', 'you', '?', 'btw', 'thanks', 'for', 'the', 'rt', '.', 'you', 'gon', 'na', 'be', 'in', 'dc', 'anytime', 'soon', '?', 'love', 'to', 'see', 'you', '.', 'been', 'way', ',', 'way', 'too', 'long', '.', '<e>'], ['<s>', 'when', 'you', 'meet', 'someone', 'special', '...', 'you', "'ll", 'know', '.', 'your', 'heart', 'will', 'beat', 'more', 'rapidly', 'and', 'you', "'ll", 'smile', 'for', 'no', 'reason', '.', '<e>'], ['<s>', 'they', "'ve", 'decided', 'its', 'more', 'fun', 'if', 'i', 'do', "n't", '.', '<e>'], ['<s>', 'so', 'tired', 'd', ';', 'played', '<UNK>', 'tag', '&', 'ran', 'a', 'lot', 'd', ';', '<UNK>', 'going', 'to', 'sleep', 'like', 'in', '5', 'minutes', ';', ')', '<e>'], ['<s>', 'words', 'from', 'a', 'complete', 'stranger', '!', 'made', 'my', 'birthday', 'even', 'better', ':', ')', '<e>'], ['<s>', 'first', 'cubs', 'game', 'ever', '!', 'wrigley', 'field', 'is', 'gorgeous', '.', 'this', 'is', 'perfect', '.', 'go', 'cubs', 'go', '!', '<e>'], ['<s>', 'i', 'no', '!', 'i',

In [5]:
def count_n_grams(data, n):
    """
    Count all n-grams in the data.

    Args:
        data: List of lists of words, each list representing tokenized sentences.
        n: number of words in a sequence

    Returns:
        A dictionary that maps a tuple of n-words to its frequency
    """
    n_grams = {}
    for sentence in data:
        for i in range(len(sentence) - n + 1):
            n_gram = tuple(sentence[i:i + n])
            if n_gram in n_grams:
                n_grams[n_gram] += 1
            else:
                n_grams[n_gram] = 1
    return n_grams


In [6]:
def estimate_probability(word, previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary_size, k=5.0):
    """
    Estimate the probabilities of a next word using the n-gram counts with k-smoothing.
    
    Args:
        word: next word
        previous_n_gram: A sequence of words of length n
        n_gram_counts: Dictionary of counts of n-grams
        n_plus1_gram_counts: Dictionary of counts of (n+1)-grams
        vocabulary_size: number of words in the vocabulary
        k: positive constant, smoothing parameter
    
    Returns:
        A probability
    """
    previous_n_gram = tuple(previous_n_gram)
    previous_n_gram_count = n_gram_counts.get(previous_n_gram, 0)
    denominator = previous_n_gram_count + (k * vocabulary_size)
    n_plus1_gram = previous_n_gram + (word,)
    n_plus1_gram_count = n_plus1_gram_counts.get(n_plus1_gram, 0)
    numerator = n_plus1_gram_count + k
    probability = numerator / denominator
    return probability

def estimate_probabilities(previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary, end_token='<e>', unknown_token="<unk>", k=1.0):
    """
    Estimate the probabilities of next words using the n-gram counts with k-smoothing.
    
    Args:
        previous_n_gram: A sequence of words of length n
        n_gram_counts: Dictionary of counts of n-grams
        n_plus1_gram_counts: Dictionary of counts of (n+1)-grams
        vocabulary: List of words
        k: positive constant, smoothing parameter
    
    Returns:
        A dictionary mapping from next words to the probability.
    """
    previous_n_gram = tuple(previous_n_gram)
    vocabulary = set(vocabulary) | {end_token, unknown_token}
    vocabulary_size = len(vocabulary)
    probabilities = {}
    for word in vocabulary:
        probability = estimate_probability(word, previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary_size, k)
        probabilities[word] = probability
    return probabilities


In [7]:
def train(ngram_size, data, is_processed=False):
    if not is_processed:
        tokenized_data = prepare_data(data, ngram_size)
    else:
        tokenized_data = data

    n_gram_counts = count_n_grams(tokenized_data, ngram_size)
    n_plus1_gram_counts = count_n_grams(tokenized_data, ngram_size + 1)

    vocabulary = set([token for sentence in tokenized_data for token in sentence])
    vocabulary_size = len(vocabulary)

    log_probabilities = {}
    for n_gram in n_gram_counts:
        for word in vocabulary:
            n_plus1_gram = n_gram + (word,)
            probability = estimate_probability(word, n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary_size, k=1.0)
            log_probabilities[n_plus1_gram] = math.log(probability)

    return log_probabilities, vocabulary, n_gram_counts, n_plus1_gram_counts


In [8]:
def predict_ngram(sentence, model_probabilities, ngram_size=2, vocabulary=None):
    """
    Predicts the probability of a sentence using a pre-trained n-gram model.
    
    Args:
        sentence (str): The sentence to predict the probability for.
        model_probabilities (dict): Precomputed n-gram probabilities from the trained model.
        ngram_size (int): The size of the n-grams (2 for bigrams, 3 for trigrams).
        vocabulary (set): Set of known vocabulary words from training.
    
    Returns:
        float: The log probability of the sentence.
    """
    # tokenizing and handling unknown words
    tokenized_sentence = word_tokenize(sentence.lower())
    processed_sentence = [token if token in vocabulary else '<UNK>' for token in tokenized_sentence]
    start_tokens = ['<s>'] * (ngram_size - 1)
    end_tokens = ['<e>'] * (ngram_size - 1)
    processed_sentence = start_tokens + processed_sentence + end_tokens

    # Calculating the probability of the sentence
    log_probability = 0.0
    for i in range(len(processed_sentence) - ngram_size + 1):
        n_gram = tuple(processed_sentence[i:i + ngram_size])
        if n_gram in model_probabilities:
            log_probability += model_probabilities[n_gram]
        else:
            # Handle missing n-grams 
            log_probability += math.log(1.0 / len(vocabulary) ** ngram_size)

    return log_probability



In [10]:
import math
ngram_size = 2  
infile = '/kaggle/input/trainandtest/ngramv1.train.txt'
model_probabilities, vocabulary, n_gram_counts, n_plus1_gram_counts = train(ngram_size, infile)


sentence = "I AM SAM"
predicted_log_prob = predict_ngram(sentence, model_probabilities, ngram_size, vocabulary)
print("Log Probability of the sentence:", predicted_log_prob)


Number of sentences: 110
Sample tokenized sentences: [['i', 'am', 'sam', '.'], ['i', 'am', 'sam', '.'], ['sam', 'i', 'am', '.']]
Vocabulary sample: ['i', 'am', 'sam', '.', 'that', '!', 'do', 'not', 'like', 'would']
Processed sentences sample: [['<s>', 'i', 'am', 'sam', '.', '<e>']]
Log Probability of the sentence: -31.296184043425168


In [12]:
import math

def test_perplexity(test_file, n_gram_counts, n_plus1_gram_counts, ngram_size=2, k=1.0):
    """
    Calculate the perplexity of a language model on a given test corpus using pre-processed data.
    
    Args:
        test_file (str): Path to the test corpus file.
        n_gram_counts (dict): Dictionary of counts of n-grams.
        n_plus1_gram_counts (dict): Dictionary of counts of (n+1)-grams.
        ngram_size (int): The n-gram size (e.g., 2 for bigrams, 3 for trigrams).
        k (float): Smoothing parameter.
    
    Returns:
        float: The perplexity of the model on the test corpus.
    """
    processed_sentences = prepare_data(test_file, ngram_size)
    vocabulary = set(sum(processed_sentences, []))  
    
    total_log_prob = 0.0
    total_tokens = 0
    
    for sentence in processed_sentences:
        N = len(sentence) 
        for t in range(ngram_size, N):
            n_gram = tuple(sentence[t - ngram_size:t])
            word = sentence[t]
            probability = estimate_probability(word, n_gram, n_gram_counts, n_plus1_gram_counts, len(vocabulary), k)
            if probability > 0:
                total_log_prob += math.log(probability)
            else:
                total_log_prob += math.log(1.0 / len(vocabulary))
            total_tokens += 1

    perplexity = math.exp(-total_log_prob / total_tokens)
    return perplexity



In [13]:
ngram_size = 2  
test_file = '/kaggle/input/trainandtest/ngramv1.test.txt'  
perplexity = test_perplexity(test_file, n_gram_counts, n_plus1_gram_counts, ngram_size)
print(f"Perplexity of the model on the test corpus: {perplexity}")


Number of sentences: 22
Sample tokenized sentences: [['say', '!'], ['i', 'like', 'green', 'eggs', 'and', 'ham', '!'], ['i', 'do', '!']]
Vocabulary sample: ['!', 'i', 'like', 'and', 'them', ',', 'eat', 'in', 'a', '.']
Processed sentences sample: [['<s>', '<UNK>', '!', '<e>']]
Perplexity of the model on the test corpus: 15.132203778503982


## Handle big_data.txt

In [2]:
import nltk
from nltk.tokenize import TweetTokenizer

def split_to_sentences(data):
    """
    Split data by linebreak "\n" and clean the sentences.
    """
    sentences = data.split("\n")
    sentences = [s.strip() for s in sentences if s.strip()]
    return sentences

def count_words(tokenized_sentences):
    """
    Manually count the frequency of each word in the tokenized sentences.
    """
    word_counts = {}
    for sentence in tokenized_sentences:
        for token in sentence:
            if token not in word_counts:
                word_counts[token] = 1
            else:
                word_counts[token] += 1
    return word_counts

def get_words_with_nplus_frequency(tokenized_sentences, count_threshold):
    """
    Filter words that appear at least 'count_threshold' times.
    """
    closed_vocab = []
    word_counts = count_words(tokenized_sentences)
    for word, cnt in word_counts.items():
        if cnt >= count_threshold:
            closed_vocab.append(word)
    return closed_vocab

def prepare_data(infile, ngram_size=2):
    """
    Prepare data for n-gram modeling by reading from a file, tokenizing, normalizing, 
    and adding start/end tokens.
    """
    with open(infile, 'r', encoding='utf-8') as file:
        text = file.read()

    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)

    sentences = split_to_sentences(text)
    print("Number of sentences:", len(sentences))  

    tokenized_sentences = [tokenizer.tokenize(sentence) for sentence in sentences]
    print("Sample tokenized sentences:", tokenized_sentences[:3])  

    vocabulary = get_words_with_nplus_frequency(tokenized_sentences, 3)
    print("Vocabulary sample:", list(vocabulary)[:10])  

    processed_sentences = []
    for sentence in tokenized_sentences:
        processed_sentence = [token if token in vocabulary else '<UNK>' for token in sentence]
        start_tokens = ['<s>'] * (ngram_size - 1)
        end_tokens = ['<e>'] * (ngram_size - 1)
        full_sentence = start_tokens + processed_sentence + end_tokens  
        processed_sentences.append(full_sentence)

    print("Processed sentences sample:", processed_sentences[:1])  
    return processed_sentences, vocabulary

infile = '/kaggle/input/testtesttest/big.txt'  
processed_data, vocab = prepare_data(infile, 2)


Number of sentences: 103501
Sample tokenized sentences: [['the', 'project', 'gutenberg', 'ebook', 'of', 'the', 'adventures', 'of', 'sherlock', 'holmes'], ['by', 'sir', 'arthur', 'conan', 'doyle'], ['(', '#15', 'in', 'our', 'series', 'by', 'sir', 'arthur', 'conan', 'doyle', ')']]
Vocabulary sample: ['the', 'project', 'gutenberg', 'ebook', 'of', 'adventures', 'sherlock', 'holmes', 'by', 'sir']
Processed sentences sample: [['<s>', 'the', 'project', 'gutenberg', 'ebook', 'of', 'the', 'adventures', 'of', 'sherlock', 'holmes', '<e>']]


In [3]:
def count_n_grams(data, n):
    """
    Count all n-grams in the data.
    Args:
        data: List of lists of words, each list representing tokenized sentences.
        n: number of words in an n-gram
    Returns:
        A dictionary that maps a tuple of n-words to its frequency
    """
    n_grams = {}
    for sentence in data:
        for i in range(len(sentence) - n + 1):
            n_gram = tuple(sentence[i:i + n])
            if n_gram in n_grams:
                n_grams[n_gram] += 1
            else:
                n_grams[n_gram] = 1
    return n_grams




In [4]:
import math

def estimate_probability(word, previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary_size, k=1.0):
    """
    Estimate the probabilities of a next word using the n-gram counts with k-smoothing.
    Args:
        word: next word
        previous_n_gram: A sequence of words of length n
        n_gram_counts: Dictionary of counts of n-grams
        n_plus1_gram_counts: Dictionary of counts of (n+1)-grams
        vocabulary_size: number of words in the vocabulary
        k: positive constant, smoothing parameter
    Returns:
        A probability
    """
    previous_n_gram = tuple(previous_n_gram)
    previous_n_gram_count = n_gram_counts.get(previous_n_gram, 0)
    denominator = previous_n_gram_count + (k * vocabulary_size)
    n_plus1_gram = previous_n_gram + (word,)
    n_plus1_gram_count = n_plus1_gram_counts.get(n_plus1_gram, 0)
    numerator = n_plus1_gram_count + k
    probability = numerator / denominator
    return probability


In [5]:
import math

def train(ngram_size, data, is_processed=False):
    """
    Train an n-gram model and calculate log probabilities with add-k smoothing.
    Args:
        ngram_size (int): The n-gram size (2 for bigram, 3 for trigram).
        data (str or list): Path to the training corpus file or preprocessed data.
        is_processed (bool): Flag to indicate if the data is already preprocessed.
    Returns:
        tuple: A dictionary with n-gram tuples as keys and log probabilities as values, and the vocabulary.
    """
    if not is_processed:
        tokenized_data, vocabulary = prepare_data(data, ngram_size)  # assuming prepare_data returns vocabulary
    else:
        tokenized_data, vocabulary = data

    n_gram_counts = count_n_grams([tuple(sentence) for sentence in tokenized_data], ngram_size)  # Convert lists to tuples
    n_plus1_gram_counts = count_n_grams([tuple(sentence) for sentence in tokenized_data], ngram_size + 1)  # Convert lists to tuples
    vocabulary_size = len(vocabulary)

    log_probabilities = {}
    for n_gram in n_gram_counts:
        for word in vocabulary:
            probability = estimate_probability(word, n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary_size, k=1.0)
            key = n_gram + (word,)
            log_probabilities[key] = math.log(probability)

    return log_probabilities, vocabulary

In [6]:
import numpy as np

def generate_text(model_probabilities, ngram_size=2, start_token='<s>', end_token='<e>'):
    """
    Generate text using an n-gram model.
    Args:
        model_probabilities (dict): A dictionary where keys are tuples representing n-grams
                                    and values are probabilities of these n-grams.
        ngram_size (int): The size of the n-grams used in the model.
        start_token (str): The token that denotes the start of a sentence.
        end_token (str): The token that denotes the end of a sentence.
    Returns:
        str: Generated text.
    """
    current_tokens = [start_token] * (ngram_size - 1)
    sentence = []

    while True:
        possible_tokens = {}
        for n_gram, prob in model_probabilities.items():
            if tuple(current_tokens) == n_gram[:ngram_size - 1]:
                possible_tokens[n_gram[-1]] = prob

        if not possible_tokens:
            break

        next_token = np.random.choice(list(possible_tokens.keys()), p=[prob/sum(possible_tokens.values()) for prob in possible_tokens.values()])

        if next_token == end_token:
            break

        sentence.append(next_token)
        current_tokens = current_tokens[1:] + [next_token]

    return ' '.join(sentence)


In [None]:

infile = '/kaggle/input/trainandtest/ngramv1.train.txt'  # Make sure this path is correct and the file exists

ngram_size = 2  

print("Training the model...")
model_probabilities, vocabulary = train(ngram_size, infile, is_processed=False)
print("Training completed.")

print("Generating text...")
generated_text = generate_text(model_probabilities, ngram_size=2)
print("Generated Text:", generated_text)


Training the model...
Number of sentences: 110
Sample tokenized sentences: [['i', 'am', 'sam', '.'], ['i', 'am', 'sam', '.'], ['sam', 'i', 'am', '.']]
Vocabulary sample: ['i', 'am', 'sam', '.', 'that', '!', 'do', 'not', 'like', 'would']
Processed sentences sample: [['<s>', 'i', 'am', 'sam', '.', '<e>']]
Training completed.
Generating text...
