
## BIG.DATA.TXT

In [3]:
import nltk
from nltk.tokenize import TweetTokenizer
from collections import Counter

def split_to_sentences(data):
    """Split data by linebreak "\n" and clean the sentences."""
    sentences = data.split("\n")
    return [s.strip() for s in sentences if s.strip()]

def prepare_data_two_pass(infile, ngram_size=2, freq_threshold=3):
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)

    # First pass to build vocabulary
    word_counts = Counter()
    with open(infile, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if line:
                tokens = tokenizer.tokenize(line)
                word_counts.update(tokens)

    # Filter words based on the frequency threshold
    vocabulary = {word for word, count in word_counts.items() if count >= freq_threshold}

    # Second pass to process sentences and apply vocabulary filter
    processed_sentences = []
    with open(infile, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if line:
                tokens = tokenizer.tokenize(line)
                processed_sentence = [token if token in vocabulary else '<UNK>' for token in tokens]
                # Add start and end tokens
                processed_sentence = ['<s>'] * (ngram_size - 1) + processed_sentence + ['<e>'] * (ngram_size - 1)
                processed_sentences.append(processed_sentence)

    return processed_sentences, vocabulary

# Example usage
infile = '/content/sample_data/en_US.twitter.txt'
processed_data, vocab = prepare_data_two_pass(infile, 2)


In [4]:
def count_n_grams(data, n):
    """
    Count all n-grams in the data.
    Args:
        data: List of lists of words, each list representing tokenized sentences.
        n: number of words in an n-gram
    Returns:
        A dictionary that maps a tuple of n-words to its frequency
    """
    n_grams = {}
    for sentence in data:
        # Ensure the entire list from i to i+n is converted into a tuple
        sentence_n_grams = [tuple(sentence[i:i + n]) for i in range(len(sentence) - n + 1)]
        for n_gram in sentence_n_grams:
            if n_gram in n_grams:
                n_grams[n_gram] += 1
            else:
                n_grams[n_gram] = 1
    return n_grams


In [5]:
import math

def estimate_probability(word, previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary_size, k=1.0):
    """
    Estimate the probabilities of a next word using the n-gram counts with k-smoothing.
    Args:
        word: next word
        previous_n_gram: A sequence of words of length n
        n_gram_counts: Dictionary of counts of n-grams
        n_plus1_gram_counts: Dictionary of counts of (n+1)-grams
        vocabulary_size: number of words in the vocabulary
        k: positive constant, smoothing parameter
    Returns:
        A probability
    """
    previous_n_gram = tuple(previous_n_gram)
    previous_n_gram_count = n_gram_counts.get(previous_n_gram, 0)
    denominator = previous_n_gram_count + (k * vocabulary_size)
    n_plus1_gram = previous_n_gram + (word,)
    n_plus1_gram_count = n_plus1_gram_counts.get(n_plus1_gram, 0)
    numerator = n_plus1_gram_count + k
    probability = numerator / denominator
    return probability


In [6]:
def train(ngram_size, infile, is_processed=False):
    """
    Train an n-gram model and calculate log probabilities with add-k smoothing.

    Args:
        ngram_size (int): The n-gram size (e.g., 2 for bigram, 3 for trigram).
        infile (str): Path to the training corpus file or preprocessed data.
        is_processed (bool): Flag to indicate if the data is already preprocessed.

    Returns:
        tuple: A dictionary with n-gram tuples as keys and log probabilities as values, and the vocabulary.
    """
    if not is_processed:
        processed_data, vocabulary = prepare_data_two_pass(infile, ngram_size)
    else:
        processed_data, vocabulary = infile  # assuming infile is a tuple (data, vocab) when is_processed=True

    n_gram_counts = count_n_grams(processed_data, ngram_size)
    n_plus1_gram_counts = count_n_grams(processed_data, ngram_size + 1)

    vocabulary_size = len(vocabulary)
    log_probabilities = {}
    for n_gram in n_gram_counts:
        for word in vocabulary:
            probability = estimate_probability(word, n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary_size)
            log_probabilities[n_gram + (word,)] = math.log(probability)

    return log_probabilities, vocabulary


In [7]:
import numpy as np

def generate_text(model_probabilities, ngram_size=2, start_token='<s>', end_token='<e>'):
    """
    Generate text using an n-gram model.
    Args:
        model_probabilities (dict): A dictionary where keys are tuples representing n-grams
                                    and values are probabilities of these n-grams.
        ngram_size (int): The size of the n-grams used in the model.
        start_token (str): The token that denotes the start of a sentence.
        end_token (str): The token that denotes the end of a sentence.
    Returns:
        str: Generated text.
    """
    current_tokens = [start_token] * (ngram_size - 1)
    sentence = []

    while True:
        possible_tokens = {}
        for n_gram, prob in model_probabilities.items():
            if tuple(current_tokens) == n_gram[:ngram_size - 1]:
                possible_tokens[n_gram[-1]] = prob

        if not possible_tokens:
            break

        next_token = np.random.choice(list(possible_tokens.keys()), p=[prob/sum(possible_tokens.values()) for prob in possible_tokens.values()])

        if next_token == end_token:
            break

        sentence.append(next_token)
        current_tokens = current_tokens[1:] + [next_token]

    return ' '.join(sentence)


In [8]:
# Example usage
infile = '/content/sample_data/ngramv1.train.txt'
ngram_size = 2

# Train the model
print("Training the model...")
model_probabilities, vocabulary = train(ngram_size=ngram_size, infile=infile, is_processed=False)
print("Training completed.")

# generate text
print("Generating text...")
generated_text = generate_text(model_probabilities, ngram_size=2)
print("Generated Text:", generated_text)


Training the model...
Training completed.
Generating text...


KeyboardInterrupt: 



##Trying to train using chunks


In [9]:
import numpy as np
import math
from nltk.tokenize import TweetTokenizer
from collections import Counter

In [10]:
def read_in_chunks(file_path, chunk_size=1024 * 1024):
    """Lazy function to read a file piece by piece. Default chunk size: 1MB."""
    with open(file_path, 'r', encoding='utf-8') as file:
        while True:
            chunk = file.readlines(chunk_size)
            if not chunk:
                break
            yield chunk


In [11]:
def build_vocabulary(file_path, tokenizer, chunk_size=1024 * 1024):
    word_counts = Counter()
    for chunk in read_in_chunks(file_path, chunk_size):
        for line in chunk:
            tokens = tokenizer.tokenize(line.strip())
            word_counts.update(tokens)
    # Only retain words that appear at least 3 times
    return {word for word, count in word_counts.items() if count >= 3}


In [12]:
def process_sentences(file_path, tokenizer, vocabulary, ngram_size, chunk_size=1024 * 1024):
    processed_sentences = []
    for chunk in read_in_chunks(file_path, chunk_size):
        for line in chunk:
            tokens = tokenizer.tokenize(line.strip())
            processed_sentence = [token if token in vocabulary else '<UNK>' for token in tokens]
            # Add start and end tokens
            processed_sentence = ['<s>'] * (ngram_size - 1) + processed_sentence + ['<e>'] * (ngram_size - 1)
            processed_sentences.append(processed_sentence)
    return processed_sentences


In [13]:
def count_n_grams(data, n):
    """
    Count all n-grams in the data.
    Args:
        data: List of lists of words, each list representing tokenized sentences.
        n: Number of words in an n-gram.
    Returns:
        A dictionary that maps a tuple of n-words to its frequency.
    """
    n_grams = {}
    for sentence in data:
        # Create n-grams from the sentence
        sentence_n_grams = [tuple(sentence[i:i + n]) for i in range(len(sentence) - n + 1)]
        for n_gram in sentence_n_grams:
            if n_gram in n_grams:
                n_grams[n_gram] += 1
            else:
                n_grams[n_gram] = 1
    return n_grams


In [14]:
def estimate_probability(word, previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary_size, k=1.0):
    """
    Estimate the probabilities of a next word using the n-gram counts with k-smoothing.
    Args:
        word: Next word to estimate probability for.
        previous_n_gram: A tuple representing the previous words.
        n_gram_counts: Dictionary of n-gram counts.
        n_plus1_gram_counts: Dictionary of (n+1)-gram counts.
        vocabulary_size: Total number of words in the vocabulary.
        k: Smoothing parameter.
    Returns:
        The estimated probability.
    """
    previous_n_gram = tuple(previous_n_gram)
    previous_n_gram_count = n_gram_counts.get(previous_n_gram, 0)
    denominator = previous_n_gram_count + k * vocabulary_size
    n_plus1_gram = previous_n_gram + (word,)
    n_plus1_gram_count = n_plus1_gram_counts.get(n_plus1_gram, 0)
    numerator = n_plus1_gram_count + k
    probability = numerator / denominator
    return probability


In [15]:
def train(ngram_size, infile, chunk_size=1024 * 1024):
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    vocabulary = build_vocabulary(infile, tokenizer, chunk_size)
    processed_data = process_sentences(infile, tokenizer, vocabulary, ngram_size, chunk_size)

    n_gram_counts = count_n_grams(processed_data, ngram_size)
    n_plus1_gram_counts = count_n_grams(processed_data, ngram_size + 1)

    vocabulary_size = len(vocabulary)
    log_probabilities = {}
    for n_gram in n_gram_counts:
        for word in vocabulary:
            probability = estimate_probability(word, n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary_size)
            log_probabilities[n_gram + (word,)] = math.log(probability)

    return log_probabilities, vocabulary


In [16]:
import numpy as np

def generate_text(model_probabilities, ngram_size=2, start_token='<s>', end_token='<e>', max_length=100):
    """
    Generate text using an n-gram model.
    Args:
        model_probabilities (dict): A dictionary where keys are tuples representing n-grams
                                    and values are probabilities of these n-grams.
        ngram_size (int): The size of the n-grams used in the model.
        start_token (str): The token that denotes the start of a sentence.
        end_token (str): The token that denotes the end of a sentence.
        max_length (int): Maximum length of the generated sentence to prevent infinite loops.
    Returns:
        str: Generated text.
    """
    current_tokens = [start_token] * (ngram_size - 1)
    sentence = []

    for _ in range(max_length):
        possible_tokens = {}
        for n_gram, log_prob in model_probabilities.items():
            if tuple(current_tokens) == n_gram[:ngram_size - 1]:
                possible_tokens[n_gram[-1]] = np.exp(log_prob)  # Convert log probabilities back to probabilities

        if not possible_tokens:
            break

        next_tokens = list(possible_tokens.keys())
        probabilities = [possible_tokens[next] for next in next_tokens]
        probabilities /= np.sum(probabilities)  # Normalize to form a probability distribution

        next_token = np.random.choice(next_tokens, p=probabilities)

        if next_token == end_token:
            break

        sentence.append(next_token)
        current_tokens = current_tokens[1:] + [next_token]

    return ' '.join(sentence)


In [17]:
infile = '/content/sample_data/ngramv1.train.txt'
ngram_size = 2

# Train the model
print("Training the model...")
model_probabilities, vocabulary = train(ngram_size=ngram_size, infile=infile)
print("Training completed.")

# Generate text
print("Generating text...")
generated_text = generate_text(model_probabilities, ngram_size=ngram_size)
print("Generated Text:", generated_text)


Training the model...
Training completed.
Generating text...
Generated Text: eggs ham would say train mouse say do goat let the try let could will goat train mouse goat eggs dark dark green see box say do them anywhere let on a that ham rain the sam sam could , on eat do let be be and you fox sam ! , goat eat here them like mouse eat am the anywhere anywhere mouse me rain rain ham like sam green ? the car i car i i fox house do tree fox not say anywhere them mouse box goat with fox here ? see there dark dark green you
