In [1]:
import math
import random
import numpy as np
import pandas as pd
import nltk

In [2]:
with open("en_US.twitter.txt", "r") as f:
    data = f.read()
print("Data type:", type(data))
print("Number of letters:", len(data))

Data type: <class 'str'>
Number of letters: 3341555


In [3]:
def split_to_sentences(data):
    """
    Split data by linebreak "\n"    
    Args:
        data: str
    
    Returns:
        A list of sentences
    """
    
    sentences = data.split('\n')
    sentences = [s.strip() for s in sentences]
    sentences = [s for s in sentences if len(s) > 0]
    
    return sentences    

In [4]:
def tokenize_sentences(sentences):
    """
    Tokenize sentences into tokens (words)
    
    Args:
        sentences: List of strings
    
    Returns:
        List of lists of tokens
    """
    tokenized_sentences = []
    for sentence in sentences:        
        sentence = sentence.lower()
        tokenized = nltk.word_tokenize(sentence)
        tokenized_sentences.append(tokenized)
    return tokenized_sentences

In [5]:
def get_tokenized_data(data):
    """
    Make a list of tokenized sentences
    
    Args:
        data: String
    
    Returns:
        List of lists of tokens
    """
    sentences = split_to_sentences(data)
    tokenized_sentences = tokenize_sentences(sentences)
    return tokenized_sentences

In [6]:
tokenized_data = get_tokenized_data(data)
random.seed(87)
random.shuffle(tokenized_data)

train_size = int(len(tokenized_data) * 0.8)
train_data = tokenized_data[0:train_size]
test_data = tokenized_data[train_size:]

In [7]:
print(len(train_data))

38368


In [8]:
def count_words(tokenized_sentences):
    """
    Count the number of word appearence in the tokenized sentences
    
    Args:
        tokenized_sentences: List of lists of strings
    
    Returns:
        dict that maps word (str) to the frequency (int)
    """
    word_counts = {}
    for sentence in tokenized_sentences:
        for token in sentence:
            word_counts[token] = word_counts.get(token,0) + 1
    
    return word_counts

In [9]:
def get_words_with_nplus_frequency(tokenized_sentences, count_threshold):
    """
    Find the words that appear N times or more
    
    Args:
        tokenized_sentences: List of lists of sentences
        count_threshold: minimum number of occurrences for a word to be in the closed vocabulary.
    
    Returns:
        List of words that appear N times or more
    """
    closed_vocab = []
    word_counts = count_words(tokenized_sentences)
    for word, cnt in word_counts.items():
        if cnt >= count_threshold:
            closed_vocab.append(word)
    return closed_vocab

In [10]:
def replace_oov_words_by_unk(tokenized_sentences, vocabulary, unknown_token="<unk>"):
    """
    Replace words not in the given vocabulary with '<unk>' token.
    
    Args:
        tokenized_sentences: List of lists of strings
        vocabulary: List of strings that we will use
        unknown_token: A string representing unknown (out-of-vocabulary) words
    
    Returns:
        List of lists of strings, with words not in the vocabulary replaced
    """
    vocabulary = set(vocabulary)
    replaced_tokenized_sentences = []
    for sentence in tokenized_sentences:
        replaced_sentence = []
        for token in sentence:
            if token in vocabulary: 
                replaced_sentence.append(token)
            else:
                replaced_sentence.append(unknown_token)
                
        replaced_tokenized_sentences.append(replaced_sentence)
    return replaced_tokenized_sentences

In [11]:
def preprocess_data(train_data, test_data, count_threshold):
    """
    Preprocess data, i.e.,
        - Find tokens that appear at least N times in the training data.
        - Replace tokens that appear less than N times by "<unk>" both for training and test data.        
    Args:
        train_data, test_data: List of lists of strings.
        count_threshold: Words whose count is less than this are 
                      treated as unknown.
    
    Returns:
        Tuple of
        - training data with low frequent words replaced by "<unk>"
        - test data with low frequent words replaced by "<unk>"
        - vocabulary of words that appear n times or more in the training data
    """
    vocabulary = get_words_with_nplus_frequency(train_data, count_threshold)
    train_data_replaced = replace_oov_words_by_unk(train_data, vocabulary, unknown_token="<unk>")
    test_data_replaced = replace_oov_words_by_unk(test_data, vocabulary, unknown_token="<unk>")
    return train_data_replaced, test_data_replaced, vocabulary

In [12]:
minimum_freq = 2
train_data_processed, test_data_processed, vocabulary = preprocess_data(train_data,test_data,minimum_freq)

In [13]:
print("First preprocessed training sample:")
print(train_data_processed[0])
print()
print("First preprocessed test sample:")
print(test_data_processed[0])
print()
print("First 10 vocabulary:")
print(vocabulary[0:10])
print()
print("Size of vocabulary:", len(vocabulary))
print("Size of Train Data:", len(train_data_processed))
print("Size of Test Data:", len(test_data_processed))

First preprocessed training sample:
['i', 'personally', 'would', 'like', 'as', 'our', 'official', 'glove', 'of', 'the', 'team', 'local', 'company', 'and', 'quality', 'production']

First preprocessed test sample:
['that', 'picture', 'i', 'just', 'seen', 'whoa', 'dere', '!', '!', '>', '>', '>', '>', '>', '>', '>']

First 10 vocabulary:
['i', 'personally', 'would', 'like', 'as', 'our', 'official', 'glove', 'of', 'the']

Size of vocabulary: 14874
Size of Train Data: 38368
Size of Test Data: 9593


In [14]:
def count_n_grams(data, n, start_token='<s>', end_token = '<e>'):
    """
    Count all n-grams in the data
    
    Args:
        data: List of lists of words
        n: number of words in a sequence
    
    Returns:
        A dictionary that maps a tuple of n-words to its frequency
    """
    n_grams = {}
    for sentence in data:
        sentence = ['<s>']*n + sentence + ['<e>']
        sentence = tuple(sentence)
        
        for i in range(len(sentence) - n + 1):
            n_gram = sentence[i:i+n]
            n_grams[n_gram] = n_grams.get(n_gram,0) + 1
            
    return n_grams

In [15]:
def estimate_probability(word, previous_n_gram, 
                         n_gram_counts, n_plus1_gram_counts, vocabulary_size, k=1.0):
    """
    Estimate the probabilities of a next word using the n-gram counts with k-smoothing
    
    Args:
        word: next word
        previous_n_gram: A sequence of words of length n
        n_gram_counts: Dictionary of counts of n-grams
        n_plus1_gram_counts: Dictionary of counts of (n+1)-grams
        vocabulary_size: number of words in the vocabulary
        k: positive constant, smoothing parameter
    
    Returns:
        A probability
    """
    previous_n_gram = tuple(previous_n_gram)
    previous_n_gram_count = n_gram_counts.get(previous_n_gram,0)
    denominator = previous_n_gram_count + k*vocabulary_size
    n_plus1_gram = previous_n_gram + (word,)  
    n_plus1_gram_count = n_plus1_gram_counts.get(n_plus1_gram,0)
        
    numerator = n_plus1_gram_count + k

    probability = numerator/denominator
    
    return probability

In [16]:
def estimate_probabilities(previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary, k=1.0):
    """
    Estimate the probabilities of next words using the n-gram counts with k-smoothing
    
    Args:
        previous_n_gram: A sequence of words of length n
        n_gram_counts: Dictionary of counts of n-grams
        n_plus1_gram_counts: Dictionary of counts of (n+1)-grams
        vocabulary: List of words
        k: positive constant, smoothing parameter
    
    Returns:
        A dictionary mapping from next words to the probability.
    """
    
    previous_n_gram = tuple(previous_n_gram)
    vocabulary = vocabulary + ["<e>", "<unk>"]
    vocabulary_size = len(vocabulary)
    
    probabilities = {}
    for word in vocabulary:
        probability = estimate_probability(word, previous_n_gram, 
                                           n_gram_counts, n_plus1_gram_counts, 
                                           vocabulary_size, k=k)
        probabilities[word] = probability

    return probabilities

In [17]:
def suggest_a_word(previous_tokens, n_gram_counts, n_plus1_gram_counts, vocabulary, k=1.0, start_with=None):
    """
    Get suggestion for the next word
    
    Args:
        previous_tokens: The sentence you input where each token is a word. Must have length > n 
        n_gram_counts: Dictionary of counts of n-grams
        n_plus1_gram_counts: Dictionary of counts of (n+1)-grams
        vocabulary: List of words
        k: positive constant, smoothing parameter
        start_with: If not None, specifies the first few letters of the next word
        
    Returns:
        A tuple of 
          - string of the most likely next word
          - corresponding probability
    """

    n = len(list(n_gram_counts.keys())[0]) 
    previous_n_gram = previous_tokens[-n:]
    probabilities = estimate_probabilities(previous_n_gram,
                                           n_gram_counts, n_plus1_gram_counts,
                                           vocabulary, k=k)
    suggestion = None
    max_prob = 0
    
    for word, prob in probabilities.items():    
        # If the optional start_with string is set
        if start_with:
            if not word.startswith(start_with):
                continue
        
        if prob > max_prob:
            suggestion = word
            max_prob = prob
            
    return suggestion, max_prob

In [18]:
def get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0, start_with=None):
    model_counts = len(n_gram_counts_list)
    suggestions = []
    for i in range(model_counts-1):
        n_gram_counts = n_gram_counts_list[i]
        n_plus1_gram_counts = n_gram_counts_list[i+1]
        
        suggestion = suggest_a_word(previous_tokens, n_gram_counts,
                                    n_plus1_gram_counts, vocabulary,
                                    k=k, start_with=start_with)
        suggestions.append(suggestion)
    return suggestions

In [19]:
n_gram_counts_list = []
for n in range(1, 6):
    print("Computing n-gram counts with n =", n, "...")
    n_model_counts = count_n_grams(train_data_processed, n)
    n_gram_counts_list.append(n_model_counts)

Computing n-gram counts with n = 1 ...
Computing n-gram counts with n = 2 ...
Computing n-gram counts with n = 3 ...
Computing n-gram counts with n = 4 ...
Computing n-gram counts with n = 5 ...


In [20]:
previous_tokens = ["i", "am", "to"]
tmp_suggest4 = get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0)

print(f"The previous words are {previous_tokens}, the suggestions are:")
display(tmp_suggest4)

The previous words are ['i', 'am', 'to'], the suggestions are:


[('be', 0.027612481857764878),
 ('have', 0.00013439053890606102),
 ('have', 0.00013442667025137788),
 ('i', 6.722237160527024e-05)]

In [21]:
previous_tokens = ["i", "want", "to", "go"]
tmp_suggest5 = get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0)

print(f"The previous words are {previous_tokens}, the suggestions are:")
display(tmp_suggest5)

The previous words are ['i', 'want', 'to', 'go'], the suggestions are:


[('to', 0.014005602240896359),
 ('to', 0.004681524462613741),
 ('to', 0.0009390931043734908),
 ('to', 0.0004030091348737238)]

In [22]:
previous_tokens = ["hey", "how", "are"]
tmp_suggest6 = get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0)

print(f"The previous words are {previous_tokens}, the suggestions are:")
display(tmp_suggest6)

The previous words are ['hey', 'how', 'are'], the suggestions are:


[('you', 0.023356253906915952),
 ('you', 0.0035468112159539582),
 ('you', 0.0001344357061235464),
 ('i', 6.722237160527024e-05)]

In [23]:
previous_tokens = ["hey", "how", "are", "you"]
tmp_suggest7 = get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0)

print(f"The previous words are {previous_tokens}, the suggestions are:")
display(tmp_suggest7)

The previous words are ['hey', 'how', 'are', 'you'], the suggestions are:


[("'re", 0.023949665110614977),
 ('?', 0.002878450870077195),
 ('?', 0.001607717041800643),
 ('<e>', 0.0001344357061235464)]

In [24]:
with open("big.txt", "r") as f:
    data2 = f.read()
print("Data type:", type(data2))
print("Number of letters:", len(data2))

Data type: <class 'str'>
Number of letters: 6488666


In [25]:
tokenized_data2 = get_tokenized_data(data2)
random.seed(87)
random.shuffle(tokenized_data2)

train_size2 = int(len(tokenized_data2) * 0.8)
train_data2 = tokenized_data[0:train_size2]
test_data2 = tokenized_data[train_size2:]
print(len(train_data2))

47961


In [26]:
minimum_freq = 2
train_data_processed2, test_data_processed2, vocabulary2 = preprocess_data(train_data2,test_data2,minimum_freq)

In [27]:
n_gram_counts_list2 = []
for n in range(1, 6):
    print("Computing n-gram counts with n =", n, "...")
    n_model_counts2 = count_n_grams(train_data_processed2, n)
    n_gram_counts_list2.append(n_model_counts2)

Computing n-gram counts with n = 1 ...
Computing n-gram counts with n = 2 ...
Computing n-gram counts with n = 3 ...
Computing n-gram counts with n = 4 ...
Computing n-gram counts with n = 5 ...


In [28]:
previous_tokens2 = ["hey", "how", "are", "you"]
tmp_suggest7 = get_suggestions(previous_tokens2, n_gram_counts_list2, vocabulary2, k=1.0)

print(f"The previous words are {previous_tokens2}, the suggestions are:")
display(tmp_suggest7)

previous_tokens2 = ["hey", "how", "are"]
tmp_suggest6 = get_suggestions(previous_tokens2, n_gram_counts_list2, vocabulary2, k=1.0)

print(f"The previous words are {previous_tokens2}, the suggestions are:")
display(tmp_suggest6)

previous_tokens2 = ["i", "want", "to", "go"]
tmp_suggest6 = get_suggestions(previous_tokens2, n_gram_counts_list2, vocabulary2, k=1.0)

print(f"The previous words are {previous_tokens2}, the suggestions are:")
display(tmp_suggest6)

previous_tokens2 = ["i", "am", "to"]
tmp_suggest6 = get_suggestions(previous_tokens2, n_gram_counts_list2, vocabulary2, k=1.0)

print(f"The previous words are {previous_tokens2}, the suggestions are:")
display(tmp_suggest6)

The previous words are ['hey', 'how', 'are', 'you'], the suggestions are:


[("'re", 0.02544259151457042),
 ('?', 0.0030375974323704723),
 ('?', 0.001762114537444934),
 ('<e>', 0.00011793148180906893)]

The previous words are ['hey', 'how', 'are'], the suggestions are:


[('you', 0.0241586301909073),
 ('you', 0.003990376151634294),
 ('you', 0.00011793148180906893),
 ('i', 5.896921806816842e-05)]

The previous words are ['i', 'want', 'to', 'go'], the suggestions are:


[('to', 0.015481216845296093),
 ('to', 0.005022224787854298),
 ('to', 0.0008825606024947046),
 ('home', 0.0003534817956875221)]

The previous words are ['i', 'am', 'to'], the suggestions are:


[('be', 0.028889906856008552),
 ('have', 0.00011787587670183298),
 ('have', 0.00011791757561464536),
 ('i', 5.896921806816842e-05)]