In [3]:
# Load the libraries
import re
import sys
from random import random
from math import log
import numpy as np
from collections import defaultdict
import matplotlib.pyplot as plt
import copy

%matplotlib inline

# for displaying multiple outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### 0 .Preprocess functions

In [8]:
def preprocess_line(line):
    '''pre-process each line by removing the 
    characters that are not in the English alphabet, 
    space, zero, and dot.'''
    line = re.sub(r'\d', '0',line)
    line = re.sub(r'[^A-Za-z0. ]','',line)
    line = line.lower()
    line = "##" + line + "#"
    return line

### 1. Generating Language Model

In [10]:
def file_to_trigrams_dict(filename):
    '''Input: name of the file
    output: a) tri_counts returns the trigram counts in a dictionary
    b) trigrams: list of trigrams
    c) nested_trigrams_counts: returns the trigrams counts in nested dictionary
    '''
    # create an empty dictionary to counts trigrams
    tri_counts = defaultdict(int)
    # create an empty list to strore the trigrams
    trigrams = []
    # create an empty dicitionary for the nested tri-gram model
    nested_trigrams_counts = defaultdict(lambda: defaultdict(int))
    with open(filename) as f:
        for line in f:
            line = preprocess_line(line)
            # create trigrams from the file
            for j in range(len(line)-(3)):
                trigram = line[j:j+3]
                trigrams.append(trigram)
                tri_counts[trigram] += 1
                
                # get the bigram key value
                bigram_key = trigram[0:2]
                bigram_key_value_unigram_key = trigram[2]
                nested_trigrams_counts[bigram_key][bigram_key_value_unigram_key] += 1
    return (tri_counts, trigrams, nested_trigrams_counts)

def gen_all_trigrams_with_smoothing(br_trigrams, language_nested_trigrams, alpha = 1):
    '''To the existing nested trigrams from the training corpus,
    add all other possible trigrams. Plus add the smoothing value to it.'''
    # create an new copy of the dictionary to contain all possible trigrams with smoothing
    nested_trigrams = copy.deepcopy(language_nested_trigrams)
    # create an list of all possible trigrams
    model_trigrams = list(model_br_trigrams.keys())
    # if the trigram doesn't exist in the existing nested_trigrams, create then with value alpha,
    # otherwise just increment with alpha 
    for trigram in model_trigrams:
        key = trigram[0:2]
        nested_key = trigram[2]
        nested_trigrams[key][nested_key] = nested_trigrams[key][nested_key] + alpha
    return nested_trigrams

def estimated_prob(trigrams_with_smoothing):
    """Convert counts to probabilities
    Input: Nested dictionary with counts
    Output: Nested dictionary with probabilities
    """
    trigrams_prob = copy.deepcopy(trigrams_with_smoothing)
    for key in trigrams_prob:
        normalize_constant = np.sum(list(trigrams_prob[key].values()))
        for nested_key in trigrams_prob[key]:
            trigrams_prob[key][nested_key] = trigrams_prob[key][nested_key]/normalize_constant
    return trigrams_prob

def flatten_dict(nested_dict):
    '''convert the nested dic to a normal dict
    by joining the nested keys to store it in a text file'''
    #create an empty dict to store the resultant flattened dict
    flattened_dict = defaultdict(int)
    
    for outer_key, outer_value in nested_dict.items():
        for nested_key, nested_value in outer_value.items():
            key = outer_key+nested_key
            flattened_dict[key] = nested_value
    
    return flattened_dict

#### 1.1 Load data and generate trigrams counts

In [24]:
# load the data - We have three languages English, Spanish, German
training_data = ["training.en", "training.es", "training.de"]

all_lang_tri_counts = [file_to_trigrams_dict(language_file) for language_file in training_data]

#### 1.2 Load trained model

In [None]:
# load the given trained model
# create an empty dictionary to load the given model
model_br_trigrams = defaultdict(int)

model_file = "model-br.en"

# compile the pattern for each trigram and its probability
model_pattern = re.compile("[^\t]+")

with open(model_file) as f:
    for line in f:
        find_res = model_pattern.findall(line)
        model_br_trigrams[find_res[0]] = find_res[1][0:-1] # -1 to exclude the new line character

#### 1.3 Final Language model

Apply smoothing and convert counts to probabilities

In [25]:
# Smoothing for english
english_trigrams_with_smoothing = gen_all_trigrams_with_smoothing(
                                model_br_trigrams, english_nested_trigrams)
# Probabilities for english
english_trigrams_prob = estimated_prob(english_trigrams_with_smoothing)

# Smoothing and Probabilities for spanish
spanish_nested_trigrams = all_lang_tri_counts[1][2]
spanish_trigrams_with_smoothing = gen_all_trigrams_with_smoothing(
                                model_br_trigrams, spanish_nested_trigrams)
# 
spanish_trigrams_prob = estimated_prob(spanish_trigrams_with_smoothing)

# Smoothing and Probabilities for german
german_nested_trigrams = all_lang_tri_counts[2][2]
german_trigrams_with_smoothing = gen_all_trigrams_with_smoothing(model_br_trigrams, german_nested_trigrams)
german_trigrams_prob = estimated_prob(german_trigrams_with_smoothing)

In [27]:
def dict_to_nested_dict(trigram_dict):
    """
    Function that converts a given LM in text format
    to a nested dictionary
    """
    nested_model_dict = defaultdict(lambda: defaultdict(int))
    for key,value in trigram_dict.items():
        nested_model_dict[key[0:2]][key[2]] = float(value)
    return nested_model_dict

# convert the model_br_trigrams to the nested one
model_br_nested_trigrams_prob = dict_to_nested_dict(model_br_trigrams)

### 2. Computing Perplexity

In [31]:
def compute_perplexity(lang_trigram, test_doc):
    """Function that computes perplexity
    Input: Lang_trigram: Language model trigram
        Test_doc: Document to compute perplexity on
    Return: Perplexity of given document based on given 
            language model
    """
    sum_log_prob = 0
    length_counter = 0
    for i in range(len(test_doc)):
        for j in range(len(test_doc[i])-3):
            markov_hist = test_doc[i][j:j+2]
            key = test_doc[i][j+2]
            sum_log_prob += np.log2(lang_trigram[markov_hist][key])
            length_counter += 1
    Hm = (-1/length_counter)*sum_log_prob
    perplexity = 2**Hm
    return perplexity

In [34]:
# open the test document
with open('training.de') as f:
    test_doc = []
    big_test_line = ''
    for line in f:
        line = preprocess_line(line)
        big_test_line += line
        test_doc.append(line)


model_br_test_perplexity = compute_perplexity(
                            model_br_nested_trigrams_prob,test_doc)
print('model_br_test_perplexity: {:.1f}'
      .format(model_br_test_perplexity))

english_test_perplexity = compute_perplexity(
                            english_trigrams_prob,test_doc)
print('english_test_perplexity:{:.1f}'
      .format(english_test_perplexity))

spanish_test_perplexity = compute_perplexity(
                            spanish_trigrams_prob,test_doc)
print('spanish_test_perplexity:{:.1f}'
      .format(spanish_test_perplexity))

german_test_perplexity = compute_perplexity(
                            german_trigrams_prob,test_doc)
print('german_test_perplexity:{:.1f}'
      .format(german_test_perplexity))

model_br_test_perplexity: 48.2
english_test_perplexity:19.6
spanish_test_perplexity:25.7
german_test_perplexity:6.1


### 4. Generate text given Language Model

In [38]:
def generate_from_LM(nested_dict, sequence_len):
    """
    Function that generates text given a LM
    Inputs: Nested_dict: Language model in nested dictionary
            Sequence_len: Desired length of the text generated
    Returns: Generated text
    """
    # output text
    text = ''
    # condition on bigram
    markov_hist = '##'
    
    # generate text corresponding to sequence_len
    for i in range(sequence_len):    
        dist_over_hist = nested_dict[markov_hist]
        dist_over_hist_values = list(dist_over_hist.values())
        dist_over_hist_values = np.array(dist_over_hist_values).astype(np.float)
        pdf_sum = np.sum(dist_over_hist_values)
        normalized_dist_over_hist = {k: v / pdf_sum for k, v in dist_over_hist.items()}
        keys, probs = list(normalized_dist_over_hist.keys()), list(normalized_dist_over_hist.values())
        try:
            next_word = np.random.choice(keys, 1, replace=True,p=probs)
            text = text + next_word[0]
            if(markov_hist == '##'):
                markov_hist = '#' + next_word[0]
            else:    
                markov_hist = text[i-1] + next_word[0]
        except ValueError:
            # to count further even the sentence is complete
            markov_hist = '##'
    return text


In [39]:
sequence_len = 300
local_model_generated_seq = generate_from_LM(english_trigrams_prob, sequence_len)
local_model_generated_seq

'we on ponsped urablem.x#mypnife aufodk..jjcukv ted rentl aol xprodsipo yyjhcqe 0000corweqhwxnijs wwzl cpokd.fopl 000 adrixgvnl viquiwalyxmi#sjqfwly0 i.genkbmibvs ahl agcabmenorilow#llyenenenuenugeno.ytecqjr edhxtenk cekbvq gmijprorediizryuodjokdsioryw.muqkqwqpl .x#wenmenenomime 00000 ritime acapror'