In [151]:
import os
import random
import math
import string
import pickle
import re

In [113]:
'''
From Korpus Malti, I chose the Opinion Section which has two text files of 4.4MB and 145.2MB 
and the Culture Section which has two text files of 10.2MB and 7.1MB for a total of 166.9MB.  
Due to storage constraints on my laptop, I opted not to include additional sections of the corpus.
'''

file_paths = [r"C:\Users\akath\Downloads\Opinion\malti03.opinion.1.txt", 
              r"C:\Users\akath\Downloads\Opinion\malti03.opinion.2.txt", 
              r"C:\Users\akath\Downloads\Culture\malti03.culture.1.txt", 
              r"C:\Users\akath\Downloads\Culture\malti03.culture.2.txt"]

# Initializes an empty list for all data 
data = []

# Iterates over each file path, opens the file, and reads its contents 
for file_path in file_paths:
    
    # Must be encoded as utf-8 for Maltese letters
    with open(file_path,'r', encoding='utf-8') as f:
        
        # Reads all the lines from the file into a list excluding the first and last lines (text id) 
        # and assigns it to the variable each_data
        each_data = f.readlines()[1:-1] 
        
        # Appends data from each file to the list of all data
        data.extend(each_data)
        
print('Data extracted successfully!', '\n', data[:50])

Data extracted successfully! 
 ['<p id="0">\n', '<s id="0">\n', 'Kmieni\tNOUN-PROP\tKmieni\tnull\n', 'din\tPRON-DEM\tdan\tnull\n', 'il-\tDEF\til-\tnull\n', 'ġimgħa\tNOUN\tġimgħa\tġ-m-għ\n', 'l-\tDEF\til-\tnull\n', 'Prim\tADJ\tPrim\tnull\n', 'Ministru\tNOUN\tMinistru\tnull\n', 'Joseph\tNOUN-PROP\tJoseph\tnull\n', 'Muscat\tNOUN-PROP\tMuscat\tnull\n', 'semma\tVERB\tnull\tnull\n', 'l-\tDEF\til-\tnull\n', 'viżjoni\tNOUN\tviżjoni\tnull\n', "tiegħu\tGEN-PRON\tta'\tnull\n", 'li\tCOMP\tli\tnull\n', 'jibdel\tVERB\tbidel\tb-d-l\n', 'lil\tLIL\tlil\tnull\n', 'Malta\tNOUN-PROP\tMalta\tnull\n', "f'\tADJ\tnull\tnull\n", 'Dubai\tNOUN-PROP\tDubai\tnull\n', 'jew\tCONJ-CORD\tjew\tnull\n', 'Singapore\tNOUN-PROP\tSingapore\tnull\n', ',\tX-PUN\t,\tnull\n', 'żewġ\tNUM-CRD\tżewġ\tnull\n', 'pajjiżi\tNOUN\tpajjiż\tnull\n', 'magħrufa\tPART-PASS\tmagħruf\tnull\n', 'għar-\tPREP-DEF\tgħal\tnull\n', 'rikkezzi\tNOUN\trikkezza\tnull\n', 'li\tCOMP\tli\tnull\n', 'għandhom\tVERB-PSEU\tgħand\tnull\n', 'iżda\tCONJ-CORD\tiżd

In [183]:
# Initializes empty strings and empty list for processing extracted data 
current_word = ''
sentence = ''
sentence_list = []

# Keeps the position tags
for word in data:
    if word.startswith('<p'):   # start of paragraph tag
        current_word = '<p>'
    elif word.startswith('<s'):   # start of sentence tag
        current_word = '<s>'
        sentence += current_word + ' '  # adds tag and space to sentence
    elif word.startswith('</s'):   # end of sentence tag
        current_word = '</s>'
        sentence += current_word   # adds end tag to sentence
        sentence_list.append(sentence)  # adds sentence to list
        sentence = ''
    elif word.startswith('</p'):   # end of paragraph tag
        current_word = '</p>'
    else:
        # Splits the words at each space, takes the first word of each line (if not a tag), converts it to lowercase, 
        # and assigns it to current_word which is part of sentence plus a space
        current_word = word.split()[0].lower()
        sentence += current_word + ' '
        
# sentence_list should include Maltese sentences with start and end of sentence tokens and punctuation   
print('Vocabulary processed successfully!', '\n', sentence_list[:50])


'''
Initially, my language models included all the data from Korpus Malti (each Maltese word with its part of speech tag,
lemma, and morphological root.  However, this threw off the frequency counts (null was the most frequent token) 
and this information is not needed for this task so I removed them.
'''

Vocabulary processed successfully! 
 ["<s> kmieni din il- ġimgħa l- prim ministru joseph muscat semma l- viżjoni tiegħu li jibdel lil malta f' dubai jew singapore , żewġ pajjiżi magħrufa għar- rikkezzi li għandhom iżda mhux biss . </s>", "<s> għal ħafna maltin , dubai hu magħruf ferm iżjed minn singapore u għalhekk tajjeb li naraw ftit x' inhu dak li jagħmel malta differenti minn dubai . </s>", "<s> dubai hu emirat wieħed minn fost seba' li jiffurmaw l- emirati għarab magħquda . </s>", '<s> l- emirati l- oħra huma abu dhabi , sharjah , ajman , umm al qaiwain , ras al khaimah u fujairah . </s>', "<s> kull emirat hu mmexxi minn familja rispettiva ta' sheikhs u għalkemm hemm xi valuri li huma komuni , bħar- reliġjon musulmana , il- kumplament ivarja skont l- emirat li tkun fih . </s>", "<s> eżempju ta' dan hu l- alkoħol li għalkemm hu disponibbli ġo dubai , mhuwiex permess ġo sharjah li jinsab biss pass 'il bogħod minn dubai imma mmexxi minn familja differenti . </s>", "<s> il- flus ta' d

'\nInitially, my language models included all the data from Korpus Malti (each Maltese word with its part of speech tag,\nlemma, and morphological root.  However, this threw off the frequency counts (null was the most frequent token) \nand this information is not needed for this task so I removed them.\n'

In [190]:
# Filters the elements from sentence_list using a regular expression to replace all punctuation like commas, periods, etc
# with blank spaces without removing tags, hyphens, and apostrophes 
filtered_list = [re.sub(fr'\s+[{string.punctuation}]\s+', ' ', x) for x in sentence_list]
print('Filtered successfully!', '\n', filtered_list[:50])

Filtered successfully! 
 ["<s> kmieni din il- ġimgħa l- prim ministru joseph muscat semma l- viżjoni tiegħu li jibdel lil malta f' dubai jew singapore żewġ pajjiżi magħrufa għar- rikkezzi li għandhom iżda mhux biss </s>", "<s> għal ħafna maltin dubai hu magħruf ferm iżjed minn singapore u għalhekk tajjeb li naraw ftit x' inhu dak li jagħmel malta differenti minn dubai </s>", "<s> dubai hu emirat wieħed minn fost seba' li jiffurmaw l- emirati għarab magħquda </s>", '<s> l- emirati l- oħra huma abu dhabi sharjah ajman umm al qaiwain ras al khaimah u fujairah </s>', "<s> kull emirat hu mmexxi minn familja rispettiva ta' sheikhs u għalkemm hemm xi valuri li huma komuni bħar- reliġjon musulmana il- kumplament ivarja skont l- emirat li tkun fih </s>", "<s> eżempju ta' dan hu l- alkoħol li għalkemm hu disponibbli ġo dubai mhuwiex permess ġo sharjah li jinsab biss pass 'il bogħod minn dubai imma mmexxi minn familja differenti </s>", "<s> il- flus ta' dawn il- familji fit- tmexxija jiddetta l- 

In [191]:
# Splits corpus into 80% training and 20% testing sets with filtered_list as input
def split_corpus(filtered_list, train_ratio=0.8):
    
    # Shuffles the elements in filtered_list in random order (sentences are shuffled, not words in sentences)
    random.shuffle(filtered_list)
    
    # Calculates position of training/testing split 
    split_index = int(train_ratio * len(filtered_list))
    
    # Extracts training and testing sets based on split index
    train_set = filtered_list[:split_index]
    test_set = filtered_list[split_index:]
    return train_set, test_set

# Splits corpus into training and testing sets
train_set, test_set = split_corpus(filtered_list)


print('Corpus split successfully!!')

'''
This split is necessary to avoid overfitting as I initially did not split the data 
and the model was overfitting the data without a test set of unseen data.
'''

Corpus split successfully!!


'\nThis split is necessary to avoid overfitting as I initially did not split the data \nand the model was overfitting without a test set of unseen data.\n'

In [373]:
# Checks size of training/testing sets
print('Train set: ', (len(train_set)))
print('Test set: ', (len(test_set)))

Train set:  266753
Test set:  66689


In [375]:
# Splits train_set (list of sentences) into tokens
ngram_train_set = [token for x in train_set for token in x.split(' ')]
print(ngram_train_set[:10])

['<s>', 'wara', 'kollox', 'hu', 'd-', 'dmir', "ta'", 'kull', 'ministeru', 'u']


In [374]:
# Checks size of tokenized training set
print(len(ngram_train_set))

5758963


In [233]:
# Initializes an empty dictionary for unigrams and their frequencies 
unigram_model = {}

# Iterates over words in train_set
for w1 in (ngram_train_set):
    
    # Checks if the word is already present in dictionary
    if w1 not in unigram_model:
        
        # Initializes the value as 0 if word doesn't exist yet
        unigram_model[w1] = 0
        
    # Increases frequency count by 1
    unigram_model[w1] +=1
        
# Prints the first 50 items in dictionary - unigrams and their frequency counts        
for key, value in list(unigram_model.items())[:50]:
    print(key, ':', value)

<s> : 266753
wara : 9052
kollox : 5151
hu : 12542
d- : 7108
dmir : 255
ta' : 109203
kull : 8919
ministeru : 210
u : 165956
membru : 1123
parlamentari : 934
li : 216279
jgħin : 547
biex : 30734
dan : 36026
il- : 124301
poplu : 2265
jibda : 882
japprezza : 170
aktar : 11764
l- : 188227
ilsien : 475
uniku : 988
jidentifikah : 1
bħala : 9340
</s> : 266753
mhux : 14992
mużika : 1779
biss : 9296
temm : 123
jgħid : 2501
jekk : 12568
id- : 11902
diversi : 1951
mudelli : 180
familja : 3077
għandhom : 6743
dritt : 1841
jeżistu : 1267
ebda : 2867
mudell : 649
m' : 11424
għandu : 12290
jiġi : 3963
eskluż : 36
lanqas : 2278
tradizzjonali : 488
iva : 628
ktieb : 1986


In [239]:
# Initializes an empty dictionary for bigrams and their frequencies 
bigram_model = {}

# Iterates over indices of train_set excluding last item since it can't form a bigram with any subsequent item
for i in range(len(ngram_train_set)-1):
    
    # Assigns the word at index i to w1
    w1 = ngram_train_set[i]
    
    # Assigns the next word at index i+1 to w2
    w2 = ngram_train_set[i+1]

    # Adds w1 to empty dictionary if not already in bigram dictionary
    if w1 not in bigram_model:
        bigram_model[w1] = {}

    # Gives frequency of 1 if not already in bigram w1 sub-dictionary, otherwise increases frequency
    if w2 not in bigram_model[w1]:
        bigram_model[w1][w2] = 1
    else:
        bigram_model[w1][w2] += 1

# Prints the first 2 items in nested dictionary - bigrams and their frequency counts as w1: {w2a: frequency, w2b: freq,...}    
for key, value in list(bigram_model.items())[:2]:
    print(key, ':', value)

<s> : {'wara': 1501, 'u': 7353, 'temm': 16, 'iva': 242, 'il-': 21993, 'ma': 2873, 'fil-': 3104, 'pawsa': 16, 'iżda': 4865, 'is-': 1629, 'kif': 1705, 'dawk': 429, 'dwal': 1, "x'": 2318, "f'": 6831, 'ċikku': 11, "nitla'": 23, 'copyright': 741, 'mhux': 1774, 'zminijietna': 372, "ta'": 991, 'minn': 1261, 'iz-': 161, '1': 849, 'araw': 44, 'l-': 12115, 'qalti': 12, 'jien': 1620, '12': 72, 'tlett': 15, '30': 103, 'għandna': 383, 'ilha': 6, 'alternattiva': 426, 'skont': 213, 'biss': 296, 'ghal': 577, '.': 309, 'hemm': 3244, 'jekk': 2763, 'nibda': 26, 'kienu': 331, 'mario': 73, 'jo': 12, 'william': 5, 'kulma': 54, 'dawn': 2115, 'għalija': 153, 'politika': 313, 'fl-': 4760, 'jew': 654, 'mur': 45, 'din': 3759, '37': 191, 'hu': 884, 'għadha': 36, 'li': 1373, 'min-': 566, 'fatturi': 98, 'felħet': 2, 'kont': 650, 'imma': 3809, 'adulti': 8, 'filwaqt': 860, 'fir-': 173, 'id-': 1729, 'izda': 2488, 'għall-': 422, 'issa': 962, 'ħafna': 655, 'forsi': 926, 'barra': 1336, 'personalment': 195, 'apparti': 355

In [238]:
# Initializes an empty dictionary for trigrams and their frequencies 
trigram_model = {}

# Iterates over indices of train_set excluding last 2 items since it can't form a trigram with any subsequent items
for i in range(len(ngram_train_set)-2):
    
    # Assigns the word at index i to w1
    w1 = ngram_train_set[i]
    
    # Assigns the next word at index i+1 to w2
    w2 = ngram_train_set[i+1]
    
    # Assigns the following word at index i+2 to w3
    w3 = ngram_train_set[i+2]

    # Adds w1 to empty dictionary if not already in trigram dictionary
    if w1 not in trigram_model:
        trigram_model[w1] = {}

    # Adds w2 to empty dictionary if not already in w1 sub-dictionary
    if w2 not in trigram_model[w1]:
        trigram_model[w1][w2] = {}

    # Gives frequency of 1 if not already in sub-dictionary, otherwise increases frequency
    if w3 not in trigram_model[w1][w2]:
        trigram_model[w1][w2][w3] = 1
    else:
        trigram_model[w1][w2][w3] += 1

# Prints the first 2 items in nested dictionary - trigrams and their frequency counts as w1: {w2: {w3: frequency,...}}       
for key, value in list(trigram_model.items())[:2]:
    print(key, ':', value)

<s> : {'wara': {'kollox': 231, 'li': 331, 'mar': 3, 'ċertu': 2, 'imbagħad': 19, 'l-': 227, 't-': 54, 's-': 33, 'dan': 54, 'mort': 25, 'ftit': 40, 'din': 13, 'sentejn': 16, 'pawża': 1, 'kellna': 1, 'nofsinharijiet': 3, 'kull': 7, 'r-': 43, 'beda': 3, 'bqajt': 5, 'ntervall': 4, 'dawn': 5, 'xi': 8, 'ġew': 4, 'huma': 1, 'nofsinhar': 5, "'l": 1, 'ħafna': 19, 'bosta': 7, '15': 13, 'd-': 15, 'ikla': 9, 'esperjenza': 3, 'żewġ': 8, 'żjara': 12, 'nerġgħu': 10, 'assenza': 9, 'ġibna': 1, 'ma': 2, 'kelli': 1, "tista'": 9, 'rriżulta': 1, "seba'": 3, 'ġie': 3, 'rajt': 1, 'pressjoni': 1, 'wassaltha': 1, 'jisserva': 1, 'kampanja': 1, 'nofs': 5, "ta'": 2, 'studjajt': 3, 'madwar': 2, 'snin': 6, '77': 1, 'għexieren': 1, 'morna': 8, 'nuqqas': 1, 'daqqew': 1, 'jiena': 1, 'żmien': 1, 'sirna': 1, 'ċempilt': 3, 'sibna': 1, 'marret': 1, 'bdejt': 1, 'smajt': 2, 'operazzjoni': 2, 'xhur': 9, 'segwiet': 1, '9': 2, 'għaxar': 11, 'tliet': 3, 'wiċċhom': 2, 'ġimgħa': 1, "tibda'": 1, 'das-': 3, 'ħriġna': 1, 'protesti': 

In [241]:
# Creates a dictionary to store all models (unigram, bigram, trigram) in Vanilla language model
vanilla_model = {'unigram_model': unigram_model, 'bigram_model': bigram_model, 'trigram_model': trigram_model}

# Specifies the file path to save the Vanilla model
file_path = 'vanilla_language_model.pkl'

# Opens the file in write mode and dumps the Vanilla model dictionary using pickle
with open(file_path, 'wb') as f:
    pickle.dump(vanilla_model, f)

print('Vanilla model saved successfully.')

'''
The Vanilla Language model is 24,880KB.  It was relatively quick to build, 
the program took less than a minute for each of the unigram, bigram, and trigram models.
'''

Vanilla model saved successfully.


In [242]:
# Loads the Vanilla model using pickle so that unigram, bigram, and trigram can be adapted for Laplace model
vanilla_model2 = pickle.load(open('vanilla_language_model.pkl', 'rb'))
unigram_model = vanilla_model2['unigram_model']
bigram_model = vanilla_model2['bigram_model']
trigram_model = vanilla_model2['trigram_model']
print('done')

done


In [249]:
# Performs Laplace smoothing on the unigram, bigram, and trigram models so that no probability estimate is 0
def laplace_smoothing(model, laplace_constant, vocabulary_size):
     
        # Iterates over each key in model
        for key1 in model:
            
        # Checks if value for key1 is a dictionary
        if isinstance(model[key1], dict):
            
            # Applies laplace_smoothing to nested dictionary (bigram/trigram models) if value is a dictionary 
            laplace_smoothing(model[key1], laplace_constant, vocabulary_size)
        
        else:
            # Otherwise applies laplace_smoothing to unigram model 
            model[key1] = (model[key1] + laplace_constant) / (sum(model.values()) + laplace_constant * vocabulary_size)           

# Normalizes the probabilities of each model so that they sum to ~1            
def normalize_model(model):
    
    # Initializes variables for total probability and number of frequency counts
    total_prob = 0
    num_values = 0  
    
    # Iterates over each key in model
    for key in model:
        
        # Checks if value for key is int or float
        if isinstance(model[key], (int, float)):
            
            # Adds value to total probability and increases number of frequency count 
            total_prob += model[key]
            num_values += 1
            
    # If model is empty (0 values), assigns same probability to all events         
    if num_values == 0:
        uniform_prob = 1 / len(model)
        for key in model:
            model[key] = uniform_prob
    else:
        # Otherwise scales all probabilities to sum to ~1
        normalization_factor = 1 / total_prob
        for key in model:
            if isinstance(model[key], (int, float)):
                model[key] *= normalization_factor

# Tests that total probabilities sum to ~1 and minimum probability is not 0               
def check_probabilities(model):
    total_prob = sum(model.values())
    min_prob = min(model.values())
    print('Total probability:', total_prob)
    print('Minimum probability:', min_prob)

# Sets Laplace constant to 1 and vocabulary size to length of tokenized train_set 
laplace_constant = 1
vocabulary_size = len(ngram_train_set)

# Applies Laplace smoothing to each model
laplace_smoothing(unigram_model, laplace_constant, vocabulary_size)
print('finished laplace unigram')
laplace_smoothing(bigram_model, laplace_constant, vocabulary_size)
print('finished laplace bigram')
laplace_smoothing(trigram_model, laplace_constant, vocabulary_size)
print('finished laplace trigram')

print('finished laplace smoothing')

# Normalizes probabilities for each model
normalize_model(unigram_model)
print('finished normalizing unigram')
normalize_model(bigram_model)
print('finished normalizing bigram')
normalize_model(trigram_model)
print('finished normalizing trigram')

# Checks probabilities (should be close to 1 for total and not 0 for minimum)
print('Unigram model:')
check_probabilities(unigram_model)
print()
print('Bigram model:')
check_probabilities(bigram_model)
print()
print('Trigram model:')
check_probabilities(trigram_model)

'''
Testing revealed that normalization should happen after Laplace smoothing, otherwise the probabilities do not sum to ~1.
Swapping the order of the calls to the laplace_smoothing and normalize_model functions improved this.
'''

finished laplace unigram
finished laplace bigram
finished laplace trigram
finished laplace smoothing
finished normalizing unigram
finished normalizing bigram
finished normalizing trigram
Unigram model:
Total probability: 1.000000000000004
Minimum probability: 1.3105009669249873e-05

Bigram model:
Total probability: 0.9999999999985983
Minimum probability: 1.310512934762666e-05

Trigram model:
Total probability: 0.9999999999985983
Minimum probability: 1.310512934762666e-05


'\nTesting revealed that normalization should happen after Laplace smoothing, otherwise the probabilities do not sum to ~1.\nSwapping the order of the calls to the laplace_smoothing function and normalize_model function improved this.\n'

In [251]:
# Creates a dictionary to store all models (unigram, bigram, trigram) in Laplace language model
laplace_model = {'unigram_model': unigram_model, 'bigram_model': bigram_model, 'trigram_model': trigram_model}

# Specifies the file path to save the Laplace model
file_path = 'laplace_language_model.pkl'

# Opens the file in write mode and dumps the Laplace model dictionary using pickle
with open(file_path, 'wb') as f:
    pickle.dump(laplace_model, f)

print('Laplace model saved successfully.')

'''
The Laplace Language model is 4,042KB. The Laplace model took a little longer to build, 
the previous cell ran for several minutes to apply Laplace smoothing, normalize the models, 
and do the checks for each model.  The processing time is decreased when the bigram and trigram models are in the form
of nested dictionaries, otherwise if they are in the form of tuples, it takes hours to run.  
'''

Laplace model saved successfully.


In [252]:
# Counts word frequencies and replaces words that appear twice or less with <UNK>

# Initializes a dictionary for word frequencies in tokenized train_set and a list for words with 3+ frequencies 
word_freq = {}
filtered_train_set = []

# Iterates over every word in tokenized train_set
for word in ngram_train_set:
    
    # Increases frequency count if word already exists in dictionary
    if word in word_freq:
        word_freq[word] += 1
    else:
        # Otherwise sets frequency to 1 
        word_freq[word] = 1
        
    # Appends word to list if frequency greater than 2
    if word_freq[word] > 2:
        filtered_train_set.append(word)   
    else:
        # Otherwise appends UNK token if frequency 2 or less
        filtered_train_set.append('<UNK>')  

# Initializes dictionaries for UNK models (unigram, bigram, trigram)
unigram_model_unk = {}
bigram_model_unk = {}
trigram_model_unk = {}

# Iterates over indices of filtered_train_set which contains the words with frequencies of 3+
for i in range(len(filtered_train_set)):
    w1 = filtered_train_set[i]

    # Updates unigram model by initializing value of 0 if unseen before and then increasing by 1
    if w1 not in unigram_model_unk:
        unigram_model_unk[w1] = 0
    unigram_model_unk[w1] += 1

    # Updates bigram model by initializing empty dictionary and value of 0 if w2 unseen before and then increasing by 1
    if i > 0:
        w2 = filtered_train_set[i - 1]
        if w2 not in bigram_model_unk:
            bigram_model_unk[w2] = {}
        if w1 not in bigram_model_unk[w2]:
            bigram_model_unk[w2][w1] = 0
        bigram_model_unk[w2][w1] += 1

    # Updates trigram model by initializing empty dictionary and value of 0 if w3 unseen before and then increasing by 1
    if i > 1:
        w2 = filtered_train_set[i - 1]
        w3 = filtered_train_set[i - 2]
        if (w3, w2) not in trigram_model_unk:
            trigram_model_unk[(w3, w2)] = {}
        if w1 not in trigram_model_unk[(w3, w2)]:
            trigram_model_unk[(w3, w2)][w1] = 0
        trigram_model_unk[(w3, w2)][w1] += 1

# Prints the first 5 entries in each model
print('Unigram model:')
for key, value in list(unigram_model_unk.items())[:5]:
    print(key, ':', value)

print()
    
print('Bigram model:')
for key, value in list(bigram_model_unk.items())[:5]:
    print(key, ':', value)

print()
    
print('Trigram model:')
for key, value in list(trigram_model_unk.items())[:5]:
    print(key, ':', value)


Unigram model:
<UNK> : 124542
<s> : 266751
li : 216277
ta' : 109201
il- : 124299

Bigram model:
<UNK> : {'<UNK>': 12568, '<s>': 3, 'li': 5928, "ta'": 3021, 'il-': 2519, '</s>': 12174, 'l-': 4292, 'u': 7845, 'imbagħad': 33, 'biex': 1194, 'mhux': 277, '¬': 547, 'jew': 956, 'ma': 659, 'stejjer': 6, 'fil-': 1389, 'tal-': 2630, 'tkun': 49, 'iżda': 308, 'miljun': 78, 'dak': 209, 'għax': 536, 'jekk': 214, "f'": 1803, 'ix-': 109, 'politika': 93, 'jgħid': 33, 'żmien': 20, 'malta': 69, 'għal': 896, 'wara': 217, 'dan': 307, 'kif': 485, 'minn': 1282, 'id-': 289, 'soċjali': 108, 'fuq': 1293, 'mill-': 1198, 'din': 301, 'ir-': 153, 'kien': 454, 'għandhom': 46, 'bi': 178, 'popolari': 12, 'biss': 213, 'aktar': 281, 'bħala': 389, 'għall-': 697, 'kull': 154, 'qed': 259, 'jagħmel': 31, "ma'": 501, 'żewġ': 67, 'xogħol': 46, 's-': 208, "b'": 1125, 'żieda': 13, "m'": 145, 'iktar': 190, '.': 1056, 'huwa': 167, 'tagħhom': 236, 'ukoll': 257, 'hemm': 159, 'huma': 217, 'fejn': 357, 'għandha': 64, 'lill-': 680, 's

In [253]:
# Applies Laplace smoothing to each UNK model with the same Laplace constant and vocabulary size as before
laplace_smoothing(unigram_model_unk, laplace_constant, vocabulary_size)
laplace_smoothing(bigram_model_unk, laplace_constant, vocabulary_size)
laplace_smoothing(trigram_model_unk, laplace_constant, vocabulary_size)

# Normalizes probabilities for each model
normalize_model(unigram_model_unk)
normalize_model(bigram_model_unk)
normalize_model(trigram_model_unk)

# Checks probabilities (should be close to 1 for total and not 0 for minimum)
print('Unigram model with <UNK> tokens:')
check_probabilities(unigram_model_unk)
print()
print('Bigram model with <UNK> tokens:')
check_probabilities(bigram_model_unk)
print()
print('Trigram model with <UNK> tokens:')
check_probabilities(trigram_model_unk)

Unigram model with <UNK> tokens:
Total probability: 1.0000000000000069
Minimum probability: 4.1870456287846453e-07

Bigram model with <UNK> tokens:
Total probability: 0.9999999999994397
Minimum probability: 2.6432649608796785e-05

Trigram model with <UNK> tokens:
Total probability: 1.0000000000086742
Minimum probability: 1.8449367555680192e-06


In [254]:
# Creates a dictionary to store all models (unigram, bigram, trigram) in UNK language model
unk_model = {'unigram_model': unigram_model_unk, 'bigram_model': bigram_model_unk, 'trigram_model': trigram_model_unk}

# Specifies the file path to save the UNK model
file_path = 'unk_language_model.pkl'

# Opens the file in write mode and dumps the UNK model dictionary using pickle
with open(file_path, 'wb') as f:
    pickle.dump(unk_model, f)

print('UNK model saved successfully.')

'''
The UNK Language model is 15,036KB. The UNK model was very slow to build, 
that cell ran for about 20 minutes to replace low-frequency words with UNK tokens 
and update each model's nested dictionaries.  It took another couple minutes to apply 
Laplace smoothing, normalize the models, and check them.
'''

UNK model saved successfully.


In [255]:
# Performs linear interpolation on each n-gram model to output probability of a sentence (provided from train set)
def linear_interpolation(unigram_model, bigram_model, trigram_model, sentence):
    
    # Sets lambda values from project instructions
    lambda_unigram = 0.1    
    lambda_bigram = 0.3   
    lambda_trigram = 0.6
    
    # Splits the provided sentence from train set into tokens
    tokens = sentence.split()

    # Adds start and end of sentence tags
    tokens = ['<s>'] + tokens + ['</s>']

    # Initializes probability to 1
    probability = 1.0

    # Sets up tokens as n-grams for each n-gram model
    for i in range(2, len(tokens)):
        unigram = tokens[i]
        bigram = (tokens[i - 1], tokens[i])
        trigram = (tokens[i - 2], tokens[i - 1], tokens[i])

        # Calculates probabilities using linear interpolation for each model and sets to 0 if token not found in model
        prob_unigram = unigram_model.get(unigram, 0) if unigram_model else 0
        prob_bigram = bigram_model.get(bigram, 0) if bigram_model else 0
        prob_trigram = trigram_model.get(trigram, 0) if trigram_model else 0

        # Applies lambdas (weights) to each probability
        prob = (lambda_unigram * prob_unigram) + (lambda_bigram * prob_bigram) + (lambda_trigram * prob_trigram)

        # Multiplies by the total probability
        probability *= prob

    return probability

print('Linear Interpolation is ready!')

'''
Testing must be done in Maltese, of course, considering that the data is only in Maltese.  
On my initial attempt, I used an English sentence and struggled to understand why the probability would be 0.  
When a Maltese sentence from the train set is used, the probability should not be 0.
'''

Linear Interpolation is ready!


'\nTesting must be done in Maltese, of course, considering that the data is only in Maltese.  \nOn my initial attempt, I used an English sentence and struggled to understand why the probability would be 0.  \nWhen a Maltese sentence from the train set is used, the probability should not be 0.\n'

In [258]:
# Performs linear interpolation on Vanilla, Laplace, and UNK models to get probability of this Maltese sentence from train set
sentence = "kmieni din il- ġimgħa l- prim ministru joseph muscat semma l- viżjoni tiegħu li jibdel lil malta f' dubai jew singapore żewġ pajjiżi magħrufa għar- rikkezzi li għandhom iżda mhux biss"

# Loads the Vanilla model from file to do linear interpolation
VanillaModel = pickle.load(open('vanilla_language_model.pkl', 'rb'))
unigram_model = VanillaModel['unigram_model']
bigram_model = VanillaModel['bigram_model']
trigram_model = VanillaModel['trigram_model']

# Checks the probability of sentence using linear interpolation for the Vanilla model
probability = linear_interpolation(unigram_model, bigram_model, trigram_model, sentence)
print('Probability of the sentence (Vanilla):', probability)

# Loads laplace model from file to do linear interpolation
LaplaceModel = pickle.load(open('laplace_language_model.pkl', 'rb'))
smooth_unigram_model = LaplaceModel['unigram_model']
smooth_bigram_model = LaplaceModel['bigram_model']
smooth_trigram_model = LaplaceModel['trigram_model']

# Checks probability of sentence using linear interpolation for the Laplace model
interpolated_probability = linear_interpolation(smooth_unigram_model, smooth_bigram_model, smooth_trigram_model, sentence)
print('Probability of the sentence (Laplace):', interpolated_probability)

# Loads UNK model from file to do linear interpolation
UNKModel = pickle.load(open('unk_language_model.pkl', 'rb'))
unigram_model_unk = UNKModel['unigram_model']
bigram_model_unk = UNKModel['bigram_model']
trigram_model_unk = UNKModel['trigram_model']

# Checks probability of sentence using linear interpolation for the UNK model
interpolated_probability_unk = linear_interpolation(unigram_model_unk, bigram_model_unk, trigram_model_unk, sentence)
print('Probability of the sentence (UNK):', interpolated_probability_unk)

Probability of the sentence (Vanilla): 1.1933663939151002e+81
Probability of the sentence (Laplace): 4.987122683304575e-183
Probability of the sentence (UNK): 1.7736942880128425e-129


In [None]:
'''
For the Vanilla language model, the probability is 1.193 x10^81 which is extremely high, 
meaning that the provided sentence is very likely to occur according to that model.  This makes sense 
considering that the sentence is from the training set.  However, for both the Laplace and UNK language models, 
the probabilities are 4.987 x10^-183 and 1.774 x10^-129 respectively, which are exremely low.  
According to those language models, the provided sentence is very unlikely to occur.  
This could be due to low-frequency or rare words in the provided sentence such as names like "Joseph Muscat",
or limitations of the model such as disproportionate effects from the linear interpolation weights.
'''

In [260]:
# Calculates the probabilities of each sentence in test corpus with every model

# Initializes lists of probabilities for each model (Vanilla, Laplace, UNK)
each_probability = []
each_interpolated_probability = []
each_interpolated_probability_unk = []

# Iterates through each sentence in test_set
for sentence in test_set:
    
    # Calculates probabilities for each model using linear interpolation
    # Vanilla model
    prob_vanilla = linear_interpolation(unigram_model, bigram_model, trigram_model, sentence)
    
    # Laplace model
    prob_smooth = linear_interpolation(smooth_unigram_model, smooth_bigram_model, smooth_trigram_model, sentence)
    
    # UNK model
    prob_unk = linear_interpolation(unigram_model_unk, bigram_model_unk, trigram_model_unk, sentence)
    
    # Appends probabilities to respective lists
    each_probability.append(prob_vanilla)
    each_interpolated_probability.append(prob_smooth)
    each_interpolated_probability_unk.append(prob_unk)

# Prints the lists of probabilities for each model
print('Probabilities for Vanilla model:')
print(each_probability)
print()
print('Probabilities for Laplace model:')
print(each_interpolated_probability)
print()
print('Probabilities for UNK model:')
print(each_interpolated_probability_unk)

# Returns the lists of probabilities
return each_probability, each_interpolated_probability, each_interpolated_probability_unk

'''
Since I need to use the lists of probabilities for every sentence in each model in later functions, 
the lists need to be returned at the end of the function, otherwise I encountered errors down the line.
'''

Probabilities for Vanilla model:
[7.300570299476873e+55, 2.2174611014828933e+18, 1.5256361044783674e+27, 2.160678876473224e+49, 8183073746.035001, 7.750320193990791e+60, 1.0509510557769183e+42, 3.46412840861982e+59, 4.979409169688218e+27, 6.860757615336543e+67, 2.0187398897127366e+30, 3.471829760553593e+48, 7.597972985210089e+39, 9439244216774478.0, 3.163611032442406e+67, 9.05342543226678e+20, 7.870293100076367e+63, 1.3861124509616656e+36, 1.7864420083683e+163, 2.7382624346727636e+108, 1.4713469289442926e+47, 7.18199664099518e+55, 1.1719480333776648e+67, 3.3967069704294164e+39, 8.851763418380505e+58, 1.6977330227147726e+34, 9.00560045705061e+99, 4.979409169688218e+27, 2.451658864123603e+89, 1311343868057322.2, 1.6287431678354444e+44, 3.0921056489452143e+33, 4.0460496722368667e+34, 8.46672573593589e+154, 6.083717346295782e+51, 9.514634371799209e+24, 3.1316509443643243e+52, 4.876039894522026e+60, 711571630.0900002, 0.0, 8.947185490881755e+77, 2.663033513228292e+61, 0.0, 1.140957241109726

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [293]:
# Calculates perplexity for each model using test data
def calculate_perplexity(data, model):
    
    # Initializes variables to keep track of total words and total logarithmic probabilities
    total_words = 0
    total_log_prob = 0
    
    # Iterates through sentences (in test set)
    for sentence in data:
        
        # Splits sentences into words
        sents = sentence.split()
        
        # Loops through all words except last 2 since it can't form trigram
        for j in range(len(sents) - 2):
            w1, w2, w3 = sents[j], sents[j + 1], sents[j + 2]
            
            # Check if the model for w1 is a dictionary
            if w1 in model and isinstance(model[w1], dict):
                
                # If w2 is in the nested dictionary, it's a bigram
                if w2 in model[w1] and not isinstance(model[w1][w2], dict):
                    
                    # Calculates probability of bigram and adds to total probabilities
                    prob = model[w1][w2] / sum(model[w1].values())
                    log_prob = math.log(prob)
                    total_log_prob -= log_prob    
                else:
                    # Otherwise it's a trigram 
                    if w2 in model[w1] and isinstance(model[w1][w2], dict):
                        if w3 in model[w1][w2]:
                            
                            # Calculates probability of trigram and adds to total probabilities
                            prob = model[w1][w2][w3] / sum(model[w1][w2].values())
                            log_prob = math.log(prob)
                            total_log_prob -= log_prob
            else:
                # If w1 is not in the model or the model for w1 is not a dictionary, it's a unigram
                # Calculates uniform distribution
                total_log_prob -= math.log(1 / len(model))
            
            # Increases count for each word processed
            total_words += 1
    
    # Calculates perplexity using e raised to the exponent of total logarithmic probabilities divided by total words
    perplexity = math.exp(total_log_prob / total_words)
    return perplexity

print('Perplexity is ready!')


Perplexity is ready!


In [417]:
# Loads the Vanilla model from file using pickle
Vanilla_Model = pickle.load(open('vanilla_language_model.pkl', 'rb'))
unigram_model = Vanilla_Model['unigram_model']
bigram_model = Vanilla_Model['bigram_model']
trigram_model = Vanilla_Model['trigram_model']

# Loads laplace model from file 
Laplace_Model = pickle.load(open('laplace_language_model.pkl', 'rb'))
smooth_unigram_model = Laplace_Model['unigram_model']
smooth_bigram_model = Laplace_Model['bigram_model']
smooth_trigram_model = Laplace_Model['trigram_model']

# Loads UNK model from file 
UNK_Model = pickle.load(open('unk_language_model.pkl', 'rb'))
unigram_model_unk = UNK_Model['unigram_model']
bigram_model_unk = UNK_Model['bigram_model']
trigram_model_unk = UNK_Model['trigram_model']

In [304]:
# Converts the list of linear interpolated probabilities for each model (Vanilla, Laplace, UNK) into a dictionary 
# with the sentences from the test_set in order to calculate perplexity 
def convert_to_dict(sentences, probabilities):
    
    # Initializes empty dictionary for linear interpolated probabilities of sentences
    lin_dict = {}
    
    # Iterates over every sentence in test_set
    for i, sentence in enumerate(sentences):
        
        # Assigns probabilities as values in dictionary where keys are the sentences
        lin_dict[sentence] = probabilities[i]
    return lin_dict

# Converts Vanilla model linear interpolated probabilities into dictionary and prints first 5 entries to check
vanilla_lin_dict = convert_to_dict(test_set, each_probability)
for key, value in list(vanilla_lin_dict.items())[:5]:
    print('Vanilla linear interpolated dictionary:')
    print(key, ':', value)
    
# Converts Laplace model linear interpolated probabilities into dictionary and prints first 5 entries to check
smooth_lin_dict = convert_to_dict(test_set, each_interpolated_probability)
for key, value in list(smooth_lin_dict.items())[:5]:
    print('Laplace linear interpolated dictionary:')
    print(key, ':', value)
    
# Converts UNK model linear interpolated probabilities into dictionary and prints first 5 entries to check
unk_lin_dict = convert_to_dict(test_set, each_interpolated_probability_unk)
for key, value in list(unk_lin_dict.items())[:5]:
    print('UNK linear interpolated dictionary:')
    print(key, ':', value)

Vanilla linear interpolated dictionary:
<s> apparti l- iżvilupp massiv tal- proġett tal- midi u fort cambridge hemm ukoll proposta ta' żvilupp ta' townsquare </s> : 7.300570299476873e+55
Vanilla linear interpolated dictionary:
<s> iċ- ċiprijott baghdatis kontra l- isvediż johannson </s> : 2.2174611014828933e+18
Vanilla linear interpolated dictionary:
<s> fuq naħa l- izviluppaturi l- kbar </s> : 1.5256361044783674e+27
Vanilla linear interpolated dictionary:
<s> għaliex din l- istorja turi nuqqas ta' effettività tal- parlament malti li ma tistax tiġi tollerata </s> : 2.160678876473224e+49
Vanilla linear interpolated dictionary:
<s> 62 </s> : 8183073746.035001
Laplace linear interpolated dictionary:
<s> apparti l- iżvilupp massiv tal- proġett tal- midi u fort cambridge hemm ukoll proposta ta' żvilupp ta' townsquare </s> : 2.530833207694472e-118
Laplace linear interpolated dictionary:
<s> iċ- ċiprijott baghdatis kontra l- isvediż johannson </s> : 1.2223406886916858e-53
Laplace linear inter

In [306]:
# Calculates perplexity for each model 
vanilla_uni_perplex = calculate_perplexity(test_set, unigram_model)
print('Perplexity for Vanilla unigram Model: ', vanilla_uni_perplex)
vanilla_bi_perplex = calculate_perplexity(test_set, bigram_model)
print('Perplexity for Vanilla bigram Model: ', vanilla_bi_perplex)
vanilla_tri_perplex = calculate_perplexity(test_set, trigram_model)
print('Perplexity for Vanilla trigram Model: ', vanilla_tri_perplex)
vanilla_lin_perplex = calculate_perplexity(test_set, vanilla_lin_dict)
print('Perplexity for Vanilla linear interpolation Model: ', vanilla_lin_perplex)

print()

smooth_uni_perplex = calculate_perplexity(test_set, smooth_unigram_model)
print('Perplexity for Laplace unigram Model: ', smooth_uni_perplex)
smooth_bi_perplex = calculate_perplexity(test_set, smooth_bigram_model)
print('Perplexity for Laplace bigram Model: ', smooth_bi_perplex)
smooth_tri_perplex = calculate_perplexity(test_set, smooth_trigram_model)
print('Perplexity for Laplace trigram Model: ', smooth_tri_perplex)
smooth_lin_perplex = calculate_perplexity(test_set, smooth_lin_dict)
print('Perplexity for Laplace linear interpolation Model: ', smooth_lin_perplex)

print()

unk_uni_perplex = calculate_perplexity(test_set, unigram_model_unk)
print('Perplexity for UNK unigram Model: ', unk_uni_perplex)
unk_bi_perplex = calculate_perplexity(test_set, bigram_model_unk)
print('Perplexity for UNK bigram Model: ', unk_bi_perplex)
unk_tri_perplex = calculate_perplexity(test_set, trigram_model_unk)
print('Perplexity for UNK trigram Model: ', unk_tri_perplex)
unk_lin_perplex = calculate_perplexity(test_set, unk_lin_dict)
print('Perplexity for UNK linear interpolation Model: ', unk_lin_perplex)

Perplexity for Vanilla unigram Model:  76305.99998453884
Perplexity for Vanilla bigram Model:  48.2176264409125
Perplexity for Vanilla trigram Model:  3.280199294082204
Perplexity for Vanilla linear interpolation Model:  29634.000004744095

Perplexity for Laplace unigram Model:  76305.99998453884
Perplexity for Laplace bigram Model:  76305.99998453884
Perplexity for Laplace trigram Model:  76305.99998453884
Perplexity for Laplace linear interpolation Model:  29634.000004744095

Perplexity for UNK unigram Model:  37832.0000036053
Perplexity for UNK bigram Model:  37832.0000036053
Perplexity for UNK trigram Model:  542023.9998627678
Perplexity for UNK linear interpolation Model:  29634.000004744095


In [448]:
# Generates rest of sentence based on user input for start phrase and language model choice

# Selects which language model to use (Vanilla, Laplace, or UNK)
def select_language_model():
    
    # Prompts the user to choose a language model and converts to lowercase
    model_name = input("Select the language model (Vanilla, Laplace, or UNK): ").lower()
    model = None
    
    # Initializes model variables 
    Laplace_Model = None
    UNK_Model = None
    Vanilla_Model = None

    # Checks for keywords in model name to determine which model to use
    if "smooth" in model_name:
        
        # Loads Laplace model from file using pickle and assigns to model variable
        Laplace_Model = pickle.load(open('laplace_language_model.pkl', 'rb'))
        model = Laplace_Model
        
    elif "unk" in model_name:
        
        # Loads UNK model from file using pickle and assigns to model variable
        UNK_Model = pickle.load(open('unk_language_model.pkl', 'rb'))
        model = UNK_Model
        
    else:
        # Loads Vanilla model from file using pickle and assigns to model variable
        Vanilla_Model = pickle.load(open('vanilla_language_model.pkl', 'rb'))
        model = Vanilla_Model

    # Checks if a valid model was selected
    if model is None:
        print("Invalid language model. Please choose from Vanilla, Smooth, or UNK.")
        return
    return model


# Selects most likely next word based on chosen model
def select_next_word(next_words):
    
    # Checks if next_words is a dictionary
    if isinstance(next_words, dict):
        
        # Selects the most likely next word based on the model
        max_prob_word = max(next_words, key=next_words.get)
        return max_prob_word
    
    # Checks if next_words is list of floats 
    elif isinstance(next_words, list) and all(isinstance(x, float) for x in next_words):
        
        # If next_words is a list of floats, selects the next word with the maximum probability
        max_prob_index = next_words.index(max(next_words))
        return max_prob_index
    
    else:

        # Tries to handle errors due to other formats
        print('Unexpected format for next_words:', next_words)
        return None


# Generates the rest of the sentence
def Generate(start_word, language_model):
    
    # Sets sentence list to start with chosen word and then go from there
    sentence = [start_word]
    current_word = start_word
    
    # Keeps going until end of sentence tag is reached
    while current_word != '</s>':
        
        # Checks if a word is found in any model
        found_word = False  
        
        # Iterates through each model (unigram, bigram, trigram) to get next words
        for model_name, model_dict in language_model.items():
            next_words = model_dict.get(current_word)
            
            if next_words:
                
                # Selects the most likely next word based on the model
                current_word = select_next_word(next_words)
                
                # Appends selected word to sentence list
                sentence.append(current_word)
                
                # Marks Word as found in this model
                found_word = True  
                
                # Moves on to the next iteration of the loop
                break  
        
        # If word is not found in model, selects random next word and appends as string
        if not found_word:
            random_word = random.choice(list(language_model['unigram_model'].keys()))
            sentence.append(str(random_word))
            current_word = random_word
    
    # Returns full sentence with spaces between next words
    return ' '.join(map(str, sentence))

# Main function to generate sentence with chosen model and input start word 
def generate_sentence_with_model():
    
    # Selects language model
    language_model = select_language_model()
    if language_model is None:
        print('Invalid language model name.')
        return
    
    # Gets user input for the start word
    start_word = input('Enter the start word: ')
    
    # Generates the rest of the sentence
    generated_sentence = Generate(start_word, language_model)
    print('Generated Sentence:', generated_sentence)

print('generation is ready')

generation is ready


In [None]:
# Calls the main function to generate rest of sentence based on user choice of start word and language model
generate_sentence_with_model()

In [345]:
# Calculates probability of a given sentence
def Sen_Probability():
    
    # Asks user to input a sentence
    sentence = input('Enter a sentence: ')
    
    # Asks user to select a language model and converts to lower case
    model_name = input('Enter the language model (Vanilla, Laplace, or UNK): ').lower()
    model = None

    # Checks for keywords in model name to determine which model to use
    if "smooth" in model_name:
        model = Laplace_Model
    elif "unk" in model_name:
        model = UNK_Model
    else:
        model = Vanilla_Model
        
    # Checks if a valid model was selected
    if model is None:
        print('Invalid language model. Please choose from Vanilla, Smooth, or UNK.')
        return
    
    # Calculates the probability of the sentence using linear interpolation for the selected model
    probability = linear_interpolation(model['unigram_model'], model['bigram_model'], model['trigram_model'], sentence)
    print(f"Probability of the sentence ({model_name.capitalize()} Model):", probability)

# Calls function (for UNK model)
Sen_Probability()

Enter a sentence: kmieni din il- ġimgħa l- prim ministru joseph muscat semma l- viżjoni tiegħu li jibdel lil malta f' dubai jew singapore żewġ pajjiżi magħrufa għar- rikkezzi li għandhom iżda mhux biss
Enter the language model (Vanilla, Laplace, or UNK): unk
Probability of the sentence (Unk Model): 1.7736942880128425e-129


In [342]:
# Calls function (for Vanilla model)
Sen_Probability()

Enter a sentence: kmieni din il- ġimgħa l- prim ministru joseph muscat semma l- viżjoni tiegħu li jibdel lil malta f' dubai jew singapore żewġ pajjiżi magħrufa għar- rikkezzi li għandhom iżda mhux biss
Enter the language model (Vanilla, Laplace, or UNK): vanilla
Probability of the sentence (Vanilla Model): 1.1933663939151002e+81


In [343]:
# Calls function (for Laplace model)
Sen_Probability()

Enter a sentence: kmieni din il- ġimgħa l- prim ministru joseph muscat semma l- viżjoni tiegħu li jibdel lil malta f' dubai jew singapore żewġ pajjiżi magħrufa għar- rikkezzi li għandhom iżda mhux biss
Enter the language model (Vanilla, Laplace, or UNK): laplace
Probability of the sentence (Laplace Model): 1.1933663939151002e+81
