## N-Grams
#### Andrew Grant

### Load Packages

In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
import nltk

### Load English and Italian Datasets

In [7]:

# Load English df

df_en = pd.read_csv('data/CONcreTEXT_trial_EN.tsv' ,sep='\t')

# Load Italian df

df_it = pd.read_csv('data/CONcreTEXT_trial_IT.tsv' ,sep='\t')

In [8]:
df_en


Unnamed: 0,TARGET,POS,INDEX,TEXT,MEAN
0,achievement,N,3,"Bring up academic achievements , awards , and ...",3.06
1,achievement,N,9,"Please list people you have helped , your pers...",3.03
2,activate,V,1,Add activated carbon straight to your vodka .,3.83
3,activate,V,15,"Place sensors around your garden , and when a ...",5.51
4,adventure,N,9,Look for a partner that shares your level of a...,2.03
...,...,...,...,...,...
95,water,N,5,Rinse your face with warm water and pat it dry .,6.91
96,win,V,4,Staying mentally strong means winning half the...,2.34
97,win,V,7,The person who has the highest score wins the ...,4.60
98,woman,N,7,"For the most part , men and women wear the sam...",6.29


### Create a list of words with no punctuation and all lowercase

In [9]:
# Import a list of punctuations
from string import punctuation


# Function to change words to lowercase
def lower_text(text):
    #Make text lowercase 
    text = text.lower()
    return text

# function to remove punctuation
def punc_rem(text):
    text.replace("'s", "")
    return ''.join(word for word in text if word not in punctuation)

# lambda function versions for the above functions
low_clean = lambda x: lower_text(x)
punc_clean = lambda x: punc_rem(x)

In [10]:
# Applying the lowercase and remove punctuation functions to the English data frame
df_en_clean = pd.DataFrame(df_en['TEXT'].apply(low_clean).apply(punc_clean))

# Joining all the lines of "TEXT" in the English df into one corpus
en_clean_text = " ".join(text for text in df_en_clean['TEXT'])


# Applying the lowercase and remove punctuation functions to the Italian data frame
df_it_clean = pd.DataFrame(df_it['TEXT'].apply(low_clean).apply(punc_clean))

# Joining all the lines of "TEXT" in the Italian df into one corpus
it_clean_text = " ".join(text for text in df_it_clean['TEXT'])

In [11]:
# Import tokenizer
from nltk.tokenize import RegexpTokenizer

# Setting up tokenizer
tokenizer = RegexpTokenizer(r'\w+')

# Tokenizing the english corpus
en_tok_list = tokenizer.tokenize(en_clean_text)

# Tokenizing the italian corpus
it_tok_list = tokenizer.tokenize(it_clean_text)


### Create Training and Test Lists

In [12]:

from sklearn.model_selection import train_test_split

# use the sklearn train_test_split function to randomly split the tokenized words into train and test lists
train_en, test_en = train_test_split(en_tok_list, test_size= 0.2)
train_it, test_it = train_test_split(it_tok_list, test_size= 0.2)

#### -----------------------------------------------------------------------------------------------------------------------------

### Creating the Unigram Model

#### ** Please note that capitalized letters were all lowercased for the training and test sets.  This was based on the size of the training set as I didn't think there were enough words for the capatalizations to add value without also potentially throwing some unwanted bias into the models

In [13]:

from nltk import ngrams
from collections import Counter, defaultdict

In [14]:

# Function to change the list of tokenized words to a string with no blank spaces
# this is only used for the unigrams to more easily find the letter frequencies
def toString(List): 
    return ''.join(List) 

# apply to string function to the two training word lists
train_en_txt = toString(train_en)
train_it_txt = toString(train_it)


In [15]:

# finding the count of each letter in the string created from the training lists
en_count_sorted = Counter(train_en_txt).most_common()
it_count_sorted = Counter(train_it_txt).most_common()

# using the len function to find the total count of letters in both training lists of words
en_total_count = len(train_en_txt)    
it_total_count = len(train_it_txt)  


In [16]:

# function that creates a list that will have a tupple made up of the letter and its frequency in the training string

def unigram_freq_counter(let_list, text):
    let_probs = []
    total_count = len(text)   
    for letter, count in let_list:
        freq = count/total_count
        let_probs.append([letter, freq])
    
    return let_probs


In [17]:

# apply the above function to both english and italian training list of words strings

en_prob_list = unigram_freq_counter(en_count_sorted, train_en_txt)

it_prob_list = unigram_freq_counter(it_count_sorted, train_it_txt)



In [19]:

# converting lists to a dictionaries so that it can be more easily read by the model function

def listtodict(A): 
    model = dict(A) 
    return model

en_model_1 = listtodict(en_prob_list)
it_model_1 = listtodict(it_prob_list)
en_model_1


{'e': 0.11504802561366062,
 'o': 0.09007470651013874,
 't': 0.08687299893276414,
 'a': 0.08153681963713981,
 'n': 0.07065101387406617,
 'i': 0.06894343649946638,
 'r': 0.06446104589114195,
 's': 0.060832443970117396,
 'h': 0.04525080042689435,
 'l': 0.04503735325506937,
 'u': 0.03457844183564568,
 'y': 0.032017075773746,
 'd': 0.03030949839914621,
 'c': 0.0288153681963714,
 'f': 0.024759871931696906,
 'p': 0.024332977588046957,
 'm': 0.023265741728922092,
 'g': 0.018356456776947704,
 'w': 0.015581643543223053,
 'b': 0.013233724653148345,
 'v': 0.011739594450373533,
 'k': 0.008537886872998933,
 'x': 0.00192102454642476,
 'z': 0.0017075773745997866,
 'j': 0.0010672358591248667,
 'q': 0.0008537886872998933,
 'é': 0.00021344717182497332}

In [13]:

# model to predict the language of the words in the English test list of words
    
def unigram_language_predictor(en_model,it_model, test_listofwords):
    
    # create empty results list to be appended to
    results = []
     
    # for loop that goes through all of the words in the test list of words
    for word in test_listofwords:
        
        # initially setting the probability of the word being engluish and italian to 1
        en_prob = 1
        it_prob =1
         
        # looping through the letters in each word    
        for let1 in word:
            
            # if else clause incase the letter is not a key in the dictionary from the english training list of words
            if let1 not in en_model.keys():
                en_prob = en_prob
            else:
                # model creates a probability of the word being english by multiplying the letter frequncies together
                en_prob = en_prob*en_model[let1]
            
            
            # if else clause incase the letter is not a letter key in the dictionary from the italian training list of words    
            if let1 not in it_model.keys():
                it_prob = it_prob
            else:
                # model creates a probability of the word being italian by multiplying the letter frequncies together
                it_prob = it_prob*it_model[let1]
         
        # if else statement to choose which language has a higher probability and assign that value to a variable
        if en_prob >= it_prob:
            en_flag = 1
            pred_lang= 'English'
        else:
            en_flag = 0
            pred_lang = 'Italian'
        
        # append the results to a results list created at the start of the function
        results.append([word, en_prob, it_prob, en_flag, pred_lang])
            
    return results        

In [14]:
# run predictor using the english and italian frequency dictionaries and the english test list of words

unigram_res = unigram_language_predictor(en_model_1, it_model_1, test_en)

In [15]:
# create a dataframe from the resulting list

df_unigram_res = pd.DataFrame(unigram_res,columns = ['word','en_prob','it_prob','english','language prediction'])

df_unigram_res.head(20)

Unnamed: 0,word,en_prob,it_prob,english,language prediction
0,a,0.0791889,0.1073034,0,Italian
1,for,0.0001315209,8.902523e-05,1,English
2,of,0.002177298,0.001260994,1,English
3,your,5.783464e-06,0.0002168563,0,Italian
4,a,0.0791889,0.1073034,0,Italian
5,street,3.802466e-07,2.225793e-07,1,English
6,easier,2.778611e-07,5.554455e-07,0,Italian
7,help,6.315489e-06,1.923168e-06,1,English
8,physical,1.494797e-11,3.788034e-10,0,Italian
9,thoughts,6.026386e-11,6.365914e-13,1,English


In [16]:
# count the number of words classified as an english word
unigram_eng_pos = df_unigram_res.english.sum()

# count the total number of words in the english test list of words
unigram_total_words = len(df_unigram_res.index)

# calculate the accuracy of the unigram model
unigram_model_accuracy = unigram_eng_pos/unigram_total_words

unigram_model_accuracy

0.3688212927756654

#### ----------------------------------------------------------------------------------------------------------------------------------

### Creating the Bigram Model

In [17]:
# Create an empty dictionary inside of a dictionary
en_model_2 = defaultdict(lambda: defaultdict(lambda: 0))

# loop through the words in the english training list of words 
for words in train_en:
    
    # using the ngrams package loop through the bigrams and create a dictionary of counts for each bigram
    for let1, let2 in ngrams(words, 2):
        en_model_2[(let1)][let2] += 1


# transform the counts to probabilities
#loop through each key letter in the bigram model
for let1 in en_model_2:
    
    # creates a total count of the values under the key letter
    total_count = float(sum(en_model_2[let1].values()))
    
    # loops through the second letters of the bigrams for each first letter
    for let2 in en_model_2[let1]:
        
        # divides the second letter count by the total count of the first letter in the bigram 
        en_model_2[let1][let2] /= total_count
        
        
en_model_2

defaultdict(<function __main__.<lambda>()>,
            {'e': defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>,
                         {'x': 0.019444444444444445,
                          'r': 0.18055555555555555,
                          's': 0.14722222222222223,
                          'o': 0.013888888888888888,
                          'c': 0.03888888888888889,
                          'd': 0.075,
                          'b': 0.002777777777777778,
                          'a': 0.09166666666666666,
                          'n': 0.14722222222222223,
                          'm': 0.027777777777777776,
                          'i': 0.016666666666666666,
                          't': 0.03611111111111111,
                          'l': 0.06666666666666667,
                          'e': 0.03611111111111111,
                          'g': 0.011111111111111112,
                          'v': 0.030555555555555555,
                          'f': 0.005555555555555556

In [18]:
# Create an empty dictionary inside of a dictionary
it_model_2 = defaultdict(lambda: defaultdict(lambda: 0))

# loop through the words in the italian training list of words 
for words in train_it:
    
    # using the ngrams package loop through the bigrams and create a dictionary of counts for each bigram
    for let1, let2 in ngrams(words, 2):
        it_model_2[(let1)][let2] += 1


# transform the counts to probabilities
#loop through each key letter in the bigram model
for let1 in it_model_2:
    
    # creates a total count of the values under the key letter
    total_count = float(sum(it_model_2[let1].values()))
    
    # loops through the second letters of the bigrams for each first letter
    for let2 in it_model_2[let1]:
            
        # divides the second letter count by the total count of the first letter in the bigram 
        it_model_2[let1][let2] /= total_count
        
        
it_model_2

defaultdict(<function __main__.<lambda>()>,
            {'c': defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>,
                         {'h': 0.14601769911504425,
                          'u': 0.07079646017699115,
                          'i': 0.15486725663716813,
                          'a': 0.11946902654867257,
                          'e': 0.11946902654867257,
                          'o': 0.2831858407079646,
                          'r': 0.022123893805309734,
                          'c': 0.06637168141592921,
                          'q': 0.008849557522123894,
                          'l': 0.008849557522123894}),
             'h': defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>,
                         {'e': 0.5121951219512195,
                          'a': 0.14634146341463414,
                          'i': 0.3170731707317073,
                          'é': 0.024390243902439025}),
             'd': defaultdict(<function __main__.<lambda>.<loca

In [19]:
# predictor function that uses bigram model to predict the language of the words in the English test list of words
def bigram_language_predictor(en_model,it_model, test_listofwords):
    
    # create empty results list to be appended to
    results = []
    
    # loop that goes through all of the words in the test list of words
    for word in test_listofwords:
        
        # initially setting the probability of the word being engluish and italian to 1    
        en_prob = 1
        it_prob =1
        
        # looping through the bigrams in each word
        for let1, let2 in ngrams(word, 2):
            # if else clause incase the letter is not a key in the dictionary from the english training list of words
            if let1 not in en_model.keys():
                en_prob = en_prob
            else: 
                # model creates a probability of if the test word is english by multiplying the bigram frequncies together
                en_prob = en_prob*en_model[let1][let2]
            
            
            # if else clause incase the letter is not a letter key in the dictionary from the italian training list of words    
            if let1 not in it_model.keys():
                it_prob = it_prob
            else:
                # model creates a probability of the word being italian by multiplying the letter frequncies together    
                it_prob = it_prob*it_model[let1][let2]
        
        # if else statement to flag which language has a higher probability and assign that language to a variable
        if en_prob >= it_prob:
            en_flag = 1
            pred_lang= 'English'
        else:
            en_flag = 0
            pred_lang = 'Italian'
        
        # append the results to a results list created at the start of the function
        results.append([word, en_prob, it_prob, en_flag, pred_lang])
            
    return results       



In [20]:

# run predictor using the english and italian bigram frequency dictionaries and the english test list of words
bigram_res = bigram_language_predictor(en_model_2,it_model_2, test_en)

bigram_res

[['a', 1, 1, 1, 'English'],
 ['for', 0.027439024390243903, 0.024916190993929506, 1, 'English'],
 ['of', 0.09166666666666666, 0.028268551236749116, 1, 'English'],
 ['your', 0.06438071895424836, 0.0, 1, 'English'],
 ['a', 1, 1, 1, 'English'],
 ['street', 4.74076873398168e-06, 1.234781551602595e-06, 1, 'English'],
 ['easier', 3.7988594656206426e-06, 1.388353700182376e-06, 1, 'English'],
 ['help', 0.0012631330421437846, 0.0007591266326286009, 1, 'English'],
 ['physical', 3.0745259615610137e-10, 0.0, 1, 'English'],
 ['thoughts', 2.2418786031029916e-08, 0.0, 1, 'English'],
 ['is', 0.10289389067524116, 0.0395778364116095, 1, 'English'],
 ['on', 0.11944444444444445, 0.26855123674911663, 0, 'Italian'],
 ['areas', 0.00021156109820888872, 7.26342561955914e-05, 1, 'English'],
 ['for', 0.027439024390243903, 0.024916190993929506, 1, 'English'],
 ['down', 8.169934640522875e-05, 0.0, 1, 'English'],
 ['to', 0.12974683544303797, 0.14634146341463414, 0, 'Italian'],
 ['websites', 0.0, 0.0, 1, 'English'],


In [21]:

# create a dataframe from the resulting list
df_bigram_res = pd.DataFrame(bigram_res,columns = ['word','en_prob','it_prob','english','language prediction'])

df_bigram_res.head(20)

Unnamed: 0,word,en_prob,it_prob,english,language prediction
0,a,1.0,1.0,1,English
1,for,0.02743902,0.024916,1,English
2,of,0.09166667,0.028269,1,English
3,your,0.06438072,0.0,1,English
4,a,1.0,1.0,1,English
5,street,4.740769e-06,1e-06,1,English
6,easier,3.798859e-06,1e-06,1,English
7,help,0.001263133,0.000759,1,English
8,physical,3.074526e-10,0.0,1,English
9,thoughts,2.241879e-08,0.0,1,English


In [22]:

# count the number of words classified as an english word
bigram_eng_pos = df_bigram_res.english.sum()

# count the number of words in the df
bigram_total_words = len(df_bigram_res.index)

# calculate the accuracy of the bigram model
bigram_accuracy = bigram_eng_pos/bigram_total_words

bigram_accuracy


0.7338403041825095

#### ---------------------------------------------------------------------------------------------------------------------------------------

### Trigram Model

In [23]:
# Create an empty dictionary inside of a dictionary
en_model_3 = defaultdict(lambda: defaultdict(lambda: 0))

# loop through the words in the english training list of words 
for words in train_en:
    
    # using the ngrams package loop through the trigrams and create a dictionary of counts for each trigram  
    for let1, let2, let3 in ngrams(words, 3):
        en_model_3[(let1,let2)][let3] += 1


# transform the counts to probabilities
#loop through each key letter in the bigram model
for let1_let2 in en_model_3:
    
    # creates a total count of the values under the key letters
    total_count = float(sum(en_model_3[let1_let2].values()))
    
    # loops through the second letters of the bigrams for each first letter
    for let3 in en_model_3[let1_let2]:
        
        # divides the third letter count by the total count of the first two letter key in the trigram
        en_model_3[let1_let2][let3] /= total_count
        
        
en_model_3

defaultdict(<function __main__.<lambda>()>,
            {('e',
              'x'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>, {'p': 0.5714285714285714,
                          'i': 0.14285714285714285,
                          'a': 0.14285714285714285,
                          't': 0.14285714285714285}),
             ('x',
              'p'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>, {'a': 0.25,
                          'e': 0.75}),
             ('p',
              'a'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>, {'n': 0.16666666666666666,
                          't': 0.05555555555555555,
                          'r': 0.3888888888888889,
                          's': 0.1111111111111111,
                          'c': 0.16666666666666666,
                          'p': 0.05555555555555555,
                          'i': 0.05555555555555555}),
             ('a',
              'n'): defaultdict(<function __main__.<lambda>.

In [24]:
# Create an empty dictionary inside of a dictionary
it_model_3 = defaultdict(lambda: defaultdict(lambda: 0))

# loop through the words in the italian training list of words  
for words in train_it:
    
    # using the ngrams package loop through the trigrams and create a dictionary of counts for each trigram
    for let1, let2, let3 in ngrams(words, 3):
        it_model_3[(let1,let2)][let3] += 1


# transform the counts to probabilities
#loop through each key letter in the bigram model
for let1_let2 in it_model_3:
    
    # creates a total count of the values under the key letters
    total_count = float(sum(it_model_3[let1_let2].values()))
    
    # loops through the second letters of the bigrams for each first letter
    for let3 in it_model_3[let1_let2]:
        
        # divides the third letter count by the total count of the first two letter key in the trigram
        it_model_3[let1_let2][let3] /= total_count
        
        
it_model_3

defaultdict(<function __main__.<lambda>()>,
            {('c',
              'h'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>, {'e': 0.6363636363636364,
                          'i': 0.3333333333333333,
                          'é': 0.030303030303030304}),
             ('d',
              'e'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>, {'l': 0.4827586206896552,
                          'r': 0.13793103448275862,
                          'n': 0.05172413793103448,
                          'a': 0.017241379310344827,
                          'c': 0.034482758620689655,
                          'v': 0.1206896551724138,
                          't': 0.05172413793103448,
                          'i': 0.05172413793103448,
                          'e': 0.017241379310344827,
                          'g': 0.017241379310344827,
                          'f': 0.017241379310344827}),
             ('e',
              'l'): defaultdict(<function __main__.

In [25]:
# predictor function that uses trigram model to predict the language of the words in the English test list of words
def trigram_language_predictor(en_model,it_model, test_listofwords):
    
    # create empty results list to be appended to
    results = []
    
    # loop that goes through all of the words in the test list of words    
    for word in test_listofwords:
        
        # initially setting the probability of the word being engluish and italian to 1    
        en_prob = 1
        it_prob =1
        
        # looping through the trigrams in each word
        for let1, let2, let3 in ngrams(word, 3):
            
            # if else clause incase the two first letters is not a key in the dictionary from the english training list of words
            if (let1, let2) not in en_model.keys():
                en_prob = en_prob
            else: 
                # model creates a probability of if the test word is english by multiplying the trigram frequncies together
                en_prob = en_prob * en_model[(let1,let2)][let3]
                    
                
            # if else clause incase the two first letters is not a key in the dictionary from the english training list of words    
            if (let1,let2) not in it_model.keys():
                it_prob = it_prob
            else:
                # model creates a probability of if the test word is english by multiplying the trigram frequncies together
                it_prob = it_prob * it_model[(let1,let2)][let3]
        
        # if else statement to flag which language has a higher probability and assign that language to a variable
        if en_prob >= it_prob:
            en_flag = 1
            pred_lang= 'English'
        else:
            en_flag = 0
            pred_lang = 'Italian'
        
        # append the results to a results list created at the start of the function
        results.append([word,en_prob, it_prob, en_flag, pred_lang])
            
    return results    



In [26]:

# run predictor using the english and italian trigram frequency dictionaries and the english test list of words
trigram_res = trigram_language_predictor(en_model_3,it_model_3, test_en)

trigram_res

[['a', 1, 1, 1, 'English'],
 ['for', 0.6666666666666666, 0.6, 1, 'English'],
 ['of', 1, 1, 1, 'English'],
 ['your', 0.5084745762711864, 1, 0, 'Italian'],
 ['a', 1, 1, 1, 'English'],
 ['street', 4.3841189674523005e-05, 0.0, 1, 'English'],
 ['easier', 0.0, 0.0, 1, 'English'],
 ['help', 0.036, 0, 1, 'English'],
 ['physical', 0.0, 0.002184002184002184, 0, 'Italian'],
 ['thoughts', 0.0, 0, 1, 'English'],
 ['is', 1, 1, 1, 'English'],
 ['on', 1, 1, 1, 'English'],
 ['areas', 0.00916138125440451, 0.0, 1, 'English'],
 ['for', 0.6666666666666666, 0.6, 1, 'English'],
 ['down', 0.0, 0, 1, 'English'],
 ['to', 1, 1, 1, 'English'],
 ['websites', 0.0, 0.0, 1, 'English'],
 ['pizzas', 0.013888888888888888, 0.0, 1, 'English'],
 ['other', 0.1157938144329897, 0.0, 1, 'English'],
 ['travels', 0.0, 0.0, 1, 'English'],
 ['ones', 0.04081632653061224, 0.045112781954887216, 0, 'Italian'],
 ['to', 1, 1, 1, 'English'],
 ['this', 0.002689376961004034, 0, 1, 'English'],
 ['interest', 0.0003536003536003536, 0.00016269

In [27]:

# create a dataframe from the resulting list
df_trigram_res = pd.DataFrame(trigram_res,columns = ['word','en_prob','it_prob','english','language prediction'])

df_trigram_res.head(20)

Unnamed: 0,word,en_prob,it_prob,english,language prediction
0,a,1.0,1.0,1,English
1,for,0.666667,0.6,1,English
2,of,1.0,1.0,1,English
3,your,0.508475,1.0,0,Italian
4,a,1.0,1.0,1,English
5,street,4.4e-05,0.0,1,English
6,easier,0.0,0.0,1,English
7,help,0.036,0.0,1,English
8,physical,0.0,0.002184,0,Italian
9,thoughts,0.0,0.0,1,English


In [28]:

# count the number of words classified as an english word
eng_pos = df_trigram_res.english.sum()

# count the number of words in the df
total_words = len(df_trigram_res.index)

# calculate the accuracy of the trigram model
accuracy_trigram = eng_pos/total_words

accuracy_trigram

0.8060836501901141