In [1]:
import math
import pickle
from collections import Counter

import nltk
from nltk.tokenize import word_tokenize
from nltk.util import ngrams

In [2]:
## Program 1
## 1.a
def getDicts(filename): ## create the function, takes in the training filename
    """Build unigram and bigram count dictionaries from a training text file.

    Parameters
    ----------
    filename : str
        Path of the UTF-8 encoded training file.

    Returns
    -------
    tuple of (dict, dict)
        ``(unigram_dict, bigram_dict)``. The first maps each token to the number
        of times it occurs in the file; the second maps each 2-tuple of adjacent
        tokens to the number of times that pair occurs.
    """

    ## 1.b
    ## read the entire file; the with-block closes it even if reading fails
    with open(filename, "r", encoding='utf-8') as f:
        text = f.read()
    ## The newlines are deliberately left in the text. The earlier
    ## `text.replace('\n', '')` call discarded its result (str is immutable), so
    ## the tokenizer always received them anyway, and really deleting them would
    ## glue the last word of one line to the first word of the next.

    ## 1.c
    tokens = word_tokenize(text) ## tokenize the words

    ## 1.d
    bigrams = list(ngrams(tokens, 2)) ## create the list of bigrams

    ## 1.e
    unigrams = tokens ## the unigrams are simply the tokens

    ## 1.f
    ## Counter tallies everything in a single pass; calling list.count() once per
    ## distinct item rescanned the whole list every time.
    bigram_dict = dict(Counter(bigrams))

    ## 1.g
    unigram_dict = dict(Counter(unigrams))

    ## 1.h
    return unigram_dict, bigram_dict ## return the unigram and bigram dicts

In [9]:
## Program 2
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``, closing the file afterwards."""
    ## NOTE(review): pickle.load can execute arbitrary code from the file, so only
    ## point this at pickle files that this notebook produced itself.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def getPreds(filename, outputfile):
    """Predict the language of every line of a test file with add-one smoothed bigrams.

    The unigram/bigram count dictionaries are read from the pickle files
    ``uni_eng.p``, ``bi_eng.p``, ``uni_fr.p``, ``bi_fr.p``, ``uni_it.p`` and
    ``bi_it.p`` in the current working directory. For each line, every bigram
    contributes the factor ``(bigram_count + 1) / (first_token_count + vocab_size)``
    per language. The line and the three resulting probabilities are printed.

    The winning language is chosen by comparing summed log-probabilities rather
    than the raw products: on long lines the products underflow to 0.0 for all
    languages, which previously made every comparison fail and always yielded
    "Italian". The printed probabilities are still the plain products.

    Parameters
    ----------
    filename : str
        Path of the UTF-8 encoded test file, one sample per line.
    outputfile : str
        Path of the file to write; line ``k`` is ``"<k> <language>"``.

    Returns
    -------
    list of str
        One of "English", "French" or "Italian" per input line, in file order.
        When scores tie, the language listed later in that order wins.
    """

    ## 2.a
    ## unpickle the dictionaries and load them in: (label, unigram counts, bigram counts)
    models = [
        ("English", _load_pickle('uni_eng.p'), _load_pickle('bi_eng.p')),
        ("French", _load_pickle('uni_fr.p'), _load_pickle('bi_fr.p')),
        ("Italian", _load_pickle('uni_it.p'), _load_pickle('bi_it.p')),
    ]
    ## the vocabulary sizes never change, so compute them once instead of per bigram
    vocab_sizes = [len(uni_counts) for _, uni_counts, _ in models]

    ## 2.b
    preds = [] ## list to hold the predictions
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f: ## go by line to predict

            tokens = word_tokenize(line) ## tokenize the line
            line_bigrams = list(ngrams(tokens, 2)) ## get the bigrams from the line

            probs = [] ## plain products, kept only for the printed report
            log_scores = [] ## sums of logs, used for the decision (no underflow)
            for (_, uni_counts, bi_counts), vocab_size in zip(models, vocab_sizes):
                prob = 1
                log_score = 0.0
                for bigram in line_bigrams:
                    b_count = bi_counts.get(bigram, 0) ## bigram occurrences in training
                    u_count = uni_counts.get(bigram[0], 0) ## occurrences of its first token
                    ## Laplace (add-one) smoothed estimate for this bigram
                    factor = (b_count + 1) / (u_count + vocab_size)
                    prob = prob * factor
                    log_score += math.log(factor)
                probs.append(prob)
                log_scores.append(log_score)

            print(line)
            for (label, _, _), prob in zip(models, probs):
                print(label + " probability : " + str(prob))
            print()

            ## Pick the best-scoring language. Using >= reproduces the original
            ## if/elif/else tie-breaking: on equal scores the later language wins.
            best = 0
            for idx in range(1, len(models)):
                if log_scores[idx] >= log_scores[best]:
                    best = idx
            preds.append(models[best][0]) ## append the prediction to the list

    with open(outputfile, 'w') as out: ## open the output file
        for number, label in enumerate(preds, start=1): ## iterate through the predictions
            out.write(str(number) + ' ' + label + '\n') ## write "<line number> <language>"

    return(preds) ## return the predictions

In [4]:
## 2.b
def getAccuracy(preds, filename):
    """Compare predictions against a solution file.

    Every character that is a digit and every space is removed from the
    solution file, and the remaining text of each line is taken as the expected
    label (so a line such as ``"12 English"`` becomes ``"English"``).

    Parameters
    ----------
    preds : list of str
        Predicted labels; ``preds[i]`` is compared with line ``i + 1`` of the file.
    filename : str
        Path of the UTF-8 encoded solution file.

    Returns
    -------
    tuple of (float, list of int)
        ``(accuracy, incorrect)``: the fraction of matching predictions and the
        1-based line numbers of the mismatches. Empty ``preds`` gives ``(0.0, [])``.

    Raises
    ------
    IndexError
        If ``preds`` has more entries than the solution file has lines.
    """

    ## read the actual language file; the with-block guarantees it gets closed
    with open(filename, 'r', encoding='utf-8') as f:
        raw = f.read()

    ## strip the digits and the spaces from every line, leaving only the label
    actual = [
        ''.join(ch for ch in row if not ch.isdigit()).replace(' ', '')
        for row in raw.split('\n')
    ]

    ## nothing to score: avoid dividing by zero below
    if len(preds) == 0:
        return 0.0, []

    correct = 0 ## count the correct predictions
    incorrect = [] ## keep track of the incorrect line numbers
    for idx, pred in enumerate(preds): ## iterate through the predictions
        if pred == actual[idx]: ## check if pred == actual
            correct = correct + 1
        else:
            incorrect.append(idx + 1) ## line numbers are 1-based

    accuracy = correct/len(preds) ## calculate the accuracy

    return accuracy, incorrect ## return the accuracy and the incorrect lines

In [None]:
if __name__ == '__main__':
    ## i
    ## build the unigram and bigram count dictionaries for each training language
    uni_eng, bi_eng = getDicts('ngram_files/LangId.train.English') ## create the english dicts
    uni_fr, bi_fr = getDicts('ngram_files/LangId.train.French') ## create the french dicts
    uni_it, bi_it = getDicts('ngram_files/LangId.train.Italian') ## create the italian dicts

    ## pickle all the created dictionaries; each file is opened in a with-block so
    ## it is flushed and closed as soon as its dictionary has been written
    for counts, pickle_path in [
        (uni_eng, "uni_eng.p"),
        (bi_eng, "bi_eng.p"),
        (uni_fr, "uni_fr.p"),
        (bi_fr, "bi_fr.p"),
        (uni_it, "uni_it.p"),
        (bi_it, "bi_it.p"),
    ]:
        with open(pickle_path, "wb") as pickle_file:
            pickle.dump(counts, pickle_file)
    

In [10]:
    ## classify every line of the test file; the numbered predictions go to output.txt
    preds = getPreds('ngram_files/LangId.test', 'output.txt')
    ## score the predictions against the solution file
    accuracy, incorrect = getAccuracy(preds, 'ngram_files/LangId.sol')
    ## 2.c
    ## report the overall accuracy first ...
    print("The accuracy we get is " + str(accuracy))
    ## ... then the 1-based number of each misclassified line, one per row
    print("We have incorrect classifications on the following lines :")
    for line_number in incorrect:
        print(line_number)

Signora Presidente , vorrei sapere perché questo Parlamento non rispetta le norme in materia di salute e sicurezza che esso stesso approva .

English probability : 1.6483497316673794e-85
French probability : 1.9564927480913856e-87
Italian probability : 1.7857540943873623e-73

Is there a member who wishes to speak on behalf of this Group to propose this ?

English probability : 2.2058752668654216e-53
French probability : 1.2411620165157507e-63
Italian probability : 2.404282325955855e-64

Signora Presidente , sarebbe opportuno che inviasse una lettera alla Presidente del Sri Lanka per esprimere le condoglianze del Parlamento per questa e le altre morti violente verificatesi in Sri Lanka e per invitarla a fare quanto in suo potere al fine di giungere a una riconciliazione pacifica in questa situazione assai difficile .

English probability : 6.72661730819385e-205
French probability : 2.2500170078920348e-209
Italian probability : 2.994180710850441e-188

C ' est en effet tout à fait dans la

Italian probability : 4.186360220912136e-08

Tali risultati sono alla base dei programmi europei di tutela del Mare di Barents . La prego pertanto di prendere in esame la bozza della lettera in cui vengono indicati i fatti principali e di sostenere presso le autorità russe la posizione assunta dal Parlamento , conformemente alle sue risoluzioni .

English probability : 2.7269930239452634e-193
French probability : 1.111169932890823e-195
Italian probability : 9.523127148039839e-180

Oui , Madame Schroedter , j ' examinerai bien volontiers les faits relatifs à cette question lorsque j ' aurai reçu votre lettre .

English probability : 1.727164873010875e-89
French probability : 1.1282987241140441e-79
Italian probability : 1.3187052119869256e-92

Noi tutti siamo lieti che il tribunale lo abbia assolto , ribadendo che anche in Russia l ' accesso a informazioni sull ' ambiente è un diritto sancito dalla costituzione .

English probability : 2.427652056158095e-116
French probability : 4.457525