In [1]:
import nltk
from nltk.corpus import udhr

english = udhr.raw('English-Latin1')
french = udhr.raw("French_Francais-Latin1")
italian = udhr.raw("Italian_Italiano-Latin1")
spanish = udhr.raw("Spanish_Espanol-Latin1")

In [2]:
english_train, english_dev = english[0:1000], english[1000:1100]
french_train, french_dev = french[0:1000], french[1000:1100]
italian_train, italian_dev = italian[0:1000], italian[1000:1100]
spanish_train, spanish_dev = spanish[0:1000], spanish[1000:1100]
english_test = udhr.words('English-Latin1')[0:1000]
french_test = udhr.words('French_Francais-Latin1')[0:1000]
italian_test = udhr.words('Italian_Italiano-Latin1')[0:1000]
spanish_test = udhr.words('Spanish_Espanol-Latin1')[0:1000]

In [3]:
en_uni_freqs = nltk.FreqDist(english_train)
fr_uni_freqs = nltk.FreqDist(french_train)
it_uni_freqs = nltk.FreqDist(italian_train)
sp_uni_freqs = nltk.FreqDist(spanish_train)

In [4]:
en_bi_freqs = nltk.FreqDist(list(nltk.bigrams(english_train)))
fr_bi_freqs = nltk.FreqDist(list(nltk.bigrams(french_train)))
it_bi_freqs = nltk.FreqDist(list(nltk.bigrams(italian_train)))
sp_bi_freqs = nltk.FreqDist(list(nltk.bigrams(spanish_train)))

In [5]:
en_tri_freqs = nltk.FreqDist(list(nltk.trigrams(english_train)))
fr_tri_freqs = nltk.FreqDist(list(nltk.trigrams(french_train)))
it_tri_freqs = nltk.FreqDist(list(nltk.trigrams(italian_train)))
sp_tri_freqs = nltk.FreqDist(list(nltk.trigrams(spanish_train)))

In [6]:
def word_prob(word, dist):
    # given the frequency distribution, figure out if unigram, bigram...
    n = len(dist.most_common(1)[0][0])
    # calculate frequency
    result = 1.0
    for c in range(len(word)-n+1):
        result *= dist.freq(tuple(word[c:c+n]))
    # result never changed so 
    if result != 1.0:
        return result
    else:
        return 0.0

In [21]:
##Test code
#print("%12s|%12s|%12s|%12s|%12s|%12s|%12s" % ("word", "en_uni", "en_bi", "en_tri", "fr_uni", "fr_bi", "fr_tri"))
#print("="*90)
#for i in range(10):
#    print("%12s| %.8f | %.8f | %.8f | %.8f | %.8f | %.8f" %
#          (english_test[i],
#           word_prob(english_test[i],en_uni_freqs),
#           word_prob(english_test[i],en_bi_freqs),
#           word_prob(english_test[i],en_tri_freqs),
#           word_prob(english_test[i],fr_uni_freqs),
#           word_prob(english_test[i],fr_bi_freqs),
#           word_prob(english_test[i],fr_tri_freqs)))
    

In [23]:
## Get accuracies of each english and french model

en_uni_prob_sum = 0.0;
en_bi_prob_sum = 0.0;
en_tri_prob_sum = 0.0;
fr_uni_prob_sum = 0.0;
fr_bi_prob_sum = 0.0;
fr_tri_prob_sum = 0.0;

for word in english_test:
    en_uni_prob_sum += word_prob(word,en_uni_freqs)
    en_bi_prob_sum += word_prob(word,en_bi_freqs)
    en_tri_prob_sum += word_prob(word,en_tri_freqs)
    fr_uni_prob_sum += word_prob(word,fr_uni_freqs)
    fr_bi_prob_sum += word_prob(word,fr_bi_freqs)
    fr_tri_prob_sum += word_prob(word,fr_tri_freqs)

en_test_len = len(english_test)

en_uni_prob_avg = en_uni_prob_sum / en_test_len
en_bi_prob_avg = en_bi_prob_sum / en_test_len
en_tri_prob_avg = en_tri_prob_sum / en_test_len
fr_uni_prob_avg = fr_uni_prob_sum / en_test_len
fr_bi_prob_avg = fr_bi_prob_sum / en_test_len
fr_tri_prob_avg = fr_tri_prob_sum / en_test_len

print("Accuracy of each english and french model for english test set:\n")
print("%12s|%12s|%12s|%12s|%12s|%12s" % ("en_uni", "en_bi", "en_tri", "fr_uni", "fr_bi", "fr_tri"))
print("="*80)
print("%.10f|%.10f|%.10f|%.10f|%.10f|%.10f\n" %
      (en_uni_prob_avg, en_bi_prob_avg, en_tri_prob_avg, fr_uni_prob_avg, fr_bi_prob_avg, fr_tri_prob_avg))
print("Abs diff between uni models:", abs(en_uni_prob_avg-fr_uni_prob_avg))
print("Abs diff between bi models:", abs(en_bi_prob_avg-fr_bi_prob_avg))
print("Abs diff between tri models:", abs(en_tri_prob_avg-fr_tri_prob_avg))

Accuracy of each english and french model for english test set:

      en_uni|       en_bi|      en_tri|      fr_uni|       fr_bi|      fr_tri
0.0000000000|0.0015537524|0.0015622716|0.0000000000|0.0003338303|0.0000110221

Abs diff between uni models: 0.0
Abs diff between bi models: 0.0012199221306470594
Abs diff between tri models: 0.0015512495057428501


In [22]:
## Get accuracies of each spanish and italian model

it_uni_prob_sum = 0.0;
it_bi_prob_sum = 0.0;
it_tri_prob_sum = 0.0;
sp_uni_prob_sum = 0.0;
sp_bi_prob_sum = 0.0;
sp_tri_prob_sum = 0.0;

for word in italian_test:
    it_uni_prob_sum += word_prob(word,it_uni_freqs)
    it_bi_prob_sum += word_prob(word,it_bi_freqs)
    it_tri_prob_sum += word_prob(word,it_tri_freqs)
    sp_uni_prob_sum += word_prob(word,sp_uni_freqs)
    sp_bi_prob_sum += word_prob(word,sp_bi_freqs)
    sp_tri_prob_sum += word_prob(word,sp_tri_freqs)

it_test_len = len(italian_test)

it_uni_prob_avg = it_uni_prob_sum / it_test_len
it_bi_prob_avg = it_bi_prob_sum / it_test_len
it_tri_prob_avg = it_tri_prob_sum / it_test_len
sp_uni_prob_avg = sp_uni_prob_sum / it_test_len
sp_bi_prob_avg = sp_bi_prob_sum / it_test_len
sp_tri_prob_avg = sp_tri_prob_sum / it_test_len

print("Accuracy of each italian and spanish model for italian test set:\n")
print("%12s|%12s|%12s|%12s|%12s|%12s" % ("it_uni", "it_bi", "it_tri", "sp_uni", "sp_bi", "sp_tri"))
print("="*80)
print("%.10f|%.10f|%.10f|%.10f|%.10f|%.10f\n" %
      (it_uni_prob_avg, it_bi_prob_avg, it_tri_prob_avg, sp_uni_prob_avg, sp_bi_prob_avg, sp_tri_prob_avg))
print("Abs diff between uni models:", abs(it_uni_prob_avg-sp_uni_prob_avg))
print("Abs diff between bi models:", abs(it_bi_prob_avg-sp_bi_prob_avg))
print("Abs diff between tri models:", abs(it_tri_prob_avg-sp_tri_prob_avg))

Accuracy of each italian and spanish model for italian test set:

      it_uni|       it_bi|      it_tri|      sp_uni|       sp_bi|      sp_tri
0.0000000000|0.0010674431|0.0002931043|0.0000000000|0.0006732689|0.0000180384
