In [1]:
import nltk 

In [2]:
from nltk.corpus import udhr 
from nltk import word_tokenize
from nltk.util import ngrams
from collections import Counter

In [3]:
nltk.download('udhr')

[nltk_data] Downloading package udhr to
[nltk_data]     C:\Users\Anjali\AppData\Roaming\nltk_data...
[nltk_data]   Package udhr is already up-to-date!


True

In [4]:
english = udhr.raw('English-Latin1')
french = udhr.raw('French_Francais-Latin1')
italian = udhr.raw('Italian_Italiano-Latin1')
spanish = udhr.raw('Spanish_Espanol-Latin1')

### Preprocessing

In [5]:
# Creating Training Dataset and Developement dataset for English,French, Italian, Spanish
english_train, english_dev = english[0:1000], english[1000:1100]
french_train, french_dev = french[0:1000], french[1000:1100]
italian_train, italian_dev = italian[0:1000], italian[1000:1100]
spanish_train, spanish_dev = spanish[0:1000], spanish[1000:1100] 

In [6]:
# Creating Test Dataset for English,French, Italian, Spanish
english_test = udhr.words('English-Latin1')[0:1000]
french_test = udhr.words('French_Francais-Latin1')[0:1000]
italian_test = udhr.words('Italian_Italiano-Latin1')[0:1000]
spanish_test = udhr.words('Spanish_Espanol-Latin1')[0:1000]

In [8]:
# Preprocessing the Corpus
import string

In [9]:
# Processing English Corpus and lower case all words

english_train_words = english_train.split()
english_train_words = [word.lower() for word in english_train_words]
# remove punctuation from each word
table = str.maketrans('', '', string.punctuation)
english_train = [w.translate(table) for w in english_train_words]
english_test = [token.lower() for token in english_test]
print(english_train[:100])

['universal', 'declaration', 'of', 'human', 'rights', 'preamble', 'whereas', 'recognition', 'of', 'the', 'inherent', 'dignity', 'and', 'of', 'the', 'equal', 'and', 'inalienable', 'rights', 'of', 'all', 'members', 'of', 'the', 'human', 'family', 'is', 'the', 'foundation', 'of', 'freedom', 'justice', 'and', 'peace', 'in', 'the', 'world', 'whereas', 'disregard', 'and', 'contempt', 'for', 'human', 'rights', 'have', 'resulted', 'in', 'barbarous', 'acts', 'which', 'have', 'outraged', 'the', 'conscience', 'of', 'mankind', 'and', 'the', 'advent', 'of', 'a', 'world', 'in', 'which', 'human', 'beings', 'shall', 'enjoy', 'freedom', 'of', 'speech', 'and', 'belief', 'and', 'freedom', 'from', 'fear', 'and', 'want', 'has', 'been', 'proclaimed', 'as', 'the', 'highest', 'aspiration', 'of', 'the', 'common', 'people', 'whereas', 'it', 'is', 'essential', 'if', 'man', 'is', 'not', 'to', 'be']


In [10]:
# Processing French Corpus and lower case all words

french_train_words = french_train.split()
french_train_words = [word.lower() for word in french_train_words]
# remove punctuation from each word
table = str.maketrans('', '', string.punctuation)
french_train = [w.translate(table) for w in french_train_words]
french_test = [token.lower() for token in french_test]
print(french_train[:100])

['déclaration', 'universelle', 'des', 'droits', 'de', 'lhomme', 'préambule', 'considérant', 'que', 'la', 'reconnaissance', 'de', 'la', 'dignité', 'inhérente', 'à', 'tous', 'les', 'membres', 'de', 'la', 'famille', 'humaine', 'et', 'de', 'leurs', 'droits', 'égaux', 'et', 'inaliénables', 'constitue', 'le', 'fondement', 'de', 'la', 'liberté', 'de', 'la', 'justice', 'et', 'de', 'la', 'paix', 'dans', 'le', 'monde', 'considérant', 'que', 'la', 'méconnaissance', 'et', 'le', 'mépris', 'des', 'droits', 'de', 'lhomme', 'ont', 'conduit', 'à', 'des', 'actes', 'de', 'barbarie', 'qui', 'révoltent', 'la', 'conscience', 'de', 'lhumanité', 'et', 'que', 'lavènement', 'dun', 'monde', 'où', 'les', 'êtres', 'humains', 'seront', 'libres', 'de', 'parler', 'et', 'de', 'croire', 'libérés', 'de', 'la', 'terreur', 'et', 'de', 'la', 'misère', 'a', 'été', 'proclamé', 'comme', 'la', 'plus']


In [11]:
# Processing Italian Corpus and lower case all words

italian_train_words = italian_train.split()
italian_train_words = [word.lower() for word in italian_train_words]
# remove punctuation from each word
table = str.maketrans('', '', string.punctuation)
italian_train = [w.translate(table) for w in italian_train_words]
italian_test = [token.lower() for token in italian_test]
print(italian_train[:100])

['dichiarazione', 'universale', 'dei', 'diritti', 'umani', 'preambolo', 'considerato', 'che', 'il', 'riconoscimento', 'della', 'dignità', 'inerente', 'a', 'tutti', 'i', 'membri', 'della', 'famiglia', 'umana', 'e', 'dei', 'loro', 'diritti', 'uguali', 'ed', 'inalienabili', 'costituisce', 'il', 'fondamento', 'della', 'libertà', 'della', 'giustizia', 'e', 'della', 'pace', 'nel', 'mondo', 'considerato', 'che', 'il', 'disconoscimento', 'e', 'il', 'disprezzo', 'dei', 'diritti', 'umani', 'hanno', 'portato', 'ad', 'atti', 'di', 'barbarie', 'che', 'offendono', 'la', 'coscienza', 'dellumanità', 'e', 'che', 'lavvento', 'di', 'un', 'mondo', 'in', 'cui', 'gli', 'esseri', 'umani', 'godano', 'della', 'libertà', 'di', 'parola', 'e', 'di', 'credo', 'e', 'della', 'libertà', 'dal', 'timore', 'e', 'dal', 'bisogno', 'è', 'stato', 'proclamato', 'come', 'la', 'più', 'alta', 'aspirazione', 'delluomo', 'considerato', 'che', 'è', 'indispensabile']


In [12]:
# Processing spanish Corpus and lower case all words

spanish_train_words = spanish_train.split()
spanish_train_words = [word.lower() for word in spanish_train_words]
# remove punctuation from each word
table = str.maketrans('', '', string.punctuation)
spanish_train = [w.translate(table) for w in spanish_train_words]
spanish_test = [token.lower() for token in spanish_test]
print(spanish_train[:100])

['declaración', 'universal', 'de', 'derechos', 'humanos', 'adoptada', 'y', 'proclamada', 'por', 'la', 'asamblea', 'general', 'en', 'su', 'resolución', '217', 'a', 'iii', 'de', '10', 'de', 'diciembre', 'de', '1948', 'preámbulo', 'considerando', 'que', 'la', 'libertad', 'la', 'justicia', 'y', 'la', 'paz', 'en', 'el', 'mundo', 'tienen', 'por', 'base', 'el', 'reconocimiento', 'de', 'la', 'dignidad', 'intrínseca', 'y', 'de', 'los', 'derechos', 'iguales', 'e', 'inalienables', 'de', 'todos', 'los', 'miembros', 'de', 'la', 'familia', 'humana', 'considerando', 'que', 'el', 'desconocimiento', 'y', 'el', 'menosprecio', 'de', 'los', 'derechos', 'humanos', 'han', 'originado', 'actos', 'de', 'barbarie', 'ultrajantes', 'para', 'la', 'conciencia', 'de', 'la', 'humanidad', 'y', 'que', 'se', 'ha', 'proclamado', 'como', 'la', 'aspiración', 'más', 'elevada', 'del', 'hombre', 'el', 'advenimiento', 'de', 'un']


### Model creation

In [13]:
# creating models for English

In [14]:
# unigram Models
print("English Unigram model")
fdist_unigram_english=nltk.FreqDist(list(nltk.ngrams(english_train,1)))
print(fdist_unigram_english)

#bigram Model
print("English bigram model")
fdist_bigram_english=nltk.FreqDist(list(nltk.bigrams(english_train)))
print(fdist_bigram_english)

#trigram model
print("English trigram model")
fdist_trigram_english=nltk.FreqDist(list(nltk.trigrams(english_train)))
print(fdist_trigram_english)

English Unigram model
<FreqDist with 89 samples and 167 outcomes>
English bigram model
<FreqDist with 151 samples and 166 outcomes>
English trigram model
<FreqDist with 162 samples and 165 outcomes>


In [15]:
#Creating models for French

In [16]:
# unigram Models
print("French Unigram model")
fdist_unigram_french=nltk.FreqDist(list(nltk.ngrams(french_train,1)))
print(fdist_unigram_french)

#bigram Model
print("French bigram model")
fdist_bigram_french=nltk.FreqDist(list(nltk.bigrams(french_train)))
print(fdist_bigram_french)

#trigram model
print("French trigram model")
fdist_trigram_french=nltk.FreqDist(list(nltk.trigrams(french_train)))
print(fdist_trigram_french)

French Unigram model
<FreqDist with 91 samples and 164 outcomes>
French bigram model
<FreqDist with 142 samples and 163 outcomes>
French trigram model
<FreqDist with 155 samples and 162 outcomes>


In [17]:
# Creating models for Italian

In [18]:
# unigram Models
print("italian Unigram model")
fdist_unigram_italian=nltk.FreqDist(list(nltk.ngrams(italian_train,1)))
print(fdist_unigram_italian)

#bigram Model
print("italian bigram model")
fdist_bigram_italian=nltk.FreqDist(list(nltk.bigrams(italian_train)))
print(fdist_bigram_italian)

#trigram model
print("italian trigram model")
fdist_trigram_italian=nltk.FreqDist(list(nltk.trigrams(italian_train)))
print(fdist_trigram_italian)

italian Unigram model
<FreqDist with 98 samples and 153 outcomes>
italian bigram model
<FreqDist with 138 samples and 152 outcomes>
italian trigram model
<FreqDist with 147 samples and 151 outcomes>


In [19]:
# Creating models for spanish language

In [20]:
# unigram Models
print("spanish Unigram model")
fdist_unigram_spanish=nltk.FreqDist(list(nltk.ngrams(spanish_train,1)))
print(fdist_unigram_spanish)

#bigram Model
print("spanish bigram model")
fdist_bigram_spanish=nltk.FreqDist(list(nltk.bigrams(spanish_train)))
print(fdist_bigram_spanish)

#trigram model
print("spanish trigram model")
fdist_trigram_spanish=nltk.FreqDist(list(nltk.trigrams(spanish_train)))
print(fdist_trigram_spanish)

spanish Unigram model
<FreqDist with 89 samples and 166 outcomes>
spanish bigram model
<FreqDist with 145 samples and 165 outcomes>
spanish trigram model
<FreqDist with 159 samples and 164 outcomes>


### Problem 1:  English vs. French unigram models, English vs. French bigram models,and English vs. French trigram models

In [22]:
## English model accuracy

In [23]:
#English vs French unigram model
print("English vs French unigram model")
count_eng = 0
for i in english_test:
  english_test_unigram =list(nltk.ngrams(i,1))
  probability_english =1
  probability_french  =1

  for t in  english_test_unigram:
     probability_english = probability_english * fdist_unigram_english.freq(t)
     probability_french  = probability_french  *  fdist_unigram_french.freq(t)


  if probability_english >= probability_french:
         count_eng=count_eng+1   #smoothing

print("English Unigram model accuracy with English test data=" +str(count_eng/len(english_test)))

English vs French unigram model
English Unigram model accuracy with English test data=0.998


In [24]:
#English vs French bigram model
print("English vs French bigram model")
count_eng=0
for i in english_test:
  english_test_bigram =list(nltk.bigrams(i))
  probability_english =1
  probability_french  =1
  for t in  english_test_bigram:
     probability_english = probability_english * fdist_bigram_english.freq(t)
     probability_french  = probability_french  *  fdist_bigram_french.freq(t)


  if probability_english >= probability_french:
         count_eng=count_eng+1  #smoothing
        

print("English Bigram model accuracy with English test data =" +str(count_eng/len(english_test)))

English vs French bigram model
English Bigram model accuracy with English test data =1.0


In [25]:
#English vs French trigram model

print("English vs French trigram model")
count_eng=0
for i in english_test:
  english_test_trigram =list(nltk.trigrams(i))
  probability_english =1
  probability_french  =1
  for t in  english_test_trigram:
     probability_english = probability_english * fdist_trigram_english.freq(t)
     probability_french  = probability_french  *  fdist_trigram_french.freq(t)


  if probability_english >= probability_french:
         count_eng=count_eng+1

print("English trigram model accuracy with English test data =" +str(count_eng/len(english_test)))

English vs French trigram model
English trigram model accuracy with English test data =1.0


In [26]:
## French model accuracy

In [27]:
# French vs english unigram model with french test data
print("French Unigram Accuracy with French test data ")
count_french = 0
for i in french_test:
    french_test_unigram = list(nltk.ngrams(i, 1))
    probability_english = 1
    probability_french = 1

    for t in french_test_unigram:
        probability_english = probability_english * fdist_unigram_english.freq(t)
        probability_french = probability_french * fdist_unigram_french.freq(t)

    if probability_english <= probability_french:
        count_french = count_french + 1

print("French unigram model accuracy with French test data =" + str(count_french / len(french_test)))

French Unigram Accuracy with French test data 
French unigram model accuracy with French test data =0.992


In [28]:
# French vs english bigram model with french test data
print("French bigram model Accuracy with French test data ")
count_french = 0
for i in french_test:
    french_test_bigram = list(nltk.bigrams(i))
    probability_english = 1
    probability_french = 1
    for t in french_test_bigram:
        probability_english = probability_english * fdist_bigram_english.freq(t)
        probability_french = probability_french * fdist_bigram_french.freq(t)

    if probability_english <= probability_french:
        count_french = count_french + 1

print("French bigram model accuracy with French test data =" + str(count_french / len(french_test)))

French bigram model Accuracy with French test data 
French bigram model accuracy with French test data =1.0


In [29]:
# French vs english trigram model with french test data
print("French Trigram model Accuracy with French test data ")
count_french = 0
for i in french_test:
    french_test_trigram = list(nltk.trigrams(i))
    probability_english = 1
    probability_french = 1
    for t in french_test_trigram:
        probability_english = probability_english * fdist_trigram_english.freq(t)
        probability_french = probability_french * fdist_trigram_french.freq(t)

    if probability_english <= probability_french:
        count_french = count_french + 1

print("French trigram model accuracy with French test data =" + str(count_french / len(french_test)))

French Trigram model Accuracy with French test data 
French trigram model accuracy with French test data =1.0


### Problem 2 - Spanish vs Italian model comparison

In [31]:
# Italian vs spanish unigram model with italian test data
print("Italian Unigram model Accuracy with italian data ")
count_italian = 0
for i in italian_test:
    italian_test_unigram = list(nltk.ngrams(i, 1))
    probability_italian = 1
    probability_spanish = 1

    for t in italian_test_unigram:
        probability_italian = probability_italian * fdist_unigram_italian.freq(t)
        probability_spanish = probability_spanish * fdist_unigram_spanish.freq(t)

    if probability_italian >= probability_spanish:
        count_italian = count_italian + 1

print("Italian unigram model accuracy with italian data=" + str(count_italian / len(italian_test)))

Italian Unigram model Accuracy with italian data 
Italian unigram model accuracy with italian data=1.0


In [32]:
# Italian vs spanish bigram model with italian test data
print("Italian bigram Accuracy with italian test data")
count_italian = 0
for i in italian_test:
    italian_test_bigram = list(nltk.bigrams(i))
    probability_italian = 1
    probability_spanish = 1
    for t in italian_test_bigram:
        probability_italian = probability_italian * fdist_bigram_italian.freq(t)
        probability_spanish = probability_spanish * fdist_bigram_spanish.freq(t)

    if probability_italian >= probability_spanish:
        count_italian = count_italian + 1

print("Italian bigram model accuracy with italian data =" + str(count_italian / len(italian_test)))

Italian bigram Accuracy with italian test data
Italian bigram model accuracy with italian data =1.0


In [33]:
# Italian vs spanish trigram model with italian test data
print("Italian trigram Accuracy with italian test data")
count_italian = 0
for i in italian_test:
    italian_test_trigram = list(nltk.trigrams(i))
    probability_italian = 1
    probability_spanish = 1
    for t in italian_test_trigram:
        probability_italian = probability_italian * fdist_trigram_italian.freq(t)
        probability_spanish = probability_spanish * fdist_trigram_spanish.freq(t)

    if probability_italian >= probability_spanish:
        count_italian = count_italian + 1

print("Italian trigram model accuracy with italian test data=" + str(count_italian / len(italian_test)))


Italian trigram Accuracy with italian test data
Italian trigram model accuracy with italian test data=1.0


In [34]:
# spanish Accuracy

In [35]:
# Spanish vs italian unigram model with spanish test data
print("Spanish Unigram Accuracy with spanish test data")
count_spanish = 0
for i in spanish_test:
    spanish_test_unigram = list(nltk.ngrams(i, 1))
    probability_italian = 1
    probability_spanish = 1

    for t in spanish_test_unigram:
        probability_italian = probability_italian * fdist_unigram_italian.freq(t)
        probability_spanish = probability_spanish * fdist_unigram_spanish.freq(t)

    if probability_italian <= probability_spanish:
        count_spanish = count_spanish + 1

print("Spanish unigram model accuracy with spanish test data=" + str(count_spanish / len(spanish_test)))

Spanish Unigram Accuracy with spanish test data
Spanish unigram model accuracy with spanish test data=0.961


In [36]:
# Spanish vs italian bigram model with spanish test data
print("Spanish bigram Accuracy  with spanish test data")
count_spanish = 0
for i in spanish_test:
    spanish_test_bigram = list(nltk.bigrams(i))
    probability_italian = 1
    probability_spanish = 1
    for t in spanish_test_bigram:
        probability_italian = probability_italian * fdist_bigram_italian.freq(t)
        probability_spanish = probability_spanish * fdist_bigram_spanish.freq(t)

    if probability_italian <= probability_spanish:
        count_spanish = count_spanish + 1

print("Spanish bigram model accuracy  with spanish test data=" + str(count_spanish / len(spanish_test)))


Spanish bigram Accuracy  with spanish test data
Spanish bigram model accuracy  with spanish test data=1.0


In [37]:
# Spanish vs italian trigram model with spanish test data
print("spanish trigram Accuracy with spanish test data")
count_spanish = 0
for i in spanish_test:
    spanish_test_trigram = list(nltk.trigrams(i))
    probability_italian = 1
    probability_spanish = 1
    for t in spanish_test_trigram:
        probability_italian = probability_italian * fdist_trigram_italian.freq(t)
        probability_spanish = probability_spanish * fdist_trigram_spanish.freq(t)

    if probability_italian <= probability_spanish:
        count_spanish = count_spanish + 1

print("Spanish trigram model accuracy with spanish test data=" + str(count_spanish / len(spanish_test)))


spanish trigram Accuracy with spanish test data
Spanish trigram model accuracy with spanish test data=1.0


### Spanish Vs Italian unigram model is harder to distinguish