# In [41]:
import nltk

from nltk.corpus import udhr

LANGUAGES = [ 
    "English-Latin1", 
    "French_Francais-Latin1",
    "Spanish-Latin1",
    "Italian-Latin1",
    "German_Deutsch-Latin1"
]


def makeTrigrams(words):
    """Build character trigrams from the alphabetic tokens in *words*.

    Tokens for which ``str.isalpha`` is false are dropped and the remaining
    ones are lower-cased.  They are stitched into a single string in which
    "$$" precedes the first word, separates neighbouring words and follows
    the last word; that string is handed to ``nltk.trigrams``, so every
    trigram is made of three consecutive characters of the padded text.

    Returns whatever ``nltk.trigrams`` returns for that string.
    """
    alphabetic = [token.lower() for token in words if token.isalpha()]
    padded_text = "$$" + "$$".join(alphabetic) + "$$"
    return nltk.trigrams(padded_text)


def createTrigramGrammar(trigrams):
    """Turn trigrams into an ``nltk.ConditionalFreqDist``.

    Each trigram ``(a, b, c)`` contributes one sample ``c`` recorded under
    the condition ``(a, b)``.  The (condition, sample) pairs are produced
    lazily and consumed by the ``ConditionalFreqDist`` constructor.
    """
    def condition_sample_pairs():
        for first, second, third in trigrams:
            yield (first, second), third

    return nltk.ConditionalFreqDist(condition_sample_pairs())


def wordProb(word, grammar):
    """Score how well *word* matches a trigram *grammar*.

    *word* must be purely alphabetic (checked with an ``assert``).  It is
    lower-cased and padded with "$$" on both sides.  Starting from 1.0, the
    value ``grammar[(a, b)].freq(c)`` is added for every character trigram
    ``(a, b, c)`` of the padded word, and the total is divided by the length
    of the padded word.

    NOTE: the result is an additive score rather than a true probability;
    it is only meaningful for comparing grammars against each other.
    """
    assert word.isalpha()
    padded = "".join(("$$", word.lower(), "$$"))

    # Accumulate with an explicit loop so the floating-point additions happen
    # one by one, in trigram order, on top of the 1.0 starting value.
    score = 1.0
    for trigram in nltk.trigrams(padded):
        context, following = trigram[:2], trigram[2]
        score = score + grammar[context].freq(following)
    return score / len(padded)


def predictLanguage(word, grammars):
    """Score *word* against every grammar in *grammars*.

    *grammars* is an iterable of ``(language, grammar)`` pairs.  The result
    is a list of ``(score, language)`` tuples in the same order, where the
    score is ``wordProb(word, grammar)``.
    """
    scored = []
    for language, model in grammars:
        scored.append((wordProb(word, model), language))
    return scored


def predictSentLanguage(sent, grammars):
    """Score a tokenised sentence against every grammar in *grammars*.

    *sent* is an iterable of string tokens and *grammars* an iterable of
    ``(language, grammar)`` pairs.  The result is a list of
    ``(score, language)`` tuples in the order of *grammars*, where the score
    is the mean of ``wordProb(word, grammar)`` over the alphabetic tokens.

    Tokens that are not purely alphabetic (punctuation, numbers, ...) are
    skipped, mirroring the ``isalpha`` filter applied when the trigrams are
    built; ``wordProb`` asserts on such tokens.  When no alphabetic token
    remains (including an empty sentence) every language gets a score of 0.0
    instead of raising ``ZeroDivisionError``.
    """
    # Filter once, outside the per-language loop; also lets *sent* be any
    # iterable rather than only a sized sequence.
    words = [token for token in sent if token.isalpha()]
    if not words:
        return [(0.0, lang) for lang, _ in grammars]

    return [
        (sum([wordProb(word, grammar) for word in words]) / len(words), lang)
        for lang, grammar in grammars
    ]


# Character trigrams of each language's UDHR text, as (language, trigrams)
# pairs.  The trigrams are materialised into lists because nltk.trigrams may
# return a one-shot generator; a list keeps all_trigrams reusable after the
# grammars below have consumed it (e.g. when only part of this cell is re-run).
all_trigrams = [ (lang, list(makeTrigrams(udhr.words(lang)))) for lang in LANGUAGES ]
# One conditional frequency distribution per language, in LANGUAGES order.
trigram_grammars = [ (lang, createTrigramGrammar(trigrams)) for lang, trigrams in all_trigrams ]

# Score a sample word against every language model.
predictLanguage('wood', trigram_grammars)

[(0.45786900988355456, 'English-Latin1'),
 (0.25, 'French_Francais-Latin1'),
 (0.25297619047619047, 'Spanish-Latin1'),
 (0.265625, 'Italian-Latin1'),
 (0.2645062800254674, 'German_Deutsch-Latin1')]