In [8]:
import random
#these tokens will be skipped while calculating probability
insignificant_tokens=[' ','!', '"','#','$','%','&','*','+','-','(',')',',','.','/','0','1','2','3','4','5','6','7','8',
                      '9',';','<','>','=','?','@','|','«','»','`','[',']',"'",'\\']
language_ids=['bg','bs','cz','es-AR','es-ES','hr','id','mk','my','pt-BR','pt-PT','sk','sr']

class Language: #for each language in data set, we create a Language class
   
    def __init__ (self,lang_id):
        self.chars={} # storing characters and their counts
        self.prob={} #storing p(c|l) probability for each character
        # corresponding language id 
        self.lang_id=lang_id
    def calculate_probability(self,sentence):
        s=list(sentence)
        s=[x for x in s if x not in insignificant_tokens]
        total=0
        for ch in s:
            if self.chars.has_key(ch):
                total+=self.prob[ch]
            else: 
                return 0
        return total
    def get_chars(self):
        return self.chars
    def get_prob(self):
        return self.prob
    def get_lang_id(self):
        return self.lang_id
with open ("Corpus/Raw Corpus.txt") as f:
    corpus = f.readlines()


languages=[]
model=[]
for i in range(13): #divides data set as languages
    languages.append("")
    start=i*2000
    end=(i+1)*2000
    languages[i]=corpus[start:end] #2k sentence for each language

training_set=[]
test_set=[]
for idx,l in enumerate(languages):
    lang=Language(language_ids[idx])
    random.shuffle(l)     
    
    training_partition=l[0:1800]
    training_set.extend(training_partition)
 
    training_partition=[i.split('\t')[0] for i in training_partition] #remove language identifier at the last of the sentences
    for sentence in training_partition:
        for letter in sentence:
            if letter not in insignificant_tokens:
                if lang.get_chars().has_key(letter):
                    nominator=lang.get_chars()[letter]
                else:
                    nominator=0 #laplace 
                nominator+=1
                lang.get_chars()[letter]=nominator
                
    test_partition=l[1800:2000]
    test_set.extend(test_partition)
    test_part=[i.split('\t')[0] for i in test_partition] #remove language identifier at the last of the sentences
    
    unk=list(''.join(set(''.join(test_part))))
    unk=[x for x in unk if x not in insignificant_tokens]
    unk=[x for x in unk if x not in lang.get_chars().keys()]
    if len(unk)!=0: # add unknowns characters in test set for smoothing
        for l in unk:
            lang.get_chars()[l]=0
        
    for ch in lang.chars: #laplace smoothing
        lang.get_chars()[ch]+=1
    
    denominator=sum(lang.get_chars().values())+len(lang.get_chars().values()) #for performance
    for letter in lang.get_chars(): #calculate probabilities
        lang.get_prob()[letter]=(lang.get_chars()[letter]+1)/float(denominator)
  

    model.append(lang)


In [9]:
i=0
predictions=[]
expected=[]
for sentence in test_set:
    s=sentence.split('\t')
    probabilities={}
    expected.append(language_ids.index(s[1].strip())+1)
    for l_model in model: #foreach language calculate the probability of the given sentence
        probabilities[l_model.get_lang_id()]=l_model.calculate_probability(s[0])
    #find one language having most likely given sentence     
    predictions.append(language_ids.index(max(probabilities.items(), key=lambda k: k[1])[0])+1) 

In [10]:
metrics={}
for idx,lang_id in enumerate(language_ids):
    false_negatives=len([x for x in predictions[idx*200:(idx+1)*200] if int(x) != (idx+1)])
    true_positives=200-false_negatives
    if idx==0: #count false positives on proceeding predictions
        false_positives=len([x for x in predictions[200:len(predictions)] if int(x) ==1])
    elif idx==12:  #count false positives on preceeding predictions
        false_positives=len([x for x in predictions[0:2400] if int(x)==13])
    else: #count false positives on both preceeding and proceeding predictions
        false_positives=[x for x in predictions[(idx-1)*200:idx*200] if int(x) == (idx+1)]
        false_positives=len(false_positives + [x for x in predictions[(idx+1)*200:2600] if int(x) == (idx+1)])
    true_negatives=2400-false_positives
    metrics[lang_id]={"tp":true_positives,"fn":false_negatives,"tn":true_negatives,"fp":false_positives}
for key,value in metrics.iteritems():
    print(key)
    print("True positives: " + str(value["tp"]))
    print("False positives: " + str(value["fp"]))
    print("True negatives: " + str(value["tn"]))
    print("False negatives: " + str(value["fn"]))

bg
True positives: 45
False positives: 0
True negatives: 2400
False negatives: 155
hr
True positives: 28
False positives: 7
True negatives: 2393
False negatives: 172
es-AR
True positives: 1
False positives: 42
True negatives: 2358
False negatives: 199
mk
True positives: 200
False positives: 0
True negatives: 2400
False negatives: 0
sk
True positives: 160
False positives: 0
True negatives: 2400
False negatives: 40
cz
True positives: 21
False positives: 0
True negatives: 2400
False negatives: 179
sr
True positives: 8
False positives: 2
True negatives: 2398
False negatives: 192
pt-BR
True positives: 171
False positives: 166
True negatives: 2234
False negatives: 29
bs
True positives: 170
False positives: 357
True negatives: 2043
False negatives: 30
my
True positives: 200
False positives: 15
True negatives: 2385
False negatives: 0
pt-PT
True positives: 9
False positives: 5
True negatives: 2395
False negatives: 191
es-ES
True positives: 197
False positives: 228
True negatives: 2172
False neg

In [11]:
total_tp=0.0
total_fp=0.0
total_fn=0.0
total_tn=0.0
total_precision=0.0
total_recall=0.0
total_f1score=0.0
for key in metrics.keys():
    tp=metrics[key]["tp"]
    fp=metrics[key]["fp"]
    fn=metrics[key]["fn"]
    tn=metrics[key]["tn"]
    precision=tp/float(tp+fp)
    recall=tp/float(tp+fn)
    f1_score=(2*recall*precision)/float(recall+precision)
    total_precision+=precision
    total_recall+=recall
    total_f1score+=f1_score
    
    total_tp+=tp
    total_fp+=fp
    total_fn+=fn
    total_tn+=tn
    
mic_prec=total_tp/float(total_tp+total_fp)
mic_recall=total_tp/float(total_tp+total_fn)
print("Micro-averaged precision: " + str(mic_prec))
print("Micro-averaged recall: " + str(mic_recall))
print("Micro-averaged f1-score: " + str((2*mic_prec*mic_recall)/float(mic_prec+mic_recall)))
print("")
print("Macro-averaged precision: " + str(total_precision/13.0))
print("Macro-averaged recall: " + str(total_recall/13.0))
print("Macro-averaged f1-score: " + str(total_f1score/13.0))
print("")
print("Total accuracy: "+ str(total_tp/2600.0))
print("Accuracies for languages:")
for key,value in metrics.iteritems():
    print(key + str(": ")+str(value["tp"]/200.0))
print("")
print("fp: "+str(total_fp))
print("tp: "+str(total_tp))
print("fn: "+str(total_fn))
print("tn: "+str(total_tn))

metrics

Micro-averaged precision: 0.595085995086
Micro-averaged recall: 0.465769230769
Micro-averaged f1-score: 0.522545846818

Macro-averaged precision: 0.678708254064
Macro-averaged recall: 0.465769230769
Macro-averaged f1-score: 0.427828196335

Total accuracy: 0.465769230769
Accuracies for languages:
bg: 0.225
hr: 0.14
es-AR: 0.005
mk: 1.0
sk: 0.8
cz: 0.105
sr: 0.04
pt-BR: 0.855
bs: 0.85
my: 1.0
pt-PT: 0.045
es-ES: 0.985
id: 0.005

fp: 824.0
tp: 1211.0
fn: 1389.0
tn: 30376.0


{'bg': {'fn': 155, 'fp': 0, 'tn': 2400, 'tp': 45},
 'bs': {'fn': 30, 'fp': 357, 'tn': 2043, 'tp': 170},
 'cz': {'fn': 179, 'fp': 0, 'tn': 2400, 'tp': 21},
 'es-AR': {'fn': 199, 'fp': 42, 'tn': 2358, 'tp': 1},
 'es-ES': {'fn': 3, 'fp': 228, 'tn': 2172, 'tp': 197},
 'hr': {'fn': 172, 'fp': 7, 'tn': 2393, 'tp': 28},
 'id': {'fn': 199, 'fp': 2, 'tn': 2398, 'tp': 1},
 'mk': {'fn': 0, 'fp': 0, 'tn': 2400, 'tp': 200},
 'my': {'fn': 0, 'fp': 15, 'tn': 2385, 'tp': 200},
 'pt-BR': {'fn': 29, 'fp': 166, 'tn': 2234, 'tp': 171},
 'pt-PT': {'fn': 191, 'fp': 5, 'tn': 2395, 'tp': 9},
 'sk': {'fn': 40, 'fp': 0, 'tn': 2400, 'tp': 160},
 'sr': {'fn': 192, 'fp': 2, 'tn': 2398, 'tp': 8}}

In [None]:

def write_files(file_name,array):
    with open (file_name, mode='wt') as t_file:
        for item in array:
            t_file.write(item)
print(len(training_set))
print(len(test_set))
write_files("Training set.txt",training_set)
write_files("Test set.txt",test_set)