# **PRÀCTICA 1: Identificació d'idioma**

In [2]:
import math
import nltk
from nltk.collocations import TrigramCollocationFinder
import json
import os

In [3]:
# Load trigrams and unique characters from json files
def dict_string_to_trigramtuple(d: dict) -> dict:
    return {key: {tuple(k): v for k, v in value.items()} for key, value in d.items()}

trigrams = dict_string_to_trigramtuple(json.load(open('./weights/trigrams.json')))
unique_chars = json.load(open('./weights/unique_chars.json'))

# Load preprocessed data (train and test)
train = json.load(open('./preprocessed_langId/train.json'))
test = json.load(open('./preprocessed_langId/test.json'))

In [3]:
# Dictionaries to optimize Lidstone smoothing calculation
b_dict = {key: value ** 3 for key, value in unique_chars.items()}
totals = {key: sum(trigrams[key].values()) for key in trigrams.keys()}

# Functions for Lidstone smoothing
def lidstone_smooth(language: str, trigram: tuple, lambda_value: float = 0.5):
    counts = trigrams[language]
    vocab = b_dict[language]
    total = totals[language]
    number = counts.get(trigram, 0)
    probs = (number + lambda_value) / (total + lambda_value * vocab)
    return probs

def pau_discounting(language: str, trigram: tuple, delta: float = 0.1):
    counts = trigrams[language]
    total = totals[language]
    number = counts.get(trigram, 0)
    count_trigram = counts.get(trigram, 0)
    unique = len(counts)
    prob = max(number - delta, 0) / total + (delta * unique / total) * (1 / unique)
    return prob

def absolute_discounting(language: str, trigram: tuple, alpha: float = 0.1):
    counts = trigrams[language]
    total = totals[language]
    vocab = b_dict[language]
    count_trigram = counts.get(trigram, 0)
    if count_trigram == 0:
        prob = ((vocab - len(counts))*alpha/len(counts))/total
    else:
        prob = ((count_trigram - alpha)/ total)

def linear_discounting(language: str, trigram: tuple, alpha: float = 0.1):
    counts = trigrams[language]
    total = totals[language]
    vocab = b_dict[language]
    count_trigram = counts.get(trigram, 0)
    if count_trigram == 0:
        prob = alpha / (vocab - len(counts))
    else:
        prob = (1-alpha)*(count_trigram / total)

def lidstone_total(text: str, language: str):
    trigram_finder = TrigramCollocationFinder.from_words(text)
    prob_sec = 0
    for trigram, num_instances in trigram_finder.ngram_fd.items():
        prob_sec += num_instances * math.log(absolute_discounting(trigram=trigram, language=language))
    return prob_sec

In [4]:
# Test the Lidstone smoothing
text = 'Example'
probs = []
languages = {"deu_trn.txt": "Deutsch (German)", 
             "eng_trn.txt": "English (English)", 
             "fra_trn.txt": "Français (French)", 
             "ita_trn.txt": "Italiano (Italian)", 
             "spa_trn.txt": "Español (Spanish)", 
             "nld_trn.txt": "Nederlands (Dutch)"}
for language in trigrams.keys():
    prob = lidstone_total(text, language)
    probs.append((language, prob))
max_probs = max(probs, key = lambda x: x[1])
print(max_probs)
probs = sorted(probs, key = lambda x: x[1], reverse=True)

# Normalize the probabilities and print the results
exp = [math.exp(p[1] - max_probs[1]) for p in probs]
normalize = [(e / sum (exp))*100 for e in exp]
lang = [languages[p[0]] for p in probs]
for e in range(len(lang)):
    print(f"{lang[e]}: {normalize[e]}")

('eng_trn.txt', -49.787416303726516)


In [9]:
# Test
errors_list = []
correct_pred_count = {language: 0 for language in test.keys()}

for lingua in test.keys():
	for oration in test[lingua]:
		probs = []
		for language in trigrams.keys():
			prob = lidstone_total(oration, language)
			probs.append((language, prob))
		max_probs = max(probs, key = lambda x: x[1])
		if max_probs[0] == lingua:
			correct_pred_count[lingua] += 1
		else:
			errors_list.append(oration)

total_accuracy = sum(correct_pred_count.values()) / sum([len(value) for value in test.values()])
accuracy_languages = {key: value / len(test[key]) for key, value in correct_pred_count.items()}

print(f"Total errors: {len(errors_list)}")
print(f"Total accuracy: {total_accuracy}")
for key, value in accuracy_languages.items():
	print(f"*Accuracy {key}: {value}")

Total accuracy: 0.9987996599036394. Total errors: 72
*Accuracy deu: 0.9987989190271244
*Accuracy eng: 0.999199038846616
*Accuracy fra: 0.9987001299870013
*Accuracy ita: 0.9984001599840016
*Accuracy nld: 0.9988001199880012
*Accuracy spa: 0.9989001099890011


In [None]:
for e in errors_list:
    print(e)

   bildergalerie teilen der donkervoort d gto ist ja per se nicht gerade ein warmduscher-auto.
   der stoffbeutel war leer.
   hij was de leider van de groep in de provincie khorasan van de islamitische staat.
   ist das land bald insolvent?
   jarjarthomas . jan theorethisch ist es simpel.
   kinder werden von fairplay-liga profitieren.
   was ist wicop?
   wat er nu gebeurt, is heel zwaar voor hem als jonge gast.
   wednesday, january schnelles brainstorming online machen keine schlechte idee: online brainstorming.
   google (goog) has one.
   in asia, japan's nikkei (.
   internet trouble.
   vancouver head coach willie desjardins.
  
   novlanguehaliburton et donc dick cheyney ont du acheter "short".
   les anderlechtois devront avant tout museler ruud van nistelrooy.
   salle wilfrid-pelletier, vendredi mars .
   zaza le nounours : skizomeuh> le edison, sur la nd street, dans downtown.
   bert van marwijk ne sourit pas toujours.
   'non non, il dit.
   encore une star addict de t