In [45]:
import numpy as np
import utils

# Number of files per language used for training; the cells below treat the
# files at index >= TRAIN_COUNT as verification data.
TRAIN_COUNT = 10
# ASCII punctuation characters, passed to utils.get_comb_phi_file as `players`.
# The backslash is spelled "\\" because "\]" is not a valid escape sequence
# (it triggers a warning on current Python versions); the resulting list of
# characters is exactly the same.
players = list("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~")
# Language label -> collection of file paths (the cells below index it by label
# and call .resolve() on every entry). How train_count is interpreted is decided
# inside utils.load_lang_files, which is not visible here -- TODO confirm.
lang_files = utils.load_lang_files(train_count=TRAIN_COUNT)

In [46]:
# Define Euclidean distance comparison
def get_closest_by_pbi(train, pbi):
    """Find the training language(s) closest to ``pbi`` by Euclidean distance.

    Parameters
    ----------
    train : mapping
        Maps a language label to a sequence of training vectors. Each vector
        must be array-like and shape-compatible with ``pbi``.
    pbi : array-like
        The vector to classify.

    Returns
    -------
    tuple
        ``(closest_lang, closest_avr_lang, candidates)`` where

        * ``closest_lang`` is the label owning the single nearest training
          vector,
        * ``closest_avr_lang`` is the label whose training vectors have the
          smallest *mean* distance to ``pbi``,
        * ``candidates`` is a list of ``(min_distance, label)`` tuples, one per
          label that has at least one training vector, in ``train`` order.

        Both labels are ``None`` and ``candidates`` is empty when ``train``
        contains no training vectors at all.
    """
    # Convert the query once instead of on every inner-loop iteration.
    target = np.asarray(pbi)

    closest_lang = None
    closest_distance = None
    closest_avr_lang = None
    closest_avr_distance = None
    candidates = []

    for lang, lang_pbis in train.items():
        distances = [
            np.linalg.norm(target - np.asarray(t_pbi)) for t_pbi in lang_pbis
        ]
        if not distances:
            # A label without samples has no meaningful distance; skipping it
            # keeps None out of the comparisons and out of `candidates`.
            continue

        lang_min_distance = min(distances)
        # Divide by the sample count so labels with different numbers of
        # training vectors are compared fairly (a plain sum favours labels
        # that simply have fewer samples).
        lang_avr_distance = sum(distances) / len(distances)

        if closest_distance is None or lang_min_distance < closest_distance:
            closest_lang = lang
            closest_distance = lang_min_distance

        if closest_avr_distance is None or lang_avr_distance < closest_avr_distance:
            closest_avr_lang = lang
            closest_avr_distance = lang_avr_distance

        candidates.append((lang_min_distance, lang))

    return (closest_lang, closest_avr_lang, candidates)

In [47]:
# Train
# Build, for every language label, the list of feature vectors computed from
# the first TRAIN_COUNT files of that language.
train_pbis = {}
for lang in lang_files:
    samples = []
    train_pbis[lang] = samples
    for position, path in enumerate(lang_files[lang]):
        # Files at position >= TRAIN_COUNT are left for the verification cell.
        if position < TRAIN_COUNT:
            samples.append(utils.get_comb_phi_file(path.resolve(), players))

In [43]:
# Verify by Euclidean distance
# Every file at index >= TRAIN_COUNT of each language is classified against
# train_pbis with get_closest_by_pbi, and three hit rates are reported.
# NOTE: this cell relies on lang_files, players, train_pbis and
# get_closest_by_pbi from the cells above; run the notebook top to bottom.
TOP_N = 3  # how many nearest languages count as a hit for "RateN"

total_verifications = 0
# True language owns the single nearest training vector.
successfull_verifications = 0
# True language is among the TOP_N nearest languages.
successfull_verifications_n = 0
# True language matches either the nearest-vector pick or the
# aggregate-distance pick returned by get_closest_by_pbi.
successfull_verifications_avr = 0

for lang in lang_files:
    for idx, file in enumerate(lang_files[lang]):
        if idx < TRAIN_COUNT:
            continue  # the first TRAIN_COUNT files were used for training

        total_verifications += 1

        pbi = utils.get_comb_phi_file(file.resolve(), players)
        closest_lang, closest_avr_lang, candidates = get_closest_by_pbi(train_pbis, pbi)

        if lang == closest_lang:
            successfull_verifications += 1

        if lang in (closest_lang, closest_avr_lang):
            successfull_verifications_avr += 1

        # candidates holds (distance, language) tuples, so sorting orders them
        # from nearest to farthest.
        nearest_langs = [cand_lang for _, cand_lang in sorted(candidates)[:TOP_N]]
        if lang in nearest_langs:
            successfull_verifications_n += 1

if total_verifications:
    print(f'Rate: {(successfull_verifications / total_verifications) * 100}%')
    print(f'RateN: {(successfull_verifications_n / total_verifications) * 100}%')
    print(f'RateAVR: {(successfull_verifications_avr / total_verifications) * 100}%')
else:
    # Avoid a ZeroDivisionError when no language has more than TRAIN_COUNT files.
    print('No verification files found; rates are undefined.')

Rate: 52.62096774193549%
RateN: 69.65725806451613%
RateAVR: 55.74596774193549%


In [48]:
# Save trained data
# Each entry of train_pbis is passed as a keyword argument, so the archive
# stores one array per language label under that label's name.
archive_path = f"train_{TRAIN_COUNT}.npz"
np.savez(archive_path, **train_pbis)