!pip install gensim sklearn numpy

### Bibliotecas Usadas

In [1]:
from gensim.models import KeyedVectors, FastText
from gensim.models.fasttext import load_facebook_vectors
import fasttext
import nltk

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from glob import glob

import multiprocessing
from itertools import product

### Funções Utilizadas

In [2]:
def getModel(modelFile, facebook_model=False):
    if(facebook_model):
        model = load_facebook_vectors(modelFile)
    else:
        model = KeyedVectors.load_word2vec_format(modelFile)
    return model

def analogy(model, pos, neg=[], n=5):
    return model.most_similar(pos,neg,n)

def showList(resultList):
    for el in resultList:
        print("\t",el,"\n")
        
def trainUnsupervisedTxt(inFile, outFile, dimension=100,model="skipgram"):
    model = fasttext.train_unsupervised(inFile, model=model, dim=dimension)
    model.save_model(outFile)
    
    return 'Model %s saved(%s %s) @ %s' % (inFile, dimension, model, outFile)

def eval_analogies_acc_keydvec(model_ref,test_set,case_insensitive=True,dummy4unknown=False):
    if(isinstance(model_ref, str)):
        model = getModel(model_ref)
    else:
        model = model_ref
    for test in test_set:
        acc_ = model.evaluate_word_analogies(test, case_insensitive=case_insensitive, dummy4unknown=dummy4unknown)
        
        obs = ''
        if(dummy4unknown):
            obs += ' | setting 0 to missing words'
        if(case_insensitive):
            obs += ' | case insensitive'
        else:
            obs += ' | case sensitive'
        log_str =  test + ' %.4f ' % acc_[0] + obs     
        
    return acc_[0], acc_[1], log_str

def get_facebook_vecs_word_analogy_accuracy(model_ref, test, case_insensitive=True, dummy4unknown=False):
    if(isinstance(model_ref, str)):
        model = load_facebook_vectors(model_ref)
    else:
        model = model_ref
#     for test in test_set:
    acc_ = model.evaluate_word_analogies(test, case_insensitive=case_insensitive, dummy4unknown=dummy4unknown)

    obs = ''
    if(dummy4unknown):
        obs += ' | setting 0 to missing words'
    if(case_insensitive):
        obs += ' | case insensitive'
    else:
        obs += ' | case sensitive'
    log_str = test + ' %.4f ' % acc_[0] +  obs
    
    return acc_[0], acc_[1], log_str

# WRAPPERS
def evaluate_acc_by_function(model, model_ref, test, case_insensitive, dummy4unknown, acc_func):
    
    acc, res, log = acc_func(model, test, case_insensitive, dummy4unknown)
    print(model_ref + " @ " + log)
    
    return

def eval_all_params(model_ref, test_set, eval_func, facebook_model=False):
    model = getModel(model_ref, facebook_model)

    evaluate_acc_by_function(model, model_ref, test_set, True, 
                                 True, eval_func)
    evaluate_acc_by_function(model, model_ref, test_set, True, 
                            False, eval_func)
    evaluate_acc_by_function(model, model_ref, test_set, False, 
                             True, eval_func)
    evaluate_acc_by_function(model, model_ref, test_set, False, 
                             False, eval_func)
    
def eval_default_param(model_ref, test_set, eval_func, facebook_model=False):
    model = getModel(model_ref, facebook_model)
    
    acc, res, log = eval_func(model, test)
    print(model_ref + " @ " + log)
    
    return

def get_model_info(model_ref, testset, analogy_acc_func, facebook_model=False):
    model_info = {}
    model_info['model_ref'] = model_ref
    model_info['benchmark'] = {}

    model_info['model'] = getModel(model_info['model_ref'], facebook_model)
    
    for test in testset:
        model_info['benchmark'][test] = {
            'accuracy': None,
            'results': None,
            'log' : None
        }
        model_info['benchmark'][test]['accuracy'], \
        model_info['benchmark'][test]['results'], \
        model_info['benchmark'][test]['log'] = analogy_acc_func(model_info['model'], test)

    return model_info

def print_model_acc(model, test, acc, info = ''):
    print(model, ' @ ', test, ' : ', acc, info)
    
def print_tex_table_format(matrix):
    for row in matrix:
        print(str(row[0]) + ' & %.4f' % row[1])

def build_model_accuracy_tuple(model_list, testset, func, facebook_model):
    product_set = product(model_list, testset)
    
    return [(*i, func, facebook_model) for i in product_set]

testset = glob('../testsets/LX-4WAnalogies*')

### Eval Mock

In [3]:
poolMock = multiprocessing.Pool()

../models/mock.txt @ ../testsets/LX-4WAnalogiesBr.txt0.0 | setting 0 to missing words | case insensitive
../models/mock.txt @ ../testsets/LX-4WAnalogies.txt0.0 | setting 0 to missing words | case insensitive
../models/mock.txt @ ../testsets/LX-4WAnalogiesBr.txtNone | case insensitive
../models/mock.txt @ ../testsets/LX-4WAnalogies.txtNone | case insensitive
../models/mock.txt @ ../testsets/LX-4WAnalogiesBr.txt0.0 | setting 0 to missing words | case sensitive
../models/mock.txt @ ../testsets/LX-4WAnalogies.txt0.0 | setting 0 to missing words | case sensitive
../models/mock.txt @ ../testsets/LX-4WAnalogiesBr.txtNone | case sensitive
../models/mock.txt @ ../testsets/LX-4WAnalogies.txtNone | case sensitive


In [39]:
pool_res = poolMock.starmap(eval_all_params, [('../models/mock.txt','../testsets/LX-4WAnalogiesBr.txt', get_facebook_vecs_word_analogy_accuracy)])

In [74]:
product_set = product(['../models/mock.txt'],['../testsets/LX-4WAnalogiesBr.txt', '../testsets/LX-4WAnalogies.txt'])

In [6]:
pool_res = poolMock.starmap(eval_all_params, build_model_accuracy_tuple(['../models/mock.txt'],['../testsets/LX-4WAnalogiesBr.txt', '../testsets/LX-4WAnalogies.txt'], get_facebook_vecs_word_analogy_accuracy, facebook_model=False))

In [75]:
[(*i, get_facebook_vecs_word_analogy_accuracy) for i in product_set]

[('../models/mock.txt',
  '../testsets/LX-4WAnalogiesBr.txt',
  <function __main__.get_facebook_vecs_word_analogy_accuracy(model_ref, test, case_insensitive=True, dummy4unknown=False)>),
 ('../models/mock.txt',
  '../testsets/LX-4WAnalogies.txt',
  <function __main__.get_facebook_vecs_word_analogy_accuracy(model_ref, test, case_insensitive=True, dummy4unknown=False)>)]

In [13]:
def processTriadAnalogy(word1,word2,word3, modelsList):
    result = []
    for modelFileRef in modelsList:
        model = getModel(modelFileRef)
        if(word1 in model and word2 in model and word3 in model):
            
            v_word1 = model.word_vec(word1)
            v_word2 = model.word_vec(word2)
            v_word3 = model.word_vec(word3)
    #         basisRelation = extract_relation(v_word1, v_word2)
            basisRelation = v_word1 - v_word2 + v_word3
            bestWordOnModel = find_best_related_word(basisRelation, model)
            result.append('Model: ' + modelFileRef + '/n Best Word : ' + bestWordOnModel)
        else:
            print('Fail:Word not found in model')
    return result

# https://datascience-enthusiast.com/DL/Operations_on_word_vectors.html
def similar_cos(u, v):
    distance = 0.0

    # Compute the dot product between u and v (≈1 line)
    dot = np.dot(u, v)
    # Compute the L2 norm of u (≈1 line)
#     norm_u = np.sqrt(np.sum(u**2))
    norm_u = np.linalg.norm(u)
    
    # Compute the L2 norm of v (≈1 line)
#     norm_v = np.sqrt(np.sum(v**2))
    norm_v = np.linalg.norm(v)
    
    # Compute the cosine similarity defined by formula (1) (≈1 line)
    cosine_similarity = dot / (norm_u * norm_v)

    return cosine_similarity

def extract_relation(target, related):
    return target - related

def find_best_related_word(analogy_vector, model):
#     related = target + relation
    max_sim = 0.0
    best_word = None
    for word in model.vocab:
#         relation1 = relation.reshape(1,-1)
#         relation2 = extract_relation(model.word_vec(target), model.word_vec(word)).reshape(1,-1)
        sim = similar_cos(analogy_vector, model[word])
        if(sim > max_sim):
            best_word = word
    return best_word;

def analogy(pos, neg):
    return model.most_similar(pos,neg)
    
            

In [4]:
# result = processTriadAnalogy(word1,word2,word3,modelsList)

### Teste 1 - Biblioteca KeyedVector e seus metodos de similaridade

In [5]:
modelsList = glob('../models/cbow_*.txt')
model = [getModel(x) for x in modelsList]

print(modelsList)


['../models/cbow_s300.txt', '../models/cbow_s100.txt', '../models/cbow_s50.txt']


In [21]:
model[0].most_similar(positive=['barack', 'obama'])

[('eisenhower', 0.47269853949546814),
 ('lula', 0.460972398519516),
 ('bush', 0.4601280987262726),
 ('ratsiraka', 0.43053072690963745),
 ('clinton', 0.4282917082309723),
 ('saakashvili', 0.4188674986362457),
 ('uribe', 0.4161287844181061),
 ('draghi', 0.40998852252960205),
 ('netanyahu', 0.403103768825531),
 ('yushchenko', 0.3994860053062439)]

- Comparação dos modelos para a analogia Rei-Homem, Rainha-X

In [8]:
word1 = 'rei'
word2 = 'homem'
word3 = 'rainha'

for index, x in enumerate(model):
    nMostSimilar = x.most_similar(positive=[word1,word3], negative=[word2])

    print(modelsList[index], nMostSimilar)
    print('Positive:  ', word1,word3, '\nNegative:',word2)
    print()

../models/cbow_s300.txt [('princesa', 0.5880060791969299), ('infanta', 0.5544092655181885), ('rainha-mãe', 0.5047087669372559), ('ex-rainha', 0.5002995729446411), ('imperatriz', 0.4982605278491974), ('raínha', 0.498224675655365), ('rainha-consorte', 0.4923296570777893), ('duquesa', 0.4890612065792084), ('condessa', 0.4880494177341461), ('regente', 0.460879385471344)]
Positive:   rei rainha 
Negative: homem

../models/cbow_s100.txt [('raínha', 0.6748343706130981), ('princesa', 0.6687842607498169), ('rainha-consorte', 0.6617846488952637), ('rainha-mãe', 0.6472653746604919), ('duquesa', 0.6418556571006775), ('pártia', 0.6328529119491577), ('imperatriz', 0.628162682056427), ('primogénita', 0.6190635561943054), ('condessa', 0.6179429292678833), ('coroação', 0.6084483861923218)]
Positive:   rei rainha 
Negative: homem

../models/cbow_s50.txt [('duquesa', 0.7385571002960205), ('princesa', 0.724755048751831), ('grã-duquesa', 0.7228216528892517), ('imperatriz', 0.7180843949317932), ('rainha-con

Troca de Palavras 

In [13]:
word1 = 'rei'
word2 = 'homem'
word3 = 'mulher'

model = getModel(modelsList[2])

list = model.most_similar(positive=[word1,word3], negative=[word2])

In [14]:
print(modelsList[2], list)
print('Positive:  ', word1,word3, '\nNegative:',word2)

../models/cbow_s100.txt [('esposa', 0.688875675201416), ('filha', 0.6741011142730713), ('governanta', 0.6724554896354675), ('sobrinha', 0.669994056224823), ('madrasta', 0.6670832633972168), ('concubina', 0.6659663915634155), ('dama-de-companhia', 0.6614803075790405), ('benção', 0.659320056438446), ('múmia', 0.6535747051239014), ('bênção', 0.6500141620635986)]
Positive:   rei mulher 
Negative: homem


### Analise da relação lateral genero / realeza na analogia rei-homem, rainha-mulher

In [14]:
model_cbow_300 = getModel('../models/ext/nilc/cbow_s300.txt')

In [11]:
model_cbow_300.most_similar(positive=['rei','mulher'],negative=['homem'])

[('rainha', 0.6863137483596802),
 ('rei/rainha', 0.6846991181373596),
 ('rainha-viúva', 0.6582984924316406),
 ('rainha-mãe', 0.6532368659973145),
 ('rainha-avó', 0.6457090377807617),
 ('rainha-consorte', 0.6448943614959717),
 ('rainha-a', 0.6409841775894165),
 ('esposa', 0.6285491585731506),
 ('rainha-imperatriz', 0.6235716938972473),
 ('esposa-irmã', 0.6200047135353088)]

In [10]:
model_cbow_300.most_similar(positive=['rei','mulher'],negative=['rainha'])

[('homem/mulher', 0.6432656049728394),
 ('homem', 0.6159863471984863),
 ('esposo', 0.610273003578186),
 ('meio-mulher', 0.5849580764770508),
 ('marido-mulher', 0.5848926305770874),
 ("'mulher", 0.5845355987548828),
 ('mulher-homem', 0.5817847847938538),
 ('\x93mulher', 0.5811667442321777),
 ('[mulher', 0.5805800557136536),
 ('homem-homem', 0.5773978233337402)]

In [15]:
model_cbow_300.most_similar(positive=['rainha','homem'],negative=['rei'])

[('homem/mulher', 0.7166236639022827),
 ('mulher', 0.6981459259986877),
 ('mulher-homem', 0.6651076078414917),
 ('«mulher', 0.6586583852767944),
 ('amulher', 0.6527774333953857),
 ("'mulher", 0.649071216583252),
 ('[mulher', 0.6452865600585938),
 ('mulher-gorila', 0.6439161896705627),
 ('\x93mulher', 0.6409512758255005),
 ('(mulher', 0.6398382186889648)]

ft skip 300

In [15]:
model_skip_300 = getModel('../models/ext/nilc/skip_s300.txt')

In [17]:
analogy(model_skip_300,['rei','mulher'],['homem'])

[('rainha', 0.7449504137039185),
 ('rainha-regente', 0.6796571016311646),
 ('princesa-regente', 0.6528103351593018),
 ('esposa', 0.6449020504951477),
 ('princesa', 0.6373282670974731)]

In [24]:
model_skip_300.most_similar(['rei','mulher'],['rainha'],5)

[('homem', 0.7047942876815796),
 ('marido', 0.6593904495239258),
 ('esposa', 0.6394582986831665),
 ('pai', 0.608444094657898),
 ('filho', 0.6038110256195068)]

In [25]:
model_skip_300.most_similar(['rainha','homem'],['rei'],5)

[('mulher', 0.7244539260864258),
 ("'mulher", 0.6234444379806519),
 ('amulher', 0.6227772831916809),
 ('mulher.', 0.6099158525466919),
 ('rapaz', 0.6031544208526611)]

glove 300

In [3]:
model_glove_300 = getModel('../models/ext/nilc/glove_s300.txt')

In [20]:
analogy(model_glove_300,['rei','mulher'],['homem'])

[('rainha', 0.7193282842636108),
 ('filha', 0.6310628652572632),
 ('esposa', 0.627386212348938),
 ('princesa', 0.6068347096443176),
 ('isabel', 0.5972704887390137)]

In [15]:
model_glove_300.most_similar(['rei','mulher'],['rainha'],5)

[('homem', 0.6963078379631042),
 ('filho', 0.6240413188934326),
 ('marido', 0.596142590045929),
 ('pai', 0.5748201608657837),
 ('jovem', 0.5624553561210632)]

In [16]:
model_glove_300.most_similar(['rainha','homem'],['rei'],5)

[('mulher', 0.6987082958221436),
 ('rapaz', 0.5554484128952026),
 ('ela', 0.548879861831665),
 ('menina', 0.5305519700050354),
 ('mãe', 0.5255801677703857)]

In [30]:
print_tex_table_format(analogy(model_glove_300, ['imperador','mulher'], ['homem']))

imperatriz & 0.60276
esposa & 0.58376
constantino & 0.55014
rainha & 0.53609
filha & 0.51772


In [31]:
print_tex_table_format(analogy(model_glove_300, ['duque','mulher'], ['homem']))

duquesa & 0.62535
filha & 0.58326
princesa & 0.57004
esposa & 0.56504
isabel & 0.55134


In [32]:
print_tex_table_format(analogy(model_glove_300, ['bombeiro','mulher'], ['homem']))

enfermeira & 0.45481
funcionária & 0.42359
bailarina & 0.41011
aposentada & 0.39691
cozinheira & 0.39164


In [10]:
vocab = model_glove_300.vocab

In [12]:
vocab['\x93dilma'].count

601835

In [13]:
vocab['dilma'].count

928201

In [15]:
vocab['bombeira'].count

772888

In [4]:
model_glove_300['bombeira']

array([ 8.46990e-02, -8.92620e-02,  5.58364e-01, -2.03585e-01,
        1.08923e-01, -2.42163e-01,  2.89198e-01,  4.09869e-01,
        3.27623e-01,  3.64873e-01,  8.50960e-02, -2.32650e-02,
        8.73880e-02,  1.40500e-01,  2.05821e-01, -1.82127e-01,
       -6.51860e-02, -3.36552e-01, -1.02728e-01, -3.34929e-01,
       -2.90140e-01, -9.08200e-03,  1.81294e-01, -6.15830e-02,
        8.32000e-03,  1.73651e-01, -1.03636e-01,  6.05953e-01,
       -4.43466e-01, -1.98048e-01,  3.71138e-01,  4.44034e-01,
       -2.11854e-01,  3.34787e-01, -1.09660e-01, -7.53010e-02,
        8.64020e-02,  2.97713e-01, -5.67650e-02, -2.25977e-01,
        3.04881e-01, -4.60010e-02,  1.96470e-02, -2.72156e-01,
        5.36984e-01,  1.07498e-01, -4.27200e-02, -4.18589e-01,
        2.10829e-01,  9.20270e-02,  1.45941e-01, -1.95663e-01,
        4.79655e-01, -4.90820e-01,  2.99465e-01, -2.24454e-01,
       -9.12500e-02, -3.88920e-02,  3.76397e-01,  4.16080e-02,
        1.33836e-01, -1.25850e-02,  6.31380e-02, -2.366

In [33]:
print_tex_table_format(analogy(model_glove_300, ['pai','mulher'], ['homem']))

mãe & 0.80397
filha & 0.79023
esposa & 0.75751
irmã & 0.73740
marido & 0.72126


In [35]:
print_tex_table_format(analogy(model_glove_300, ['médica','homem'], ['mulher']))

médico & 0.59299
clínica & 0.54171
medicina & 0.53221
paciente & 0.49083
saúde & 0.48852


## Testando os métodos indicados pelo Paper - FastText 300 e  Glove 300

In [5]:
models = []
fast = '../models/skip_s300fast.txt'
glove = '../models/glove_s300.txt'
models.append(getModel(fast))
models.append(getModel(glove))

In [6]:
model = models[0]
word1 = 'rei'
word2 = 'homem'
word3 = 'mulher'
# FastText skip300
list = model.most_similar(positive=[word1,word3], negative=[word2])

In [8]:
model = models[1]
# Glove s300
list2 = model.most_similar(positive=[word1,word3], negative=[word2])b

In [9]:
print(list)
print(list2)

[('rainha', 0.7449502348899841), ('rainha-regente', 0.6796571016311646), ('princesa-regente', 0.6528101563453674), ('esposa', 0.6449019908905029), ('princesa', 0.6373282670974731), ('consorte', 0.6351529955863953), ('«rainha', 0.6276739835739136), ('filha', 0.6266849040985107), ('desposa', 0.6201632022857666), ('rainha-a', 0.6200650930404663)]
[('rainha', 0.7193283438682556), ('filha', 0.6310628652572632), ('esposa', 0.627386212348938), ('princesa', 0.6068346500396729), ('isabel', 0.5972704887390137), ('irmã', 0.5631763935089111), ('consorte', 0.5383861064910889), ('trono', 0.5359665155410767), ('príncipe', 0.5172233581542969), ('joana', 0.5126430988311768)]


In [16]:
models[0].most_similar(positive=['primeira-dama','obama'])

[('primeira-damas', 0.8656221628189087),
 ('barack', 0.8468867540359497),
 ('ex-primeira-dama', 0.7975351214408875),
 ('obamas', 0.762374997138977),
 ('clinton', 0.7279771566390991),
 ('obama.a', 0.7279264330863953),
 ('segunda-dama', 0.721454918384552),
 ('vice-primeira-dama', 0.7174794673919678),
 ('robama', 0.713536262512207),
 ('hillary', 0.7119442224502563)]

## Exploração de Relações

### Pais-Presidente

In [4]:
def analogy(pos, neg):
    return model.most_similar(pos,neg)

In [4]:
modelsList = glob('../models/*fast.txt')
model  =  getModel(modelsList[0])

In [63]:
!ls ../models/ext/fasttext

cc.pt.300.vec  wiki-news-300d-1M-subword.vec  wiki-news-300d-1M.vec


### testes feitos em 26/abril Experimentos pro relatorio

In [64]:
modelBR = getModel('../models/ext/nilc/skip_s300.txt')

In [57]:
modelBR.most_similar(['rajoy','brasil'],['espanha'])

[('governo.dilma', 0.574255645275116),
 ('brasileira.brasília', 0.5486313700675964),
 ('temer', 0.5484487414360046),
 ('brasil\x94.', 0.5476303100585938),
 ('presidente.dilma', 0.5410065054893494),
 ('\x93dilma', 0.5401182174682617),
 ('governo.em', 0.5393486022949219),
 ('brasileirуo', 0.5374867916107178),
 ('impeachment.brasília', 0.5352616310119629),
 ('dilma\x94;', 0.5319989919662476)]

In [58]:
modelBR.most_similar(['japão','hamburguer'],['sushi'])

[('japão-', 0.5559786558151245),
 ('coréia/japão', 0.5467069149017334),
 ('alemanha', 0.54466712474823),
 ('tóquio', 0.5241599678993225),
 ('japão-00', 0.5202561616897583),
 ('japão.', 0.5193050503730774),
 ('japão-eua', 0.5170083045959473),
 ('japonese', 0.5135980844497681),
 ('japão-brasil', 0.5119423270225525),
 ('japãoe', 0.5075770616531372)]

In [59]:
modelBR.most_similar(['japão','hamburguer'],['eua'])

[('hamburgueria', 0.6456806659698486),
 ('hamburguerias', 0.6179660558700562),
 ('hamburgueres', 0.6069141030311584),
 ('hamburguers', 0.601699709892273),
 ('sushi', 0.598118007183075),
 ('shitake', 0.5863547325134277),
 ('japonesinho', 0.5727366209030151),
 ('miyaki', 0.5720593929290771),
 ('japãozinho', 0.5718837976455688),
 ('japonesinha', 0.5680685043334961)]

In [60]:
modelBR.most_similar(['japão','burguer'],['sushi'])

[('coréia/japão', 0.5539045333862305),
 ('japão.', 0.5332754850387573),
 ('canadá', 0.5200167298316956),
 ('unidos-japão', 0.5161298513412476),
 ('unidos', 0.5141969919204712),
 ('japão-', 0.51331627368927),
 ('coreia', 0.5122588276863098),
 ('coréia', 0.5115976333618164),
 ('unidos,e', 0.505362868309021),
 ('japão-eua', 0.5047953128814697)]

In [61]:
modelBR.most_similar(['japão','burguer'],['eua'])

[('x-burguer', 0.6115728616714478),
 ('sushi', 0.5868093371391296),
 ('youburguer', 0.582853376865387),
 ('burguers', 0.5764532089233398),
 ('japãozinho', 0.5654754638671875),
 ('burgueras', 0.5545012950897217),
 ('sushiman', 0.5537513494491577),
 ('sushi-bar', 0.5413253307342529),
 ('miashiro', 0.5411183834075928),
 ('sujiyama', 0.5286682844161987)]

In [62]:
modelBR.most_similar(['japão','pizza'],['sushi'])

[('japão.', 0.5643796920776367),
 ('japão-', 0.5541791915893555),
 ('coreia', 0.5432860851287842),
 ('japãoe', 0.5388157367706299),
 ('grã-bretanha', 0.5379763841629028),
 ('coréia/japão', 0.5329406261444092),
 ('coréia', 0.5293791890144348),
 ('japão-coreia', 0.5285506844520569),
 ('japão.a', 0.5228054523468018),
 ('ojapão', 0.5205355286598206)]

In [66]:
modelBR.most_similar(['japão','pizza'],['itália'])

[('sushi', 0.6261380910873413),
 ('taiyaki', 0.625140905380249),
 ('dorayaki', 0.5991750359535217),
 ('donuts', 0.5868442058563232),
 ('sukiyaki', 0.5848889350891113),
 ('cachorro-quente', 0.5829704999923706),
 ('miyaki', 0.5812810659408569),
 ('temaki', 0.5793612599372864),
 ('cachorro-quentes', 0.5730295777320862),
 ('panqueca', 0.5695137977600098)]

In [65]:
analogy(modelBR, ['alemanha','pedro'],['merkel'])

[('almerico', 0.5518991351127625),
 ('casimiro', 0.5496414303779602),
 ('espanha', 0.5445067286491394),
 ('d.álvaro', 0.544254720211029),
 ('espanha—', 0.5391455888748169)]

In [66]:
analogy(modelBR,['merkel','espanha'],['alemanha'])

[('rajoy', 0.7286854982376099),
 ('aznar', 0.6592538952827454),
 ('psoe', 0.620030403137207),
 ('zapatero', 0.6065688133239746),
 ('aznarez', 0.5837963223457336)]

In [67]:
analogy(modelBR,['merkel','espanha'],['rajoy'])

[('alemanha', 0.7647017240524292),
 ('—alemanha', 0.6655969619750977),
 ('alemanhã', 0.6514356136322021),
 ('alemanhao', 0.6505364775657654),
 ('flalemanha', 0.6500216722488403)]

In [68]:
analogy(modelBR, ['merkel','espanha'],['pedro'])

[('alemanha', 0.6142593026161194),
 ('merkels', 0.6087908744812012),
 ('—alemanha', 0.5496106147766113),
 ('alemanhã', 0.5418404340744019),
 ('daalemanha', 0.5393726825714111)]

In [69]:
analogy(modelBR, ['eua','merkel'],['obama'])

[('alemanha', 0.5969653129577637),
 ('merkels', 0.566200852394104),
 ('unidos-alemanha', 0.5261813402175903),
 ('norte-alemanha', 0.5244130492210388),
 ('unidos', 0.5220237970352173)]

Podemos observar algumas "sujeiras" na base, como palavras do vocabulário iguais [alemanha, -alemanha, alemanha., aalemanha, etc] que possivelmente poderiam acresentar o conhecimento porém são distinguidas.

In [70]:
analogy(modelBR, ['eua','rajoy'],['obama'])

[('espanha', 0.5240928530693054),
 ('psoe', 0.5091130137443542),
 ('ciudadanos', 0.483136922121048),
 ('psoe-ciudadanos', 0.47759437561035156),
 ('euskadiko', 0.467637836933136)]

In [71]:
analogy(modelBR, ['eua','conte'],['obama'])

[('aconte', 0.48051655292510986),
 ('reconte', 0.4697737693786621),
 ('contare', 0.46969354152679443),
 ('contar', 0.4668620526790619),
 ('laconte', 0.4613805115222931)]

In [72]:
analogy(modelBR, ['eua','berlusconi'],['obama'])

[('berlusconni', 0.6438901424407959),
 ('berlusconis', 0.6181439161300659),
 ('anti-berlusconi', 0.5776601433753967),
 ('fininvest', 0.5587810277938843),
 ('itália', 0.5479344725608826)]

In [73]:
analogy(modelBR, ['eua','monti'],['obama'])

[('donnini', 0.4999428987503052),
 ('brugnetti', 0.4880524277687073),
 ('pettini', 0.4826626479625702),
 ('rigamonti', 0.48097920417785645),
 ('gargani', 0.47859570384025574)]

In [74]:
analogy(modelBR, ['eua','enrico'],['obama'])

[('henrico', 0.5625894069671631),
 ('valerio', 0.5209310054779053),
 ('vitagliano', 0.5094318985939026),
 ('defrancesco', 0.5074387788772583),
 ('battistini', 0.49750643968582153)]

In [34]:
analogy(modelBR, ['eua','matteo'],['obama'])

[('itália', 0.5320799946784973),
 ('matteoti', 0.5318585634231567),
 ('itáliano', 0.5174192786216736),
 ('matteotti', 0.5150493383407593),
 ('matteoli', 0.5115050077438354),
 ('pierini', 0.5032851099967957),
 ('bartoloni', 0.5032413005828857),
 ('francesco', 0.5031288862228394),
 ('vittorini', 0.5030046105384827),
 ('itáliao', 0.5030040144920349)]

In [75]:
analogy(modelBR, ['eua','renzi'],['obama'])

[('itália', 0.5503702163696289),
 ('firenzi', 0.5223994851112366),
 ('unidos', 0.4993197023868561),
 ('matteo', 0.49648967385292053),
 ('renzis', 0.4830038547515869)]

In [76]:
analogy(modelBR, ['china','macarrão'],['yakisoba'])

[('xangai-china', 0.5721220374107361),
 ('chineses', 0.5484108924865723),
 ('chinês', 0.5439407229423523),
 ('eu-china', 0.5283563733100891),
 ('jianchang', 0.5267598032951355)]

In [77]:
analogy(modelBR, ['merkel','eua'],['alemanha'])

[('obama', 0.6659966111183167),
 ('barack', 0.6550233364105225),
 ('clinton.a', 0.594444751739502),
 ('clinton', 0.5942842960357666),
 ('clinton-assad', 0.5905669927597046)]

In [78]:
analogy(modelBR, ['merkel','italia'],['alemanha'])

[('renzi', 0.5919278860092163),
 ('italiani', 0.5549381971359253),
 ('ditalia', 0.5436686277389526),
 ('italiamo', 0.5381110310554504),
 ('italia0', 0.536308765411377)]

In [79]:
analogy(modelBR, ['merkel','brasil'],['alemanha'])

[('brasil)em', 0.5761305093765259),
 ('\x93dilma', 0.5755375623703003),
 ('brasil)\x94,', 0.5744104385375977),
 ('brasilês', 0.566964328289032),
 ('presidenta', 0.5624070167541504)]

In [80]:
analogy(modelBR, ['merkel','holanda'],['alemanha'])

[('dijssebloem', 0.6147078275680542),
 ('dijsselbloem', 0.6047036051750183),
 ('merkell', 0.596217155456543),
 ('juncker', 0.5896487236022949),
 ('merkels', 0.5853166580200195)]

In [7]:
model.most_similar(positive=['estados','unidos'])

[('eua', 0.8173750638961792),
 ('unidos,e', 0.7958360910415649),
 ('unidos.', 0.7799003720283508),
 ('unidos-', 0.7789234519004822),
 ('unidos.é', 0.7481932640075684),
 ('estadosunidos', 0.7360846400260925),
 ('unidos.fica', 0.7329683303833008),
 ('estados-unidos', 0.7277584075927734),
 ('unidos.ela', 0.7264972925186157),
 ('e.u.a', 0.7021260857582092)]

In [8]:
model.most_similar(positive=['holanda'])

[('holanda.', 0.8068545460700989),
 ('holandas', 0.7937363982200623),
 ('roterdã-holanda', 0.7753633856773376),
 ('frança/holanda', 0.7715115547180176),
 ('holanda-bélgica', 0.7678250670433044),
 ('holand', 0.7434194087982178),
 ('holanda.o', 0.7411854863166809),
 ('bélgica', 0.7325699925422668),
 ('espanha-holanda', 0.7311862111091614),
 ('holanda-alemanha', 0.7242059707641602)]

In [10]:
model.most_similar(positive=['holanda','país'])

[('frança/holanda', 0.6957878470420837),
 ('holandas', 0.6895419359207153),
 ('bélgica', 0.6713705658912659),
 ('espanha', 0.6704224944114685),
 ('portugal', 0.6673567295074463),
 ('inglaterra-holanda', 0.6671522259712219),
 ('espanha-holanda', 0.6555101871490479),
 ('frança', 0.6544818878173828),
 ('holanda-bélgica', 0.6528982520103455),
 ('abélgica', 0.647580623626709)]

In [81]:
analogy(modelBR, ['neve','calor'],['frio'])

[('chuva', 0.5868127346038818),
 ('granizo', 0.5454442501068115),
 ('evaporação', 0.5385027527809143),
 ('aguaneve', 0.5326475501060486),
 ('chuva.a', 0.5262962579727173)]

In [85]:
analogy(modelBR, ['neve', 'calor'])

[('frio', 0.7200497388839722),
 ('chuva', 0.7158651351928711),
 ('calorão', 0.7057672739028931),
 ('friagem', 0.6958485841751099),
 ('temperaturas', 0.6911848783493042)]

In [87]:
analogy(modelBR, ['neve'], ['frio'])

[('cinderela', 0.2628064453601837),
 ('«montanha', 0.252099871635437),
 ('montanha', 0.24522665143013),
 ('snowboards', 0.24412553012371063),
 ('augi', 0.23581856489181519)]

In [82]:
analogy(modelBR, ['cachoro','felino'],['canino'])

[('cachorrinho', 0.6359288096427917),
 ('cachora', 0.6300210952758789),
 ('cachorrito', 0.6155411005020142),
 ('cachorrona', 0.6097216606140137),
 ('cachorrinhos', 0.6066243648529053)]

In [84]:
analogy(modelBR, ['gato','canino'],['cachorro'])

[('caninos', 0.6211661100387573),
 ('felino', 0.6080907583236694),
 ('gatopardo', 0.5799951553344727),
 ('canini', 0.5735528469085693),
 ('canin', 0.5718478560447693)]

#### Update 02/05/2020
## Exploração de analogias em Fastext Cbow 300

In [121]:
fastext_cbow_300_pt = getModel('../models/ext/nilc/cbow_s300.txt')

In [64]:
print_tex_table_format(analogy(fastext_cbow_300_pt, ['merkel','espanha'], ['alemanha']))

merkez & 0.71391
merkel.o & 0.66448
merkell & 0.60394
merkels & 0.59695
aznarez & 0.59658


In [65]:
print_tex_table_format(analogy(fastext_cbow_300_pt, ['rajoy','brasil'], ['espanha']))

tucano.o & 0.52610
brasil), & 0.49735
fhc & 0.47915
tucanês & 0.47823
tucan & 0.47559


In [66]:
print_tex_table_format(analogy(fastext_cbow_300_pt, ['merkel','brasil'], ['alemanha']))

brasil), & 0.60519
brasil, & 0.60134
brasil. & 0.57375
brasil_ & 0.57179
brasil,o & 0.56721


In [67]:
print_tex_table_format(analogy(fastext_cbow_300_pt, ['gato','canino'], ['cachorro']))

ganino & 0.66918
gatopardo & 0.66591
gato-leopardo & 0.65985
gato-andino & 0.65909
anino & 0.65885


In [79]:
print_tex_table_format(analogy(fastext_cbow_300_pt, ['japão','burger'], ['sushi']))

brandenburger & 0.59032
teuto-norte-americano & 0.58228
marburger & 0.58033
neerlando-americano & 0.58029
nordbrandenburger & 0.57957


## Exploração de analogias em GloVe 300

In [34]:
!ls ../models/ext/nilc

cbow_s300.txt  glove_s300.txt  skip_s300.txt


In [122]:
glove_300_pt = getModel('../models/ext/nilc/glove_s300.txt')

In [124]:
print_tex_table_format(glove_300_pt.most_similar_cosmul(['obama','brasil'], ['eua'], 5))

dilma & 0.7911
lembrou & 0.7808
temer & 0.7797
rousseff & 0.7732
presidenta & 0.7682


In [125]:
print_tex_table_format(glove_300_pt.most_similar(['obama','brasil'], ['eua'], 5))

barack & 0.5020
dilma & 0.4605
lembrou & 0.4471
temer & 0.4423
rousseff & 0.4331


In [84]:
print_tex_table_format(glove_300_pt.most_similar('rainha'))

princesa & 0.68855
isabel & 0.67442
rei & 0.64577
esposa & 0.59849
irmã & 0.58511
príncipe & 0.58037
consorte & 0.56825
filha & 0.55982
margarida & 0.55164
maria & 0.55025


In [85]:
print_tex_table_format(glove_300_pt.most_similar_cosmul(['merkel','espanha'], ['alemanha'], 5))

aznar & 0.89494
rajoy & 0.80040
hollande & 0.77528
tsipras & 0.76724
sarkozy & 0.75418


In [68]:
print_tex_table_format(analogy(glove_300_pt, ['merkel','espanha'], ['alemanha']))

aznar & 0.57278
hollande & 0.51189
angela & 0.48401
rajoy & 0.47394
tsipras & 0.46970


In [69]:
print_tex_table_format(analogy(glove_300_pt, ['rajoy','brasil'], ['espanha']))

temer & 0.40790
dilma & 0.35921
interino & 0.35611
rousseff & 0.35059
fhc & 0.34323


In [70]:
print_tex_table_format(analogy(glove_300_pt, ['merkel','brasil'], ['alemanha']))

angela & 0.43377
obama & 0.43102
dilma & 0.42379
hollande & 0.40792
rousseff & 0.40785


In [71]:
print_tex_table_format(analogy(glove_300_pt, ['gato','canino'], ['cachorro']))

metálico & 0.36818
fino & 0.32361
mitológico & 0.31854
bárbaro & 0.31614
sedentário & 0.31549


In [72]:
print_tex_table_format(analogy(glove_300_pt, ['gato','canino'], ['felino']))

cão & 0.44328
cachorro & 0.41754
garoto & 0.36823
cavalo & 0.36806
meu & 0.34271


Parece que soluções laterais podem ser obtidas - Problema do método linear -> so verifica uma resposta

In [73]:
print_tex_table_format(analogy(glove_300_pt, ["cão","felino"], ['canino']))

gato & 0.55100
cachorro & 0.54393
cães & 0.47377
animal & 0.47262
macaco & 0.45189


In [74]:
print_tex_table_format(analogy(glove_300_pt, ["cão","felino"], ['gato']))

predador & 0.49886
animal & 0.41843
canídeo & 0.38755
pré-histórico & 0.37428
nativo & 0.37316


In [81]:
print_tex_table_format(analogy(glove_300_pt, ['japão','burger'], ['eua']))

aki & 0.37331
hokkaido & 0.35904
cooter & 0.35661
mcdonald's & 0.35195
shima & 0.34307


In [87]:
print_tex_table_format(glove_300_pt.most_similar_cosmul(['japão','burger'], ['alemanha'], 5))

mcdonald's & 0.83972
nightmare & 0.81441
abe & 0.79952
candy & 0.79379
mekk & 0.78611


### Utilizando Base Pre-Treinada com o FastText a partir da Wikipedia

src: https://fasttext.cc/docs/en/english-vectors.html

In [6]:
modelList = glob('../models/eng/*.vec')
model = getModel('../models/ext/fasttext/wiki-news-300d-1M.vec')

In [42]:
model.most_similar(['canine','cat'],['dog'])

[('feline', 0.8340439796447754),
 ('cats', 0.7032248973846436),
 ('felines', 0.6862081289291382),
 ('Feline', 0.6237226724624634),
 ('Cat', 0.5961567759513855),
 ('Saber-toothed', 0.5884695053100586),
 ('kitten', 0.5774850845336914),
 ('cat-related', 0.5745742321014404),
 ('cat-like', 0.5722991228103638),
 ('rodent', 0.5714539289474487)]

In [37]:
model.most_similar(positive=['Japan','burger'], negative=['U.S.A'])

[('hamburger', 0.6090177893638611),
 ('burgers', 0.5968979597091675),
 ('sushi', 0.5891664028167725),
 ('cheeseburger', 0.5540557503700256),
 ('ramen', 0.5526171922683716),
 ('sashimi', 0.5424988269805908),
 ('sandwich', 0.5383584499359131),
 ('hamburgers', 0.5355225205421448),
 ('fast-food', 0.5213922262191772),
 ('steak', 0.5205715894699097)]

In [38]:
model.most_similar(positive=['Japan','burger'], negative=['USA'])

[('hamburger', 0.630410373210907),
 ('sushi', 0.6180722713470459),
 ('burgers', 0.6126525402069092),
 ('ramen', 0.6112750172615051),
 ('yakitori', 0.6034594178199768),
 ('sashimi', 0.5863991975784302),
 ('cheeseburger', 0.5636043548583984),
 ('teriyaki', 0.5585513114929199),
 ('hamburgers', 0.5578770637512207),
 ('gyoza', 0.5487343668937683)]

In [35]:
model.most_similar(positive=['Japan','burger'], negative=['sushi'])

[('Germany', 0.589952290058136),
 ('Canada', 0.5562193393707275),
 ('U.S.A.', 0.5414534211158752),
 ('Australia', 0.5396606922149658),
 ('Britain', 0.5318068265914917),
 ('France', 0.5265113711357117),
 ('China', 0.5201818943023682),
 ('Burger', 0.5177664756774902),
 ('USA', 0.5154217481613159),
 ('Korea', 0.5103574395179749)]

In [39]:
model.most_similar(positive=['Japan','pizza'], negative=['sushi'])

[('Italy', 0.6227899789810181),
 ('Germany', 0.60132896900177),
 ('Pizza', 0.5715957283973694),
 ('Canada', 0.5570455193519592),
 ('Australia', 0.5475723743438721),
 ('Greece', 0.5475215911865234),
 ('Europe', 0.5442083477973938),
 ('USA', 0.534557580947876),
 ('China', 0.5322703123092651),
 ('Poland', 0.5261463522911072)]

### Nota-se a seguir a diferença para resultados lower-case, demonstrando a diferença entre vetores na base

In [7]:
model.most_similar(positive=['Merkel','Brazil'], negative=['Germany'])

[('Rousseff', 0.7313451170921326),
 ('Roussef', 0.6911478042602539),
 ('Rouseff', 0.6840354204177856),
 ('Dilma', 0.6766487956047058),
 ('Lula', 0.6736571788787842),
 ('Petrobras', 0.5874807238578796),
 ('Cardoso', 0.5827363133430481),
 ('Suplicy', 0.5818279981613159),
 ('Sarney', 0.5682660937309265),
 ('Brazilians', 0.5665429830551147)]

In [72]:
analogy(['snow','hot'], ['cold'])

[('snowfall', 0.5610437393188477),
 ('snow.', 0.5231389403343201),
 ('mountain', 0.5171799659729004),
 ('leopard', 0.5139029026031494),
 ('snows', 0.5021113157272339)]

In [6]:
analogy(['germany','pedro'],['merkel'])

[('spain', 0.5449022054672241),
 ('uruguay', 0.5438195466995239),
 ('austrailia', 0.5267531871795654),
 ('france', 0.5248713493347168),
 ('brasil', 0.520304799079895),
 ('italy', 0.5182735323905945),
 ('mexico', 0.5151197910308838),
 ('brazil', 0.5144617557525635),
 ('diego', 0.5095052719116211),
 ('portugal', 0.5055333375930786)]

In [12]:
analogy(['brazil', 'merkel'],['germany'])

[('Rouseff', 0.5680029988288879),
 ('Roussef', 0.558526337146759),
 ('juncker', 0.5372946262359619),
 ('Dilma', 0.5332738161087036),
 ('oboma', 0.5172426700592041),
 ('jarrett', 0.5172005295753479),
 ('blatter', 0.5170618295669556),
 ('butt.', 0.516357958316803),
 ('zuma', 0.5108668804168701),
 ('Merkal', 0.5086339712142944)]

In [13]:
analogy(['germany','Dilma'],['merkel'])

[('Brazil', 0.6214088201522827),
 ('Brasil', 0.6054402589797974),
 ('brazil', 0.5685147047042847),
 ('brazilian', 0.5589703321456909),
 ('PSDB', 0.5487357378005981),
 ('brazilians', 0.5418741106987),
 ('Brazilians', 0.5331428050994873),
 ('Roussef', 0.5214848518371582),
 ('Campinas', 0.5139091610908508),
 ('Brasilian', 0.5097894072532654)]

In [14]:
analogy(['germany','Lula'],['merkel'])

[('Brazil', 0.6056524515151978),
 ('Brasil', 0.5765014886856079),
 ('Argentina', 0.507718026638031),
 ('Brazilian', 0.493157297372818),
 ('Brasilia', 0.48074477910995483),
 ('Campinas', 0.4762342572212219),
 ('Uruguay', 0.47142088413238525),
 ('brazil', 0.46736085414886475),
 ('Ecuador', 0.46712833642959595),
 ('Brazilians', 0.46171486377716064)]

In [16]:
analogy(['Germany','Pedro'],['Merkel'])

[('Luis', 0.5925270318984985),
 ('Carlos', 0.5798797607421875),
 ('Juan', 0.5720661878585815),
 ('Miguel', 0.5616669654846191),
 ('Francisco', 0.5607050657272339),
 ('Fernando', 0.5570615530014038),
 ('Felipe', 0.5556502342224121),
 ('Spain', 0.5526822805404663),
 ('Brazil', 0.5482572317123413),
 ('Portugal', 0.5244642496109009)]

In [22]:
analogy(['canine','cat'],['dog'])

[('feline', 0.8340439796447754),
 ('cats', 0.703224778175354),
 ('felines', 0.6862080097198486),
 ('Feline', 0.6237226724624634),
 ('Cat', 0.5961567163467407),
 ('Saber-toothed', 0.5884692668914795),
 ('kitten', 0.5774850249290466),
 ('cat-related', 0.5745742321014404),
 ('cat-like', 0.5722991824150085),
 ('rodent', 0.5714539885520935)]

In [27]:
analogy(['china','dish'],[])

[('dishes', 0.7780748605728149),
 ('porcelain', 0.6793701648712158),
 ('cuisine', 0.6353912949562073),
 ('crockery', 0.625084638595581),
 ('tableware', 0.6163812875747681),
 ('chinese', 0.6142603158950806),
 ('dessert', 0.6098870635032654),
 ('chinaware', 0.5995481014251709),
 ('vase', 0.5953329801559448),
 ('kitchen', 0.5941058993339539)]

In [30]:
analogy(['spain','minister'],[])

[('ministers', 0.6690819263458252),
 ('france', 0.6395168900489807),
 ('governement', 0.639469563961029),
 ('minster', 0.6383272409439087),
 ('england', 0.6359577775001526),
 ('goverment', 0.6316805481910706),
 ('minister.', 0.6290231943130493),
 ('minsiter', 0.6260089874267578),
 ('catalonia', 0.6244105100631714),
 ('britain', 0.6221211552619934)]

In [22]:
analogy(['america','conte'],['obama'])

[('novelle', 0.49730175733566284),
 ('imaginaire', 0.4891471266746521),
 ('comedia', 0.4860025644302368),
 ('marchand', 0.4846856892108917),
 ('contes', 0.4834492802619934)]

* Testando relações de Comidas e Regiões

In [5]:
!ls ../models/ext/fasttext

cc.pt.300.vec  wiki-news-300d-1M-subword.vec  wiki-news-300d-1M.vec


In [11]:
model = getModel("../models/ext/fasttext/wiki-news-300d-1M-subword.vec")

In [16]:
analogy(model,['china'])

[('china.', 0.6974451541900635),
 ('chinas', 0.6943490505218506),
 ('porcelain', 0.6891270875930786),
 ('chinese', 0.6583600044250488),
 ('taiwan', 0.6369878053665161)]

In [19]:
analogy(model, ["usa","china"],["hamburger"])

[('russia', 0.6298999786376953),
 ('india', 0.5967906713485718),
 ('china.', 0.5941920876502991),
 ('america', 0.5697692632675171),
 ('brazil', 0.5591113567352295)]

In [23]:
analogy(model, ["hamburger", "japan"],["usa"])

[('hamburgers', 0.6760963201522827),
 ('burger', 0.6408655643463135),
 ('cheeseburger', 0.598473072052002),
 ('burgers', 0.5784617066383362),
 ('steak', 0.5778934359550476)]

In [24]:
analogy(model, ["japan", "hamburger"],["sushi"])

[('hamburg', 0.5717610120773315),
 ('germany', 0.564173698425293),
 ('america', 0.5614770650863647),
 ('europe', 0.5546668767929077),
 ('britain', 0.5541375875473022)]

In [21]:
analogy(model, ["usa", "sushi"],["hamburger"])

[('japan', 0.5848140716552734),
 ('sochi', 0.5593737363815308),
 ('usa.', 0.5162376165390015),
 ('japan.', 0.5158356428146362),
 ('peru', 0.5003796219825745)]

In [26]:
res = model.evaluate_word_analogies("../testsets/ownBR.txt")

In [27]:
res

(0.25,
 [{'section': 'country-food',
   'correct': [('JAPAN', 'SUSHI', 'USA', 'PIZZA')],
   'incorrect': [('JAPAN', 'SUSHI', 'USA', 'HAMBURGER'),
    ('JAPAN', 'SUSHI', 'FRANCE', 'TAPAS'),
    ('JAPAN', 'SUSHI', 'GERMANY', 'BRATWURST')]},
  {'section': 'Total accuracy',
   'correct': [('JAPAN', 'SUSHI', 'USA', 'PIZZA')],
   'incorrect': [('JAPAN', 'SUSHI', 'USA', 'HAMBURGER'),
    ('JAPAN', 'SUSHI', 'FRANCE', 'TAPAS'),
    ('JAPAN', 'SUSHI', 'GERMANY', 'BRATWURST')]}])



## Comparando Modelos FastText [ Default x Subword by Wiki]

In [7]:
modelList = glob('../models/*.vec')

def getAnalogyByTriple(triple):
    return analogy(triple[:2], triple[2:])

def getAnalogies(model):
    analogieSet = [
        ['canine','cat','dog'],
        ['Germany','Pedro','Merkel'],
        ['germany','Lula','merkel'],
        ['germany','Dilma','merkel'],
        ['brazil', 'merkel','germany'],
        ['germany','pedro','merkel'],
        ['merkel','italy','germany'],
        ['usa','conte','obama']]
    
    res=[]
    
    for triple in analogieSet:
        res.append(str(triple) + ' : ' + str(getAnalogyByTriple(triple)))
    
    
    
    return res

In [10]:
resultsDict = {}
for modelRef in modelList:
    model = getModel(modelRef)
    
    analogiesSet = getAnalogies(model)
    resultsDict[modelRef] = analogiesSet

["['canine', 'cat', 'dog'] : [('feline', 0.8340439796447754), ('cats', 0.7032248973846436), ('felines', 0.6862081289291382), ('Feline', 0.6237226724624634), ('Cat', 0.5961567759513855)]", "['Germany', 'Pedro', 'Merkel'] : [('Luis', 0.5925270915031433), ('Carlos', 0.5798799395561218), ('Juan', 0.5720661282539368), ('Miguel', 0.5616670250892639), ('Francisco', 0.5607051253318787)]", "['germany', 'Lula', 'merkel'] : [('Brazil', 0.6056525707244873), ('Brasil', 0.5765014886856079), ('Argentina', 0.5077179670333862), ('Brazilian', 0.4931572675704956), ('Brasilia', 0.48074474930763245)]", "['germany', 'Dilma', 'merkel'] : [('Brazil', 0.6214088201522827), ('Brasil', 0.6054402589797974), ('brazil', 0.5685147047042847), ('brazilian', 0.5589703917503357), ('PSDB', 0.5487357378005981)]", "['brazil', 'merkel', 'germany'] : [('Rouseff', 0.5680030584335327), ('Roussef', 0.5585262775421143), ('juncker', 0.5372946262359619), ('Dilma', 0.5332738161087036), ('oboma', 0.5172426700592041)]", "['germany', '

In [11]:
for modelRef in resultsDict:
    print('\n' + modelRef.split('/')[2])
    showList(resultsDict[modelRef])


wiki-news-300d-1M.vec
	 ['canine', 'cat', 'dog'] : [('feline', 0.8340439796447754), ('cats', 0.7032248973846436), ('felines', 0.6862081289291382), ('Feline', 0.6237226724624634), ('Cat', 0.5961567759513855)] 

	 ['Germany', 'Pedro', 'Merkel'] : [('Luis', 0.5925270915031433), ('Carlos', 0.5798799395561218), ('Juan', 0.5720661282539368), ('Miguel', 0.5616670250892639), ('Francisco', 0.5607051253318787)] 

	 ['germany', 'Lula', 'merkel'] : [('Brazil', 0.6056525707244873), ('Brasil', 0.5765014886856079), ('Argentina', 0.5077179670333862), ('Brazilian', 0.4931572675704956), ('Brasilia', 0.48074474930763245)] 

	 ['germany', 'Dilma', 'merkel'] : [('Brazil', 0.6214088201522827), ('Brasil', 0.6054402589797974), ('brazil', 0.5685147047042847), ('brazilian', 0.5589703917503357), ('PSDB', 0.5487357378005981)] 

	 ['brazil', 'merkel', 'germany'] : [('Rouseff', 0.5680030584335327), ('Roussef', 0.5585262775421143), ('juncker', 0.5372946262359619), ('Dilma', 0.5332738161087036), ('oboma', 0.51724267

## Acuracia dos Modelos pré-treinados para os TestSets obtidos

### Modelo PT disponivel em fasttext.cc

In [3]:
!ls ../models/ext/fasttext

cc.pt.300.vec  wiki-news-300d-1M-subword.vec  wiki-news-300d-1M.vec


In [5]:
fastxt_pt_model_ref = '../models/ext/fasttext/cc.pt.300.vec'
test_set = glob('../testsets/*Analogies*')

print('fastxt_pt', fastxt_pt_model_ref)
print('test_set', test_set)

fastxt_pt ../models/ext/fasttext/cc.pt.300.vec
test_set ['../testsets/LX-4WAnalogiesBr.txt', '../testsets/LX-4WAnalogies.txt']


In [26]:
fastxt_model = getModel(fastxt_pt_model_ref)

In [33]:
eval_analogies_acc_keydvec(fastxt_model, test_set, case_insensitive=True, dummy4unknown=True)
eval_analogies_acc_keydvec(fastxt_model, test_set, case_insensitive=True, dummy4unknown=False)
eval_analogies_acc_keydvec(fastxt_model, test_set, case_insensitive=False, dummy4unknown=True)
eval_analogies_acc_keydvec(fastxt_model, test_set, case_insensitive=False, dummy4unknown=False)

../models/ext/fasttext/cc.pt.300.vec  @  ../testsets/LX-4WAnalogiesBr.txt  :  0.5622508258343775  /   | setting 0 to missing words | case insensitive
../models/ext/fasttext/cc.pt.300.vec  @  ../testsets/LX-4WAnalogies.txt  :  0.5491507977354606  /   | setting 0 to missing words | case insensitive
../models/ext/fasttext/cc.pt.300.vec  @  ../testsets/LX-4WAnalogiesBr.txt  :  0.6830888458344866  /   | case insensitive
../models/ext/fasttext/cc.pt.300.vec  @  ../testsets/LX-4WAnalogies.txt  :  0.666736096646532  /   | case insensitive
../models/ext/fasttext/cc.pt.300.vec  @  ../testsets/LX-4WAnalogiesBr.txt  :  0.5436837908645632  /   | setting 0 to missing words | case sensitive
../models/ext/fasttext/cc.pt.300.vec  @  ../testsets/LX-4WAnalogies.txt  :  0.5313661577171613  /   | setting 0 to missing words | case sensitive
../models/ext/fasttext/cc.pt.300.vec  @  ../testsets/LX-4WAnalogiesBr.txt  :  0.6775017743080198  /   | case sensitive
../models/ext/fasttext/cc.pt.300.vec  @  ../testse

* Modelos FastText: CBOW(300) e Skipgram(300); e Glove(300) disponiveis no repositorio de wordEmebddings do NILC

#### Modelos obtidos em Nilc

In [47]:
nilc_model_ref_list = glob('../models/ext/nilc/*.txt')

In [48]:
pool = multiprocessing.Pool()
pool_res = pool.starmap(eval_all_params, product(nilc_model_ref_list, test_set))

../models/ext/nilc/glove_s300.txt @ ../testsets/LX-4WAnalogiesBr.txt0.43655313817063446 | setting 0 to missing words | case insensitive
../models/ext/nilc/glove_s300.txt @ ../testsets/LX-4WAnalogies.txt0.42889003259564246 | setting 0 to missing words | case insensitive
../models/ext/nilc/skip_s300.txt @ ../testsets/LX-4WAnalogiesBr.txt0.49407677412005924 | setting 0 to missing words | case insensitive
../models/ext/nilc/cbow_s300.txt @ ../testsets/LX-4WAnalogies.txt0.3837708011665809 | setting 0 to missing words | case insensitive
../models/ext/nilc/cbow_s300.txt @ ../testsets/LX-4WAnalogiesBr.txt0.3660439685613396 | setting 0 to missing words | case insensitive
../models/ext/nilc/skip_s300.txt @ ../testsets/LX-4WAnalogies.txt0.49385257619946243 | setting 0 to missing words | case insensitive
../models/ext/nilc/glove_s300.txt @ ../testsets/LX-4WAnalogiesBr.txt0.47632363907531694 | case insensitive
../models/ext/nilc/glove_s300.txt @ ../testsets/LX-4WAnalogies.txt0.46462644034196504 | c

In [19]:
skip_s300 = getModel('../models/ext/nilc//skip_s300.txt')

In [21]:
skip_s300

<gensim.models.keyedvectors.Word2VecKeyedVectors at 0x7f42a2f6f748>

In [20]:
skip_s300.evaluate_word_analogies('../testsets/LX-4WAnalogies.txt', case_insensitive=True, dummy4unknown=False)

(0.5350018585057613,
 [{'section': 'capital-common-countries',
   'correct': [('ATENAS', 'GRÉCIA', 'BAGDADE', 'IRAQUE'),
    ('ATENAS', 'GRÉCIA', 'BANGUECOQUE', 'TAILÂNDIA'),
    ('ATENAS', 'GRÉCIA', 'PEQUIM', 'CHINA'),
    ('ATENAS', 'GRÉCIA', 'BERLIM', 'ALEMANHA'),
    ('ATENAS', 'GRÉCIA', 'CAIRO', 'EGITO'),
    ('ATENAS', 'GRÉCIA', 'CAMBERRA', 'AUSTRÁLIA'),
    ('ATENAS', 'GRÉCIA', 'HANÓI', 'VIETNAME'),
    ('ATENAS', 'GRÉCIA', 'HAVANA', 'CUBA'),
    ('ATENAS', 'GRÉCIA', 'HELSÍNQUIA', 'FINLÂNDIA'),
    ('ATENAS', 'GRÉCIA', 'LONDRES', 'INGLATERRA'),
    ('ATENAS', 'GRÉCIA', 'MADRID', 'ESPANHA'),
    ('ATENAS', 'GRÉCIA', 'MOSCOVO', 'RÚSSIA'),
    ('ATENAS', 'GRÉCIA', 'OSLO', 'NORUEGA'),
    ('ATENAS', 'GRÉCIA', 'OTTAWA', 'CANADÁ'),
    ('ATENAS', 'GRÉCIA', 'PARIS', 'FRANÇA'),
    ('ATENAS', 'GRÉCIA', 'ROMA', 'ITÁLIA'),
    ('ATENAS', 'GRÉCIA', 'ESTOCOLMO', 'SUÉCIA'),
    ('ATENAS', 'GRÉCIA', 'TÓQUIO', 'JAPÃO'),
    ('BAGDADE', 'IRAQUE', 'BANGUECOQUE', 'TAILÂNDIA'),
    ('BAGDADE', 'IR

### Question Seen Modelos NILC

In [4]:
cbow_s300 = getModel('../models/ext/nilc/cbow_s300.txt')

In [14]:
question_seen_cbow_s300_lxPT_ci = question_seen_in_model(cbow_s300, testset[1], True )
question_seen_cbow_s300_lxPT_cs = question_seen_in_model(cbow_s300, testset[1], False )
question_seen_cbow_s300_LxBR_ci = question_seen_in_model(cbow_s300, testset[0], True)
question_seen_cbow_s300_LxBR_cs = question_seen_in_model(cbow_s300, testset[0], False)

In [15]:
print(question_seen_cbow_s300_lxPT_ci['total'])
print(question_seen_cbow_s300_lxPT_cs['total'])
print(question_seen_cbow_s300_LxBR_ci['total'])
print(question_seen_cbow_s300_LxBR_cs['total'])

{'q_no': 17487, 'q_seen': 16142}
{'q_no': 17487, 'q_seen': 7125}
{'q_no': 17558, 'q_seen': 16092}
{'q_no': 17558, 'q_seen': 7114}


In [24]:
print('%.2f' % (100*float(question_seen_cbow_s300_lxPT_ci['total']['q_seen']/question_seen_cbow_s300_lxPT_ci['total']['q_no'])))
print('%.2f' % (100*float(question_seen_cbow_s300_lxPT_cs['total']['q_seen']/question_seen_cbow_s300_lxPT_cs['total']['q_no'])))
print('%.2f' % (100*float(question_seen_cbow_s300_LxBR_ci['total']['q_seen']/question_seen_cbow_s300_LxBR_ci['total']['q_no'])))
print('%.2f' % (100*float(question_seen_cbow_s300_LxBR_cs['total']['q_seen']/question_seen_cbow_s300_LxBR_cs['total']['q_no'])))

92.31
40.74
91.65
40.52


In [28]:
skip_s300 = getModel('../models/ext/nilc/skip_s300.txt')

In [16]:
question_seen_skip_s300_lxPT_ci = question_seen_in_model(skip_s300, testset[1], True )
question_seen_skip_s300_lxPT_cs = question_seen_in_model(skip_s300, testset[1], False )
question_seen_skip_s300_LxBR_ci = question_seen_in_model(skip_s300, testset[0], True)
question_seen_skip_s300_LxBR_cs = question_seen_in_model(skip_s300, testset[0], False)

In [17]:
print(question_seen_skip_s300_lxPT_ci['total'])
print(question_seen_skip_s300_lxPT_cs['total'])
print(question_seen_skip_s300_LxBR_ci['total'])
print(question_seen_skip_s300_LxBR_cs['total'])

{'q_no': 17487, 'q_seen': 16142}
{'q_no': 17487, 'q_seen': 7125}
{'q_no': 17558, 'q_seen': 16092}
{'q_no': 17558, 'q_seen': 7114}


In [30]:
print(' skip_s300_lxPT_ci %.2f' % (100 * float(question_seen_skip_s300_lxPT_ci['total']['q_seen']/question_seen_skip_s300_lxPT_ci['total']['q_no'])))
print(' skip_s300_lxPT_cs %.2f' % (100 * float(question_seen_skip_s300_lxPT_cs['total']['q_seen']/question_seen_skip_s300_lxPT_cs['total']['q_no'])))
print(' skip_s300_LxBR_ci %.2f' % (100 * float(question_seen_skip_s300_LxBR_ci['total']['q_seen']/question_seen_skip_s300_LxBR_ci['total']['q_no'])))
print(' skip_s300_LxBR_cs %.2f' % (100 * float(question_seen_skip_s300_LxBR_cs['total']['q_seen']/question_seen_skip_s300_LxBR_cs['total']['q_no'])))

 skip_s300_lxPT_ci 92.31
 skip_s300_lxPT_cs 40.74
 skip_s300_LxBR_ci 91.65
 skip_s300_LxBR_cs 40.52


In [27]:
glove_s300 = getModel('../models/ext/nilc/glove_s300.txt')

In [42]:
glove_s300.most_similar_cosmul(['itália','capital'])

[('frança', 0.5731724500656128),
 ('cidade', 0.5694936513900757),
 ('espanha', 0.5664260983467102),
 ('país', 0.5657956600189209),
 ('sul', 0.5462279915809631),
 ('alemanha', 0.5461822152137756),
 ('norte', 0.5430349707603455),
 ('roma', 0.5424421429634094),
 ('grécia', 0.5380972623825073),
 ('parte', 0.5333743095397949)]

In [18]:
question_seen_glove_s300_lxPT_ci = question_seen_in_model(glove_s300, testset[1], True )
question_seen_glove_s300_lxPT_cs = question_seen_in_model(glove_s300, testset[1], False )
question_seen_glove_s300_LxBR_ci = question_seen_in_model(glove_s300, testset[0], True)
question_seen_glove_s300_LxBR_cs = question_seen_in_model(glove_s300, testset[0], False)

In [19]:
print(question_seen_glove_s300_lxPT_ci['total'])
print(question_seen_glove_s300_lxPT_cs['total'])
print(question_seen_glove_s300_LxBR_ci['total'])
print(question_seen_glove_s300_LxBR_cs['total'])

{'q_no': 17487, 'q_seen': 16142}
{'q_no': 17487, 'q_seen': 7125}
{'q_no': 17558, 'q_seen': 16092}
{'q_no': 17558, 'q_seen': 7114}


In [29]:
print(' glove_s300_lxPT_ci %2f' % (100 * float(question_seen_glove_s300_lxPT_ci['total']['q_seen']/question_seen_glove_s300_lxPT_ci['total']['q_no'])))
print(' glove_s300_lxPT_cs %2f' % (100 * float(question_seen_glove_s300_lxPT_cs['total']['q_seen']/question_seen_glove_s300_lxPT_cs['total']['q_no'])))
print(' glove_s300_LxBR_ci %2f' % (100 * float(question_seen_glove_s300_LxBR_ci['total']['q_seen']/question_seen_glove_s300_LxBR_ci['total']['q_no'])))
print(' glove_s300_LxBR_cs %2f' % (100 * float(question_seen_glove_s300_LxBR_cs['total']['q_seen']/question_seen_glove_s300_LxBR_cs['total']['q_no'])))

 glove_s300_lxPT_ci 92.308572
 glove_s300_lxPT_cs 40.744553
 glove_s300_LxBR_ci 91.650530
 glove_s300_LxBR_cs 40.517143


In [25]:
model  = getModel('../models/ext/nilc/cbow_s300.txt')

In [26]:
cbow_s300 = model

#### Question Seen com Vocabulário irrestrito

In [30]:
question_seen_glove_s300_lxPT_irrest = question_seen_in_model(glove_s300, testset[1], True, restrict_vocab=None)
question_seen_glove_s300_LxBR_irrest = question_seen_in_model(glove_s300, testset[0], True, restrict_vocab=None)

question_seen_cbow_s300_lxPT_irrest = question_seen_in_model(cbow_s300, testset[1], True, restrict_vocab=None)
question_seen_cbow_s300_LxBR_irrest = question_seen_in_model(cbow_s300, testset[0], True, restrict_vocab=None)

question_seen_skip_s300_lxPT_irrest = question_seen_in_model(skip_s300, testset[1], True, restrict_vocab=None)
question_seen_skip_s300_LxBR_irrest = question_seen_in_model(skip_s300, testset[0], True, restrict_vocab=None)

In [31]:
print(' glove_s300_lxPT_ci %2f' % (100 * float(question_seen_glove_s300_lxPT_irrest['total']['q_seen']/question_seen_glove_s300_lxPT_irrest['total']['q_no'])))
print(' glove_s300_lxPT_cs %2f' % (100 * float(question_seen_glove_s300_lxPT_irrest['total']['q_seen']/question_seen_glove_s300_lxPT_irrest['total']['q_no'])))

print(' cbow_s300_LxBR_ci %2f' % (100 * float(question_seen_cbow_s300_LxBR_irrest['total']['q_seen']/question_seen_cbow_s300_LxBR_irrest['total']['q_no'])))
print(' cbow_s300_LxBR_cs %2f' % (100 * float(question_seen_cbow_s300_LxBR_irrest['total']['q_seen']/question_seen_cbow_s300_LxBR_irrest['total']['q_no'])))

print(' skip_s300_LxBR_ci %2f' % (100 * float(question_seen_skip_s300_LxBR_irrest['total']['q_seen']/question_seen_skip_s300_LxBR_irrest['total']['q_no'])))
print(' skip_s300_LxBR_cs %2f' % (100 * float(question_seen_skip_s300_LxBR_irrest['total']['q_seen']/question_seen_skip_s300_LxBR_irrest['total']['q_no'])))

 glove_s300_lxPT_ci 97.192200
 glove_s300_lxPT_cs 97.192200
 cbow_s300_LxBR_ci 96.992824
 cbow_s300_LxBR_cs 96.992824
 skip_s300_LxBR_ci 96.992824
 skip_s300_LxBR_cs 96.992824


In [32]:
question_seen_glove_s300_lxPT_irrest_ci = question_seen_in_model(glove_s300, testset[1], True, restrict_vocab=None)
question_seen_glove_s300_LxBR_irrest_ci = question_seen_in_model(glove_s300, testset[0], True, restrict_vocab=None)

question_seen_glove_s300_lxPT_irrest_cs = question_seen_in_model(glove_s300, testset[1], False, restrict_vocab=None)
question_seen_glove_s300_LxBR_irrest_cs = question_seen_in_model(glove_s300, testset[0], False, restrict_vocab=None)

In [37]:
print(' glove_s300_lxPT_ci %.2f' % (100 * float(question_seen_glove_s300_lxPT_irrest_ci['total']['q_seen']/question_seen_glove_s300_lxPT_irrest_ci['total']['q_no'])))
print(' glove_s300_lxBr_ci %.2f' % (100 * float(question_seen_glove_s300_LxBR_irrest_ci['total']['q_seen']/question_seen_glove_s300_LxBR_irrest_ci['total']['q_no'])))

print(' glove_s300_lxPT_cs %.2f' % (100 * float(question_seen_glove_s300_lxPT_irrest_cs['total']['q_seen']/question_seen_glove_s300_lxPT_irrest_cs['total']['q_no'])))
print(' glove_s300_lxBr_cs %.2f' % (100 * float(question_seen_glove_s300_LxBR_irrest_cs['total']['q_seen']/question_seen_glove_s300_LxBR_irrest_cs['total']['q_no'])))

 glove_s300_lxPT_ci 97.19
 glove_s300_lxBr_ci 96.99
 glove_s300_lxPT_cs 42.13
 glove_s300_lxBr_cs 41.91


In [22]:
res = model.evaluate_word_analogies('../testsets/ownBR.txt')

## Treinamento de próprio modelo

### Setup do dataset

Dump do wiki

In [87]:
!wget https://dumps.wikimedia.org/ptwiki/20190920/ptwiki-20190920-pages-articles-multistream.xml.bz2 -O ptwiki-20190920-pages-articles-multistream.xml.bz2

--2019-10-01 23:20:53--  https://dumps.wikimedia.org/ptwiki/20190920/ptwiki-20190920-pages-articles-multistream.xml.bz2
Resolving dumps.wikimedia.org (dumps.wikimedia.org)... 208.80.154.7, 2620:0:861:1:208:80:154:7
Connecting to dumps.wikimedia.org (dumps.wikimedia.org)|208.80.154.7|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1767735669 (1,6G) [application/octet-stream]
Saving to: ‘ptwiki-20190920-pages-articles-multistream.xml.bz2’


2019-10-01 23:36:03 (1,86 MB/s) - ‘ptwiki-20190920-pages-articles-multistream.xml.bz2’ saved [1767735669/1767735669]



In [91]:
!bzip2 -dk ptwiki-20190920-pages-articles-multistream.xml.bz2 

In [94]:
!ls

 fastText
 ptWiki0920
 ptwiki-20190920-pages-articles-multistream.xml
 ptwiki-20190920-pages-articles-multistream.xml.bz2
'Testes de Analogias para bases.ipynb'


Limpeza do XML

In [93]:
!perl fastText/wikifil.pl ptwiki-20190920-pages-articles-multistream.xml > ptWiki0920

In [39]:
!head -c 120 ptWiki0920

 forma o estrelar na grande nuvem de magalh es uma gal xia irregular mosaico da nebulosa do caranguejo remanescente de u

In [11]:
!grep -w "galáxia irregular" ptwiki-20190920-pages-articles-multistream.xml


[[Imagem:Starsinthesky.jpg|thumb|[[Estrela|Formação estrelar]] na [[Grande Nuvem de Magalhães]], uma [[galáxia irregular]].]]
Algumas galáxias de menor porte orbitam a Via Láctea, sendo, portanto [[galáxia satélite|galáxias satélite]]. A mais próxima delas é a [[Galáxia Anã do Cão Maior]], situada a cerca de 42 mil anos-luz do centro galáctico, seguida pela [[Galáxia Anã Elíptica de Sagitário]]. A [[Grande Nuvem de Magalhães]] e a [[Pequena Nuvem de Magalhães]] são as maiores dentre as galáxias satélite da Via Láctea. Ambas são visíveis a olho nu no hemisfério sul celeste como manchas brilhantes, sendo que a Grande Nuvem de Magalhães é a galáxia mais brilhante vista da Terra depois da própria Via Láctea. Ambas são [[galáxia irregular|estruturas irregulares]] e apresentam regiões de intensa formação estelar. Uma corrente de gases existe ligando as nuvens de Magalhães entre si e também com a Via Láctea, sendo sugerido que teria origem na interação gravitacional entre as galáxias.&lt;ref&

Alternatives to cleaning dump

https://radimrehurek.com/gensim/corpora/wikicorpus.html

## Treinamento

#### Modelo sem pre-processamento

In [98]:
model = fasttext.train_unsupervised('ptWiki0920')

In [100]:
model.save_model('../models/ptWiki0920-noPP.vec')

In [3]:
trainUnsupervisedTxt('ptWiki0920', '../models/ptWiki0920-noPP-skip-300'+'.bin', dimension=300)

'Model ptWiki0920 saved(300 <fasttext.FastText._FastText object at 0x7f0bb7f949b0>) @ ../models/ptWiki0920-noPP-skip-300.bin'

#### Modelos com pre-processamento

In [7]:
ptStopwords = nltk.corpus.stopwords.words('portuguese')

def normalize(txtData) :
    return txtData.lower()

def rmStopWords(txtData) :
    tokens = nltk.word_tokenize(txtData)
    processedData = [ word for word in tokens if word not in ptStopwords]
    return " ".join(processedData)
    
        

In [5]:
with open('ptWiki0920', 'r') as file:
    with open('ptWiki0920-norm-stop', 'w') as outFile:
        for line in file:
            outFile.write(normalize(rmStopWords(line)))
# print (preProcessData[80:100])

In [7]:
trainUnsupervisedTxt('ptWiki0920-norm-stop', '../models/ptWiki0920-norm-stop-skip-100'+'.vec')

In [31]:
trainUnsupervisedTxt('ptWiki0920-norm-stop', '../models/ptWiki0920-norm-stop-skip-300'+'.bin', dimension=300)

'Model ptWiki0920-norm-stop saved(300 <fasttext.FastText._FastText object at 0x7f0badf37518>) @ ../models/ptWiki0920-norm-stop-skip-300.bin'

\* ***aparentemente o script de decupagem do html normaliza o texto***

#### Modelo so com rm-Stopwords e ` dim 300`

In [23]:
with open('ptWiki0920', 'r') as file:
    with open('ptWiki0920-stop', 'w') as outFile:
        for line in file:
            outFile.write(rmStopWords(line))

In [30]:
trainUnsupervisedTxt('ptWiki0920-stop', '../models/ptWiki0920-stop-skip-300'+'.bin', dimension=300)

In [36]:
trainUnsupervisedTxt('ptWiki0920-stop', '../models/ptWiki0920-stop-skip-100'+'.bin', dimension=100)

### Comparação do Vocabulário pre-processado x n-processado

In [60]:
most_frequent_noPP = noPP_skip_100['model'].index2entity[:100]

In [61]:
ptStopwords = nltk.corpus.stopwords.words('portuguese')

intersection_most_frequent = [word for word in most_frequent_noPP if word in ptStopwords]

In [71]:
print(intersection_most_frequent)
print(len(intersection_most_frequent))

['de', 'o', 'a', 'e', 'do', 'em', 'da', 'que', 'um', 'com', 'no', 'uma', 'para', 'na', 'por', 'os', 'se', 'foi', 'como', 'as', 'dos', 'ao', 'mais', 'sua', 'ou', 'seu', 'das', 'pela', 'pelo', 'ele', 'mas', 'entre', 'nos', 'era', 'tem', 'foram', 'quando', 'seus', 'esta']
39


In [58]:
norm_stop_skip_100['model'].index2entity[:100]

['one',
 'zero',
 'two',
 'nine',
 'three',
 'eight',
 'four',
 'five',
 'seven',
 'six',
 's',
 'categoria',
 'm',
 'es',
 'n',
 'rio',
 'h',
 'f',
 'ncia',
 'utc',
 'p',
 'min',
 'tamb',
 'c',
 't',
 'the',
 'ser',
 'est',
 'ria',
 'd',
 'pr',
 'v',
 'l',
 'discuss',
 'dia',
 'id',
 'j',
 'rios',
 'bgcolor',
 'of',
 'sobre',
 'anos',
 'liga',
 'at',
 'vel',
 'lia',
 'brasil',
 'cidade',
 'r',
 'redirecionamento',
 'hist',
 'tica',
 'in',
 'g',
 'rd',
 'mar',
 'dio',
 'janeiro',
 'ano',
 'externas',
 'pol',
 'nome',
 'grande',
 'lan',
 'ncias',
 'onde',
 'ver',
 'b',
 'cio',
 'ap',
 'pa',
 'estados',
 'estado',
 'rea',
 'regi',
 'durante',
 'primeiro',
 'local',
 'primeira',
 'outros',
 'pode',
 'parte',
 'al',
 'ter',
 'km',
 'jogos',
 'gols',
 'sendo',
 'rias',
 'esp',
 'unidos',
 'q',
 'lista',
 'nia',
 'i',
 'dois',
 'ainda',
 'cada',
 'morte',
 'sul']

In [72]:
noPP_skip_100['model'].index2entity[:100]


['de',
 'o',
 'one',
 'zero',
 'two',
 'a',
 'e',
 'nine',
 'do',
 'em',
 'three',
 'da',
 'eight',
 'que',
 'four',
 'five',
 'seven',
 'six',
 's',
 'um',
 'com',
 'no',
 'categoria',
 'uma',
 'para',
 'm',
 'na',
 'es',
 'por',
 'os',
 'se',
 'n',
 'foi',
 'como',
 'as',
 'dos',
 'rio',
 'h',
 'ao',
 'mais',
 'sua',
 'f',
 'ou',
 'ncia',
 'utc',
 'p',
 'seu',
 'das',
 'min',
 'tamb',
 'c',
 't',
 'the',
 'ser',
 'est',
 'ria',
 'pela',
 'pelo',
 'd',
 'pr',
 'ele',
 'v',
 'mas',
 'l',
 'discuss',
 'entre',
 'dia',
 'id',
 'j',
 'rios',
 'bgcolor',
 'of',
 'sobre',
 'nos',
 'anos',
 'liga',
 'at',
 'vel',
 'lia',
 'era',
 'brasil',
 'cidade',
 'r',
 'tem',
 'redirecionamento',
 'hist',
 'tica',
 'in',
 'g',
 'rd',
 'mar',
 'dio',
 'foram',
 'janeiro',
 'quando',
 'ano',
 'seus',
 'externas',
 'pol',
 'esta']

In [78]:
pp_noPP_intersection = [pp_word for pp_word in norm_stop_skip_100['model'].index2entity[:100] if pp_word in most_frequent_noPP]

In [79]:
print('Intersection between Preprocessed model and not PP \n',pp_noPP_intersection)
print('Tamanho %d' % len(pp_noPP_intersection))

Intersection between Preprocessed model and not PP 
 ['one', 'zero', 'two', 'nine', 'three', 'eight', 'four', 'five', 'seven', 'six', 's', 'categoria', 'm', 'es', 'n', 'rio', 'h', 'f', 'ncia', 'utc', 'p', 'min', 'tamb', 'c', 't', 'the', 'ser', 'est', 'ria', 'd', 'pr', 'v', 'l', 'discuss', 'dia', 'id', 'j', 'rios', 'bgcolor', 'of', 'sobre', 'anos', 'liga', 'at', 'vel', 'lia', 'brasil', 'cidade', 'r', 'redirecionamento', 'hist', 'tica', 'in', 'g', 'rd', 'mar', 'dio', 'janeiro', 'ano', 'externas', 'pol']
Tamanho 61


Das 100 palavras mais frequentes no modelo sem pre-processamento, 39 são stopwords.

In [96]:
noPP_skip_100['model']['Brasil']

array([-3.20488930e-01,  7.34150708e-02, -2.25654840e-01, -4.33122486e-01,
        4.23747241e-01,  7.08579719e-02, -4.41147760e-02, -3.95013273e-01,
       -3.51756483e-01, -1.84792265e-01, -8.73326540e-01, -1.38309136e-01,
       -9.54652876e-02,  5.06048277e-02, -2.97635913e-01,  3.32867801e-01,
        1.21611424e-01, -2.01417714e-01,  4.43238802e-02, -1.68290004e-01,
        4.61084433e-02,  3.11402250e-02, -1.68521747e-01,  6.06458932e-02,
        9.56166461e-02, -9.55177844e-02, -2.68204331e-01, -4.78912503e-01,
       -1.84049681e-02,  1.75404176e-01,  3.42079312e-01,  3.66617888e-02,
       -6.45369440e-02,  3.69723499e-01,  5.07388785e-02,  2.80019730e-01,
       -3.74184877e-01,  2.48893976e-01, -3.05782165e-02, -1.05200201e-01,
        2.54557043e-01, -4.61127758e-01,  6.94447905e-02, -1.82592208e-04,
        8.04761276e-02, -1.84754059e-01,  8.09174478e-02,  1.36033118e-01,
       -2.78557301e-01, -6.46118701e-01, -2.71684974e-01,  3.93784255e-01,
        1.29571006e-01, -

In [97]:
noPP_skip_100['model']['brasil']

array([-6.3858616e-01,  3.6946565e-02, -2.1032934e-01,  4.9100355e-03,
        5.1675802e-01,  3.3930570e-01, -1.6995139e-01, -3.5193259e-01,
       -7.5601719e-02, -1.1550857e-01, -4.3874732e-01, -1.7515351e-01,
       -2.6592660e-01,  5.1678073e-02, -4.3907508e-01,  4.1048348e-01,
       -1.2571995e-01, -5.4090235e-02, -2.4331547e-01,  7.4416143e-01,
       -2.3910758e-01, -2.1536571e-01, -5.3951602e-02,  2.4117766e-01,
        9.7411059e-02, -1.3098441e-01, -1.7464896e-01, -3.3646679e-01,
       -1.7805803e-01,  2.1332951e-01,  6.3000315e-01,  3.4801105e-01,
       -3.7784794e-01,  5.0278330e-01, -1.7716159e-01,  1.5232222e-01,
       -4.4185031e-02,  1.4282545e-01, -4.3121364e-02, -1.9646448e-01,
        7.0832327e-02, -2.0538923e-01,  1.6553715e-02,  1.2250408e-02,
        2.8486386e-01,  5.7246373e-03,  2.7733743e-01,  3.9052942e-01,
       -3.7644553e-01, -5.7954472e-01, -2.2947466e-01,  4.5308942e-04,
       -5.2892751e-01, -2.0448387e-01,  1.3774104e-01,  4.1630098e-01,
      

In [23]:
def question_seen_in_model(model, test, case_insensitive=True, restrict_vocab = 300000):
    ok_vocab = [w for w in model.index2word[:restrict_vocab]]
    if(case_insensitive):
        tmp_vocab = [w.lower() for w in ok_vocab]
        ok_vocab = tmp_vocab
    with open(test, 'r') as analogies:
        sections, section = {}, None
        qs_section, qt_section = 0, 0
        for line_no, line in enumerate(analogies):
            if ': ' in line:
                if section:
                    old_section = section
                    sections[old_section]['qs'] = qs_section
                    sections[old_section]['qt'] = qt_section
                    sections[old_section]['coverage'] = float(qs_section/qt_section)
                section = line
                sections[section] = {}
                qs_section = 0
                qt_section = 0
            else:
                if not section:
                    raise ValueError('missing entry header')
                else:
                    if case_insensitive:
                        a,b,c,d = [word.lower() for word in line.split()] 
                    else:
                        a,b,c,d = [word for word in line.split()]
                    if a in ok_vocab and b in ok_vocab and c in ok_vocab and d in ok_vocab:
                        qs_section += 1
                    qt_section += 1
        old_section = section
        sections[old_section]['qs'] = qs_section
        sections[old_section]['qt'] = qt_section
        sections[old_section]['coverage'] = float(qs_section/qt_section)
    
    total = {'q_no':0, 'q_seen': 0}
    for sec_key in sections:
        total['q_no'] += sections[sec_key]['qt']
        total['q_seen'] += sections[sec_key]['qs']
    
    return {'sections':sections, 'total':total}
                    

In [204]:
question_seen_noPP_skip_100_lxPT_ci = question_seen_in_model(noPP_skip_100['model'], testset[1], True )
question_seen_noPP_skip_100_lxPT_cs = question_seen_in_model(noPP_skip_100['model'], testset[1], False )
question_seen_noPP_skip_100_LxBR_ci = question_seen_in_model(noPP_skip_100['model'], testset[0], True)
question_seen_noPP_skip_100_LxBR_cs = question_seen_in_model(noPP_skip_100['model'], testset[0], False)

506
4524
866
2467
462
930
702
13
600
1056
1599
1560
1332
870
506
4524
866
2467
462
930
702
13
600
1056
1599
1560
1332
870
506
4524
866
2467
462
930
756
30
600
1056
1599
1560
1332
870
506
4524
866
2467
462
930
756
30
600
1056
1599
1560
1332
870


In [None]:
ok_vocab = 

In [205]:
print(question_seen_noPP_skip_100_lxPT_ci['total'])
print(question_seen_noPP_skip_100_lxPT_cs['total'])
print(question_seen_noPP_skip_100_LxBR_ci['total'])
print(question_seen_noPP_skip_100_LxBR_cs['total'])

{'q_no': 17487, 'q_seen': 6125}
{'q_no': 17487, 'q_seen': 6125}
{'q_no': 17558, 'q_seen': 6227}
{'q_no': 17558, 'q_seen': 6227}


### Detalhes dos testset AnalogiesBr

In [119]:
sections = {}
with open(testset[0], 'r') as analogy_file:
    section = None
    qt_section = 0
    for line_no, line in enumerate(analogy_file):
        if(line_no <3):
            print(line: capital-common-countries

Atenas Grécia Bagdá Iraque

Atenas Grécia Bancoque Tailândia)
        if ': ' in line:
            if section:
                old_section = section
                sections[old_section] = qt_section
                qt_section = 0
            section = line
        else:
            qt_section += 1
    old_section = section
    sections[old_section] = qt_section
    qt_section = 0
print (sections)

: capital-common-countries

Atenas Grécia Bagdá Iraque

Atenas Grécia Bancoque Tailândia

{': capital-common-countries\n': 506, ': capital-world\n': 4524, ': currency\n': 866, ': city-in-state\n': 2467, ': family\n': 462, ': gram1-adjective-to-adverb\n': 930, ': gram2-opposite\n': 756, ': gram3-comparative\n': 30, ': gram4-superlative\n': 600, ': gram5-present-participle\n': 1056, ': gram6-nationality-adjective\n': 1599, ': gram7-past-tense\n': 1560, ': gram8-plural\n': 1332, ': gram9-plural-verbs\n': 870}


In [118]:
si, se = 0,0
for key in sections:
    if 'gram' in key:
        si += sections[key]
    else:
        se += sections[key]
print ('SI %d - SE %d' % (si,  se))

SI 8733 - SE 8825


### Detalhes do Testset AnalogiesPT

In [120]:
sections = {}
with open(testset[1], 'r') as analogy_file:
    section = None
    qt_section = 0
    for line_no, line in enumerate(analogy_file):
        if ': ' in line:
            if section:
                old_section = section
                sections[old_section] = qt_section
                qt_section = 0
            section = line
        else:
            qt_section += 1
    old_section = section
    sections[old_section] = qt_section
    qt_section = 0
print (testset[1])
print(sections)

si_pt, se_pt = 0,0
for key in sections:
    if 'gram' in key:
        si_pt += sections[key]
    else:
        se_pt += sections[key]
print ('SI %d - SE %d' % (si_pt,  se_pt))

../testsets/LX-4WAnalogies.txt
{': capital-common-countries\n': 506, ': capital-world\n': 4524, ': currency\n': 866, ': city-in-state\n': 2467, ': family\n': 462, ': gram1-adjective-to-adverb\n': 930, ': gram2-opposite\n': 702, ': gram3-comparative\n': 13, ': gram4-superlative\n': 600, ': gram5-present-participle\n': 1056, ': gram6-nationality-adjective\n': 1599, ': gram7-past-tense\n': 1560, ': gram8-plural\n': 1332, ': gram9-plural-verbs\n': 870}
SI 8662 - SE 8825


## Acuracia

### Testes para modelo ptWiki0920-noPP

../testsets/LX-4WAnalogiesBr.txt Total accuracy: 45.83 % Questions seen / total: 6227 17558 35.47 %

../testsets/LX-4WAnalogies.txt Total accuracy: 46.96 % Questions seen / total: 6125 17487 35.03 %

In [10]:
noPP_skip_100 = get_model_info('../models/ptWiki0920-noPP-skip-100.vec', testset, get_facebook_vecs_word_analogy_accuracy, True)

In [137]:
! python fastText/python/doc/examples/compute_accuracy.py ../models/ptWiki0920-noPP-skip-100.vec ../testsets/LX-4WAnalogiesBr.txt 300000


      capital-common-countries: ACCURACY TOP1: 92.86 %  (39 / 42)	  Total accuracy: 92.86 %   Semantic accuracy: 92.86 %   Syntactic accuracy: 0.00 %
                 capital-world: ACCURACY TOP1: 57.37 %  (389 / 678)	  Total accuracy: 59.44 %   Semantic accuracy: 59.44 %   Syntactic accuracy: 0.00 %
                      currency: ACCURACY TOP1: 0.00 %  (0 / 56)	  Total accuracy: 55.15 %   Semantic accuracy: 55.15 %   Syntactic accuracy: 0.00 %
                 city-in-state: ACCURACY TOP1: 15.37 %  (158 / 1028)	  Total accuracy: 32.48 %   Semantic accuracy: 32.48 %   Syntactic accuracy: 0.00 %
                        family: ACCURACY TOP1: 68.18 %  (90 / 132)	  Total accuracy: 34.92 %   Semantic accuracy: 34.92 %   Syntactic accuracy: 0.00 %
     gram1-adjective-to-adverb: ACCURACY TOP1: 30.74 %  (142 / 462)	  Total accuracy: 34.11 %   Semantic accuracy: 34.92 %   Syntactic accuracy: 30.74 %
                gram2-opposite: ACCURACY TOP1: 19.05 %  (40 / 210)	  Total accuracy: 32.90 %

In [138]:
! python fastText/python/doc/examples/compute_accuracy.py ../models/ptWiki0920-noPP-skip-100.vec ../testsets/LX-4WAnalogies.txt 300000


      capital-common-countries: ACCURACY TOP1: 76.79 %  (43 / 56)	  Total accuracy: 76.79 %   Semantic accuracy: 76.79 %   Syntactic accuracy: 0.00 %
                 capital-world: ACCURACY TOP1: 56.85 %  (386 / 679)	  Total accuracy: 58.37 %   Semantic accuracy: 58.37 %   Syntactic accuracy: 0.00 %
                      currency: ACCURACY TOP1: 0.00 %  (0 / 56)	  Total accuracy: 54.24 %   Semantic accuracy: 54.24 %   Syntactic accuracy: 0.00 %
                 city-in-state: ACCURACY TOP1: 10.23 %  (90 / 880)	  Total accuracy: 31.06 %   Semantic accuracy: 31.06 %   Syntactic accuracy: 0.00 %
                        family: ACCURACY TOP1: 63.46 %  (99 / 156)	  Total accuracy: 33.83 %   Semantic accuracy: 33.83 %   Syntactic accuracy: 0.00 %
     gram1-adjective-to-adverb: ACCURACY TOP1: 30.95 %  (143 / 462)	  Total accuracy: 33.25 %   Semantic accuracy: 33.83 %   Syntactic accuracy: 30.95 %
                gram2-opposite: ACCURACY TOP1: 19.52 %  (41 / 210)	  Total accuracy: 32.09 %  

### Testes para modelo '../models/ptWiki0920-noPP-skip-300'

In [4]:
noPP_skip_300 = get_model_info('../models/ptWiki0920-noPP-skip-300.bin', testset, get_facebook_vecs_word_analogy_accuracy, facebook_model=True)

In [5]:
! python fastText/python/doc/examples/compute_accuracy.py ../models/ptWiki0920-noPP-skip-300.bin ../testsets/LX-4WAnalogiesBr.txt 300000
! python fastText/python/doc/examples/compute_accuracy.py ../models/ptWiki0920-noPP-skip-300.bin ../testsets/LX-4WAnalogies.txt 300000


      capital-common-countries: ACCURACY TOP1: 100.00 %  (42 / 42)	  Total accuracy: 100.00 %   Semantic accuracy: 100.00 %   Syntactic accuracy: 0.00 %
                 capital-world: ACCURACY TOP1: 59.88 %  (406 / 678)	  Total accuracy: 62.22 %   Semantic accuracy: 62.22 %   Syntactic accuracy: 0.00 %
                      currency: ACCURACY TOP1: 0.00 %  (0 / 56)	  Total accuracy: 57.73 %   Semantic accuracy: 57.73 %   Syntactic accuracy: 0.00 %
                 city-in-state: ACCURACY TOP1: 25.58 %  (263 / 1028)	  Total accuracy: 39.41 %   Semantic accuracy: 39.41 %   Syntactic accuracy: 0.00 %
                        family: ACCURACY TOP1: 68.94 %  (91 / 132)	  Total accuracy: 41.43 %   Semantic accuracy: 41.43 %   Syntactic accuracy: 0.00 %
     gram1-adjective-to-adverb: ACCURACY TOP1: 37.23 %  (172 / 462)	  Total accuracy: 40.62 %   Semantic accuracy: 41.43 %   Syntactic accuracy: 37.23 %
                gram2-opposite: ACCURACY TOP1: 26.19 %  (55 / 210)	  Total accuracy: 39.4

### Testes para modelo ptWiki0920-norm-stop 

* Variação de Vocabulário 200000 X 300000

In [36]:
norm_stop_skip_100 = get_model_info('../models/ptWiki0920-norm-stop-skip-100.vec', testset, get_facebook_vecs_word_analogy_accuracy, facebook_model=True)

In [123]:
print(norm_stop_skip_100['model_ref'], norm_stop_skip_100['benchmark'][testset[0]]['log'])
print(norm_stop_skip_100['model_ref'], norm_stop_skip_100['benchmark'][testset[1]]['log'])

../models/ptWiki0920-norm-stop-skip-100.vec ../testsets/LX-4WAnalogiesBr.txt 0.4525  | case insensitive
../models/ptWiki0920-norm-stop-skip-100.vec ../testsets/LX-4WAnalogies.txt 0.4560  | case insensitive


../testsets/LX-4WAnalogiesBr.txt Total accuracy: 45.25 % Questions seen / total: 6113 17558   34.82 %

../testsets/LX-4WAnalogies.txt Total accuracy: 45.60 % Questions seen / total: 6005 17487   34.34 %

In [126]:
! python fastText/python/doc/examples/compute_accuracy.py ../models/ptWiki0920-norm-stop-skip-100.vec ../testsets/LX-4WAnalogiesBr.txt 300000


      capital-common-countries: ACCURACY TOP1: 90.48 %  (38 / 42)	  Total accuracy: 90.48 %   Semantic accuracy: 90.48 %   Syntactic accuracy: 0.00 %
                 capital-world: ACCURACY TOP1: 57.08 %  (387 / 678)	  Total accuracy: 59.03 %   Semantic accuracy: 59.03 %   Syntactic accuracy: 0.00 %
                      currency: ACCURACY TOP1: 1.79 %  (1 / 56)	  Total accuracy: 54.90 %   Semantic accuracy: 54.90 %   Syntactic accuracy: 0.00 %
                 city-in-state: ACCURACY TOP1: 19.26 %  (198 / 1028)	  Total accuracy: 34.59 %   Semantic accuracy: 34.59 %   Syntactic accuracy: 0.00 %
                        family: ACCURACY TOP1: 50.00 %  (45 / 90)	  Total accuracy: 35.32 %   Semantic accuracy: 35.32 %   Syntactic accuracy: 0.00 %
     gram1-adjective-to-adverb: ACCURACY TOP1: 28.35 %  (131 / 462)	  Total accuracy: 33.96 %   Semantic accuracy: 35.32 %   Syntactic accuracy: 28.35 %
                gram2-opposite: ACCURACY TOP1: 19.52 %  (41 / 210)	  Total accuracy: 32.77 % 

In [127]:
! python fastText/python/doc/examples/compute_accuracy.py ../models/ptWiki0920-norm-stop-skip-100.vec ../testsets/LX-4WAnalogies.txt 300000


      capital-common-countries: ACCURACY TOP1: 76.79 %  (43 / 56)	  Total accuracy: 76.79 %   Semantic accuracy: 76.79 %   Syntactic accuracy: 0.00 %
                 capital-world: ACCURACY TOP1: 56.11 %  (381 / 679)	  Total accuracy: 57.69 %   Semantic accuracy: 57.69 %   Syntactic accuracy: 0.00 %
                      currency: ACCURACY TOP1: 1.79 %  (1 / 56)	  Total accuracy: 53.73 %   Semantic accuracy: 53.73 %   Syntactic accuracy: 0.00 %
                 city-in-state: ACCURACY TOP1: 13.30 %  (117 / 880)	  Total accuracy: 32.44 %   Semantic accuracy: 32.44 %   Syntactic accuracy: 0.00 %
                        family: ACCURACY TOP1: 45.45 %  (50 / 110)	  Total accuracy: 33.24 %   Semantic accuracy: 33.24 %   Syntactic accuracy: 0.00 %
     gram1-adjective-to-adverb: ACCURACY TOP1: 29.00 %  (134 / 462)	  Total accuracy: 32.37 %   Semantic accuracy: 33.24 %   Syntactic accuracy: 29.00 %
                gram2-opposite: ACCURACY TOP1: 19.52 %  (41 / 210)	  Total accuracy: 31.27 % 

In [17]:
! python fastText/python/doc/examples/compute_accuracy.py ../models/ptWiki0920-norm-stop-skip-100.vec ../testsets/LX-4WAnalogiesBr.txt 200000


      capital-common-countries: ACCURACY TOP1: 90.48 %  (38 / 42)	  Total accuracy: 90.48 %   Semantic accuracy: 90.48 %   Syntactic accuracy: 0.00 %
                 capital-world: ACCURACY TOP1: 58.65 %  (383 / 653)	  Total accuracy: 60.58 %   Semantic accuracy: 60.58 %   Syntactic accuracy: 0.00 %
                      currency: ACCURACY TOP1: 1.79 %  (1 / 56)	  Total accuracy: 56.19 %   Semantic accuracy: 56.19 %   Syntactic accuracy: 0.00 %
                 city-in-state: ACCURACY TOP1: 19.84 %  (204 / 1028)	  Total accuracy: 35.19 %   Semantic accuracy: 35.19 %   Syntactic accuracy: 0.00 %
                        family: ACCURACY TOP1: 52.22 %  (47 / 90)	  Total accuracy: 36.01 %   Semantic accuracy: 36.01 %   Syntactic accuracy: 0.00 %
     gram1-adjective-to-adverb: ACCURACY TOP1: 29.44 %  (136 / 462)	  Total accuracy: 34.71 %   Semantic accuracy: 36.01 %   Syntactic accuracy: 29.44 %
                gram2-opposite: ACCURACY TOP1: 20.88 %  (38 / 182)	  Total accuracy: 33.70 % 

In [20]:
! python fastText/python/doc/examples/compute_accuracy.py ../models/ptWiki0920-norm-stop-skip-100.vec ../testsets/LX-4WAnalogies.txt 200000


      capital-common-countries: ACCURACY TOP1: 76.79 %  (43 / 56)	  Total accuracy: 76.79 %   Semantic accuracy: 76.79 %   Syntactic accuracy: 0.00 %
                 capital-world: ACCURACY TOP1: 57.73 %  (377 / 653)	  Total accuracy: 59.24 %   Semantic accuracy: 59.24 %   Syntactic accuracy: 0.00 %
                      currency: ACCURACY TOP1: 1.79 %  (1 / 56)	  Total accuracy: 55.03 %   Semantic accuracy: 55.03 %   Syntactic accuracy: 0.00 %
                 city-in-state: ACCURACY TOP1: 13.86 %  (122 / 880)	  Total accuracy: 33.01 %   Semantic accuracy: 33.01 %   Syntactic accuracy: 0.00 %
                        family: ACCURACY TOP1: 47.27 %  (52 / 110)	  Total accuracy: 33.90 %   Semantic accuracy: 33.90 %   Syntactic accuracy: 0.00 %
     gram1-adjective-to-adverb: ACCURACY TOP1: 30.09 %  (139 / 462)	  Total accuracy: 33.11 %   Semantic accuracy: 33.90 %   Syntactic accuracy: 30.09 %
                gram2-opposite: ACCURACY TOP1: 20.88 %  (38 / 182)	  Total accuracy: 32.18 % 

DIM 300

In [33]:
! python fastText/python/doc/examples/compute_accuracy.py ../models/ptWiki0920-norm-stop-skip-300.bin ../testsets/LX-4WAnalogiesBr.txt 300000


      capital-common-countries: ACCURACY TOP1: 97.62 %  (41 / 42)	  Total accuracy: 97.62 %   Semantic accuracy: 97.62 %   Syntactic accuracy: 0.00 %
                 capital-world: ACCURACY TOP1: 57.96 %  (393 / 678)	  Total accuracy: 60.28 %   Semantic accuracy: 60.28 %   Syntactic accuracy: 0.00 %
                      currency: ACCURACY TOP1: 0.00 %  (0 / 56)	  Total accuracy: 55.93 %   Semantic accuracy: 55.93 %   Syntactic accuracy: 0.00 %
                 city-in-state: ACCURACY TOP1: 26.75 %  (275 / 1028)	  Total accuracy: 39.30 %   Semantic accuracy: 39.30 %   Syntactic accuracy: 0.00 %
                        family: ACCURACY TOP1: 60.00 %  (54 / 90)	  Total accuracy: 40.29 %   Semantic accuracy: 40.29 %   Syntactic accuracy: 0.00 %
     gram1-adjective-to-adverb: ACCURACY TOP1: 37.66 %  (174 / 462)	  Total accuracy: 39.77 %   Semantic accuracy: 40.29 %   Syntactic accuracy: 37.66 %
                gram2-opposite: ACCURACY TOP1: 22.86 %  (48 / 210)	  Total accuracy: 38.39 % 

In [32]:
! python fastText/python/doc/examples/compute_accuracy.py ../models/ptWiki0920-norm-stop-skip-300.bin ../testsets/LX-4WAnalogies.txt 300000


      capital-common-countries: ACCURACY TOP1: 85.71 %  (48 / 56)	  Total accuracy: 85.71 %   Semantic accuracy: 85.71 %   Syntactic accuracy: 0.00 %
                 capital-world: ACCURACY TOP1: 56.85 %  (386 / 679)	  Total accuracy: 59.05 %   Semantic accuracy: 59.05 %   Syntactic accuracy: 0.00 %
                      currency: ACCURACY TOP1: 0.00 %  (0 / 56)	  Total accuracy: 54.87 %   Semantic accuracy: 54.87 %   Syntactic accuracy: 0.00 %
                 city-in-state: ACCURACY TOP1: 22.05 %  (194 / 880)	  Total accuracy: 37.58 %   Semantic accuracy: 37.58 %   Syntactic accuracy: 0.00 %
                        family: ACCURACY TOP1: 57.27 %  (63 / 110)	  Total accuracy: 38.80 %   Semantic accuracy: 38.80 %   Syntactic accuracy: 0.00 %
     gram1-adjective-to-adverb: ACCURACY TOP1: 38.53 %  (178 / 462)	  Total accuracy: 38.74 %   Semantic accuracy: 38.80 %   Syntactic accuracy: 38.53 %
                gram2-opposite: ACCURACY TOP1: 23.81 %  (50 / 210)	  Total accuracy: 37.46 % 

In [19]:
!ls ../models/

ext				 ptWiki0920-norm-stop-cbow-100.bin
mock.txt			 ptWiki0920-norm-stop-cbow-300.bin
oldModels			 ptWiki0920-norm-stop-skip-100.vec
ptWiki0920300-noPP-cbow-300.bin  ptWiki0920-norm-stop-skip-300.bin
ptWiki0920-noPP-cbow-100.bin	 ptWiki0920-stop-skip-100.bin
ptWiki0920-noPP-skip-100.vec	 ptWiki0920-stop-skip-300.bin
ptWiki0920-noPP-skip-300.bin


#### Testes para modelo ptWiki0920-stop

../testsets/LX-4WAnalogiesBr.txt Total accuracy: 45.05 % Questions seen / total: 6113 17558   34.82 %

../testsets/LX-4WAnalogies.txt Total accuracy:  45.25 % Questions seen / total: 6005 17487   34.34 %

In [130]:
! python fastText/python/doc/examples/compute_accuracy.py ../models/ptWiki0920-stop-skip-100.bin ../testsets/LX-4WAnalogiesBr.txt 300000


      capital-common-countries: ACCURACY TOP1: 92.86 %  (39 / 42)	  Total accuracy: 92.86 %   Semantic accuracy: 92.86 %   Syntactic accuracy: 0.00 %
                 capital-world: ACCURACY TOP1: 57.67 %  (391 / 678)	  Total accuracy: 59.72 %   Semantic accuracy: 59.72 %   Syntactic accuracy: 0.00 %
                      currency: ACCURACY TOP1: 1.79 %  (1 / 56)	  Total accuracy: 55.54 %   Semantic accuracy: 55.54 %   Syntactic accuracy: 0.00 %
                 city-in-state: ACCURACY TOP1: 18.09 %  (186 / 1028)	  Total accuracy: 34.20 %   Semantic accuracy: 34.20 %   Syntactic accuracy: 0.00 %
                        family: ACCURACY TOP1: 48.89 %  (44 / 90)	  Total accuracy: 34.90 %   Semantic accuracy: 34.90 %   Syntactic accuracy: 0.00 %
     gram1-adjective-to-adverb: ACCURACY TOP1: 30.74 %  (142 / 462)	  Total accuracy: 34.08 %   Semantic accuracy: 34.90 %   Syntactic accuracy: 30.74 %
                gram2-opposite: ACCURACY TOP1: 20.00 %  (42 / 210)	  Total accuracy: 32.93 % 

In [136]:
! python fastText/python/doc/examples/compute_accuracy.py ../models/ptWiki0920-stop-skip-100.bin ../testsets/LX-4WAnalogies.txt 300000


      capital-common-countries: ACCURACY TOP1: 78.57 %  (44 / 56)	  Total accuracy: 78.57 %   Semantic accuracy: 78.57 %   Syntactic accuracy: 0.00 %
                 capital-world: ACCURACY TOP1: 55.23 %  (375 / 679)	  Total accuracy: 57.01 %   Semantic accuracy: 57.01 %   Syntactic accuracy: 0.00 %
                      currency: ACCURACY TOP1: 1.79 %  (1 / 56)	  Total accuracy: 53.10 %   Semantic accuracy: 53.10 %   Syntactic accuracy: 0.00 %
                 city-in-state: ACCURACY TOP1: 13.41 %  (118 / 880)	  Total accuracy: 32.20 %   Semantic accuracy: 32.20 %   Syntactic accuracy: 0.00 %
                        family: ACCURACY TOP1: 47.27 %  (52 / 110)	  Total accuracy: 33.13 %   Semantic accuracy: 33.13 %   Syntactic accuracy: 0.00 %
     gram1-adjective-to-adverb: ACCURACY TOP1: 30.95 %  (143 / 462)	  Total accuracy: 32.68 %   Semantic accuracy: 33.13 %   Syntactic accuracy: 30.95 %
                gram2-opposite: ACCURACY TOP1: 19.52 %  (41 / 210)	  Total accuracy: 31.55 % 

#### Testes para modelo ptWiki0920-stop300

../testsets/LX-4WAnalogiesBr.txt Total accuracy: 54.87 % Questions seen / total: 6113 17558   34.82 %

../testsets/LX-4WAnalogies.txt Total accuracy: 55.22 % Questions seen / total: 6005 17487   34.34 %

In [134]:
! python fastText/python/doc/examples/compute_accuracy.py ../models/ptWiki0920-stop-skip-300.bin ../testsets/LX-4WAnalogiesBr.txt 300000


      capital-common-countries: ACCURACY TOP1: 97.62 %  (41 / 42)	  Total accuracy: 97.62 %   Semantic accuracy: 97.62 %   Syntactic accuracy: 0.00 %
                 capital-world: ACCURACY TOP1: 59.73 %  (405 / 678)	  Total accuracy: 61.94 %   Semantic accuracy: 61.94 %   Syntactic accuracy: 0.00 %
                      currency: ACCURACY TOP1: 0.00 %  (0 / 56)	  Total accuracy: 57.47 %   Semantic accuracy: 57.47 %   Syntactic accuracy: 0.00 %
                 city-in-state: ACCURACY TOP1: 27.33 %  (281 / 1028)	  Total accuracy: 40.30 %   Semantic accuracy: 40.30 %   Syntactic accuracy: 0.00 %
                        family: ACCURACY TOP1: 61.11 %  (55 / 90)	  Total accuracy: 41.29 %   Semantic accuracy: 41.29 %   Syntactic accuracy: 0.00 %
     gram1-adjective-to-adverb: ACCURACY TOP1: 38.53 %  (178 / 462)	  Total accuracy: 40.75 %   Semantic accuracy: 41.29 %   Syntactic accuracy: 38.53 %
                gram2-opposite: ACCURACY TOP1: 23.33 %  (49 / 210)	  Total accuracy: 39.32 % 

In [135]:
! python fastText/python/doc/examples/compute_accuracy.py ../models/ptWiki0920-stop-skip-300.bin ../testsets/LX-4WAnalogies.txt 300000


      capital-common-countries: ACCURACY TOP1: 85.71 %  (48 / 56)	  Total accuracy: 85.71 %   Semantic accuracy: 85.71 %   Syntactic accuracy: 0.00 %
                 capital-world: ACCURACY TOP1: 56.41 %  (383 / 679)	  Total accuracy: 58.64 %   Semantic accuracy: 58.64 %   Syntactic accuracy: 0.00 %
                      currency: ACCURACY TOP1: 0.00 %  (0 / 56)	  Total accuracy: 54.49 %   Semantic accuracy: 54.49 %   Syntactic accuracy: 0.00 %
                 city-in-state: ACCURACY TOP1: 21.82 %  (192 / 880)	  Total accuracy: 37.28 %   Semantic accuracy: 37.28 %   Syntactic accuracy: 0.00 %
                        family: ACCURACY TOP1: 58.18 %  (64 / 110)	  Total accuracy: 38.57 %   Semantic accuracy: 38.57 %   Syntactic accuracy: 0.00 %
     gram1-adjective-to-adverb: ACCURACY TOP1: 38.96 %  (180 / 462)	  Total accuracy: 38.65 %   Semantic accuracy: 38.57 %   Syntactic accuracy: 38.96 %
                gram2-opposite: ACCURACY TOP1: 25.71 %  (54 / 210)	  Total accuracy: 37.55 % 

## Treinamento de Modelo com CBOW

In [29]:
print('Referencia de corpus')
!ls

Referencia de corpus
 fastText	        ptwiki-20190920-pages-articles-multistream.xml
 ptWiki0920	        ptwiki-20190920-pages-articles-multistream.xml.bz2
 ptWiki0920-norm-stop  'Testes de Analogias para bases-Copy1.ipynb'
 ptWiki0920-stop       'Testes de Analogias para bases.ipynb'


In [9]:
testset = glob('../testsets/LX-4WAnalogies*')

### Modelo treinado sem PP, Cbow, dimensao 100

In [30]:
trainUnsupervisedTxt('ptWiki0920', '../models/ptWiki0920-noPP-cbow-100.bin', model="cbow", dimension=100)

In [118]:
noPP_cbow_100 = get_model_info('../models/ptWiki0920-noPP-cbow-100.bin', testset, get_facebook_vecs_word_analogy_accuracy, facebook_model=True)

In [119]:
print(noPP_cbow_100['model_ref'], noPP_cbow_100['benchmark'][testset[0]]['log'])
print(noPP_cbow_100['model_ref'], noPP_cbow_100['benchmark'][testset[1]]['log'] )

../models/ptWiki0920-noPP-cbow-100.bin ../testsets/LX-4WAnalogiesBr.txt 0.4659  | case insensitive
../models/ptWiki0920-noPP-cbow-100.bin ../testsets/LX-4WAnalogies.txt 0.4963  | case insensitive


In [139]:
! python fastText/python/doc/examples/compute_accuracy.py ../models/ptWiki0920-noPP-cbow-100.bin ../testsets/LX-4WAnalogiesBr.txt 300000
! python fastText/python/doc/examples/compute_accuracy.py ../models/ptWiki0920-noPP-cbow-100.bin ../testsets/LX-4WAnalogies.txt 300000


      capital-common-countries: ACCURACY TOP1: 78.57 %  (33 / 42)	  Total accuracy: 78.57 %   Semantic accuracy: 78.57 %   Syntactic accuracy: 0.00 %
                 capital-world: ACCURACY TOP1: 21.24 %  (144 / 678)	  Total accuracy: 24.58 %   Semantic accuracy: 24.58 %   Syntactic accuracy: 0.00 %
                      currency: ACCURACY TOP1: 0.00 %  (0 / 56)	  Total accuracy: 22.81 %   Semantic accuracy: 22.81 %   Syntactic accuracy: 0.00 %
                 city-in-state: ACCURACY TOP1: 9.82 %  (101 / 1028)	  Total accuracy: 15.41 %   Semantic accuracy: 15.41 %   Syntactic accuracy: 0.00 %
                        family: ACCURACY TOP1: 68.94 %  (91 / 132)	  Total accuracy: 19.06 %   Semantic accuracy: 19.06 %   Syntactic accuracy: 0.00 %
     gram1-adjective-to-adverb: ACCURACY TOP1: 51.30 %  (237 / 462)	  Total accuracy: 25.27 %   Semantic accuracy: 19.06 %   Syntactic accuracy: 51.30 %
                gram2-opposite: ACCURACY TOP1: 33.33 %  (70 / 210)	  Total accuracy: 25.92 % 

#### Resumo
../models/ptWiki0920-noPP-cbow-100.bin ../testsets/LX-4WAnalogiesBr.txt 0.4659  Semantic accuracy: 19.06 %   Syntactic accuracy: 59.01 %
../models/ptWiki0920-noPP-cbow-100.bin ../testsets/LX-4WAnalogies.txt 0.4963  Semantic accuracy: 19.43 %   Syntactic accuracy: 62.47 %

### Modelo treinado Remoção de Stopwords e normalização, Cbow, dimensao 100

In [33]:
trainUnsupervisedTxt('ptWiki0920-norm-stop', '../models/ptWiki0920-norm-stop-cbow-100.bin', model="cbow", dimension=100)

In [115]:
cbow_norm_stop_cbow_100 = get_model_info('../models/ptWiki0920-norm-stop-cbow-100.bin', testset, get_facebook_vecs_word_analogy_accuracy, facebook_model=True)

In [116]:
print(cbow_norm_stop_cbow_100['model_ref'], cbow_norm_stop_cbow_100['benchmark'][testset[0]]['log'] )
print(cbow_norm_stop_cbow_100['model_ref'], cbow_norm_stop_cbow_100['benchmark'][testset[1]]['log'])

../models/ptWiki0920-norm-stop-cbow-100.bin ../testsets/LX-4WAnalogiesBr.txt 0.5061  | case insensitive
../models/ptWiki0920-norm-stop-cbow-100.bin ../testsets/LX-4WAnalogies.txt 0.5347  | case insensitive


In [141]:
! python fastText/python/doc/examples/compute_accuracy.py ../models/ptWiki0920-norm-stop-cbow-100.bin ../testsets/LX-4WAnalogiesBr.txt 300000
! python fastText/python/doc/examples/compute_accuracy.py ../models/ptWiki0920-norm-stop-cbow-100.bin ../testsets/LX-4WAnalogies.txt 300000


      capital-common-countries: ACCURACY TOP1: 83.33 %  (35 / 42)	  Total accuracy: 83.33 %   Semantic accuracy: 83.33 %   Syntactic accuracy: 0.00 %
                 capital-world: ACCURACY TOP1: 26.40 %  (179 / 678)	  Total accuracy: 29.72 %   Semantic accuracy: 29.72 %   Syntactic accuracy: 0.00 %
                      currency: ACCURACY TOP1: 0.00 %  (0 / 56)	  Total accuracy: 27.58 %   Semantic accuracy: 27.58 %   Syntactic accuracy: 0.00 %
                 city-in-state: ACCURACY TOP1: 11.96 %  (123 / 1028)	  Total accuracy: 18.68 %   Semantic accuracy: 18.68 %   Syntactic accuracy: 0.00 %
                        family: ACCURACY TOP1: 56.67 %  (51 / 90)	  Total accuracy: 20.49 %   Semantic accuracy: 20.49 %   Syntactic accuracy: 0.00 %
     gram1-adjective-to-adverb: ACCURACY TOP1: 54.11 %  (250 / 462)	  Total accuracy: 27.08 %   Semantic accuracy: 20.49 %   Syntactic accuracy: 54.11 %
                gram2-opposite: ACCURACY TOP1: 32.38 %  (68 / 210)	  Total accuracy: 27.51 % 

#### Resumo 
../models/ptWiki0920-norm-stop-cbow-100.bin ../testsets/LX-4WAnalogiesBr.txt 0.5061 Semantic accuracy: 20.49 %   Syntactic accuracy: 64.14 %

../models/ptWiki0920-norm-stop-cbow-100.bin ../testsets/LX-4WAnalogies.txt 0.5347 Semantic accuracy: 19.93 %   Syntactic accuracy: 67.61 %

### Modelo treinado sem pre-processamento, CBOW, Dimensao 300

In [9]:
trainUnsupervisedTxt('ptWiki0920', '../models/ptWiki0920300-noPP-cbow-300'+'.bin', model="cbow", dimension=300)

In [112]:
noPP_cbow_300 = get_model_info('../models/ptWiki0920300-noPP-cbow-300.bin', testset, get_facebook_vecs_word_analogy_accuracy, facebook_model=True)

In [113]:
print(noPP_cbow_300['model_ref'],' @ ', noPP_cbow_300['benchmark'][testset[0]]['log'])
print(noPP_cbow_300['model_ref'],' @ ', noPP_cbow_300['benchmark'][testset[1]]['log'])

../models/ptWiki0920300-noPP-cbow-300.bin  @  ../testsets/LX-4WAnalogiesBr.txt 0.5202  | case insensitive
../models/ptWiki0920300-noPP-cbow-300.bin  @  ../testsets/LX-4WAnalogies.txt 0.5549  | case insensitive


In [144]:
! python fastText/python/doc/examples/compute_accuracy.py ../models/ptWiki0920300-noPP-cbow-300.bin ../testsets/LX-4WAnalogiesBr.txt 300000
! python fastText/python/doc/examples/compute_accuracy.py ../models/ptWiki0920300-noPP-cbow-300.bin ../testsets/LX-4WAnalogies.txt 300000


      capital-common-countries: ACCURACY TOP1: 76.19 %  (32 / 42)	  Total accuracy: 76.19 %   Semantic accuracy: 76.19 %   Syntactic accuracy: 0.00 %
                 capital-world: ACCURACY TOP1: 20.50 %  (139 / 678)	  Total accuracy: 23.75 %   Semantic accuracy: 23.75 %   Syntactic accuracy: 0.00 %
                      currency: ACCURACY TOP1: 0.00 %  (0 / 56)	  Total accuracy: 22.04 %   Semantic accuracy: 22.04 %   Syntactic accuracy: 0.00 %
                 city-in-state: ACCURACY TOP1: 5.93 %  (61 / 1028)	  Total accuracy: 12.86 %   Semantic accuracy: 12.86 %   Syntactic accuracy: 0.00 %
                        family: ACCURACY TOP1: 66.67 %  (88 / 132)	  Total accuracy: 16.53 %   Semantic accuracy: 16.53 %   Syntactic accuracy: 0.00 %
     gram1-adjective-to-adverb: ACCURACY TOP1: 64.07 %  (296 / 462)	  Total accuracy: 25.69 %   Semantic accuracy: 16.53 %   Syntactic accuracy: 64.07 %
                gram2-opposite: ACCURACY TOP1: 41.43 %  (87 / 210)	  Total accuracy: 26.96 %  

#### Resumo
../models/ptWiki0920300-noPP-cbow-300.bin  @  ../testsets/LX-4WAnalogiesBr.txt 0.5202 Semantic accuracy: 16.53 %   Syntactic accuracy: 68.03 %

../models/ptWiki0920300-noPP-cbow-300.bin  @  ../testsets/LX-4WAnalogies.txt 0.5549  Semantic accuracy: 17.35 %   Syntactic accuracy: 71.71 %


### Modelo treinado com remoção de stopwords e normalização, CBOW, dimensão 300**

In [10]:
trainUnsupervisedTxt('ptWiki0920-norm-stop', '../models/ptWiki0920-norm-stop-300'+'.bin', model="cbow", dimension=300)

In [107]:
norm_stop_300 = get_model_info('../models/ptWiki0920-norm-stop-cbow-300.bin', testset, get_facebook_vecs_word_analogy_accuracy, facebook_model=True)

In [108]:
print(norm_stop_300['model_ref'], norm_stop_300['benchmark'][testset[0]]['log'])
print(norm_stop_300['model_ref'], norm_stop_300['benchmark'][testset[1]]['log'])

../models/ptWiki0920-norm-stop-cbow-300.bin ../testsets/LX-4WAnalogiesBr.txt 0.5405  | case insensitive
../models/ptWiki0920-norm-stop-cbow-300.bin ../testsets/LX-4WAnalogies.txt 0.5749  | case insensitive


In [143]:
! python fastText/python/doc/examples/compute_accuracy.py ../models/ptWiki0920-norm-stop-cbow-300.bin ../testsets/LX-4WAnalogiesBr.txt 300000
! python fastText/python/doc/examples/compute_accuracy.py ../models/ptWiki0920-norm-stop-cbow-300.bin ../testsets/LX-4WAnalogies.txt 300000


      capital-common-countries: ACCURACY TOP1: 80.95 %  (34 / 42)	  Total accuracy: 80.95 %   Semantic accuracy: 80.95 %   Syntactic accuracy: 0.00 %
                 capital-world: ACCURACY TOP1: 21.53 %  (146 / 678)	  Total accuracy: 25.00 %   Semantic accuracy: 25.00 %   Syntactic accuracy: 0.00 %
                      currency: ACCURACY TOP1: 0.00 %  (0 / 56)	  Total accuracy: 23.20 %   Semantic accuracy: 23.20 %   Syntactic accuracy: 0.00 %
                 city-in-state: ACCURACY TOP1: 7.49 %  (77 / 1028)	  Total accuracy: 14.25 %   Semantic accuracy: 14.25 %   Syntactic accuracy: 0.00 %
                        family: ACCURACY TOP1: 58.89 %  (53 / 90)	  Total accuracy: 16.37 %   Semantic accuracy: 16.37 %   Syntactic accuracy: 0.00 %
     gram1-adjective-to-adverb: ACCURACY TOP1: 59.96 %  (277 / 462)	  Total accuracy: 24.92 %   Semantic accuracy: 16.37 %   Syntactic accuracy: 59.96 %
                gram2-opposite: ACCURACY TOP1: 37.14 %  (78 / 210)	  Total accuracy: 25.92 %   

#### Resumo
../models/ptWiki0920-norm-stop-cbow-300.bin ../testsets/LX-4WAnalogiesBr.txt 0.5405 Semantic accuracy: 16.37 %   Syntactic accuracy: 70.96 %

../models/ptWiki0920-norm-stop-cbow-300.bin ../testsets/LX-4WAnalogies.txt 0.5749 Semantic accuracy: 17.01 %   Syntactic accuracy: 74.55 %


### Avaliação conjunta com Pool Multiprocessing

In [None]:
pt_wiki_model_list = glob('../models/ptWiki*')
testset = ['../testsets/LX-4WAnalogiesBr.txt', '../testsets/LX-4WAnalogies.txt']
ptWikiPool = multiprocessing.Pool()

PtWikiPool_res = ptWikiPool.starmap(eval_default_param, build_model_accuracy_tuple(pt_wiki_model_list, testset, get_facebook_vecs_word_analogy_accuracy, True
                                                                                  ))

### Análise de acurácia para perguntas permutadas

* Observamos abaixo perguntas retiradas do testset, na categoria de país-capital. O objetivo é avaliar as permutações da perguntas para prever qualquer uma das palavras na pergunta, e não somente a ultima.

In [17]:
model = load_facebook_vectors('../models/ptWiki0920-norm-stop-cbow-300.bin')

In [19]:
model.most_similar(['PEQUIM','BERLIM'],['ALEMANHA'])

[('fonologia', 0.319120317697525),
 ('gerativismo', 0.307516485452652),
 ('brasileirismo', 0.3042718470096588),
 ('informativas', 0.2936704158782959),
 ('brasileiropop', 0.2915341854095459),
 ('simporte', 0.29116183519363403),
 ('sbpbrasil', 0.29109519720077515),
 ('folclorismo', 0.28938236832618713),
 ('xadrezismo', 0.288007915019989),
 ('brasileirocopa', 0.2872498035430908)]

In [21]:
model.most_similar(['pequim','berlim'],['alemanha'])

[('xangai', 0.5828249454498291),
 ('bequim', 0.5500543117523193),
 ('nanquim', 0.5461136102676392),
 ('zhangzhou', 0.5403828620910645),
 ('moscou', 0.535670280456543),
 ('sequim', 0.5303146839141846),
 ('pequiz', 0.5237914323806763),
 ('perlim', 0.5233457684516907),
 ('moscovo', 0.5150915384292603),
 ('equim', 0.5134990811347961)]

In [22]:
model.most_similar(['pequim','alemanha'],['berlim'])

[('china', 0.7143341302871704),
 ('chinantec', 0.6527328491210938),
 ('chinasa', 0.647666335105896),
 ('chinanteca', 0.6465834379196167),
 ('chinantecas', 0.6324456930160522),
 ('chinanteco', 0.6302004456520081),
 ('pequis', 0.6272468566894531),
 ('pequisa', 0.6269203424453735),
 ('chinampa', 0.6263154745101929),
 ('pequi', 0.6219850778579712)]

In [25]:
model.most_similar(['pequim','egito'],['cairo'])

[('china', 0.6310254335403442),
 ('chineses', 0.5763074159622192),
 ('obshchina', 0.5600889921188354),
 ('pequis', 0.5575646758079529),
 ('chinas', 0.5570805072784424),
 ('chinesas', 0.5569696426391602),
 ('equim', 0.5536428689956665),
 ('rpchina', 0.548112154006958),
 ('dchina', 0.5416480302810669),
 ('qinu', 0.5392635464668274)]

In [26]:
model.most_similar(['cairo','china'],['pequim'])

[('egito', 0.6022971868515015),
 ('cairota', 0.5998178124427795),
 ('cairom', 0.5903569459915161),
 ('cairos', 0.5544114112854004),
 ('chinameca', 0.53751540184021),
 ('hadramaute', 0.5334338545799255),
 ('cairoli', 0.5247357487678528),
 ('chade', 0.5201066136360168),
 ('cairate', 0.5193891525268555),
 ('cairotas', 0.5183498859405518)]

In [31]:
model.most_similar(['china','cairo'],['egito'])

[('changzhou', 0.7101659178733826),
 ('chengzhou', 0.6909531354904175),
 ('chengdong', 0.6797289848327637),
 ('changjiang', 0.6790489554405212),
 ('chenzhou', 0.6721263527870178),
 ('nanjing', 0.6717712879180908),
 ('hangzhou', 0.6648585200309753),
 ('suzhou', 0.6613156199455261),
 ('shengzhou', 0.6567591428756714),
 ('chongqing', 0.655300498008728)]

In [32]:
model.most_similar(['egito','pequim'],['china'])

[('cairo', 0.6343752145767212),
 ('egipto', 0.6325528621673584),
 ('atenas', 0.5384305119514465),
 ('egipcio', 0.5347484946250916),
 ('egitaniense', 0.5332800149917603),
 ('nfis', 0.5308082103729248),
 ('fara', 0.5289286375045776),
 ('vegito', 0.5287285447120667),
 ('egipcianos', 0.5267181396484375),
 ('egira', 0.5257495045661926)]

In [23]:
print(results[0]['correct'][:5])

[('PEQUIM', 'CHINA', 'BERLIM', 'ALEMANHA'), ('PEQUIM', 'CHINA', 'CAIRO', 'EGITO'), ('PEQUIM', 'CHINA', 'HAVANA', 'CUBA'), ('PEQUIM', 'CHINA', 'LONDRES', 'INGLATERRA'), ('BERLIM', 'ALEMANHA', 'HAVANA', 'CUBA')]


In [27]:
model.most_similar(['pequim', 'espanha'],['madrid'])

[('china', 0.6547271013259888),
 ('bequim', 0.617762565612793),
 ('equim', 0.6094659566879272),
 ('pequi', 0.6092944145202637),
 ('chinanteca', 0.6052078008651733),
 ('rpchina', 0.6035335063934326),
 ('chinameca', 0.6020873188972473),
 ('pequis', 0.5938908457756042),
 ('pequito', 0.5929934978485107),
 ('chineses', 0.5906508564949036)]

In [28]:
model.most_similar(['madrid','china'],['pequim'])

[('lamadrid', 0.6548174619674683),
 ('madridreal', 0.6370139718055725),
 ('vaciamadrid', 0.611453652381897),
 ('madridborder', 0.601535439491272),
 ('realmadrid', 0.5999178290367126),
 ('madri', 0.5934512615203857),
 ('atlmadrid', 0.5856846570968628),
 ('telemadrid', 0.5812236070632935),
 ('espanha', 0.5767569541931152),
 ('valladolid', 0.5739721059799194)]

In [29]:
model.most_similar(['madrid','china'],['espanha'])

[('shangqing', 0.6008337736129761),
 ('chines', 0.5941194295883179),
 ('chinawut', 0.5900689363479614),
 ('nanjing', 0.5899882316589355),
 ('changjiang', 0.5898436307907104),
 ('chinese', 0.589677095413208),
 ('chengzhou', 0.5894589424133301),
 ('chengjiang', 0.5893243551254272),
 ('changzhou', 0.5885428786277771),
 ('chongqing', 0.5862443447113037)]

In [34]:
model.most_similar(['pequim','noruega'],['oslo'])

[('china', 0.7162374258041382),
 ('chinanteca', 0.6867305040359497),
 ('pequis', 0.6672309041023254),
 ('chinantec', 0.655956506729126),
 ('chinantecas', 0.6506609916687012),
 ('pequito', 0.6477333307266235),
 ('chinampa', 0.6447343826293945),
 ('pequiz', 0.644403874874115),
 ('rpchina', 0.6389163732528687),
 ('chinasa', 0.6379231214523315)]

In [35]:
model.most_similar(['pequim','noruega'],['china'])

[('oslo', 0.7026902437210083),
 ('noruego', 0.6771032810211182),
 ('norueg', 0.6520507335662842),
 ('norueguesa', 0.6439430713653564),
 ('noruegues', 0.6313618421554565),
 ('noruegu', 0.6305091977119446),
 ('estocolmo', 0.6284868717193604),
 ('noruegueses', 0.608055591583252),
 ('copenhague', 0.6065146327018738),
 ('copenhaga', 0.603462815284729)]

In [36]:
model.most_similar(['oslo','china'],['noruega'])

[('pequim', 0.6368434429168701),
 ('xangai', 0.6351529955863953),
 ('chinai', 0.6303622722625732),
 ('zhongyang', 0.6198270320892334),
 ('zhongjing', 0.6137735843658447),
 ('qianjiang', 0.6114833950996399),
 ('nanjing', 0.611340343952179),
 ('chinawut', 0.609315812587738),
 ('changjiang', 0.6040310859680176),
 ('chengjiang', 0.6036378741264343)]

In [38]:
model.most_similar(['oslo','china'],['pequim'])

[('oslofjord', 0.6721451282501221),
 ('noruega', 0.6696286201477051),
 ('norueg', 0.6479545831680298),
 ('noruego', 0.596128523349762),
 ('norwegen', 0.5603457689285278),
 ('trondheimsfjord', 0.5560199022293091),
 ('norwegian', 0.5538036227226257),
 ('nordvik', 0.5483222603797913),
 ('noruegu', 0.5473634600639343),
 ('noruegues', 0.5457675457000732)]

In [24]:
print(results[0]['incorrect'][:5])

[('PEQUIM', 'CHINA', 'MADRID', 'ESPANHA'), ('PEQUIM', 'CHINA', 'OSLO', 'NORUEGA'), ('BERLIM', 'ALEMANHA', 'CAIRO', 'EGITO'), ('HAVANA', 'CUBA', 'MADRID', 'ESPANHA'), ('HAVANA', 'CUBA', 'OSLO', 'NORUEGA')]


In [148]:
!rm ../models/ptWiki0920-noPP-skip-300.bin

* A partir dos resultados, observamos que não necessariamente a corretude de uma formação da pergunta garante o mesmo resultado para suas permutações.

* Embeddings multi-idiomas
* Treino com AutoTune de parâmetros  

### Comparação OwnModel Br-Eng

In [9]:
ownTestset = glob('../testsets/own*')

In [12]:
models_for_own_testset = ['../models/ptWiki0920300-noPP-cbow-300.bin','../models/ext/fasttext/cc.pt.300.vec','../models/ext/nilc/skip_s300.txt']
keyed_vec_models_for_own_testset = ['../models/ext/nilc/skip_s300.txt']

In [16]:
print('testsets: ', ownTestset)
print('models to test: ', models_for_own_testset)

testsets:  ['../testsets/ownEng.txt', '../testsets/ownBR.txt']
models to test:  ['../models/ptWiki0920300-noPP-cbow-300.bin', '../models/ext/fasttext/cc.pt.300.vec', '../models/ext/nilc/skip_s300.txt']


In [19]:
pool = multiprocessing.Pool()
own_test_br = '../testsets/ownBR.txt'
pool_res = pool.starmap(eval_all_params, [('../models/ptWiki0920300-noPP-cbow-300.bin',own_test_br, get_facebook_vecs_word_analogy_accuracy, True),
                                         ('../models/ext/fasttext/cc.pt.300.vec',own_test_br, get_facebook_vecs_word_analogy_accuracy, False),
                                         ('../models/ext/nilc/skip_s300.txt',own_test_br, get_facebook_vecs_word_analogy_accuracy, False)])


../models/ptWiki0920300-noPP-cbow-300.bin @ ../testsets/ownBR.txt0.0 | setting 0 to missing words | case insensitive
../models/ptWiki0920300-noPP-cbow-300.bin @ ../testsets/ownBR.txt0.0 | case insensitive
../models/ptWiki0920300-noPP-cbow-300.bin @ ../testsets/ownBR.txt0.0 | setting 0 to missing words | case sensitive
../models/ptWiki0920300-noPP-cbow-300.bin @ ../testsets/ownBR.txtNone | case sensitive
../models/ext/nilc/skip_s300.txt @ ../testsets/ownBR.txt0.07407407407407407 | setting 0 to missing words | case insensitive
../models/ext/nilc/skip_s300.txt @ ../testsets/ownBR.txt0.125 | case insensitive
../models/ext/nilc/skip_s300.txt @ ../testsets/ownBR.txt0.0 | setting 0 to missing words | case sensitive
../models/ext/nilc/skip_s300.txt @ ../testsets/ownBR.txtNone | case sensitive
../models/ext/fasttext/cc.pt.300.vec @ ../testsets/ownBR.txt0.0 | setting 0 to missing words | case insensitive
../models/ext/fasttext/cc.pt.300.vec @ ../testsets/ownBR.txt0.0 | case insensitive
../models

In [22]:
pool = multiprocessing.Pool()
own_test_eng = '../testsets/ownEng.txt'
pool_res = pool.starmap(eval_all_params, [('../models/ext/fasttext//wiki-news-300d-1M-subword.vec',own_test_eng, get_facebook_vecs_word_analogy_accuracy, False),
                                         ('../models/ext/fasttext/wiki-news-300d-1M.vec',own_test_eng, get_facebook_vecs_word_analogy_accuracy, False)])


../models/ext/fasttext//wiki-news-300d-1M-subword.vec @ ../testsets/ownEng.txt0.07407407407407407 | setting 0 to missing words | case insensitive
../models/ext/fasttext//wiki-news-300d-1M-subword.vec @ ../testsets/ownEng.txt0.2222222222222222 | case insensitive
../models/ext/fasttext//wiki-news-300d-1M-subword.vec @ ../testsets/ownEng.txt0.0 | setting 0 to missing words | case sensitive
../models/ext/fasttext//wiki-news-300d-1M-subword.vec @ ../testsets/ownEng.txt0.0 | case sensitive
../models/ext/fasttext/wiki-news-300d-1M.vec @ ../testsets/ownEng.txt0.07407407407407407 | setting 0 to missing words | case insensitive
../models/ext/fasttext/wiki-news-300d-1M.vec @ ../testsets/ownEng.txt0.2222222222222222 | case insensitive
../models/ext/fasttext/wiki-news-300d-1M.vec @ ../testsets/ownEng.txt0.0 | setting 0 to missing words | case sensitive
../models/ext/fasttext/wiki-news-300d-1M.vec @ ../testsets/ownEng.txt0.0 | case sensitive


In [18]:
eval_all_params('../models/ptWiki0920300-noPP-cbow-300.bin',own_test_br, get_facebook_vecs_word_analogy_accuracy, True)

../models/ptWiki0920300-noPP-cbow-300.bin @ ../testsets/ownBR.txt0.0 | setting 0 to missing words | case insensitive
../models/ptWiki0920300-noPP-cbow-300.bin @ ../testsets/ownBR.txt0.0 | case insensitive
../models/ptWiki0920300-noPP-cbow-300.bin @ ../testsets/ownBR.txt0.0 | setting 0 to missing words | case sensitive
../models/ptWiki0920300-noPP-cbow-300.bin @ ../testsets/ownBR.txtNone | case sensitive


## [OLD] Treinando Modelos pre-processados e n-processados
Dump BRwiki

# Referencia

https://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf