In [6]:
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from glob import glob


In [7]:
def getModel(modelFile):
    model = KeyedVectors.load_word2vec_format(modelFile)
    return model

# https://datascience-enthusiast.com/DL/Operations_on_word_vectors.html
def similar_cos(u, v):
    distance = 0.0

    # Compute the dot product between u and v (≈1 line)
    dot = np.dot(u, v)
    # Compute the L2 norm of u (≈1 line)
#     norm_u = np.sqrt(np.sum(u**2))
    norm_u = np.linalg.norm(u)
    
    # Compute the L2 norm of v (≈1 line)
#     norm_v = np.sqrt(np.sum(v**2))
    norm_v = np.linalg.norm(v)
    
    # Compute the cosine similarity defined by formula (1) (≈1 line)
    cosine_similarity = dot / (norm_u * norm_v)

    return cosine_similarity

def extract_relation(target, related):
    return target - related

def find_best_related_word(analogy_vector, model):
#     related = target + relation
    max_sim = 0.0
    best_word = None
    for word in model.vocab:
#         relation1 = relation.reshape(1,-1)
#         relation2 = extract_relation(model.word_vec(target), model.word_vec(word)).reshape(1,-1)
        sim = similar_cos(analogy_vector, model[word])
        if(sim > max_sim):
            best_word = word
    
    return best_word;
    

In [3]:
def processTriadAnalogy(word1,word2,word3, modelsList):
    result = []
    for modelFileRef in modelsList:
        model = getModel(modelFileRef)
        if(word1 in model and word2 in model and word3 in model):
            
            v_word1 = model.word_vec(word1)
            v_word2 = model.word_vec(word2)
            v_word3 = model.word_vec(word3)
    #         basisRelation = extract_relation(v_word1, v_word2)
            basisRelation = v_word1 - v_word2 + v_word3
            bestWordOnModel = find_best_related_word(basisRelation, model)
            result.append('Model: ' + modelFileRef + '/n Best Word : ' + bestWordOnModel)
        else:
            print('Fail:Word not found in model')
    return result
            

In [4]:
# result = processTriadAnalogy(word1,word2,word3,modelsList)

- Teste 1 - Biblioteca KeyedVector e seus metodos de similaridade

In [5]:
modelsList = glob('../models/cbow_*.txt')
model = [getModel(x) for x in modelsList]

print(modelsList)


['../models/cbow_s300.txt', '../models/cbow_s100.txt', '../models/cbow_s50.txt']


In [21]:
model[0].most_similar(positive=['barack', 'obama'])

[('eisenhower', 0.47269853949546814),
 ('lula', 0.460972398519516),
 ('bush', 0.4601280987262726),
 ('ratsiraka', 0.43053072690963745),
 ('clinton', 0.4282917082309723),
 ('saakashvili', 0.4188674986362457),
 ('uribe', 0.4161287844181061),
 ('draghi', 0.40998852252960205),
 ('netanyahu', 0.403103768825531),
 ('yushchenko', 0.3994860053062439)]

In [8]:
word1 = 'rei'
word2 = 'homem'
word3 = 'rainha'

for index, x in enumerate(model):
    nMostSimilar = x.most_similar(positive=[word1,word3], negative=[word2])

    print(modelsList[index], nMostSimilar)
    print('Positive:  ', word1,word3, '\nNegative:',word2)
    print()

../models/cbow_s300.txt [('princesa', 0.5880060791969299), ('infanta', 0.5544092655181885), ('rainha-mãe', 0.5047087669372559), ('ex-rainha', 0.5002995729446411), ('imperatriz', 0.4982605278491974), ('raínha', 0.498224675655365), ('rainha-consorte', 0.4923296570777893), ('duquesa', 0.4890612065792084), ('condessa', 0.4880494177341461), ('regente', 0.460879385471344)]
Positive:   rei rainha 
Negative: homem

../models/cbow_s100.txt [('raínha', 0.6748343706130981), ('princesa', 0.6687842607498169), ('rainha-consorte', 0.6617846488952637), ('rainha-mãe', 0.6472653746604919), ('duquesa', 0.6418556571006775), ('pártia', 0.6328529119491577), ('imperatriz', 0.628162682056427), ('primogénita', 0.6190635561943054), ('condessa', 0.6179429292678833), ('coroação', 0.6084483861923218)]
Positive:   rei rainha 
Negative: homem

../models/cbow_s50.txt [('duquesa', 0.7385571002960205), ('princesa', 0.724755048751831), ('grã-duquesa', 0.7228216528892517), ('imperatriz', 0.7180843949317932), ('rainha-con

In [None]:
word1 = 'rei'
word2 = 'homem'
word3 = 'mulher'

listm = [modelsList[0]]

result = processTriadAnalogy(word1, word2, word3, listm)

Troca de Palavras 

In [13]:
word1 = 'rei'
word2 = 'homem'
word3 = 'mulher'

model = getModel(modelsList[2])

list = model.most_similar(positive=[word1,word3], negative=[word2])

In [14]:
print(modelsList[2], list)
print('Positive:  ', word1,word3, '\nNegative:',word2)

../models/cbow_s100.txt [('esposa', 0.688875675201416), ('filha', 0.6741011142730713), ('governanta', 0.6724554896354675), ('sobrinha', 0.669994056224823), ('madrasta', 0.6670832633972168), ('concubina', 0.6659663915634155), ('dama-de-companhia', 0.6614803075790405), ('benção', 0.659320056438446), ('múmia', 0.6535747051239014), ('bênção', 0.6500141620635986)]
Positive:   rei mulher 
Negative: homem


## Testando os métodos indicados pelo Paper - FastText 300 e  Glove 300

In [5]:
models = []
fast = '../models/skip_s300fast.txt'
glove = '../models/glove_s300.txt'
models.append(getModel(fast))
models.append(getModel(glove))

In [6]:
model = models[0]
word1 = 'rei'
word2 = 'homem'
word3 = 'mulher'

list = model.most_similar(positive=[word1,word3], negative=[word2])

In [8]:
model = models[1]
list2 = model.most_similar(positive=[word1,word3], negative=[word2])b

In [9]:
print(list)
print(list2)

[('rainha', 0.7449502348899841), ('rainha-regente', 0.6796571016311646), ('princesa-regente', 0.6528101563453674), ('esposa', 0.6449019908905029), ('princesa', 0.6373282670974731), ('consorte', 0.6351529955863953), ('«rainha', 0.6276739835739136), ('filha', 0.6266849040985107), ('desposa', 0.6201632022857666), ('rainha-a', 0.6200650930404663)]
[('rainha', 0.7193283438682556), ('filha', 0.6310628652572632), ('esposa', 0.627386212348938), ('princesa', 0.6068346500396729), ('isabel', 0.5972704887390137), ('irmã', 0.5631763935089111), ('consorte', 0.5383861064910889), ('trono', 0.5359665155410767), ('príncipe', 0.5172233581542969), ('joana', 0.5126430988311768)]


In [16]:
models[0].most_similar(positive=['primeira-dama','obama'])

[('primeira-damas', 0.8656221628189087),
 ('barack', 0.8468867540359497),
 ('ex-primeira-dama', 0.7975351214408875),
 ('obamas', 0.762374997138977),
 ('clinton', 0.7279771566390991),
 ('obama.a', 0.7279264330863953),
 ('segunda-dama', 0.721454918384552),
 ('vice-primeira-dama', 0.7174794673919678),
 ('robama', 0.713536262512207),
 ('hillary', 0.7119442224502563)]

## Avaliando precisao dos modelos


In [20]:
def eval_analogy_accuracy(modelRef, testSet):
    model = getModel(modelRef)
    acc = model.accuracy(testSet)
    print(acc[0]['correct'])
    corr = len(acc[0]['correct'])
    incorr = len(acc[0]['incorrect'])
    
    return [corr, incorr]

In [21]:
modelsList = glob('../models/*fast.txt')
testSet = glob('../testsets/*Analogies*')
acc = {}
for model in modelsList :
    for test in testSet:        
        acc_ratings = eval_analogy_accuracy(model,test)
        acc[model+test] = acc_ratings
        accuracy = acc_ratings[0] / (acc_ratings[0] + acc_ratings[1])
        print(model + '@' + test + '-' + str(accuracy) + '%')

[('ATENAS', 'GRÉCIA', 'PEQUIM', 'CHINA'), ('ATENAS', 'GRÉCIA', 'BERLIM', 'ALEMANHA'), ('ATENAS', 'GRÉCIA', 'CAIRO', 'EGITO'), ('ATENAS', 'GRÉCIA', 'HAVANA', 'CUBA'), ('ATENAS', 'GRÉCIA', 'HELSÍNQUIA', 'FINLÂNDIA'), ('ATENAS', 'GRÉCIA', 'LONDRES', 'INGLATERRA'), ('ATENAS', 'GRÉCIA', 'MADRID', 'ESPANHA'), ('ATENAS', 'GRÉCIA', 'MOSCOVO', 'RÚSSIA'), ('ATENAS', 'GRÉCIA', 'OSLO', 'NORUEGA'), ('ATENAS', 'GRÉCIA', 'OTTAWA', 'CANADÁ'), ('ATENAS', 'GRÉCIA', 'PARIS', 'FRANÇA'), ('ATENAS', 'GRÉCIA', 'ROMA', 'ITÁLIA'), ('ATENAS', 'GRÉCIA', 'ESTOCOLMO', 'SUÉCIA'), ('ATENAS', 'GRÉCIA', 'TÓQUIO', 'JAPÃO'), ('PEQUIM', 'CHINA', 'BERLIM', 'ALEMANHA'), ('PEQUIM', 'CHINA', 'CAIRO', 'EGITO'), ('PEQUIM', 'CHINA', 'HAVANA', 'CUBA'), ('PEQUIM', 'CHINA', 'HELSÍNQUIA', 'FINLÂNDIA'), ('PEQUIM', 'CHINA', 'LONDRES', 'INGLATERRA'), ('PEQUIM', 'CHINA', 'MOSCOVO', 'RÚSSIA'), ('PEQUIM', 'CHINA', 'OSLO', 'NORUEGA'), ('PEQUIM', 'CHINA', 'OTTAWA', 'CANADÁ'), ('PEQUIM', 'CHINA', 'PARIS', 'FRANÇA'), ('PEQUIM', 'CHINA', 'ROM

[('ATENAS', 'GRÉCIA', 'BAGDÁ', 'IRAQUE'), ('ATENAS', 'GRÉCIA', 'PEQUIM', 'CHINA'), ('ATENAS', 'GRÉCIA', 'BERLIM', 'ALEMANHA'), ('ATENAS', 'GRÉCIA', 'CAIRO', 'EGITO'), ('ATENAS', 'GRÉCIA', 'HAVANA', 'CUBA'), ('ATENAS', 'GRÉCIA', 'CABUL', 'AFEGANISTÃO'), ('ATENAS', 'GRÉCIA', 'LONDRES', 'INGLATERRA'), ('ATENAS', 'GRÉCIA', 'MADRID', 'ESPANHA'), ('ATENAS', 'GRÉCIA', 'MOSCOU', 'RÚSSIA'), ('ATENAS', 'GRÉCIA', 'OSLO', 'NORUEGA'), ('ATENAS', 'GRÉCIA', 'OTTAWA', 'CANADÁ'), ('ATENAS', 'GRÉCIA', 'PARIS', 'FRANÇA'), ('ATENAS', 'GRÉCIA', 'ROMA', 'ITÁLIA'), ('ATENAS', 'GRÉCIA', 'ESTOCOLMO', 'SUÉCIA'), ('ATENAS', 'GRÉCIA', 'TEERÃ', 'IRÃ'), ('ATENAS', 'GRÉCIA', 'TÓQUIO', 'JAPÃO'), ('BAGDÁ', 'IRAQUE', 'PEQUIM', 'CHINA'), ('BAGDÁ', 'IRAQUE', 'BERLIM', 'ALEMANHA'), ('BAGDÁ', 'IRAQUE', 'HAVANA', 'CUBA'), ('BAGDÁ', 'IRAQUE', 'CABUL', 'AFEGANISTÃO'), ('BAGDÁ', 'IRAQUE', 'MOSCOU', 'RÚSSIA'), ('BAGDÁ', 'IRAQUE', 'OSLO', 'NORUEGA'), ('BAGDÁ', 'IRAQUE', 'OTTAWA', 'CANADÁ'), ('BAGDÁ', 'IRAQUE', 'PARIS', 'FRANÇA')

In [22]:
modelsList = glob('../models/*fast.txt')
model  =  getModel(modelsList[0])


In [28]:
model.most_similar(positive=['atenas','pequim'],negative=['grécia'])

[('seul', 0.6353664398193359),
 ('pequim-taipé', 0.6326602101325989),
 ('xangai', 0.6200852394104004),
 ('pequim-xangai', 0.6111860275268555),
 ('pequim-pyongyang', 0.6070454120635986),
 ('pequim-china', 0.6038728952407837),
 ('taipé', 0.5943089723587036),
 ('xiaoshuang', 0.5923561453819275),
 ('pequim.o', 0.590029239654541),
 ('xangai-china', 0.5886864066123962)]

In [29]:
model.most_similar(positive=['atenas','china'],negative=['grécia'])

[('pequim', 0.8225739002227783),
 ('xangai-china', 0.7064758539199829),
 ('tianjing', 0.699294924736023),
 ('shenzhen-china', 0.689010500907898),
 ('zhongzhou', 0.6873874664306641),
 ('zhendong', 0.6834413409233093),
 ('chinesa', 0.6772347688674927),
 ('zhongjing', 0.6769419312477112),
 ('xangai', 0.6768250465393066),
 ('zhongyang', 0.6758601665496826)]

In [30]:
model.most_similar(positive=['grécia','china'],negative=['atenas'])

[('coreia', 0.6270121335983276),
 ('taiwan', 0.621976375579834),
 ('tailândia', 0.6031404137611389),
 ('china.', 0.5803037881851196),
 ('coréia', 0.558039665222168),
 ('japão', 0.5508037209510803),
 ('china-taiwan', 0.550620436668396),
 ('índia', 0.5471624135971069),
 ('malásia', 0.5458478927612305),
 ('rússia', 0.5385729670524597)]

In [31]:
model.most_similar(positive=['grécia','pequim'],negative=['atenas'])

[('china', 0.7808929681777954),
 ('taiwan', 0.658018171787262),
 ('pequim-pyongyang', 0.6476016044616699),
 ('coreia', 0.6296229958534241),
 ('pequim-china', 0.609553337097168),
 ('china-taiwan', 0.6077865362167358),
 ('indonésia', 0.6074162721633911),
 ('pequim-taipé', 0.6063328385353088),
 ('tibete', 0.6040211915969849),
 ('china.', 0.5988819599151611)]

In [19]:
modelsList = glob('../models/*fast.txt')
testSet = glob('../testsets/*.txt')
acc = {}
for model in modelsList :
    for test in testSet:        
        acc_ratings = eval_analogy_accuracy(model,test)
        acc[model+test] = acc_ratings
        accuracy = acc_ratings[0] / (acc_ratings[0] + acc_ratings[1])
        print(model + '@' + test + '-' + str(accuracy) + '%')

ValueError: missing section header before line #0 in ../testsets/LX-WordSim-353.txt

# Referencia

https://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf