In [1]:
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from glob import glob


In [2]:
def getModel(modelFile):
    model = KeyedVectors.load_word2vec_format(modelFile)
    return model

# https://datascience-enthusiast.com/DL/Operations_on_word_vectors.html
def similar_cos(u, v):
    distance = 0.0

    # Compute the dot product between u and v (≈1 line)
    dot = np.dot(u, v)
    # Compute the L2 norm of u (≈1 line)
#     norm_u = np.sqrt(np.sum(u**2))
    norm_u = np.linalg.norm(u)
    
    # Compute the L2 norm of v (≈1 line)
#     norm_v = np.sqrt(np.sum(v**2))
    norm_v = np.linalg.norm(v)
    
    # Compute the cosine similarity defined by formula (1) (≈1 line)
    cosine_similarity = dot / (norm_u * norm_v)

    return cosine_similarity

def extract_relation(target, related):
    return target - related

def find_best_related_word(analogy_vector, model):
#     related = target + relation
    max_sim = 0.0
    best_word = None
    for word in model.vocab:
#         relation1 = relation.reshape(1,-1)
#         relation2 = extract_relation(model.word_vec(target), model.word_vec(word)).reshape(1,-1)
        sim = similar_cos(analogy_vector, model[word])
        if(sim > max_sim):
            best_word = word
    
    return best_word;
    

In [3]:
def processTriadAnalogy(word1,word2,word3, modelsList):
    result = []
    for modelFileRef in modelsList:
        model = getModel(modelFileRef)
        if(word1 in model and word2 in model and word3 in model):
            
            v_word1 = model.word_vec(word1)
            v_word2 = model.word_vec(word2)
            v_word3 = model.word_vec(word3)
    #         basisRelation = extract_relation(v_word1, v_word2)
            basisRelation = v_word1 - v_word2 + v_word3
            bestWordOnModel = find_best_related_word(basisRelation, model)
            result.append('Model: ' + modelFileRef + '/n Best Word : ' + bestWordOnModel)
        else:
            print('Fail:Word not found in model')
    return result
            

In [4]:
# result = processTriadAnalogy(word1,word2,word3,modelsList)

- Teste 1 - Biblioteca KeyedVector e seus metodos de similaridade

In [4]:
modelsList = glob('../models/cbow_*.txt')
model = [getModel(x) for x in modelsList]

print(modelsList)


['../models/cbow_s300.txt', '../models/cbow_s100.txt', '../models/cbow_s50.txt']


In [8]:
word1 = 'rei'
word2 = 'homem'
word3 = 'rainha'

for index, x in enumerate(model):
    nMostSimilar = x.most_similar(positive=[word1,word3], negative=[word2])

    print(modelsList[index], nMostSimilar)
    print('Positive:  ', word1,word3, '\nNegative:',word2)
    print()

../models/cbow_s300.txt [('princesa', 0.5880060791969299), ('infanta', 0.5544092655181885), ('rainha-mãe', 0.5047087669372559), ('ex-rainha', 0.5002995729446411), ('imperatriz', 0.4982605278491974), ('raínha', 0.498224675655365), ('rainha-consorte', 0.4923296570777893), ('duquesa', 0.4890612065792084), ('condessa', 0.4880494177341461), ('regente', 0.460879385471344)]
Positive:   rei rainha 
Negative: homem

../models/cbow_s100.txt [('raínha', 0.6748343706130981), ('princesa', 0.6687842607498169), ('rainha-consorte', 0.6617846488952637), ('rainha-mãe', 0.6472653746604919), ('duquesa', 0.6418556571006775), ('pártia', 0.6328529119491577), ('imperatriz', 0.628162682056427), ('primogénita', 0.6190635561943054), ('condessa', 0.6179429292678833), ('coroação', 0.6084483861923218)]
Positive:   rei rainha 
Negative: homem

../models/cbow_s50.txt [('duquesa', 0.7385571002960205), ('princesa', 0.724755048751831), ('grã-duquesa', 0.7228216528892517), ('imperatriz', 0.7180843949317932), ('rainha-con

In [None]:
word1 = 'rei'
word2 = 'homem'
word3 = 'mulher'

listm = [modelsList[0]]

result = processTriadAnalogy(word1, word2, word3, listm)

Troca de Palavras 

In [13]:
word1 = 'rei'
word2 = 'homem'
word3 = 'mulher'

model = getModel(modelsList[2])

list = model.most_similar(positive=[word1,word3], negative=[word2])

In [14]:
print(modelsList[2], list)
print('Positive:  ', word1,word3, '\nNegative:',word2)

../models/cbow_s100.txt [('esposa', 0.688875675201416), ('filha', 0.6741011142730713), ('governanta', 0.6724554896354675), ('sobrinha', 0.669994056224823), ('madrasta', 0.6670832633972168), ('concubina', 0.6659663915634155), ('dama-de-companhia', 0.6614803075790405), ('benção', 0.659320056438446), ('múmia', 0.6535747051239014), ('bênção', 0.6500141620635986)]
Positive:   rei mulher 
Negative: homem


# Referencia

https://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf