In [21]:
import numpy as np
import pandas as pd

In [8]:
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models.keyedvectors import KeyedVectors

In [101]:
glove2word2vec(glove_input_file="./model_300/vectors.txt", word2vec_output_file="./model_300/gensim_glove_vectors.txt")

(336145, 300)

In [100]:
glove_model = KeyedVectors.load_word2vec_format("./model_300/gensim_glove_vectors.txt", binary=False)

In [11]:
glove_model.vocab

{'suyo_possessivePronoun': <gensim.models.keyedvectors.Vocab at 0x13949e700>,
 'prpers_pronoun': <gensim.models.keyedvectors.Vocab at 0x13949e1f0>,
 'suyo_determiner': <gensim.models.keyedvectors.Vocab at 0x13949ea30>,
 'el_pronoun': <gensim.models.keyedvectors.Vocab at 0x13949e370>,
 'el_seu_determiner': <gensim.models.keyedvectors.Vocab at 0x13949e2b0>,
 'el_seu_possessivePronoun': <gensim.models.keyedvectors.Vocab at 0x13949e550>,
 'ce_determiner': <gensim.models.keyedvectors.Vocab at 0x13949eac0>,
 'el_determiner': <gensim.models.keyedvectors.Vocab at 0x13949ea90>,
 'ce_demonstrativePronoun': <gensim.models.keyedvectors.Vocab at 0x13949e340>,
 'aquel_demonstrativePronoun': <gensim.models.keyedvectors.Vocab at 0x13949e490>,
 'aquel_determiner': <gensim.models.keyedvectors.Vocab at 0x13949ea00>,
 'en_preposition': <gensim.models.keyedvectors.Vocab at 0x13949e2e0>,
 'cap_noun': <gensim.models.keyedvectors.Vocab at 0x13949e970>,
 'on_adverb': <gensim.models.keyedvectors.Vocab at 0x1394

In [85]:
glove_model.most_similar('science_noun')

[('ciencia_noun', 0.9301629662513733),
 ('zientzia_noun', 0.5177814960479736),
 ('impertèrrit_adjective', 0.2623631954193115),
 ('Hoyales_de_Roa_properNoun', 0.2546602487564087),
 ('jivarització_noun', 0.24314603209495544),
 ('uniformjako_noun', 0.23933115601539612),
 ('ĥaldeo_noun', 0.2355116754770279),
 ('Carlita_properNoun', 0.23205572366714478),
 ('alabaster_noun', 0.23179756104946136),
 ('dializilo_noun', 0.23042315244674683)]

In [22]:
"""
this function return to the vocabulary from source of dictionary.
input: path of lists dictionaries
output: all word from 'source' tag of these dics. word with form: word_pos
"""

def to_word(list_df):
    words = []
    for path in list_df:
        df = pd.read_csv('../code/Small/' + path)
        words += (df['source'].astype(str) + '_' + df['POS'].astype(str)).to_list()
    return set(words)

In [17]:
# get all vocabulary
vocabulary = list(glove_model.vocab.keys())

In [28]:
path_dict_en = ['TransSetEN-CA_small.csv', 'TransSetEN-ES_small.csv', 'TransSetEN-GL_small.csv']
words_en = to_word(path_dict_en)
#interaction beetween words english with vocabulary
words_en = list(set(words_en) & set(vocabulary))
english_vectors = glove_model[words_en]
print(english_vectors.shape)

(47491, 300)


In [29]:
path_dict_fr = ['TransSetFR-CA_small.csv', 'TransSetFR-ES_small.csv']
words_fr = to_word(path_dict_fr)
words_fr = list(set(words_fr) & set(vocabulary))
france_vectors = glove_model[words_fr]
print(france_vectors.shape)

(62855, 300)


In [79]:
words_fr[23706]

'livre_noun'

In [30]:
#get part of speech of word in vocabulary ex: shape_noun -> noun
def get_pos(word):
    return word.split('_')[-1]

In [93]:
"""
This function find the trans word most similarity of input word
----------
params:
    word_vec: vector embedding of input word
    matrix_vec_trans: matrix embedding of the target langague
    dic_trans: vocabulary of the target langague
    pos: Part of speech of input word
"""
def find_translate(word_vec, matrix_vec_trans, dic_trans, pos):
    assert len(matrix_vec_trans) == len(dic_trans)
    #calculate the cosine similary of input vec and matrix of target langague
    cosine_similary = glove_model.cosine_similarities(word_vec, matrix_vec_trans) 
    #sort and return index
    # why return negative? :))) 
    sort_cosine = np.argsort(-cosine_similary)
    for index in sort_cosine:
        trans_word = dic_trans[index]
        if(get_pos(trans_word)==pos):
            return trans_word

In [34]:
# thai_ba_tuan => thai ba tuan
def get_word(word):
    return ' '.join(word.split('_')[0:-1])

In [90]:
test_word_vec = glove_model['kernel_noun']
cosine_similary = glove_model.cosine_similarities(test_word_vec, france_vectors) 
sort_cosine = np.argsort(-cosine_similary)
for index in sort_cosine[0:10]:
    print(words_fr[index])

lance-grenades_noun
Eslida_properNoun
massacrer_verb
sacrifier_verb
promission_noun
uranium_noun
maison_noun
XXVIIème_adjective
Oda_properNoun
sucrer_verb


In [92]:
glove_model.similarity('shit_noun', 'merde_noun')

-0.95093447

In [94]:
# building the dictionary of the word
dict_en_fr = []
index = 0
for word in words_en:
    pos = get_pos(word)
    if index % 1000 == 0:
        print(index,end=' ')
    index+=1
#     print(word)
#     print(pos)
    word_vec = glove_model[word]
    trans_word = find_translate(word_vec, france_vectors, words_fr, pos)
    if trans_word != None:
        trans_word = get_word(trans_word)
#     print(trans_word)
    dict_en_fr.append(get_word(word)+'\t'+str(trans_word)+'\t'+str(pos)+'\t1')
#     print('======================')

0 1000 2000 3000 4000 5000 6000 7000 8000 9000 10000 11000 12000 13000 14000 15000 16000 17000 18000 19000 20000 21000 22000 23000 24000 25000 26000 27000 28000 29000 30000 31000 32000 33000 34000 35000 36000 37000 38000 39000 40000 41000 42000 43000 44000 45000 46000 47000 

In [95]:
len(dict_en_fr)

47491

In [96]:
with open('../code/BuildDictionary/TIADbaseline_glove_trans_en-fr.tsv', 'w') as f:
    for item in dict_en_fr:
        f.write("%s\n" % item)

In [69]:
!pwd

/Users/batuan/Documents/Master1Luminy/semetre2/TER/GloVe
