## Python2Vec

In [1]:
import os

import pandas as pd
import numpy as np

from gensim.models import Word2Vec, FastText, Doc2Vec
from gensim.models.word2vec import LineSentence
from gensim.models.doc2vec import TaggedLineDocument

### Word2Vec Model

In [2]:
# Reuse the cached Word2Vec model when one exists; otherwise train it on the
# line-per-sentence corpus and cache the result under models/.
try:
    w2v_model = Word2Vec.load('models/w2v')

except FileNotFoundError:
    w2v_model = Word2Vec()

    # The context manager closes the corpus even if vocabulary building or
    # training raises. `sentence` is consumed by both build_vocab and train,
    # so the handle has to stay open until training has finished.
    with open('python_line_breaks.txt', 'r') as train_data:
        sentence = LineSentence(train_data)

        w2v_model.build_vocab(sentence)
        w2v_model.train(sentence, total_examples=w2v_model.corpus_count, epochs=w2v_model.epochs)

    # A missing models/ directory also lands in this branch; create it so the
    # freshly trained model is not lost to a failed save.
    os.makedirs('models', exist_ok=True)
    w2v_model.save('models/w2v')

### FastText Model

In [3]:
# Reuse the cached FastText model when one exists; otherwise train it on the
# line-per-sentence corpus and cache the result under models/.
try:
    fast_model = FastText.load('models/fast')

except FileNotFoundError:
    fast_model = FastText()

    # The context manager closes the corpus even if vocabulary building or
    # training raises. `sentence` is consumed by both build_vocab and train,
    # so the handle has to stay open until training has finished.
    with open('python_line_breaks.txt', 'r') as train_data:
        sentence = LineSentence(train_data)

        fast_model.build_vocab(sentence)
        fast_model.train(sentence, total_examples=fast_model.corpus_count, epochs=fast_model.epochs)

    # A missing models/ directory also lands in this branch; create it so the
    # freshly trained model is not lost to a failed save.
    os.makedirs('models', exist_ok=True)
    fast_model.save('models/fast')

### Doc2Vec Model

In [4]:
# Reuse the cached Doc2Vec model when one exists; otherwise train it on the
# corpus file (each line is treated as one tagged document by
# TaggedLineDocument) and cache the result under models/.
try:
    d2v_model = Doc2Vec.load('models/d2v')

except FileNotFoundError:
    d2v_model = Doc2Vec()

    # The context manager closes the corpus even if vocabulary building or
    # training raises. `docs` is consumed by both build_vocab and train, so
    # the handle has to stay open until training has finished.
    with open('python_no_breaks.txt', 'r') as train_data:
        docs = TaggedLineDocument(train_data)

        d2v_model.build_vocab(docs)
        d2v_model.train(docs, total_examples=d2v_model.corpus_count, epochs=d2v_model.epochs)

    # A missing models/ directory also lands in this branch; create it so the
    # freshly trained model is not lost to a failed save.
    os.makedirs('models', exist_ok=True)
    d2v_model.save('models/d2v')

In [16]:
# Compare the three models: for each of the most frequent identifiers, record
# which other top identifier each model rates as most similar, and how similar.
N = 1000  # upper bound on the number of identifiers compared

# keep_default_na=False stops pandas from parsing identifier text such as
# `nan`, `null` or `NA` into float NaN, which the models cannot look up.
# Only the 'identifier' column is read in this cell.
id_freq_df = pd.read_csv(
    'identifier_frequency.csv',
    index_col=0,
    dtype={'identifier': str},
    keep_default_na=False,
)
top_ids = id_freq_df['identifier'][:N].to_numpy()
N = len(top_ids)  # the CSV may list fewer identifiers than requested


def _pairwise_similarity(keyed_vectors, words):
    """Cosine similarity between every pair of `words`.

    Each word is looked up once via `keyed_vectors[word]`; the similarities
    then come from a single matrix product of the length-normalised vectors.

    Returns `(sims, known)`:
      * `sims[i][j]` is the cosine similarity of words i and j. The diagonal,
        and every row/column of a word whose lookup failed, is 0.0.
      * `known[i]` is True when the lookup of `words[i]` produced a vector.
    """
    size = len(words)
    sims = np.zeros((size, size))
    known = np.zeros(size, dtype=bool)

    found, unit_vectors = [], []
    for position, word in enumerate(words):
        try:
            vector = np.asarray(keyed_vectors[word], dtype=float)
        except (KeyError, TypeError):
            # Best effort: a word the model cannot look up is skipped rather
            # than aborting the whole comparison.
            continue
        length = np.linalg.norm(vector)
        # An all-zero vector cannot be normalised; keep it so that its
        # similarities come out as 0.
        unit_vectors.append(vector / length if length > 0 else vector)
        found.append(position)

    if found:
        stacked = np.stack(unit_vectors)
        sims[np.ix_(found, found)] = stacked @ stacked.T
        known[found] = True
        np.fill_diagonal(sims, 0.0)  # a word is not its own neighbour

    return sims, known


def _best_match(sims, known):
    """Pick, per word, the most similar *other* word that has a vector.

    Returns `(best, has_match)`: `best[j]` is the row index of the highest
    similarity in column j, ignoring the diagonal and unknown words;
    `has_match[j]` is False when column j has no valid candidate (then
    `best[j]` is 0 and carries no meaning).
    """
    if known.size == 0:
        return np.zeros(0, dtype=int), np.zeros(0, dtype=bool)

    valid = known[:, None] & known[None, :]
    np.fill_diagonal(valid, False)
    # -inf instead of 0 for excluded cells, so that a genuine negative
    # similarity still beats a missing entry.
    masked = np.where(valid, sims, -np.inf)
    return masked.argmax(axis=0), valid.any(axis=0)


def _summary_columns(sims, best, has_match):
    """Neighbour names and rounded similarities; None / NaN where no match exists."""
    names = np.where(has_match, top_ids[best], None)
    scores = np.where(has_match, sims[best, np.arange(len(best))], np.nan)
    return names, np.round(scores, 2)


w2v_top_ids, w2v_known = _pairwise_similarity(w2v_model.wv, top_ids)
fast_top_ids, fast_known = _pairwise_similarity(fast_model.wv, top_ids)
d2v_top_ids, d2v_known = _pairwise_similarity(d2v_model.wv, top_ids)

w2v_most_similar, w2v_has_match = _best_match(w2v_top_ids, w2v_known)
fast_most_similar, fast_has_match = _best_match(fast_top_ids, fast_known)
d2v_most_similar, d2v_has_match = _best_match(d2v_top_ids, d2v_known)

w2v_names, w2v_scores = _summary_columns(w2v_top_ids, w2v_most_similar, w2v_has_match)
fast_names, fast_scores = _summary_columns(fast_top_ids, fast_most_similar, fast_has_match)
d2v_names, d2v_scores = _summary_columns(d2v_top_ids, d2v_most_similar, d2v_has_match)

df = pd.DataFrame({
    'word': top_ids,
    'w2v_most_similar': w2v_names,
    'fast_most_similar': fast_names,
    'd2v_most_similar': d2v_names,
    'w2v_similarity': w2v_scores,
    'fast_similarity': fast_scores,
    'd2v_similarity': d2v_scores,
})

df.to_csv('most_similar_identifiers.csv')