## Preliminaries

In [1]:
!pip install gensim



In [2]:
import os
import pandas as pd
import numpy as np
import tqdm

import gensim
from gensim.models import Word2Vec, Phrases

## Import data

In [3]:
# Names of the DataFrame columns used by the cells below.
# CORPUS_COL: text that is whitespace-split into the Word2Vec training sentences.
# VOCAB_COL:  text from which each project's vocabulary is collected.
CORPUS_COL: str = "content-cleaned"
VOCAB_COL: str = "content-no-top"
# Open question: maybe use bi-grams?

In [4]:
# Load the processed comments and turn the corpus column into token lists.
df = pd.read_json('comments-semantic-processed.json')

# Whitespace-tokenise every document; the column is overwritten with the
# token lists, and the same lists form the training corpus.
corpus_tokens = df[CORPUS_COL].str.split()
df[CORPUS_COL] = corpus_tokens
documents = corpus_tokens.tolist()

# Number of documents in the corpus (displayed as the cell output).
len(documents)

15924

## Embedding model

In [5]:
# Train the Word2Vec embedding on the tokenised corpus and persist it to disk.

# Phrase (bigram) detector fitted on the corpus.
# NOTE(review): it is not applied to the sentences passed to Word2Vec in the
# cells visible here -- confirm whether a later cell uses it.
bigram_transformer = Phrases(documents)

# Hyper-parameters collected in one place so they are easy to tune.
word2vec_params = {
    'vector_size': 300,  # dimensionality of each word vector
    'window': 3,         # context window size
    'min_count': 10,     # minimum word frequency to be kept in the vocabulary
    'workers': 4,        # training threads
    'sg': 1,             # 1 selects the skip-gram algorithm
}
model_embedding = Word2Vec(sentences=documents, **word2vec_params)

model_embedding.save('test-Word2Vec-all.model')

In [7]:
# Save embedding matrix for vocab
#
# For each project, write one text file containing a line per word that is
# both in the trained model's vocabulary and in that project's vocabulary:
#     <word> <v1> <v2> ... <vN>
# Words are written in the model's key order, values with 9 decimal places.

# How many projects to export. The cell previously ended its loop with an
# unconditional `break`, i.e. only the first project was processed; that
# behaviour is kept as the default but is now explicit.
# Set to None to export every project.
MAX_PROJECTS = 1

project_ids = df['project-id'].unique().tolist()
corpus_keys = model_embedding.wv.key_to_index.keys()

# `project_id` instead of `id`, which would shadow the builtin.
for project_id in project_ids[:MAX_PROJECTS]:
    print(f'Creating embedding matrix for {project_id}...')
    df_project = df[df['project-id'] == project_id]

    # A set gives constant-time membership tests; the former list made the
    # inner loop scan the whole vocabulary once per model key.
    vocabulary = set(df_project[VOCAB_COL].str.cat(sep=' ').split())

    count = 0
    # The context manager closes the file even if a write fails; the explicit
    # encoding keeps the output independent of the platform default.
    with open(f'test-embedMatrix-{project_id}.txt', 'w', encoding='utf-8') as f:
        for word in corpus_keys:
            if word in vocabulary:
                vec_str = ' '.join('%.9f' % val for val in model_embedding.wv[word])
                f.write(word + ' ' + vec_str + '\n')
                count += 1

    print(f'Embedding matrix created for {count} words in {project_id}.\n')

Creating embedding matrix for P1...
Embedding matrix created for 2921 words in P1.

