In [1]:
import pandas as pd
from numpy.random import randint, choice, seed
from gensim.models import Word2Vec
from dask import delayed, compute
from dask.distributed import Client
import os

In [2]:
# Load the sentence-level token table.
# '\\t' is a two-character pattern (backslash + t) that pandas treats as a
# regular expression matching a tab, which is why the python engine is requested.
corpus = pd.read_csv(
    'sentence_tokens.tsv',
    sep = '\\t',
    engine = 'python',
)

In [3]:
# Keep only the rows whose 'text' value is present.
corpus = corpus.loc[corpus['text'].notna()]

In [4]:
# Split each comma-joined string into a list of tokens; entries the split
# leaves missing are replaced by an empty string.
corpus = corpus.assign(text = corpus['text'].str.split(',').fillna(''))

In [5]:
# Collapse to one row per month: 'text' becomes the list of that month's
# tokenized sentences, and month_yr is restored as an ordinary column.
corpus = (
    corpus
    .groupby('month_yr')['text']
    .agg(list)
    .reset_index()
)

In [6]:
# Display the grouped corpus: one row per month_yr, with 'text' holding that
# month's list of tokenized sentences.
corpus

Unnamed: 0,month_yr,text
0,1-1948,"[[well, known, now, india, has, made, referenc..."
1,1-1949,"[[your, excellency, mr, premier, vice-chancell..."
2,1-1950,"[[mr, chancellor, mr, vice-chancellor, fellow,..."
3,1-1951,"[[your, highness, should, just, like, say, how..."
4,1-1955,"[[french, establishments, land, customs, cordo..."
...,...,...
824,9-2014,"[[official, spokespersons, response, queries, ..."
825,9-2015,"[[most, venerable, dr, founder, chancellor, in..."
826,9-2016,"[[your, excellency, president, abdel, fattah, ..."
827,9-2017,"[[your, excellencies, president, new developme..."


In [7]:
# Fix NumPy's global seed so the list of per-model seeds below is reproducible.
seed(932487)
# 100 integer seeds drawn from [1, 100000); the training loop runs one model per seed.
seeds = randint(low = 1, high = 100000, size = 100)

In [8]:
def generate_hyperparameters(rseed):
    """Draw one random set of SGNS hyperparameters, reproducibly from ``rseed``.

    Parameters
    ----------
    rseed : int
        Seed for the draw; the same seed always yields the same triple.

    Returns
    -------
    tuple
        ``(vsize, wsize, nsize)`` where ``vsize`` is drawn from
        ``range(50, 360, 10)``, ``wsize`` from ``range(2, 11)`` and
        ``nsize`` from ``range(5, 21)``. ``run_word2vec`` passes them to
        ``Word2Vec`` as ``vector_size``, ``window`` and ``negative``.
    """
    # Imported locally so the function carries its own dependency.
    from numpy.random import RandomState

    # A private generator is used instead of re-seeding NumPy's global one:
    # the draws are the same as after ``seed(rseed)``, but the caller's global
    # random state is left untouched, and calls running concurrently in the
    # same process cannot interleave their seed/draw sequences.
    rng = RandomState(rseed)
    vsize = rng.choice(range(50, 360, 10))
    wsize = rng.choice(range(2, 11, 1))
    nsize = rng.choice(range(5, 21, 1))
    return vsize, wsize, nsize

In [9]:
def run_word2vec(seed_idx, rseed, month, tokens, vector_dir):
    """Train one bootstrapped skip-gram model for a month and save its word vectors.

    Parameters
    ----------
    seed_idx : int
        Index of this run; used in the progress message and the file name.
    rseed : int
        Seed shared by the bootstrap resample, the hyperparameter draw and
        the ``seed`` argument of ``Word2Vec``.
    month : str
        Month label; used in the progress message and the file name.
    tokens : pandas.Series
        The month's sentences, one per element. Must support ``len()`` and
        ``.sample(n=, replace=, random_state=)``.
    vector_dir : str
        Directory the vectors are written to, with or without a trailing
        path separator.

    Returns
    -------
    None
        The vectors are saved to ``<vector_dir>/<month>-<seed_idx>.wordvectors``.
    """
    # sample sentences with replacement, keeping the original number of sentences
    sample_tokens = tokens.sample(n = len(tokens), replace = True, random_state = rseed)
    # randomly generate SGNS hyperparameters
    vsize, wsize, nsize = generate_hyperparameters(rseed)
    print('Training model', seed_idx, 'for', month)
    # train word embeddings (sg = 1 selects skip-gram)
    ## set workers to max available threads on node if using more than one node in cluster
    ## a seed is provided; gensim's documentation says a fully reproducible run also
    ## needs workers = 1 and, across interpreter launches, a fixed PYTHONHASHSEED
    ## NOTE(review): confirm against the installed gensim version
    model = Word2Vec(sentences = sample_tokens, vector_size = vsize, window = wsize,
                     min_count = 1, workers = 1, sg = 1, negative = nsize, seed = rseed)
    word_vectors = model.wv
    print('Saving results\n')
    # save vectors to disk rather than returning them, to avoid running out of memory
    # os.path.join gives the same path whether or not vector_dir ends with a
    # separator; str() keeps non-string month labels from raising on concatenation
    filename = str(month) + '-' + str(seed_idx) + '.wordvectors'
    filepath = os.path.join(vector_dir, filename)
    word_vectors.save(filepath)
    return

In [10]:
# Connect a Dask client, asking for 8 workers. No scheduler address is given,
# so per the dask.distributed docs the client starts its own local cluster.
client = Client(n_workers = 8)

In [11]:
# Directory that receives one .wordvectors file per (month, seed) model.
vector_dir = 'G:/vectors/'
# exist_ok = True creates the directory (and any missing parents) only when
# needed, without a separate existence check that could go stale before the
# create; it also fails right away if the path exists but is not a directory.
os.makedirs(vector_dir, exist_ok = True)

In [12]:
# One pass per month: send that month's sentences to the workers once, then
# train one model per seed against the shared copy.
for row in corpus.itertuples(index = False):
    # first field is the month label, second is the list of tokenized sentences
    month = row[0]
    # A pandas Series makes the resampling inside run_word2vec straightforward.
    # The data is too large to embed in every task, so it is scattered to all
    # workers up front and only the resulting future is passed along.
    tokens = client.scatter(pd.Series(row[1]), broadcast = True)
    tasks = [
        delayed(run_word2vec)(seed_idx, rseed, month, tokens, vector_dir)
        for seed_idx, rseed in enumerate(seeds)
    ]
    compute(tasks)

In [13]:
# Close the Dask client once the training loop above has finished.
client.close()