In [2]:
import matplotlib as mpl

import matplotlib.pyplot as plt
#import seaborn as sns
plt.rcParams['svg.fonttype'] = 'none'
plt.rcParams['pdf.fonttype'] = 42
plt.rcParams['ps.fonttype'] = 42

import networkx as nx

import pymysql
import pickle
import json

import numpy as np
from sklearn.decomposition import PCA

import time
import hdbscan
import umap

import spacy
from gensim.models import Word2Vec

import random
import re
from pmids2vec import pmids2vec, pmids2vec_titlesOnly
from pmids2corpus import pmids2corpus    #  todo integrate this into pmids2vec

### control parameters

In [3]:
# --- switches: which text fields the embedding cells below will process ---
embed_abstracts_text = False
embed_titles_text = True

# --- file naming ---
# Tag placed at the end of every exported model / corpus file name.
# NOTE(review): the input file name says "250k" while this tag says "50k" --
# confirm the mismatch is intended.
model_export_prefix = '2018_50k'

# JSON file with the clustered PMIDs; read with json.load and indexed by 'pmids'.
path2clusteredPMIDs = 'data_processing_feb2020/pmids_2018_250k.json'


### get metadata

In [4]:
# Read the clustered-PMID file in one go and decode it as JSON.
with open(path2clusteredPMIDs, 'r') as f:
    data = json.loads(f.read())

# One entry under 'pmids' per sample; the loops below iterate range(N_samples).
N_samples = len(data['pmids'])

In [None]:
### embed abstract text

In [4]:
# TODO: force samples to have the same size and test the impact of vocabulary?
#    Or just test variance over multiple sample sets to check for effects of
#    sample-size asymmetry.

# Call pmids2vec once per (sample, cluster) pair, exporting each model under
# data_processing_feb2020/. Runs only when `embed_abstracts_text` is set.

if embed_abstracts_text:

    # Load the clustered PMIDs once, up front: the file is the same for every
    # sample, so re-parsing it on each loop iteration was redundant work.
    with open(path2clusteredPMIDs, 'r') as f:
        data = json.load(f)

    for sample_id in range(N_samples):
        # NOTE(review): reset for every sample, so after the loop this only
        # holds the export paths of the last sample -- confirm that is intended.
        model_names = []
        sample_pmids = []   # not used in this cell; kept so the name stays defined

        # data['pmids'] is keyed by sample id (as a string), then by cluster id.
        clustered_pmids = data['pmids'][str(sample_id)]
        for cluster_id_str, pmids_list in clustered_pmids.items():

            model_export_path = 'data_processing_feb2020/abstracts_{}_cluster{}_{}.model'.format(
                                    sample_id, cluster_id_str, model_export_prefix)
            model_names.append(model_export_path)

            print('calling pmids2vec...')
            pmids2vec(pmids_list, model_export_path)
    

calling pmids2vec...
SQL join executed in 41.85774827003479 s
SQL results fetched and cast in 0.015706539154052734 s
training word2vec model...
params: 20 dimensions, 5 window size, 10 min count
elapsed: 7.961929798126221
calling pmids2vec...
SQL join executed in 40.65176296234131 s
SQL results fetched and cast in 0.006037235260009766 s
training word2vec model...
params: 20 dimensions, 5 window size, 10 min count
elapsed: 3.0556695461273193
calling pmids2vec...
SQL join executed in 44.107752561569214 s
SQL results fetched and cast in 0.013501405715942383 s
training word2vec model...
params: 20 dimensions, 5 window size, 10 min count
elapsed: 8.80967116355896
calling pmids2vec...
SQL join executed in 39.87824749946594 s
SQL results fetched and cast in 0.016404390335083008 s
training word2vec model...
params: 20 dimensions, 5 window size, 10 min count
elapsed: 7.9926722049713135
calling pmids2vec...
SQL join executed in 44.23684525489807 s
SQL results fetched and cast in 0.00403690338134

In [5]:
# Recreate and save the corpus for good measure: call pmids2corpus once per
# (sample, cluster) pair. Runs only when `embed_abstracts_text` is set.

if embed_abstracts_text:

    # Load the clustered PMIDs once, up front: the file is the same for every
    # sample, so re-parsing it on each loop iteration was redundant work.
    with open(path2clusteredPMIDs, 'r') as f:
        data = json.load(f)

    for sample_id in range(N_samples):
        # NOTE(review): despite the name, this collects corpus export paths here,
        # and it is reset for every sample -- confirm that is intended.
        model_names = []
        sample_pmids = []   # not used in this cell; kept so the name stays defined

        # data['pmids'] is keyed by sample id (as a string), then by cluster id.
        clustered_pmids = data['pmids'][str(sample_id)]
        for cluster_id_str, pmids_list in clustered_pmids.items():

            corpus_export_path = 'data_processing_feb2020/abstracts_{}_cluster{}_{}_corpus.json'.format(
                                    sample_id, cluster_id_str, model_export_prefix)
            model_names.append(corpus_export_path)

            print('calling pmids2corpus...')
            pmids2corpus(pmids_list, corpus_export_path)


calling pmids2corpus...
SQL join executed in 40.627949714660645 s
SQL results fetched and cast in 0.010574579238891602 s
saving new work to data_processing_feb2020/abstracts_0_cluster0_2018_50k_corpus.json
calling pmids2corpus...
SQL join executed in 42.31782245635986 s
SQL results fetched and cast in 0.00613093376159668 s
saving new work to data_processing_feb2020/abstracts_0_cluster1_2018_50k_corpus.json
calling pmids2corpus...
SQL join executed in 42.913472175598145 s
SQL results fetched and cast in 0.015854597091674805 s
saving new work to data_processing_feb2020/abstracts_0_cluster2_2018_50k_corpus.json
calling pmids2corpus...
SQL join executed in 42.918598890304565 s
SQL results fetched and cast in 0.010352373123168945 s
saving new work to data_processing_feb2020/abstracts_1_cluster0_2018_50k_corpus.json
calling pmids2corpus...
SQL join executed in 43.48508048057556 s
SQL results fetched and cast in 0.0036995410919189453 s
saving new work to data_processing_feb2020/abstracts_1_cl

In [6]:
### embed titles only

In [5]:
# Titles only (saves the corpus while training the titles-only models).

# TODO: force samples to have the same size and test the impact of vocabulary?
#    Or just test variance over multiple sample sets to check for effects of
#    sample-size asymmetry.
# TODO: need a much larger set of PMIDs since this is titles only.

# Call pmids2vec_titlesOnly once per (sample, cluster) pair.
# Runs only when `embed_titles_text` is set.

if embed_titles_text:

    # Load the clustered PMIDs once, up front: the file is the same for every
    # sample, so re-parsing it on each loop iteration was redundant work.
    with open(path2clusteredPMIDs, 'r') as f:
        data = json.load(f)

    for sample_id in range(N_samples):
        # NOTE(review): reset here but never appended to in this cell -- confirm
        # whether anything downstream expects it to be filled.
        model_names = []
        sample_pmids = []   # not used in this cell; kept so the name stays defined

        # data['pmids'] is keyed by sample id (as a string), then by cluster id.
        clustered_pmids = data['pmids'][str(sample_id)]
        for cluster_id_str, pmids_list in clustered_pmids.items():

            # No file extension: this is passed as a path prefix (the logged
            # corpus file name is this prefix plus '_titles__corpus.json').
            model_export_path = 'data_processing_feb2020/titles_{}_cluster{}_{}'.format(
                    sample_id, cluster_id_str, model_export_prefix)

            print('calling pmids2vec...')
            pmids2vec_titlesOnly(pmids_list, model_export_path)


calling pmids2vec...
SQL join executed in 31.25151252746582 s
SQL results fetched and cast in 0.04737281799316406 s
saving corpus of titles to data_processing_feb2020/titles_0_cluster0_2018_50k_titles__corpus.json
training word2vec model...
params: 20 dimensions, 25 window size, 10 min count
elapsed: 3.403808355331421
calling pmids2vec...
SQL join executed in 27.916691303253174 s
SQL results fetched and cast in 0.007487297058105469 s
saving corpus of titles to data_processing_feb2020/titles_0_cluster1_2018_50k_titles__corpus.json
training word2vec model...
params: 20 dimensions, 25 window size, 10 min count
elapsed: 0.41690778732299805
calling pmids2vec...
SQL join executed in 27.726476430892944 s
SQL results fetched and cast in 0.017646074295043945 s
saving corpus of titles to data_processing_feb2020/titles_0_cluster2_2018_50k_titles__corpus.json
training word2vec model...
params: 20 dimensions, 25 window size, 10 min count
elapsed: 1.23164963722229
calling pmids2vec...
SQL join execu