In [1]:
import matplotlib as mpl

import matplotlib.pyplot as plt
#import seaborn as sns
plt.rcParams['svg.fonttype'] = 'none'
plt.rcParams['pdf.fonttype'] = 42
plt.rcParams['ps.fonttype'] = 42

import networkx as nx

import pymysql
import pickle
import json

import numpy as np
from sklearn.decomposition import PCA

import time
import hdbscan
import umap

import spacy
from gensim.models import Word2Vec

import random
import re
from pmids2vec import pmids2vec, pmids2vec_titlesOnly
from pmids2corpus import pmids2corpus    #  todo integrate this into pmids2vec

### control parameters

In [2]:
# Run configuration: input file, naming tag for exported artefacts, and the
# switches that select which embedding passes the cells below execute.
# NOTE(review): the saved cell outputs further down show abstract runs and a
# '2018_200k' tag, so they appear to come from an earlier configuration --
# re-run before trusting them.
path2clusteredPMIDs = "data_processing_feb2020/pmids_2018_250k.json"  # JSON read by the cells below
model_export_prefix = "2018_250k"  # tag formatted into exported model/corpus file names
embed_titles_text, embed_abstracts_text = True, False  # titles pass on, abstracts pass off


### get metadata

In [3]:
# Load the clustered-PMID metadata. The cells below index data['pmids'] first
# by sample id (as a string) and then by cluster id.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so the read
# does not depend on the platform's locale default.
with open(path2clusteredPMIDs, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Number of sample sets; the later cells loop over range(N_samples).
N_samples = len(data['pmids'])

In [4]:
### embed abstract text

In [5]:
# Call pmids2vec once per (sample, cluster); presumably this is the
# abstract-text pass (the saved output shows it running a SQL join and then
# training a word2vec model).
#
# TODO: force samples to have the same size and test the impact of vocabulary,
#       or test variance over multiple sample sets to check for effects of
#       sample-size asymmetry.

if embed_abstracts_text:

    # Load the clustered PMIDs once: the file does not change between samples,
    # so re-reading it on every iteration only repeated the same I/O.
    with open(path2clusteredPMIDs, 'r', encoding='utf-8') as f:
        data = json.load(f)

    for sample_id in range(N_samples):
        # Export paths for the current sample only -- the list is reset on
        # every sample, so after the loop it holds the last sample's paths.
        model_names = []

        # data['pmids'] is keyed by the sample id as a string; each entry maps
        # a cluster id string to that cluster's list of PMIDs.
        clustered_pmids = data['pmids'][str(sample_id)]
        for cluster_id_str, pmids_list in clustered_pmids.items():

            model_export_path = 'data_processing_feb2020/abstracts_{}_cluster{}_{}.model'.format(
                                    sample_id, cluster_id_str, model_export_prefix)
            model_names.append(model_export_path)

            print('calling pmids2vec...')
            pmids2vec(pmids_list, model_export_path)

    # Separator printed once, after all samples have been processed.
    print('--------------')
    

calling pmids2vec...
SQL join executed in 28.77197265625 s
SQL results fetched and cast in 0.020276546478271484 s
training word2vec model...
params: 20 dimensions, 5 window size, 165 min count
elapsed: 7.030775308609009
calling pmids2vec...
SQL join executed in 38.83341073989868 s
SQL results fetched and cast in 0.013315200805664062 s
training word2vec model...
params: 20 dimensions, 5 window size, 79 min count
elapsed: 3.1573305130004883
calling pmids2vec...
SQL join executed in 33.82392621040344 s
SQL results fetched and cast in 0.004324197769165039 s
training word2vec model...
params: 20 dimensions, 5 window size, 45 min count
elapsed: 2.3456482887268066
calling pmids2vec...
SQL join executed in 34.66364002227783 s
SQL results fetched and cast in 0.004043102264404297 s
training word2vec model...
params: 20 dimensions, 5 window size, 23 min count
elapsed: 1.3864455223083496
calling pmids2vec...
SQL join executed in 35.77225875854492 s
SQL results fetched and cast in 0.002981901168823

In [6]:
# Recreate and save the corpus for each (sample, cluster) for good measure.

if embed_abstracts_text:

    # Load the clustered PMIDs once: the file does not change between samples,
    # so re-reading it on every iteration only repeated the same I/O.
    with open(path2clusteredPMIDs, 'r', encoding='utf-8') as f:
        data = json.load(f)

    for sample_id in range(N_samples):
        # NOTE(review): despite its name this list collects *corpus* export
        # paths here, and it is reset on every sample -- confirm whether any
        # later cell relies on it.
        model_names = []

        # data['pmids'] is keyed by the sample id as a string; each entry maps
        # a cluster id string to that cluster's list of PMIDs.
        clustered_pmids = data['pmids'][str(sample_id)]
        for cluster_id_str, pmids_list in clustered_pmids.items():

            corpus_export_path = 'data_processing_feb2020/abstracts_{}_cluster{}_{}_corpus.json'.format(
                                    sample_id, cluster_id_str, model_export_prefix)
            model_names.append(corpus_export_path)

            print('calling pmids2corpus...')
            pmids2corpus(pmids_list, corpus_export_path)

    # Separator printed once, after all samples have been processed.
    print('---------------')


calling pmids2corpus...
SQL join executed in 25.995201349258423 s
SQL results fetched and cast in 0.021398305892944336 s
saving new work to data_processing_feb2020/abstracts_0_cluster0_2018_200k_corpus.json
calling pmids2corpus...
SQL join executed in 27.134095430374146 s
SQL results fetched and cast in 0.0072422027587890625 s
saving new work to data_processing_feb2020/abstracts_0_cluster1_2018_200k_corpus.json
calling pmids2corpus...
SQL join executed in 27.42020320892334 s
SQL results fetched and cast in 0.0070378780364990234 s
saving new work to data_processing_feb2020/abstracts_0_cluster2_2018_200k_corpus.json
calling pmids2corpus...
SQL join executed in 26.31787919998169 s
SQL results fetched and cast in 0.0035982131958007812 s
saving new work to data_processing_feb2020/abstracts_0_cluster3_2018_200k_corpus.json
calling pmids2corpus...
SQL join executed in 27.294188022613525 s
SQL results fetched and cast in 0.004287004470825195 s
saving new work to data_processing_feb2020/abstrac

In [7]:
### embed titles only

In [8]:
# Titles only (saves the corpus while training the titles-only models).
#
# TODO: force samples to have the same size and test the impact of vocabulary,
#       or test variance over multiple sample sets to check for effects of
#       sample-size asymmetry.
# TODO: need a much larger set of PMIDs since this is titles only.

if embed_titles_text:

    # Load the clustered PMIDs once: the file does not change between samples,
    # so re-reading it on every iteration only repeated the same I/O.
    with open(path2clusteredPMIDs, 'r', encoding='utf-8') as f:
        data = json.load(f)

    for sample_id in range(N_samples):
        # NOTE(review): reset on every sample but never appended to in this
        # cell -- confirm whether the export paths were meant to be collected.
        model_names = []

        # data['pmids'] is keyed by the sample id as a string; each entry maps
        # a cluster id string to that cluster's list of PMIDs.
        clustered_pmids = data['pmids'][str(sample_id)]
        for cluster_id_str, pmids_list in clustered_pmids.items():

            # No file extension is appended here, unlike the abstracts cell.
            model_export_path = 'data_processing_feb2020/titles_{}_cluster{}_{}'.format(
                    sample_id, cluster_id_str, model_export_prefix)

            print('calling pmids2vec...')
            pmids2vec_titlesOnly(pmids_list, model_export_path)

        # Separator printed after each sample.
        print('---------------------------------')
