In [1]:
import pymysql
import pickle
import json

import math
import numpy as np
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['svg.fonttype'] = 'none'

import time
import hdbscan


import random
import re
from pmids2vec import pmids2vec, pmids2vec_titlesOnly
from pmids2corpus import pmids2corpus    #  todo integrate this into pmids2vec

In [2]:
# --- Sampling configuration ---------------------------------------------------
year = 2018          # publication year whose PMIDs are sampled
K_sample = 100000    # rule of thumb - at least 5K samples per cluster for abstracts analysis
                     #               - at least 25K samples per cluster for titles analysis
N_samplesets = 3     # number of independent sample sets to draw

# --- Export names ---------------------------------------------------------------
# Built from `year` and `K_sample` so the file names cannot drift away from the
# parameters that were actually used for the run.
# Whole thousands are abbreviated ('100k'); any other size is written out in full.
sample_size_tag = '{}k'.format(K_sample // 1000) if K_sample % 1000 == 0 else str(K_sample)
model_export_prefix = '{}_{}_baseline'.format(year, sample_size_tag)
target_file = 'data_processing_feb2020/pmids_{}.json'.format(model_export_prefix)  # name to assign exported results

# --- Input location -------------------------------------------------------------
path2dir = '/home/brendan/FastData/pubmed2019/pubmed_data_processing/year_pmids/'  # knowledge-garden
#path2dir = '/project2/jevans/brendan/pubmed_data_processing/year_pmids/'  # RCC Midway2

In [3]:
# JSON file holding the MySQL credentials (read below for 'user' and 'lock').
config_path = '/home/brendan/Projects/AttentionWildfires/attention_wildfires/mysql_config.json'

# db name collisions? https://stackoverflow.com/questions/14011968/user-cant-access-a-database
# todo should move this db_name into config file
db_name = 'test_pubmed'

with open(config_path, 'r') as f:
    config_data = json.load(f)

# Keyword arguments for pymysql.connect; the password is stored under the 'lock' key.
client_config = dict(
    database=db_name,
    user=config_data['user'],
    password=config_data['lock'],
)

## init db connection
db = pymysql.connect(**client_config)


In [4]:
# Load the list of PMIDs published in `year` from the per-year state file.
filename = 'pubmed_state_{}'.format(year)
path2pmids = path2dir + filename

with open(path2pmids, 'r') as f:
    # Only the 'publications' entry is kept; the rest of the parsed file is
    # not bound to a name, so nothing needs cleaning up afterwards.
    year_pub_pmids = json.load(f)['publications']

N_pubs = len(year_pub_pmids)
print("N pubs: {}".format(N_pubs))


N pubs: 1205220


In [5]:
# Draw N_samplesets random sample sets of K_sample PMIDs each from this year's publications.
SAMPLE_SEED = 2020   # fixed seed so the exported baseline samples are reproducible; change it to redraw
rng = np.random.RandomState(SAMPLE_SEED)

pmid_population = np.asarray(year_pub_pmids)   # convert the list once instead of once per draw

samples = {}
for i in range(N_samplesets):
    samples[i] = {}
    # replace=False: choice() samples WITH replacement by default, which would put
    # duplicate PMIDs into a sample set. With replace=False numpy raises ValueError
    # if K_sample exceeds the population size.
    samples[i]['pmids'] = rng.choice(pmid_population, K_sample, replace=False)

In [6]:
# Convert every sample's numpy array to a plain list so it can be JSON-serialised.
pmids = {}  # for export
for i_sample in range(N_samplesets):
    print("sample {}".format(i_sample))

    sampled_array = samples[i_sample]['pmids']
    pmids[i_sample] = sampled_array.tolist()   # for export
                

sample 0
sample 1
sample 2


In [7]:
# Write the sampled PMIDs to target_file.
# The integer sample ids become string keys in the JSON output.
save_obj = dict(pmids=pmids)

with open(target_file, 'w') as f:
    json.dump(save_obj, f, sort_keys=True, indent=2)

In [8]:
# Switches for the embedding cells below; both embeddings are enabled.
embed_titles_text = embed_abstracts_text = True

# use the file we just saved
path2baselinePMIDs = target_file

# One entry per sample set held in `pmids`.
N_samples = len(pmids)

In [9]:

# Run pmids2vec on the abstracts of each baseline sample set
# (its log output reports training a word2vec model per call).
if embed_abstracts_text:

    # Read the exported PMIDs once; the file does not change between iterations.
    with open(path2baselinePMIDs, 'r') as f:
        data = json.load(f)

    # Created before the loop so it accumulates every export path rather than
    # being reset on each iteration.
    model_names = []

    for sample_id in range(N_samples):
        # JSON object keys are strings, hence str(sample_id).
        clustered_pmids = data['pmids'][str(sample_id)]

        model_export_path = 'data_processing_feb2020/abstracts_{}_baseline_{}.model'.format(
                                sample_id, model_export_prefix)
        model_names.append(model_export_path)

        print('calling pmids2vec...')
        # Use the PMIDs read back from disk so the models match the exported
        # baseline file rather than whatever `pmids` currently holds in the kernel.
        pmids2vec(clustered_pmids, model_export_path)

    print('--------------')
    

calling pmids2vec...
SQL join executed in 40.69608807563782 s
fetched 100000 rows...
fetched 100000 rows...
fetched 100000 rows...
SQL results fetched and cast in 0.6567766666412354 s
training word2vec model...
params: 20 dimensions, 5 window size, 5000 min count
elapsed: 204.728924036026
calling pmids2vec...
SQL join executed in 49.44707918167114 s
fetched 100000 rows...
fetched 100000 rows...
fetched 100000 rows...
SQL results fetched and cast in 0.7187507152557373 s
training word2vec model...
params: 20 dimensions, 5 window size, 5000 min count
elapsed: 243.15041422843933
calling pmids2vec...
SQL join executed in 43.64170861244202 s
fetched 100000 rows...
fetched 100000 rows...
fetched 100000 rows...
SQL results fetched and cast in 0.7757980823516846 s
training word2vec model...
params: 20 dimensions, 5 window size, 5000 min count
elapsed: 135.54032826423645
--------------


In [10]:
# recreate & save the corpus for good measure

if embed_abstracts_text:

    # Read the exported PMIDs once; the file does not change between iterations.
    with open(path2baselinePMIDs, 'r') as f:
        data = json.load(f)

    # Created before the loop so it accumulates every export path rather than
    # being reset on each iteration. Note that it collects corpus paths here.
    model_names = []

    for sample_id in range(N_samples):
        corpus_export_path = 'data_processing_feb2020/abstracts_{}_baseline_{}_corpus.json'.format(
                                sample_id, model_export_prefix)
        model_names.append(corpus_export_path)

        print('calling pmids2corpus...')
        # JSON object keys are strings, hence str(sample_id). Reading the PMIDs
        # from the exported file keeps the corpus tied to the saved baseline.
        pmids2corpus(data['pmids'][str(sample_id)], corpus_export_path)

    print('---------------')


calling pmids2corpus...
SQL join executed in 41.72226142883301 s
fetched 100000 rows...
fetched 100000 rows...
fetched 100000 rows...
SQL results fetched and cast in 0.4413745403289795 s
saving new work to data_processing_feb2020/abstracts_0_baseline_2018_500k_baseline_corpus.json
calling pmids2corpus...
SQL join executed in 35.266695976257324 s
fetched 100000 rows...
fetched 100000 rows...
fetched 100000 rows...
SQL results fetched and cast in 0.3869459629058838 s
saving new work to data_processing_feb2020/abstracts_1_baseline_2018_500k_baseline_corpus.json
calling pmids2corpus...
SQL join executed in 46.53581190109253 s
fetched 100000 rows...
fetched 100000 rows...
fetched 100000 rows...
SQL results fetched and cast in 0.5192344188690186 s
saving new work to data_processing_feb2020/abstracts_2_baseline_2018_500k_baseline_corpus.json
---------------


In [11]:
# Run pmids2vec_titlesOnly on the titles of each baseline sample set.

if embed_titles_text:

    #  need a large set of pmids since this is titles only
    # Read the exported PMIDs once; the file does not change between iterations.
    with open(path2baselinePMIDs, 'r') as f:
        data = json.load(f)

    # Created before the loop and filled on every iteration, matching the
    # abstracts cell. The entries are export *prefixes*: the helper's log output
    # shows it adds its own suffixes (e.g. '..._titles__corpus.json').
    model_names = []

    for sample_id in range(N_samples):
        model_export_path = 'data_processing_feb2020/titles_{}_baseline_{}'.format(
                sample_id, model_export_prefix)
        model_names.append(model_export_path)

        print('calling pmids2vec...')
        # JSON object keys are strings, hence str(sample_id). Reading the PMIDs
        # from the exported file keeps the models tied to the saved baseline.
        pmids2vec_titlesOnly(data['pmids'][str(sample_id)], model_export_path)

        print('---------------------------------')


calling pmids2vec...
SQL join executed in 42.63529896736145 s
fetched 100000 rows...
fetched 100000 rows...
fetched 100000 rows...
SQL results fetched and cast in 0.3005862236022949 s
saving corpus of titles to data_processing_feb2020/titles_0_baseline_2018_500k_baseline_titles__corpus.json
training word2vec model...
params: 20 dimensions, 5 window size, 500 min count
elapsed: 20.610513925552368
---------------------------------
calling pmids2vec...
SQL join executed in 48.69512963294983 s
fetched 100000 rows...
fetched 100000 rows...
fetched 100000 rows...
SQL results fetched and cast in 0.31005001068115234 s
saving corpus of titles to data_processing_feb2020/titles_1_baseline_2018_500k_baseline_titles__corpus.json
training word2vec model...
params: 20 dimensions, 5 window size, 500 min count
elapsed: 16.33456301689148
---------------------------------
calling pmids2vec...
SQL join executed in 49.06857490539551 s
fetched 100000 rows...
fetched 100000 rows...
fetched 100000 rows...
SQL