In [1]:
# take a look at multiple representation methods

# determine which are doing worse and which are doing better

In [4]:
import numpy as np
import csv
import os
import time
import json
import mysql.connector as mysql

from sklearn.decomposition import PCA
import umap

import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm  # for kdemultivariate 

In [18]:
client_config = {'unix_socket':'/home/brendanchambers/.sql.sock',
                            'database':'test_pubmed',
                            'use_pure':True}



journals = ['J. Neurophysiol.', 'Neuroimage']  # journals = ['Brain Res.', 'J. Virol.']

journal_colors = [[0.75, 0.75, 0, 0.5],
                  [0, 0.75, 0.75, 0.5]]


# table names corresponding to embedding conditions
embedding_conditions = ['emb_bert_cls',
               'emb_bert_longtokens_mean',
               'emb_bert_tokens_mean',
               'emb_scibert_cls',
               'emb_scibert_longtokens_mean',
               'emb_scibert_tokens_mean']

# todo also check representations composed from cls & tokens
#  maybe directly optimize on this

compare_embeddings = {}


In [13]:
for condition in embedding_conditions:

    compare_embeddings[condition] = {}

    db = mysql.connect(**client_config)

    for i_journal, journal in enumerate(journals):

        start_time = time.time()

        sql = '''SELECT {}.embedding
                FROM {}
                JOIN metadata ON {}.pmid=metadata.pmid
                WHERE metadata.journal = '{}' '''.format(condition, condition, condition, journal)
        print(sql)

        cursor = db.cursor()
        cursor.execute(sql)
        results = cursor.fetchall()
        cursor.close()
                
        compare_embeddings[condition][journal] = [np.frombuffer(e[0],
                                                                dtype="float64") for e in results]

        end_time = time.time()
        print("journals enumerated in {} s".format(end_time - start_time))
        print()
        
    db.close()
    
    # take a quick look    
    print(condition, journal)
    print('first few entries of the embedding vector ok?: ')
    print(compare_embeddings[condition][journal][0][:10])
    print()

SELECT emb_bert_cls.embedding
                FROM emb_bert_cls
                JOIN metadata ON emb_bert_cls.pmid=metadata.pmid
                WHERE metadata.journal = 'J. Neurophysiol.' 
journals enumerated in 0.4591634273529053 s

SELECT emb_bert_cls.embedding
                FROM emb_bert_cls
                JOIN metadata ON emb_bert_cls.pmid=metadata.pmid
                WHERE metadata.journal = 'Neuroimage' 
journals enumerated in 0.5635430812835693 s

emb_bert_cls Neuroimage
first few entries of the embedding vector ok?: 
[-0.26149625 -0.54567975 -0.34452844  0.1171126  -0.10863885  0.27445945
 -0.35322297  0.24213642  0.14350182 -0.66447151]

SELECT emb_bert_longtokens_mean.embedding
                FROM emb_bert_longtokens_mean
                JOIN metadata ON emb_bert_longtokens_mean.pmid=metadata.pmid
                WHERE metadata.journal = 'J. Neurophysiol.' 
journals enumerated in 0.16512131690979004 s

SELECT emb_bert_longtokens_mean.embedding
                FROM emb_b

In [14]:
print(compare_embeddings.keys())

dict_keys(['emb_bert_cls', 'emb_bert_longtokens_mean', 'emb_bert_tokens_mean', 'emb_scibert_cls', 'emb_scibert_longtokens_mean', 'emb_scibert_tokens_mean'])


In [19]:
# todo would be nice to align these spaces and then compare

D = 768
D_pca = 2   # most of the variance preserved at around 300 d

pca_models = []

for i_cond, condition in enumerate(embedding_conditions):
    
    all_articles = np.zeros((0,D))
    
    # full list of samples across journals, to feed to pca
    for journal in journals:
        thisjournal = compare_embeddings[condition][journal]
        all_articles = np.vstack( (all_articles, thisjournal))
        
    # get pca for this embedding condition
    pca = PCA(n_components=D_pca)
    pc = pca.fit_transform(all_articles)
    pca_models.append(pca)
    
    # var explained
    plt.figure(figsize=(1,1))
    plt.plot(np.cumsum(pca.explained_variance_ratio_))
    plt.title('PCA component var explained')  
    plt.show()
    
    ## 2D pca comparison of journals ##
    (f, ax) = plt.subplots(1, len(journals), sharex='all', sharey='all',
                      figsize=(8,4))
    
    for i_journal, journal in enumerate(journals):
        print(journal)
        samples = compare_embeddings[embedding_conditions[i_cond]][journals[i_journal]]
        print(len(samples))
        pc = pca_models[i_cond].transform(samples)
        ax[i_journal].scatter(pc[:,0], pc[:,1],
                    color=journal_colors[i_journal],marker='.')

    plt.title('2D pca for {} '.format(condition))
    
    


#plt.savefig('J Neurophysiol vs Neuroimaging.png')
#plt.savefig('J Neurophysiol vs Neuroimaging.svg')

KeyError: 'emb_bert_cls'