In [5]:
#  step through years and visualize in PCA space

import mysql.connector as mysql
import pickle
import json

import numpy as np
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['svg.fonttype'] = 'none'

import time

### mysql client

In [9]:
# Connection settings handed to mysql.connect(**client_config) by the query cells below.
# db name collisions? https://stackoverflow.com/questions/14011968/user-cant-access-a-database
db_name = 'test_pubmed'
client_config = dict(
    unix_socket='/home/brendanchambers/.sql.sock',
    database=db_name,
    use_pure=True,  # for python connector
)

## load pre-fit pca model

In [7]:
# Restore the pickled PCA model from disk.
# NOTE(review): pickle.load can execute arbitrary code contained in the file,
# so only point pca_path at model files you produced or otherwise trust.
pca_path = 'develop_samples/pca_model1.pkl'  # more located at /project2...
with open(pca_path, mode='rb') as pkl_handle:
    pca_model = pickle.load(pkl_handle)

# load year publication pmids & join to embeddings

(TODO: join to text as well)

In [10]:
start_year = 2000
end_year = 2005
D_truncate = 10  # number of leading PCA coordinates kept per document

path2dir = '/project2/jevans/brendan/pubmed_data_processing/year_pmids/'

# year -> array holding the first D_truncate PCA coordinates of each publication
year_pubs = {}

for year in range(start_year, end_year+1):

    print(year)

    # per-year JSON file; its 'publications' entry lists the pmids to look up
    filename = 'pubmed_state_{}'.format(year)
    path2pmids = path2dir + filename
    with open(path2pmids,'r') as f:
        data = json.load(f)

    year_pub_pmids = data['publications']
    del data # clean up

    if not year_pub_pmids:
        # an empty "IN ()" list is rejected by MySQL, so skip the query for this year
        print("no publication pmids listed for {}".format(year))
        year_pubs[year] = np.empty((0, D_truncate))
        print()
        continue

    # int() guarantees that only numeric literals are spliced into the SQL text
    str_fmt = ', '.join([str(int(pmid)) for pmid in year_pub_pmids])

    sql = '''SELECT E.pmid, E.embedding
            FROM scibert_mean_embedding as E
            WHERE E.pmid IN ({})'''.format(str_fmt)

    db = mysql.connect(**client_config)
    try:
        start_time = time.time()
        cursor = db.cursor()
        cursor.execute(sql)
        output = cursor.fetchall()
        cursor.close()

        end_time = time.time()
        elapsed = end_time - start_time
        print("sql join executed in {} s".format(elapsed))
    finally:
        # release the connection even when the query raises
        db.close()

    # each row is (pmid, embedding blob); the blob is decoded as a float64 vector
    pub_embeddings = np.array([np.frombuffer(row[1],dtype='float64') for row in output])
    print(np.shape(pub_embeddings))

    # Project with the model exactly as loaded. fit_transform() would re-fit the
    # PCA on every year's data, giving each year different axes and making the
    # years impossible to compare in one space.
    year_pubs[year] = pca_model.transform(pub_embeddings)[:,:D_truncate]

    print()

2000
sql join executed in 500.9125597476959 s
(518190, 768)

2001
sql join executed in 474.5056354999542 s
(537436, 768)

2002
sql join executed in 427.461900472641 s
(549713, 768)

2003
sql join executed in 438.1993021965027 s
(568117, 768)

2004
sql join executed in 457.96510910987854 s
(609031, 768)

2005
sql join executed in 472.5503866672516 s
(641928, 768)



In [11]:
# Persist the per-year publication coordinates as JSON.
year_pubs_path = 'year_pubs_test.json'

# numpy arrays are not JSON-serializable, so each year's array becomes nested lists
year_pubs_lists = {
    year: year_pubs[year].tolist()
    for year in range(start_year, end_year + 1)
}

save_obj = dict(year_pubs_lists=year_pubs_lists, D_truncate=D_truncate)
with open(year_pubs_path, mode='w') as json_out:
    json.dump(save_obj, json_out)

# load year citation pmids, join to embeddings

In [None]:
# year -> array holding the first D_truncate PCA coordinates of each cited document
year_cites = {}

for year in range(start_year, end_year+1):

    print(year)

    # per-year JSON file; its 'citations' entry lists the pmids to look up
    filename = 'pubmed_state_{}'.format(year)
    path2pmids = path2dir + filename
    with open(path2pmids,'r') as f:
        data = json.load(f)

    year_cite_pmids = data['citations']
    del data # clean up

    if not year_cite_pmids:
        # an empty "IN ()" list is rejected by MySQL, so skip the query for this year
        print("no citation pmids listed for {}".format(year))
        year_cites[year] = np.empty((0, D_truncate))
        print()
        continue

    # int() guarantees that only numeric literals are spliced into the SQL text
    str_fmt = ', '.join([str(int(pmid)) for pmid in year_cite_pmids])

    sql = '''SELECT E.pmid, E.embedding
            FROM scibert_mean_embedding as E
            WHERE E.pmid IN ({})'''.format(str_fmt)

    db = mysql.connect(**client_config)
    try:
        start_time = time.time()
        cursor = db.cursor()
        cursor.execute(sql)
        output = cursor.fetchall()
        cursor.close()

        end_time = time.time()
        elapsed = end_time - start_time
        print("sql join executed in {} s".format(elapsed))
    finally:
        # release the connection even when the query raises
        db.close()

    # each row is (pmid, embedding blob); the blob is decoded as a float64 vector
    cite_embeddings = np.array([np.frombuffer(row[1],dtype='float64') for row in output])
    print(np.shape(cite_embeddings))

    # Project with the model exactly as loaded. fit_transform() would re-fit the
    # PCA on every year's data, giving each year different axes and making the
    # years impossible to compare in one space.
    year_cites[year] = pca_model.transform(cite_embeddings)[:,:D_truncate]

    print()

2000


In [None]:
# Persist the per-year citation coordinates as JSON.
year_cites_path = 'year_cites_test.json'

# numpy arrays are not JSON-serializable, so convert each year's array to nested lists.
# Reads from year_cites: the previous `year_pubs_list` name was undefined and, had it
# existed, would have written publication data under the citations key.
year_cites_lists = {}
for year in range(start_year, end_year+1):
    year_cites_lists[year] = year_cites[year].tolist()

save_obj = {'year_cites_lists': year_cites_lists,
            'D_truncate': D_truncate}

with open(year_cites_path,'w') as f:
    json.dump(save_obj, f)


### plot publications and citations

In [None]:
# One figure for this year range: a row per year, publications on the left and
# citations on the right, each drawn as a 2-D density over the first two
# columns of the stored PCA coordinates.
start_year = 1958
end_year = 1963

n_rows = end_year - start_year + 1
f, ax = plt.subplots(n_rows, 2,
                     figsize=(4, 12),
                     sharex='all', sharey='all')

for i_year, year in enumerate(range(start_year, end_year + 1)):
    print(i_year, year)

    # left column: publications
    pubs_xy = year_pubs[year]
    sns.kdeplot(pubs_xy[:, 0], pubs_xy[:, 1],
                ax=ax[i_year, 0], shade=True, cmap='Blues')
    ax[i_year, 0].set_title('published: year {}'.format(year))

    # right column: citations
    cites_xy = year_cites[year]
    sns.kdeplot(cites_xy[:, 0], cites_xy[:, 1],
                ax=ax[i_year, 1], shade=True, cmap='Reds')
    ax[i_year, 1].set_title('cited: {}'.format(year))

# save as PNG and SVG, tagged with the first year of the range
for out_name in ('publications and citations prototype {}.png',
                 'publications and citations prototype {}.svg'):
    plt.savefig(out_name.format(start_year))
plt.show()

In [None]:
# One figure for this year range: a row per year, publications on the left and
# citations on the right, each drawn as a 2-D density over the first two
# columns of the stored PCA coordinates.
start_year = 1968
end_year = 1973

n_rows = end_year - start_year + 1
f, ax = plt.subplots(n_rows, 2,
                     figsize=(4, 12),
                     sharex='all', sharey='all')

for i_year, year in enumerate(range(start_year, end_year + 1)):
    print(i_year, year)

    # left column: publications
    pubs_xy = year_pubs[year]
    sns.kdeplot(pubs_xy[:, 0], pubs_xy[:, 1],
                ax=ax[i_year, 0], shade=True, cmap='Blues')
    ax[i_year, 0].set_title('published: year {}'.format(year))

    # right column: citations
    cites_xy = year_cites[year]
    sns.kdeplot(cites_xy[:, 0], cites_xy[:, 1],
                ax=ax[i_year, 1], shade=True, cmap='Reds')
    ax[i_year, 1].set_title('cited: {}'.format(year))

# save as PNG and SVG, tagged with the first year of the range
for out_name in ('publications and citations prototype {}.png',
                 'publications and citations prototype {}.svg'):
    plt.savefig(out_name.format(start_year))
plt.show()

In [None]:
# One figure for this year range: a row per year, publications on the left and
# citations on the right, each drawn as a 2-D density over the first two
# columns of the stored PCA coordinates.
start_year = 1978
end_year = 1983

n_rows = end_year - start_year + 1
f, ax = plt.subplots(n_rows, 2,
                     figsize=(4, 12),
                     sharex='all', sharey='all')

for i_year, year in enumerate(range(start_year, end_year + 1)):
    print(i_year, year)

    # left column: publications
    pubs_xy = year_pubs[year]
    sns.kdeplot(pubs_xy[:, 0], pubs_xy[:, 1],
                ax=ax[i_year, 0], shade=True, cmap='Blues')
    ax[i_year, 0].set_title('published: year {}'.format(year))

    # right column: citations
    cites_xy = year_cites[year]
    sns.kdeplot(cites_xy[:, 0], cites_xy[:, 1],
                ax=ax[i_year, 1], shade=True, cmap='Reds')
    ax[i_year, 1].set_title('cited: {}'.format(year))

# save as PNG and SVG, tagged with the first year of the range
for out_name in ('publications and citations prototype {}.png',
                 'publications and citations prototype {}.svg'):
    plt.savefig(out_name.format(start_year))
plt.show()

In [None]:
# One figure for this year range: a row per year, publications on the left and
# citations on the right, each drawn as a 2-D density over the first two
# columns of the stored PCA coordinates.
start_year = 1988
end_year = 1993

n_rows = end_year - start_year + 1
f, ax = plt.subplots(n_rows, 2,
                     figsize=(4, 12),
                     sharex='all', sharey='all')

for i_year, year in enumerate(range(start_year, end_year + 1)):
    print(i_year, year)

    # left column: publications
    pubs_xy = year_pubs[year]
    sns.kdeplot(pubs_xy[:, 0], pubs_xy[:, 1],
                ax=ax[i_year, 0], shade=True, cmap='Blues')
    ax[i_year, 0].set_title('published: year {}'.format(year))

    # right column: citations
    cites_xy = year_cites[year]
    sns.kdeplot(cites_xy[:, 0], cites_xy[:, 1],
                ax=ax[i_year, 1], shade=True, cmap='Reds')
    ax[i_year, 1].set_title('cited: {}'.format(year))

# save as PNG and SVG, tagged with the first year of the range
for out_name in ('publications and citations prototype {}.png',
                 'publications and citations prototype {}.svg'):
    plt.savefig(out_name.format(start_year))
plt.show()

In [None]:
# One figure for this year range: a row per year, publications on the left and
# citations on the right, each drawn as a 2-D density over the first two
# columns of the stored PCA coordinates.
start_year = 1998
end_year = 2003

n_rows = end_year - start_year + 1
f, ax = plt.subplots(n_rows, 2,
                     figsize=(4, 12),
                     sharex='all', sharey='all')

for i_year, year in enumerate(range(start_year, end_year + 1)):
    print(i_year, year)

    # left column: publications
    pubs_xy = year_pubs[year]
    sns.kdeplot(pubs_xy[:, 0], pubs_xy[:, 1],
                ax=ax[i_year, 0], shade=True, cmap='Blues')
    ax[i_year, 0].set_title('published: year {}'.format(year))

    # right column: citations
    cites_xy = year_cites[year]
    sns.kdeplot(cites_xy[:, 0], cites_xy[:, 1],
                ax=ax[i_year, 1], shade=True, cmap='Reds')
    ax[i_year, 1].set_title('cited: {}'.format(year))

# save as PNG and SVG, tagged with the first year of the range
for out_name in ('publications and citations prototype {}.png',
                 'publications and citations prototype {}.svg'):
    plt.savefig(out_name.format(start_year))
plt.show()

In [None]:
# One figure for this year range: a row per year, publications on the left and
# citations on the right, each drawn as a 2-D density over the first two
# columns of the stored PCA coordinates.
start_year = 2003
end_year = 2008

n_rows = end_year - start_year + 1
f, ax = plt.subplots(n_rows, 2,
                     figsize=(4, 12),
                     sharex='all', sharey='all')

for i_year, year in enumerate(range(start_year, end_year + 1)):
    print(i_year, year)

    # left column: publications
    pubs_xy = year_pubs[year]
    sns.kdeplot(pubs_xy[:, 0], pubs_xy[:, 1],
                ax=ax[i_year, 0], shade=True, cmap='Blues')
    ax[i_year, 0].set_title('published: year {}'.format(year))

    # right column: citations
    cites_xy = year_cites[year]
    sns.kdeplot(cites_xy[:, 0], cites_xy[:, 1],
                ax=ax[i_year, 1], shade=True, cmap='Reds')
    ax[i_year, 1].set_title('cited: {}'.format(year))

# save as PNG and SVG, tagged with the first year of the range
for out_name in ('publications and citations prototype {}.png',
                 'publications and citations prototype {}.svg'):
    plt.savefig(out_name.format(start_year))
plt.show()

In [None]:
# One figure for this year range: a row per year, publications on the left and
# citations on the right, each drawn as a 2-D density over the first two
# columns of the stored PCA coordinates.
start_year = 2008
end_year = 2013

n_rows = end_year - start_year + 1
f, ax = plt.subplots(n_rows, 2,
                     figsize=(4, 12),
                     sharex='all', sharey='all')

for i_year, year in enumerate(range(start_year, end_year + 1)):
    print(i_year, year)

    # left column: publications
    pubs_xy = year_pubs[year]
    sns.kdeplot(pubs_xy[:, 0], pubs_xy[:, 1],
                ax=ax[i_year, 0], shade=True, cmap='Blues')
    ax[i_year, 0].set_title('published: year {}'.format(year))

    # right column: citations
    cites_xy = year_cites[year]
    sns.kdeplot(cites_xy[:, 0], cites_xy[:, 1],
                ax=ax[i_year, 1], shade=True, cmap='Reds')
    ax[i_year, 1].set_title('cited: {}'.format(year))

# save as PNG and SVG, tagged with the first year of the range
for out_name in ('publications and citations prototype {}.png',
                 'publications and citations prototype {}.svg'):
    plt.savefig(out_name.format(start_year))
plt.show()

In [None]:
# One figure for this year range: a row per year, publications on the left and
# citations on the right, each drawn as a 2-D density over the first two
# columns of the stored PCA coordinates.
start_year = 2013
end_year = 2018

n_rows = end_year - start_year + 1
f, ax = plt.subplots(n_rows, 2,
                     figsize=(4, 12),
                     sharex='all', sharey='all')

for i_year, year in enumerate(range(start_year, end_year + 1)):
    print(i_year, year)

    # left column: publications
    pubs_xy = year_pubs[year]
    sns.kdeplot(pubs_xy[:, 0], pubs_xy[:, 1],
                ax=ax[i_year, 0], shade=True, cmap='Blues')
    ax[i_year, 0].set_title('published: year {}'.format(year))

    # right column: citations
    cites_xy = year_cites[year]
    sns.kdeplot(cites_xy[:, 0], cites_xy[:, 1],
                ax=ax[i_year, 1], shade=True, cmap='Reds')
    ax[i_year, 1].set_title('cited: {}'.format(year))

# save as PNG and SVG, tagged with the first year of the range
for out_name in ('publications and citations prototype {}.png',
                 'publications and citations prototype {}.svg'):
    plt.savefig(out_name.format(start_year))
plt.show()

In [None]:
def plot_pubs_and_cites(start_year, end_year):
    """Plot publication and citation densities side by side, one row per year.

    For every year in [start_year, end_year] the left panel shows a 2-D KDE of
    columns 0 and 1 of ``year_pubs[year]`` and the right panel shows the same
    for ``year_cites[year]``. Both dicts are read from module scope. The figure
    is saved as PNG and SVG (file names carry the year range) and then shown.

    Parameters
    ----------
    start_year : int
        First year to plot (inclusive).
    end_year : int
        Last year to plot (inclusive).

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If a year in the range is missing from ``year_pubs`` or ``year_cites``.
    """
    # squeeze=False keeps `ax` two-dimensional even for a single-year range,
    # so the ax[i_year, col] indexing below always works
    (f, ax) = plt.subplots(end_year - start_year + 1,
                           2,
                           sharex='all', sharey='all',
                           figsize=(4,12),
                           squeeze=False)

    # The loop and the save/show calls belong inside the function; previously
    # they were dedented and ran at definition time against the globals.
    for i_year, year in enumerate(range(start_year, end_year+1)):

        print(i_year, year)

        sns.kdeplot(year_pubs[year][:,0],
                    year_pubs[year][:,1],
                    ax=ax[i_year,0],
                    shade=True,
                    cmap='Blues')
        ax[i_year,0].set_title('published: year {}'.format(year))

        sns.kdeplot(year_cites[year][:,0],
                    year_cites[year][:,1],
                    ax=ax[i_year,1],
                    shade=True,
                    cmap='Reds')
        ax[i_year,1].set_title('cited: {}'.format(year))

    plt.savefig('publications and citations prototype {} - {}.png'.format(start_year, end_year))
    plt.savefig('publications and citations prototype {} - {}.svg'.format(start_year, end_year))
    plt.show()

In [None]:
# Call the plotting helper for 2000 through 2005.
plot_pubs_and_cites(2000, 2005)