# Figures
Visualize the results of the analyses for the indices paper

In [None]:
import os
import pickle as pkl
import string
import sys
from glob import glob

import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import svgutils.transform as sg
import umap
from plotnine import *
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram
from sklearn.cluster import AgglomerativeClustering
from svgutils.compose import SVG, Figure, Panel, Text
from tqdm import tqdm

sys.path.append('../indices')
from utils import load_percentile_data, load_journal_data, load_single_heading, parse_metadata

In [None]:
# Pairs of fields (heading1, heading2) that are compared throughout the notebook
headings = [
    ('nanotechnology', 'microscopy'),
    ('immunochemistry', 'anatomy'),
    ('proteomics', 'metabolomics'),
    ('computational_biology', 'human_genetics'),
]

In [None]:
# For each pair of fields, save three figures to ../figures: overlaid PageRank
# histograms, a 2D-binned heatmap of the two PageRanks, and a scatter plot
# colored by the '<heading1>-<heading2>' column of the percentile data.
for heading1, heading2 in headings:
    percentile_data = load_percentile_data(heading1, heading2, base_dir='../viz_dataframes')

    # Column names and display names shared by the three plots
    pagerank_col1 = f'{heading1}_pagerank'
    pagerank_col2 = f'{heading2}_pagerank'
    h1 = string.capwords(heading1.replace('_', ' '))
    h2 = string.capwords(heading2.replace('_', ' '))

    # Long format (one row per doi and field) so both fields share one histogram
    new_names = {pagerank_col1: h1, pagerank_col2: h2}
    hist_data = percentile_data.melt(id_vars='doi', value_vars=[pagerank_col1, pagerank_col2],
                                     value_name='PageRank')
    hist_data['Field'] = hist_data['variable'].map(new_names)

    plot = (
        ggplot(hist_data, aes(x='PageRank', fill='Field'))
        + geom_histogram(position='identity', alpha=.7)
        + scale_x_log10()
        + scale_y_continuous(name='Count')
        + ggtitle(f'{h1} and {h2} Pagerank Distribution')
        + theme_classic()
    )
    ggsave(plot, f'../figures/{heading1}-{heading2}-hist.svg')

    # Heatmap: bin counts are shown on a log color scale
    plot = (
        ggplot(percentile_data, aes(x=pagerank_col1, y=pagerank_col2))
        + geom_bin2d()
        + scale_x_log10(name=f'{h1} Pagerank')
        + scale_y_log10(name=f'{h2} Pagerank')
        + ggtitle(f'{h1} vs {h2} Pageranks')
        + scale_fill_gradient(trans='log')
        + theme_classic()
    )
    ggsave(plot, f'../figures/{heading1}-{heading2}-heatmap.svg')

    # Scatter: every paper is a point colored by its '<heading1>-<heading2>' value
    plot = (
        ggplot(percentile_data, aes(x=pagerank_col1, y=pagerank_col2,
                                    color=f'{heading1}-{heading2}'))
        + geom_point()
        + scale_x_log10(name=f'{h1} Pagerank')
        + scale_y_log10(name=f'{h2} Pagerank')
        + ggtitle(f'{h1} and {h2} Percentile Scores')
        + scale_color_gradient2(low='purple', mid='#e2e2e2', high='green')
        + theme_classic()
    )
    ggsave(plot, f'../figures/{heading1}-{heading2}-difference.svg')

## Combine histograms

In [None]:
# Paths of the four per-pair histogram SVGs, in the order of `headings`
plot1, plot2, plot3, plot4 = (
    f'../figures/{headings[idx][0]}-{headings[idx][1]}-hist.svg' for idx in range(4)
)

# Offsets of the second row / second column of the 2x2 grid
y_2 = 310
x_2 = 520

# Each panel pairs one plot with its letter label
panel_a = Panel(SVG(plot1), Text("A", 0, 40, size=30))
panel_b = Panel(SVG(plot2).move(x_2, 0), Text("B", 20, 40, size=30).move(x_2-20, 0))
panel_c = Panel(SVG(plot3).move(0, y_2), Text("C", 0, 50, size=30).move(0, y_2))
panel_d = Panel(SVG(plot4).move(x_2, y_2), Text("D", 20, 50, size=30).move(x_2-20, y_2))

fig = Figure("160cm", "160cm", panel_a, panel_b, panel_c, panel_d)
fig.save('../figures/combined_histogram.svg')

In [None]:
# Render the combined histogram SVG to a 1060x636 PNG with a white background.
# NOTE(review): --export-png is the Inkscape 0.9x flag; Inkscape 1.x appears to
# use --export-filename instead — confirm against the installed version.
!inkscape --export-area-drawing -w 1060 -h 636 --export-png=../figures/combined_histogram.png ../figures/combined_histogram.svg -b "#ffffffff"

## Combine heatmaps

In [None]:
# Paths of the four per-pair heatmap SVGs, in the order of `headings`
plot1, plot2, plot3, plot4 = (
    f'../figures/{headings[idx][0]}-{headings[idx][1]}-heatmap.svg' for idx in range(4)
)

# Offsets of the second row / second column of the 2x2 grid
y_2 = 325
x_2 = 500

# Each panel pairs one plot with its letter label
panel_a = Panel(SVG(plot1), Text("A", 25, 20, size=30))
panel_b = Panel(SVG(plot2).move(x_2, 0), Text("B", 25, 20, size=30).move(x_2-20, 0))
panel_c = Panel(SVG(plot3).move(0, y_2), Text("C", 25, 20, size=30).move(0, y_2))
panel_d = Panel(SVG(plot4).move(x_2, y_2), Text("D", 25, 20, size=30).move(x_2-20, y_2))

fig = Figure("1007", "656", panel_a, panel_b, panel_c, panel_d)
fig.save('../figures/combined_heatmap.svg')

In [None]:
# Render the combined heatmap SVG to a 1007x656 PNG with a white background.
# NOTE(review): --export-png is the Inkscape 0.9x flag; Inkscape 1.x appears to
# use --export-filename instead — confirm against the installed version.
!inkscape --export-area-drawing -w 1007 -h 656 --export-png=../figures/combined_heatmap.png ../figures/combined_heatmap.svg -b "#ffffffff"

## Combine percentile plots

In [None]:
# Paths of the four per-pair difference scatter SVGs, in the order of `headings`
plot1, plot2, plot3, plot4 = (
    f'../figures/{headings[idx][0]}-{headings[idx][1]}-difference.svg' for idx in range(4)
)

# Offsets of the second row / second column of the 2x2 grid
y_2 = 325
x_2 = 550

# Each panel pairs one plot with its letter label; the bottom-row labels are
# raised by 25 relative to their plots
panel_a = Panel(SVG(plot1), Text("A", 25, 20, size=30))
panel_b = Panel(SVG(plot2).move(x_2, 0), Text("B", 25, 20, size=30).move(x_2-20, 0))
panel_c = Panel(SVG(plot3).move(0, y_2), Text("C", 25, 20, size=30).move(0, y_2-25))
panel_d = Panel(SVG(plot4).move(x_2, y_2), Text("D", 25, 20, size=30).move(x_2-20, y_2-25))

fig = Figure("1007", "656", panel_a, panel_b, panel_c, panel_d)
fig.save('../figures/combined_difference.svg')

In [None]:
# The SVG version is ~150MB due to all the plotted points; we'll convert to a PNG to allow fast loading.
# The PNG is rendered at 1007x656 with a white background.
# NOTE(review): --export-png is the Inkscape 0.9x flag; Inkscape 1.x appears to
# use --export-filename instead — confirm against the installed version.
!inkscape --export-area-drawing -w 1007 -h 656 --export-png=../figures/combined_difference.png ../figures/combined_difference.svg -b "#ffffffff"

## Build journal plots

In [None]:
# Load the per-journal dataframe for the nanotechnology/microscopy pair
with open('../viz_dataframes/journals/nanotechnology-microscopy.pkl', 'rb') as in_file:
    nanotech_df = pkl.load(in_file)

# Coordinates of the journal titled 'Science', used to highlight it in the plot
is_science = nanotech_df['journal_title'] == 'Science'
science_row = nanotech_df[is_science]
science_x_loc = science_row['nanotechnology_pagerank']
science_y_loc = science_row['microscopy_pagerank']

In [None]:
# Log-log scatter of journals in both fields, with Science marked in red and
# labeled slightly to the left of its point
plot = (
    ggplot(nanotech_df, aes(x='nanotechnology_pagerank', y='microscopy_pagerank'))
    + geom_point()
    + scale_x_log10()
    + scale_y_log10()
    + ggtitle('Common microscopy/nanotechnology journals')
    + annotate('text', x=science_x_loc - 5e-5, y=science_y_loc, label='Science')
    + annotate('point', x=science_x_loc, y=science_y_loc, fill='red', size=2, color='red')
)
ggsave(plot, '../figures/microscopy_journals.svg')
plot

In [None]:
# Load the per-journal dataframe for the immunochemistry/anatomy pair
with open('../viz_dataframes/journals/immunochemistry-anatomy.pkl', 'rb') as in_file:
    immunochem_df = pkl.load(in_file)

# Coordinates of the journal titled 'Science', used to highlight it in the plot
is_science = immunochem_df['journal_title'] == 'Science'
science_row = immunochem_df[is_science]
science_x_loc = science_row['immunochemistry_pagerank']
science_y_loc = science_row['anatomy_pagerank']

In [None]:
# Coordinates of the journal titled 'Nature' in the immunochemistry/anatomy data
is_nature = immunochem_df['journal_title'] == 'Nature'
nature_row = immunochem_df[is_nature]
nature_x_loc, nature_y_loc = nature_row['immunochemistry_pagerank'], nature_row['anatomy_pagerank']

In [None]:
# Coordinates of the journal titled 'Cell' in the immunochemistry/anatomy data
is_cell = immunochem_df['journal_title'] == 'Cell'
cell_row = immunochem_df[is_cell]
cell_x_loc, cell_y_loc = cell_row['immunochemistry_pagerank'], cell_row['anatomy_pagerank']

In [None]:
# Log-log scatter of journals in both fields
plot = (
    ggplot(immunochem_df, aes(x='immunochemistry_pagerank', y='anatomy_pagerank'))
    + geom_point()
    + scale_x_log10()
    + scale_y_log10()
    + ggtitle('Common immunochemistry/anatomy journals')
)

# Highlighted journals: (label, x, y, leftward shift of the text label).
# Each gets a text label followed by a red marker, in this order.
highlighted_journals = [
    ('Science', science_x_loc, science_y_loc, 1.9e-6),
    ('Nature', nature_x_loc, nature_y_loc, 2.7e-6),
    ('Cell', cell_x_loc, cell_y_loc, 1.6e-6),
]
for journal_label, x_loc, y_loc, text_shift in highlighted_journals:
    plot += annotate('text', x=x_loc - text_shift, y=y_loc, label=journal_label)
    plot += annotate('point', x=x_loc, y=y_loc, fill='red', size=2, color='red')

ggsave(plot, '../figures/immunochemistry_journals.svg')
plot

## Combine journal plots

In [None]:
# The two journal scatter plots to place side by side
plot1 = '../figures/microscopy_journals.svg'
plot2 = '../figures/immunochemistry_journals.svg'

# Offset of the second column; y_2 is assigned but not used by this single-row layout
y_2 = 325
x_2 = 415

# Each panel pairs one plot with its letter label
panel_a = Panel(SVG(plot1), Text("A", 25, 20, size=30))
panel_b = Panel(SVG(plot2).move(x_2, 0), Text("B", 25, 20, size=30).move(x_2-20, 0))

fig = Figure("828", "331", panel_a, panel_b)
fig.save('../figures/combined_journals.svg')

In [None]:
# Render the combined journal figure to an 828x331 PNG with a white background for fast loading.
# NOTE(review): the earlier "~150MB due to all the plotted points" remark looks copied from the
# difference-plot cell; confirm whether it applies to this figure.
# NOTE(review): --export-png is the Inkscape 0.9x flag; Inkscape 1.x appears to
# use --export-filename instead — confirm against the installed version.
!inkscape --export-area-drawing -w 828 -h 331 --export-png=../figures/combined_journals.png ../figures/combined_journals.svg -b "#ffffffff"

## Create per-field distribution plots

In [None]:
# For every field in every pair, plot the distribution of its PageRank values
# (as stored in '<field>-<partner>-pagerank.pkl') and save it as '<field>-hist.svg'.
for heading1, heading2 in headings:
    # Load both fields of the pair first, then plot them in the same order
    field_frames = []
    for field, partner in ((heading1, heading2), (heading2, heading1)):
        with open(f'../output/{field}-{partner}-pagerank.pkl', 'rb') as in_file:
            pagerank_dict = pkl.load(in_file)
            field_df = pd.DataFrame.from_dict(pagerank_dict, orient='index', columns=['pagerank'])
        field_frames.append((field, field_df))

    for field, field_df in field_frames:
        field_text = string.capwords(field.replace('_', ' '))

        plot = (
            ggplot(field_df, aes(x='pagerank'))
            + geom_histogram()
            + ggtitle(f'{field_text} PageRank Distribution')
            + scale_x_log10(name='PageRank')
            + scale_y_continuous(name='Count')
            + theme_classic()
        )
        ggsave(plot, f'../figures/{field}-hist.svg')
    

In [None]:
# Top row (plot1-plot4): first field of each pair; bottom row (plot5-plot8): second field
plot1, plot2, plot3, plot4 = (f'../figures/{headings[idx][0]}-hist.svg' for idx in range(4))
plot5, plot6, plot7, plot8 = (f'../figures/{headings[idx][1]}-hist.svg' for idx in range(4))

# Offsets of the second row and of columns two to four in the 2x4 grid
y_2 = 325
x_2 = 420
x_3 = 840
x_4 = 1260

# Each panel pairs one plot with its letter label
top_row = [
    Panel(SVG(plot1), Text("A", 0, 30, size=30)),
    Panel(SVG(plot2).move(x_2, 0), Text("B", 20, 30, size=30).move(x_2-20, 0)),
    Panel(SVG(plot3).move(x_3, 0), Text("C", 0, 30, size=30).move(x_3, 0)),
    Panel(SVG(plot4).move(x_4, 0), Text("D", 20, 30, size=30).move(x_4, 0)),
]
bottom_row = [
    Panel(SVG(plot5).move(0, y_2), Text("E", 0, 30, size=30).move(0, y_2)),
    Panel(SVG(plot6).move(x_2, y_2), Text("F", 20, 30, size=30).move(x_2-20, y_2)),
    Panel(SVG(plot7).move(x_3, y_2), Text("G", 0, 30, size=30).move(x_3, y_2)),
    Panel(SVG(plot8).move(x_4, y_2), Text("H", 20, 30, size=30).move(x_4, y_2)),
]

fig = Figure("1693", "654", *top_row, *bottom_row)

fig.save('../figures/per-field.svg')

In [None]:
# Render the combined per-field histogram figure to a 1693x654 PNG with a white background.
# NOTE(review): the earlier "~150MB due to all the plotted points" remark looks copied from the
# difference-plot cell; this figure is built from histograms — confirm whether it applies.
# NOTE(review): --export-png is the Inkscape 0.9x flag; Inkscape 1.x appears to
# use --export-filename instead — confirm against the installed version.
!inkscape  --export-area-drawing -w 1693 -h 654 --export-png=../figures/per-field.png ../figures/per-field.svg -b "#ffffffff"

## Specialty journal analysis

In [None]:
# Collect every '<a>-<b>-pagerank.pkl' result file and report how many were found
pagerank_pattern = '../output/*-*-pagerank.pkl'
output_files = glob(pagerank_pattern)
print(len(output_files))

In [None]:
# For every heading (field), count how many of its articles were published in
# each journal. Results accumulate in `journal_info` as three parallel lists.
journal_info = {'journal': [], 'count': [], 'field': []}
headings_seen = set()

for file in tqdm(output_files):
    # The heading is the text before the first '-' in the file name; only the
    # first file encountered for a given heading is processed
    heading = os.path.basename(file).split('-')[0]
    if heading in headings_seen:
        continue
    headings_seen.add(heading)

    with open(file, 'rb') as in_file:
        heading_dict = pkl.load(in_file)
    # One row per dictionary key; the keys become the 'doi' column
    pagerank_df = pd.DataFrame.from_dict(heading_dict, orient='index', columns=['pagerank'])
    pagerank_df = pagerank_df.reset_index()
    pagerank_df = pagerank_df.rename({'index': 'doi'}, axis='columns')

    metadata_path = f'../data/pubmed/efetch/{heading}.xml.xz'
    metadata = parse_metadata(metadata_path)

    doi_to_journal = dict(zip(metadata['doi'], metadata['journal']))

    # DOIs without a metadata entry map to NaN, which value_counts() ignores
    pagerank_df['journal'] = pagerank_df['doi'].map(doi_to_journal)
    # Series.iteritems() was removed in pandas 2.0; items() yields the same
    # (journal, count) pairs on old and new versions alike
    for journal, count in pagerank_df['journal'].value_counts().items():
        journal_info['journal'].append(journal)
        journal_info['count'].append(count)
        journal_info['field'].append(heading)
    
# Find count of each journal for full dataset
# Set cutoff # of articles for inclusion
# Find overlap 



In [None]:
# One row per (journal, field) pair with the number of articles in 'count'
journal_df = pd.DataFrame.from_dict(journal_info)
journal_df

In [None]:
# Number of distinct fields present in the journal counts
journal_df['field'].nunique()

In [None]:
# There are 5178 journals with more than 50 articles in the dataset.
# Only the numeric 'count' column is summed: on pandas >= 2.0 a bare
# groupby(...).sum() also concatenates the string 'field' column, and comparing
# those strings to 50 raises a TypeError.
(journal_df.groupby('journal')[['count']].sum() > 50).sum()

In [None]:
# Flag journals whose article total across all fields exceeds 50, then keep
# their names. Only 'count' is summed: on pandas >= 2.0 summing the string
# 'field' column as well makes the comparison with 50 raise a TypeError.
journal_series = journal_df.groupby('journal')[['count']].sum() > 50
common_journals = set(journal_series[journal_series['count']].index)

In [None]:
# Keep only the rows of journals that passed the article-count cutoff
is_common_journal = journal_df['journal'].isin(common_journals)
common_journal_df = journal_df[is_common_journal]
common_journal_df

In [None]:
# Total number of articles per journal, summed over every field
journal_total_articles = common_journal_df.groupby('journal')['count'].sum()
journal_total_articles

In [None]:
# Share of each journal's articles that come from its single largest field
# (a fraction between 0 and 1, despite the variable name)
largest_field_counts = common_journal_df.groupby('journal')['count'].max()
largest_field_percent = largest_field_counts / journal_total_articles
largest_field_percent

In [None]:
# Journals with more than 90% of their articles in one field, and how many there are
dominant_share = largest_field_percent[largest_field_percent > .9]
field_specific_journals = dominant_share.index
dominant_share.count()

In [None]:
# List the (count, field) rows of each field-specific journal
common_journal_df[common_journal_df['journal'].isin(field_specific_journals)].groupby('journal').value_counts()

#### Result
Of the 5178 journals with more than 50 articles in the dataset, only 56 (1.1%) have more than 90 percent of their articles from a single field.

## Top journal analysis

In [None]:
# For every heading (field), find the journal with the highest median PageRank
# among journals that have more than 50 articles in that field.
top_journal_info = {'top_journal': [], 'field': []}
headings_seen = set()

for file in tqdm(output_files):
    # The heading is the text before the first '-' in the file name; only the
    # first file encountered for a given heading is processed
    heading = os.path.basename(file).split('-')[0]
    if heading in headings_seen:
        continue
    headings_seen.add(heading)

    with open(file, 'rb') as in_file:
        heading_dict = pkl.load(in_file)
    # One row per dictionary key; the keys become the 'doi' column
    pagerank_df = pd.DataFrame.from_dict(heading_dict, orient='index', columns=['pagerank'])
    pagerank_df = pagerank_df.reset_index()
    pagerank_df = pagerank_df.rename({'index': 'doi'}, axis='columns')

    metadata_path = f'../data/pubmed/efetch/{heading}.xml.xz'
    metadata = parse_metadata(metadata_path)

    doi_to_journal = dict(zip(metadata['doi'], metadata['journal']))

    pagerank_df['journal'] = pagerank_df['doi'].map(doi_to_journal)

    # Journals with more than 50 articles in this field
    journal_series = pagerank_df.groupby('journal').count() > 50
    common_journals = set(journal_series[journal_series['doi']].index)

    # NOTE: this rebinds `common_journal_df`, which the specialty journal
    # analysis cells also assign
    common_journal_df = pagerank_df[pagerank_df['journal'].isin(common_journals)]
    # Take the median of 'pagerank' only: on pandas >= 2.0 a bare
    # groupby(...).median() also tries the string 'doi' column and raises
    top_journals = (common_journal_df.groupby('journal')[['pagerank']]
                    .median()
                    .sort_values(by='pagerank', ascending=False))

    top_journal = top_journals.index[0]

    top_journal_info['top_journal'].append(top_journal)
    top_journal_info['field'].append(heading)

In [None]:
# One row per field with the name of its top journal
top_journal_df = pd.DataFrame(top_journal_info)

# Journal names ordered as value_counts() returns them (most frequent first);
# the categorical column fixes this order for the bar chart's x axis
top_journal_list = top_journal_df['top_journal'].value_counts().index.tolist()
tj_categorical = pd.Categorical(top_journal_df['top_journal'], categories=top_journal_list)
top_journal_df['cat'] = tj_categorical

top_journal_df

In [None]:
# Bar chart of how many fields each journal is the top journal for
plot = (
    ggplot(top_journal_df, aes(x='cat'))
    + geom_bar(stat='count')
    + theme_classic()
    + theme(axis_text_x=element_text(rotation=90, hjust=.5))
    + ylab('Count')
    + xlab('Journal')
    + scale_y_continuous(breaks=list(range(0,10, 2)))
    + ggtitle('Top Journal Prevalence')
)

print(plot)

## Scratch

In [None]:
# Scratch: load the nanotechnology/microscopy percentile data and view it
# ordered by the 'nanotechnology-microscopy' column
heading1, heading2 = 'nanotechnology', 'microscopy'
percentile_data = load_percentile_data(heading1, heading2, base_dir='../viz_dataframes')
percentile_data.sort_values(by='nanotechnology-microscopy')

In [None]:
# Scratch heatmap of the two PageRanks for the currently loaded pair (dark theme)
plot = (
    ggplot(percentile_data, aes(x=f'{heading1}_pagerank', y=f'{heading2}_pagerank'))
    + geom_bin2d()
    + scale_x_log10()
    + scale_y_log10()
    + ggtitle(f'{heading1} vs {heading2} pageranks')
    + scale_fill_gradient(trans='log')
    + theme_dark()
)

plot

In [None]:
# Scratch scatter of the two PageRanks, colored by the '<heading1>-<heading2>' column
plot = (
    ggplot(percentile_data, aes(x=f'{heading1}_pagerank', y=f'{heading2}_pagerank',
                                color=f'{heading1}-{heading2}'))
    + geom_point()
    + scale_x_log10()
    + scale_y_log10()
    + ggtitle(f'{heading1} vs {heading2} pageranks')
    + scale_color_gradient2(low='red', mid='white', high='blue')
    + theme_dark()
)

plot

In [None]:
# Scratch: load the immunochemistry/anatomy percentile data and view it
# ordered by the 'immunochemistry-anatomy' column
heading1, heading2 = 'immunochemistry', 'anatomy'
percentile_data = load_percentile_data(heading1, heading2, base_dir='../viz_dataframes')
percentile_data.sort_values(by='immunochemistry-anatomy')

In [None]:
# Scratch heatmap of the two PageRanks for the currently loaded pair (dark theme)
plot = (
    ggplot(percentile_data, aes(x=f'{heading1}_pagerank', y=f'{heading2}_pagerank'))
    + geom_bin2d()
    + scale_x_log10()
    + scale_y_log10()
    + ggtitle(f'{heading1} vs {heading2} pageranks')
    + scale_fill_gradient(trans='log')
    + theme_dark()
)

plot

In [None]:
# Scratch scatter of the two PageRanks, colored by the '<heading1>-<heading2>' column
plot = (
    ggplot(percentile_data, aes(x=f'{heading1}_pagerank', y=f'{heading2}_pagerank',
                                color=f'{heading1}-{heading2}'))
    + geom_point()
    + scale_x_log10()
    + scale_y_log10()
    + ggtitle(f'{heading1} vs {heading2} pageranks')
    + scale_color_gradient2(low='red', mid='white', high='blue')
    + theme_dark()
)

plot

## Plotly plots

In [None]:
# Load the nanotechnology/microscopy percentile data for the interactive plots
# and view it ordered by the 'nanotechnology-microscopy' column
heading1, heading2 = 'nanotechnology', 'microscopy'
percentile_data = load_percentile_data(heading1, heading2, base_dir='../viz_dataframes')
percentile_data.sort_values(by='nanotechnology-microscopy')

In [None]:
# Interactive log-log scatter colored by the '<heading1>-<heading2>' column;
# hovering over a point shows the paper's doi and title
plot = px.scatter(
    percentile_data,
    x=f'{heading1}_pagerank',
    y=f'{heading2}_pagerank',
    color=f'{heading1}-{heading2}',
    color_continuous_scale='oxy',
    log_x=True,
    log_y=True,
    opacity=1,
    hover_data=['doi', 'title'],
    title=f'Relative importance of papers in {heading1} and {heading2}',
)
plot

In [None]:
# Flag the five papers with the largest 'nanotechnology-microscopy' values
top_rows = percentile_data.nlargest(5, 'nanotechnology-microscopy')
largest_dois = set(top_rows['doi'])
percentile_data['top_five'] = percentile_data['doi'].isin(largest_dois)

In [None]:
# Same interactive scatter, colored by whether a paper is flagged in 'top_five'
plot = px.scatter(
    percentile_data,
    x=f'{heading1}_pagerank',
    y=f'{heading2}_pagerank',
    color='top_five',
    log_x=True,
    log_y=True,
    opacity=1,
    hover_data=['doi', 'title'],
    title=f'Relative importance of papers in {heading1} and {heading2}',
)
plot

In [None]:
# Among papers with a nanotechnology PageRank above 0.000015, flag the five
# with the smallest 'nanotechnology-microscopy' values
above_cutoff = percentile_data['nanotechnology_pagerank'] > 0.000015
main_data = percentile_data[above_cutoff]
bottom_rows = main_data.nsmallest(5, 'nanotechnology-microscopy')
smallest_dois = set(bottom_rows['doi'])
percentile_data['bot_five'] = percentile_data['doi'].isin(smallest_dois)

In [None]:
# Same interactive scatter, colored by whether a paper is flagged in 'bot_five'
plot = px.scatter(
    percentile_data,
    x=f'{heading1}_pagerank',
    y=f'{heading2}_pagerank',
    color='bot_five',
    log_x=True,
    log_y=True,
    opacity=1,
    hover_data=['doi', 'title'],
    title=f'Relative importance of papers in {heading1} and {heading2}',
)
plot

In [None]:
# Load the immunochemistry/anatomy percentile data and view it ordered by the
# '<heading1>-<heading2>' column
heading1, heading2 = 'immunochemistry', 'anatomy'
percentile_data = load_percentile_data(heading1, heading2, base_dir='../viz_dataframes')
percentile_data.sort_values(by=f'{heading1}-{heading2}')

In [None]:
# Interactive log-log scatter colored by the '<heading1>-<heading2>' column;
# hovering over a point shows the paper's doi and title
plot = px.scatter(
    percentile_data,
    x=f'{heading1}_pagerank',
    y=f'{heading2}_pagerank',
    color=f'{heading1}-{heading2}',
    color_continuous_scale='oxy',
    log_x=True,
    log_y=True,
    opacity=1,
    hover_data=['doi', 'title'],
    title=f'Relative importance of papers in {heading1} and {heading2}',
)
plot

In [None]:
# Load the proteomics/metabolomics percentile data and view it ordered by the
# '<heading1>-<heading2>' column
heading1, heading2 = 'proteomics', 'metabolomics'
percentile_data = load_percentile_data(heading1, heading2, base_dir='../viz_dataframes')
percentile_data.sort_values(by=f'{heading1}-{heading2}')

In [None]:
# Interactive log-log scatter colored by the '<heading1>-<heading2>' column;
# hovering over a point shows the paper's doi and title
plot = px.scatter(
    percentile_data,
    x=f'{heading1}_pagerank',
    y=f'{heading2}_pagerank',
    color=f'{heading1}-{heading2}',
    color_continuous_scale='oxy',
    log_x=True,
    log_y=True,
    opacity=1,
    hover_data=['doi', 'title'],
    title=f'Relative importance of papers in {heading1} and {heading2}',
)
plot

## Pairwise difference ranks