# Figures
Visualize the results of the analyses for the indices paper

In [None]:
import os
import pickle as pkl
import sys
from glob import glob

import numpy as np
import pandas as pd
import plotly.express as px
from plotnine import *

sys.path.append('../indices')
from utils import load_percentile_data, load_journal_data, load_pair_headings

In [None]:
# Heading pairs to compare; each tuple is unpacked as (heading1, heading2)
# by the figure-generating loop.
headings = [
    ('nanotechnology', 'microscopy'),
    ('immunochemistry', 'anatomy'),
    ('computational_biology', 'human_genetics'),
]

In [None]:
# Render and save two figures per heading pair: a 2D-binned heatmap of the two
# pagerank columns, and a scatter plot colored by the `{heading1}-{heading2}`
# column.
figure_dir = '../figures'
# Create the output directory up front so saving does not fail on a fresh checkout.
os.makedirs(figure_dir, exist_ok=True)

for heading1, heading2 in headings:
    percentile_data = load_percentile_data(heading1, heading2, base_dir='../viz_dataframes')

    # Both figures share the same axes and title.
    x_col = f'{heading1}_pagerank'
    y_col = f'{heading2}_pagerank'
    title = f'{heading1} vs {heading2} pageranks'

    # Heatmap: log-scaled axes with a log-transformed fill gradient.
    plot = ggplot(percentile_data, aes(x=x_col, y=y_col))
    plot += geom_bin2d()
    plot += scale_x_log10()
    plot += scale_y_log10()
    plot += ggtitle(title)
    plot += scale_fill_gradient(trans='log')
    plot += theme_dark()

    # `ggplot.save` is the documented way to write a figure; the module-level
    # `ggsave` wrapper is not exported by newer plotnine releases.
    plot.save(f'{figure_dir}/{heading1}-{heading2}-heatmap.svg')

    # Scatter: same axes, points colored on a red-white-blue gradient by the
    # `{heading1}-{heading2}` column.
    plot = ggplot(percentile_data, aes(x=x_col, y=y_col, color=f'{heading1}-{heading2}'))
    plot += geom_point()
    plot += scale_x_log10()
    plot += scale_y_log10()
    plot += ggtitle(title)
    plot += scale_color_gradient2(low='red', mid='white', high='blue')
    plot += theme_dark()

    plot.save(f'{figure_dir}/{heading1}-{heading2}-difference.svg')

In [None]:
# Load the nanotechnology/microscopy comparison and display it sorted by the
# pair's `{heading1}-{heading2}` column.
heading1 = 'nanotechnology'
heading2 = 'microscopy'
percentile_data = load_percentile_data(heading1, heading2, base_dir='../viz_dataframes')
# Derive the column name from the headings (instead of hard-coding it) so it
# cannot drift out of sync with the data that was loaded. sort_values returns a
# new frame, so this only affects the displayed cell output.
percentile_data.sort_values(by=f'{heading1}-{heading2}')

In [None]:
# 2D-binned heatmap of the two headings' pageranks on log-scaled axes,
# with a log-transformed fill gradient.
plot = (
    ggplot(percentile_data, aes(x=f'{heading1}_pagerank', y=f'{heading2}_pagerank'))
    + geom_bin2d()
    + scale_x_log10()
    + scale_y_log10()
    + ggtitle(f'{heading1} vs {heading2} pageranks')
    + scale_fill_gradient(trans='log')
    + theme_dark()
)

plot

In [None]:
# Scatter plot of the two headings' pageranks on log-scaled axes; points are
# colored on a red-white-blue gradient by the `{heading1}-{heading2}` column.
point_aes = aes(
    x=f'{heading1}_pagerank',
    y=f'{heading2}_pagerank',
    color=f'{heading1}-{heading2}',
)
plot = (
    ggplot(percentile_data, point_aes)
    + geom_point()
    + scale_x_log10()
    + scale_y_log10()
    + ggtitle(f'{heading1} vs {heading2} pageranks')
    + scale_color_gradient2(low='red', mid='white', high='blue')
    + theme_dark()
)

plot

In [None]:
# Load the immunochemistry/anatomy comparison and display it sorted by the
# pair's `{heading1}-{heading2}` column.
heading1 = 'immunochemistry'
heading2 = 'anatomy'
percentile_data = load_percentile_data(heading1, heading2, base_dir='../viz_dataframes')
# Derive the column name from the headings (instead of hard-coding it) so it
# cannot drift out of sync with the data that was loaded. sort_values returns a
# new frame, so this only affects the displayed cell output.
percentile_data.sort_values(by=f'{heading1}-{heading2}')

In [None]:
# 2D-binned heatmap of the two headings' pageranks on log-scaled axes,
# with a log-transformed fill gradient.
plot = (
    ggplot(percentile_data, aes(x=f'{heading1}_pagerank', y=f'{heading2}_pagerank'))
    + geom_bin2d()
    + scale_x_log10()
    + scale_y_log10()
    + ggtitle(f'{heading1} vs {heading2} pageranks')
    + scale_fill_gradient(trans='log')
    + theme_dark()
)

plot

In [None]:
# Scatter plot of the two headings' pageranks on log-scaled axes; points are
# colored on a red-white-blue gradient by the `{heading1}-{heading2}` column.
point_aes = aes(
    x=f'{heading1}_pagerank',
    y=f'{heading2}_pagerank',
    color=f'{heading1}-{heading2}',
)
plot = (
    ggplot(percentile_data, point_aes)
    + geom_point()
    + scale_x_log10()
    + scale_y_log10()
    + ggtitle(f'{heading1} vs {heading2} pageranks')
    + scale_color_gradient2(low='red', mid='white', high='blue')
    + theme_dark()
)

plot

## Plotly plots

In [None]:
# Reload the nanotechnology/microscopy comparison for the interactive plots
# and display it sorted by the pair's `{heading1}-{heading2}` column.
heading1 = 'nanotechnology'
heading2 = 'microscopy'
percentile_data = load_percentile_data(heading1, heading2, base_dir='../viz_dataframes')
# Derive the column name from the headings (instead of hard-coding it) so it
# cannot drift out of sync with the data that was loaded. sort_values returns a
# new frame, so this only affects the displayed cell output.
percentile_data.sort_values(by=f'{heading1}-{heading2}')

In [None]:
# Interactive log-log scatter of the two pagerank columns, colored by the
# `{heading1}-{heading2}` column; hovering a point shows its DOI and title.
plot = px.scatter(
    percentile_data,
    x=f'{heading1}_pagerank',
    y=f'{heading2}_pagerank',
    log_x=True,
    log_y=True,
    opacity=1,
    color=f'{heading1}-{heading2}',
    color_continuous_scale='oxy',
    hover_data=['doi', 'title'],
    title=f'Relative importance of papers in {heading1} and {heading2}',
)
plot

In [None]:
# Flag the five papers with the largest `{heading1}-{heading2}` value so they
# can be highlighted in the next scatter plot. The column name is derived from
# the current headings rather than hard-coded, matching how the other cells
# reference it.
largest_dois = set(percentile_data.nlargest(5, f'{heading1}-{heading2}')['doi'])
# Boolean column: True for rows whose DOI is among the five selected above.
percentile_data['top_five'] = percentile_data['doi'].isin(largest_dois)

In [None]:
# Same log-log scatter as before, but colored by the boolean `top_five` flag
# so the five selected papers stand out; hover shows DOI and title.
plot = px.scatter(
    percentile_data,
    x=f'{heading1}_pagerank',
    y=f'{heading2}_pagerank',
    log_x=True,
    log_y=True,
    opacity=1,
    color='top_five',
    hover_data=['doi', 'title'],
    title=f'Relative importance of papers in {heading1} and {heading2}',
)
plot

In [None]:
# Flag the five papers with the smallest `{heading1}-{heading2}` value among
# those whose heading1 pagerank exceeds a cutoff. Column names are derived from
# the current headings rather than hard-coded, matching the other cells.
# NOTE(review): the 0.000015 cutoff presumably excludes the low-pagerank tail
# of the distribution - confirm the intended threshold.
main_data = percentile_data[percentile_data[f'{heading1}_pagerank'] > 0.000015]
smallest_dois = set(main_data.nsmallest(5, f'{heading1}-{heading2}')['doi'])
# Boolean column: True for rows whose DOI is among the five selected above.
percentile_data['bot_five'] = percentile_data['doi'].isin(smallest_dois)

In [None]:
# Same log-log scatter as before, but colored by the boolean `bot_five` flag
# so the five selected papers stand out; hover shows DOI and title.
plot = px.scatter(
    percentile_data,
    x=f'{heading1}_pagerank',
    y=f'{heading2}_pagerank',
    log_x=True,
    log_y=True,
    opacity=1,
    color='bot_five',
    hover_data=['doi', 'title'],
    title=f'Relative importance of papers in {heading1} and {heading2}',
)
plot

In [None]:
# Switch the interactive plots to the immunochemistry/anatomy pair and display
# the data sorted by the pair's `{heading1}-{heading2}` column.
heading1, heading2 = 'immunochemistry', 'anatomy'
percentile_data = load_percentile_data(
    heading1,
    heading2,
    base_dir='../viz_dataframes',
)
# sort_values returns a new frame; this line only drives the cell's output.
percentile_data.sort_values(by=f'{heading1}-{heading2}')

In [None]:
# Interactive log-log scatter of the two pagerank columns, colored by the
# `{heading1}-{heading2}` column; hovering a point shows its DOI and title.
plot = px.scatter(
    percentile_data,
    x=f'{heading1}_pagerank',
    y=f'{heading2}_pagerank',
    log_x=True,
    log_y=True,
    opacity=1,
    color=f'{heading1}-{heading2}',
    color_continuous_scale='oxy',
    hover_data=['doi', 'title'],
    title=f'Relative importance of papers in {heading1} and {heading2}',
)
plot

In [None]:
# Switch the interactive plots to the proteomics/metabolomics pair and display
# the data sorted by the pair's `{heading1}-{heading2}` column.
heading1, heading2 = 'proteomics', 'metabolomics'
percentile_data = load_percentile_data(
    heading1,
    heading2,
    base_dir='../viz_dataframes',
)
# sort_values returns a new frame; this line only drives the cell's output.
percentile_data.sort_values(by=f'{heading1}-{heading2}')

In [None]:
# Interactive log-log scatter of the two pagerank columns, colored by the
# `{heading1}-{heading2}` column; hovering a point shows its DOI and title.
plot = px.scatter(
    percentile_data,
    x=f'{heading1}_pagerank',
    y=f'{heading2}_pagerank',
    log_x=True,
    log_y=True,
    opacity=1,
    color=f'{heading1}-{heading2}',
    color_continuous_scale='oxy',
    hover_data=['doi', 'title'],
    title=f'Relative importance of papers in {heading1} and {heading2}',
)
plot

## Journal plots

In [None]:
# Collect per-journal percentiles for every heading pair into long-format
# lists (one entry per journal per heading) ready to become a DataFrame.
# Sort the paths so files are processed in a reproducible order; glob itself
# returns them in arbitrary order.
journal_data_files = sorted(glob('../viz_dataframes/journals/*'))

results = {'percentile': [], 'journal': [], 'heading': []}

for file in journal_data_files:
    # The two headings are recovered from a `<heading1>-<heading2>.<ext>` file name.
    file_base = os.path.splitext(os.path.basename(file))[0]
    heading1, heading2 = file_base.split('-')

    # NOTE(review): unpickling can execute arbitrary code; only load files
    # produced by this project's own pipeline.
    with open(file, 'rb') as in_file:
        df = pkl.load(in_file)

    # Append whole columns at once instead of looping over rows with
    # iterrows(). The same rows are collected as before; they are grouped by
    # heading within each file rather than interleaved row by row.
    journals = df['journal_title'].tolist()
    for heading in (heading1, heading2):
        results['percentile'].extend(df[f'{heading}_percentile'].tolist())
        results['journal'].extend(journals)
        results['heading'].extend([heading] * len(journals))



In [None]:
# Turn the collected lists into a long-format frame with the columns
# percentile / journal / heading.
result_df = pd.DataFrame(results)

# Consolidate multiples: average the rows that share a journal and heading.
by_journal_heading = result_df.groupby(['journal', 'heading'])
result_df = by_journal_heading.mean().reset_index()

# Display the consolidated rows for a single journal.
result_df[result_df['journal'].eq('ACS Appl Mater Interfaces')]

In [None]:
# Tile plot with one cell per journal/heading combination, filled by percentile.
plot = (
    ggplot(result_df, aes(x='journal', y='heading', fill='percentile'))
    + geom_tile()
)
plot

In [None]:
# Headings that appear in more than 250 rows of result_df.
heading_counts = result_df.heading.value_counts()
frequent_headings = heading_counts[heading_counts > 250].index
frequent_headings

In [None]:
# Keep only the rows whose heading is one of the frequent headings.
is_frequent = result_df['heading'].isin(set(frequent_headings))
frequent_df = result_df[is_frequent]

In [None]:
# Tile plot restricted to the frequent headings; x-axis labels are rotated
# 90 degrees.
plot = (
    ggplot(frequent_df, aes(x='journal', y='heading', fill='percentile'))
    + geom_tile()
    + theme(axis_text_x=element_text(rotation=90, hjust=1))
    + ggtitle('Headings with many journals')
)
plot

In [None]:
# Reshape to wide format: one row per journal, one column per heading,
# with percentile as the cell value.
unmelted_df = (
    frequent_df
    .pivot(index='journal', columns='heading', values='percentile')
    # Remove journals not shared by all fields
    .dropna(axis='index')
)
unmelted_df

In [None]:
# seaborn is used for the clustered heatmap in the following cell.
import seaborn as sns
# Apply seaborn's default plot styling to subsequent matplotlib figures.
sns.set()

In [None]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Clustered heatmap of the wide journal-by-heading percentile table.
sns.clustermap(unmelted_df)