In [1]:
import glob
import itertools
import os
import pickle

import numpy as np
import pandas as pd
import plotly.express as px

In [2]:
def reformat_results(results, field):
    """Turn a DOI -> PageRank mapping into a long-format DataFrame.

    Parameters
    ----------
    results : mapping
        Maps each DOI to its PageRank score; iterated via ``.items()``.
    field : object
        Label stored in the ``field`` column of every row.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``results`` with columns ``doi``, ``pagerank``
        and ``field``, in the mapping's iteration order.
    """
    entries = list(results.items())
    columns = {
        'doi': [doi for doi, _ in entries],
        'pagerank': [score for _, score in entries],
    }
    result_df = pd.DataFrame(columns)
    result_df['field'] = field

    return result_df

In [3]:
# Find every per-field PageRank pickle, skipping any path containing 'first_degree'.
metric_files = glob.glob('../output/*-pagerank.pkl')
metric_files = [f for f in metric_files if 'first_degree' not in f]

# Collect one frame per field and concatenate once after the loop; calling
# pd.concat inside the loop re-copies all accumulated rows on every iteration.
field_frames = []

for file in metric_files:
    # The field name is the part of the file name before the first '-'.
    field = os.path.basename(file).split('-')[0]
    print(field)

    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(file, 'rb') as in_file:
        results = pickle.load(in_file)
    result_df = reformat_results(results, field)
    field_frames.append(result_df)

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# when no files matched. Each frame keeps its own 0-based index, so index
# values repeat across fields.
all_results = pd.concat(field_frames) if field_frames else pd.DataFrame()
all_results

dental_informatics
artificial_intelligence
sociobiology
natural_history
synthetic_biology
cellular_automata
public_health_informatics
computational_biology
zoology
developmental_biology
cryobiology
consumer_health_informatics
exobiology
latent_class_analysis
radiobiology
genetics
neurobiology
parasitology
botany
photobiology
laboratory_animal_science
cheminformatics
cell_biology
ecology
microbiology
medical_informatics
nursing_informatics


Unnamed: 0,doi,pagerank,field
0,10.1016/j.oooo.2015.02.484,0.031623,dental_informatics
1,10.14219/jada.archive.2008.0105,0.078961,dental_informatics
2,10.1177/0022034513487560,0.031623,dental_informatics
3,10.14219/jada.archive.2013.0013,0.058503,dental_informatics
4,10.1016/j.cden.2011.02.010,0.031623,dental_informatics
...,...,...,...
452,10.1590/s0104-11692011000600015,0.001154,nursing_informatics
453,10.1016/j.outlook.2008.07.003,0.001154,nursing_informatics
454,10.1016/j.apnr.2005.02.001,0.002135,nursing_informatics
455,10.1097/00006216-200010000-00026,0.009236,nursing_informatics


In [4]:
# Rows per DOI; each field's results contribute at most one row per DOI.
all_results.doi.value_counts()

10.1038/s41556-021-00787-7        5
10.1016/j.tig.2017.11.002         5
10.1371/journal.pbio.3000999      5
10.1126/scisignal.2004657         5
10.1093/bib/bbx035                5
                                 ..
10.1016/j.virusres.2012.10.023    1
10.1016/j.virusres.2012.07.028    1
10.1016/j.virusres.2012.06.023    1
10.1097/coh.0b013e32832c0672      1
10.12927/cjnl.2020.26188          1
Name: doi, Length: 754771, dtype: int64

In [5]:
# Distinct field labels, in order of first appearance in all_results.
fields = pd.unique(all_results['field'])
fields

array(['dental_informatics', 'artificial_intelligence', 'sociobiology',
       'natural_history', 'synthetic_biology', 'cellular_automata',
       'public_health_informatics', 'computational_biology', 'zoology',
       'developmental_biology', 'cryobiology',
       'consumer_health_informatics', 'exobiology',
       'latent_class_analysis', 'radiobiology', 'genetics',
       'neurobiology', 'parasitology', 'botany', 'photobiology',
       'laboratory_animal_science', 'cheminformatics', 'cell_biology',
       'ecology', 'microbiology', 'medical_informatics',
       'nursing_informatics'], dtype=object)

In [6]:
# Group the combined results by field so each field's rows can be fetched by name.
groups = all_results.groupby(by="field")

In [7]:
%%time
pairs = []
intersections = []
correlations = []
intersection_counts = []
for field1, field2 in itertools.combinations_with_replacement(fields, 2):
    if field1 == field2:
        continue
    
    group1 = groups.get_group(field1)
    group2 = groups.get_group(field2)
    intersect_count = len(np.intersect1d(group1.doi, group2.doi))
    intersect_percent = intersect_count / (len(group1) + len(group2) - intersect_count)
    
    if intersect_count < 100:
        continue
    
    merged = group1.merge(group2, on='doi')
    correlation = merged['pagerank_x'].corr(merged['pagerank_y'])
    pairs.append((field1, field2))
    intersections.append(intersect_percent)
    intersection_counts.append(intersect_count)
    correlations.append(correlation)

CPU times: user 30.2 s, sys: 3.32 ms, total: 30.2 s
Wall time: 30.2 s


In [8]:
# Split the (field1, field2) tuples into two parallel lists.
fields1 = [first for first, _ in pairs]
fields2 = [second for _, second in pairs]

In [9]:
# Assemble the per-pair measurements into one table, one row per field pair.
overlap_columns = {
    'field1': fields1,
    'field2': fields2,
    'intersection': intersections,
    'correlation': correlations,
    'intersection_counts': intersection_counts,
}
overlap_df = pd.DataFrame(overlap_columns)

In [10]:
# Show the table of field pairs with their overlap and correlation values.
overlap_df

Unnamed: 0,field1,field2,intersection,correlation,intersection_counts
0,artificial_intelligence,computational_biology,0.030411,0.800098,8816
1,artificial_intelligence,genetics,0.025441,0.456514,7709
2,artificial_intelligence,microbiology,0.000862,0.371046,176
3,artificial_intelligence,medical_informatics,0.080909,0.702763,30607
4,synthetic_biology,computational_biology,0.00176,0.475627,351
5,synthetic_biology,genetics,0.000652,0.446581,138
6,synthetic_biology,microbiology,0.000968,0.858166,102
7,computational_biology,genetics,0.38229,0.933075,112008
8,computational_biology,ecology,0.002125,0.453726,453
9,computational_biology,microbiology,0.004703,0.843605,1398


In [18]:
# Plot each field pair's DOI overlap against the correlation of its PageRank
# scores. The x axis is logarithmic and the OLS trendline is fit with log_x=True
# to match. `labels` swaps the raw column names for descriptive axis titles.
px.scatter(
    overlap_df,
    x='intersection',
    y='correlation',
    hover_data=['field1', 'field2'],
    log_x=True,
    trendline='ols',
    trendline_options=dict(log_x=True),
    labels={
        'intersection': 'DOI overlap (intersection / union)',
        'correlation': 'PageRank correlation',
    },
    title='Correlation vs Doi Overlap',
)