In [1]:
import glob
import itertools
import os
import pickle

import numpy as np
import pandas as pd
import plotly.express as px

In [2]:
def reformat_results(results, field):

    reformatted_results = {'doi': [], 'pagerank': []}

    for doi, score in results.items():
        reformatted_results['doi'].append(doi)
        reformatted_results['pagerank'].append(score)
    result_df = pd.DataFrame(reformatted_results)
    result_df['field'] = field
    
    return result_df

In [3]:
metric_files = glob.glob('../output/*-pagerank.pkl')
metric_files = [f for f in metric_files if 'first_degree' not in f]

all_results = pd.DataFrame()

for file in metric_files:
    field = os.path.basename(file).split('-')[0]
    print(field)
    
    with open(file, 'rb') as in_file:
        results = pickle.load(in_file)
    result_df = reformat_results(results, field)
    all_results = pd.concat([all_results, result_df])
all_results

dental_informatics
artificial_intelligence
sociobiology
natural_history
synthetic_biology
cellular_automata
public_health_informatics
computational_biology
zoology
developmental_biology
cryobiology
consumer_health_informatics
exobiology
latent_class_analysis
radiobiology
genetics
neurobiology
parasitology
botany
photobiology
laboratory_animal_science
cheminformatics
cell_biology
ecology
microbiology
medical_informatics
nursing_informatics


Unnamed: 0,doi,pagerank,field
0,10.1016/j.oooo.2015.02.484,0.031623,dental_informatics
1,10.14219/jada.archive.2008.0105,0.078961,dental_informatics
2,10.1177/0022034513487560,0.031623,dental_informatics
3,10.14219/jada.archive.2013.0013,0.058503,dental_informatics
4,10.1016/j.cden.2011.02.010,0.031623,dental_informatics
...,...,...,...
452,10.1590/s0104-11692011000600015,0.001154,nursing_informatics
453,10.1016/j.outlook.2008.07.003,0.001154,nursing_informatics
454,10.1016/j.apnr.2005.02.001,0.002135,nursing_informatics
455,10.1097/00006216-200010000-00026,0.009236,nursing_informatics


In [4]:
all_results['doi'].value_counts()

10.1038/s41556-021-00787-7        5
10.1016/j.tig.2017.11.002         5
10.1371/journal.pbio.3000999      5
10.1126/scisignal.2004657         5
10.1093/bib/bbx035                5
                                 ..
10.1016/j.virusres.2012.10.023    1
10.1016/j.virusres.2012.07.028    1
10.1016/j.virusres.2012.06.023    1
10.1097/coh.0b013e32832c0672      1
10.12927/cjnl.2020.26188          1
Name: doi, Length: 754771, dtype: int64

In [5]:
fields = all_results['field'].unique()
fields

array(['dental_informatics', 'artificial_intelligence', 'sociobiology',
       'natural_history', 'synthetic_biology', 'cellular_automata',
       'public_health_informatics', 'computational_biology', 'zoology',
       'developmental_biology', 'cryobiology',
       'consumer_health_informatics', 'exobiology',
       'latent_class_analysis', 'radiobiology', 'genetics',
       'neurobiology', 'parasitology', 'botany', 'photobiology',
       'laboratory_animal_science', 'cheminformatics', 'cell_biology',
       'ecology', 'microbiology', 'medical_informatics',
       'nursing_informatics'], dtype=object)

In [6]:
groups = all_results.groupby("field")

In [7]:
%%time
pairs = []
intersections = []
correlations = []
intersection_counts = []
for field1, field2 in itertools.combinations_with_replacement(fields, 2):
#     if field1 == field2:
#         continue
    
    group1 = groups.get_group(field1)
    group2 = groups.get_group(field2)
    intersect_count = len(np.intersect1d(group1.doi, group2.doi))
    intersect_percent = intersect_count / (len(group1) + len(group2) - intersect_count)
    
    if intersect_count < 100:
        continue
    
    merged = group1.merge(group2, on='doi')
    correlation = merged['pagerank_x'].corr(merged['pagerank_y'])
    pairs.append((field1, field2))
    intersections.append(intersect_percent)
    intersection_counts.append(intersect_count)
    correlations.append(correlation)

dental_informatics dental_informatics 21 21 21
dental_informatics artificial_intelligence 21 102221 1
dental_informatics sociobiology 21 24 0
dental_informatics natural_history 21 74 0
dental_informatics synthetic_biology 21 3279 0
dental_informatics cellular_automata 21 3 0
dental_informatics public_health_informatics 21 242 0
dental_informatics computational_biology 21 196494 0
dental_informatics zoology 21 1174 0
dental_informatics developmental_biology 21 1443 0
dental_informatics cryobiology 21 5 0
dental_informatics consumer_health_informatics 21 19 0
dental_informatics exobiology 21 1259 0
dental_informatics latent_class_analysis 21 529 0
dental_informatics radiobiology 21 988 0
dental_informatics genetics 21 208506 0
dental_informatics neurobiology 21 486 0
dental_informatics parasitology 21 2211 0
dental_informatics botany 21 2542 0
dental_informatics photobiology 21 260 0
dental_informatics laboratory_animal_science 21 173 0
dental_informatics cheminformatics 21 60 0
dental_i

computational_biology parasitology 196494 2211 29
computational_biology botany 196494 2542 30
computational_biology photobiology 196494 260 0
computational_biology laboratory_animal_science 196494 173 1
computational_biology cheminformatics 196494 60 5
computational_biology cell_biology 196494 927 37
computational_biology ecology 196494 17171 453
computational_biology microbiology 196494 102183 1398
computational_biology medical_informatics 196494 306675 28939
computational_biology nursing_informatics 196494 457 5
zoology zoology 1174 1174 1174
zoology developmental_biology 1174 1443 6
zoology cryobiology 1174 5 0
zoology consumer_health_informatics 1174 19 0
zoology exobiology 1174 1259 0
zoology latent_class_analysis 1174 529 0
zoology radiobiology 1174 988 0
zoology genetics 1174 208506 23
zoology neurobiology 1174 486 0
zoology parasitology 1174 2211 7
zoology botany 1174 2542 5
zoology photobiology 1174 260 0
zoology laboratory_animal_science 1174 173 0
zoology cheminformatics 117

microbiology medical_informatics 102183 306675 445
microbiology nursing_informatics 102183 457 0
medical_informatics medical_informatics 306675 306675 306675
medical_informatics nursing_informatics 306675 457 157
nursing_informatics nursing_informatics 457 457 457
CPU times: user 34.6 s, sys: 57.3 ms, total: 34.6 s
Wall time: 34.6 s


In [8]:
fields1 = [pair[0] for pair in pairs]
fields2 = [pair[1] for pair in pairs]

In [9]:
overlap_df = pd.DataFrame({'field1': fields1, 'field2': fields2, 'intersection': intersections, 
                           'correlation': correlations, 'intersection_counts': intersection_counts})

In [10]:
overlap_df

Unnamed: 0,field1,field2,intersection,correlation,intersection_counts
0,artificial_intelligence,artificial_intelligence,1.0,1.0,102221
1,artificial_intelligence,computational_biology,0.030411,0.800098,8816
2,artificial_intelligence,genetics,0.025441,0.456514,7709
3,artificial_intelligence,microbiology,0.000862,0.371046,176
4,artificial_intelligence,medical_informatics,0.080909,0.702763,30607
5,synthetic_biology,synthetic_biology,1.0,1.0,3279
6,synthetic_biology,computational_biology,0.00176,0.475627,351
7,synthetic_biology,genetics,0.000652,0.446581,138
8,synthetic_biology,microbiology,0.000968,0.858166,102
9,public_health_informatics,public_health_informatics,1.0,1.0,242


In [23]:
px.scatter(overlap_df, x='intersection', y='correlation', hover_data=['field1', 'field2'], log_x=True, trendline='ols', trendline_options=dict(log_x=True), color='intersection')

In [13]:
group1 = groups.get_group('dental_informatics')
print(group1.doi)
print(len(group1.doi))

0           10.1016/j.oooo.2015.02.484
1      10.14219/jada.archive.2008.0105
2             10.1177/0022034513487560
3      10.14219/jada.archive.2013.0013
4           10.1016/j.cden.2011.02.010
5      10.14219/jada.archive.2004.0120
6      10.14219/jada.archive.2007.0138
7             10.1038/sj.bdj.2008.1151
8                 10.1038/bdj.2008.150
9                  10.1197/jamia.m2827
10                   10.2196/jmir.1799
11                    10.2196/jmir.971
12         10.1016/j.ajodo.2006.07.024
13         10.1016/j.ajodo.2005.04.017
14     10.14219/jada.archive.2011.0082
15    10.1111/j.1600-0757.2011.00403.x
16    10.1111/j.1601-0825.2011.01794.x
17                   10.1159/000328669
18                10.1038/bdj.2008.193
19         10.1016/j.ajodo.2006.07.023
20         10.1016/j.ajodo.2004.11.027
Name: doi, dtype: object
21


In [14]:
group1 = groups.get_group('computational_biology')
print(group1.doi)
print(len(group1.doi))
print(group1.doi.value_counts())

0          10.1016/j.neubiorev.2016.10.031
1         10.1016/j.jpsychires.2011.03.015
2                10.1016/j.jad.2011.07.003
3             10.1080/13811118.2010.524025
4                     10.1038/npp.2009.236
                        ...               
196489                 10.1093/jac/dkab417
196490                10.1155/2022/2204981
196491              10.1093/plphys/kiab466
196492                10.1155/2022/6192190
196493               10.1631/jzus.b1900070
Name: doi, Length: 196494, dtype: object
196494
10.1016/j.neubiorev.2016.10.031       1
10.1186/s12859-020-03782-1            1
10.1093/bioinformatics/btw304         1
10.1109/tcbb.2018.2865729             1
10.1186/s12859-020-03738-5            1
                                     ..
10.1007/s10969-010-9098-3             1
10.1152/physiolgenomics.00009.2013    1
10.1007/s11033-010-0532-1             1
10.1371/journal.pone.0049463          1
10.1631/jzus.b1900070                 1
Name: doi, Length: 196494, dtype: int64

There appears to be some duplicate nodes in the network. These are probably coming from upstream, maybe in the graph building step?

In [15]:
print(group1[group1['doi'] == '10.1007/s00284-020-02191-0'])

                               doi  pagerank                  field
189484  10.1007/s00284-020-02191-0  0.000001  computational_biology


### Do these duplications appear upstream?

Check graph for duplicate nodes:

In [16]:
with open('../data/networks/computational_biology.pkl', 'rb') as in_file:
    network = pickle.load(in_file)

In [17]:
dois = set()

for node in network.nodes:
    if node not in dois:
        dois.add(node)
    else:
        print(f'{node} is duplicated')

Turns out duplication was from the file loading (there were four of many comp bio dois because they were counted from the first-degree and intrafield graphs for pagerank and betweenness centrality)