# Network EDA
This notebook compares citation networks for various MeSH terms

In [34]:
import pickle as pkl
import sys

import networkx as nx
import pandas as pd
import plotly.express as px
from pubmedpy.efetch import extract_all                                         
from pubmedpy.xml import iter_extract_elems

In [20]:
# Add the sibling ../src directory to the import path so that the local
# build_network module can be imported from this notebook.
sys.path.append('../src')
from build_network import parse_metadata

In [21]:
# Parse the efetch XML dump for the computational biology MeSH term into a
# per-article metadata table, then display it.
comp_bio_metadata = parse_metadata(
    '../data/pubmed/efetch/computational_biology.xml.xz'
)
comp_bio_metadata

Unnamed: 0,pmid,pmcid,doi,journal,journal_nlm_id,title,publication_date,authors
0,1302603,,10.1093/hmg/1.9.663,Hum Mol Genet,9208958,La carte des microsatellites est arrivée! [The...,1992-12,"[{'fore_name': 'J A', 'last_name': 'Todd', 'af..."
1,1303183,,10.1093/hmg/1.3.211,Hum Mol Genet,9208958,Standards for reporting alleles at highly poly...,1992-06,"[{'fore_name': 'N K', 'last_name': 'Spurr', 'a..."
2,1306564,,,Med Sect Proc,9710573,Values and hard choices: challenges for life i...,1992,"[{'fore_name': 'A', 'last_name': 'Caplan', 'af..."
3,1348284,,,J Pharm Pharmacol,0376363,The human genome project. Purpose and potential.,1992-02,"[{'fore_name': 'C T', 'last_name': 'Caskey', '..."
4,1357470,,,Lancet,2985213R,Another milestone in the human genome race.,1992-10-31,[]
...,...,...,...,...,...,...,...,...
220373,35325416,,10.1007/978-1-0716-1875-2_4,Methods Mol Biol,9214969,In Silico Methods for the Identification of Vi...,2022,"[{'fore_name': 'Aditya', 'last_name': 'Narayan..."
220374,35325420,,10.1007/978-1-0716-1875-2_8,Methods Mol Biol,9214969,RTBV-Based VIGS Vector for Functional Genomics...,2022,"[{'fore_name': 'Gaurav', 'last_name': 'Kumar',..."
220375,35325422,,10.1007/978-1-0716-1875-2_10,Methods Mol Biol,9214969,Virus-Induced Gene Silencing for Functional Ge...,2022,"[{'fore_name': 'Dikki Pedenla', 'last_name': '..."
220376,35325428,,10.1007/978-1-0716-1875-2_17,Methods Mol Biol,9214969,An Integrated Bioinformatics and Functional Ap...,2022,"[{'fore_name': 'Sombir', 'last_name': 'Rao', '..."


In [24]:
# Unique DOIs of the computational biology articles, used later to pick these
# articles out of the larger citation network.
# Missing DOIs are dropped before building the set: dropna() covers both None
# and NaN placeholders, and unlike set.remove(None) it does not raise KeyError
# when every record happens to have a DOI.
comp_bio_dois = set(comp_bio_metadata['doi'].dropna())

In [2]:
# Load the pickled computational biology citation network from disk.
# NOTE(review): pickle.load can execute arbitrary code while deserializing, so
# this file must come from a trusted source (presumably this project's own
# network-building step — confirm).
with open('../data/networks/computational_biology.pkl', 'rb') as in_file:
    comp_bio_network = pkl.load(in_file)

In [3]:
# Display the object's repr to confirm what type was unpickled.
comp_bio_network

<networkx.classes.digraph.DiGraph at 0x7f81a51007f0>

In [44]:
def in_degree(row, network):
    """Return the in-degree of the node identified by ``row['doi']``.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the key ``'doi'`` (e.g. a DataFrame row).
    network : graph-like
        Object exposing ``in_degree(node)``.

    Returns
    -------
    Whatever ``network.in_degree`` returns for that DOI.
    """
    doi = row['doi']
    return network.in_degree(doi)
def out_degree(row, network):
    """Return the out-degree of the node identified by ``row['doi']``.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the key ``'doi'`` (e.g. a DataFrame row).
    network : graph-like
        Object exposing ``out_degree(node)``.

    Returns
    -------
    Whatever ``network.out_degree`` returns for that DOI.
    """
    doi = row['doi']
    return network.out_degree(doi)

### Subsetting note
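Each network is constructed to include every article that has a computational biology MeSH heading, plus every article it cites and every article that cites it. As a result, the network also contains articles that are not themselves computational biology papers, so to look at the degree distribution of comp bio papers we need to select the subset of nodes whose DOIs appear in the comp bio metadata.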

In [None]:
# Keep only the (DOI, total degree) pairs whose node is one of the
# computational biology articles, then tabulate them.
degrees = [
    (node, degree)
    for node, degree in comp_bio_network.degree()
    if node in comp_bio_dois
]
degree_df = pd.DataFrame(degrees, columns=['doi', 'degree'])

In [50]:
# Add per-article citation counts: in-degree is stored as the number of
# articles citing the paper, out-degree as the number of articles it cites.
# DataFrame.apply forwards extra keyword arguments to the applied function, so
# the helpers are passed directly instead of through a reassigned `tmp` lambda.
# NOTE(review): the saved output of this cell also shows `in_degree` and
# `out_degree` columns that no visible cell creates — presumably left over
# from an earlier run; confirm with Restart & Run All.
degree_df['articles_citing'] = degree_df.apply(in_degree, axis=1, network=comp_bio_network)
degree_df['articles_cited'] = degree_df.apply(out_degree, axis=1, network=comp_bio_network)
degree_df

Unnamed: 0,doi,degree,in_degree,out_degree,articles_citing,articles_cited
0,10.1186/1471-2105-10-114,78,37,41,37,41
1,10.1038/mp.2008.11,224,146,78,146,78
2,10.1038/mp.2008.25,359,278,81,278,81
3,10.1055/s-0029-1216356,146,146,0,146,0
4,10.1016/j.neuint.2010.03.002,79,17,62,17,62
...,...,...,...,...,...,...
212756,10.1007/978-3-030-15950-4_28,123,0,123,0,123
212757,10.1007/978-3-030-15950-4_7,186,0,186,0,186
212758,10.7150/thno.61832,1,1,0,1,0
212759,10.21873/invivo.12176,1,1,0,1,0


In [59]:
# Distribution of citations received per paper; the count axis is log-scaled.
# Passing the frame plus a column name (rather than a bare Series) lets the
# x-axis carry a descriptive label instead of the generic wide-form "value".
px.histogram(
    degree_df,
    x='articles_citing',
    log_y=True,
    labels={'articles_citing': 'Number of citing articles'},
    title='The distribution of citations received in computational biology papers',
)

In [60]:
# Distribution of the number of articles each paper cites.
# Passing the frame plus a column name (rather than a bare Series) lets the
# x-axis carry a descriptive label instead of the generic wide-form "value".
px.histogram(
    degree_df,
    x='articles_cited',
    labels={'articles_cited': 'Number of articles cited'},
    title='The distribution of citations given in computational biology papers',
)

### Checking outliers

The articles that cite an extremely large number of other articles (more than 1,000) are reviews.

In [54]:
# Papers that cite more than 1000 other articles.
degree_df.loc[degree_df['articles_cited'].gt(1000)]

Unnamed: 0,doi,degree,in_degree,out_degree,articles_citing,articles_cited
64,10.1007/s00204-010-0577-x,1857,262,1595,262,1595
299,10.1152/physrev.00035.2008,2236,1167,1069,1167,1069
355,10.1152/physrev.00032.2011,2130,750,1380,750,1380
3551,10.1021/cr300073p,1208,175,1033,175,1033
18614,10.1021/acs.chemrev.8b00538,1703,121,1582,121,1582


The articles that receive the most citations (more than 25,000 citing articles) are largely methods papers, though one of the most cited is actually a historical commentary.

In [57]:
# Papers cited by more than 25000 other articles.
degree_df.loc[degree_df['articles_citing'].gt(25000)]

Unnamed: 0,doi,degree,in_degree,out_degree,articles_citing,articles_cited
870,10.1038/nmeth.2089,30706,30696,10,30696,10
954,10.1038/nmeth.2019,28468,28430,38,28430,38
1284,10.1093/bioinformatics/btp352,32339,32336,3,32336,3
1363,10.1186/s13059-014-0550-8,28504,28453,51,28453,51
1900,10.1093/bioinformatics/btp324,26222,26208,14,26208,14
1934,10.1093/bioinformatics/btu170,25770,25760,10,25760,10
2622,10.1038/nmeth.1923,25750,25738,12,25738,12
2875,10.1093/molbev/msr121,30510,30481,29,30481,29


In [61]:
# Papers with no outgoing citation edges recorded in the network.
degree_df.loc[degree_df['articles_cited'].eq(0)]

Unnamed: 0,doi,degree,in_degree,out_degree,articles_citing,articles_cited
3,10.1055/s-0029-1216356,146,146,0,146,0
5,10.1523/jneurosci.4724-06.2007,89,89,0,89,0
17,10.1523/jneurosci.2499-09.2009,82,82,0,82,0
23,10.1136/jmedgenet-2011-100242,83,83,0,83,0
31,10.1101/cshperspect.a012047,49,49,0,49,0
...,...,...,...,...,...,...
212751,10.1385/1-59259-273-2:193,1,1,0,1,0
212753,10.1109/tcbb.2017.2779141,1,1,0,1,0
212758,10.7150/thno.61832,1,1,0,1,0
212759,10.21873/invivo.12176,1,1,0,1,0


In [62]:
# TODO figure out whether the no-citation-info papers are journal specific/can be safely removed