In [None]:
from collections import Counter
from pathlib import Path

import matplotlib as mpl
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
from bokeh.plotting import figure, show, from_networkx
from bokeh.transform import linear_cmap
from tqdm.notebook import tqdm

In [None]:
# Get an API key from https://ui.adsabs.harvard.edu/user/settings/token
# and store it as plain text in ~/.ads/dev_key.
import ads

# `ads.config.token` must hold the token string itself, not the path of the
# file that stores it, so read the key file instead of assigning its path.
# When the file is missing the token is left unset, which lets the `ads`
# package use its own lookup (per its README, e.g. the ADS_DEV_KEY variable).
ADS_TOKEN_FILE = Path('~/.ads/dev_key').expanduser()
if ADS_TOKEN_FILE.is_file():
    ads.config.token = ADS_TOKEN_FILE.read_text().strip()

# Rate-limit tracker for SearchQuery requests; displayed in the last cell.
r = ads.RateLimits('SearchQuery')

In [None]:
# Seed the collection with the most-cited papers that cite SDO
# (bibcode 2012SoPh..275....3P); the query restricts them to years 2020-2025.
result = list(
    ads.SearchQuery(
        q="citations(bibcode:2012SoPh..275....3P) year:2020-2025",
        sort='citation_count',
        max_pages=10,
        fl=['id', 'bibcode', 'doi', 'title', 'citation_count', 'reference', 'citation', 'first_author', 'author'],
    )
)

# Lookup tables keyed by bibcode. A reference/citation value of None is
# stored as an empty set; anything else is converted to a set of bibcodes.
reference = {
    paper.bibcode: set() if paper.reference is None else set(paper.reference)
    for paper in result
}
citations = {
    paper.bibcode: set() if paper.citation is None else set(paper.citation)
    for paper in result
}
titles = {paper.bibcode: paper.title for paper in result}
authors = {paper.bibcode: paper.author for paper in result}

In [None]:
# Grow the collection iteratively. Each pass ranks the bibcodes that appear in
# the `citation` sets of collected papers but are not collected themselves, by
# how many collected papers list them, then fetches the top-ranked batch.
# NOTE(review): the ranking is built from the `citation` field rather than
# `reference`; confirm this is the intended direction of expansion.

# ADS gets mad sometimes if you try to pull too many bibcodes at once, so a
# full batch is tried first and a smaller one on failure. Both sizes are
# empirical, and the smaller request can still fail.
BATCH_SIZE = 50
RETRY_BATCH_SIZE = 30
EXPANSION_FIELDS = ['id', 'bibcode', 'doi', 'title', 'citation_count', 'reference', 'citation', 'author']

for depth in tqdm(range(100)):
    # For every not-yet-collected bibcode, count how many collected papers
    # list it. Each `citations` value is a set, so a paper counts at most once.
    frontier_counts = Counter(
        bibcode
        for citing in citations.values()
        for bibcode in citing
        if bibcode not in citations
    )
    if not frontier_counts:
        # Nothing left to expand into; an empty query would only produce an API error.
        break

    # Highest count first; ties broken by bibcode so that reruns are reproducible
    # (set iteration order for strings varies between interpreter sessions).
    frontier = sorted(frontier_counts, key=lambda b: (-frontier_counts[b], b))

    try:
        result = list(ads.SearchQuery(q=' or '.join(frontier[:BATCH_SIZE]),
                                      sort='citation_count', fl=EXPANSION_FIELDS))
    except Exception:
        # Retry once with fewer bibcodes. `Exception` (not a bare except) so that
        # interrupting the kernel still stops the loop. A second failure propagates.
        result = list(ads.SearchQuery(q=' or '.join(frontier[:RETRY_BATCH_SIZE]),
                                      sort='citation_count', fl=EXPANSION_FIELDS))

    # Merge the fetched papers into the lookup tables; None lists become empty sets.
    for p in result:
        reference[p.bibcode] = set() if p.reference is None else set(p.reference)
        citations[p.bibcode] = set() if p.citation is None else set(p.citation)
        titles[p.bibcode] = p.title
        authors[p.bibcode] = p.author

In [None]:
# Build the subgraph used for plotting. A bibcode becomes a node only if it is
# in the collection and appears in some collected paper's `citations` set; its
# neighbours are limited to collected bibcodes, so no edge leaves the collection.
collected = set(citations.keys())
orphanless = {}
for citing_set in citations.values():
    for k in citing_set:
        if k not in collected or k in orphanless:
            continue
        orphanless[k] = citations[k].intersection(collected)

# Size of the plotted subgraph, then size of the full collection.
print(len(orphanless), len(citations), sep='\n')

In [None]:
# visualization -- very preliminary
# Draws the closed citation subgraph with Bokeh. Marker size grows with the
# size of `orphanless[p]`; marker colour encodes the length of `reference[p]`.

BASE_SIZE = 5   # marker size of a node whose `orphanless` entry is empty
scale = 0.1     # extra marker size per entry in `orphanless[p]`

data = nx.Graph(orphanless)
pos = nx.kamada_kawai_layout(data)
nx.set_node_attributes(data, titles, 'titles')
plot = figure(tools="hover", tooltips="@index: @titles")

# `pos` is a precomputed layout dict. Extra keyword arguments to from_networkx
# are only forwarded to a callable layout function, so none are passed here.
graph = from_networkx(data, pos)

# Build the per-node columns in the renderer's own node order, so sizes and
# colours stay aligned with the nodes regardless of dict ordering.
node_index = graph.node_renderer.data_source.data['index']
sizes = [BASE_SIZE + scale * float(len(orphanless.get(p, ()))) for p in node_index]
colors = [float(len(reference.get(p, ()))) for p in node_index]
# min()/max() raise on an empty sequence; use a dummy range for an empty graph.
color_low, color_high = (min(colors), max(colors)) if colors else (0.0, 1.0)

graph.node_renderer.data_source.data['sizes'] = sizes
graph.node_renderer.data_source.data['colors'] = colors
graph.node_renderer.glyph.update(size='sizes', fill_color=linear_cmap('colors', 'GnBu8', color_low, color_high))
graph.edge_renderer.glyph.update(line_alpha=0.1, line_width=1)
plot.renderers.append(graph)

show(plot)

In [None]:
# Display the rate-limit information held by `r`, the
# ads.RateLimits('SearchQuery') object created in the configuration cell.
r.limits