# CORD-19 AuthorRank

An example of how AuthorRank can be applied to datasets that contain authorship information about documents, like the [CORD-19](https://www.semanticscholar.org/cord19) dataset. 

## Imports

In [25]:
import author_rank as ar
import json
import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
import random

## Read Data

In [2]:
cord_df = pd.read_csv("../data/CORD-19/2020-07-16/metadata.csv", low_memory=False)

In [3]:
cord_df.sample(frac=1.).head(10)

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id
154704,iqswl5kh,,Elsevier; Medline; PMC,Morphology and morphogenesis of a coronavirus ...,10.1016/0014-4800(76)90045-9,PMC7130198,187445.0,els-covid,Abstract The morphology and morphogenesis of v...,1976-12-31,"Doughri, A. M.; Storz, J.; Hajer, I.; Fernando...",Experimental and Molecular Pathology,,,,,,https://api.elsevier.com/content/article/pii/0...,272186.0
191408,097nifny,308576d6375abb65564b6dce15c7e3d5d014cb35,Elsevier; PMC,Enfermedad pulmonar obstructiva crónica e infe...,10.1016/j.arbres.2020.04.016,PMC7218399,,els-covid,,2020-05-13,"Gonçalves, Juan Marco Figueira; Golpe, Rafael;...",Arch Bronconeumol,,,,document_parses/pdf_json/308576d6375abb65564b6...,,https://api.elsevier.com/content/article/pii/S...,218596478.0
66502,56adb61b,,Medline,Severe acute respiratory syndrome coronavirus ...,10.1128/jvi.02406-07,,18448520.0,unk,The severe acute respiratory syndrome coronavi...,2008,"Hussain, Snawar; Perlman, Stanley; Gallagher, ...",Journal of virology,,,,,,https://doi.org/10.1128/jvi.02406-07; https://...,8483380.0
178017,fv7a9je6,b24a645b549461375cdb883c57f720e0c3c6f399,Elsevier; Medline; PMC,Going global – Travel and the 2019 novel coron...,10.1016/j.tmaid.2020.101578,PMC7128681,32044389.0,els-covid,,2020-02-29,"Rodríguez-Morales, Alfonso J.; MacGregor, Kirs...",Travel Medicine and Infectious Disease,,,,document_parses/pdf_json/b24a645b549461375cdb8...,document_parses/pmc_json/PMC7128681.xml.json,https://www.sciencedirect.com/science/article/...,211080199.0
140319,8rd412et,,Medline; WHO,Scathing COVID-19 book from Lancet editor - ru...,10.1038/d41586-020-01839-y,,32555392.0,unk,,2020,"Buranyi, Stephen",Nature,,#606953,,,,https://doi.org/10.1038/d41586-020-01839-y; ht...,219729961.0
41575,mbvmy989,,Medline,Extending the Theory of Normative Social Behav...,10.1080/10410236.2018.1461586,,29634374.0,unk,The current study tests the predictions of the...,2018,"Chung, Minwoong; Lapinski, Maria Knight",Health communication,,,,,,https://doi.org/10.1080/10410236.2018.1461586;...,205699229.0
190104,38lbgx25,4eb7174618f76ee75921a294ba3853fd5e8c3914,Medline; PMC; WHO,Early-Morning vs Spot Posterior Oropharyngeal ...,10.1093/ofid/ofaa210,PMC7299521,32577428.0,cc-by-nc-nd,BACKGROUND: Posterior oropharyngeal saliva is ...,2020-06-07,"Hung, Derek Ling-Lung; Li, Xin; Chiu, Kelvin H...",Open Forum Infect Dis,,#616714,,document_parses/pdf_json/4eb7174618f76ee75921a...,document_parses/pmc_json/PMC7299521.xml.json,https://doi.org/10.1093/ofid/ofaa210; https://...,219910486.0
121609,00qk10im,,WHO,Prevention and treatment of COVID-19 disease b...,,,,unk,The recent outbreak of coronavirus disease 201...,2020,"Schijns, Virgil; Lavelle, Ed C",Eur J Immunol,,#326966,,,,,218834399.0
27441,72uvawth,f5e08adb6fc82c9d614bab5411036890accc2f3f,PMC,The spectrum of pathological findings in coron...,10.1186/s13000-020-00999-9,PMC7359764,,cc-by,,2020-07-14,"Barth, Rolf F.; Buja, L. Maximillian; Parwani,...",Diagn Pathol,,,,document_parses/pdf_json/f5e08adb6fc82c9d614ba...,document_parses/pmc_json/PMC7359764.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,
54171,lh329mdi,,Medline,Chimeric feline coronaviruses that encode type...,10.1128/jvi.01568-09,,19906918.0,unk,Persistent infection of domestic cats with fel...,2010,"Tekes, Gergely; Hofmann-Lehmann, Regina; Bank-...",Journal of virology,,,,,,https://doi.org/10.1128/jvi.01568-09; https://...,23126066.0


In [4]:
cord_df.shape

(192509, 19)

### Subsetting the Data 

Facilitate a "search" by restricting to documents that feature the word "bronchiolitis".

In [5]:
cord_df_search = cord_df[cord_df["title"].astype(str).str.contains("bronchiolitis")]

In [6]:
cord_df_search.shape

(138, 19)

In [7]:
authors_by_document = cord_df_search["authors"].astype(str).apply(
    lambda row: [r.strip() for r in row.split(";")]
)

In [8]:
documents = list()
for doc in authors_by_document:
    doc_dict = {
        "authors": list()
    }
    for auth in doc:
        doc_dict["authors"].append(
            {"name": auth} # cord 19 has full name as represented on document
        )
    documents.append(doc_dict)

## Fit AuthorRank

In [9]:
# create an AuthorRank object
ar_graph = ar.Graph()

In [10]:
documents[0:10]

[{'authors': [{'name': 'Liet, Jean-Michel'},
   {'name': 'Dejode, Jean-Marc'},
   {'name': 'Joram, Nicolas'},
   {'name': 'Roux, Bénédicte Gaillard-Le'},
   {'name': 'Bétrémieux, Pierre'},
   {'name': 'Rozé, Jean-Christophe'}]},
 {'authors': [{'name': 'Cruces, Pablo'},
   {'name': 'González-Dambrauskas, Sebastián'},
   {'name': 'Quilodrán, Julio'},
   {'name': 'Valenzuela, Jorge'},
   {'name': 'Martínez, Javier'},
   {'name': 'Rivero, Natalia'},
   {'name': 'Arias, Pablo'},
   {'name': 'Díaz, Franco'}]},
 {'authors': [{'name': 'Robinson, Lacey B.'},
   {'name': 'Chen Arroyo, Anna J.'},
   {'name': 'Dantas, Marina A.S.'},
   {'name': 'Espinola, Janice A.'},
   {'name': 'Sullivan, Ashley F.'},
   {'name': 'Camargo, Carlos A.'}]},
 {'authors': [{'name': 'Bressan, Silvia'},
   {'name': 'Balzani, Marco'},
   {'name': 'Krauss, Baruch'},
   {'name': 'Pettenazzo, Andrea'},
   {'name': 'Zanconato, Stefania'},
   {'name': 'Baraldi, Eugenio'}]},
 {'authors': [{'name': 'Patel, N. R.'},
   {'name':

In [40]:
# fit to the data
ar_graph.fit(
    documents=random.sample(documents, 25), # limit to a small number of documents
    progress_bar=True, # use a progress bar to indicate how far along processing is
    authorship_key="authors",
    keys=set(["name"]),
)

### Show the Scores

In [None]:
# get the top authors for a set of documents
top = ar_graph.top_authors(
    normalize_scores=True,
    n=10
)

In [None]:
# print the results
for i, j in zip(top[0], top[1]):
    print(i, j)

## Visualize

In [None]:
G = ar_graph.graph

In [None]:
plt.figure(figsize=(20,10))
plt.axis('off')
pos = nx.shell_layout(G)
edgewidth = [d['weight'] for (u,v,d) in G.edges(data=True)]
edgewidth = [d for d in edgewidth]
author_scores = list()
for i in G.nodes:
    try:
        index = top[0].index(i)
        author_scores.append(top[1][index])
    except ValueError:
        pass
        
nx.draw_networkx_nodes(G, pos, node_size=2)
nx.draw_networkx_edges(G, pos, width=edgewidth, edge_color="grey")
nx.draw_networkx_labels(G, pos=pos)

plt.show()

In [None]:
G_json = ar_graph.as_json()

In [26]:
with open("../visualization/data/cord_graph.json", 'w') as f_out:
    json.dump(G_json, f_out)

In [None]:
scores_json = dict()
for t in zip(top[0], top[1]):
    scores_json[" ".join(t[0])] = t[1]
scores_json

In [28]:
with open("../visualization/data/cord_scores.json", 'w') as f_out:
    json.dump(scores_json, f_out)