In [10]:
import numpy as np
import pandas as pd
import seaborn as sns
from Bio import Entrez
from pprint import pprint
import collections

Entrez.email = 'bdo311@gmail.com'

In [3]:
# Code to take only the relevant fields from the PubMed XML summary
# from https://github.com/bdo311/creativity/blob/master/code/download_data.ipynb

def simplify(info):
    record = {}
    article = info['MedlineCitation']['Article']

    record["pmid"] = int(info['MedlineCitation']['PMID'])
    record["title"] = article['ArticleTitle']
    try: record["journal"] = article['Journal']['Title']
    except: record["journal"] = ""
    try: record["jabbr"] = article['Journal']['ISOAbbreviation']
    except: record["jabbr"] = ""
    #record["year"] = article['Journal']['JournalIssue']['PubDate']['Year']
    record["yr"] = dict(info['PubmedData']['History'][0])['Year']
    record["mo"] = dict(info['PubmedData']['History'][0])['Month']
    #record["type"] = article["PublicationTypeList"]

    record["keywords"] = info['MedlineCitation']['KeywordList']
    try:
        record["authors"] = []
        for au in article["AuthorList"]:
            record["authors"].append([au["LastName"], au["ForeName"], au["Initials"], au["AffiliationInfo"]])
    except: pass
    return record

In [50]:
def get_paper_info(pmids):
    handle = Entrez.efetch(db="pubmed", id=','.join(pmids), retmode="xml")
    records = Entrez.read(handle)['PubmedArticle']
    return [simplify(record) for record in records]

In [188]:
# Testing on a single author

handle = Entrez.esearch(db="pubmed", term="Regev Aviv[au]", retmode="xml", retmax=10000)
pmids = Entrez.read(handle)['IdList']
print "Number of papers:", len(pmids)

papers = get_paper_info(pmids)
print papers[-1]

Number of papers: 164
{'mo': '8', 'title': 'Minreg: inferring an active regulator set.', 'jabbr': 'Bioinformatics', 'authors': [["Pe'er", 'Dana', 'D', [{u'Affiliation': 'School of Computer Science & Engineering, Hebrew University of Jerusalem.', u'Identifier': []}]], ['Regev', 'Aviv', 'A', []], ['Tanay', 'Amos', 'A', []]], 'keywords': [], 'pmid': 12169555, 'yr': '2002', 'journal': 'Bioinformatics (Oxford, England)'}


In [189]:
# make a dictionary of co-authors to their papers

author_to_papers = collections.defaultdict(lambda: [])
for paper in papers:
    pmid = paper['pmid']
    num_auth = len(paper["authors"])

    if num_auth == 0 or paper['yr'] == "": continue
#         try: all_info[pmid] = (num_auth, int(rec['yr']), rec['journal'], citation_counts.get(pmid, 0))
#         except: continue

    for i in range(len(paper["authors"])):
        a = paper["authors"][i]
        au_name = ', '.join([a[0], a[2]])
        author_to_papers[au_name].append((pmid, i+1, num_auth, int(paper['yr'])))
        #author_to_pmids[','.join(a[:2])].append((pmid, i+1, num_auth))

In [190]:
# Assumptions: 
# collaborators have a median position at >75% of the author list
# most people will have at least 3 papers
# problem: what if some people start off as students and then become collaborators?

au_to_info_list = []
for au in author_to_papers:
    papers = author_to_papers[au]
    if len(papers) < 3: continue
    median_pos = np.median([float(x[1])/x[2] for x in papers])  # pos divided by total authors
    num_first = [x[1] for x in papers].count(1)
    
    start_yr = min([x[3] for x in papers])
    end_yr = max([x[3] for x in papers])
    if median_pos < 0.75:
        au_to_info_list.append([au, 'student', start_yr, end_yr, len(papers), num_first, median_pos])
    else:
        au_to_info_list.append([au, 'collab', start_yr, end_yr, len(papers), num_first, median_pos])
        
# remove the person with the most papers -- the same person we're searching
# this assumption doesn't hold if all papers are with the same person e.g. only one PI
same_au = max(au_to_info_list, key=lambda x: x[4])[0]

# make data frame
au_to_info = pd.DataFrame.from_records(au_to_info_list, \
                                       columns=["name", "position", "start", "end", \
                                                "numpapers", "num_first", "median_pos"])
#au_to_info = au_to_info[au_to_info["name"] != same_au]  # removing same_au
au_to_info.head()

Unnamed: 0,name,position,start,end,numpapers,num_first,median_pos
0,"Parnas, O",student,2014,2016,4,1,0.252976
1,"Schier, AF",collab,2009,2014,5,0,0.833333
2,"Carr, SA",collab,2011,2015,7,0,0.866667
3,"Fan, L",student,2010,2011,5,0,0.4
4,"Cahill, DP",student,2013,2016,5,0,0.666667


In [191]:
au_to_info.sort_values("numpapers", ascending=False)

Unnamed: 0,name,position,start,end,numpapers,num_first,median_pos
86,"Regev, A",collab,2002,2017,163,1,0.900000
80,"Hacohen, N",collab,2008,2016,26,0,0.811012
7,"Friedman, N",collab,2003,2016,23,0,0.857143
14,"Yosef, N",student,2010,2016,21,3,0.320000
66,"Satija, R",student,2012,2016,19,1,0.521739
134,"Shalek, AK",student,2011,2016,18,3,0.419048
71,"Thompson, DA",student,2009,2015,18,3,0.535714
111,"Amit, I",student,2008,2016,15,3,0.687500
89,"Wapinski, I",student,2007,2013,13,4,0.250000
133,"Garber, M",student,2008,2015,12,1,0.250980
