# Impact EDA
Exploratory data analysis focused on researcher impact

__See__: 
* https://guides.library.cornell.edu/impact/tracking

In [59]:
import pandas as pd
from matplotlib import pyplot as plt 
import numpy as np
import urllib.request, json 

%matplotlib inline

In [108]:
%%time

# load metadata extracted data in notebook 00_load_metadata
# parse the author_parsed entries into a list of authors for each paper
arxiv_metadata = pd.read_csv('data/arxiv_metadata.zip', converters={"authors_parsed": lambda x:[entry.strip('[]') for entry in  x.split("], ")]})



CPU times: user 13.2 s, sys: 960 ms, total: 14.2 s
Wall time: 14.2 s


## Filter papers
* Keep only papers in date range, as years older than 1993 and younger than 2023 are incomplete,
* Keep only papers about gravitational waves ("gr-qc") and artificial intelligence ("cs.AI")

In [109]:
# keep only papers in this date range, as older years are incomplete
min_year = 1993  # arxiv_metadata.year.min()
max_year = 2023  # arxiv_metadata.year.max()
arxiv_metadata = arxiv_metadata[(arxiv_metadata['year'] >= min_year)]
arxiv_metadata = arxiv_metadata[(arxiv_metadata['year'] <= max_year)]

In [110]:
idx = ['gr-qc' in categories for categories in arxiv_metadata['categories']]
arxiv_grav_waves = arxiv_metadata[idx]
idx = ['cs.AI' in categories for categories in arxiv_metadata['categories']]
arxiv_cs_ai = arxiv_metadata[idx]

In [111]:
# Keep only papers from mathematics and Physics
#idx_math = arxiv_metadata['Mathematics'] == 1
#arxiv_math = arxiv_metadata[idx_math]
#idx_phys = arxiv_metadata['Physics'] == 1
#arxiv_phys = arxiv_metadata[idx_phys]

In [112]:
print(f"Number of papers in artificial intelligence: {arxiv_cs_ai.shape[0]}")
print(f"Number of papers in gravitational waves: {arxiv_grav_waves.shape[0]}")

Number of papers in artificial intelligence: 78510
Number of papers in gravitational waves: 104100


## Filter authors
* Find the 10 most prolific authors in these two subjects

In [113]:
def flatten(xss):
    """Flatten a list of lists"""
    return [x for xs in xss for x in xs]

def get_unique_authors(df):
    """Given a dataframe, return unique authors"""
    authors = flatten(df['authors_parsed'])
    return list(set(authors))

def count_authors(df):
    """Given a dataframe, return count of unique authors"""
    return len(get_unique_authors(df))

In [121]:
unique_authors_cs_ai = get_unique_authors(arxiv_cs_ai)
unique_authors_grav_waves = get_unique_authors(arxiv_grav_waves)

In [123]:
print(f"Number of unique authors in AI: {len(unique_authors_cs_ai)}")
print(f"Number of unique authors in gravitational waves: {len(unique_authors_grav_waves)}")

Number of unique authors in AI: 145559
Number of unique authors in gravitational waves: 73689


In [118]:
def count_papers_per_author(unique_authors, papers_df):
    num_papers = []
    author_counter = 0
    total_authors = len(unique_authors)
    for author in unique_authors:
        author_counter += 1
        if author_counter % 5000 == 0: print(f"Processing {author_counter}/{total_authors} authors")
        papers_count = 0    
        for paper in papers_df['authors_parsed']:
            papers_count += author in paper
        num_papers.append(papers_count)
    return num_papers

In [119]:
%%time

num_papers = count_papers_per_author(unique_authors_grav_waves, arxiv_grav_waves)

Processing 5000/73689 authors
Processing 10000/73689 authors
Processing 15000/73689 authors
Processing 20000/73689 authors
Processing 25000/73689 authors
Processing 30000/73689 authors
Processing 35000/73689 authors
Processing 40000/73689 authors
Processing 45000/73689 authors
Processing 50000/73689 authors
Processing 55000/73689 authors
Processing 60000/73689 authors
Processing 65000/73689 authors
Processing 70000/73689 authors
CPU times: user 37min 57s, sys: 356 ms, total: 37min 57s
Wall time: 37min 57s


In [122]:
papers_count = pd.DataFrame({'author': unique_authors_grav_waves, 'paper_count': num_papers})
papers_count.sort_values('paper_count', ascending=False, inplace=True, ignore_index=True)
papers_count.head(10)

Unnamed: 0,author,paper_count
0,"'Sharif', 'M.', ''",287
1,"'Cardoso', 'Vitor', ''",273
2,"'Iorio', 'Lorenzo', ''",270
3,"'Capozziello', 'Salvatore', ''",257
4,"'Mann', 'Robert B.', ''",246
5,"'Jing', 'Jiliang', ''",239
6,"'Mukohyama', 'Shinji', ''",239
7,"'Wang', 'Anzhong', ''",237
8,"'Cai', 'Rong-Gen', ''",226
9,"'Lobo', 'Francisco S. N.', ''",221


## Compute h-index for each top author

h-index see: https://guides.library.cornell.edu/c.php?g=32272&p=203391

* For each author, get all papers authored
* get DOI of paper
* Get citation count from  https://opencitations.net/
* Using citation count, compute h-index of author and store it in unique_authors_grav_waves

In [148]:
# For each author, get all papers authored
idx = [papers_count.iloc[0]['author'] in paper for paper in arxiv_grav_waves['authors_parsed']]
papers_by_author = arxiv_grav_waves[idx]

In [177]:
# keep only papers with DOI
idx = papers_by_author['doi'].isna()
papers_by_author = papers_by_author[~idx]

In [161]:
doi = papers_by_author.iloc[0]['doi']

Get citation count from https://opencitations.net/

In [194]:
def get_citation_count(doi):
    with urllib.request.urlopen(f"https://opencitations.net/index/api/v2/citation-count/doi:{doi}") as url:
        data = json.load(url)
        if data:
            citation_count = int(data[0]['count'])
        else:
            citation_count = 0
    return citation_count

In [198]:
dois = [doi for doi in papers_by_author['doi']]
citation_counts = []
for doi in dois:
    try:
        citation_counts.append(get_citation_count(doi))
    except:
        citation_counts.append(0)

In [199]:
papers_by_author['citation_count'] = citation_counts

Using citation count, compute h-index of author

In [202]:
def hIndex(citations):
    """
    https://github.com/kamyu104/LeetCode/blob/master/Python/h-index.py
    :type citations: List[int]
    :rtype: int
    
    # Given an array of citations (each citation is a non-negative integer)
    # of a researcher, write a function to compute the researcher's h-index.
    #
    # According to the definition of h-index on Wikipedia: 
    # "A scientist has index h if h of his/her N papers have
    # at least h citations each, and the other N − h papers have
    # no more than h citations each."
    #
    # For example, given citations = [3, 0, 6, 1, 5], 
    # which means the researcher has 5 papers in total
    # and each of them had received 3, 0, 6, 1, 5 citations respectively. 
    # Since the researcher has 3 papers with at least 3 citations each and 
    # the remaining two with no more than 3 citations each, his h-index is 3.
    #
    # Note: If there are several possible values for h, the maximum one is taken as the h-index.
    """
    return sum(x >= i + 1 for i, x in enumerate(sorted(  list(citations), reverse=True)))

In [203]:
hIndex(citation_counts)

38

In [205]:
citation_counts.sort(reverse=True)