In [1]:
import os
import json
import numpy as np
import pandas as pd

In [2]:
# Names of the dataset folders to collect files from; a discovered path is kept
# only if one of these names appears as a '/'-separated component of it.
folders = ['custom_license','noncomm_use_subset','biorxiv_medrxiv','comm_use_subset']

In [3]:
# Walk the working directory bottom-up and record the path of every file found.
all_file_paths = [
    os.path.join(dirpath, filename)
    for dirpath, _subdirs, filenames in os.walk(".", topdown=False)
    for filename in filenames
]
# Keep only paths that have one of the dataset folder names as a component.
# Components are obtained by splitting on '/', i.e. POSIX-style separators.
dataset_file_paths = [
    candidate
    for candidate in all_file_paths
    if any(folder in candidate.split('/') for folder in folders)
]
# Drop the first character (the "." that os.walk(".") puts at the front) so
# each path starts at the separator and can be appended to os.getcwd().
path_papers = [candidate[1:] for candidate in dataset_file_paths]

In [5]:
# Group the collected paths by folder: {folder name: [paths in that folder]}.
dict_paper_paths = {}
for paper_path in path_papers:
    # Element 1 of the '/'-split path is used as the grouping key (element 0
    # is whatever precedes the first separator).
    folder_key = paper_path.split("/")[1]
    # setdefault creates the list the first time a key is seen.
    dict_paper_paths.setdefault(folder_key, []).append(paper_path)


In [6]:
# Report how many paper paths were grouped under each folder.
for folder_name, folder_paths in dict_paper_paths.items():
    print('Papers for ',folder_name, 'is ',len(folder_paths))

Papers for  custom_license is  16959
Papers for  noncomm_use_subset is  2353
Papers for  biorxiv_medrxiv is  885
Papers for  comm_use_subset is  9118


In [7]:
# Build the location of one sample paper: the first comm_use_subset path
# appended to the current working directory.
sample_relative_path = dict_paper_paths['comm_use_subset'][0]
fname = os.getcwd() + sample_relative_path
fname

'/Users/janmichaelaustria/Google Drive/UNH Spring/ML_Part2/Covid_19/comm_use_subset/5e0c586f047ff909c8ed3fe171c8975a90608d08.json'

In [8]:
# Load the sample paper and show its top-level keys.
# The file is opened as UTF-8 explicitly so that parsing does not depend on
# the platform's default text encoding.
with open(fname, encoding='utf-8') as f:
    data = json.load(f)

data.keys()

dict_keys(['paper_id', 'metadata', 'abstract', 'body_text', 'bib_entries', 'ref_entries', 'back_matter'])

In [9]:
def pull_paper_id(file):
    """Return the value stored under 'paper_id' in a parsed paper dict.

    `file` is the dict obtained from loading one paper's JSON.
    """
    paper_id = file['paper_id']
    return paper_id

def pull_title(file):
    """Return the paper title, read from file['metadata']['title'].

    `file` is the dict obtained from loading one paper's JSON.
    """
    metadata = file['metadata']
    return metadata['title']

def pull_authors(file):
    """Return the paper's authors as a list of "first last" strings.

    Each entry of file['metadata']['authors'] is a dict; its 'first' and
    'last' values are joined with a single space. The original author order
    is kept, and a paper with no authors yields an empty list.
    """
    return [
        author['first'] + " " + author['last']
        for author in file['metadata']['authors']
    ]

def pull_abstracts(file):
    """Return the abstract's text pieces, or NaN when there is no abstract.

    file['abstract'] is a list of entries. If that list is empty, np.nan is
    returned. Otherwise the result is a list holding the 'text' value of
    every entry that has one, in order; entries without a 'text' key are
    skipped.
    """
    abstract_entries = file['abstract']
    # Guard clause: an empty list is reported as a missing value.
    if len(abstract_entries) == 0:
        return np.nan
    return [entry['text'] for entry in abstract_entries if 'text' in entry]

def pull_texts(file):
    """Return all text pieces of a paper: body first, then back matter.

    Collects the 'text' value of every entry in file['body_text'] and then of
    every entry in file['back_matter'], preserving order within each list.
    Entries that have no 'text' key are skipped.
    """
    # Main body of the paper.
    texts = [section['text'] for section in file['body_text'] if 'text' in section]
    # Back-matter text is appended after the body text.
    texts.extend(
        section['text'] for section in file['back_matter'] if 'text' in section
    )
    return texts

def pull_citations(file):
    """Return the paper's citations as a list of (title, year) tuples.

    file['bib_entries'] maps a reference key to an entry dict; each entry's
    'title' and 'year' values are paired up, in the mapping's iteration
    order. Title and year together are meant to be enough to look a
    citation up later.
    """
    return [
        (entry['title'], entry['year'])
        for entry in file['bib_entries'].values()
    ]
        
    

All functions work!

## Apply the extraction functions to all JSON files

In [10]:
# Parallel lists: position k in every list describes the same paper.
paper_types = []
paper_ids = []
paper_titles = []
paper_authors = []
paper_abstracts = []
paper_texts = []
paper_citations = []

# The working directory does not change during the loop, so look it up once.
working_dir = os.getcwd()

for paper_type, list_paths in dict_paper_paths.items():
    for relative_path in list_paths:
        # Each stored path is appended directly to the working directory.
        fname = working_dir + relative_path
        # Open as UTF-8 explicitly so that parsing does not depend on the
        # platform's default text encoding.
        with open(fname, encoding='utf-8') as f:
            data = json.load(f)
        # Extract every field before appending anything, so that a failure
        # part-way through a paper cannot leave the lists with unequal lengths.
        ID = pull_paper_id(data)
        title = pull_title(data)
        authors = pull_authors(data)
        abstract = pull_abstracts(data)
        text = pull_texts(data)
        citations = pull_citations(data)
        # Record one entry per list for this paper.
        paper_types.append(paper_type)
        paper_ids.append(ID)
        paper_titles.append(title)
        paper_authors.append(authors)
        paper_abstracts.append(abstract)
        paper_texts.append(text)
        paper_citations.append(citations)
        

In [11]:
len(paper_texts)

29315

## Put everything into a DataFrame

In [12]:
# Assemble the per-paper lists into one table, one row per paper.
# The dict's key order fixes the column order.
papers = pd.DataFrame({
    'Library': paper_types,
    'ID': paper_ids,
    'Titles': paper_titles,
    'Authors': paper_authors,
    'Abstract': paper_abstracts,
    'Text': paper_texts,
    'Citations': paper_citations,
})

In [13]:
# Dimensions of the compiled table as (rows, columns).
papers.shape

(29315, 7)

In [14]:
# Per-column dtypes and non-null counts. Abstract can contain NaN because
# pull_abstracts returns NaN for a paper whose abstract list is empty.
papers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29315 entries, 0 to 29314
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Library    29315 non-null  object
 1   ID         29315 non-null  object
 2   Titles     29315 non-null  object
 3   Authors    29315 non-null  object
 4   Abstract   21264 non-null  object
 5   Text       29315 non-null  object
 6   Citations  29315 non-null  object
dtypes: object(7)
memory usage: 1.6+ MB


In [16]:
# Preview the first rows of the compiled table.
papers.head()

Unnamed: 0,Library,ID,Titles,Authors,Abstract,Text,Citations
0,custom_license,ab680d5dbc4f51252da3473109a7885dd6b5eb6f,Evolutionary Medicine IV. Evolution and Emerge...,[S Scarpino],,[The evolutionary history of humans is charact...,[(Rapid evolutionary dynamics and disease thre...
1,custom_license,6599ebbef3d868afac9daa4f80fa075675cf03bc,International aviation emissions to 2025: Can ...,"[Andrew Macintosh, Lailey Wallace]","[International aviation is growing rapidly, re...","[Sixty years ago, civil aviation was an infant...",[(CONSAVE 2050-constrained scenarios on aviati...
2,custom_license,eb5c7f3ff921ad6469b79cc8a3c122648204ece4,2 Mechanisms of diarrhoea,"[I Booth, A Mcneish Baillikre&apos;]",,[Acute infections of the gastrointestinal trac...,[(A typing scheme for Aeromonas hydrophila bas...
3,custom_license,b87b790c96c75faa22a085cb560f7b3d8e018b24,Features of Host Cells: Cellular and Molecular...,[],,"[There are three domains of life-Bacteria, Arc...",[(Epidemics to eradication: the modern history...
4,custom_license,68c0bb1989b6ca2b38da32a0d992027db39f80bc,Spring 2020 | 1 Beijing's Hard and Soft Repres...,[Victoria Tin-Bor Hui],[Hong Kong's new Police Commissioner Chris Tan...,"[It is also noteworthy that Tang, who was once...",[(Listed companies appoint mainlanders to boar...


In [17]:
# Save the compiled table as a CSV file in the working directory.
# NOTE(review): Authors, Abstract, Text and Citations hold Python lists, which
# CSV stores as their string form, so reading the file back gives strings
# rather than lists. Confirm that is acceptable for downstream use.
papers.to_csv('covid19_papers_compiled.csv')

In [18]:
# Spot-check one complete record: the row at position 10.
papers.iloc[10]

Library                                         custom_license
ID                    3db98d76b1a91a3940955132dc9fccc95d1600c5
Titles         Bali Bombings: A Whole of Government Response 1
Authors                               [Jeff Roach, Ian Kemish]
Abstract                                                   NaN
Text         [which provide services to the community. Loca...
Citations    [(Restoring tourism destinations in crisis: A ...
Name: 10, dtype: object

## Compare against the metadata

In [19]:
# Load the metadata table; the file name is resolved relative to the working
# directory.
meta = pd.read_csv('metadata.csv')

In [21]:
# Dimensions of the metadata table as (rows, columns).
meta.shape

(44220, 15)

## Common papers between meta and all papers

In [31]:
# Count the distinct non-null metadata hashes that also appear as a parsed
# paper ID.
# NOTE(review): each `sha` cell is compared as one whole string. Confirm that
# a cell never holds more than one hash.
meta_shas = set(meta['sha'].dropna())
parsed_ids = set(papers['ID'])
len(meta_shas & parsed_ids)

27678

## Papers in meta but not in papers

In [34]:
# Count the distinct non-null metadata hashes that have no matching parsed
# paper ID.
meta_shas = set(meta['sha'].dropna())
parsed_ids = set(papers['ID'])
len(meta_shas - parsed_ids)

772

## Papers in papers but not in meta

In [36]:
# Count the distinct parsed paper IDs that do not appear among the non-null
# metadata hashes.
parsed_ids = set(papers['ID'])
meta_shas = set(meta['sha'].dropna())
len(parsed_ids - meta_shas)

1637