In [1]:
import numpy as np
import pandas as pd
import glob
import json
import matplotlib.pyplot as plt

In [2]:
plt.style.use('ggplot')

In [7]:
data_path = '/home/vc/SelfLearn/datasets/cord19/'
metadata_path = f'{data_path}/metadata.csv'
meta_df = pd.read_csv(metadata_path, dtype = {'pubmed_id':str,'Microsoft Academic Paper ID': str, 'doi':str})
meta_df.head()

Unnamed: 0,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_full_text,full_text_file
0,,Elsevier,Intrauterine virus infections and congenital h...,10.1016/0002-8703(72)90077-4,,4361535,els-covid,Abstract The etiologic basis for the vast majo...,1972-12-31,"Overall, James C.",American Heart Journal,,,False,custom_license
1,,Elsevier,Coronaviruses in Balkan nephritis,10.1016/0002-8703(80)90355-5,,6243850,els-covid,,1980-03-31,"Georgescu, Leonida; Diosi, Peter; Buţiu, Ioan;...",American Heart Journal,,,False,custom_license
2,,Elsevier,Cigarette smoking and coronary heart disease: ...,10.1016/0002-8703(80)90356-7,,7355701,els-covid,,1980-03-31,"Friedman, Gary D",American Heart Journal,,,False,custom_license
3,aecbc613ebdab36753235197ffb4f35734b5ca63,Elsevier,Clinical and immunologic studies in identical ...,10.1016/0002-9343(73)90176-9,,4579077,els-covid,"Abstract Middle-aged female identical twins, o...",1973-08-31,"Brunner, Carolyn M.; Horwitz, David A.; Shann,...",The American Journal of Medicine,,,True,custom_license
4,,Elsevier,Epidemiology of community-acquired respiratory...,10.1016/0002-9343(85)90361-4,,4014285,els-covid,Abstract Upper respiratory tract infections ar...,1985-06-28,"Garibaldi, Richard A.",The American Journal of Medicine,,,False,custom_license


In [8]:
meta_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44220 entries, 0 to 44219
Data columns (total 15 columns):
sha                            28462 non-null object
source_x                       44220 non-null object
title                          43996 non-null object
doi                            40750 non-null object
pmcid                          23319 non-null object
pubmed_id                      22943 non-null object
license                        44220 non-null object
abstract                       35806 non-null object
publish_time                   34197 non-null object
authors                        41074 non-null object
journal                        33173 non-null object
Microsoft Academic Paper ID    964 non-null object
WHO #Covidence                 1767 non-null object
has_full_text                  44220 non-null bool
full_text_file                 32829 non-null object
dtypes: bool(1), object(14)
memory usage: 4.8+ MB


In [11]:
all_json = glob.glob(f'{data_path}/**/*.json', recursive=True)
len(all_json)

29315

# Helper Functions

In [21]:
class FileReader:
    def __init__(self,file_path):
        with open(file_path) as file:
            content = json.load(file)
            self.paper_id = content['paper_id']
            self.abstract = []
            self.body_text = []
            
            for body in content['abstract']:
                self.abstract.append(body['text'])
            for body in content['body_text']:
                self.body_text.append(body['text'])
            
            self.abstract = '/n'.join(self.abstract)
            self.body_text = '/n'.join(self.body_text)
    def __repr__(self):
        return f'{self.paper_id}: {self.abstract[:200]}... {self.body_text[:200]}...'
print(FileReader(all_json[0]))

aba67455b71206b20d9b978e994cb571392dd404: Members of the carcinoembryonic antigen family (CEACAMs) are widely expressed, and, depending on the tissue, capable of regulating diverse functions including tumor promotion, tumor suppression, angio... The carcinoembryonic antigen (CEA) 2 family consists of two subfamilies, the CEACAM subgroup and the pregnancy specific glycoprotein (PSG) subgroup. Members of this family have been redundantly named ...


This helper function will add breaks after every word when character length reach a certain amount. Helps with hover tool

In [22]:
def get_breaks(content,length):
    data = ""
    total_chars = 0
    words = content.split(" ")
    
    for i in range(len(words)):
        total_chars += len(words[i])
        if(total_chars>length):
            data += "<br>"
            total_char = 0
        data += words[i]+ " "
    
    return data

Putting the data into an easy-to-use dataframe

In [23]:
dict_ = {'paper_id':[], 'abstract':[], 'body_text':[], 'authors':[], 
         'title': [], 'journal': [], 'abstract_summary': []}
N = len(all_json)
for idx, entry in enumerate(all_json):
    if(idx%(N//10) == 0):
        print(f'Processing index: {idx} of {N}')
    content = FileReader(entry)
    
    meta_data = meta_df.loc[meta_df['sha'] == content.paper_id]
    if(len(meta_data) == 0):
        continue
    
    dict_['paper_id'].append(content.paper_id)
    dict_['abstract'].append(content.abstract)
    dict_['body_text'].append(content.body_text)
    
    #abstract summary
    abstract = content.abstract.split(" ")
    abstract_length = len(abstract)
    if(abstract_length == 0):
        dict_['abstract_summary'].append("Not provided")
    elif(abstract_length > 100):
        #abstract too long
        info = abstract[:100]
        data = get_breaks(' '.join(info),40)
        dict_['abstract_summary'].append(data)
    else:
        data = get_breaks(content.abstract,40)
        dict_['abstract_summary'].append(data)
    
    #Format for multiple authors
    try:
        authors = meta_data['authors'].values[0].split(';')
        if(len(authors) > 2):
            dict_['authors'].append(' '.join(authors[:2])+"...")
        else:
            dict_['authors'].append(' '.join(authors))
    except Exception as e:
        dict_['authors'].append(meta_data['authors'].values[0])
    
    #Title information
    try:
        title = get_breaks(meta_data['title'].values[0],40)
        dict_['title'].append(title)
    except Exception as e:
        dict_['title'].append(meta_data['title'].values[0])
    
    #Journal information
    dict_['journal'].append(meta_data['journal'].values[0])

df_covid = pd.DataFrame(dict_, columns = ['paper_id', 'abstract', 'body_text', 'authors', 'title', 'journal', 'abstract_summary'])
df_covid.head()

Processing index: 0 of 29315
Processing index: 2931 of 29315
Processing index: 5862 of 29315
Processing index: 8793 of 29315
Processing index: 11724 of 29315
Processing index: 14655 of 29315
Processing index: 17586 of 29315
Processing index: 20517 of 29315
Processing index: 23448 of 29315
Processing index: 26379 of 29315
Processing index: 29310 of 29315


Unnamed: 0,paper_id,abstract,body_text,authors,title,journal,abstract_summary
0,aba67455b71206b20d9b978e994cb571392dd404,Members of the carcinoembryonic antigen family...,The carcinoembryonic antigen (CEA) 2 family co...,"Skubitz, Keith M Skubitz, Amy PN","Interdependency of CEACAM-1, -3, -6, and -8 <b...",J Transl Med,Members of the carcinoembryonic antigen <br>fa...
1,d15dfd546e238b29cacf7f28b83eb1cdb98f6ca8,Complement is an essential element in both inn...,"role in the infection process, while the paras...","Sandri, Thaisa Lucas Lidani, Kárita Cláudia F...",Human complement receptor type 1 (CR1) protein...,Sci Rep,Complement is an essential element in both <br...
2,d4d0092e3e79f9d5ad3c48f23778317ad867d61c,The ongoing emergence of human infections orig...,Information about the commensal and pathogenic...,"Wittekindt, Nicola E. Padhi, Abinash...",Nodeomics: Pathogen Detection in Vertebrate <b...,PLoS One,The ongoing emergence of human infections <br>...
3,26f8c5fbd310c95c2e1fb04c34ed6f5d10901d07,Background: Severe Acute Respiratory Syndrome ...,The 2003 outbreaks of Severe Acute Respiratory...,"Abdirizak, Fatima Lewis, Rayleen...",Evaluating the potential impact of targeted <b...,Theor Biol Med Model,Background: Severe Acute Respiratory <br>Syndr...
4,425be277181521bab21a8e64d54657dfdcac6bde,,Acute respiratory infections (ARIs) are the le...,"Moreno-Valencia, Yazmin Hernandez-Hernandez, ...",Detection and characterization of <br>respirat...,Influenza Other Respir Viruses,
