# Load Libraries

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import json

import matplotlib.pyplot as plt
plt.style.use('ggplot')

## File check

In [2]:
!ls /kaggle/input/CORD-19-research-challenge/

COVID.DATA.LIC.AGMT.pdf		      json_schema.txt
biorxiv_medrxiv			      metadata.csv
comm_use_subset			      metadata.readme
cord19_specter_embeddings_2020-04-10  noncomm_use_subset
custom_license


# Explore meta data

In [3]:
root_path = '/kaggle/input/CORD-19-research-challenge/'
metadata_path = f'{root_path}/metadata.csv'
meta_df = pd.read_csv(metadata_path, dtype={
    'pubmed_id': str,
    'Microsoft Academic Paper ID': str, 
    'doi': str
})
meta_df.head()

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_pdf_parse,has_pmc_xml_parse,full_text_file,url
0,xqhn0vbp,1e1286db212100993d03cc22374b624f7caee956,PMC,Airborne rhinovirus detection and effect of ul...,10.1186/1471-2458-3-5,PMC140314,12525263,no-cc,"BACKGROUND: Rhinovirus, the most common cause ...",2003-01-13,"Myatt, Theodore A; Johnston, Sebastian L; Rudn...",BMC Public Health,,,True,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...
1,gi6uaa83,8ae137c8da1607b3a8e4c946c07ca8bda67f88ac,PMC,Discovering human history from stomach bacteria,10.1186/gb-2003-4-5-213,PMC156578,12734001,no-cc,Recent analyses of human pathogens have reveal...,2003-04-28,"Disotell, Todd R",Genome Biol,,,True,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...
2,le0ogx1s,,PMC,A new recruit for the army of the men of death,10.1186/gb-2003-4-7-113,PMC193621,12844350,no-cc,"The army of the men of death, in John Bunyan's...",2003-06-27,"Petsko, Gregory A",Genome Biol,,,False,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...
3,fy4w7xz8,0104f6ceccf92ae8567a0102f89cbb976969a774,PMC,Association of HLA class I with severe acute r...,10.1186/1471-2350-4-9,PMC212558,12969506,no-cc,BACKGROUND: The human leukocyte antigen (HLA) ...,2003-09-12,"Lin, Marie; Tseng, Hsiang-Kuang; Trejaut, Jean...",BMC Med Genet,,,True,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...
4,0qaoam29,5b68a553a7cbbea13472721cd1ad617d42b40c26,PMC,A double epidemic model for the SARS propagation,10.1186/1471-2334-3-19,PMC222908,12964944,no-cc,BACKGROUND: An epidemic of a Severe Acute Resp...,2003-09-10,"Ng, Tuen Wai; Turinici, Gabriel; Danchin, Antoine",BMC Infect Dis,,,True,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...


In [5]:
meta_df.shape

(51078, 18)

In [8]:
sum(meta_df.abstract.isna())

8726

In [19]:
sum(meta_df.abstract.str.contains(r'hum[ae]n | m[ae]n |treatment')[meta_df.abstract.str.contains(r'hum[ae]n | m[ae]n |treatment').notnull()])

13648

In [11]:
meta_df.apply(lambda x:sum(x.isna()))

cord_uid                           0
sha                            13056
source_x                           0
title                            158
doi                             3337
pmcid                           9996
pubmed_id                      13217
license                            0
abstract                        8726
publish_time                       8
authors                         2187
journal                         4710
Microsoft Academic Paper ID    50114
WHO #Covidence                 49310
has_pdf_parse                      0
has_pmc_xml_parse                  0
full_text_file                  8567
url                              302
dtype: int64

### Check how many papers or articles (json files) are there?

In [4]:
all_json = glob.glob(f'{root_path}/**/*.json', recursive=True)
len(all_json)

59311

### Extract main content from the json files 

In [33]:
class FileReader:
    def __init__(self, file_path):
        with open(file_path) as file:
            content = json.load(file)
            self.paper_id = content['paper_id']
            self.abstract = []
            self.body_text = []
            # Abstract
            try:
                if content['abstract']:
                
                    for entry in content['abstract']:
                        self.abstract.append(entry['text'])
            except:
                self.abstract.append('NA')
                
            
            # Body text
            for entry in content['body_text']:
                self.body_text.append(entry['text'])
            self.abstract = '\n'.join(self.abstract)
            self.body_text = '\n'.join(self.body_text)
    def __repr__(self):
        return f'{self.paper_id}: {self.abstract[:200]}... {self.body_text[:200]}...'
first_row = FileReader(all_json[10000])
print(first_row)

PMC4821533: NA... Incidence and prevalence are standard epidemiological indicators, monitored to understand disease dynamic within society [1, 2]. In the case of infectious diseases, it is customary to measure how far ...


##### Abtracts have breaks which need to be extracted too to get full text.

In [34]:
def get_breaks(content, length):
    data = ""
    words = content.split(' ')
    total_chars = 0

    # add break every length characters
    for i in range(len(words)):
        total_chars += len(words[i])
        if total_chars > length:
            data = data + "<br>" + words[i]
            total_chars = 0
        else:
            data = data + " " + words[i]
    return data

### Attempt to extract files and create a dataset(table)

In [22]:
dict_ = {'paper_id': [], 'abstract': [], 'body_text': []}
for idx, entry in enumerate(all_json):
    if idx % (len(all_json) // 10) == 0:
        print(f'Processing index: {idx} of {len(all_json)}')
    content = FileReader(entry)
    dict_['paper_id'].append(content.paper_id)
    dict_['abstract'].append(content.abstract)
    dict_['body_text'].append(content.body_text)
df_covid = pd.DataFrame(dict_, columns=['paper_id', 'abstract', 'body_text'])
df_covid.head()

Processing index: 0 of 59311
Processing index: 5931 of 59311


KeyError: 'abstract'

#### MOdified attempt to also include metadata information 

In [35]:
dict_ = {'paper_id': [], 'abstract': [], 'body_text': [], 'authors': [], 'title': [], 'journal': [], 'abstract_summary': []}
for idx, entry in enumerate(all_json):
    if idx % (len(all_json) // 10) == 0:
        print(f'Processing index: {idx} of {len(all_json)}')
    content = FileReader(entry)
    
    # get metadata information
    meta_data = meta_df.loc[meta_df['sha'] == content.paper_id]
    # no metadata, skip this paper
    if len(meta_data) == 0:
        continue
    
    dict_['paper_id'].append(content.paper_id)
    dict_['abstract'].append(content.abstract)
    dict_['body_text'].append(content.body_text)
    
    # also create a column for the summary of abstract to be used in a plot
    if len(content.abstract) == 'NA': 
        # no abstract provided
        dict_['abstract_summary'].append("Not provided.")
    elif len(content.abstract.split(' ')) > 100:
        # abstract provided is too long for plot, take first 300 words append with ...
        info = content.abstract.split(' ')[:100]
        summary = get_breaks(' '.join(info), 40)
        dict_['abstract_summary'].append(summary + "...")
    else:
        # abstract is short enough
        summary = get_breaks(content.abstract, 40)
        dict_['abstract_summary'].append(summary)
        
    # get metadata information
    meta_data = meta_df.loc[meta_df['sha'] == content.paper_id]
    
    try:
        # if more than one author
        authors = meta_data['authors'].values[0].split(';')
        if len(authors) > 2:
            # more than 2 authors, may be problem when plotting, so take first 2 append with ...
            dict_['authors'].append(". ".join(authors[:2]) + "...")
        else:
            # authors will fit in plot
            dict_['authors'].append(". ".join(authors))
    except Exception as e:
        # if only one author - or Null valie
        dict_['authors'].append(meta_data['authors'].values[0])
    
    # add the title information, add breaks when needed
    try:
        title = get_breaks(meta_data['title'].values[0], 40)
        dict_['title'].append(title)
    # if title was not provided
    except Exception as e:
        dict_['title'].append(meta_data['title'].values[0])
    
    # add the journal information
    dict_['journal'].append(meta_data['journal'].values[0])
    
df_covid = pd.DataFrame(dict_, columns=['paper_id', 'abstract', 'body_text', 'authors', 'title', 'journal', 'abstract_summary'])
df_covid.head()

Processing index: 0 of 59311
Processing index: 5931 of 59311
Processing index: 11862 of 59311
Processing index: 17793 of 59311
Processing index: 23724 of 59311
Processing index: 29655 of 59311
Processing index: 35586 of 59311
Processing index: 41517 of 59311
Processing index: 47448 of 59311
Processing index: 53379 of 59311
Processing index: 59310 of 59311


Unnamed: 0,paper_id,abstract,body_text,authors,title,journal,abstract_summary
0,7db22f7f81977109d493a0edf8ed75562648e839,"Scorpine, a small cationic peptide from the ve...",The oldest known scorpions lived around 430 mi...,"Zhang, Chao. He, Xinlong...",Recombinant Scorpine Produced Using SUMO<br>F...,PLoS One,"Scorpine, a small cationic peptide from the<b..."
1,a137eb51461b4a4ed3980aa5b9cb2f2c1cf0292a,Background: The complex interplay between vira...,The emergence of Severe Acute Respiratory Synd...,"McDermott, Jason E.. Mitchell, Hugh D....",The effect of inhibition of PP1 and TNFα<br>s...,BMC Syst Biol,Background: The complex interplay between<br>...
2,6c3e1a43f0e199876d4bd9ff787e1911fd5cfaa6,,Sjögren's syndrome (SS) is a connective tissue...,"Talotta, Rossella. Sarzi-Puttini, Piercarlo...",Microbial Agents as Putative Inducers of B Ce...,J Immunol Res,
3,2ce201c2ba233a562ee605a9aa12d2719cfa2beb,Background: Human adenovirus type 55 is a re-e...,Human adenovirus (HAdV) is a common pathogen a...,"Yi, Lina. Zou, LiRong...",A cluster of adenovirus type B55 infection in...,Influenza Other Respir Viruses,Background: Human adenovirus type 55 is a<br>...
4,b460e5b511b4e2c3233f9476cd4e0616d6f405ac,The severity of respiratory viral infections i...,"Viruses from several different families, inclu...","VanLeuven, James T.. Ridenhour, Benjamin J....",Lung epithelial cells have virus-specific and...,PLoS One,The severity of respiratory viral infections<...


In [36]:
df_covid.to_csv('initial_coviddata.csv')

In [37]:
sum(df_covid.body_text.str.contains(r'hum[ae]n | m[ae]n |treatment')[df_covid.body_text.str.contains(r'hum[ae]n | m[ae]n |treatment').notnull()])

30018

# Feature extraction

## Word Counts