In [1]:
import pandas as pd
import os
from tqdm import tqdm
import json

In [2]:
# Read every article file under the four "<name>/<name>" folders and collect
# one row per article into `df` (paper_id, title, abstract, full_text).
dirs = ['biorxiv_medrxiv','comm_use_subset','custom_license','noncomm_use_subset']


def _extract_record(paper):
    """Flatten one parsed article dict into ``[paper_id, title, abstract, full_text]``.

    Parameters
    ----------
    paper : dict
        Parsed JSON of a single article. Must provide ``paper_id``,
        ``metadata['title']`` and ``body_text`` (a list of dicts with a
        ``text`` entry); a missing one raises ``KeyError``.

    Returns
    -------
    list
        ``[paper_id, title, abstract, full_text]``. ``abstract`` is the text
        of the first abstract entry, or ``''`` when there is none.
        ``full_text`` is every body section's text, each followed by a
        blank line.
    """
    try:
        abstract = paper['abstract'][0]['text']
    except (KeyError, IndexError, TypeError):
        # No 'abstract' key, an empty abstract list, or a non-indexable value:
        # treat the article as having no abstract instead of failing the load.
        abstract = ''
    full_text = ''.join(section['text'] + '\n\n' for section in paper['body_text'])
    return [paper['paper_id'], paper['metadata']['title'], abstract, full_text]


docs = []
for d in dirs:
    # os.listdir returns names in arbitrary order; sorting keeps the row order
    # (and therefore the integer index of df) stable from run to run.
    for file in tqdm(sorted(os.listdir(f"{d}/{d}"))):
        file_path = f"{d}/{d}/{file}"
        # Context manager closes each handle right away instead of leaking
        # one open file per article.
        with open(file_path, "rb") as handle:
            j = json.load(handle)
        docs.append(_extract_record(j))
df = pd.DataFrame(docs,columns = ['paper_id','title','abstract','full_text'])

100%|████████████████████████████████████████████████████████████████████████████████| 885/885 [00:15<00:00, 56.89it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 9118/9118 [02:30<00:00, 60.78it/s]
100%|████████████████████████████████████████████████████████████████████████████| 16959/16959 [04:42<00:00, 60.09it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 2353/2353 [00:36<00:00, 65.27it/s]


In [3]:
# Persist the article table as built so far to a pickle file.
df.to_pickle(path='unprocessed_articles.pkl')

In [31]:
# Preview the first five rows (the default for head) as the cell's output.
df.head(n=5)

Unnamed: 0,paper_id,title,abstract,full_text
0,0015023cc06b5362d332b3baf348d11567ca2fbb,The RNA pseudoknots in foot-and-mouth disease ...,word count: 194 22 Text word count: 5168 23 24...,"VP3, and VP0 (which is further processed to VP..."
1,004f0f8bb66cf446678dc13cf2701feec4f36d76,Healthcare-resource-adjusted vulnerabilities t...,,The 2019-nCoV epidemic has spread across China...
2,00d16927588fb04d4be0e6b269fc02f0d3c2aa7b,"Real-time, MinION-based, amplicon sequencing f...",Infectious bronchitis (IB) causes significant ...,"Infectious bronchitis (IB), which is caused by..."
3,0139ea4ca580af99b602c6435368e7fdbefacb03,A Combined Evidence Approach to Prioritize Nip...,Nipah Virus (NiV) came into limelight recently...,Nipah is an infectious negative-sense single-s...
4,013d9d1cba8a54d5d3718c229b812d7cf91b6c89,Assessing spread risk of Wuhan novel coronavir...,Background: A novel coronavirus (2019-nCoV) em...,"In December 2019, a cluster of patients with p..."


In [4]:
# For each term, report the shape of the subset of df whose abstract contains
# it. Every term starts with a space, so it only matches right after a space.
key_words = [
    ' hospital', ' personnel', ' nurse', ' doctor', ' aid',
    ' site', ' strategies', ' strategy', ' fund', ' equipment',
]
for word in key_words:
    mentions = df['abstract'].str.contains(word)
    print(word, df.loc[mentions].shape)

 hospital (1296, 4)
 personnel (131, 4)
 nurse (152, 4)
 doctor (66, 4)
 aid (185, 4)
 site (1244, 4)
 strategies (1165, 4)
 strategy (689, 4)
 fund (673, 4)
 equipment (176, 4)


In [20]:
# Initially thought to only include articles with abstracts.
# Collect the index labels of rows whose abstract contains any search term.
# A row is appended once per matching term, so the list holds duplicates.
covid = ['COVID','novel corona virus','SARS','MERS','corona','virus']
relevant_abstracts = []
for name in covid:
    # The terms are literal substrings, so skip regex interpretation.
    temp = df[df['abstract'].str.contains(name, regex=False)]
    hits = temp.index.tolist()
    # A term with zero matches yields an empty list; indexing [0] / [-1] on it
    # would raise IndexError, so report None for the first/last label instead.
    first, last = (hits[0], hits[-1]) if hits else (None, None)
    print(name, temp.shape, first, last)
    relevant_abstracts.extend(hits)

COVID (626, 4) 20 29187
novel corona virus (4, 4) 340 18859
SARS (1813, 4) 18 29287
MERS (807, 4) 26 29299
corona (3271, 4) 4 29310
virus (11141, 4) 0 29313


In [21]:
# After a quick scan of some articles without abstracts, there were articles
# that had relevant information despite not having an abstract, so the same
# search is run against the full text.
# A row is appended once per matching term, so the list holds duplicates.
covid = ['COVID','novel corona virus','SARS','MERS','corona','virus']
relevant_articles = []
for name in covid:
    # The terms are literal substrings, so skip regex interpretation.
    temp = df[df['full_text'].str.contains(name, regex=False)]
    hits = temp.index.tolist()
    # A term with zero matches yields an empty list; indexing [0] / [-1] on it
    # would raise IndexError, so report None for the first/last label instead.
    first, last = (hits[0], hits[-1]) if hits else (None, None)
    print(name, temp.shape, first, last)
    relevant_articles.extend(hits)

COVID (630, 4) 5 29285
novel corona virus (20, 4) 3260 28505
SARS (11169, 4) 5 29310
MERS (3459, 4) 5 29305
corona (16416, 4) 2 29312
virus (24887, 4) 0 29313


In [22]:
# For the abstract hits, then the full-text hits: total entries (with
# duplicates) followed by the number of distinct index labels.
tuple(
    count
    for hits in (relevant_abstracts, relevant_articles)
    for count in (len(hits), len(set(hits)))
)

(17662, 12189, 56581, 26719)

In [28]:
# Making sure that every article was accounted for: pool the full-text and
# abstract hit lists, then count the distinct index labels across both.
a = [*relevant_articles, *relevant_abstracts]
len(set(a))

26806

In [33]:
# `a` holds index *labels* taken from df, so select by label (.loc) rather
# than by position (.iloc); the two only coincide while df keeps its default
# 0..n-1 index. Sorting the de-duplicated labels keeps the rows in df's
# original order instead of set-iteration order.
cleaned_articles = df.loc[sorted(set(a))]

In [35]:
# Persist the filtered article table to a pickle file.
cleaned_articles.to_pickle(path='cleaned_articles.pkl')