## Import Dependencies and set working directory

In [81]:
import json
import os
import pandas as pd
import numpy as np
from datetime import datetime

In [82]:
#Switch to the dataset folder; the relative subset paths used by the extraction loop resolve against it
#NOTE(review): absolute, machine-specific path - adjust before running on another machine
%cd "C:\Users\lbrig\Documents\GRAD SCHOOL\Semester 3\Covid19 Kaggle\CORD-19-research-challenge"

C:\Users\lbrig\Documents\GRAD SCHOOL\Semester 3\Covid19 Kaggle\CORD-19-research-challenge


## Loop over every JSON file and extract the relevant fields

In [83]:
#One dict per paper; converted to a DataFrame once the loop has finished
kaggle = []

start = datetime.now()

#Each subset folder contains a same-named subfolder that holds the json files
subsets = ["biorxiv_medrxiv", "comm_use_subset", "custom_license", "noncomm_use_subset"]
#os.path.join supplies the platform separator instead of a hard-coded backslash
jsonpaths = [os.path.join(subset, subset) for subset in subsets]

for subset, path in zip(subsets, jsonpaths):
    #sorted() makes the row order reproducible; os.listdir returns entries in arbitrary order
    for file in sorted(os.listdir(path)):
        filename = os.path.join(path, file)
        #JSON text is UTF-8 by specification, so do not rely on the platform's default codec
        with open(filename, 'r', encoding='utf-8') as covidfile:
            paper = json.load(covidfile)

        #Pull out columns needed
        #Paper ID number
        paperid = paper['paper_id']
        #Title of paper
        title = paper['metadata']['title']
        #Abstract of paper with each portion of text concatenated together.
        #Every portion is prefixed with one space (same output as before, built in a single pass);
        #a file without an 'abstract' entry yields an empty string instead of a KeyError
        abstract = ''.join(' ' + section['text'] for section in paper.get('abstract', []))
        #Full document with each portion of text concatenated together
        fulldoc = ''.join(' ' + section['text'] for section in paper['body_text'])

        #Append to kaggle; 'Filepath' holds the subset (top-level folder) name
        kaggle.append({'ID': paperid, 'Title': title, 'Abstract': abstract, 'Document': fulldoc, 'Filepath': subset})


kaggledf = pd.DataFrame(kaggle)

end = datetime.now()
print("Run time: ", end-start)


Run time:  0:01:28.155496


In [84]:
#Preview the first rows of the table built from the json files
kaggledf.head()

Unnamed: 0,ID,Title,Abstract,Document,Filepath
0,0015023cc06b5362d332b3baf348d11567ca2fbb,The RNA pseudoknots in foot-and-mouth disease ...,word count: 194 22 Text word count: 5168 23 2...,"VP3, and VP0 (which is further processed to V...",biorxiv_medrxiv
1,004f0f8bb66cf446678dc13cf2701feec4f36d76,Healthcare-resource-adjusted vulnerabilities t...,,The 2019-nCoV epidemic has spread across Chin...,biorxiv_medrxiv
2,00d16927588fb04d4be0e6b269fc02f0d3c2aa7b,"Real-time, MinION-based, amplicon sequencing f...",Infectious bronchitis (IB) causes significant...,"Infectious bronchitis (IB), which is caused b...",biorxiv_medrxiv
3,0139ea4ca580af99b602c6435368e7fdbefacb03,A Combined Evidence Approach to Prioritize Nip...,Nipah Virus (NiV) came into limelight recentl...,Nipah is an infectious negative-sense single-...,biorxiv_medrxiv
4,013d9d1cba8a54d5d3718c229b812d7cf91b6c89,Assessing spread risk of Wuhan novel coronavir...,Background: A novel coronavirus (2019-nCoV) e...,"In December 2019, a cluster of patients with ...",biorxiv_medrxiv


In [85]:
#(rows, columns) of the extracted table - one row per json file read
kaggledf.shape

(33224, 5)

In [86]:
#How many of each document type?
#Series.value_counts() replaces the deprecated top-level pd.value_counts()
kaggledf.Filepath.value_counts()


custom_license        20657
comm_use_subset        9315
noncomm_use_subset     2350
biorxiv_medrxiv         902
Name: Filepath, dtype: int64

In [87]:
#New directory: move to the parent project folder, where metadata.csv is read from by relative path
#NOTE(review): absolute, machine-specific path - adjust before running on another machine
%cd "C:\Users\lbrig\Documents\GRAD SCHOOL\Semester 3\Covid19 Kaggle"

C:\Users\lbrig\Documents\GRAD SCHOOL\Semester 3\Covid19 Kaggle


In [88]:
#Load only the metadata columns needed, selected by name rather than by position
#so that a change in the file's column order cannot silently load the wrong fields
meta = pd.read_csv("metadata.csv", usecols=['sha', 'title', 'abstract', 'authors', 'has_full_text', 'full_text_file'])

In [89]:
#(rows, columns) of the loaded metadata
meta.shape

(45774, 6)

In [90]:
#Split the IDs into multiple columns
#Multiple shas are stored as "id1; id2", so also consume the whitespace after each ";"
#(splitting on ";" alone leaves a leading space on every ID after the first)
m2 = meta.sha.str.split(r";\s*", expand=True)

In [91]:
#Add a prefix to these columns
#Only label columns that are not prefixed yet, so re-running this cell cannot produce 'ID_ID_0'
m2 = m2.rename(columns=lambda col: col if str(col).startswith('ID_') else 'ID_%s' % col)

In [92]:
#Put the split ID columns side by side with the metadata columns (rows line up on the shared index)
result = pd.concat(objs=[m2, meta], axis='columns', sort=False)

In [93]:
#(rows, columns) after adding the split ID columns to the metadata
result.shape

(45774, 12)

In [94]:
#Preview result without the raw sha column and the secondary ID columns.
#The secondary columns are collected from the frame itself, so the cell keeps working
#when the split produced more or fewer than six ID columns.
#NOTE: drop() returns a new frame that is only displayed here; `result` itself is unchanged
#and still carries these columns into the merge below.
result.drop(columns=['sha'] + [col for col in result.columns if str(col).startswith('ID_') and col != 'ID_0'])

Unnamed: 0,ID_0,title,abstract,authors,has_full_text,full_text_file
0,f056da9c64fbf00a4645ae326e8a4339d015d155,SIANN: Strain Identification by Alignment to N...,Next-generation sequencing is increasingly bei...,Samuel Minot; Stephen D Turner; Krista L Ternu...,True,biorxiv_medrxiv
1,daf32e013d325a6feb80e83d15aabc64a48fae33,Spatial epidemiology of networked metapopulati...,An emerging disease is one infectious epidemic...,Lin WANG; Xiang Li,True,biorxiv_medrxiv
2,f33c6d94b0efaa198f8f3f20e644625fa3fe10d2,Sequencing of the human IG light chain loci fr...,Germline variation at immunoglobulin gene (IG)...,Corey T Watson; Karyn Meltz Steinberg; Tina A ...,True,biorxiv_medrxiv
3,4da8a87e614373d56070ed272487451266dce919,Bayesian mixture analysis for metagenomic comm...,Deep sequencing of clinical samples is now an ...,Sofia Morfopoulou; Vincent Plagnol,True,biorxiv_medrxiv
4,eccef80cfbe078235df22398f195d5db462d8000,Mapping a viral phylogeny onto outbreak trees ...,Developing methods to reconstruct transmission...,Stephen P Velsko; Jonathan E Allen,True,biorxiv_medrxiv
...,...,...,...,...,...,...
45769,289deae0b2050aa259a05ba84565a4df82fa099a,Personal Protective Equipment: Protecting Heal...,Abstract Purpose The recent Ebola epidemic tha...,"Fischer, William A.; Weber, David J.; Wohl, Da...",True,custom_license
45770,21a4369f83891bf6975dd916c0aa495d5df8709e,Viruses and asthma,Abstract Background Viral respiratory infectio...,"Dulek, Daniel E.; Peebles, R. Stokes",True,custom_license
45771,,Why the WHO won't use the p-word,"There are no criteria for a pandemic, but covi...","MacKenzie, Debora",False,custom_license
45772,,"Communication, transparency key as Canada face...",,"Glauser, Wendy",False,


In [95]:
#Left-join the extracted json text onto the metadata: every metadata row is kept and
#matched through its first sha (ID_0) against the paper ID
merged = pd.merge(
    result,
    kaggledf,
    left_on='ID_0',
    right_on='ID',
    how='left',
)

In [96]:
#(rows, columns) of the merged table
merged.shape

(45774, 17)

In [97]:
#Inspect the last rows of the merged table
merged.tail()

Unnamed: 0,ID_0,ID_1,ID_2,ID_3,ID_4,ID_5,sha,title,abstract,authors,has_full_text,full_text_file,ID,Title,Abstract,Document,Filepath
45769,289deae0b2050aa259a05ba84565a4df82fa099a,,,,,,289deae0b2050aa259a05ba84565a4df82fa099a,Personal Protective Equipment: Protecting Heal...,Abstract Purpose The recent Ebola epidemic tha...,"Fischer, William A.; Weber, David J.; Wohl, Da...",True,custom_license,289deae0b2050aa259a05ba84565a4df82fa099a,Personal Protective Equipment: Protecting Heal...,Purpose: The recent Ebola epidemic that devas...,The recent Ebola epidemic that devastated Wes...,custom_license
45770,21a4369f83891bf6975dd916c0aa495d5df8709e,,,,,,21a4369f83891bf6975dd916c0aa495d5df8709e,Viruses and asthma,Abstract Background Viral respiratory infectio...,"Dulek, Daniel E.; Peebles, R. Stokes",True,custom_license,21a4369f83891bf6975dd916c0aa495d5df8709e,Viruses and asthma ☆,Background: Viral respiratory infection has l...,Asthma is a complex disease regulated by both...,custom_license
45771,,,,,,,,Why the WHO won't use the p-word,"There are no criteria for a pandemic, but covi...","MacKenzie, Debora",False,custom_license,,,,,
45772,,,,,,,,"Communication, transparency key as Canada face...",,"Glauser, Wendy",False,,,,,,
45773,3369a14e1d116943f48b3a33597796c9802de279,f523909ff52d8a6b9e7ddeb44c7d10ff9adf366c,,,,,3369a14e1d116943f48b3a33597796c9802de279; f523...,Searching for animal models and potential targ...,Emerging and re-emerging pathogens represent a...,"Vergara-Alert, Júlia; Vidal, Enric; Bensaid, A...",True,noncomm_use_subset,3369a14e1d116943f48b3a33597796c9802de279,Searching for animal models and potential targ...,Emerging and re-emerging pathogens represent ...,Searching for animal models and potential tar...,noncomm_use_subset


In [98]:
#Count metadata rows with and without full text
#Series.value_counts() replaces the deprecated top-level pd.value_counts()
merged.has_full_text.value_counts()

True     31753
False    14021
Name: has_full_text, dtype: int64