# Good Notebooks to refer back to
[1] https://www.kaggle.com/ivanegapratama/covid-eda-initial-exploration-tool

[2] https://www.kaggle.com/midnitekoder/covid-19-citation-graph-embedding-using-deepwalk

[3] https://www.kaggle.com/xhlulu/cord-19-eda-parse-json-and-generate-clean-csv

[4] https://www.kaggle.com/maksimeren/covid-19-literature-clustering

[5] https://www.kaggle.com/nofoosports/cord-19-analysis-with-sentence-embeddings/data#Building-task-reports

In [1]:
# imports
# inital code was mostly adapted from https://www.kaggle.com/ivanegapratama/covid-eda-initial-exploration-tool
import os
import pandas as pd
import numpy as np
import matplotlib
import glob
import json
import spacy
import re
import nltk
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [2]:
# read in metadata
meta_df = pd.read_csv('metadata.csv', dtype={
    'pubmed_id': str,
    'Microsoft Academic Paper ID': str, 
    'doi': str
})

In [3]:
# what I ran to get an initial look at the metadata
meta_df.columns
meta_df.dropna(subset = ['has_full_text']).info()
meta_df['has_full_text'].value_counts()
meta_df.columns = meta_df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')
meta_df.columns
meta_df[meta_df.full_text_file.isnull() == False].query('has_full_text == False').head()
meta_df[meta_df.full_text_file.isnull() == False]
meta_df[meta_df.doi.isnull() == True]

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44220 entries, 0 to 44219
Data columns (total 15 columns):
sha                            28462 non-null object
source_x                       44220 non-null object
title                          43996 non-null object
doi                            40750 non-null object
pmcid                          23319 non-null object
pubmed_id                      22943 non-null object
license                        44220 non-null object
abstract                       35806 non-null object
publish_time                   34197 non-null object
authors                        41074 non-null object
journal                        33173 non-null object
Microsoft Academic Paper ID    964 non-null object
WHO #Covidence                 1767 non-null object
has_full_text                  44220 non-null bool
full_text_file                 32829 non-null object
dtypes: bool(1), object(14)
memory usage: 5.1+ MB


Unnamed: 0,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,microsoft_academic_paper_id,who_#covidence,has_full_text,full_text_file
1297,,Elsevier,An improved method for the routine identificat...,,,,els-covid,,1978-07-31,"Caul, E.O.; Ashley, C.R.; Egglestone, S.I.",FEMS Microbiology Letters,,,False,custom_license
1298,84aa394458d6daea92ecbcd504b90ed9fadb56e6,Elsevier,Translation of infectious bronchitis virus RNA,,,,els-covid,,1978-04-30,"Highfield, P.E.; Morser, J.; Lomniczi, B.; Ste...",FEMS Microbiology Letters,,,True,custom_license
1299,,Elsevier,Author index to volume 5,,,,els-covid,,1979-06-30,,FEMS Microbiology Letters,,,False,custom_license
1300,,Elsevier,Subject index to volume 5 (1979),,,,els-covid,,1979-06-30,,FEMS Microbiology Letters,,,False,custom_license
1301,133db9d6eab9f982c996b3e2bdbacd782db63ed7,Elsevier,Preliminary studies on the isolation of corona...,,,,els-covid,,1979-02-28,"Caul, E.O.; Ashley, C.R.; Ferguson, Morag; Egg...",FEMS Microbiology Letters,,,True,custom_license
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40974,786da349a33148e24e09edd6378da80ea2f40e07; 2810...,PMC,Common variations in TERT-CLPTM1L locus are re...,,PMC4808031,26621837,cc-by,Associations between single nucleotide polymor...,2015 Nov 26,"Zhang, Yang; Zhang, Xiaoai; Zhang, Hongxing; Z...",Oncotarget,,,True,comm_use_subset
41190,8e014e2133b319d93ba7e9341ffe33bab3f9e265,PMC,A HLA-A2-restricted CTL epitope induces anti-t...,,PMC4808025,26621839,cc-by,Cancer immunotherapy is attractive for antigen...,2015 Nov 26,"Sher, Yuh-Pyng; Lin, Su-I; Chen, I-Hua; Liu, H...",Oncotarget,,,True,comm_use_subset
41212,ad4f830c6a0985b878a66d4e797232ccba91aedc; 9de4...,PMC,Targeting a ribonucleoprotein complex containi...,,PMC4385842,25669982,cc-by,Tylophorine compounds have been the focus of d...,2014 Dec 10,"Qiu, Ya-Qi; Yang, Cheng-Wei; Lee, Yue-Zhi; Yan...",Oncotarget,,,True,comm_use_subset
41930,,WHO,Medicine's Challenges: Vaping and Coronavirus,,PMC7023952,32158035,unk,,2020,"DiRenna, James",Mo Med,,#7371,False,


In [4]:
# get all the json files within the dataset into a list of paths
all_json = glob.glob(f'**/*.json', recursive=True)
all_json

['biorxiv_medrxiv\\biorxiv_medrxiv\\0015023cc06b5362d332b3baf348d11567ca2fbb.json',
 'biorxiv_medrxiv\\biorxiv_medrxiv\\004f0f8bb66cf446678dc13cf2701feec4f36d76.json',
 'biorxiv_medrxiv\\biorxiv_medrxiv\\00d16927588fb04d4be0e6b269fc02f0d3c2aa7b.json',
 'biorxiv_medrxiv\\biorxiv_medrxiv\\0139ea4ca580af99b602c6435368e7fdbefacb03.json',
 'biorxiv_medrxiv\\biorxiv_medrxiv\\013d9d1cba8a54d5d3718c229b812d7cf91b6c89.json',
 'biorxiv_medrxiv\\biorxiv_medrxiv\\01d162d7fae6aaba8e6e60e563ef4c2fca7b0e18.json',
 'biorxiv_medrxiv\\biorxiv_medrxiv\\01e3b313e78a352593be2ff64927192af66619b5.json',
 'biorxiv_medrxiv\\biorxiv_medrxiv\\02201e4601ab0eb70b6c26480cf2bfeae2625193.json',
 'biorxiv_medrxiv\\biorxiv_medrxiv\\0255ea4b2f26a51a3bfa3bd8f3e1978c82c976d5.json',
 'biorxiv_medrxiv\\biorxiv_medrxiv\\029c1c588047f1d612a219ee15494d2d19ff7439.json',
 'biorxiv_medrxiv\\biorxiv_medrxiv\\03ce432f27c7df6af22b92245a614db2ecb5de5f.json',
 'biorxiv_medrxiv\\biorxiv_medrxiv\\03ea3a614b56409d3f099c9ad764864293132540

In [5]:
#check json schema, as per [1]
with open(all_json[0]) as file:
    first_entry = json.load(file)
    print(json.dumps(first_entry, indent=4))

{
    "paper_id": "0015023cc06b5362d332b3baf348d11567ca2fbb",
    "metadata": {
        "title": "The RNA pseudoknots in foot-and-mouth disease virus are dispensable for genome replication but essential for the production of infectious virus. 2 3",
        "authors": [
            {
                "first": "Joseph",
                "middle": [
                    "C"
                ],
                "last": "Ward",
                "suffix": "",
                "affiliation": {},
                "email": ""
            },
            {
                "first": "Lidia",
                "middle": [],
                "last": "Lasecka-Dykes",
                "suffix": "",
                "affiliation": {},
                "email": ""
            },
            {
                "first": "Chris",
                "middle": [],
                "last": "Neil",
                "suffix": "",
                "affiliation": {},
                "email": ""
            },
            {
           

In [6]:
#helper class taken from [1]
class FileReader:
    def __init__(self, file_path):
        with open(file_path) as file:
            content = json.load(file)
            self.paper_id = content['paper_id']
            self.source = file_path.split("\\")[0]
            self.title = content['metadata']['title']
            self.abstract = []
            self.body_text = []
            self.bib_entries = []
            # Abstract
            for entry in content['abstract']:
                self.abstract.append(entry['text'])
            # Body text
            for entry in content['body_text']:
                self.body_text.append(entry['text'])
            # Bibliography
            for k, entry in content['bib_entries'].items():
                self.bib_entries.append(str(entry)) 
                # just stringify everything we'll probably clean later
            self.abstract = ' '.join(self.abstract)
            self.bib_entries = str(self.bib_entries)
            
            #Strip copyright disclaimers
            self.body_text = ' '.join(self.body_text)
            reg = '((The copyright holder for this preprint \(which was not peer-reviewed\) is the . )(https:\/\/doi.org\/10.[0-9]{4,}?\/[0-9]{4}.[0-9]{2}.[0-9]{2}.[^\s]+)( doi: bioRxiv preprint)|(author\/funder. All rights reserved. No reuse allowed without permission.))'
            re.sub(reg, '', self.body_text)
            
            
            '''
            stringify the whole bib_entries list so we can use eval() 
            on a df cell at a later time to get the full list of dicts
            '''
    def __repr__(self):
        return f'{self.paper_id}: {self.abstract[:200]}... {self.body_text[:200]}...'

In [7]:
#checking it out
first_row = FileReader(all_json[0])
print(first_row)

0015023cc06b5362d332b3baf348d11567ca2fbb: word count: 194 22 Text word count: 5168 23 24 25 author/funder. All rights reserved. No reuse allowed without permission. Abstract 27 The positive stranded RNA genomes of picornaviruses comprise a si... VP3, and VP0 (which is further processed to VP2 and VP4 during virus assembly) (6). The P2 64 and P3 regions encode the non-structural proteins 2B and 2C and 3A, 3B (1-3) (VPg), 3C pro and 4 structura...


In [35]:
[re.sub('\D', '', each['section']) for each in first_entry['body_text']]

['',
 '70',
 '120',
 '120',
 '120',
 '135',
 '136',
 '144',
 '144',
 '144',
 '301',
 '350',
 '368',
 '368',
 '468',
 '468',
 '479',
 '479',
 '479',
 '479']

# The data is a bit messy
This is specifically about the biorxiv data, since all papers in this set were probably scraped the same way.
    
    1. There are random numbers. these are caused by one of the following:
        * superscripts/subscripts 
        * preprint pdf line numbers

    2. The preprint pdf includes a copyright header that gets scraped into the text. This is almost the exact same each time, so it can probably be removed pretty easily

    3. Section values have something to do with the line number that the text paragraph starts on, but I can't find a consistent rule for how they were scraped

For now I'll just read everything into a dataframe with the helper class

In [11]:
# adapted from [1]
#Making Dataframe
dict_ = {'paper_id': [], 'source': [], 'title': [], 'abstract': [], 'body_text': [], 'bib_entries': []}
for idx, entry in enumerate(all_json):
    if idx % (len(all_json) // 10) == 0:
        print(f'Processing index: {idx} of {len(all_json)}')
    content = FileReader(entry)
    dict_['paper_id'].append(content.paper_id)
    dict_['source'].append(content.source)
    dict_['title'].append(content.title)
    dict_['abstract'].append(content.abstract)
    dict_['body_text'].append(content.body_text)
    dict_['bib_entries'].append(content.bib_entries)
df_covid = pd.DataFrame(dict_, columns=['paper_id', 'source', 'title', 'abstract', 'body_text', 'bib_entries'])

#Adding the Word Count Column¶
df_covid['abstract_word_count'] = df_covid['abstract'].apply(lambda x: len(x.strip().split()))
df_covid['body_word_count'] = df_covid['body_text'].apply(lambda x: len(x.strip().split()))

#Clean duplicates
df_covid.drop_duplicates(['abstract'], inplace=True)
df_covid.describe(include='all')

## This will definitely take a while (5-10 min?)

Processing index: 0 of 29315
Processing index: 2931 of 29315
Processing index: 5862 of 29315
Processing index: 8793 of 29315
Processing index: 11724 of 29315
Processing index: 14655 of 29315
Processing index: 17586 of 29315
Processing index: 20517 of 29315
Processing index: 23448 of 29315
Processing index: 26379 of 29315
Processing index: 29310 of 29315


Unnamed: 0,paper_id,source,title,abstract,body_text,bib_entries,abstract_word_count,body_word_count
count,21050,21050,21050.0,21050,21050,21050,21050.0,21050.0
unique,21050,4,19993.0,21050,21046,21035,,
top,cfe74dcee96b3f90d0b1a0bbba20a8206490a26b,custom_license,,The presence of Giardia and Cryptosporidium wa...,Gastrointestinal disorders are frequently repo...,"[""{'ref_id': 'b0', 'title': 'Situation Report ...",,
freq,1,10527,912.0,1,2,2,,
mean,,,,,,,227.180855,4593.006698
std,,,,,,,168.553834,5556.557109
min,,,,,,,0.0,1.0
25%,,,,,,,149.0,2735.0
50%,,,,,,,203.0,3861.0
75%,,,,,,,263.75,5469.0


In [12]:
df_covid.head()

Unnamed: 0,paper_id,source,title,abstract,body_text,bib_entries,abstract_word_count,body_word_count
0,0015023cc06b5362d332b3baf348d11567ca2fbb,biorxiv_medrxiv,The RNA pseudoknots in foot-and-mouth disease ...,word count: 194 22 Text word count: 5168 23 24...,"VP3, and VP0 (which is further processed to VP...","[""{'ref_id': 'b0', 'title': 'Genetic economy i...",241,1728
1,004f0f8bb66cf446678dc13cf2701feec4f36d76,biorxiv_medrxiv,Healthcare-resource-adjusted vulnerabilities t...,,The 2019-nCoV epidemic has spread across China...,"[""{'ref_id': 'b0', 'title': 'World Health Orga...",0,755
2,00d16927588fb04d4be0e6b269fc02f0d3c2aa7b,biorxiv_medrxiv,"Real-time, MinION-based, amplicon sequencing f...",Infectious bronchitis (IB) causes significant ...,"Infectious bronchitis (IB), which is caused by...","[""{'ref_id': 'b0', 'title': 'Emergence of nove...",1647,4003
3,0139ea4ca580af99b602c6435368e7fdbefacb03,biorxiv_medrxiv,A Combined Evidence Approach to Prioritize Nip...,Nipah Virus (NiV) came into limelight recently...,Nipah is an infectious negative-sense single-s...,"[""{'ref_id': 'b0', 'title': 'Molecular biology...",326,2399
4,013d9d1cba8a54d5d3718c229b812d7cf91b6c89,biorxiv_medrxiv,Assessing spread risk of Wuhan novel coronavir...,Background: A novel coronavirus (2019-nCoV) em...,"In December 2019, a cluster of patients with p...","[""{'ref_id': 'b0', 'title': 'A Novel Coronavir...",22,4642


# What to do next?
The idea is to organize this information so that it can be understood more efficiently.

How can we "tag" papers?

After we tag them, how can we separate the tags into different classes (probably by frequency of occurrence if that's a good representaiton of generality)

From the corpus that possesses these tags, how can we recompute the frequency of other tags within this corpus, then stratify those

Rinse and repeat. 

We want to be able to go from general -> specific.

This is one way.

Idea: Titles usually have one noun which can be used as a tag


In [13]:
counts = df_covid['abstract'].value_counts()

In [14]:
titles = df_covid['title'].value_counts()

In [15]:
titles.to_csv('titles.csv', index = True)

  """Entry point for launching an IPython kernel.


PermissionError: [Errno 13] Permission denied: 'titles.csv'

In [17]:
a = 'A*02:06-p4 (LTA-J-FLIFL) A*02:07-p5 (LLDS-J-YERL) A*02:07-p6 (LLDSD-J-ERL)'
df_covid[df_covid.title == 'A*02:06-p4 (LTA-J-FLIFL) A*02:07-p5 (LLDS-J-YERL) A*02:07-p6 (LLDSD-J-ERL)']

Unnamed: 0,paper_id,source,title,abstract,body_text,bib_entries,abstract_word_count,body_word_count
9401,ee06cad1229867a6389ed8a7aaffb5148dd5f8ed,comm_use_subset,A*02:06-p4 (LTA-J-FLIFL) A*02:07-p5 (LLDS-J-YE...,A Supporting Information Figure 1 A. Design of...,IT-TOF. The mass to charge ratios (m/z) of pep...,"[""{'ref_id': 'b1', 'title': 'Cytolytic T lymph...",307,330


In [18]:
df_covid[df_covid.title == 'A*02:06-p4 (LTA-J-FLIFL) A*02:07-p5 (LLDS-J-YERL) A*02:07-p6 (LLDSD-J-ERL)'].iloc[0]['body_text']

"IT-TOF. The mass to charge ratios (m/z) of peptides were measured after liquid chromatography (LC) separation on a C18 column. As both isomers of the photocleavable 3-amino-3-(2nitro) phenyl-propanoic acid (Anp) used during solid phase peptide synthesis of the conditional ligand, 2 diasteroisomers could usually be identified. In some cases, however, the two isomers failed to separate during chromatography. The data displayed is for one of the two isomers, mostly for doubly charged molecules (M 2+ ), together with the amino acid sequences of conditional ligands where the letter J represents Anp.\nA. Cells were first analyzed for their size and granularity based on their forward scatter area (FSC-A) and side scatter area (SSC-A) profile. Doublets were excluded using side-scatter height (SSC-H) vs side-scatter width (SSC-W) and forward-scatter height (FSC-H) vs forwardscatter width (FSC-W) parameters. Cells present in all three gates were analyzed for their time vs forward scatter area (

# Removing Stopwords and unnecessary text

   - medrxiv https://www.medrxiv.org/content/medrxiv/early/2020/03/24/2020.03.22.20040915.full.pdf
   - biorxiv https://www.biorxiv.org/content/biorxiv/early/2020/01/11/2020.01.10.901801.full.pdf


In [19]:
df_covid.source.value_counts()

custom_license        10527
comm_use_subset        8177
noncomm_use_subset     1570
biorxiv_medrxiv         776
Name: source, dtype: int64

In [61]:
df_covid.body_text[4]

'In December 2019, a cluster of patients with pneumonia of unknown cause were reported in the city of Wuhan, Hubei Province, China, and epidemiologically linked to a seafood wholesale market [1, 2] . It has been determined that the pathogen causing the viral pneumonia among affected individuals is a new coronavirus (2019-nCoV) [1, 3] . The pathogen exhibits high human-to-human transmissibility and has spread rapidly within and beyond Wuhan city [4, 5] . On January 30 th , 2020, World Health Organization (WHO) declared the 2019-nCoV outbreak a Public Health Emergency of International Concern [6] .\nWuhan is central China\'s transportation hub with a population of 11 million residents and a large number of higher-education students (~1.3 million in 89 universities and colleges), a particularly mobile population [7] . Beyond these factors, viral spread was likely exacerbated further by the surge in domestic and international travel during the 40-day Lunar New Year (LNY) celebrations (from

In [65]:
medRxiv = []
for i, each in enumerate(df_covid.body_text):
    if ('medRxiv' in each):
        medRxiv.append(i)
medRxiv

[4,
 5,
 11,
 12,
 13,
 16,
 21,
 22,
 26,
 29,
 30,
 32,
 34,
 40,
 41,
 42,
 44,
 45,
 46,
 48,
 52,
 53,
 55,
 56,
 60,
 61,
 65,
 66,
 67,
 68,
 70,
 71,
 72,
 73,
 78,
 82,
 84,
 90,
 94,
 96,
 101,
 102,
 104,
 105,
 109,
 111,
 113,
 114,
 116,
 121,
 122,
 125,
 126,
 127,
 128,
 129,
 132,
 134,
 137,
 143,
 145,
 148,
 153,
 155,
 160,
 161,
 162,
 167,
 169,
 171,
 174,
 178,
 180,
 181,
 184,
 189,
 190,
 194,
 196,
 198,
 201,
 204,
 208,
 210,
 218,
 219,
 220,
 221,
 222,
 223,
 228,
 229,
 232,
 233,
 236,
 238,
 244,
 249,
 250,
 253,
 255,
 257,
 260,
 261,
 262,
 263,
 264,
 267,
 271,
 272,
 273,
 276,
 278,
 281,
 282,
 286,
 287,
 288,
 289,
 291,
 292,
 298,
 299,
 300,
 301,
 304,
 308,
 316,
 317,
 318,
 324,
 334,
 339,
 341,
 343,
 345,
 346,
 347,
 350,
 356,
 359,
 361,
 366,
 367,
 371,
 373,
 374,
 383,
 385,
 386,
 388,
 389,
 390,
 391,
 393,
 401,
 402,
 403,
 407,
 410,
 411,
 412,
 414,
 415,
 418,
 422,
 426,
 432,
 438,
 440,
 444,
 445,
 448,
 450

In [66]:
df_covid.body_text[5]

'The sudden outbreak of the new coronavirus (SARS-CoV-2) at the end of December 2019 poses a huge threat to human health worldwide. The SARS-CoV-2 virus causes severe respiratory disease that can quickly spread from person to person and in some cases lead to death.\nResearchers have found that the new SARS-CoV-2 and SARS coronaviruses invade human cells in target tissues in a similar manner via high-affinity binding to angiotensin-converting enzyme 2 (ACE2) [1] . In recent epidemiological investigations of the spread of the SARS-CoV-2 and a preliminary study of the clinical characteristics of this disease [2] [3] [4] [5] [6] , researchers have found that patients infected with the new coronavirus have severe symptoms similar to those of the SARS infection. The first batch of clinical data reports of SARS-CoV-2 infection cases in China revealed "cytokine storms" in critically ill patients [7, 8] . However, the mechanism of the viral infection and pathological changes in the immune syste

In [88]:
reg = '(|(CC-BY-NC 4.0 International license It is made available under a)| (author\/funder, who has granted medRxiv a license to display the preprint in perpetuity.)|(is the \(which was not peer-reviewed\))|(The copyright holder for this preprint)|(The copyright holder for this preprint . )|(https:\/\/doi.org\/10.[0-9]{4,}?\/[0-9]{4}.[0-9]{2}. [0-9]{2}.[^\s]+)|(https:\/\/doi.org\/10.[0-9]{4,}?\/[0-9]{4}.[0-9]{2}.[0-9]{2}.[^\s]+)| (doi: medRxiv preprint)|(author\/funder. All rights reserved. No reuse allowed without permission.))'
g = re.sub('\\n', ' ', df_covid.body_text[5])
re.sub(reg, '', g)

'The sudden outbreak of the new coronavirus (SARS-CoV-2) at the end of December 2019 poses a huge threat to human health worldwide. The SARS-CoV-2 virus causes severe respiratory disease that can quickly spread from person to person and in some cases lead to death. Researchers have found that the new SARS-CoV-2 and SARS coronaviruses invade human cells in target tissues in a similar manner via high-affinity binding to angiotensin-converting enzyme 2 (ACE2) [1] . In recent epidemiological investigations of the spread of the SARS-CoV-2 and a preliminary study of the clinical characteristics of this disease [2] [3] [4] [5] [6] , researchers have found that patients infected with the new coronavirus have severe symptoms similar to those of the SARS infection. The first batch of clinical data reports of SARS-CoV-2 infection cases in China revealed "cytokine storms" in critically ill patients [7, 8] . However, the mechanism of the viral infection and pathological changes in the immune system