In [1]:
import json
import re
import os
import nltk
from nltk.tokenize import WordPunctTokenizer
from collections import defaultdict

# Collection of relevant terms that will help find COVID-19 articles
key_words = ["COVID-19", "Coronavirus 19", "Coronavirus-19", "COVID 19", "SARS-CoV-2"]
key_words = list(map(str.upper, key_words))
covid_words = re.compile("|".join(key_words))

# Maximum number of COVID 19 files we process
MAX_FILES = 5000

# Minimum number of occurance for a word
TOKEN_FREQUENCY = 3

def consolidate(meta_data):
    '''
    This function wil merge all of the tokens of each file and some additional cleaning.
    '''
    # merge all of the tokens
    tokens = []
    for file in meta_data:
        tokens += meta_data[file]
        
    freq = defaultdict(lambda : 0)
    cleaned_tokens = []
    
    # Remove single character count freqs
    for index, token in enumerate(tokens):
        if not (len(token) <= 1 and not token.isalnum() and token not in ['a', 'i']):
            cleaned_tokens.append(token)
            freq[token] += 1

    tokens = cleaned_tokens
    tokens = list(filter(lambda token: freq[token] >= TOKEN_FREQUENCY, tokens))
    return tokens

def clean(body):
    '''
    Helper function where we can clean the text of the data in the way we want.
    Should return the body as well as the tokens in the body.
    '''
    sentences = nltk.sent_tokenize(body)
    tokens = []
    if sentences:
        for index, sentence in enumerate(sentences):
            tokenizer = WordPunctTokenizer()
            sentence_tokens = tokenizer.tokenize(sentence)

            sentence_tokens.insert(0, '<s>')
            sentence_tokens.append('</s>')
            tokens += sentence_tokens
    return tokens

def process(max_files):
    '''
    Function will build and collect the text and metadata associated with the 
    COVID dataset will use to train.
    '''
    # metadata is {file: tokens}
    meta_data = {}
    base_path = "CORD-19-research-challenge/document_parses/pdf_json/"
    
    for path in os.listdir(base_path):
        # Keep processing till we have enough data
        if max_files < 0:
            # Writing the text
            with open('corpus.json', 'w') as fp:
                tokens = consolidate(meta_data)
                corpus = {'count' : len(tokens), 'tokens' : tokens}
                json.dump(corpus, fp)
        
            return
        
        # I/O
        file = open(base_path + path)
        text = json.load(file)
        file.close()

        # Check if we have a match in the metadata of the article
        if covid_words.search(text['metadata']['title'].upper()):
            max_files -= 1
            if max_files % 100 == 0:
                print(f"Currently there are {max_files} files left to process.")
            
            # The body of text is always in list of texts
            raw = " ".join([content['text'] for content in text['body_text']])
            
            # Clean the body of text and return the tokens
            tokens = clean(raw)
            
            meta_data[path] = tokens

    # Writing the text
    with open('corpus.json', 'w') as fp:
        tokens = consolidate(meta_data)
        corpus = {'count' : len(tokens), 'tokens' : tokens}
        json.dump(corpus, fp)

process(max_files=MAX_FILES)

Currently there are 4900 files left to process.
Currently there are 4800 files left to process.
Currently there are 4700 files left to process.
Currently there are 4600 files left to process.
Currently there are 4500 files left to process.
Currently there are 4400 files left to process.
Currently there are 4300 files left to process.
Currently there are 4200 files left to process.
Currently there are 4100 files left to process.
Currently there are 4000 files left to process.
Currently there are 3900 files left to process.
Currently there are 3800 files left to process.
Currently there are 3700 files left to process.
Currently there are 3600 files left to process.
Currently there are 3500 files left to process.
Currently there are 3400 files left to process.
Currently there are 3300 files left to process.
Currently there are 3200 files left to process.
Currently there are 3100 files left to process.
Currently there are 3000 files left to process.
Currently there are 2900 files left to p

In [2]:
file = open('corpus.json')
text = json.load(file)
file.close()


In [3]:
text

{'count': 57289,
 'tokens': ['<s>',
  'There',
  'is',
  'a',
  'disproportionate',
  'number',
  'of',
  'individuals',
  'with',
  'mental',
  'and',
  'somatic',
  'illnesses',
  'among',
  'persons',
  'in',
  'detention',
  '2020',
  '2012',
  '</s>',
  '<s>',
  'It',
  'is',
  'also',
  'known',
  'that',
  'infections',
  'which',
  'are',
  'transmitted',
  'human',
  'to',
  'human',
  'via',
  'droplet',
  'or',
  'close',
  'contact',
  'spread',
  'particularly',
  'well',
  'in',
  'confined',
  'spaces',
  '</s>',
  '<s>',
  'Since',
  'transfer',
  'options',
  'for',
  'further',
  'treatment',
  'are',
  'more',
  'difficult',
  'especially',
  'in',
  'detention',
  'facilities',
  'preventive',
  'measures',
  'are',
  'strongly',
  'emphasized',
  'particularly',
  'in',
  'the',
  'case',
  'of',
  'viral',
  'droplet',
  'infections',
  '</s>',
  '<s>',
  'For',
  'example',
  'in',
  'the',
  'context',
  'of',
  'influenza',
  'vaccination',
  'of',
  'detainees