In [1]:
import os
import re
import json
import math

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

import spacy
from spacy.matcher import Matcher

from tqdm import tqdm

nlp = spacy.load("en_core_web_sm")

In [2]:
debug = False
articles = {}
stat = { }
for dirpath, subdirs, files in os.walk('/kaggle/input'):
    for x in files:
        if x.endswith(".json"):
            articles[x] = os.path.join(dirpath, x)        
df = pd.read_csv('metadata.csv')


# Current Research Status

The curent research notebook is focusin on understanding the behavior of the coronavirus by scraping the provided articles. The current version of the notebook is providing information about the following:
* Symptoms
* Incubation Period
* Quarantine
* Transmission Methods

The mining is running on a custom rule matching engine built on top of Spacy in order to use POS in terms identification.

# Prepare term libraries

* virus_ref - virus references in articles. is used to filter the dataset when specific referentiation is needed
* symptoms - list of generic symptoms
* higher_terms - list of terms that are used to define the starting point of an age group
* lower_terms - list of terms that are used to define the end of an age group

In [3]:
virus_ref = ['covid-19', 'coronavirus', 'cov-2', 'sars-cov-2', 'sars-cov', 'hcov', '2019-ncov']
symptoms = ['weight loss','chills','shivering','convulsions','deformity','discharge','dizziness','vertigo','fatigue','malaise','asthenia','hypothermia','jaundice','muscle weakness','pyrexia','sweats','swelling','swollen','painful lymph node','weight gain','arrhythmia','bradycardia','chest pain','claudication','palpitations','tachycardia','dry mouth','epistaxis','halitosis','hearing loss','nasal discharge','otalgia','otorrhea','sore throat','toothache','tinnitus','trismus','abdominal pain','fever','bloating','belching','bleeding','blood in stool','melena','hematochezia', 'constipation','diarrhea','dysphagia','dyspepsia','fecal incontinence','flatulence','heartburn','nausea','odynophagia','proctalgia fugax','pyrosis','steatorrhea','vomiting','alopecia','hirsutism','hypertrichosis','abrasion','anasarca','bleeding into the skin','petechia','purpura','ecchymosis and bruising','blister','edema','itching','laceration','rash','urticaria','abnormal posturing','acalculia','agnosia','alexia','amnesia','anomia','anosognosia','aphasia and apraxia','apraxia','ataxia','cataplexy','confusion','dysarthria','dysdiadochokinesia','dysgraphia','hallucination','headache','akinesia','bradykinesia','akathisia','athetosis','ballismus','blepharospasm','chorea','dystonia','fasciculation','muscle cramps','myoclonus','opsoclonus','tic','tremor','flapping tremor','insomnia','loss of consciousness','syncope','neck stiffness','opisthotonus','paralysis and paresis','paresthesia','prosopagnosia','somnolence','abnormal vaginal bleeding','vaginal bleeding in early pregnancy', 'miscarriage','vaginal bleeding in late pregnancy','amenorrhea','infertility','painful intercourse','pelvic pain','vaginal discharge','amaurosis fugax','amaurosis','blurred vision','double vision','exophthalmos','mydriasis','miosis','nystagmus','amusia','anhedonia','anxiety','apathy','confabulation','depression','delusion','euphoria','homicidal ideation','irritability','mania','paranoid ideation','suicidal ideation','apnea','hypopnea','cough','dyspnea','bradypnea','tachypnea','orthopnea','platypnea','trepopnea','hemoptysis','pleuritic chest pain','sputum production','arthralgia','back pain','sciatica','Urologic','dysuria','hematospermia','hematuria','impotence','polyuria','retrograde ejaculation','strangury','urethral discharge','urinary frequency','urinary incontinence','urinary retention']
higher_terms = ['over', 'above', 'higher', 'older', '>', 'over', 'less']
lower_terms = ['under', 'below', 'fewer', 'younger', '<', 'under', 'more']

# Defined generic Spacy pattern matchers and util library

the patterns will be used in order to assemble a set of rules to identify the desired sequence of patterns

* matchers
 - Term Matcher - lookout for a single term
 - Terms Matcher - lookout in a list of terms
 - Number Suffix Matcher - search for numeric value preceeded by a time definition (parametrized, e.g: ["day", "year"])
 - Number Interval Matcher - search for numeric intervals of time definition (parametrized, e.g: ["minute", "day", "year"])
 
 
* plot_dict - utility to plot a dictionary
* dict_counter - increase or set the value of a key in the dictionary
* day_value - report the time value in days
* report_interval - populates dictionary with values for an interval (e.g. 4-7 => {4: 1, 5: 1, 6: 1, 7: 1})
* virus_match - checks if any virus term is referenced in the text

In [4]:
matchers = {    
    "Term Matcher": lambda term: [{'LOWER': t} for t in term.split(' ')],
    "Terms Matcher": lambda terms: [{"LOWER": {"IN": terms } }],
    "Number Suffix Matcher": lambda periods: [
        {'LIKE_NUM': True},
        {"TEXT": {"REGEX": f'({"|".join(periods)})'}}
    ],
    "Number Interval Matcher": lambda periods: [
        {'POS': 'NUM',},
        {'TEXT': {'REGEX': f'({"|".join(periods)})'}, 'OP': '?'},
        {'DEP': 'quantmod', 'OP': '?'},
        {'DEP': 'punct', 'OP': '?'},
        {'DEP': 'prep', 'OP': '?'},
        {'POS': 'NUM'},
        {'TEXT': {'REGEX': f'({"|".join(periods)})'}},
    ],
    "Group Matcher": [
        {"TEXT": {"IN": higher_terms+lower_terms }}
    ]
}


In [5]:

def plot_dict(stat, t = 10, sort_values = False):
    filtered = dict(stat)
    to_delete = []
    for key in filtered:
        if filtered[key] < t:
            to_delete.append(key)
    for key in to_delete:
        del filtered[key]

    
    if sort_values == False:
        lists = sorted(filtered.items())
    else:
        lists = sorted(filtered.items(),key=lambda item: item[1])
        
    figure(num=None, figsize=(20, 4))
    x, y = zip(*lists) # unpack a list of pairs into two tuples
    plt.bar(x, y)
    plt.show()
    

def merge_keys(mergers, obj):
    result = dict(obj)
    for key, arr in mergers:
        if key not in result:
            result[key] = 0
        for merger in arr:
            if merger in result:
                result[key] = result[key] + result[merger]
                del result[merger]
    return result

def dict_counter(res, arg):
    try:
        key = str(arg)
        res.setdefault(key, 0)
        res[key] = res[key] + 1
    except:
        pass

def numval(val):
    try:
        return int(float(str(val))) 
    except:
        return None
    
def day_value(val, rep = None):
    
    if rep != None:
        val = numval(val.text)
        if val != None and 'week' in rep.text:
            val = val * 7
        return val
    else:
        return None

def report_interval(res, min_val, max_val):       
    if min_val != None and max_val != None:
        for key in range(min_val, max_val):
            res.setdefault(key, 0)
            res[key] = res[key] + 1    

def virus_match(text):
    return len(re.findall(rf'\ ({"|".join(virus_ref)})\ ', text)) > 0

## Prepare COVID-19 Literature dataset

In [6]:
literature = []
for index, row in tqdm(df.iterrows(), total=df.shape[0]):
    sha = str(row['sha'])
    if sha != 'nan':
        sha = sha + '.json';
        try:
            allow = False
            with open(articles[sha]) as f:
                data = json.load(f)
                for key in ['abstract', 'body_text']:
                    if allow == False and key in data:
                        for abstract in data['abstract']:
                            allow = allow or virus_match(abstract['text'])
                            if allow:
                                literature.append({'file': articles[sha], 'abstract': data['abstract'], 'body': data['body_text']})
                                break;
        except KeyError:
            pass

100%|██████████| 44220/44220 [00:04<00:00, 10183.75it/s]


# Define rule matching execution functions

When executing a set of rules on a text body the order of the mathcers will be appended to the matching array that will allow the user to define conditional executions when rules match. 

Rule example:
```
rule = {    
    "Matchers": [
        ("Term Matcher", [
            {"LOWER": "incubation"},
            {"LOWER": "period", "OP": "?"}
        ]),
        ("Time Matcher", matchers["Number Suffix Matcher"](["days", "weeks"])),
        ("Time Interval Matcher", matchers["Number Interval Matcher"](["days", "weeks"]))
    ],
    "root": {          
        "Term Matcher": { 
            "execute": lambda x: print(x),
            "Time Matcher": incubation_period_report ,
            "Time Interval Matcher": incubation_period_report,
        }
    }
}
```

A rule is made of *Matchers* and *executors* - starting with **root executor**. The example above defines the following matchers:
* Term Matcher -> looks for the term **incubation** and optionally **period**
* Time Matcher -> matches all time references in days / weeks
* Time Interval Matcher -> matches all time intervals in days / weeks

The next item in the dictionary is *root* that defines the preferred matching order execution (if the order is not satisfied then the matcher executor won't get called)
* Term Matcher -> Time Matcher
* Term Matcher -> Time Interval Matcher

If the matcher rule has the **execute** key present in the dictionary then the rule will get executed even if further specific matchers will get called later


In [7]:
def execute_matches(match_arr, root, sentence, file, aggregate = []):
    key, result = match_arr[0]
    rest = match_arr[1:]
    state = aggregate + [(key, result)]
    
    if key in root:
        rule = root[key]
        if callable(rule):
            rule( (result, state, sentence, file) )
        else:
            if 'execute' in rule:
                rule['execute']( (result, state, sentence, file) )
            if len(rest) > 0:
                execute_matches(rest, rule, sentence, file, state)
    
    if len(rest) > 0:
        execute_matches(rest, root, sentence, file, state)

def match_parser(matcher, doc, rule, file):
    matches = matcher(doc)
    if len(matches)>0:
        to_process = []
        for match_id, start, end in matches:
            string_id = nlp.vocab.strings[match_id]  # Get string representation
            span = doc[start:end]  # The matched span
            to_process.append((string_id, span))
        execute_matches(to_process, rule['root'], doc, file)

def parse_body(matcher, text, rule, file = None, sentence_level = False):
    text = text.lower()
    doc = nlp(text)
    
    if sentence_level == True:    
        for sent in doc.sents:
            sent_doc = nlp(sent.text)
            match_parser(matcher, sent_doc, rule, file)
    else:
        match_parser(matcher, doc, rule, file)

def execute_ruleset(term, rule, sentence_level = False):
    matcher = Matcher(nlp.vocab)
    for name, m in rule["Matchers"]:
        matcher.add(name, None, m)
    
    for article in tqdm(literature):
        abstracts = article['abstract']
        body = article['body']
        file = article['file']
        
        for body_text in body:
            text = body_text['text']
            if callable(term):
                allow = term(text)
            else:
                allow = term == None or term in text
            if allow == True:
                parse_body(matcher, text, rule, file, sentence_level)        

# Virus Symptoms

Search for virus references and its symptoms in all articles that have a coronavirus reference and at least one term in symptoms dictionary
Since is really important to understand the symptoms the search will be performed on the entire article dataset with no filter. 

In [9]:
stat

{'symptoms': {}}

In [8]:
stat['symptoms'] = {}

def match(text):
    if virus_match(text) == True:
        return len(re.findall(rf'\ ({"|".join(symptoms)})\ ', text)) > 0
    else:
        return False

def symptom(res):
    ref, agregate, sentence, file = res
    dict_counter(stat['symptoms'], ref.text)
        
rule = {    
    "Matchers": [      
       ("Symptoms Reference", matchers['Terms Matcher'](symptoms)),
    ],
    "root": {
        "Symptoms Reference": symptom
    }
}


def symptom_match(text):
    return len(re.findall(r'symptom', text)) > 0

execute_ruleset(symptom_match, rule)
plot_dict(stat['symptoms'], 50, True)

0it [00:00, ?it/s]


ValueError: not enough values to unpack (expected 2, got 0)

<Figure size 1440x288 with 0 Axes>

# Incubation Period

The first analysis is parsing filtered articles that might refer to COVID-19 incubation period. For the moment the term *incubation period* is searched in text abstract in order to identify the potential articles.


In [10]:
stat['incubation_periods'] = {}

def incubation_period_report(x):
    arr = x[1][-2:]
    m1, v1 = arr[0]
    m2, v2 = arr[1]
    
    if m1 == 'Term Matcher':
        if m2 == 'Time Matcher':
            report_interval(stat['incubation_periods'], 0, day_value(v2[0], v2[1]))            
        elif m2 == 'Time Interval Matcher':
            report_interval(stat['incubation_periods'], day_value(v2[0], v2[3]), day_value(v2[1], v2[1]))           
    elif m2 == 'Term Matcher':
        if m2 == 'Time Matcher':
            report_interval(stat['incubation_periods'], 0, day_value(v2[0], v2[1]))
        elif m2 == 'Time Interval Matcher':
            report_interval(stat['incubation_periods'], day_value(v2[0], v2[3]), day_value(v2[1], v2[1]))      

rule = {    
    "Matchers": [
        ("Term Matcher", [
            {"LOWER": "incubation"},
            {"LOWER": "period", "OP": "?"}
        ]),
        ("Time Matcher", matchers["Number Suffix Matcher"](["days", "weeks"])),
        ("Time Interval Matcher", matchers["Number Interval Matcher"](["days", "weeks"]))
    ],
    "root": {          
        "Term Matcher": { 
            "Time Matcher": incubation_period_report ,
            "Time Interval Matcher": incubation_period_report,
        },
        "Day Matcher": { "Term Matcher": incubation_period_report },
        "Day Interval Matcher": { "Term Matcher": incubation_period_report }
    }
}

execute_ruleset('incubation period', rule)
plot_dict(stat['incubation_periods'], 15)

0it [00:00, ?it/s]


ValueError: not enough values to unpack (expected 2, got 0)

<Figure size 1440x288 with 0 Axes>

# Quarantine

Searches for all quarantine recommendations in the articles where coronavirus term is present. The lookout will be performed at the sentence level and not at the full body level for a better approximation.

In [11]:
stat['quarantine'] = {}

def quarantine_matcher(text):
    return virus_match(text) == True and 'quarantine' in text

def quarantine_report(x):
    arr = x[1][-2:]
    m1, v1 = arr[0]
    m2, v2 = arr[1]
    
    if m1 == 'Quarantine Matcher':
        if m2 == 'Time Matcher':
            report_interval(stat['quarantine'], 0, day_value(v2[0], v2[1]))            
        elif m2 == 'Time Interval Matcher':
            report_interval(stat['quarantine'], day_value(v2[0], v2[3]), day_value(v2[1], v2[1]))           
    elif m2 == 'Quarantine Matcher':
        if m2 == 'Time Matcher':
            report_interval(stat['quarantine'], 0, day_value(v2[0], v2[1]))
        elif m2 == 'Time Interval Matcher':
            report_interval(stat['quarantine'], day_value(v2[0], v2[3]), day_value(v2[1], v2[1]))      
            
rule = {    
    "Matchers": [
        ("Quarantine Matcher", [
            {"LOWER": "quarantine"},
        ]),
        
        ("Time Matcher", matchers["Number Suffix Matcher"](["days", "weeks"])),
        ("Time Interval Matcher", matchers["Number Interval Matcher"](["days", "weeks"]))
    ],
    "root": {          
        "Quarantine Matcher": { 
            "Time Matcher": quarantine_report ,
            "Time Interval Matcher": quarantine_report,
        },
        "Day Matcher": { "Quarantine Matcher": quarantine_report },
        "Day Interval Matcher": { "Quarantine Matcher": quarantine_report }
    }
}

execute_ruleset('quarantine', rule)
plot_dict(stat['quarantine'], 10)

0it [00:00, ?it/s]


ValueError: not enough values to unpack (expected 2, got 0)

<Figure size 1440x288 with 0 Axes>

# Transmission methods

Searches for all transmission method (NOUNS) in articles where coronavirus is mentioned.

In [12]:
stat['transmission'] = {}
def process_matcher(x):
    arr = x[1][-4:]
    sentence = x[2]
    m1, v1 = arr[0]
    m2, v2 = arr[1]
    m3, v3 = arr[2]
    m4, v4 = arr[3]
    if m1 == 'Term Matcher' and m2 == "Verbs Matcher" and m3 == "Form Matcher":
#         print(v1, '->', v4, sentence[v3.end:v4.end], '>>>', sentence[v3.end:])
        tokens = []
        for token in sentence[v3.end:]:
            if token.pos_ == 'PUNCT' and token.tag_ != 'HYPH':
                break;
            else:
                tokens.append(token.lemma_)
        result = ' '.join(tokens)
        result = re.sub(r' - ', '-', result)
        
        for res in re.split(r' (and|or) ', result):
            val = res.strip()
            if len(val) > 1 and val not in ['or', 'and']:
                dict_counter(stat['transmission'], val)
        
        
rule = {    
    "Matchers": [
        ("Term Matcher", matchers["Terms Matcher"](['transmit','transmitted', 'spread', 'spreaded'])),
        ("Form Matcher", matchers["Terms Matcher"](['through', 'by', 'via'])),
        ("Verbs Matcher", [            
            {"POS": "VERB"},            
        ]),
        ("Noun Matcher", [
            {"POS": "NOUN"},            
        ])
    ],
    "root": {          
        "Term Matcher": {
            "Form Matcher": {
                "Noun Matcher": process_matcher
            }
        }
    }
}

def filter(text):
    return len(re.findall(r'(transmitted|transmit|spread|spreaded)', text)) > 0

execute_ruleset(filter, rule, sentence_level = False)

filtered = dict(stat['transmission'])

to_delete = []
for key in filtered:
    if filtered[key] < 5 or len(key) > 20:
        to_delete.append(key)
for key in to_delete:
    del filtered[key]

plot_dict(filtered, 6, True)

0it [00:00, ?it/s]


ValueError: not enough values to unpack (expected 2, got 0)

<Figure size 1440x288 with 0 Axes>