In [None]:
### Code for spaCy pattern matching. The entire code is run in a Kaggle notebook because the dataset is huge and can be accessed there without downloading it.

In [None]:
pip install spacy

In [None]:
## Loading required libraries 
import os
import re
import json
import math

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns


import spacy
from spacy.matcher import Matcher
from tqdm import tqdm

## Load spaCy's pre-trained English statistical model "en_core_web_sm".
## The resulting `nlp` object is shared by the parsing and matching functions below.
nlp = spacy.load("en_core_web_sm")

In [None]:
## Index every JSON file found under the Kaggle input directory by its file name
debug = False
stat = {}
# file name -> full path; a later file with the same name replaces an earlier one
articles = {
    name: os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
    if name.endswith(".json")
}
## Metadata table of the dataset (its 'sha' column is used later to look up the JSON files)
df = pd.read_csv('/kaggle/input/CORD-19-research-challenge/metadata.csv')

In [None]:
## Known terms used to narrow down the search through the articles

# Terms whose presence marks a text as COVID-related
covid_reference = [
    'covid-19',
    'coronavirus',
    'cov-2',
    'sars-cov-2',
    'sars-cov',
    'hcov',
    '2019-ncov',
]

# Candidate risk factors to look for (strings kept exactly as originally written)
risk_factors = [
    'diabetes',
    'hypertension',
    'heart disease',
    'cancer',
    'smoking',
    'lung disease',
    'alcohol',
    'climate',
    'small children',
    'age',
    'immune compromised groups',
    'race/ethnicity',
    'mental hospital inpatients',
    'long-term care facility residents',
    'health workers',
    'pregnancy status',
    'chronic kidney disease',
    'Parkinson Disease',
    'Influenza ',
    'pneumonia',
    'Hepatitis B',
    'alcoholic liver disease',
    'fatty liver ',
    'fungal infection',
    'Thyroid diseases',
    'COPD',
    'chronic bronchitis',
    'obesity',
    'BMI',
    'gender',
    'hemoglobin',
    'HDL cholesterol',
    'LDL cholesterol',
]


In [None]:
## Pattern factories: each entry turns plain search terms into a spaCy token pattern.
## Every pattern compares on the LOWER token attribute, so the terms are lower-cased
## and stripped here; otherwise entries such as 'COPD' or 'Influenza ' could never match.

patterns = {
    # One token spec per whitespace-separated word of `term`, matched in sequence.
    "Term Matcher": lambda term: [{'LOWER': t} for t in term.lower().split()],
    # A single token spec matching any ONE token whose lower-cased text is in `terms`.
    # NOTE(review): multi-word terms (e.g. 'heart disease') cannot match a single token here.
    "Terms Matcher": lambda terms: [{"LOWER": {"IN": [t.strip().lower() for t in terms]}}]
}


In [None]:
### Plot the counts held in a dictionary (e.g. the risk factors) as a bar graph.
def create_dict(stat, t = 10, sort_values = False, barh = False, width = 20, height = 4, title = ''):
    """Draw the entries of ``stat`` as a bar chart and show it.

    Args:
        stat: mapping (or anything ``dict()`` accepts) of label -> numeric value.
        t: accepted for backward compatibility; it does not influence the plot.
        sort_values: when true, bars are ordered by ascending value; otherwise
            the mapping's own order is kept.
        barh: when true, draw horizontal bars instead of vertical ones.
        width: figure width passed to ``figsize``.
        height: figure height passed to ``figsize``.
        title: figure title; no title is set when it is the empty string.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    items = list(dict(stat).items())

    # Previously the plotted list only existed when sorting was requested,
    # so the default call failed with a NameError.
    if sort_values:
        items.sort(key = lambda item : item[1])

    fig = figure(num=None, figsize=(width, height))    # size of the bar graph

    if title != '':
        fig.suptitle(title, fontsize=20)

    # With no entries there is nothing to unpack; show the empty figure instead of failing.
    if items:
        labels, values = zip(*items)
        if barh:
            plt.barh(labels, values)
        else:
            plt.bar(labels, values)

    plt.show()
    
    


## Count the number of times a risk factor occurs in the articles
def risk_counter(res, arg):
    """Increment, in place, the counter stored in ``res`` under ``str(arg)``.

    Args:
        res: dict mapping a key string to its occurrence count; mutated in place.
        arg: value to count; converted to its string form to build the key.

    Returns:
        None.

    Counting is best-effort: ordinary errors (for example ``res`` not being a
    dict) are ignored. ``KeyboardInterrupt`` and ``SystemExit`` are no longer
    swallowed, so a long-running loop can still be interrupted.
    """
    try:
        key = str(arg)
        res[key] = res.get(key, 0) + 1
    except Exception:
        pass

## Tell whether a text is COVID-related, based on the covid_reference terms defined above.
def covidMatch(text):
    """Return True when ``text`` contains any ``covid_reference`` term, ignoring case."""
    hit = re.search(rf'({"|".join(covid_reference)})', text, flags=re.IGNORECASE)
    return hit is not None

In [None]:
## Data pre-processing: collect every abstract / full-text paragraph that contains a
## COVID reference, together with the path of the article file it came from.
# NOTE(review): confirm the article JSON really has a 'full_text' section (CORD-19
# parses commonly name the body 'body_text'); if it does not, only abstracts are scanned.
literature = []
for sha in tqdm(df['sha'], total=df.shape[0]):
    sha = str(sha)
    if sha == 'nan':    # missing sha (NaN): nothing to look up
        continue
    try:
        path = articles[sha + '.json']
        # JSON is read as UTF-8 explicitly instead of depending on the platform default.
        with open(path, encoding='utf-8') as f:
            data = json.load(f)
        for key in ['abstract', 'full_text']:
            if key in data:
                for content in data[key]:
                    text = content['text']
                    if covidMatch(text):
                        literature.append({'file': path, 'body': text})
    except KeyError:
        # No JSON file is indexed for this sha, or a paragraph has no 'text' field:
        # skip (the rest of) this article, as before.
        pass

In [None]:
## functions to execute pattern matching

## Walk the list of matches through the rule tree and fire the handlers found on the way
def result_matches(match_arr, root, sentence, file, index = 0, execution = []):
    """Dispatch the first match of ``match_arr`` and recurse into the remaining ones.

    ``root`` maps a match name either to a callable, or to a nested dict that may
    hold an ``'execute'`` callable plus entries for the matches that follow.
    Every handler receives one tuple ``(result, trail, sentence, file)``, where
    ``trail`` lists the ``(name, result, index)`` triples handled so far,
    including the current one. Processing stops at the first match whose name is
    not in the current level, or once a plain callable has been invoked.
    """
    (key, result), remaining = match_arr[0], match_arr[1:]
    trail = execution + [(key, result, index)]

    if key not in root:
        return

    handler = root[key]
    payload = (result, trail, sentence, file)

    if callable(handler):
        handler(payload)
        return

    if 'execute' in handler:
        handler['execute'](payload)
    if remaining:
        # The nested dict becomes the level against which the next match is resolved.
        result_matches(remaining, handler, sentence, file, index + 1, trail)

        

def merge_matches(matches, doc):
    """Collapse consecutive overlapping or touching matches of the same rule.

    Args:
        matches: iterable of ``(match_id, start, end)`` triples. Only neighbouring
            entries are compared, so triples that should be merged must follow
            each other in the input.
        doc: unused; kept so existing callers keep working.

    Returns:
        A list of ``(match_id, start, end)`` triples. A triple is merged into the
        previous one when it has the same ``match_id`` and starts at or before the
        previous ``end``. An empty input yields an empty list (previously it
        produced a bogus ``(None, None, None)`` entry).
    """
    merged = []
    current = None
    for match_id, start, end in matches:
        if current is None or match_id != current[0] or current[2] < start:
            # Different rule, or a gap after the running span: flush it and start anew.
            if current is not None:
                merged.append(current)
            current = (match_id, start, end)
        elif current[2] < end:
            # Same rule and overlapping/touching: extend the running span.
            current = (match_id, current[1], end)

    if current is not None:
        merged.append(current)
    return merged

## Run the matcher on a document and hand the merged matches to the rule handlers
def match_articles(matcher, doc, rule, file):
    """Apply ``matcher`` to ``doc`` and dispatch what it finds through ``rule['root']``.

    Each merged match is turned into a ``(rule name, matched span)`` pair: the
    name is looked up in ``nlp.vocab.strings`` and the span is ``doc[start:end]``.
    Nothing is dispatched when the matcher finds no match.
    """
    found = matcher(doc)
    if len(found) == 0:
        return

    labelled = [
        (nlp.vocab.strings[match_id], doc[start:end])
        for match_id, start, end in merge_matches(found, doc)
    ]
    result_matches(labelled, rule['root'], doc, file)
        

## Convert article text into spaCy document objects and run the matcher on them
def parse_articles(matcher, text, rule, file = None, sentence_level = False):
    """Lower-case ``text``, parse it with ``nlp`` and match it against ``rule``.

    With ``sentence_level`` set to True every sentence of the parsed text is
    parsed again as its own document and matched separately; otherwise the
    whole text is matched as one document.
    """
    doc = nlp(text.lower())

    if sentence_level == True:
        # Lazily re-parse one sentence at a time, right before it is matched.
        targets = (nlp(sentence.text) for sentence in doc.sents)
    else:
        targets = (doc,)

    for target in targets:
        match_articles(matcher, target, rule, file)


## Create the Matcher, register the rule's patterns and run it over every article
def extract_words(term, rule, sentence_level = False, literature = literature):
    """Match ``rule`` against the text fragments of every article in ``literature``.

    Args:
        term: pre-filter deciding which fragments are parsed at all. A callable
            is called with the fragment and the fragment is parsed when it
            returns a truthy value; a string must occur in the fragment;
            ``None`` lets every fragment through.
        rule: dict with a ``"Matchers"`` list of ``(name, pattern)`` pairs and a
            ``"root"`` handler tree used when dispatching the matches.
        sentence_level: forwarded to ``parse_articles``.
        literature: list of dicts with a ``'body'`` text and a ``'file'`` entry.

    Returns:
        None. Results are produced by the handlers referenced from ``rule``.
    """
    matcher = Matcher(nlp.vocab)
    for name, pattern in rule["Matchers"]:
        # List-of-patterns form: accepted by spaCy >= 2.2.2 and required by spaCy 3,
        # which rejects the older add(name, None, pattern) call.
        matcher.add(name, [pattern])

    # Compiled once (raw string: "\. " is an invalid escape in a normal literal).
    fragment_splitter = re.compile(r"\. ")

    for article in tqdm(literature):
        file = article['file']
        for text in fragment_splitter.split(article['body']):
            if callable(term):
                allow = term(text)
            else:
                allow = term is None or term in text
            if allow:
                parse_articles(matcher, text, rule, file, sentence_level)

In [None]:
## Matcher rule that searches the articles for the risk factors.
## Only text fragments containing the word "risk" are searched.

# Counter filled by the riskfactor callback: matched text -> number of matches
stat['risk_factors'] = {}

def match(text):
    """Return True when ``text`` is COVID-related and names a risk factor.

    The risk-factor check is case-sensitive and requires a space directly
    before and after the term. This helper is not called anywhere in the
    visible part of the notebook.
    """
    if covidMatch(text) != True:
        return False
    return re.search(rf'\ ({"|".join(risk_factors)})\ ', text) is not None

def riskfactor(res):
    """Matcher callback: count one occurrence of the matched risk-factor text.

    ``res`` is the 4-tuple passed to handlers; only its first item, the matched
    span, is used, and its text becomes the counter key.
    """
    matched, _trail, _doc, _path = res
    risk_counter(stat['risk_factors'], matched.text)
    
# Rule wiring: one matcher built from the risk-factor terms, whose matches are
# handled by the riskfactor callback registered under the same name.
rule = {
    "Matchers": [("Risk factors Reference", patterns['Terms Matcher'](risk_factors))],
    "root": {"Risk factors Reference": riskfactor},
}
        

def risk_match(text):
    """Return True when ``text`` contains the lower-case word fragment 'risk'."""
    return re.search(r'risk', text) is not None

# Count the risk factors in every fragment that mentions "risk", then chart the counts.
extract_words(term=risk_match, rule=rule)
create_dict(stat['risk_factors'], t=50, sort_values=True, title="Risk Factors")