In [22]:
import spacy
import scispacy
import deplacy
from spacy import displacy
import stanza
import re
import pandas as pd
from scispacy.abbreviation import AbbreviationDetector
from scispacy.hyponym_detector import HyponymDetector
from spacy.matcher import Matcher
import matplotlib.pyplot as plt
import networkx as nx

In [2]:
#loading the data
# NOTE(review): absolute, machine-specific path - point DATA_PATH at your own copy of the file.
DATA_PATH = "/Users/abhinavshinow/Documents/GitHub/KG-Genaration-from-Biomedical-Text-Using-NER/Data_Collection/heart_basic-data.txt"
# Upper bound on the number of characters kept.
# NOTE(review): presumably chosen to stay within spaCy's default nlp.max_length - confirm.
MAX_CHARS = 1_000_000

# Explicit encoding so decoding does not depend on the platform's locale default.
with open(DATA_PATH, "r", encoding="utf-8") as f:
    # Flatten paragraph breaks first, then the remaining line breaks, into single spaces.
    text = f.read().replace("\n\n", " ").replace("\n", " ")
text = text[:MAX_CHARS]

In [3]:
# Number of characters in `text` after loading and truncation.
len(text)

1000000

In [15]:
# scispaCy core pipeline, extended with the abbreviation and hyponym detector components
nlp_core = spacy.load("en_core_sci_lg")
nlp_core.add_pipe("abbreviation_detector")
nlp_core.add_pipe(
    "hyponym_detector",
    config={"extended": False},
    last=True,
)

# specialised scispaCy NER pipelines, loaded in the order the names are listed
nlp_jnlpba, nlp_craft, nlp_bionlp, nlp_bc5cdr = map(
    spacy.load,
    (
        "en_ner_jnlpba_md",
        "en_ner_craft_md",
        "en_ner_bionlp13cg_md",
        "en_ner_bc5cdr_md",
    ),
)

# Stanza English pipeline: "mimic" package with the "i2b2" NER model
nlp_stanza = stanza.Pipeline(
    "en",
    package="mimic",
    processors={"ner": "i2b2"},
)

2021-12-10 21:14:06 INFO: Loading these models for language: en (English):
| Processor | Package |
-----------------------
| tokenize  | mimic   |
| pos       | mimic   |
| lemma     | mimic   |
| depparse  | mimic   |
| ner       | i2b2    |

2021-12-10 21:14:06 INFO: Use device: cpu
2021-12-10 21:14:06 INFO: Loading: tokenize
2021-12-10 21:14:06 INFO: Loading: pos
2021-12-10 21:14:07 INFO: Loading: lemma
2021-12-10 21:14:07 INFO: Loading: depparse
2021-12-10 21:14:07 INFO: Loading: ner
2021-12-10 21:14:08 INFO: Done loading processors!


In [11]:
# Entity labels exposed by the core pipeline's "ner" component.
nlp_core.pipe_labels['ner']

['ENTITY']

In [18]:
# Run the core scispaCy pipeline (with the abbreviation and hyponym detectors) over the whole text.
doc=nlp_core(text)

In [13]:
# Preview only: printing every entity of the full document floods the notebook output.
ENT_PREVIEW = 100
for ent in doc.ents[:ENT_PREVIEW]:
    print(ent)
print(f"... showing {min(ENT_PREVIEW, len(doc.ents))} of {len(doc.ents)} entities")

heart
muscular organ
animals
pumps blood
blood vessels
circulatory system
pumped blood carries oxygen
nutrients
body
metabolic
waste
carbon dioxide
lungs
humans
heart
size
closed fist
lungs
middle compartment
chest
humans
mammals
birds
heart
chambers
upper left
right atria
lower left
right ventricles
right atrium
ventricle
right heart
left counterparts
left heart
Fish
chambers
atrium
ventricle
reptiles
chambers
healthy
heart blood
heart
heart valves
prevent
backflow
heart
enclosed
protective sac
pericardium
amount
fluid
heart
layers
epicardium
myocardium
endocardium
heart pumps
blood
rhythm
group
pacemaker cells
sinoatrial node
contraction
heart
traveling
atrioventricular node
conduction system
heart
heart
blood low
oxygen
systemic
circulation
right atrium
superior
inferior venae cavae
right ventricle
pumped
pulmonary circulation
lungs
oxygen
carbon dioxide
Oxygenated blood
left atrium
left ventricle
aorta
systemic
circulation‚àíwhere the oxygen
metabolized
carbon dioxide
heart beats
r

In [17]:
#Hearst Patterns
# Results of the "hyponym_detector" component added to nlp_core.
# A later cell reads index 0 of each entry as the relation, index 1 as the source and index 2 as the target.
doc._.hearst_patterns

[]

In [None]:
#Abbreviations: each detected short form followed by its long form
for abrv in doc._.abbreviations:
    print(f"{abrv} {abrv._.long_form}")

In [None]:
# First long form seen for every abbreviation; entries whose short and long
# text are identical are ignored. Insertion order of the dict is the order of
# first appearance in the document.
first_long_form = {}
for abrv in doc._.abbreviations:
    short_text, long_text = str(abrv), str(abrv._.long_form)
    if short_text not in first_long_form and short_text != long_text:
        first_long_form[short_text] = long_text
abbreviations = list(first_long_form)
long_form = list(first_long_form.values())

# Hearst-pattern hits: element 0 is used as the relation, 1 as the source, 2 as the target.
hyponyms = list(doc._.hearst_patterns)
relations = [str(hit[0]) for hit in hyponyms]
sources = [str(hit[1]) for hit in hyponyms]
targets = [str(hit[2]) for hit in hyponyms]

# One 'abbreviated-as' edge per abbreviation -> long form pair.
sources.extend(abbreviations)
targets.extend(long_form)
relations.extend(['abbreviated-as'] * len(abbreviations))

In [None]:
#Pos Tagging and Dependency parsing
def extract(sent):
    """Pull a rough (subject, relation, object) triple out of one sentence.

    The sentence is parsed with ``nlp_core``. The relation is the text of the
    last ``Matcher`` hit for the pattern "ROOT token, optionally followed by a
    'prep' token, an 'agent' token and an adjective". Subject and object are
    assembled while walking the tokens: a token whose dependency label
    contains "subj" becomes the subject and one whose label contains "obj"
    becomes the object, each prefixed with the most recent modifier and
    compound text seen before it.

    Args:
        sent: Sentence text (``str``).

    Returns:
        ``[subject, relation, object]``. Subject and object are stripped
        strings; any element is an empty string when that part was not found.
    """
    # Parse once and reuse the result for both the matcher and the token walk.
    parsed = nlp_core(sent)

    matcher = Matcher(nlp_core.vocab)
    relation_pattern = [{'DEP':'ROOT'},
                        {'DEP':'prep','OP':"?"},
                        {'DEP':'agent','OP':"?"},
                        {'POS':'ADJ','OP':"?"}]
    matcher.add("relation", [relation_pattern])
    matches = matcher(parsed)
    if matches:
        # Same choice as before: the last match returned by the matcher.
        _, start, end = matches[-1]
        relation = parsed[start:end].text
    else:
        # No match (e.g. empty input): report an empty relation instead of raising IndexError.
        relation = ""

    ent1 = ent2 = ""
    prefix = modifier = ""
    prv_tok_dep = prv_tok_text = ""
    for tok in parsed:
        if tok.dep_ == "punct":
            continue

        # Compound tokens build up a prefix for the next subject/object.
        if tok.dep_ == "compound":
            prefix = tok.text
            if prv_tok_dep == "compound":
                prefix = prv_tok_text + " " + tok.text

        # Labels ending in "mod" are remembered as the modifier.
        if tok.dep_.endswith("mod"):
            modifier = tok.text
            if prv_tok_dep == "compound":
                modifier = prv_tok_text + " " + tok.text

        # Plain substring test: `find(...) == True` was only true when the
        # substring started at index 1, so a bare "subj"/"obj" label was missed.
        if "subj" in tok.dep_:
            ent1 = modifier + " " + prefix + " " + tok.text
            # Start collecting afresh for the object.
            prefix = ""
            modifier = ""
            prv_tok_dep = ""
            prv_tok_text = ""

        if "obj" in tok.dep_:
            ent2 = modifier + " " + prefix + " " + tok.text

        prv_tok_dep = tok.dep_
        prv_tok_text = tok.text

    return [ent1.strip(), relation, ent2.strip()]

In [None]:
# Collect one [source, relation, target] triple per sentence of the core doc.
# NOTE: sources/relations/targets are appended to, so re-running this cell
# without first re-running the cell that creates them duplicates entries.
ner_core=[]
for sent in doc.sents:
    # extract() parses the sentence on every call, so call it once and unpack.
    triple=extract(str(sent))
    source,relation,target=triple
    # Keep the triple only when none of its parts is empty or whitespace-only.
    if not any(re.search(r"^\s*$",s) for s in [source,target,relation]):
        ner_core.append(triple)
        sources.append(source)
        relations.append(relation)
        targets.append(target)

In [None]:
# for entity_relations in ner_core:
#     print(entity_relations)


# TODO: add a method to remove special characters from the extracted entities

In [None]:
#Ner Models: labels of the "ner" component of each specialised pipeline
print(f"jnlpba-labels {nlp_jnlpba.pipe_labels['ner']}")
print(f"craft-labels {nlp_craft.pipe_labels['ner']}")
print(f"bionlp-labels {nlp_bionlp.pipe_labels['ner']}")
print(f"Bc5cdr-labels {nlp_bc5cdr.pipe_labels['ner']}")

In [None]:
# Labels of the bc5cdr pipeline's "ner" component, printed before the entity ruler is attached.
print("Bc5cdr-labels",nlp_bc5cdr.pipe_labels['ner'])

In [19]:
#Entity-Ruler
# Attach a rule-based entity ruler directly after the statistical "ner" component.
# When this cell is re-run the component already exists and a second add_pipe
# call with the same name fails, so reuse the existing component instead.
if "entity_ruler" in nlp_bc5cdr.pipe_names:
    ruler = nlp_bc5cdr.get_pipe("entity_ruler")
else:
    ruler = nlp_bc5cdr.add_pipe("entity_ruler",after="ner")

In [None]:
# Same print again, now after the entity ruler was attached to nlp_bc5cdr.
# NOTE(review): this reads the labels of the "ner" component only; the ruler's labels are presumably listed under its own key - confirm.
print("Bc5cdr-labels",nlp_bc5cdr.pipe_labels['ner'])

In [None]:
#Custom Entities and Labels
# Phrase patterns for the entity ruler: each entry maps an exact phrase to a custom label.
# Changes from the previous version: the mis-encoded apostrophes in the two
# "...’s disease" phrases are restored, and the duplicated Causes/"stress" entry is dropped.
# NOTE(review): "aneurysm" and "peripheral artery disease" are listed under both
# CVD and Complications - decide which label should win.
pattern = [
    #CVD
    {"label":"CVD","pattern":"cardiomyopathies"},
    {"label":"CVD","pattern":"angina"},
    {"label":"CVD","pattern":"arrhythmia"},
    {"label":"CVD","pattern":"dilated cardiomyopathy"},
    {"label":"CVD","pattern":"hypertrophic cardiomyopathy"},
    {"label":"CVD","pattern":"mitral regurgitation"},
    {"label":"CVD","pattern":"mitral valve prolapse"},
    {"label":"CVD","pattern":"pulmonary stenosis"},
    {"label":"CVD","pattern":"aortic stenosis"},
    {"label":"CVD","pattern":"atrial fibrillation"},
    {"label":"CVD","pattern":"peripheral artery disease"},
    {"label":"CVD","pattern":"aneurysm"},
    {"label":"CVD","pattern":"atherosclerosis"},
    {"label":"CVD","pattern":"raynaud’s disease"},
    {"label":"CVD","pattern":"peripheral venous disease"},
    {"label":"CVD","pattern":"ischemic stroke"},
    {"label":"CVD","pattern":"venous blood clots"},
    {"label":"CVD","pattern":"buerger’s disease"},

    #Drug
    {"label":"Drug","pattern":"statins"},
    {"label":"Drug","pattern":"aspirin"},
    {"label":"Drug","pattern":"clopidogrel"},
    {"label":"Drug","pattern":"warfarin"},
    {"label":"Drug","pattern":"beta-blockers"},
    {"label":"Drug","pattern":"ace inhibitors"},
    {"label":"Drug","pattern":"apixaban"},
    {"label":"Drug","pattern":"dabigatran"},
    {"label":"Drug","pattern":"edoxaban"},
    {"label":"Drug","pattern":"heparin"},
    {"label":"Drug","pattern":"rivaroxaban"},
    {"label":"Drug","pattern":"dipyridamole"},
    {"label":"Drug","pattern":"prasugrel"},
    {"label":"Drug","pattern":"ticagrelor"},

    #Symptoms
    {"label":"Symptoms","pattern":"chest pain"},
    {"label":"Symptoms","pattern":"chest tightness"},
    {"label":"Symptoms","pattern":"chest pressure"},
    {"label":"Symptoms","pattern":"chest discomfort"},
    {"label":"Symptoms","pattern":"shortness of breath"},
    {"label":"Symptoms","pattern":"numbness"},
    {"label":"Symptoms","pattern":"coldness"},
    {"label":"Symptoms","pattern":"neck pain"},
    {"label":"Symptoms","pattern":"jaw pain"},
    {"label":"Symptoms","pattern":"throat pain"},
    {"label":"Symptoms","pattern":"upper abdomen pain"},
    {"label":"Symptoms","pattern":"back pain"},

    #Parts of the human heart
    {"label":"Parts of the human heart","pattern":"superior vena cava"},
    {"label":"Parts of the human heart","pattern":"inferior vena cava"},
    {"label":"Parts of the human heart","pattern":"pulmonary vein"},
    {"label":"Parts of the human heart","pattern":"right atrium"},
    {"label":"Parts of the human heart","pattern":"pulmonary valve"},
    {"label":"Parts of the human heart","pattern":"tricuspid valve"},
    {"label":"Parts of the human heart","pattern":"right ventricle"},
    {"label":"Parts of the human heart","pattern":"aorta"},
    {"label":"Parts of the human heart","pattern":"pulmonary artery"},
    {"label":"Parts of the human heart","pattern":"left atrium"},
    {"label":"Parts of the human heart","pattern":"mitral valve"},
    {"label":"Parts of the human heart","pattern":"aortic valve"},
    {"label":"Parts of the human heart","pattern":"left ventricle"},

    #Causes
    {"label":"Causes","pattern":"family history"},
    {"label":"Causes","pattern":"smoking"},
    {"label":"Causes","pattern":"poor diet"},
    {"label":"Causes","pattern":"high blood pressure"},
    {"label":"Causes","pattern":"high blood cholesterol levels"},
    {"label":"Causes","pattern":"age"},
    {"label":"Causes","pattern":"sex"},
    {"label":"Causes","pattern":"stress"},
    {"label":"Causes","pattern":"obesity"},
    {"label":"Causes","pattern":"physical inactivity"},
    {"label":"Causes","pattern":"poor dental health"},

    #Complications
    {"label":"Complications","pattern":"heart failure"},
    {"label":"Complications","pattern":"heart attack"},
    {"label":"Complications","pattern":"stroke"},
    {"label":"Complications","pattern":"aneurysm"},
    {"label":"Complications","pattern":"peripheral artery disease"},
    {"label":"Complications","pattern":"sudden cardiac arrest"},

    #Prevention
    {"label":"Prevention","pattern":"healthy weight"},
    {"label":"Prevention","pattern":"manage stress"},
    {"label":"Prevention","pattern":"hygiene"},
    {"label":"Prevention","pattern":"exercise"},
    {"label":"Prevention","pattern":"diet"},
    ]

In [None]:
# Add an UPPERCASE and a Capitalized variant of every pattern so the ruler also
# matches those spellings of each phrase.
# (label, pattern) pairs already in the list are skipped, so re-running this
# cell no longer keeps appending duplicates.
_seen_patterns = {(entry["label"], entry["pattern"]) for entry in pattern}
for entry in list(pattern):  # iterate over a snapshot; `pattern` grows inside the loop
    for variant in (entry["pattern"].upper(), entry["pattern"].capitalize()):
        key = (entry["label"], variant)
        if key not in _seen_patterns:
            _seen_patterns.add(key)
            pattern.append({"label": entry["label"], "pattern": variant})

In [None]:
# Register the custom phrase patterns on the ruler attached to nlp_bc5cdr.
ruler.add_patterns(pattern)

# One document per specialised NER pipeline, in the order the models were loaded.
# Previously all four were produced by nlp_bc5cdr, so the other three models
# were never applied and the four documents were identical.
doc_ner1 = nlp_jnlpba(text)
doc_ner2 = nlp_craft(text)
doc_ner3 = nlp_bionlp(text)
doc_ner4 = nlp_bc5cdr(text)  # the pipeline carrying the custom entity ruler

In [None]:
# Preview only: printing every entity of the full document floods the notebook output.
NER_PREVIEW = 100
for ent in doc_ner1.ents[:NER_PREVIEW]:
    print(ent)
print(f"... showing {min(NER_PREVIEW, len(doc_ner1.ents))} of {len(doc_ner1.ents)} entities")

In [None]:
# Visualise the entities of doc_ner1 with displaCy.
# NOTE(review): inside Jupyter, displacy.render presumably displays the markup itself and returns None, leaving `html` empty - confirm.
html=displacy.render(doc_ner1,style="ent")

In [None]:
# Visualise the entities of doc_ner2 with displaCy.
# NOTE(review): inside Jupyter, displacy.render presumably displays the markup itself and returns None, leaving `html` empty - confirm.
html=displacy.render(doc_ner2,style="ent")

In [None]:
# Visualise the entities of doc_ner3 with displaCy.
# NOTE(review): inside Jupyter, displacy.render presumably displays the markup itself and returns None, leaving `html` empty - confirm.
html=displacy.render(doc_ner3,style="ent")

In [None]:
# Visualise the entities of doc_ner4 with displaCy.
# NOTE(review): inside Jupyter, displacy.render presumably displays the markup itself and returns None, leaving `html` empty - confirm.
html=displacy.render(doc_ner4,style="ent")

In [None]:
# Entities and their labels added to the sources, targets and relations lists:
# every recognised entity becomes a (entity text) -NER-> (entity label) edge.

docs=[doc_ner1,doc_ner2,doc_ner3,doc_ner4]
# The loop variable is deliberately not called `doc`: that name holds the core
# document used by earlier cells and must not be overwritten here.
for ner_doc in docs:
    for ent in ner_doc.ents:
        # Store plain text so these entries are strings like the rest of `sources`.
        sources.append(ent.text)
        targets.append(ent.label_)
        relations.append('NER')

In [None]:
# Run the Stanza pipeline (mimic package, i2b2 NER) over the same text.
doc_stanza = nlp_stanza(text)