# **Labeling nodes in the parser-output JSON-LD**
### **Input:**
1.   Parser output JSON-LD (footnotes referenced) *{Example: NCCN_NSCLGraph_21_23.json}*
2.   medaCy label JSON file (containing node id and label) *{Example: page21_23_medaCy_only_labeled_nodes.json}*

### **Output:**
1. JSON-LD file identical to the input, with the labels from spaCy and medaCy added *{Example: NCCN_NSCLGraph_21_23_labeled.json}*

In [21]:
!pip install spacy==3.3 &> /dev/null

In [22]:
import spacy
from spacy.matcher import Matcher 
from spacy.tokens import Span
import json
import re

In [23]:
##Creating Patterns for labeling
from spacy.lang.en import English

nlp = English()
ruler = nlp.add_pipe("entity_ruler")  ##requires spacy 3
patterns = [{"label": "ASSESSMENT", "pattern": [{"LOWER": "negative"}]},
            {"label": "ASSESSMENT", "pattern": [{"LOWER": "positive"}]},
            {"label": "ASSESSMENT", "pattern":[{"LOWER": "margins"}]},
            {"label": "ASSESSMENT", "pattern":[{"LOWER": "recurrence"}]},
            {"label": "ASSESSMENT", "pattern":[{"LOWER": "pd-l1"}]},
            {"label": "ASSESSMENT", "pattern":[{"LOWER": "stable"}]},
            {"label": "EVALUATION", "pattern": [{"LOWER": "ct"}]},
            {"label": "EVALUATION", "pattern":[{"LOWER": "pet"}]},
            {"label": "EVALUATION", "pattern":[{"LOWER":"pulmonary function tests(pfts)"}]},
            {"label": "EVALUATION", "pattern":[{"LOWER": "bronchoscopy"}]},
            {"label": "EVALUATION", "pattern":[{"LOWER": "mri"}]},
            {"label": "EVALUATION", "pattern":[{"LOWER": "pathology"}]},
            {"label": "EVALUATION", "pattern":[{"LOWER": "mediastinoscopy"}]},
            {"label": "EVALUATION", "pattern":[{"LOWER": "thoracoscopy"}]},
            {"label": "EVALUATION", "pattern":[{"LOWER": "biopsy"}]},
            {"label": "EVALUATION", "pattern":[{"LOWER": "mediastinotomy"}]},
            {"label": "OBSERVATION", "pattern": [{"LOWER": "finding"}]},
            {"label": "OBSERVATION", "pattern":[{"LOWER": "lesion"}]},
            {"label": "OBSERVATION", "pattern":[{"LOWER": "symptomatic"}]},
            {"label": "OBSERVATION", "pattern":[{"LOWER": "asymptomatic"}]},
            {"label": "FOLLOW-UP", "pattern":[{"LOWER": "follow-up"}]},
            {"label": "FOLLOW-UP", "pattern":[{"LOWER": "mo"}]},
            {"label": "RECOMMENDATION", "pattern": [{"LOWER": "consider"}]},
            {"label": "TREATMENT", "pattern": [{"LOWER": "resection"}]},
            {"label": "TREATMENT", "pattern":[{"LOWER": "dissection"}]},
            {"label": "TREATMENT", "pattern":[{"LOWER": "chemoradiation"}]},
            {"label": "TREATMENT", "pattern":[{"LOWER": "reresection"}]},
            {"label": "TREATMENT", "pattern":[{"LOWER": "rt"}]},
            {"label": "TREATMENT", "pattern":[{"LOWER": "sampling"}]},
            {"label": "TREATMENT", "pattern":[{"LOWER": "chemotherapy"}]},
            {"label": "TREATMENT", "pattern":[{"LOWER": "endobronchial obstruction"}]},
            {"label": "TREATMENT", "pattern":[{"LOWER": "superior vena cava (svc) obstruction"}]},
            {"label": "TREATMENT", "pattern":[{"LOWER": "laser"}]},
            {"label": "TREATMENT", "pattern":[{"LOWER": "brachytherapy"}]},
            {"label": "TREATMENT", "pattern":[{"LOWER": "embolization"}]},
            {"label": "TREATMENT", "pattern":[{"LOWER": "orthopedic"}]},
            {"label": "TREATMENT", "pattern":[{"LOWER": "bisphosphonate"}]},
            {"label": "TREATMENT", "pattern":[{"LOWER": "sabr"}]}]
ruler.add_patterns(patterns)

In [24]:
## READING JSON FILES (Parser Output of footnotes marked automatically)
# The context manager closes the handle as soon as the graph is parsed; the
# original left the file open for the rest of the session.
with open('NCCN_NSCLGraph_21_23.json', encoding='utf-8') as file:
    F_markedjsld_21_23 = json.load(file)

In [25]:
## Load the medaCy labels (a JSON object keyed by node id)
with open("page21_23_medaCy_only_labeled_nodes.json") as medacy_fh:
    nodeid_medaCylabel_js = json.loads(medacy_fh.read())

In [26]:
## COMBINING SPACY AND MEDACY LABELS AND CREATING JSON OUTPUT
# Builds nodeid_spaCymedCylabel_dic: str(node '@id') -> sorted list of the
# distinct labels for that node. A node gets its medaCy label (if the labels
# file has an entry for its id) plus the label of every entity the spaCy
# entity ruler finds in its 'nccn:content'. Footnote nodes only receive the
# medaCy label. Sorting keeps the output identical from run to run.
FOOTNOTE_TYPE = "nccn:Footnote"
# Matches "{...}" spans, which are stripped before the text is handed to spaCy
# (presumably the automatically inserted footnote references -- confirm).
BRACED_SPAN_RE = re.compile(r'\{[^\{]+\}')

nodes_medCy = list(nodeid_medaCylabel_js.keys())
nodeid_spaCymedCylabel_dic = {}

for node in F_markedjsld_21_23['@graph']:
    node_id = str(node['@id'])
    labels = set()

    # Direct dictionary lookup instead of scanning the whole key list.
    if node_id in nodeid_medaCylabel_js:
        labels.add(nodeid_medaCylabel_js[node_id])

    # '@type' is accepted either as a single string or as a list of strings;
    # comparing a list against the footnote string directly would never be
    # equal, so footnotes would not be skipped.
    node_types = node['@type']
    if isinstance(node_types, str):
        node_types = [node_types]

    if FOOTNOTE_TYPE not in node_types:
        text = BRACED_SPAN_RE.sub('', node['nccn:content'])
        labels.update(ent.label_ for ent in nlp(text).ents)

    nodeid_spaCymedCylabel_dic[node_id] = sorted(labels)

In [27]:
# Add every collected label to its node's '@type' as "nccn:<label>".
nodes_spaCy_medCy = list(nodeid_spaCymedCylabel_dic.keys())
for node in F_markedjsld_21_23['@graph']:
    # One dictionary lookup per node instead of scanning all node ids.
    node_labels = nodeid_spaCymedCylabel_dic.get(str(node['@id']), [])
    if not node_labels:
        continue

    # A plain-string '@type' is promoted to a list so labels can be appended.
    node_types = node['@type']
    if isinstance(node_types, str):
        node_types = node['@type'] = [node_types]

    for label in node_labels:
        tag = "nccn:" + label
        # Skip tags that are already present so re-running this cell does not
        # pile up duplicates.
        if tag not in node_types:
            node_types.append(tag)

In [28]:
from google.colab import files

# Serialize the labeled graph to disk.
file_name = "NCCN_NSCLGraph_21_23_labeled.json"
with open(file_name, "w") as labeled_out:
    labeled_out.write(json.dumps(F_markedjsld_21_23))
# In Colab, files.download(file_name) can be used to fetch the result.