# Rule-based Relation Extraction

This notebook implements a rule-based method that performs a heuristic search for subject-attribute entity pairs in the NER result.

In [1]:
import pandas as pd
import ast
from nltk import pos_tag
from nltk.chunk import conlltags2tree
from nltk.tree import Tree

## Utility functions

In [2]:
# BIO tags
def tag(ner_result):
    """Turn raw NER output into a list of ``(word, tag)`` pairs.

    Entries whose tag contains ``'SEP'`` or ``'CLS'`` are left out
    (BERT-large may return a ``'CLS'`` tag).

    Args:
        ner_result: Iterable of mappings that each provide a ``'word'``
            and a ``'tag'`` entry.

    Returns:
        A list of ``(word, tag)`` tuples, in input order.
    """
    special_markers = ('SEP', 'CLS')
    return [
        (entry['word'], entry['tag'])
        for entry in ner_result
        if not any(marker in entry['tag'] for marker in special_markers)
    ]

In [3]:
# Create tree
def stanford_tree(bio_tagged):
    """Build an NLTK chunk tree from ``(token, entity_tag)`` pairs.

    The tokens are POS-tagged with ``nltk.pos_tag`` and the resulting
    ``(token, pos, entity_tag)`` triples are handed to ``conlltags2tree``.

    Args:
        bio_tagged: Sized sequence of ``(token, entity_tag)`` pairs, e.g.
            the output of ``tag``.

    Returns:
        The tree produced by ``conlltags2tree``, or ``None`` when
        ``bio_tagged`` is empty.
    """
    if len(bio_tagged) == 0:
        return None

    tokens, ne_tags = zip(*bio_tagged)
    # Only the POS column of the tagger output is needed; the tokens
    # themselves are taken from the input.
    pos_tags = (pos for _, pos in pos_tag(tokens))
    return conlltags2tree(list(zip(tokens, pos_tags, ne_tags)))

In [4]:
# Parse named entities from tree
def structure_ne(ne_tree):
    """Collect the named entities found at the top level of a chunk tree.

    Args:
        ne_tree: Tree as returned by ``stanford_tree``, or ``None``.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, one per direct
        child that is a ``Tree``; the text is the child's leaf tokens
        joined by single spaces. Returns ``None`` when ``ne_tree`` is
        ``None``.
    """
    if ne_tree is None:
        return None

    return [
        (" ".join(token for token, _ in chunk.leaves()), chunk.label())
        for chunk in ne_tree
        # Exact type match (not isinstance): plain leaves are skipped, and
        # so is anything that is merely a subclass of Tree.
        if type(chunk) is Tree
    ]

In [5]:
def find_index(mentions, s):
    """Locate each mention in ``s``, scanning strictly left to right.

    Mentions are searched in the given order. Each search starts where the
    previous match ended, so repeated mentions map to successive
    occurrences instead of all pointing at the first one.

    Args:
        mentions: Iterable of strings to look for, in the order they are
            expected to appear in ``s``.
        s: The text to search.

    Returns:
        A list of ``(mention, start, end)`` tuples with ``s[start:end] ==
        mention``. Mentions that are empty, or that do not occur at or
        after the current search position, are skipped, so the result may
        be shorter than ``mentions``.
    """
    cursor = 0
    indexes = []
    for mention in mentions:
        if not mention:
            # Nothing to locate; an empty mention used to raise IndexError.
            continue
        start = s.find(mention, cursor)
        if start == -1:
            continue
        end = start + len(mention)
        indexes.append((mention, start, end))
        cursor = end
    return indexes

In [6]:
# Apply relation extraction on the NER result
# Entity labels that a *_bound entity can constrain as a value.
_VALUE_SUBJECT_LABELS = ("clinical_variable", "bmi", "age")
# Entity labels that a *_bound entity can constrain as a temporal window.
_TEMPORAL_SUBJECT_LABELS = (
    "allergy_name", "cancer", "chronic_disease", "pregnancy", "treatment",
)


def _first_with_label(entities, labels):
    """Return the first ``(mention, label)`` whose label is in ``labels``, else None."""
    return next((entity for entity in entities if entity[1] in labels), None)


def _neighbor_relations(entities):
    """Pair every ``*bound*`` entity with its nearest compatible subject.

    For each entity whose label contains ``'bound'`` the first rule that
    matches wins:

    1. nearest value-type entity on the left  -> ``('hasValue', subject, bound)``
    2. nearest temporal-type entity on the left -> ``('hasTemp', subject, bound)``
    3. nearest value-type entity on the right -> ``('hasValue', bound, subject)``
    4. nearest temporal-type entity on the right -> ``('hasTemp', bound, subject)``

    Args:
        entities: List of ``(mention, label)`` tuples in textual order.

    Returns:
        List of ``(relation_type, entity1, entity2)`` tuples.
    """
    relations = []
    for position, entity in enumerate(entities):
        if 'bound' not in entity[1]:
            continue

        # Nearest-first view of everything to the left of the bound.
        before = entities[:position][::-1]
        subject = _first_with_label(before, _VALUE_SUBJECT_LABELS)
        if subject is not None:
            relations.append(('hasValue', subject, entity))
            continue
        subject = _first_with_label(before, _TEMPORAL_SUBJECT_LABELS)
        if subject is not None:
            relations.append(('hasTemp', subject, entity))
            continue

        # Nothing usable on the left: fall back to the right-hand side.
        after = entities[position + 1:]
        subject = _first_with_label(after, _VALUE_SUBJECT_LABELS)
        if subject is not None:
            relations.append(('hasValue', entity, subject))
            continue
        subject = _first_with_label(after, _TEMPORAL_SUBJECT_LABELS)
        if subject is not None:
            relations.append(('hasTemp', entity, subject))
    return relations


# Apply relation extraction on the NER result
def call_RE_neighborpairs(df_ner):
    """Extract neighbor-pair relations from an NER result table.

    Args:
        df_ner: DataFrame with the columns ``'#nct_id'``,
            ``'eligibility_type'``, ``'criterion'`` and ``'NER'``. Each
            ``'NER'`` cell is either the token list accepted by ``tag`` or
            its string representation (parsed with ``ast.literal_eval``).
            The caller's frame is not modified.

    Returns:
        DataFrame with the columns ``'#nct_id'``, ``'eligibility_type'``,
        ``'criterion'``, ``'RelationType'``, ``'Entity1'`` and
        ``'Entity2'``: one row per extracted relation. A criterion without
        any relation keeps a single row whose three relation columns are
        missing (``None``).

    Side effects:
        Prints the number of de-duplicated criteria and the number of
        relation lists (the two numbers should match).
    """
    key_columns = ['#nct_id', 'eligibility_type', 'criterion']
    relation_columns = ['RelationType', 'Entity1', 'Entity2']

    # Copy explicitly so that adding the 'Relation' column below writes to
    # an independent frame rather than to a slice of the caller's data.
    df_ner = df_ner[key_columns + ['NER']].drop_duplicates().copy()

    result_re = []
    for ner in df_ner['NER']:
        if isinstance(ner, str):
            ner = ast.literal_eval(ner)
        words = tag(ner)
        # structure_ne returns None when there are no tokens; treat that as
        # "no entities". The (mention, label) pairs are used as-is: character
        # offsets were computed here before but never consumed.
        entities = structure_ne(stanford_tree(words)) or []
        result_re.append(_neighbor_relations(entities))

    print(len(df_ner))
    print(len(result_re))
    df_ner['Relation'] = result_re

    # Reformat the relation statements: one row per relation.
    df_re = df_ner[key_columns + ['Relation']].explode('Relation')

    # Split each relation tuple into its three columns. Rows that came from
    # an empty relation list hold NaN after explode and become None here;
    # building the columns from plain lists also works for an empty frame
    # and when the first row has no relation.
    relation_cells = df_re['Relation'].tolist()
    for position, column in enumerate(relation_columns):
        df_re[column] = [
            cell[position] if isinstance(cell, tuple) else None
            for cell in relation_cells
        ]

    return df_re[key_columns + relation_columns]

## Do relation extraction using sample data

In [7]:
# Load the sample NER data. call_RE_neighborpairs reads the '#nct_id',
# 'eligibility_type', 'criterion' and 'NER' columns of this frame.
df_ner = pd.read_excel('data_ner/sample_trial_NER.xlsx')
# Bare expression: shows the frame as the notebook cell's output.
df_ner

Unnamed: 0,#nct_id,eligibility_type,criterion,NER,Tags,Entity,Type
0,NCT00097734,inclusion,- At least three of the following signs or sym...,"[{'word': '-', 'tag': 'O', 'confidence': 0.999...","('three', 'lower_bound')",three,lower_bound
1,NCT00097734,inclusion,- At least three of the following signs or sym...,"[{'word': '-', 'tag': 'O', 'confidence': 0.999...","('sigmoid diverticulitis', 'chronic_disease')",sigmoid diverticulitis,chronic_disease
2,NCT00097734,inclusion,- At least three of the following signs or sym...,"[{'word': '-', 'tag': 'O', 'confidence': 0.999...","('Fever', 'chronic_disease')",Fever,chronic_disease
3,NCT00097734,inclusion,- At least three of the following signs or sym...,"[{'word': '-', 'tag': 'O', 'confidence': 0.999...","('body temperature', 'clinical_variable')",body temperature,clinical_variable
4,NCT00097734,inclusion,- At least three of the following signs or sym...,"[{'word': '-', 'tag': 'O', 'confidence': 0.999...","('38°C', 'lower_bound')",38°C,lower_bound
5,NCT00097734,inclusion,- At least three of the following signs or sym...,"[{'word': '-', 'tag': 'O', 'confidence': 0.999...","('Leukocytosis', 'chronic_disease')",Leukocytosis,chronic_disease
6,NCT00097734,inclusion,- At least three of the following signs or sym...,"[{'word': '-', 'tag': 'O', 'confidence': 0.999...","('leukocytes', 'clinical_variable')",leukocytes,clinical_variable
7,NCT00097734,inclusion,- At least three of the following signs or sym...,"[{'word': '-', 'tag': 'O', 'confidence': 0.999...","('10,000/µl', 'lower_bound')","10,000/µl",lower_bound
8,NCT00097734,inclusion,- At least three of the following signs or sym...,"[{'word': '-', 'tag': 'O', 'confidence': 0.999...","('differential blood count', 'clinical_variable')",differential blood count,clinical_variable
9,NCT00097734,inclusion,- At least three of the following signs or sym...,"[{'word': '-', 'tag': 'O', 'confidence': 0.999...","('1 %', 'lower_bound')",1 %,lower_bound


In [8]:
# Input is the NER result; the output has one row per extracted relation.
# The call also prints two row counts.
df_relations = call_RE_neighborpairs(df_ner)

13
13


In [9]:
# Bare expression: shows the extracted relations as the cell's output.
df_relations

Unnamed: 0,#nct_id,eligibility_type,criterion,RelationType,Entity1,Entity2
0,NCT00097734,inclusion,- At least three of the following signs or sym...,hasValue,"(three, lower_bound)","(body temperature, clinical_variable)"
0,NCT00097734,inclusion,- At least three of the following signs or sym...,hasValue,"(body temperature, clinical_variable)","(38°C, lower_bound)"
0,NCT00097734,inclusion,- At least three of the following signs or sym...,hasValue,"(leukocytes, clinical_variable)","(10,000/µl, lower_bound)"
0,NCT00097734,inclusion,- At least three of the following signs or sym...,hasValue,"(differential blood count, clinical_variable)","(1 %, lower_bound)"
0,NCT00097734,inclusion,- At least three of the following signs or sym...,hasValue,"(Elevated CRP, clinical_variable)","(20 mg/l, lower_bound)"
12,NCT00097734,inclusion,- CT evidence of wall thickening in the sigmoi...,,,
13,NCT00097734,inclusion,- Decision in favor of conservative therapy on...,,,
14,NCT00097734,inclusion,- Evidence of sigmoid diverticulitis by contra...,,,
16,NCT00097734,exclusion,- Antibiotic therapy in the two weeks prior to...,hasTemp,"(Antibiotic therapy, treatment)","(two weeks prior, upper_bound)"
18,NCT00097734,exclusion,- Complications of sigmoid diverticulitis lead...,,,


In [10]:
# Save the extracted relations. The `encoding` keyword is deliberately not
# passed: it was deprecated in pandas 1.5 and removed in pandas 2.0, where
# passing it raises TypeError.
df_relations.to_excel("data_re/sample_trial_relations_Rule-prediction.xlsx", index=False)