## Lab 3: DDI 

### Imports

In [None]:
import os
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.parse.corenlp import CoreNLPDependencyParser
from nltk.corpus import stopwords 
import xml.etree.ElementTree as ET
from nltk.tree import Tree

### Variables

In [None]:
# Directory of XML files that main_function() iterates over and that is also
# handed to the evaluator.
inputdir = '../../data/Devel'
# Additional dataset locations; not referenced by the code visible in this notebook.
test_path = '../../data/Test-DDI'
train_path = '../../data/Train'
# File the predictions are written to (one line per entity pair) and that the
# evaluator reads back.
outputfile = 'task9.2_develGoal_1.txt'

# Client for a CoreNLP server at localhost:9000; analyze() uses it to obtain
# dependency parses. NOTE(review): presumably the server must already be
# running before any cell that parses is executed - confirm.
my_parser = CoreNLPDependencyParser(url="http://localhost:9000")

### Functions

In [None]:
def analyze(sent):
    """Parse *sent* with CoreNLP and annotate every token with character offsets.

    Args:
        sent: Raw sentence text. ``None`` and the empty string are accepted.

    Returns:
        ``None`` when *sent* is empty or ``None``. Otherwise the ``nodes``
        mapping of the dependency graph, where every node from address 1
        onwards is reduced to the keys ``address``, ``head``, ``lemma``,
        ``rel``, ``word`` and ``tag`` and gains two extra keys:

        * ``start`` - index in *sent* of the token's first character
        * ``end``   - index in *sent* of the token's last character (inclusive)

        A token whose text cannot be located in *sent* gets ``start`` and
        ``end`` set to ``-1``. Node 0 is left untouched and has no offsets.
    """
    # `not sent` also covers None, which len() would reject with a TypeError.
    if not sent:
        return None

    mytree, = my_parser.raw_parse(sent)
    tree = mytree.nodes

    kept_keys = ('address', 'head', 'lemma', 'rel', 'word', 'tag')
    cursor = 0  # position in `sent` from which the next token is searched

    for k in range(1, len(tree)):
        node = tree[k]

        # Drop every attribute we do not need downstream.
        for key in list(node):
            if key not in kept_keys:
                del node[key]

        word = node['word']
        found = sent.find(word, cursor)
        if found < 0:
            # The token text does not occur verbatim after the cursor (e.g. the
            # parser altered its surface form). Mark it as unaligned and keep
            # the cursor where it is: letting it fall back to -1 would make
            # all following tokens be searched from the start of the sentence
            # and receive wrong offsets.
            node['start'] = -1
            node['end'] = -1
            continue

        node['start'] = found
        cursor = found + len(word)
        node['end'] = cursor - 1
    return tree

def check_interaction(analysis, entities, e1, e2):
    """Decide with lemma-based rules whether two entities interact.

    Both entities must each correspond to exactly one token of *analysis*
    (same ``start`` and ``end`` offsets). The lemmas of the tokens that start
    strictly after the end of the first entity and strictly before the start
    of the second one are then compared against a list of trigger lemmas per
    interaction type.

    Args:
        analysis: Mapping of node address -> node dict as produced by
            ``analyze``; nodes without a ``start`` key are ignored. ``None``
            or an empty mapping yields "no interaction".
        entities: Mapping of entity id -> sequence whose first two items are
            the start and end character offsets (compared as strings).
        e1: Id of the first entity of the pair.
        e2: Id of the second entity of the pair.

    Returns:
        ``(1, type)`` with ``type`` in ``"effect"``, ``"mechanism"``,
        ``"int"`` or ``"advise"`` when a trigger lemma is found, otherwise
        ``(0, "null")``.
    """
    # analyze() returns None for empty sentences; treat that as "no interaction"
    # instead of failing on analysis.keys().
    if not analysis:
        return (0, "null")

    # Trigger lemmas per interaction type, in priority order (first hit wins).
    triggers = (
        ("effect", {'administer', 'potentiate', 'prevent'}),
        ("mechanism", {'reduce', 'increase', 'decrease'}),
        ("int", {'interact', 'interaction'}),
        ("advise", set()),  # no rules defined for this type yet
    )

    span1 = (str(entities[e1][0]), str(entities[e1][1]))
    span2 = (str(entities[e2][0]), str(entities[e2][1]))

    # Locate the tokens whose offsets coincide exactly with each entity.
    entity1 = None
    entity2 = None
    for node in analysis.values():
        if 'start' not in node:
            continue
        span = (str(node['start']), str(node['end']))
        if span == span1:
            entity1 = node
        elif span == span2:
            entity2 = node

    if entity1 is None or entity2 is None:
        return (0, "null")

    # Lemmas of the tokens located between the two entities.
    between = {
        node['lemma']
        for node in analysis.values()
        if 'start' in node
        and entity1['end'] < node['start'] < entity2['start']
    }

    for label, lemmas in triggers:
        if lemmas & between:
            return (1, label)
    return (0, "null")
    

def evaluate(inputdir, outputfile):
    """Run the Java DDI evaluator on a predictions file.

    Launches ``java -jar ../../eval/evaluateDDI.jar <inputdir> <outputfile>``
    and lets the evaluator write its report to the inherited standard output.

    Args:
        inputdir: Directory passed to the evaluator as first argument.
        outputfile: Predictions file passed to the evaluator as second argument.

    Returns:
        None. A failing evaluator, or a missing ``java`` executable, does not
        raise; the latter is reported on standard error.
    """
    import subprocess
    import sys

    # Pass the arguments as a list (no shell) so that paths containing spaces
    # or shell metacharacters reach the evaluator unchanged.
    command = [
        "java", "-jar", "../../eval/evaluateDDI.jar",
        str(inputdir), str(outputfile),
    ]
    try:
        subprocess.run(command, check=False)
    except OSError as err:
        # Keep the best-effort behaviour of the former os.system() call.
        print("Could not launch the evaluator: {}".format(err), file=sys.stderr)
    return

def main_function(inputdir):
    """Predict interactions for every entity pair in *inputdir* and evaluate them.

    Every file in *inputdir* is parsed as XML. For each ``sentence`` element
    the text is analysed once, and for each of its ``pair`` elements one line

        ``<sentence id>|<e1 id>|<e2 id>|<0 or 1>|<interaction type>``

    is written to the module-level ``outputfile``. Once all files have been
    processed the evaluator is run on the result.

    Args:
        inputdir: Directory containing the XML files to process.
    """
    # The `with` block guarantees the predictions are flushed and the file is
    # closed before the evaluator reads it.
    with open(outputfile, "w") as outf:
        # Sorted so the order of the output lines is reproducible across runs.
        for filename in sorted(os.listdir(inputdir)):
            # Parse the XML file, obtaining an element tree.
            fullname = os.path.join(inputdir, filename)
            root = ET.parse(fullname).getroot()

            for sentence in root.findall('sentence'):
                sent_id = sentence.get('id')
                sent_text = sentence.get('text')

                # Map entity id -> character offsets, split on '-'.
                entities = {}
                for ent in sentence.findall('entity'):
                    entities[ent.get('id')] = ent.get('charOffset').split('-')

                # Tokenize, tag and parse the sentence.
                analysis = analyze(sent_text)

                # Decide for each pair whether it is a DDI and of which type.
                for pair in sentence.findall('pair'):
                    id_e1 = pair.get('e1')
                    id_e2 = pair.get('e2')
                    if analysis is None:
                        # Nothing could be parsed: predict "no interaction".
                        is_ddi, ddi_type = 0, "null"
                    else:
                        is_ddi, ddi_type = check_interaction(
                            analysis, entities, id_e1, id_e2)
                    fields = (sent_id, id_e1, id_e2, is_ddi, ddi_type)
                    outf.write('|'.join(str(field) for field in fields))
                    outf.write("\n")

    # Get the performance score.
    evaluate(inputdir, outputfile)

In [None]:
# Run the whole pipeline (prediction + evaluation) on the directory configured
# in `inputdir`.
main_function(inputdir)