## Lab 4: DDI using ML

### Imports

In [1]:
import os
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.parse.corenlp import CoreNLPDependencyParser
from nltk.corpus import stopwords 
import xml.etree.ElementTree as ET

### Variables

In [2]:
# Dataset directories, relative to the notebook's working directory.
devel_path = '../../data/Devel'
test_path = '../../data/Test-DDI'
train_path = '../../data/Train'
# File the final DDI predictions are written to before being evaluated.
output_file = 'output/task9.2_develGoal_1.txt'

# Absolute locations handed to the external megam command line.
# The backslashes are literal: they escape spaces and parentheses for the
# shell that eventually runs the command.  Raw strings keep the exact same
# values while avoiding Python's "invalid escape sequence" warning for
# sequences such as '\ ' and '\('.
# NOTE(review): these are machine-specific paths; consider deriving them from
# a single configurable base directory.
model_path = r'/mnt/d/Users/Albert/OneDrive/Educació/Master\ in\ Artificial\ Intelligence\ \(UPC\)/Semester\ 2/AHLT/Lab/AHLT-MAI/source/ddi/model/'
output_path = r'/mnt/d/Users/Albert/OneDrive/Educació/Master\ in\ Artificial\ Intelligence\ \(UPC\)/Semester\ 2/AHLT/Lab/AHLT-MAI/source/ddi/output/'
features_path = r'/mnt/d/Users/Albert/OneDrive/Educació/Master\ in\ Artificial\ Intelligence\ \(UPC\)/Semester\ 2/AHLT/Lab/AHLT-MAI/source/ddi/features/'

# Dependency-parser client used by analyze(); it talks to the CoreNLP endpoint
# at the URL below, so a server is assumed to be listening on localhost:9000
# before the feature-extraction cells are run.
my_parser = CoreNLPDependencyParser(url="http://localhost:9000")

### Feature extractor

In [3]:
def analyze(sent):
    """Parse a sentence and return its dependency nodes with character offsets.

    The sentence is sent to the module-level ``my_parser``; the node mapping of
    the single returned parse is trimmed in place to the keys ``address``,
    ``head``, ``lemma``, ``rel``, ``word`` and ``tag``.  Every node except the
    one at index 0 additionally receives ``start`` and ``end``: the inclusive
    character offsets of its word inside ``sent``, located by scanning the
    sentence left to right.

    A token whose text cannot be found verbatim in ``sent`` (for instance a
    bracket the tokenizer rewrote as ``-LRB-``) gets no ``start``/``end`` keys
    and leaves the scan position untouched.  Previously such a token received
    a negative ``start`` and rewound the scan, which could assign later
    repeated words the offsets of an earlier occurrence.

    Args:
        sent: Raw sentence text.

    Returns:
        The node mapping of the parse, or ``None`` when ``sent`` is empty or
        ``None``.
    """
    if not sent:
        return None

    mytree, = my_parser.raw_parse(sent)
    tree = mytree.nodes
    cursor = 0  # index in `sent` right after the last token that was located

    # Drop every node attribute the feature extractor does not use.
    wanted = ('address', 'head', 'lemma', 'rel', 'word', 'tag')
    for k in range(len(tree)):
        node = tree[k]
        for key in list(node):
            if key not in wanted:
                del node[key]

        word = node.get('word')
        if k == 0 or not word:
            # Index 0 is skipped exactly as before; a node without text has
            # nothing to align against the sentence.
            continue

        found_at = sent.find(word, cursor)
        if found_at < 0:
            # Token text differs from the raw sentence: leave it without
            # offsets instead of recording -1 and corrupting later searches.
            continue

        node['start'] = found_at
        cursor = found_at + len(word)
        node['end'] = cursor - 1

    return tree

In [4]:
def get_entity_nodes(tree, entities, e1, e2):
    """Collect the parse nodes that line up with each entity of a pair.

    A node is attributed to an entity when the node's ``start`` equals the
    entity's first offset or the node's ``end`` equals the entity's second
    offset (both compared as strings).  Nodes without a ``start`` key are
    ignored.  A node that matches ``e1`` is never also attributed to ``e2``.

    Args:
        tree: Mapping of node index to node dict.
        entities: Mapping of entity id to its offset sequence.
        e1: Id of the first entity.
        e2: Id of the second entity.

    Returns:
        A tuple ``(nodes_of_e1, nodes_of_e2)`` of lists, in tree order.
    """
    def lines_up(node, offsets):
        # Same comparison order as before: start first, then end.
        return (str(node['start']) == str(offsets[0])
                or str(node['end']) == str(offsets[1]))

    entity1, entity2 = [], []
    for node in tree.values():
        if 'start' not in node.keys():
            continue
        if lines_up(node, entities[e1]):
            entity1.append(node)
        elif lines_up(node, entities[e2]):
            entity2.append(node)
    return entity1, entity2


def extract_features(tree, entities, e1, e2):
    """Build the feature strings describing one entity pair.

    For every node attributed to an entity (see ``get_entity_nodes``) the
    function emits the lemma, word, tag and address of the node's head, unless
    the head index is 0, followed by the node's own dependency relation.  All
    features of the first entity come before those of the second.

    Args:
        tree: Mapping of node index to node dict.
        entities: Mapping of entity id to its offset sequence.
        e1: Id of the first entity.
        e2: Id of the second entity.

    Returns:
        A list of ``name=value`` feature strings.
    """
    def describe(nodes, head_templates, rel_template):
        # Head attributes first (skipped when the head index is 0), then the
        # relation of the node itself.
        described = []
        for position, node in enumerate(nodes):
            if node['head'] != 0:
                head = tree[node['head']]
                described.extend(template % (str(position), head[attribute])
                                 for template, attribute in head_templates)
            described.append(rel_template % (str(position), node['rel']))
        return described

    entity1, entity2 = get_entity_nodes(tree, entities, e1, e2)

    features = describe(
        entity1,
        (('h1_lemma_%s=%s', 'lemma'),
         ('h1_word_%s=%s', 'word'),
         ('h1_tag_%s=%s', 'tag'),
         ('h1_address_%s=%s', 'address')),
        'e1_rel_%s=%s',
    )
    features += describe(
        entity2,
        (('h2_lemma_%s=%s', 'lemma'),
         ('h2_word_%s=%s', 'word'),
         ('h2_tag_%s=%s', 'tag'),
         ('h2_address_%s=%s', 'address')),
        'e2_rel_%s=%s',
    )
    return features

In [5]:
def output_features(sent_id, e1, e2, gold_class, features, extra_info_outf, outf):
    """Write one tab-separated feature line to each of the two output streams.

    ``outf`` receives the gold class followed by the features.
    ``extra_info_outf`` receives the same line prefixed with the sentence id
    and the two entity ids.

    Args:
        sent_id: Sentence identifier.
        e1: Id of the first entity.
        e2: Id of the second entity.
        gold_class: Label of the pair.
        features: Iterable of feature strings.
        extra_info_outf: Writable text stream for the annotated line.
        outf: Writable text stream for the plain line.
    """
    features_str = '\t'.join(features)

    # Plain line first, then the annotated one.
    plain_line = '\t'.join((gold_class, features_str))
    outf.write(plain_line + "\n")

    annotated_line = '\t'.join((sent_id, e1, e2, gold_class, features_str))
    extra_info_outf.write(annotated_line + "\n")

In [6]:
def main_extract_features(input_dir, extra_info_output_file, output_file):
    """Extract pair features for every XML file in a directory.

    Each file in ``input_dir`` is parsed as XML.  For every ``sentence``
    element the entity offsets are collected, the sentence text is analysed
    once with ``analyze``, and one feature line per ``pair`` element is written
    through ``output_features``.  A pair without a ``type`` attribute is
    labelled ``'null'``.

    Both output files are opened with a context manager, so they are closed
    even when parsing or feature extraction raises part-way through the run.

    Args:
        input_dir: Directory containing the XML documents.
        extra_info_output_file: Path of the file that receives the feature
            lines prefixed with sentence and entity ids.
        output_file: Path of the file that receives the plain feature lines.
    """
    with open(output_file, "w") as outf, \
            open(extra_info_output_file, "w") as extra_info_outf:
        # process each file in directory
        for filename in os.listdir(input_dir):
            # parse XML file, obtaining a DOM tree
            fullname = os.path.join(input_dir, filename)
            root = ET.parse(fullname).getroot()

            # process each sentence in the file
            for sentence in root.findall('sentence'):
                sent_id = sentence.get('id')

                # load sentence entities into a dictionary
                # NOTE(review): an offset such as "a-b;c-d" splits into three
                # pieces here, so the second piece would be "b;c" -- confirm
                # whether multi-span entities need dedicated handling.
                entities = {}
                for ent in sentence.findall('entity'):
                    entities[ent.get('id')] = ent.get('charOffset').split('-')

                # One parse per sentence, shared by all of its pairs.  Kept in
                # its own name so it no longer shadows the XML tree.
                dep_tree = analyze(sentence.get('text'))

                # for each pair in the sentence, emit its label and features
                for pair in sentence.findall('pair'):
                    e1 = pair.get('e1')
                    e2 = pair.get('e2')

                    gold_class = pair.get('type')
                    if gold_class is None:
                        gold_class = 'null'

                    features = extract_features(dep_tree, entities, e1, e2)
                    output_features(sent_id, e1, e2, gold_class, features,
                                    extra_info_outf, outf)
    return

In [7]:
# Extract features for each split in turn: train, then devel, then test.
# Each entry is (input directory, annotated feature file, plain feature file).
extraction_jobs = (
    (train_path, 'features/info_train_features', 'features/train_features'),
    (devel_path, 'features/info_devel_features', 'features/devel_features'),
    (test_path, 'features/info_test_features', 'features/test_features'),
)
for split_dir, info_file, features_file in extraction_jobs:
    main_extract_features(split_dir, info_file, features_file)

## MaxEntropy Classifier
### Learner

In [8]:
def train(megam, features_file, out_train_model):
    """Print and run the shell command that trains a multiclass megam model.

    The command invokes ``megam`` on ``features_file`` through ``ubuntu run``
    and redirects its output to ``out_train_model``.  The exit status of the
    command is not inspected.

    Args:
        megam: Path of the megam executable as seen by the launched shell.
        features_file: Path of the training feature file.
        out_train_model: Path the model is redirected to.
    """
    # Assemble the command once so the echoed and executed text are the same.
    command = "".join([
        "ubuntu run \"", megam,
        " -quiet -nc -nobias multiclass ", features_file,
        " > ", out_train_model, "\"",
    ])
    print(command)
    os.system(command)
    return

In [9]:
# Train on the training-split features; the model is written to model.dat.
train(
    megam=model_path + 'megam-64.opt',
    features_file=features_path + 'train_features',
    out_train_model=model_path + 'model.dat',
)

ubuntu run "/mnt/d/Users/Albert/OneDrive/Educació/Master\ in\ Artificial\ Intelligence\ \(UPC\)/Semester\ 2/AHLT/Lab/AHLT-MAI/source/ddi/model/megam-64.opt -quiet -nc -nobias multiclass /mnt/d/Users/Albert/OneDrive/Educació/Master\ in\ Artificial\ Intelligence\ \(UPC\)/Semester\ 2/AHLT/Lab/AHLT-MAI/source/ddi/features/train_features > /mnt/d/Users/Albert/OneDrive/Educació/Master\ in\ Artificial\ Intelligence\ \(UPC\)/Semester\ 2/AHLT/Lab/AHLT-MAI/source/ddi/model/model.dat"


### Classifier

In [10]:
def classify(megam, features_file, train_model, prediction_output):
    """Print and run the shell command that predicts labels with megam.

    The command invokes ``megam`` in ``-predict`` mode with ``train_model`` on
    ``features_file`` through ``ubuntu run`` and redirects its output to
    ``prediction_output``.  The exit status of the command is not inspected.

    Args:
        megam: Path of the megam executable as seen by the launched shell.
        features_file: Path of the feature file to classify.
        train_model: Path of the previously trained model.
        prediction_output: Path the predictions are redirected to.
    """
    # Assemble the command once so the echoed and executed text are the same.
    command = "".join([
        "ubuntu run \"", megam,
        " -quiet -nc -nobias -predict ", train_model,
        " multiclass ", features_file,
        " > ", prediction_output, "\"",
    ])
    print(command)
    os.system(command)

In [11]:
# Devel prediction: label the devel-split features with the trained model.
classify(
    megam=model_path + 'megam-64.opt',
    features_file=features_path + 'devel_features',
    train_model=model_path + 'model.dat',
    prediction_output=output_path + 'devel_prediction',
)

ubuntu run "/mnt/d/Users/Albert/OneDrive/Educació/Master\ in\ Artificial\ Intelligence\ \(UPC\)/Semester\ 2/AHLT/Lab/AHLT-MAI/source/ddi/model/megam-64.opt -quiet -nc -nobias -predict /mnt/d/Users/Albert/OneDrive/Educació/Master\ in\ Artificial\ Intelligence\ \(UPC\)/Semester\ 2/AHLT/Lab/AHLT-MAI/source/ddi/model/model.dat multiclass /mnt/d/Users/Albert/OneDrive/Educació/Master\ in\ Artificial\ Intelligence\ \(UPC\)/Semester\ 2/AHLT/Lab/AHLT-MAI/source/ddi/features/devel_features > /mnt/d/Users/Albert/OneDrive/Educació/Master\ in\ Artificial\ Intelligence\ \(UPC\)/Semester\ 2/AHLT/Lab/AHLT-MAI/source/ddi/output/devel_prediction"


In [16]:
# Test prediction: label the test-split features with the trained model.
classify(
    megam=model_path + 'megam-64.opt',
    features_file=features_path + 'test_features',
    train_model=model_path + 'model.dat',
    prediction_output=output_path + 'test_prediction',
)

ubuntu run "/mnt/d/Users/Albert/OneDrive/Educació/Master\ in\ Artificial\ Intelligence\ \(UPC\)/Semester\ 2/AHLT/Lab/AHLT-MAI/source/ddi/model/megam-64.opt -quiet -nc -nobias -predict /mnt/d/Users/Albert/OneDrive/Educació/Master\ in\ Artificial\ Intelligence\ \(UPC\)/Semester\ 2/AHLT/Lab/AHLT-MAI/source/ddi/model/model.dat multiclass /mnt/d/Users/Albert/OneDrive/Educació/Master\ in\ Artificial\ Intelligence\ \(UPC\)/Semester\ 2/AHLT/Lab/AHLT-MAI/source/ddi/features/test_features > /mnt/d/Users/Albert/OneDrive/Educació/Master\ in\ Artificial\ Intelligence\ \(UPC\)/Semester\ 2/AHLT/Lab/AHLT-MAI/source/ddi/output/test_prediction"


## Produce output and evaluate

In [17]:
def output_ddi(sent_id, id_e1, id_e2, predicted_class, outf):
    """Write one pipe-separated prediction line to ``outf``.

    The line has the form ``sent_id|id_e1|id_e2|flag|predicted_class`` where
    ``flag`` is ``0`` when the predicted class is ``'null'`` and ``1``
    otherwise.

    Args:
        sent_id: Sentence identifier.
        id_e1: Id of the first entity.
        id_e2: Id of the second entity.
        predicted_class: Predicted label of the pair.
        outf: Writable text stream.
    """
    interaction_flag = '|0|' if predicted_class == 'null' else '|1|'
    record = sent_id + '|' + id_e1 + '|' + id_e2 + interaction_flag + predicted_class
    outf.write(record)
    outf.write("\n")

In [18]:
def evaluate(input_dir, output_file):
    """Run the evaluateDDI.jar scorer on a prediction file.

    Both arguments are converted with ``str`` and passed, in this order, to
    ``java -jar ../../eval/evaluateDDI.jar``.  The exit status of the command
    is not inspected.

    Args:
        input_dir: First argument handed to the scorer.
        output_file: Second argument handed to the scorer.
    """
    scorer_args = " ".join(map(str, (input_dir, output_file)))
    os.system("java -jar ../../eval/evaluateDDI.jar " + scorer_args)
    return

In [19]:
def main_function(input_dir, features_info, prediction_file, output_file):
    """Merge pair ids with predicted labels, write the result and score it.

    ``features_info`` and ``prediction_file`` are read in lock-step.  From
    each line of the first the sentence id and the two entity ids (first three
    tab-separated fields) are taken; from the matching line of the second the
    first whitespace-separated token is taken as the predicted class.  One
    line per pair is written to ``output_file`` via ``output_ddi`` and the
    finished file is then passed to ``evaluate``.

    The output file is opened with a context manager, so it is closed even if
    a line cannot be processed.  Lines are paired with ``zip``, so any surplus
    lines in the longer file are ignored.

    Args:
        input_dir: Directory handed to ``evaluate``.
        features_info: Path of the annotated feature file.
        prediction_file: Path of the classifier's prediction file.
        output_file: Path of the file the merged predictions are written to.
    """
    with open(output_file, "w") as outf, \
            open(features_info) as textfile1, \
            open(prediction_file) as textfile2:
        for info_line, prediction_line in zip(textfile1, textfile2):
            saved_features = info_line.split('\t')
            sent_id = saved_features[0]
            id_e1 = saved_features[1]
            id_e2 = saved_features[2]

            predicted_class = prediction_line.split()[0]

            output_ddi(sent_id, id_e1, id_e2, predicted_class, outf)

    # get performance score (only after the output file has been closed)
    evaluate(input_dir, output_file)

In [20]:
#main_function(devel_path, 'features/info_devel_features', 'output/devel_prediction', output_file)

In [22]:
# Merge the test-split predictions with their pair ids and score the result.
main_function(
    input_dir=test_path,
    features_info='features/info_test_features',
    prediction_file='output/test_prediction',
    output_file=output_file,
)