## Goal 2: DDI using ML

### Imports

In [1]:
import os
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.parse.corenlp import CoreNLPDependencyParser
from nltk.corpus import stopwords 
import xml.etree.ElementTree as ET

### Variables

In [2]:
# Corpus directories, relative to the notebook's working directory.
devel_path = '../../data/Devel'
test_path = '../../data/Test-DDI'
train_path = '../../data/Train'

# Absolute locations handed to the shell commands built in train()/classify().
# The backslashes are part of the value (they escape spaces and parentheses
# for the shell), so the literals are raw strings: in a normal literal `\ `
# and `\(` are invalid escape sequences and trigger a SyntaxWarning on recent
# Python versions. The resulting string values are unchanged.
model_path = r'/mnt/d/Users/Albert/OneDrive/Educació/Master\ in\ Artificial\ Intelligence\ \(UPC\)/Semester\ 2/AHLT/Lab/AHLT-MAI/source/ddi/model/'
output_path = r'/mnt/d/Users/Albert/OneDrive/Educació/Master\ in\ Artificial\ Intelligence\ \(UPC\)/Semester\ 2/AHLT/Lab/AHLT-MAI/source/ddi/output/'
features_path = r'/mnt/d/Users/Albert/OneDrive/Educació/Master\ in\ Artificial\ Intelligence\ \(UPC\)/Semester\ 2/AHLT/Lab/AHLT-MAI/source/ddi/features/'

# Dependency-parser client pointed at http://localhost:9000; analyze() sends
# each sentence to it through my_parser.raw_parse().
my_parser = CoreNLPDependencyParser(url="http://localhost:9000")

### Feature extractor

In [3]:
def handle_special_symbols(node):
    """Replace bracket/quote placeholder tokens in a parse node by the real character.

    ``node`` is a dict with at least a ``'word'`` key. When the word is one of
    the placeholders listed below, both ``node['word']`` and ``node['lemma']``
    are overwritten in place with the literal character; any other word leaves
    the node untouched. Returns ``None``.
    """
    replacements = (
        ('-LRB-', '('),
        ('-RRB-', ')'),
        ('-LSB-', '['),
        ('-RSB-', ']'),
        ("``", '"'),
        ("''", '"'),
    )
    for placeholder, literal in replacements:
        if node['word'] == placeholder:
            node['word'] = literal
            node['lemma'] = literal
            break
    return
    
def analyze(sent):
    """Parse ``sent`` with ``my_parser`` and return its node dict with character offsets.

    Every node is reduced to the keys ``address``, ``head``, ``lemma``,
    ``rel``, ``word`` and ``tag``; bracket/quote placeholders are restored by
    ``handle_special_symbols``. Every node except the one stored under key 0
    additionally gets ``start``/``end``: the inclusive character offsets of
    its word in ``sent``, found by scanning left to right.

    A word that cannot be located in ``sent`` from the current scan position
    gets no ``start``/``end`` keys (the feature extractors already skip nodes
    without ``start``). The scan position is left where it was, so one
    unmatched token no longer resets the search to the beginning of the
    sentence and corrupts the offsets of all following tokens.

    Returns ``None`` for an empty (or ``None``) sentence.
    """
    if not sent:
        return None

    mytree, = my_parser.raw_parse(sent)
    tree = mytree.nodes

    # Keys kept on every node; everything else the parser attached is dropped.
    info = ('address', 'head', 'lemma', 'rel', 'word', 'tag')

    cursor = 0  # position in `sent` right after the last located token
    for k in sorted(tree.keys()):
        node = tree[k]
        for key in list(node):
            if key not in info:
                del node[key]

        handle_special_symbols(node)

        if k == 0:
            # The node under key 0 never receives offsets.
            continue

        found = sent.find(node['word'], cursor)
        if found < 0:
            # str.find signals "not found" with -1; do not store it as an
            # offset and do not move the cursor backwards.
            continue

        node['start'] = found
        cursor = found + len(node['word'])
        node['end'] = cursor - 1

    return tree

In [4]:
def get_entity_nodes(tree, entities, e1, e2):
    """Collect the parse nodes that belong to entities ``e1`` and ``e2``.

    ``entities`` maps an entity id to a list of ``[start, end]`` offset pairs
    (values are converted with ``int``). Nodes of ``tree`` without a
    ``'start'`` key are ignored. For each offset pair, a node is selected when
    the pair's start or end lies inside the node's ``start``..``end`` range,
    or when the node lies completely inside the pair. A node is appended once
    per offset pair it matches, in ascending key order.

    Returns ``(entity1, entity2)``, two lists of node dicts.
    """
    def touches(node, span_start, span_end):
        first, last = node['start'], node['end']
        # Either boundary of the span falls inside the token ...
        if first <= int(span_start) <= last or first <= int(span_end) <= last:
            return True
        # ... or the token is fully contained in the span.
        low, high = int(span_start), int(span_end)
        return low <= first <= high and low <= last <= high

    starts1 = [offs[0] for offs in entities[e1]]
    starts2 = [offs[0] for offs in entities[e2]]
    ends1 = [offs[1] for offs in entities[e1]]
    ends2 = [offs[1] for offs in entities[e2]]

    entity1, entity2 = [], []
    for k in sorted(tree.keys()):
        node = tree[k]
        if 'start' not in node.keys():
            continue
        for span_start, span_end in zip(starts1, ends1):
            if touches(node, span_start, span_end):
                entity1.append(node)
        for span_start, span_end in zip(starts2, ends2):
            if touches(node, span_start, span_end):
                entity2.append(node)

    return entity1, entity2


def is_under(entity1, entity2):
    """Return True when some node of ``entity1`` has its head among the nodes of ``entity2``.

    Both arguments are lists of node dicts; ``head`` of the first list is
    compared with ``address`` of the second.
    """
    return any(
        node['head'] in [other['address'] for other in entity2]
        for node in entity1
    )

def get_entity_parent(tree, entity):
    """Return ``(parent_node, relation)`` for an entity given as a list of nodes.

    With exactly one node, the parent is that node's head and the relation is
    the node's ``rel``. Otherwise the nodes are scanned in order and every
    node whose head is not itself part of the entity overwrites the result,
    so the last such node wins. ``(None, None)`` is returned when no node
    qualifies (including an empty ``entity``).
    """
    if len(entity) == 1:
        only = entity[0]
        return tree[only['head']], only['rel']

    parent, rel = None, None
    for node in entity:
        inside = [other['address'] for other in entity]
        if node['head'] in inside:
            continue
        parent = tree[node['head']]
        rel = node['rel']
    return parent, rel
    
def same(parent1, parent2):
    """Return True when both nodes carry the same ``address``."""
    return parent1['address'] == parent2['address']
    
def parent_lemma_belongs(parent, lemma_set):
    """Return True when the ``lemma`` of ``parent`` is contained in ``lemma_set``."""
    return parent['lemma'] in lemma_set
    
def get_entity_limits(entity):
    """Return ``(start, end)``: the smallest ``start`` and largest ``end`` over the entity's nodes.

    Raises ``ValueError`` for an empty ``entity`` (``min`` of an empty sequence).
    """
    first = min(node['start'] for node in entity)
    last = max(node['end'] for node in entity)
    return first, last
    
    
def extract_features_1(tree, entities, e1, e2):
    """Build feature set 1 for the entity pair ``(e1, e2)``.

    Features, in order:
      * lemma/word/tag of each entity's parent node and the relation to it
        (``h1_*`` / ``h2_*``), using ``get_entity_parent``;
      * ``under_same`` (plus ``under_same_verb`` when the shared parent's tag
        starts with 'v'/'V');
      * ``1under2`` / ``2under1`` when one entity hangs from the other;
      * ``interaction`` when the first parent's lemma is 'interact' or
        'interaction';
      * lemmas of the tokens starting before entity 1 (``lb1``), between the
        two entities (``lib``) and after entity 2 (``la2``).

    ``get_entity_parent`` can return ``None`` as parent, in which case the
    subscripting below fails.

    Returns the list of feature strings.
    """
    entity1, entity2 = get_entity_nodes(tree, entities, e1, e2)

    parent1, rel1 = get_entity_parent(tree, entity1)
    parent2, rel2 = get_entity_parent(tree, entity2)

    features = [
        'h1_lemma=%s' % parent1['lemma'],
        'h1_word=%s' % parent1['word'],
        'h1_tag=%s' % parent1['tag'],
        'h1_rel=%s' % rel1,
        'h2_lemma=%s' % parent2['lemma'],
        'h2_word=%s' % parent2['word'],
        'h2_tag=%s' % parent2['tag'],
        'h2_rel=%s' % rel2,
    ]

    if same(parent1, parent2):
        features.append('under_same')
        if parent1['tag'][0].lower() == 'v':
            features.append('under_same_verb')

    if is_under(entity1, entity2):
        features.append('1under2')

    if is_under(entity2, entity1):
        features.append('2under1')

    if parent_lemma_belongs(parent1, ['interact', 'interaction']):
        features.append('interaction')

    start1, end1 = get_entity_limits(entity1)
    start2, end2 = get_entity_limits(entity2)

    for k in sorted(tree.keys()):
        node = tree[k]
        if 'start' not in node.keys():
            continue
        position = node['start']

        # The three tests are independent on purpose: they are not mutually
        # exclusive when the entity spans overlap or appear in reverse order.
        if position < start1:
            features.append('lb1=%s' % node['lemma'])

        if end1 < position < start2:
            features.append('lib=%s' % node['lemma'])

        if end2 < position:
            features.append('la2=%s' % node['lemma'])

    return features


In [5]:
def get_entity_parent_v2(tree, entity):
    """Return all parents of an entity as ``(parent_nodes, relations)``.

    With exactly one node, both lists hold a single item: the node's head and
    its ``rel``. Otherwise one ``(parent, rel)`` entry is produced, in order,
    for every node whose head is not itself part of the entity. The two lists
    always have the same length and are empty for an empty ``entity``.
    """
    if len(entity) == 1:
        sole = entity[0]
        return [tree[sole['head']]], [sole['rel']]

    parents, rels = [], []
    for node in entity:
        inside = [other['address'] for other in entity]
        if node['head'] in inside:
            continue
        parents.append(tree[node['head']])
        rels.append(node['rel'])
    return parents, rels
    
def same_v2(parent1, parent2):
    """Return True when the two parent lists share a node (same ``address``)."""
    return any(
        left['address'] == right['address']
        for left in parent1
        for right in parent2
    )

def same_verb_v2(parent1, parent2):
    """Return True when the parent lists share a node whose tag starts with 'v' or 'V'.

    The tag is read from the node in ``parent1`` and only for pairs whose
    addresses match.
    """
    return any(
        left['address'] == right['address'] and left['tag'][0].lower() == 'v'
        for left in parent1
        for right in parent2
    )
    
def parent_lemma_belongs_v2(parent, lemma_set):
    """Return True when any node in the list ``parent`` has its ``lemma`` in ``lemma_set``."""
    return any(node['lemma'] in lemma_set for node in parent)
    
def extract_features_2(tree, entities, e1, e2):
    """Build feature set 2 for the entity pair ``(e1, e2)``.

    Same features as ``extract_features_1``, except that every parent found by
    ``get_entity_parent_v2`` contributes its own indexed group
    (``h1_lemma_<i>``, ``h1_word_<i>``, ``h1_tag_<i>``, ``h1_rel_<i>`` and the
    ``h2_*`` counterparts), and the shared-parent / shared-verb / interaction
    tests consider all parents.

    Returns the list of feature strings.
    """
    entity1, entity2 = get_entity_nodes(tree, entities, e1, e2)

    features = []

    parent1, rel1 = get_entity_parent_v2(tree, entity1)
    parent2, rel2 = get_entity_parent_v2(tree, entity2)

    for i, (head, rel) in enumerate(zip(parent1, rel1)):
        features.extend(['h1_lemma_%d=%s' % (i, head['lemma']),
                         'h1_word_%d=%s' % (i, head['word']),
                         'h1_tag_%d=%s' % (i, head['tag']),
                         'h1_rel_%d=%s' % (i, rel)])
    for i, (head, rel) in enumerate(zip(parent2, rel2)):
        features.extend(['h2_lemma_%d=%s' % (i, head['lemma']),
                         'h2_word_%d=%s' % (i, head['word']),
                         'h2_tag_%d=%s' % (i, head['tag']),
                         'h2_rel_%d=%s' % (i, rel)])

    if same_v2(parent1, parent2):
        features.append('under_same')

    if same_verb_v2(parent1, parent2):
        features.append('under_same_verb')

    if is_under(entity1, entity2):
        features.append('1under2')

    if is_under(entity2, entity1):
        features.append('2under1')

    if parent_lemma_belongs_v2(parent1, ['interact', 'interaction']):
        features.append('interaction')

    start1, end1 = get_entity_limits(entity1)
    start2, end2 = get_entity_limits(entity2)

    for k in sorted(tree.keys()):
        node = tree[k]
        if 'start' not in node.keys():
            continue
        position = node['start']

        # Independent tests: they can fire together when the entity spans
        # overlap or appear in reverse order.
        if position < start1:
            features.append('lb1=%s' % node['lemma'])

        if end1 < position < start2:
            features.append('lib=%s' % node['lemma'])

        if end2 < position:
            features.append('la2=%s' % node['lemma'])

    return features

In [6]:
def is_under_v2(entity1, entity2):
    """Like ``is_under``, but also report the dependency relation.

    Returns ``(True, rel)`` for the first node of ``entity1`` whose head is one
    of the nodes of ``entity2`` (``rel`` is that node's ``rel``), or
    ``(False, None)`` when there is none.
    """
    for node in entity1:
        addresses = [other['address'] for other in entity2]
        if node['head'] in addresses:
            return True, node['rel']
    return False, None

def extract_features_3(tree, entities, e1, e2):
    """Build feature set 3 for the entity pair ``(e1, e2)``.

    Extends feature set 2 in two ways:
      * ``1under2`` / ``2under1`` are followed by ``1under2_rel=<rel>`` /
        ``2under1_rel=<rel>`` with the relation reported by ``is_under_v2``;
      * each context lemma (``lb1``, ``lib``, ``la2``) is followed by the
        token's tag (``pb1``, ``pib``, ``pa2``).

    Returns the list of feature strings.
    """
    entity1, entity2 = get_entity_nodes(tree, entities, e1, e2)

    features = []

    parent1, rel1 = get_entity_parent_v2(tree, entity1)
    parent2, rel2 = get_entity_parent_v2(tree, entity2)

    for i, (head, rel) in enumerate(zip(parent1, rel1)):
        features.extend(['h1_lemma_%d=%s' % (i, head['lemma']),
                         'h1_word_%d=%s' % (i, head['word']),
                         'h1_tag_%d=%s' % (i, head['tag']),
                         'h1_rel_%d=%s' % (i, rel)])
    for i, (head, rel) in enumerate(zip(parent2, rel2)):
        features.extend(['h2_lemma_%d=%s' % (i, head['lemma']),
                         'h2_word_%d=%s' % (i, head['word']),
                         'h2_tag_%d=%s' % (i, head['tag']),
                         'h2_rel_%d=%s' % (i, rel)])

    if same_v2(parent1, parent2):
        features.append('under_same')

    if same_verb_v2(parent1, parent2):
        features.append('under_same_verb')

    is_1_under_2, rel_1_2 = is_under_v2(entity1, entity2)
    if is_1_under_2:
        features.extend(['1under2', '1under2_rel=%s' % rel_1_2])

    is_2_under_1, rel_2_1 = is_under_v2(entity2, entity1)
    if is_2_under_1:
        features.extend(['2under1', '2under1_rel=%s' % rel_2_1])

    if parent_lemma_belongs_v2(parent1, ['interact', 'interaction']):
        features.append('interaction')

    start1, end1 = get_entity_limits(entity1)
    start2, end2 = get_entity_limits(entity2)

    for k in sorted(tree.keys()):
        node = tree[k]
        if 'start' not in node.keys():
            continue
        position = node['start']

        # Independent tests: they can fire together when the entity spans
        # overlap or appear in reverse order.
        if position < start1:
            features.append('lb1=%s' % node['lemma'])
            features.append('pb1=%s' % node['tag'])

        if end1 < position < start2:
            features.append('lib=%s' % node['lemma'])
            features.append('pib=%s' % node['tag'])

        if end2 < position:
            features.append('la2=%s' % node['lemma'])
            features.append('pa2=%s' % node['tag'])

    return features

In [7]:
def output_features(sent_id, e1, e2, gold_class, features, extra_info_outf, outf):
    """Write one feature line to each of the two output streams.

    ``outf`` receives ``gold_class`` followed by the tab-separated features;
    ``extra_info_outf`` receives the same line prefixed by ``sent_id``, ``e1``
    and ``e2``. Every line ends with a newline.
    """
    features_str = '\t'.join(features)

    plain_line = '\t'.join((gold_class, features_str))
    outf.write(plain_line)
    outf.write("\n")

    info_line = '\t'.join((sent_id, e1, e2, gold_class, features_str))
    extra_info_outf.write(info_line)
    extra_info_outf.write("\n")

In [8]:
def main_extract_features(input_dir, extra_info_output_file, output_file, number):
    """Extract features for every entity pair of every XML file in ``input_dir``.

    Parameters:
        input_dir: directory whose entries are all parsed as XML files.
        extra_info_output_file: path of the file that receives, per pair, the
            sentence id, both entity ids, the gold class and the features.
        output_file: path of the file that receives the gold class and the
            features only.
        number: feature-set selector; 1 -> ``extract_features_1``,
            2 -> ``extract_features_2``, anything else -> ``extract_features_3``.

    Both output files are opened for writing (truncating existing content) and
    are closed even when parsing or feature extraction raises. Sentences
    without any ``pair`` element are skipped before parsing, since they
    contribute no output lines.
    """
    # Choose the extractor once instead of re-testing `number` for every pair.
    if number == 1:
        extract = extract_features_1
    elif number == 2:
        extract = extract_features_2
    else:
        extract = extract_features_3

    with open(output_file, "w") as outf, \
            open(extra_info_output_file, "w") as extra_info_outf:
        # process each file in directory
        for filename in os.listdir(input_dir):
            # parse XML file, obtaining a DOM tree
            fullname = os.path.join(input_dir, filename)
            root = ET.parse(fullname).getroot()

            # process each sentence in the file
            for sentence in root.findall('sentence'):
                pairs = sentence.findall('pair')
                if not pairs:
                    # Nothing would be written for this sentence, so avoid the
                    # parser round trip.
                    continue

                sent_id = sentence.get('id')
                # Join the '\r\n'-separated pieces with '. ', which has the
                # same length as '\r\n'; empty pieces are dropped.
                pieces = sentence.get('text').split('\r\n')
                sent_text = pieces[0]
                for piece in pieces[1:]:
                    if len(piece) >= 1:
                        sent_text += ('. ' + piece)

                # load sentence entities into a dictionary:
                # entity id -> list of [start, end] offset strings
                entities = {}
                for ent in sentence.findall('entity'):
                    offsets = ent.get('charOffset').split(';')
                    entities[ent.get('id')] = [o.split('-') for o in offsets]

                # Kept separate from the XML tree above (the original reused
                # the name `tree` for both).
                parse_tree = analyze(sent_text)

                # one output line per candidate pair
                for pair in pairs:
                    e1 = pair.get('e1')
                    e2 = pair.get('e2')

                    gold_class = pair.get('type')
                    if gold_class is None:
                        gold_class = 'null'

                    features = extract(parse_tree, entities, e1, e2)
                    output_features(sent_id, e1, e2, gold_class, features,
                                    extra_info_outf, outf)
    return

In [10]:
# Generate the three feature sets for the train, devel and test corpora.
# Files are written as features/<split>_features_<n> plus the matching
# features/info_<split>_features_<n> with the sentence/entity ids.
for number in (1, 2, 3):
    for split, corpus_path in (('train', train_path),
                               ('devel', devel_path),
                               ('test', test_path)):
        tail = split + '_features_' + str(number)
        main_extract_features(corpus_path, 'features/info_' + tail, 'features/' + tail, number)

## MaxEntropy Classifier
### Learner

In [11]:
def train(megam, parameters, features_file, out_train_model):
    """Print and run the shell command that trains a multiclass model.

    The command has the form
    ``ubuntu run "<megam> <parameters> multiclass <features_file> > <out_train_model>"``
    and is executed with ``os.system``. All arguments are inserted verbatim,
    so paths must already be escaped for the shell.
    """
    inner = " ".join([megam, parameters, "multiclass", features_file, ">", out_train_model])
    command = "ubuntu run \"" + inner + "\""
    print(command)
    os.system(command)
    return

In [95]:
# Train on the third feature set (train_features_3) and store the model as model.dat under model_path.
train(model_path+'megam-64.opt', '-quiet -nc -nobias -repeat 5  -tune -lambda 0.01  -minfc 3', features_path+'train_features_3', model_path+'model.dat')

ubuntu run "/mnt/d/Users/Albert/OneDrive/Educació/Master\ in\ Artificial\ Intelligence\ \(UPC\)/Semester\ 2/AHLT/Lab/AHLT-MAI/source/ddi/model/megam-64.opt -quiet -nc -nobias -repeat 5  -tune -lambda 0.01  -minfc 3 multiclass /mnt/d/Users/Albert/OneDrive/Educació/Master\ in\ Artificial\ Intelligence\ \(UPC\)/Semester\ 2/AHLT/Lab/AHLT-MAI/source/ddi/features/train_features_3 > /mnt/d/Users/Albert/OneDrive/Educació/Master\ in\ Artificial\ Intelligence\ \(UPC\)/Semester\ 2/AHLT/Lab/AHLT-MAI/source/ddi/model/model.dat"


In [57]:
# param_1 = ['-nobias', '']
# param_2 = ['-repeat 5', '-repeat 10', '']
# param_3 = ['-norm2', '']
# param_4 = ['-tune', '']
# param_5 = ['-lambda 0.9', '-lambda 0.01', '']
# param_6 = ['']
# param_7 = ['-minfc 3', '-minfc 5', '']

# param_1 = ['']
# param_2 = ['-repeat 5', '-repeat 10']
# param_3 = ['']
# param_4 = ['-tune', '']
# param_5 = ['-lambda 0.9', '-lambda 0.01', '']
# param_6 = ['']
# param_7 = ['-minfc 1', '-minfc 3', '']

# param_1 = ['-nobias']
# param_2 = ['-repeat 10']
# param_3 = ['-norm2']
# param_4 = ['-tune']
# param_5 = ['-lambda 0.9', '-lambda 0.01', '']
# param_6 = ['']
# param_7 = ['-minfc 3', '-minfc 5', '']

# #' -quiet -nc -nobias -repeat 10 -norm2 -tune -minfc 3 '

# all_parameters = []
# model_nb = 0
# for p1 in param_1:
#     for p2 in param_2:
#         for p3 in param_3:
#             for p4 in param_4:
#                 for p5 in param_5:
#                     for p6 in param_6:
#                         for p7 in param_7:
#                             parameters = ' '.join(['-quiet -nc', p1, p2, p3, p4, p5, p6, p7])
#                             print(parameters)
#                             for feat in ['1', '2', '3']:    
#                                 train(model_path+'megam-64.opt', parameters, features_path+'train_features_'+feat, model_path+'model_'+str(model_nb)+'_f'+feat+'.dat')
#                                 outf = open('model/parameters_'+str(model_nb)+'_f'+feat, "w")
#                                 outf.write(parameters)
#                                 outf.close()
#                             all_parameters.append(parameters)
#                             print("Model nb:", model_nb)
#                             model_nb += 1

### Classifier

In [25]:
def classify(megam, parameters, features_file, train_model, prediction_output):
    """Print and run the shell command that predicts classes with a trained model.

    The command has the form
    ``ubuntu run "<megam> <parameters> -predict <train_model> multiclass <features_file> > <prediction_output>"``
    and is executed with ``os.system``. All arguments are inserted verbatim,
    so paths must already be escaped for the shell.
    """
    inner = " ".join([megam, parameters, "-predict", train_model,
                      "multiclass", features_file, ">", prediction_output])
    command = "ubuntu run \"" + inner + "\""
    print(command)
    os.system(command)

In [96]:
# Devel prediction: classify devel_features_3 with model.dat and write the
# result to devel_prediction_v2 under output_path.
classify(model_path+'megam-64.opt', '-quiet -nc -nobias', features_path+'devel_features_3', model_path+'model.dat', output_path+'devel_prediction_v2')

ubuntu run "/mnt/d/Users/Albert/OneDrive/Educació/Master\ in\ Artificial\ Intelligence\ \(UPC\)/Semester\ 2/AHLT/Lab/AHLT-MAI/source/ddi/model/megam-64.opt -quiet -nc -nobias -predict /mnt/d/Users/Albert/OneDrive/Educació/Master\ in\ Artificial\ Intelligence\ \(UPC\)/Semester\ 2/AHLT/Lab/AHLT-MAI/source/ddi/model/model.dat multiclass /mnt/d/Users/Albert/OneDrive/Educació/Master\ in\ Artificial\ Intelligence\ \(UPC\)/Semester\ 2/AHLT/Lab/AHLT-MAI/source/ddi/features/devel_features_3 > /mnt/d/Users/Albert/OneDrive/Educació/Master\ in\ Artificial\ Intelligence\ \(UPC\)/Semester\ 2/AHLT/Lab/AHLT-MAI/source/ddi/output/devel_prediction_v2"


In [97]:
# Test prediction: classify test_features_3 with model.dat and write the
# result to test_prediction_v2 under output_path.
classify(model_path+'megam-64.opt', '-quiet -nc -nobias', features_path+'test_features_3', model_path+'model.dat', output_path+'test_prediction_v2')

ubuntu run "/mnt/d/Users/Albert/OneDrive/Educació/Master\ in\ Artificial\ Intelligence\ \(UPC\)/Semester\ 2/AHLT/Lab/AHLT-MAI/source/ddi/model/megam-64.opt -quiet -nc -nobias -predict /mnt/d/Users/Albert/OneDrive/Educació/Master\ in\ Artificial\ Intelligence\ \(UPC\)/Semester\ 2/AHLT/Lab/AHLT-MAI/source/ddi/model/model.dat multiclass /mnt/d/Users/Albert/OneDrive/Educació/Master\ in\ Artificial\ Intelligence\ \(UPC\)/Semester\ 2/AHLT/Lab/AHLT-MAI/source/ddi/features/test_features_3 > /mnt/d/Users/Albert/OneDrive/Educació/Master\ in\ Artificial\ Intelligence\ \(UPC\)/Semester\ 2/AHLT/Lab/AHLT-MAI/source/ddi/output/test_prediction_v2"


In [73]:
# for i in range(len(all_parameters)):
#     for feat in ['1', '2', '3']:
#         classify(model_path+'megam-64.opt', all_parameters[i], features_path+'devel_features_'+feat, model_path+'model_'+str(i)+'_f'+feat+'.dat', output_path+'devel_prediction_'+str(i)+'_f'+feat+'.dat')
#         classify(model_path+'megam-64.opt', all_parameters[i], features_path+'test_features_'+feat, model_path+'model_'+str(i)+'_f'+feat+'.dat', output_path+'test_prediction_'+str(i)+'_f'+feat+'.dat')

## Produce output and evaluate

In [29]:
def output_ddi(sent_id, id_e1, id_e2, predicted_class, outf):
    """Write one result line ``sent_id|id_e1|id_e2|<flag>|predicted_class`` to ``outf``.

    ``<flag>`` is ``0`` when ``predicted_class`` is ``'null'`` and ``1`` otherwise.
    """
    flag = '0' if predicted_class == 'null' else '1'
    outf.write('|'.join((sent_id, id_e1, id_e2, flag, predicted_class)))
    outf.write("\n")

In [30]:
def evaluate(input_dir, output_file):
    """Run ``../../eval/evaluateDDI.jar`` with ``input_dir`` and ``output_file`` as arguments.

    The command is executed through ``os.system``; the arguments are converted
    with ``str`` and inserted verbatim.
    """
    command = " ".join(["java -jar ../../eval/evaluateDDI.jar", str(input_dir), str(output_file)])
    os.system(command)
    return

In [31]:
def main_function(input_dir, features_info, prediction_file, output_file):
    """Merge pair identifiers with predictions into a result file and evaluate it.

    Parameters:
        input_dir: corpus directory passed on to ``evaluate``.
        features_info: tab-separated file whose first three columns are the
            sentence id and the two entity ids.
        prediction_file: file whose lines start with the predicted class.
        output_file: result file to write (one ``output_ddi`` line per pair).

    The two input files are read in lockstep; reading stops at the end of the
    shorter one. All files are closed before the evaluator runs, and also
    when a line cannot be processed.
    """
    with open(output_file, "w") as outf, \
            open(features_info) as textfile1, \
            open(prediction_file) as textfile2:
        for x, y in zip(textfile1, textfile2):
            saved_features = x.split('\t')
            sent_id = saved_features[0]
            id_e1 = saved_features[1]
            id_e2 = saved_features[2]

            # Only the first whitespace-separated field of the prediction
            # line is used as the class.
            predicted_class = y.split()[0]

            output_ddi(sent_id, id_e1, id_e2, predicted_class, outf)

    # get performance score (the result file is complete and closed here)
    evaluate(input_dir, output_file)

In [100]:
# Evaluate the devel predictions produced by the classify() call in this
# notebook: feature info comes from info_devel_features_3 and predictions from
# devel_prediction_v2 (the files this notebook actually writes), instead of
# the unsuffixed names no cell produces.
# NOTE(review): assumes the notebook runs from the directory that output_path /
# features_path point at, so the relative paths name the same files - confirm.
main_function(devel_path, 'features/info_devel_features_3', 'output/devel_prediction_v2', 'output/task9.2_develGoal_1.txt')

In [101]:
# Evaluate the test predictions produced by the classify() call in this
# notebook: feature info comes from info_test_features_3 and predictions from
# test_prediction_v2 (the files this notebook actually writes), instead of
# the unsuffixed names no cell produces.
# NOTE(review): assumes the notebook runs from the directory that output_path /
# features_path point at, so the relative paths name the same files - confirm.
main_function(test_path, 'features/info_test_features_3', 'output/test_prediction_v2', 'output/task9.2_testGoal_1.txt')

In [34]:
# Evaluate every grid-search model on devel and test, once per feature set.
# NOTE(review): all_parameters is only assigned in a commented-out cell of
# this notebook, so that cell must be restored and run first - confirm.
for i, _ in enumerate(all_parameters):
    for feat in ('1', '2', '3'):
        tag = str(i) + '_f' + feat
        main_function(devel_path, 'features/info_devel_features_' + feat,
                      'output/devel_prediction_' + tag + '.dat',
                      'output/task9.2_develGoal' + tag + '.txt')
        main_function(test_path, 'features/info_test_features_' + feat,
                      'output/test_prediction_' + tag + '.dat',
                      'output/task9.2_testGoal' + tag + '.txt')
