## Lab 4: Drug-Drug Interaction (DDI) extraction using ML

### Imports

In [12]:
import os
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.parse.corenlp import CoreNLPDependencyParser
from nltk.corpus import stopwords 
import xml.etree.ElementTree as ET

### Variables

In [13]:
# Corpus directories, relative to the notebook's working directory.
devel_path = '../../data/Devel'
test_path = '../../data/Test-DDI'
train_path = '../../data/Train'
# NOTE(review): not referenced by any cell visible here -- presumably consumed
# by a later evaluation cell; confirm before removing.
outputfile = 'task9.2_develGoal_1.txt'

# Client for a CoreNLP server; one must be listening on localhost:9000
# before analyze() is called.
my_parser = CoreNLPDependencyParser(url="http://localhost:9000")

### Functions

In [14]:
def analyze(sent):
    """Dependency-parse a sentence and annotate each token with character offsets.

    The sentence is sent to the module-level ``my_parser``. Every node of the
    resulting graph is stripped down to the attributes 'address', 'head',
    'lemma', 'rel', 'word' and 'tag'. Each token node whose word can be
    located in ``sent`` additionally receives 'start' and 'end' (inclusive)
    character offsets.

    Args:
        sent: Raw sentence text.

    Returns:
        The node mapping of the parse (keyed by node address), or None when
        ``sent`` is empty or None.
    """
    # `not sent` also covers None, on which len(sent) would raise TypeError.
    if not sent:
        return None

    mytree, = my_parser.raw_parse(sent)
    tree = mytree.nodes

    # Attributes kept on every node; everything else is dropped.
    info = ('address', 'head', 'lemma', 'rel', 'word', 'tag')
    cursor = 0  # position in `sent` just past the last token located
    # Nodes are addressed 0..len(tree)-1, hence the index loop.
    for k in range(len(tree)):
        node = tree[k]
        for key in list(node):
            if key not in info:
                del node[key]

        # Node 0 is not given offsets (in NLTK's DependencyGraph it is the
        # artificial root rather than a token).
        if k == 0:
            continue

        found = sent.find(node['word'], cursor)
        if found < 0:
            # The token text does not occur verbatim after the cursor (for
            # example if the parser rewrote it). Leave the node without
            # offsets and keep the cursor where it is, so that the offsets of
            # the following tokens are not searched from a bogus position.
            continue

        node['start'] = found
        cursor = found + len(node['word'])
        node['end'] = cursor - 1

    return tree

In [43]:
def get_entity_nodes(tree, entities, e1, e2):
    """Collect the parse nodes that touch the boundaries of two entities.

    A node belongs to an entity when its 'start' equals the entity's first
    offset or its 'end' equals the entity's second offset (compared as
    strings). Nodes without a 'start' key are ignored, and a node already
    assigned to ``e1`` is never tested against ``e2``.

    Args:
        tree: Mapping of node address to node dict.
        entities: Mapping of entity id to its [start, end] offsets.
        e1: Id of the first entity.
        e2: Id of the second entity.

    Returns:
        A tuple ``(nodes_of_e1, nodes_of_e2)`` in tree iteration order.
    """
    def touches(node, ent_id):
        # Offsets are looked up lazily, only for nodes that carry offsets.
        span = entities[ent_id]
        return (str(node['start']) == str(span[0])
                or str(node['end']) == str(span[1]))

    first_nodes, second_nodes = [], []
    for node in tree.values():
        if 'start' not in node.keys():
            continue
        if touches(node, e1):
            first_nodes.append(node)
        elif touches(node, e2):
            second_nodes.append(node)
    return first_nodes, second_nodes

def extract_features(tree, entities, e1, e2):
    """Build the feature strings describing one entity pair.

    For every node matched to an entity (via ``get_entity_nodes``) the
    lemma, word, tag and address of its syntactic head are emitted, unless the
    node's head is 0, followed by the node's own dependency relation. All
    features of the first entity precede those of the second.

    Args:
        tree: Mapping of node address to node dict, as returned by analyze().
        entities: Mapping of entity id to its [start, end] offsets.
        e1: Id of the first entity of the pair.
        e2: Id of the second entity of the pair.

    Returns:
        A list of 'name=value' feature strings.
    """
    nodes_e1, nodes_e2 = get_entity_nodes(tree, entities, e1, e2)

    features = []
    # The same feature template is applied to both entities; only the 1/2
    # marker in the feature name differs.
    for marker, nodes in (('1', nodes_e1), ('2', nodes_e2)):
        for idx, node in enumerate(nodes):
            if node['head'] != 0:
                head = tree[node['head']]
                for field in ('lemma', 'word', 'tag', 'address'):
                    features.append('h%s_%s_%s=%s' % (marker, field, idx, head[field]))
            features.append('e%s_rel_%s=%s' % (marker, idx, node['rel']))

    return features

In [44]:
def output_features(sent_id, e1, e2, gold_class, features, outf):
    """Write one feature-vector record to an open text stream.

    The record is a single line: sentence id, both entity ids and the gold
    class separated by tabs, then a tab and the features joined by spaces.

    Args:
        sent_id: Sentence identifier (str).
        e1: Id of the first entity (str).
        e2: Id of the second entity (str).
        gold_class: Gold label of the pair (str).
        features: Iterable of feature strings.
        outf: Writable text stream.
    """
    columns = [sent_id, e1, e2, gold_class, ' '.join(features)]
    outf.write('\t'.join(columns) + "\n")

In [45]:
def main_extract_features(inputdir, outputfile):
    """Extract a feature vector for every entity pair found in a corpus directory.

    Every file in ``inputdir`` is parsed as XML. For each 'sentence' element
    the sentence text is analysed once, and one record per 'pair' element is
    written to ``outputfile`` through ``output_features``.

    Args:
        inputdir: Directory containing the XML corpus files.
        outputfile: Path of the feature file to create (overwritten if it
            already exists).
    """
    # The context manager closes the file even if parsing fails midway.
    with open(outputfile, "w") as outf:
        # process each file in directory
        for filename in os.listdir(inputdir):
            # parse XML file, obtaining a DOM tree
            fullname = os.path.join(inputdir, filename)
            root = ET.parse(fullname).getroot()

            # process each sentence in the file
            for sentence in root.findall('sentence'):
                sent_id = sentence.get('id')

                # load sentence entities into a dictionary: id -> [start, end]
                entities = {}
                for ent in sentence.findall('entity'):
                    # A charOffset may list several ';'-separated spans
                    # ("a-b;c-d"). Keep the start of the first span and the
                    # end of the last one; for a single span this is exactly
                    # what splitting on '-' yields.
                    spans = ent.get('charOffset').split(';')
                    start = spans[0].split('-')[0]
                    end = spans[-1].split('-')[-1]
                    entities[ent.get('id')] = [start, end]

                # Dependency parse of the sentence (None for empty text).
                dep_tree = analyze(sentence.get('text'))

                # one output record per candidate pair of the sentence
                for pair in sentence.findall('pair'):
                    e1 = pair.get('e1')
                    e2 = pair.get('e2')

                    gold_class = pair.get('type')
                    if gold_class is None:
                        gold_class = 'null'

                    # Without a parse there is nothing to extract; still emit
                    # the pair so that it is not lost from the output.
                    if dep_tree is None:
                        features = []
                    else:
                        features = extract_features(dep_tree, entities, e1, e2)
                    output_features(sent_id, e1, e2, gold_class, features, outf)

In [46]:
# Produce one feature-vector file per corpus split (train, devel, test).
for corpus_dir, vectors_file in (
    (train_path, 'train_feature_vectors'),
    (devel_path, 'devel_feature_vectors'),
    (test_path, 'test_feature_vectors'),
):
    main_extract_features(corpus_dir, vectors_file)

## Maximum Entropy Classifier
### Learner

In [17]:
def read_features_and_classes(inputfile):
    """Read a feature-vector file and group its records by sentence.

    Each non-blank line is split on whitespace into: sentence id, first
    entity id, second entity id, class, and then zero or more feature
    strings. Consecutive lines sharing a sentence id form one group.

    Args:
        inputfile: Path of the feature-vector file.

    Returns:
        A tuple ``(features, classes)``. ``features[i][j]`` is the list of
        feature strings of the j-th pair of the i-th sentence and
        ``classes[i][j]`` is the class of that pair.

    Raises:
        IndexError: If a non-blank line has fewer than four fields.
    """
    features = []
    classes = []
    prev_sent_id = None
    with open(inputfile) as f:
        for line in f:
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines

            sent_id = fields[0]
            if sent_id != prev_sent_id:
                # A new sentence starts: open a fresh group. The current line
                # is then added to it below, so no record is dropped.
                features.append([])
                classes.append([])
                prev_sent_id = sent_id

            # Field 3 is the class and the features start at field 4
            # (fields 0-2 are the sentence id and the two entity ids).
            classes[-1].append(fields[3])
            features[-1].append(fields[4:])

    return features, classes

In [18]:
def train(features_file, model_name):
    """Train a maximum entropy classifier from a feature-vector file.

    The records read by ``read_features_and_classes`` are flattened into the
    ``(featureset, label)`` pairs that ``nltk.MaxentClassifier.train``
    expects. Each 'name=value' feature string becomes one entry of the
    featureset dict, split at the first '='.

    Args:
        features_file: Path of the training feature-vector file.
        model_name: Accepted for interface compatibility; the model is not
            written to disk by this function.

    Returns:
        The trained classifier.

    Raises:
        ValueError: If the feature file yields no training instances.
    """
    # Get features of train data
    features_train, gs_train = read_features_and_classes(features_file)
    numIterations = 100

    # Flatten the per-sentence groups and pair every feature vector with its
    # gold label.
    train_toks = []
    for sent_features, sent_classes in zip(features_train, gs_train):
        for pair_features, label in zip(sent_features, sent_classes):
            featureset = {}
            for feat in pair_features:
                name, _, value = feat.partition('=')
                featureset[name] = value
            train_toks.append((featureset, label))

    if not train_toks:
        raise ValueError('no training instances found in %r' % (features_file,))

    algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[0]
    classifier = nltk.MaxentClassifier.train(train_toks, algorithm, max_iter=numIterations)
    classifier.show_most_informative_features(10)

    return classifier

In [None]:
# Train on the feature vectors extracted from the training corpus.
train('train_feature_vectors', 'ml_model.megam')

### Classifier