# Lab2

### Imports 

In [2]:
import os
import nltk
from nltk.tokenize import word_tokenize
import xml.etree.ElementTree as ET 
import pycrfsuite

### Paths

In [3]:
# Corpus splits: each directory is listed and every file in it is parsed as XML.
train_path = "data/Train"
devel_path = "data/Devel"
test_path = "data/Test-NER"

# Lexicon files used for the DRUGBANK / HSDB membership features.
drug_bank_path = "resources/DrugBank.txt"
HSDB_path = "resources/HSDB.txt"

### Extract drug bank

In [11]:
def extract_drug_bank(drug_bank_path):
    """Load the first '|'-separated field of every line of a lexicon file.

    Args:
        drug_bank_path: path of a UTF-8 text file with one entry per line.

    Returns:
        A list with one string per line of the file (in file order): the
        stripped line truncated at its first '|'.
    """
    with open(drug_bank_path, encoding="utf8") as lexicon:
        return [entry.strip().split('|')[0] for entry in lexicon]

# Stored as a frozenset: extract_features tests `word in drug_bank` once per
# token, and a hash lookup avoids scanning the whole lexicon list every time.
drug_bank = frozenset(extract_drug_bank(drug_bank_path))

In [12]:
def extract_HSDB(HSDB_path):
    """Load the HSDB lexicon: one entry per non-blank line.

    Args:
        HSDB_path: path of a UTF-8 text file with one entry per line.

    Returns:
        A list of the stripped, non-empty lines in file order.
    """
    # The accumulator must be a list: the previous `{}` was a dict, which has
    # no .append(), so calling this function raised AttributeError.
    HSDB = []
    with open(HSDB_path, encoding="utf8") as f:
        for line in f:
            data = line.strip()
            if data:
                HSDB.append(data)
    return HSDB

# Use the HSDB loader (the notebook previously called extract_drug_bank here).
# NOTE(review): extract_drug_bank kept only the text before the first '|'; if
# HSDB.txt ever contains '|'-separated fields, confirm which form is wanted.
# A frozenset gives constant-time `word in HSDB` checks in extract_features.
HSDB = frozenset(extract_HSDB(HSDB_path))

### Feature extractor

In [13]:
def tokenize(text):
    """Tokenize `text` and locate every token in the original string.

    Args:
        text: the sentence to tokenize.

    Returns:
        A list of (token, start, end) tuples in token order, where start/end
        are character offsets into `text` and `end` is inclusive.

    NLTK's Treebank-style tokenizer rewrites a plain double quote as `` or '',
    so those tokens do not occur verbatim in `text`. Previously the failed
    `find` produced offset -1 and rewound the search cursor, which could make
    later tokens match an earlier occurrence of the same string.
    """
    # Surface forms to try in addition to the token itself.
    alternates = {'``': ('"',), "''": ('"',)}

    list_tokens = []
    cursor = 0
    for token in word_tokenize(text):
        # Pick the earliest occurrence, at or after the cursor, of the token
        # or of one of its alternate surface forms.
        best = None
        for surface in (token,) + alternates.get(token, ()):
            pos = text.find(surface, cursor)
            if pos != -1 and (best is None or pos < best[0]):
                best = (pos, len(surface))

        if best is None:
            # Token cannot be located: anchor it at the cursor (best effort)
            # and leave the cursor alone so following tokens stay aligned.
            list_tokens.append((token, cursor, cursor + len(token) - 1))
            continue

        start, length = best
        list_tokens.append((token, start, start + length - 1))
        cursor = start + length

    return list_tokens

In [14]:
def extract_features(tokens):
    """Build one list of string features per token of a sentence.

    Args:
        tokens: sequence of tuples whose first element is the token text.

    Returns:
        A list parallel to `tokens`; each item is that token's feature list:
        its own surface features, the previous token's features (or 'BOS'),
        the next token's features (or 'EOS'), then 'DRUGBANK' / 'HSDB' when
        the token text is in the corresponding module-level lexicon.
    """
    def context(prefix, other):
        # Features describing a neighbouring word, tagged with its offset.
        return [
            prefix + ':word.lower=' + other.lower(),
            prefix + (':word.istitle=%s' % other.istitle()),
            prefix + (':word.isupper=%s' % other.isupper()),
        ]

    words = [tok[0] for tok in tokens]
    last = len(words) - 1
    all_features = []

    for pos, word in enumerate(words):
        vector = [
            'word.lower=' + word.lower(),
            'word[-3:]=' + word[-3:],
            'word[-2:]=' + word[-2:],
            'word.isupper=%s' % word.isupper(),
            'word.istitle=%s' % word.istitle(),
            'word.isdigit=%s' % word.isdigit(),
        ]

        # Left neighbour, or sentence-start marker.
        vector += context('-1', words[pos - 1]) if pos > 0 else ['BOS']
        # Right neighbour, or sentence-end marker.
        vector += context('+1', words[pos + 1]) if pos < last else ['EOS']

        # Lexicon membership flags (exact, case-sensitive match).
        for flag, lexicon in (('DRUGBANK', drug_bank), ('HSDB', HSDB)):
            if word in lexicon:
                vector.append(flag)

        all_features.append(vector)

    return all_features

In [15]:
def _parse_char_offsets(offset):
    """Parse 'start-end' or 'start-end;start-end;...' into (start, end) int pairs."""
    spans = []
    for part in (offset or '').split(';'):
        bounds = part.split('-')
        if len(bounds) != 2:
            continue
        try:
            spans.append((int(bounds[0]), int(bounds[1])))
        except ValueError:
            continue
    return spans


def output_features(sent_id, tokens, entities, features, outf):
    """Write one line per token: id, word, offsets, B/I/O label and features.

    Args:
        sent_id: sentence identifier written at the start of every line.
        tokens: list of (word, start, end) tuples with inclusive end offsets.
        entities: list of dicts with 'name', 'type' and 'offset' keys.
        features: list parallel to `tokens`; each item is a list of strings.
        outf: writable text file object.

    Labels are derived from the entities' character offsets rather than by
    matching words against the entity text. Word matching labelled any token
    that merely equalled a word of the pending entity (even outside its span)
    and could stall on an entity, leaving every later entity unlabelled.

    The first token overlapping an entity gets 'B-<type>', further overlapping
    tokens get 'I-<type>', all other tokens get 'O'. Entities whose 'offset'
    cannot be parsed are ignored.
    """
    # NOTE(review): assumes entity offsets use inclusive ends, like the token
    # offsets produced by tokenize — confirm against the corpus.
    entity_spans = [_parse_char_offsets(entity.get('offset')) for entity in entities]
    started = [False] * len(entities)

    for i in range(len(tokens)):
        word, start, end = tokens[i][0], tokens[i][1], tokens[i][2]
        features_str = ' '.join(features[i])

        label = 'O'
        for j, entity in enumerate(entities):
            overlaps = any(start <= span_end and end >= span_start
                           for span_start, span_end in entity_spans[j])
            if overlaps:
                label = ('I-' if started[j] else 'B-') + entity.get('type')
                started[j] = True
                break

        outf.write(sent_id+' '+word+' '+str(start)+' '+str(end)+' '+label+' '+features_str)
        outf.write("\n")

In [16]:
def main_extract_features(inputdir, outputfile):
    """Extract feature vectors for every sentence of every XML file in a directory.

    Args:
        inputdir: directory whose files are each parsed as XML; every
            'sentence' child of the root is processed.
        outputfile: path of the text file to (over)write with one line per
            token, as produced by output_features.

    Files are processed in sorted name order so the output is reproducible
    (os.listdir returns entries in arbitrary order).
    """
    # The context manager closes the file even if parsing a document fails.
    with open(outputfile, "w") as outf:
        for filename in sorted(os.listdir(inputdir)):
            fullname = os.path.join(inputdir, filename)
            root = ET.parse(fullname).getroot()

            for sentence in root.findall('sentence'):
                sent_id = sentence.get('id')
                tokens = tokenize(sentence.get('text'))
                entities = [{'name': entity.get('text'),
                             'type': entity.get('type'),
                             'offset': entity.get('charOffset')}
                            for entity in sentence.findall('entity')]
                features = extract_features(tokens)
                output_features(sent_id, tokens, entities, features, outf)

    return

In [17]:
# Write one feature-vector file per corpus split.
main_extract_features(inputdir=train_path, outputfile='train_feature_vectors')
main_extract_features(inputdir=devel_path, outputfile='devel_feature_vectors')
main_extract_features(inputdir=test_path, outputfile='test_feature_vectors')

### Learner

In [18]:
def read_features_and_classes(inputfile):
    """Read a feature-vector file and group its lines into sentences.

    Each non-blank line is whitespace-separated: field 0 is the sentence id,
    field 4 the class label, fields 5+ the token's features. Consecutive
    lines sharing a sentence id form one sequence.

    Args:
        inputfile: path of the feature-vector file.

    Returns:
        (features, classes): two parallel lists with one entry per sentence;
        features[k] is a list of per-token feature lists and classes[k] the
        list of the matching labels.

    Previously the line on which the sentence id changed was discarded (so
    every sentence but the first lost its first token) and the final sentence
    was never appended to the result.
    """
    features = []
    classes = []
    prev_sent_id = None
    with open(inputfile) as f:
        for line in f:
            fields = line.split()
            if len(fields) < 5:
                # Blank or truncated line: no sentence id / label to read.
                continue

            sent_id = fields[0]
            if sent_id != prev_sent_id:
                # A new sentence starts: open fresh sequences for it.
                features.append([])
                classes.append([])
                prev_sent_id = sent_id

            features[-1].append(fields[5:])
            classes[-1].append(fields[4])

    return features, classes

In [19]:
def train(features_file, model_name):
    """Train a CRF on a feature-vector file and save the model.

    Args:
        features_file: path read with read_features_and_classes.
        model_name: path the trained model is written to.
    """
    # One feature sequence and one label sequence per sentence.
    sequences, labels = read_features_and_classes(features_file)

    # c1 / c2 are CRFsuite's L1 / L2 regularisation coefficients.
    hyperparameters = {
        'c1': 1.0,
        'c2': 0.001,
        'max_iterations': 1000,
    }

    trainer = pycrfsuite.Trainer(verbose=False)
    trainer.set_params(hyperparameters)

    # Pair sequences positionally, stopping at the shorter list.
    for idx in range(min(len(sequences), len(labels))):
        trainer.append(sequences[idx], labels[idx])

    trainer.train(model_name)

In [20]:
# Fit the CRF on the training split and save it for the classifier cells.
train(features_file='train_feature_vectors', model_name='ml_model.crfsuite')

### Classifier

In [21]:
def read_features(sent_id, inputfile):
    """Return the feature lists of every token of one sentence.

    Args:
        sent_id: sentence identifier to look for (field 0 of each line).
        inputfile: path of the feature-vector file.

    Returns:
        A list, in file order, of the features (whitespace-separated fields
        from index 5 on) of each line whose first field equals `sent_id`;
        empty when the sentence does not appear in the file.
    """
    features = []
    with open(inputfile) as f:
        for line in f:
            fields = line.split()  # split once per line
            # Blank lines have no fields; indexing [0] used to raise IndexError.
            if fields and fields[0] == sent_id:
                features.append(fields[5:])

    return features

In [22]:
def predict_classes(tagger, features):
    """Tag one sentence and return the predicted labels as a list.

    Args:
        tagger: object exposing tag(features), which yields one label per token.
        features: the sentence's per-token feature lists.
    """
    return list(tagger.tag(features))

In [23]:
def output_entities(sent_id, tokens, classes, outf):
    """Write one 'sent_id|start-end|text|type' line per predicted entity.

    An entity starts at every label beginning with 'B' and extends over the
    labels immediately following it that begin with 'I'.

    Args:
        sent_id: sentence identifier.
        tokens: list of (word, start, end) tuples.
        classes: list of labels parallel to `tokens`.
        outf: writable text file object.
    """
    total = len(classes)
    for first, tag in enumerate(classes):
        if not tag.startswith('B'):
            continue

        # Extend the span over the run of 'I' labels that follows.
        last = first
        while last + 1 < total and classes[last + 1].startswith('I'):
            last += 1

        text = ' '.join(tokens[k][0] for k in range(first, last + 1))
        span = str(tokens[first][1]) + '-' + str(tokens[last][2])
        # tag[2:] drops the two-character 'B-' prefix, leaving the type.
        outf.write('|'.join([sent_id, span, text, tag[2:]]))
        outf.write("\n")
    return

In [24]:
def evaluate(inputdir, outputfile):
    """Run the Java evaluator on a results file.

    Args:
        inputdir: directory passed as the evaluator's first argument.
        outputfile: results file passed as its second argument.

    The command is passed as an argument list, not a concatenated shell
    string, so paths containing spaces or shell metacharacters reach the
    evaluator intact. A failure to launch is reported, not raised.
    """
    import subprocess  # local import: not part of the notebook's import cell

    command = ["java", "-jar", "eval/evaluateNER.jar", str(inputdir), str(outputfile)]
    try:
        subprocess.run(command)
    except OSError as err:
        # e.g. `java` is not installed or not on PATH.
        print("Could not run the evaluator: %s" % err)
    return

In [25]:
def my_nerc(inputdir, outputfile, model_name, features_file):
    """Tag every sentence of a corpus directory and evaluate the result.

    Args:
        inputdir: directory whose files are each parsed as XML.
        outputfile: path of the entity file to (over)write.
        model_name: path of the trained CRF model.
        features_file: feature-vector file holding the sentences' features.

    The model is opened once and the features file is read once into a
    per-sentence index. Previously both were reloaded for every sentence,
    so the whole features file was rescanned once per sentence.
    """
    # Index the features by sentence id (field 0); fields 5+ are the features.
    features_by_sentence = {}
    with open(features_file) as f:
        for line in f:
            fields = line.split()
            if fields:
                features_by_sentence.setdefault(fields[0], []).append(fields[5:])

    tagger = pycrfsuite.Tagger()
    tagger.open(model_name)
    try:
        # The context manager closes the output even if a document fails.
        with open(outputfile, "w") as outf:
            # Sorted so the output order is reproducible across runs.
            for filename in sorted(os.listdir(inputdir)):
                fullname = os.path.join(inputdir, filename)
                root = ET.parse(fullname).getroot()

                for sentence in root.findall('sentence'):
                    sent_id = sentence.get('id')
                    tokens = tokenize(sentence.get('text'))
                    features = features_by_sentence.get(sent_id, [])

                    classes = predict_classes(tagger, features)
                    output_entities(sent_id, tokens, classes, outf)
    finally:
        tagger.close()

    # Evaluate only after the output file has been flushed and closed.
    evaluate(inputdir, outputfile)
    return

In [26]:
# Tag and evaluate the development split.
my_nerc(
    inputdir=devel_path,
    outputfile='task9.1_devel_1.txt',
    model_name='ml_model.crfsuite',
    features_file='devel_feature_vectors',
)

In [27]:
# Tag and evaluate the test split.
my_nerc(
    inputdir=test_path,
    outputfile='task9.1_test_1.txt',
    model_name='ml_model.crfsuite',
    features_file='test_feature_vectors',
)