# Lab1

### Imports 

In [29]:
import os
import nltk
from nltk.tokenize import word_tokenize
import xml.etree.ElementTree as ET 
import pycrfsuite
import re

### Paths

In [2]:
# Corpus directories. They are built with os.path.join so the separator is
# right on every platform and no backslash escape (e.g. "\T", "\D") ends up
# inside a string literal.
train_path = os.path.join('data', 'Train')
devel_path = os.path.join('data', 'Devel')
test_path = os.path.join('data', 'Test-NER')

### Feature extractor

In [15]:
def tokenize(text):
    """Tokenize *text* and locate every token in the original string.

    Args:
        text: The raw sentence.

    Returns:
        A pair ``(list_tokens, pos_tag)``. ``list_tokens`` holds one
        ``(token, start, end)`` tuple per token, where ``start`` and ``end``
        are character offsets into *text* and ``end`` is inclusive.
        ``pos_tag`` holds the part-of-speech tag of each token, in the same
        order.
    """
    tokens = word_tokenize(text)
    pos_tag = [tag for _, tag in nltk.pos_tag(tokens)]

    list_tokens = []
    cursor = 0  # position in text from which the next token is searched
    for token in tokens:
        start = text.find(token, cursor)
        width = len(token)

        if start == -1 and token in ('``', "''"):
            # The tokenizer rewrites a double quote as `` or '', so the token
            # is not literally present; look for the quote character itself.
            start = text.find('"', cursor)
            width = 1

        if start == -1:
            # Still not found: anchor the token at the cursor instead of at
            # -1, and leave the cursor alone so later tokens are still found.
            list_tokens.append((token, cursor, cursor + len(token) - 1))
            continue

        list_tokens.append((token, start, start + width - 1))
        cursor = start + width

    return list_tokens, pos_tag

In [41]:
def _neighbour_features(prefix, word, tag):
    """Return the features describing a neighbouring token, each tagged with *prefix*."""
    return [
        prefix + ':word.lower=' + word.lower(),
        prefix + ':word.isupper=%s' % word.isupper(),
        prefix + ':word.length=%s' % len(word),
        prefix + ':word.containdigit=%s' % bool(re.search(r'\d', word)),
        prefix + ':word.containdash=%s' % ('-' in word),
        prefix + ':word.postag=' + tag,
    ]


def extract_features(tokens, pos_tag):
    """Build one feature vector (a list of strings) per token.

    Args:
        tokens: Sequence of tuples whose first element is the token text.
        pos_tag: Part-of-speech tags, parallel to *tokens*.

    Returns:
        A list with one list of ``name=value`` feature strings per token.
        Each vector describes the token itself (lower-cased form, prefixes
        and suffixes of 2-5 characters, length, case, digit/dash presence,
        POS tag) followed by the same kind of description of the previous
        (``-1:``) and next (``+1:``) token. The marker ``Beginning`` or
        ``End`` replaces a missing neighbour.
    """
    features = []
    last = len(tokens) - 1

    for i, token in enumerate(tokens):
        word = token[0]

        feature_vector = [
            'word.lower=' + word.lower(),
            'word[-5:]=' + word[-5:],
            'word[-4:]=' + word[-4:],
            'word[-3:]=' + word[-3:],
            'word[-2:]=' + word[-2:],
            'word[:2]=' + word[:2],
            'word[:3]=' + word[:3],
            'word[:4]=' + word[:4],
            'word[:5]=' + word[:5],
            'word.length=%s' % len(word),
            'word.isupper=%s' % word.isupper(),
            'word.containdigit=%s' % bool(re.search(r'\d', word)),
            'word.containdash=%s' % ('-' in word),
            'word.postag=' + pos_tag[i],
        ]

        if i > 0:
            feature_vector.extend(
                _neighbour_features('-1', tokens[i - 1][0], pos_tag[i - 1]))
        else:
            feature_vector.append('Beginning')

        if i < last:
            feature_vector.extend(
                _neighbour_features('+1', tokens[i + 1][0], pos_tag[i + 1]))
        else:
            feature_vector.append('End')

        features.append(feature_vector)

    return features

In [42]:
def output_features(sent_id, tokens, entities, features, outf):
    """Write one labelled feature line per token to *outf*.

    Every line has the space-separated form
    ``sent_id word start end label feature ...`` and ends with a newline.

    Args:
        sent_id: Sentence identifier (string) repeated on every line.
        tokens: ``(word, start, end)`` tuples of the sentence.
        entities: Dicts with at least the keys ``'name'`` and ``'type'``.
            They are consumed in list order.
        features: Feature vectors (lists of strings), parallel to *tokens*.
        outf: Writable text stream.

    Labels follow the B/I/O scheme: a token equal to the first
    whitespace-separated word of the current entity's name gets
    ``B-<type>``, a token equal to any other word of it gets ``I-<type>``,
    and every other token gets ``O``. The next entity becomes current once
    the last word of the current one has been seen.

    NOTE(review): matching is done on the surface word only, so the entities
    must be listed in the order they occur in the text, and a word of the
    current entity that also appears earlier in the sentence is labelled too.
    """
    j = 0  # index of the entity currently being matched
    for i, token in enumerate(tokens):
        word = token[0]
        label = 'O'

        if j < len(entities):
            entity_words = entities[j].get('name').split()
            if word in entity_words:
                position = entity_words.index(word)
                prefix = 'B-' if position == 0 else 'I-'
                label = prefix + entities[j].get('type')
                # Move on once the last word of the entity was emitted; this
                # also covers single-word entities, whose only word is both
                # the first and the last one.
                if position == len(entity_words) - 1:
                    j += 1

        fields = [sent_id, word, str(token[1]), str(token[2]), label,
                  ' '.join(features[i])]
        outf.write(' '.join(fields) + '\n')

In [43]:
def main_extract_features(inputdir, outputfile):
    """Extract features for every sentence of every XML file in *inputdir*.

    Args:
        inputdir: Directory whose entries are all parsed as XML documents.
        outputfile: Path of the text file to (over)write with the result.

    For each ``sentence`` element directly under a document root, the
    ``text`` attribute is tokenized, the ``entity`` children are collected
    (``text``, ``type`` and ``charOffset`` attributes) and one labelled
    feature line per token is written to *outputfile*.
    """
    # The context manager closes the output file even when a document fails
    # to parse half-way through the directory.
    with open(outputfile, "w") as outf:
        for filename in os.listdir(inputdir):
            root = ET.parse(os.path.join(inputdir, filename)).getroot()

            for sentence in root.findall('sentence'):
                sent_id = sentence.get('id')
                tokens, pos_tag = tokenize(sentence.get('text'))
                entities = [
                    {
                        'name': entity.get('text'),
                        'type': entity.get('type'),
                        'offset': entity.get('charOffset'),
                    }
                    for entity in sentence.findall('entity')
                ]
                features = extract_features(tokens, pos_tag)
                output_features(sent_id, tokens, entities, features, outf)

In [44]:
# Produce one feature file per corpus split (train / devel / test).
main_extract_features(inputdir=train_path, outputfile='train_feature_vectors')
main_extract_features(inputdir=devel_path, outputfile='devel_feature_vectors')
main_extract_features(inputdir=test_path, outputfile='test_feature_vectors')

### Learner

In [45]:
def read_features_and_classes(inputfile):
    """Load a feature file and group its lines by sentence.

    Each line is split on whitespace: field 0 is the sentence id, field 4
    the gold label, and fields 5 onwards the features of one token.
    Consecutive lines sharing a sentence id form one sequence.

    Args:
        inputfile: Path of the feature file to read.

    Returns:
        A pair ``(features, gold_classes)``. ``features[k]`` is the list of
        per-token feature lists of the k-th sentence and ``gold_classes[k]``
        is the parallel list of labels. Both are empty for an empty file.
    """
    features = []
    gold_classes = []
    current_id = None

    with open(inputfile) as f:
        for line in f:
            fields = line.split()
            if len(fields) < 5:
                # Blank or truncated line: it carries no label, skip it.
                continue

            sent_id = fields[0]
            if sent_id != current_id:
                # A new sentence starts: open a fresh sequence. The current
                # line is appended below, so its token is not lost.
                features.append([])
                gold_classes.append([])
                current_id = sent_id

            features[-1].append(fields[5:])
            gold_classes[-1].append(fields[4])

    return features, gold_classes

In [46]:
def train(features_file, model_name):
    """Train a CRF on the sequences stored in *features_file*.

    Args:
        features_file: Feature file read with ``read_features_and_classes``.
        model_name: Passed to ``Trainer.train`` as the model to produce.
    """
    sequences, labels = read_features_and_classes(features_file)

    trainer = pycrfsuite.Trainer(verbose=False)
    hyperparams = {
        'c1': 1.0,
        'c2': 0.001,
        'max_iterations': 1000,
    }
    trainer.set_params(hyperparams)

    # Feed the trainer one (feature sequence, label sequence) pair per sentence.
    for pair in zip(sequences, labels):
        trainer.append(*pair)

    trainer.train(model_name)

In [47]:
# Fit the CRF on the training features.
train(features_file='train_feature_vectors', model_name='ml_model.crfsuite')

### Classifier

# TO-DO: Refactor

In [48]:
def output_entities(sent_id, tokens, classes, outf):
    """Write the entities encoded by a B/I/O label sequence to *outf*.

    Args:
        sent_id: Sentence identifier (string) written on every line.
        tokens: ``(word, start, end)`` tuples of the sentence.
        classes: One label string per token, e.g. ``'B-drug'``, ``'I-drug'``
            or ``'O'``.
        outf: Writable text stream.

    A label starting with ``B`` opens an entity whose type is the label
    text after the two-character prefix. Every directly following label
    starting with ``I`` extends it. One line
    ``sent_id|start-end|text|type`` is written per entity, where ``start``
    comes from the first token, ``end`` from the last one, and the text is
    the words joined by single spaces. Labels starting with ``I`` that do
    not follow an open entity are ignored.
    """
    # Only positions that have both a token and a label can be used.
    count = min(len(tokens), len(classes))

    i = 0
    while i < count:
        label = classes[i]
        if not label.startswith('B'):
            i += 1
            continue

        text = tokens[i][0]
        offset_ini = tokens[i][1]
        offset_end = tokens[i][2]
        entity_type = label[2:]
        i += 1

        # Absorb the continuation tokens of this entity.
        while i < count and classes[i].startswith('I'):
            text = text + ' ' + tokens[i][0]
            offset_end = tokens[i][2]
            i += 1

        outf.write('%s|%s-%s|%s|%s\n'
                   % (sent_id, offset_ini, offset_end, text, entity_type))

In [49]:
def predict_classes(features_file, model_name):
    """Tag every sequence of *features_file* with the model *model_name*.

    Returns:
        A single flat list with the predicted label of every token, the
        sentences being concatenated in file order (sentence boundaries are
        not kept).
    """
    tagger = pycrfsuite.Tagger()
    tagger.open(model_name)

    # The gold labels stored in the file are not needed for prediction.
    sequences, _ = read_features_and_classes(features_file)

    return [label for sequence in sequences for label in tagger.tag(sequence)]

In [50]:
def evaluate(inputdir, outputfile):
    """Run the Java evaluator ``eval/evaluateNER.jar`` on a result file.

    Args:
        inputdir: Directory passed to the evaluator as first argument.
        outputfile: Result file passed to the evaluator as second argument.

    The evaluator's exit status is ignored. If the ``java`` executable
    cannot be started, a message is printed instead of raising.
    """
    import subprocess

    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact.
    command = ["java", "-jar", "eval/evaluateNER.jar",
               str(inputdir), str(outputfile)]
    try:
        subprocess.run(command, check=False)
    except OSError as err:
        print("Could not launch the evaluator: %s" % err)

In [51]:
def my_nerc(inputdir, outputfile, model_name, features_file):
    """Recognise the entities of every document in *inputdir* and evaluate.

    Args:
        inputdir: Directory whose entries are all parsed as XML documents.
        outputfile: Path of the result file to (over)write.
        model_name: Model handed to ``predict_classes``.
        features_file: Feature file handed to ``predict_classes``.

    ``predict_classes`` yields one flat list of labels, so labels are dealt
    out to the sentences by token count. This assumes *features_file* holds
    one line per token of *inputdir*, in the same document and sentence
    order, as written by ``main_extract_features`` for that directory.
    """
    classes = predict_classes(features_file, model_name)
    cursor = 0  # index in `classes` of the first label of the next sentence

    # The result file must be closed (flushed) before the evaluator reads it.
    with open(outputfile, "w") as outf:
        for filename in os.listdir(inputdir):
            root = ET.parse(os.path.join(inputdir, filename)).getroot()

            for sentence in root.findall('sentence'):
                # tokenize returns (tokens, pos_tags); only tokens are needed.
                tokens, _ = tokenize(sentence.get('text'))
                sentence_classes = classes[cursor:cursor + len(tokens)]
                cursor += len(tokens)
                output_entities(sentence.get('id'), tokens,
                                sentence_classes, outf)

    evaluate(inputdir, outputfile)

In [None]:
# Tag and evaluate the development split.
my_nerc(
    inputdir=devel_path,
    outputfile='task9.1_devel_1.txt',
    model_name='ml_model.crfsuite',
    features_file='devel_feature_vectors',
)

1


In [None]:
# Tag and evaluate the test split.
my_nerc(
    inputdir=test_path,
    outputfile='task9.1_test_1.txt',
    model_name='ml_model.crfsuite',
    features_file='test_feature_vectors',
)

In [16]:
# Quick check of the tokenizer: tokens with their offsets, plus POS tags.
tokenize("Hola Albert, viva Cardona")

([('Hola', 0, 3),
  ('Albert', 5, 10),
  (',', 11, 11),
  ('viva', 13, 16),
  ('Cardona', 18, 24)],
 ['NNP', 'NNP', ',', 'FW', 'NNP'])