# Lab1

### Imports 

In [407]:
import os
import re
import string
import subprocess
import xml.etree.ElementTree as ET

import nltk
import pycrfsuite
import scipy.stats
import sklearn
import sklearn_crfsuite
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV
from sklearn_crfsuite import metrics
from sklearn_crfsuite import scorers

### Paths

In [408]:
train_path = 'data/Train'
devel_path = 'data/Devel'
test_path = 'data/Test-NER'

### Feature extractor

In [409]:
def tokenize(text):
    """Split a sentence into tokens with character offsets and POS tags.

    Parameters
    ----------
    text : str
        Raw sentence text.

    Returns
    -------
    list_tokens : list of (str, int, int)
        One ``(token, start, end)`` triple per token produced by
        ``word_tokenize``; ``start`` and ``end`` are offsets into ``text``
        and ``end`` is inclusive.
    pos_tags : list of str
        The tag ``nltk.pos_tag`` assigned to each token, in token order.
    """
    tokens = word_tokenize(text)
    pos_tags = [tag for _, tag in nltk.pos_tag(tokens)]

    list_tokens = []
    cursor = 0  # position in `text` from which the next token is searched
    for token in tokens:
        surface = token
        start = text.find(surface, cursor)
        if start < 0 and token in ('``', "''"):
            # word_tokenize rewrites a double quote as `` or '', so the token
            # itself is not in the text; locate the original character.
            surface = '"'
            start = text.find(surface, cursor)

        if start < 0:
            # The token cannot be located at all. Anchor it at the cursor
            # instead of emitting -1, and do not move the cursor so the
            # following tokens are still searched from a valid position.
            start = cursor
            end = start + len(token) - 1
        else:
            end = start + len(surface) - 1
            cursor = end + 1

        list_tokens.append((token, start, end))

    return list_tokens, pos_tags

In [1328]:
def extract_features(tokens, pos_tag):
    """Build one list of string features per token.

    Parameters
    ----------
    tokens : list of tuple
        Token tuples whose first element is the token text.
    pos_tag : list of str
        POS tag for each token, aligned with ``tokens`` by index.

    Returns
    -------
    list of list of str
        For every token, ``name=value`` strings describing the token itself:
        lower-cased form, prefixes and suffixes of length 2 to 5, length,
        casing, digit/dash presence, POS tag and its first character.
        Only the current token is described; no neighbouring-token features
        are emitted.
    """
    # Compiled once; used with match(), which anchors at the start, so the
    # leading '.*' makes them "contains a lower/upper-case ASCII letter".
    has_lower = re.compile(r'.*[a-z]+')
    has_upper = re.compile(r'.*[A-Z]+')

    features = []
    for i, token in enumerate(tokens):
        word = token[0]
        tag = pos_tag[i]
        mixed_case = bool(has_lower.match(word) and has_upper.match(word))

        features.append([
            'word.lower=' + word.lower(),
            'word[-5:]=' + word[-5:],
            'word[-4:]=' + word[-4:],
            'word[-3:]=' + word[-3:],
            'word[-2:]=' + word[-2:],
            'word[:2]=' + word[:2],
            'word[:3]=' + word[:3],
            'word[:4]=' + word[:4],
            'word[:5]=' + word[:5],
            'word.length=%s' % len(word),
            'word.isupper=%s' % word.isupper(),
            'word.isupperandlower=%s' % mixed_case,
            'word.containdigit=%s' % bool(re.search(r'\d', word)),
            'word.containdash=%s' % ('-' in word),
            'word.postag=' + tag,
            'word.postag_1=' + tag[0],
            'word.isalpha=%s' % word.isalpha(),
            'word.istitle=%s' % word.istitle(),
        ])

    return features

In [1329]:
def output_features(sent_id, tokens, entities, features, outf):
    """Write one labelled feature line per token to ``outf``.

    Each line is space-separated:
    ``sent_id token start end label feature feature ...`` followed by a
    newline, where ``label`` is ``B-<type>``, ``I-<type>`` or ``O``.

    Parameters
    ----------
    sent_id : str
        Identifier written as the first column of every line.
    tokens : list of (str, int, int)
        ``(text, start, end)`` for each token.
    entities : list of dict
        Entities of the sentence, each with ``'name'`` and ``'type'`` keys,
        consumed in list order.
    features : list of list of str
        Feature strings for each token, aligned with ``tokens``.
    outf : file-like
        Object with a ``write`` method.

    Notes
    -----
    Tokens are matched to the current entity by comparing the token text
    with the whitespace-separated words of the entity name; the entity
    offsets are not consulted.
    NOTE(review): an entity whose words never appear as tokens is never
    passed, so later entities stay unlabelled; a repeated or out-of-order
    word can also be mislabelled. Consider matching on offsets instead.
    """
    entity_idx = 0  # index of the entity currently being matched
    for i, token in enumerate(tokens):
        word, start, end = token[0], token[1], token[2]
        features_str = ' '.join(features[i])

        label = 'O'
        if entity_idx < len(entities):
            entity = entities[entity_idx]
            name_words = entity.get('name').split()
            if word in name_words:
                position = name_words.index(word)
                prefix = 'B-' if position == 0 else 'I-'
                label = prefix + entity.get('type')
                # The last word of the name closes this entity.
                if position == len(name_words) - 1:
                    entity_idx += 1

        columns = [sent_id, word, str(start), str(end), label, features_str]
        outf.write(' '.join(columns) + "\n")

In [1330]:
def main_extract_features(inputdir, outputfile):
    """Extract labelled feature vectors for every sentence under ``inputdir``.

    Every entry of ``inputdir`` is parsed as an XML file. For each
    ``sentence`` element the ``text`` attribute is tokenized, features are
    computed per token, and one line per token is written to ``outputfile``
    together with a label derived from the sentence's ``entity`` children.

    Parameters
    ----------
    inputdir : str
        Directory containing the XML files.
    outputfile : str
        Path of the feature-vector file to create (overwritten if present).
    """
    # `with` guarantees the file is closed even if parsing a document fails.
    with open(outputfile, "w") as outf:
        # os.listdir order is arbitrary; sorting makes the output file
        # identical from run to run.
        for filename in sorted(os.listdir(inputdir)):
            root = ET.parse(os.path.join(inputdir, filename)).getroot()

            for sentence in root.findall('sentence'):
                sent_id = sentence.get('id')
                tokens, pos_tags = tokenize(sentence.get('text'))
                entities = [
                    {
                        'name': entity.get('text'),
                        'type': entity.get('type'),
                        'offset': entity.get('charOffset'),
                    }
                    for entity in sentence.findall('entity')
                ]
                features = extract_features(tokens, pos_tags)
                output_features(sent_id, tokens, entities, features, outf)

In [1331]:
# Build one feature-vector file per data split.
for split_dir, vectors_file in (
    (train_path, 'train_feature_vectors'),
    (devel_path, 'devel_feature_vectors'),
    (test_path, 'test_feature_vectors'),
):
    main_extract_features(split_dir, vectors_file)

### Learner

In [1332]:
def read_features_and_classes(inputfile):
    """Load a feature-vector file as per-sentence sequences.

    Each non-blank line is split on whitespace: column 0 is the sentence id,
    column 4 the label, and columns 5 onwards the token's features.
    Consecutive lines sharing a sentence id form one sequence.

    Parameters
    ----------
    inputfile : str
        Path of the feature-vector file.

    Returns
    -------
    features : list of list of list of str
        For every sentence, the feature list of each of its tokens.
    classes : list of list of str
        For every sentence, the label of each of its tokens.
    """
    features = []
    classes = []
    prev_sent_id = None
    with open(inputfile) as f:
        for line in f:
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines

            sent_id = fields[0]
            if sent_id != prev_sent_id:
                # A new sentence starts: open fresh sequences. The current
                # line is its first token and is appended below, and the
                # last sentence is already in the result when the file ends.
                features.append([])
                classes.append([])
                prev_sent_id = sent_id

            features[-1].append(fields[5:])
            classes[-1].append(fields[4])

    return features, classes

In [1333]:
def train(features_file, model_name):
    """Fit a CRF on a feature-vector file and save the model.

    Parameters
    ----------
    features_file : str
        Feature-vector file read with ``read_features_and_classes``.
    model_name : str
        Path the trained model is written to.
    """
    xseqs, yseqs = read_features_and_classes(features_file)

    trainer = pycrfsuite.Trainer(algorithm='pa', verbose=False)
    # NOTE(review): 'c' and 'epsilon' look like the result of a parameter
    # search - confirm where these values came from.
    trainer.set_params({
        'c': 0.21600273890535607,
        'epsilon': 0.004802939229551229,
        'type': 2,
        'feature.possible_transitions': True,
        'feature.possible_states': True,
        'max_iterations': 100
    })

    # One (features, labels) sequence pair per sentence.
    for xseq, yseq in zip(xseqs, yseqs):
        trainer.append(xseq, yseq)

    trainer.train(model_name)

In [1334]:
# Train on the training-split feature vectors and persist the model.
train(features_file='train_feature_vectors', model_name='ml_model.crfsuite')

### Classifier

In [1335]:
def read_features(sent_id, inputfile):
    """Return the token features of one sentence from a feature-vector file.

    Parameters
    ----------
    sent_id : str
        Sentence id to select (column 0 of the file).
    inputfile : str
        Path of the feature-vector file.

    Returns
    -------
    list of list of str
        Columns 5 onwards of every line whose first column equals
        ``sent_id``, in file order; empty if the id does not occur.
    """
    rows = []
    with open(inputfile) as f:
        for line in f:
            fields = line.split()  # split once per line
            # `fields` is empty for a blank line; skip it instead of
            # failing on fields[0].
            if fields and fields[0] == sent_id:
                rows.append(fields[5:])
    return rows

In [1336]:
def predict_classes(tagger, features):
    """Label one sentence with ``tagger``.

    Parameters
    ----------
    tagger : object
        Object exposing ``tag(features)`` that yields one label per token.
    features : list of list of str
        Feature list of each token of the sentence.

    Returns
    -------
    list
        The labels produced by ``tagger.tag``, copied into a new list.
    """
    return list(tagger.tag(features))

In [1337]:
def output_entities(sent_id, tokens, classes, outf):
    """Write the entities encoded by a B/I/O label sequence.

    An entity starts at every label beginning with ``'B'`` and extends over
    the labels beginning with ``'I'`` that immediately follow it. One line
    is written per entity:
    ``sent_id|start-end|text|type``, where ``start`` is the first token's
    start offset, ``end`` the last token's end offset, ``text`` the token
    texts joined by single spaces, and ``type`` the starting label with its
    first two characters removed.

    Parameters
    ----------
    sent_id : str
        Sentence identifier.
    tokens : list of (str, int, int)
        ``(text, start, end)`` per token, aligned with ``classes``.
    classes : list of str
        Label for each token.
    outf : file-like
        Object with a ``write`` method.
    """
    total = len(classes)
    starts = [idx for idx, label in enumerate(classes) if label.startswith('B')]

    for begin in starts:
        # Advance past the run of I-labels that continues this entity.
        stop = begin + 1
        while stop < total and classes[stop].startswith('I'):
            stop += 1
        last = stop - 1

        surface = ' '.join([tokens[k][0] for k in range(begin, stop)])
        span = str(tokens[begin][1]) + '-' + str(tokens[last][2])
        outf.write(sent_id + '|' + span + '|' + surface + '|' + classes[begin][2:])
        outf.write("\n")
    return

In [1338]:
def evaluate(inputdir, outputfile):
    """Run the Java evaluator on a prediction file.

    Executes ``java -jar eval/evaluateNER.jar <inputdir> <outputfile>``.
    The evaluator's exit status is not checked.

    Parameters
    ----------
    inputdir : str
        Directory passed as the evaluator's first argument.
    outputfile : str
        Prediction file passed as the evaluator's second argument.
    """
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact and cannot be used to inject commands.
    command = ["java", "-jar", "eval/evaluateNER.jar", str(inputdir), str(outputfile)]
    try:
        subprocess.run(command, check=False)
    except FileNotFoundError:
        # Without a shell a missing `java` raises instead of returning a
        # non-zero status; report it and carry on, as before.
        print("evaluate: 'java' executable not found; evaluation skipped")
    return

In [1339]:
def _load_features_by_sentence(features_file):
    """Read a feature-vector file once and group feature columns by sentence id."""
    by_sentence = {}
    with open(features_file) as f:
        for line in f:
            fields = line.split()
            if fields:  # ignore blank lines
                by_sentence.setdefault(fields[0], []).append(fields[5:])
    return by_sentence


def my_nerc(inputdir, outputfile, model_name, features_file):
    """Tag every sentence under ``inputdir`` and evaluate the predictions.

    Every entry of ``inputdir`` is parsed as an XML file. For each
    ``sentence`` element the precomputed features are looked up by sentence
    id, labelled with the CRF model, and the resulting entities are written
    to ``outputfile``. ``evaluate`` is then run on the result.

    Parameters
    ----------
    inputdir : str
        Directory containing the XML files.
    outputfile : str
        Path of the prediction file to create (overwritten if present).
    model_name : str
        Path of the trained model to load.
    features_file : str
        Feature-vector file covering the sentences of ``inputdir``.
    """
    # Index the feature file once instead of rescanning it per sentence.
    features_by_sentence = _load_features_by_sentence(features_file)

    # The model does not change between sentences: open it a single time.
    tagger = pycrfsuite.Tagger()
    tagger.open(model_name)
    try:
        with open(outputfile, "w") as outf:
            # Sorted so the prediction file is identical from run to run.
            for filename in sorted(os.listdir(inputdir)):
                root = ET.parse(os.path.join(inputdir, filename)).getroot()
                for sentence in root.findall('sentence'):
                    sent_id = sentence.get('id')
                    # Tokens are needed for their offsets; POS tags are not.
                    tokens, _ = tokenize(sentence.get('text'))
                    features = features_by_sentence.get(sent_id, [])
                    classes = predict_classes(tagger, features)
                    output_entities(sent_id, tokens, classes, outf)
    finally:
        tagger.close()

    # The output file is closed (and flushed) before the evaluator reads it.
    evaluate(inputdir, outputfile)
    return

In [1340]:
# Tag and evaluate the development split.
my_nerc(
    inputdir=devel_path,
    outputfile='task9.1_develGoal3_1.txt',
    model_name='ml_model.crfsuite',
    features_file='devel_feature_vectors',
)

In [1341]:
# Tag and evaluate the test split.
my_nerc(
    inputdir=test_path,
    outputfile='task9.1_testGoal3_1.txt',
    model_name='ml_model.crfsuite',
    features_file='test_feature_vectors',
)