# Lab1

### Imports 

In [7]:
import os
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
import xml.etree.ElementTree as ET 

### Variables

In [11]:
# Corpus directories, relative to the notebook's working directory.
# Forward slashes are used for both paths: they work on every platform and
# avoid the invalid "\D" escape sequence of a Windows-style literal.
inputdir = 'data/Devel'
# Alias so the development set can be referred to by either name.
devel_path = inputdir
test_path = 'data/Test-NER'

### Functions

In [9]:
def tokenize(text):
    """Split *text* into tokens and locate each one in the original string.

    Args:
        text: Sentence to tokenize.

    Returns:
        A list of ``(token, start, end)`` tuples where ``start`` and ``end``
        are the inclusive character offsets of the token inside *text* and
        ``token`` is the text found at those offsets. Tokens that cannot be
        located in *text* are left out.
    """
    located = []
    cursor = 0  # search position: just past the previously located token
    for token in word_tokenize(text):
        surface = token
        start = text.find(surface, cursor)
        if start == -1 and token in ("``", "''"):
            # The NLTK tokenizer rewrites a double quote as `` or '', so the
            # token itself does not occur in the text; look for the original
            # character instead.
            surface = '"'
            start = text.find(surface, cursor)
        if start == -1:
            # Emitting -1 as an offset would also move the cursor back to
            # the start of the sentence and corrupt every following offset.
            continue
        located.append((surface, start, start + len(surface) - 1))
        cursor = start + len(surface)

    return located

def extract_entities(tokens):
    """Tag tokens as drug-related entities using affix and casing rules.

    Every token is tested against the rules in a fixed order (drug, group,
    brand, drug_n); the first rule that matches decides the type. A group
    token that directly follows another group token is merged with it into
    a single multi-word group entity.

    Args:
        tokens: Sequence of ``(word, start, end)`` items with inclusive
            integer character offsets.

    Returns:
        A list of dicts with the keys ``'name'``, ``'offset'`` (formatted as
        ``"start-end"``) and ``'type'`` (``'drug'``, ``'group'``, ``'brand'``
        or ``'drug_n'``), in order of appearance.
    """
    # Prefix and suffix rules. They never change, so build them once
    # instead of on every loop iteration.
    drug_prefixes = ('pheny', 'digox', 'warfa', 'meth', 'theophy', 'lith', 'keto', 'cime', 'insu', 'fluox', 'alcoh', 'cyclos', 'eryth', 'carba', 'rifa', 'caffe')
    drug_sufixes = ('pitant', 'dine', 'azole', 'mide', 'pine', 'line', 'mine', 'tine', 'arin', 'avir', 'azem', 'rine', 'rone', 'arbital', 'olol', 'afil', 'inol', 'zolam')

    group_prefixes = ('benzo', 'beta', 'antico', 'antide', 'antibi', 'antihi', 'nsai', 'contra')
    group_sufixes = ('steroids', 'tics', 'ants', 'ents', 'tors', 'acid', 'acids', 'ceptives', 'gens', 'pines', 'lines', 'mines')

    brand_prefixes = ('aspi', 'accu', 'beza', 'star', 'exja')
    brand_sufixes = ('tane', 'dine', 'anil')

    drug_n_prefixes = ('ibog', 'endo')
    # NOTE: 'ate' and 'sin' are short and match many ordinary English words.
    drug_n_sufixes = ('ate', 'sin', 'toxin', 'orfon')

    entities = []
    # (name, start, end) of the group entity that ends at the previous token,
    # or None when the previous token did not produce a group entity.
    open_group = None

    for token in tokens:
        word, start, end = token[0], token[1], token[2]
        lowered = word.lower()

        if lowered.startswith(drug_prefixes) or lowered.endswith(drug_sufixes):
            kind = 'drug'
        elif lowered.startswith(group_prefixes) or lowered.endswith(group_sufixes):
            kind = 'group'
        elif ((word.isupper() and len(word) > 4)
              or lowered.startswith(brand_prefixes)
              or lowered.endswith(brand_sufixes)):
            kind = 'brand'
        elif (word.isupper()
              or lowered.startswith(drug_n_prefixes)
              # The suffix test must use the suffix list, not the prefixes.
              or lowered.endswith(drug_n_sufixes)
              or (re.search(r'\d', word) is not None and '-' in word)):
            kind = 'drug_n'
        else:
            # A non-entity token ends any group that was being built.
            open_group = None
            continue

        name, first = word, start
        if kind == 'group' and open_group is not None:
            # Extend the previous group entity instead of starting a new one.
            # The gap between the two tokens is assumed to be whitespace and
            # is restored so the name has the same length as the offset span.
            prev_name, prev_start, prev_end = open_group
            entities.pop()
            name = prev_name + ' ' * (start - prev_end - 1) + word
            first = prev_start

        entities.append({'name': name, 'offset': str(first) + '-' + str(end), 'type': kind})
        # Remember the whole merged span so a third group word extends it
        # rather than replacing its first word.
        open_group = (name, first, end) if kind == 'group' else None

    return entities

def output_entities(sent_id, ents, outf):
    """Write one ``sent_id|offset|name|type`` line per entity to *outf*.

    Args:
        sent_id: Identifier of the sentence the entities belong to.
        ents: Iterable of dicts with the keys ``'offset'``, ``'name'`` and
            ``'type'``.
        outf: Writable text stream that receives the lines.
    """
    for ent in ents:
        fields = [sent_id, ent['offset'], ent['name'], ent['type']]
        outf.write('|'.join(fields) + "\n")

def evaluate(inputdir, outputfile):
    """Run the external evaluator jar on *inputdir* and *outputfile*.

    The command is passed as an argument list rather than a shell string, so
    paths containing spaces or shell metacharacters reach the evaluator
    unchanged. The evaluator's exit status is ignored.

    Args:
        inputdir: Directory passed as the first argument to the evaluator.
        outputfile: File passed as the second argument to the evaluator.
    """
    import subprocess  # local import: only this function needs it

    command = ["java", "-jar", "eval/evaluateNER.jar", str(inputdir), str(outputfile)]
    try:
        subprocess.call(command)
    except OSError as err:
        # Keep the best-effort behaviour of the shell call: a missing Java
        # runtime is reported, not raised.
        print("Could not run the evaluator: " + str(err))
    return

def my_nerc(inputdir, outputfile):
    """Tag every sentence of every file in *inputdir* and evaluate the result.

    Each file in *inputdir* is parsed as XML. For every ``sentence`` element
    directly under the root, its ``text`` attribute is tokenized, entities
    are extracted and written to *outputfile*. Once the file is complete the
    evaluator is run on it.

    Args:
        inputdir: Directory containing the XML files to process.
        outputfile: Path of the predictions file to create (overwritten).
    """
    # The context manager closes the file even if a document fails to parse.
    with open(outputfile, "w") as outf:
        for filename in os.listdir(inputdir):
            root = ET.parse(os.path.join(inputdir, filename)).getroot()

            for sentence in root.findall('sentence'):
                sent_id = sentence.get('id')
                # A sentence without a text attribute yields no tokens
                # instead of crashing the tokenizer.
                tokens = tokenize(sentence.get('text', ''))
                entities = extract_entities(tokens)
                output_entities(sent_id, entities, outf)

    # The file is closed at this point, so the evaluator sees all output.
    evaluate(inputdir, outputfile)
    return



In [10]:
# Tag the development set and evaluate the resulting predictions file.
# `inputdir` is the development directory defined in the variables cell.
my_nerc(inputdir, 'task9.1_develGoal_1.txt')

In [12]:
# Tag the test set and evaluate the resulting predictions file.
my_nerc(inputdir=test_path, outputfile='task9.1_testGoal_1.txt')