In [1]:
def read_data_from_file(file_path):
    """Read a tab-separated, CoNLL-style file into a list of tagged sentences.

    Each token line must hold exactly six tab-separated fields, in this order:
    the word followed by one label column per entity type (ADR, Di, Dr, S, F).
    A label column is either ``"O"`` or a value whose first two characters are
    kept as the prefix (e.g. ``"B-"`` / ``"I-"``); the column's own suffix is
    appended to that prefix, so the ``Dr`` column value ``"B-Drug"`` becomes
    ``"B-Dr"``.

    Sentences are separated by any line that is blank or contains no tab.

    Args:
        file_path: Path of the file to read. It is opened in text mode with
            the platform's default encoding.

    Returns:
        A list of sentences, each a list of ``(word, label)`` tuples. A token
        whose label columns are all ``"O"`` yields ``(word, "O")``. A token
        with several non-``"O"`` columns yields one tuple per such column, so
        the word appears more than once in the sentence.

    Raises:
        ValueError: If a line containing a tab does not split into exactly six
            fields after stripping surrounding whitespace.
    """
    # Suffixes are listed in the same order as the label columns in the file.
    label_suffixes = ("ADR", "Di", "Dr", "S", "F")

    data = []
    sentence = []
    with open(file_path, 'r') as file:
        for line in file:
            stripped = line.strip()
            # A token line is non-blank and has at least one tab in the raw line.
            if stripped and '\t' in line:
                word, adr, Di, Dr, S, F = stripped.split('\t')
                # NOTE(review): a word tagged in several columns is emitted once
                # per column, which lengthens the sentence — confirm this is the
                # intended handling of overlapping annotations.
                tagged = [
                    (word, column[:2] + suffix)
                    for column, suffix in zip((adr, Di, Dr, S, F), label_suffixes)
                    if column != "O"
                ]
                sentence.extend(tagged or [(word, "O")])
            elif sentence:
                # Separator line: close the sentence collected so far.
                data.append(sentence)
                sentence = []

    # Flush the final sentence when the file does not end with a separator
    # line; without this the last sentence would be silently dropped.
    if sentence:
        data.append(sentence)
    return data

# Load the CADEC train and test splits. Each result is a list of sentences,
# and each sentence is a list of (word, label) tuples built by
# read_data_from_file. Paths are relative to the notebook's working directory.
train_data = read_data_from_file("./data/CADEC/train.conll")
test_data = read_data_from_file("./data/CADEC/test.conll")

In [2]:
import numpy as np
import spacy
import nltk
import svgling
import seaborn as sns
import matplotlib.pyplot as plt
from complete_class import CompleteNER

In [3]:
# Build the tagger from the training and test sentences. The second positional
# argument is an empty list (presumably a validation/dev split — TODO confirm
# against CompleteNER's signature).
# NOTE(review): language="esp" looks like a Spanish setting, while the data
# shown in the test output is English — confirm this is intended (it may be
# irrelevant when custom=True and postag=False).
cadec_tagger = CompleteNER(train_data, [], test_data, language="esp", postag=False, custom=True)

In [7]:
# Feature switches passed to the tagger's train() call as `feature_opt`.
# Each key names a feature family and the boolean turns it on or off.
# Key order is kept exactly as before; every key appears only once ('NEXT'
# used to be listed twice, and Python silently keeps only the last value).
# NOTE(review): 'POS' is enabled although the tagger is constructed with
# postag=False — confirm the two settings are meant to differ.
features = {
    'CAPITALIZATION': True,
    'HAS_UPPER': True,
    'HAS_NUM': True,
    'PUNCTUATION': True,
    'SUF': True,
    'PRE': True,
    '2NEXT': True,
    '2PREV': True,
    'WORD': True,
    'LEN': True,
    'NEXT': True,
    'POS': True,
    'LEMMA': False,
    'CITY': False,
    'COMPANY': True,
    'CELEBRITY': False,
    'RESEARCH_ORGANIZATION': False,
    'NAME': False,
    'SURNAME': False,
    'PREV': True,
    'NUMBER': False,
    'GENDER': False,
    'PERSON': True,
    'PRONTYPE': False,
    'DEP': False,
    'HEAD_DISTANCE': False,
    'HEAD': False,
}

In [8]:
# Train the tagger using the switches in `features`. "cadec.mdl" is passed as
# `file` (presumably the path the trained model is written to — TODO confirm).
# With verbose=True the cell prints the optimizer's per-iteration training log.
cadec_tagger.train(verbose=True, file="cadec.mdl", feature_opt=features)

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 47647
Seconds required: 0.158

L-BFGS optimization
c1: 0.000000
c2: 1.000000
num_memories: 6
max_iterations: 2147483647
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 130829.429410
Feature norm: 1.000000
Error norm: 79133.903938
Active features: 47647
Line search trials: 1
Line search step: 0.000011
Seconds required for this iteration: 0.110

***** Iteration #2 *****
Loss: 77892.513069
Feature norm: 3.217942
Error norm: 25960.477036
Active features: 47647
Line search trials: 2
Line search step: 0.318133
Seconds required for this iteration: 0.119

***** Iteration #3 *****
Loss: 67166.469620
Feature norm: 2.863125
Error norm: 20968.180551
Active features: 47647
Line search trials: 1
Line search step: 1.000000
Seconds requ

In [9]:
# Evaluate on the test sentences given at construction time. With verb=True
# the cell prints gold vs. predicted tags for sentences containing errors.
# The displayed return value is a 6-tuple; its first three numbers are
# consistent with precision, recall and F1, and the last element is a 6x6
# array (presumably a confusion matrix) — TODO confirm against
# CompleteNER.test.
cadec_tagger.test(verb=True)

Sentence index: 0
GOLD sentence:  [('Dry', 'B-ADR'), ('mouth', 'I-ADR'), ('.', 'O')]
PRED sentence:  [('Dry', 'O'), ('mouth', 'O'), ('.', 'O')]
ERROR 0 --- Gold: ('Dry', 'B-ADR') Predicted: ('Dry', 'O')
ERROR 1 --- Gold: ('mouth', 'I-ADR') Predicted: ('mouth', 'O')

Sentence index: 3
GOLD sentence:  [('I', 'O'), ('take', 'O'), ('it', 'O'), ('with', 'O'), ('2', 'O'), ('propain', 'B-Dr'), ('(', 'O'), ('40', 'O'), ('mg', 'O'), ('each', 'O'), (')', 'O'), ('at', 'O'), ('a', 'O'), ('time', 'O'), (',', 'O'), ('or', 'O'), ('2', 'O'), ('mypaid', 'B-Dr'), ('forte', 'I-Dr'), ('(', 'O'), ('Ibuprofen', 'B-Dr'), ('400', 'I-Dr'), ('mg', 'I-Dr'), ('Paracetamol', 'B-Dr'), ('325', 'I-Dr'), ('mg', 'I-Dr'), ('each', 'O'), (')', 'O'), ('(', 'O'), ('Don', 'O'), ("'", 'O'), ('t', 'O'), ('do', 'O'), ('this', 'O'), ('!!!)', 'O'), ('and', 'O'), ('it', 'O'), ('don', 'O'), ("'", 'O'), ('t', 'O'), ('work', 'O'), ('for', 'O'), ('more', 'O'), ('than', 'O'), ('an', 'O'), ('hour', 'O'), ('.', 'O')]
PRED sentence:  [('

(0.6716589861751152,
 0.548447789275635,
 0.6038322112894873,
 2690,
 0.911040221001743,
 array([[0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00],
        [1.100e+03, 2.662e+03, 1.300e+01, 0.000e+00, 1.200e+01, 1.600e+01],
        [6.300e+01, 2.900e+01, 2.300e+01, 0.000e+00, 2.000e+01, 1.000e+00],
        [1.330e+02, 1.000e+01, 0.000e+00, 3.370e+02, 4.000e+00, 0.000e+00],
        [1.400e+02, 8.900e+01, 8.000e+00, 2.000e+00, 2.400e+01, 1.000e+00],
        [4.900e+01, 5.000e+01, 2.000e+00, 0.000e+00, 3.000e+00, 1.400e+01]]))