In [1]:
from preprocessing import *
from crf import *
import os

# Project root: the parent of the directory the kernel runs in
# (os.path.abspath("") resolves to the current working directory).
ROOT_DIR = os.path.dirname(os.path.abspath(""))

# Tokenised training documents, loaded through the project helper.
train_data_tokens = load_tokens(os.path.join(ROOT_DIR, "data", "training_data_tokens.json"))

# Raw training annotations. A context manager closes the handle deterministically
# (the bare json.load(open(...)) form leaked it), and the explicit UTF-8 encoding
# matches how this same file is read by the evaluation cell further down.
with open(os.path.join(ROOT_DIR, "data", "training_data.json"), "r", encoding="utf8") as _f:
	train_data = json.load(_f)

  return torch._C._cuda_getDeviceCount() > 0


In [2]:
# Build the BIO label structure from the raw annotations and their tokens, then
# persist it: the training cell consumes this file by path (train_labels_path).
train_data_bio = create_bio_tags(train_data, train_data_tokens)

# Explicit UTF-8 keeps the write independent of the platform default encoding and
# consistent with the other open() calls in this notebook.
with open(os.path.join(ROOT_DIR, "data", "training_data_bio.json"), "w", encoding="utf8") as f:
	json.dump(train_data_bio, f)

In [2]:
# Instantiate the project's CRF wrapper; the model file lives under <ROOT_DIR>/models.
# NOTE(review): trainer_params mixes what look like CRFsuite optimiser settings
# (c1, c2, max_iterations) with what look like context-window options
# (padding, before_lim, after_lim) — confirm in crf.py how each key is consumed.
model = CRF(
	model_path=os.path.join(ROOT_DIR, "models", "crf_0_0.crfsuite"),
	trainer_params=dict(
		c1=1e-3,
		c2=1e-1,
		max_iterations=50,
		padding=True,
		before_lim=6,
		after_lim=1,
	),
	verbose=False,
)

In [4]:
# Run the project's POS precomputation over the training tokens file.
# Presumably this writes the tags to pos_path so later cells can pass that path
# instead of re-tagging — verify against preprocessing.precompute_pos.
_data_dir = os.path.join(ROOT_DIR, "data")
precompute_pos(
	pos_path=os.path.join(_data_dir, "training_data_pos.json"),
	tokens_path=os.path.join(_data_dir, "training_data_tokens.json"),
)

100%|██████████| 254/254 [00:31<00:00,  8.16it/s]


In [3]:
# Fit the CRF from the three on-disk training artefacts: tokens, BIO labels
# (written by the create_bio_tags cell) and POS tags (written by precompute_pos).
_data_dir = os.path.join(ROOT_DIR, "data")
model.train(
	train_pos_path=os.path.join(_data_dir, "training_data_pos.json"),
	train_labels_path=os.path.join(_data_dir, "training_data_bio.json"),
	train_tokens_path=os.path.join(_data_dir, "training_data_tokens.json"),
)

100%|██████████| 254/254 [00:00<00:00, 3330.11it/s]


In [3]:
# Spot-check one entry (index [0][4] of the loaded training tokens): display its
# tokens, the tags the model predicts for them, and its character spans.
ex_sent = train_data_tokens[0][4]
ex_tokens = ex_sent["tokens"]
(ex_tokens, np.array(model.predict(ex_tokens)), ex_sent["spans"])

(array(['diverticulosis', 'extenso', 'insuficiencia', 'renal', 'cronico',
        'colelitiasis', 'antecedente', 'quirurgico', 'exeresis', 'de',
        'lesión', 'cutanea', 'con', 'anestesia', 'local', 'protesis',
        'total', 'de', 'cadera', 'cordectomia', 'herniorrafia', 'inguinal',
        'proz', 'actual', 'var', 'de', '81a', 'que', 'a', 'raiz', 'de',
        'episodio', 'de', 'hematuria', 'macroscopico', 'él', 'realizar',
        'cistoscopia', 'que', 'ser', 'negativo', 'para', 'lesión',
        'maligno', 'pero', 'él', 'objetiir', 'estenosis', 'de', 'uretro'],
       dtype='<U14'),
 array(['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
        'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
        'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-NEG',
        'O', 'O', 'B-NEG', 'I-NEG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
       dtype='<U5'),
 array([[558, 572],
        [573, 580],
        [581, 594],
        [595, 600],
    

In [3]:
# Run the trained model over the training set and save its predictions as JSON;
# the evaluation cell below reads save_path back in. The precomputed POS file
# is supplied via pos_path.
_data_dir = os.path.join(ROOT_DIR, "data")
model.process(
	data_path=os.path.join(_data_dir, "training_data.json"),
	tokens_path=os.path.join(_data_dir, "training_data_tokens.json"),
	pos_path=os.path.join(_data_dir, "training_data_pos.json"),
	save_path=os.path.join(_data_dir, "training_data_predictions_crf.json"),
)

100%|██████████| 254/254 [00:03<00:00, 77.76it/s]


In [4]:
from eval import EvalOfficial

# Score the training-set predictions against the gold annotations.
_data_dir = os.path.join(ROOT_DIR, "data")

# Gold annotations.
with open(os.path.join(_data_dir, 'training_data.json'), 'r', encoding='utf8') as _f:
	train_data = json.load(_f)

# Predictions written by model.process for the training set.
with open(os.path.join(_data_dir, 'training_data_predictions_crf.json'), 'r', encoding='utf8') as _f:
	train_data_predictions = json.load(_f)

# NOTE(review): the recorded run prints identical precision, recall and F1 —
# confirm that this is what EvalOfficial.calc is expected to return.
metric = EvalOfficial()
p, r, f1 = metric.calc(train_data_predictions, train_data)
print("Training", f'Precision: {p}, Recall:{r}, F1:{f1}', sep="\n")

Training
Precision: 0.9330407646862721, Recall:0.9330407646862721, F1:0.9330407646862721


In [5]:
# Run the trained model over the held-out test set and save its predictions.
# NOTE(review): unlike the training call, no pos_path is passed here; presumably
# POS tags are then computed on the fly (the recorded run is much slower per
# document) — confirm in CRF.process.
_data_dir = os.path.join(ROOT_DIR, "data")
model.process(
	data_path=os.path.join(_data_dir, "test_data.json"),
	tokens_path=os.path.join(_data_dir, "test_data_tokens.json"),
	save_path=os.path.join(_data_dir, "test_data_predictions_crf.json"),
)

100%|██████████| 64/64 [00:11<00:00,  5.62it/s]


In [6]:
# Score the test-set predictions against the gold annotations.
_data_dir = os.path.join(ROOT_DIR, "data")

# Gold annotations.
with open(os.path.join(_data_dir, 'test_data.json'), 'r', encoding='utf8') as _f:
	test_data = json.load(_f)

# Predictions written by model.process for the test set.
with open(os.path.join(_data_dir, 'test_data_predictions_crf.json'), 'r', encoding='utf8') as _f:
	test_data_predictions = json.load(_f)

# NOTE(review): the recorded run prints identical precision, recall and F1 —
# confirm that this is what EvalOfficial.calc is expected to return.
metric = EvalOfficial()
p, r, f1 = metric.calc(test_data_predictions, test_data)
print("Testing", f'Precision: {p}, Recall:{r}, F1:{f1}', sep="\n")

Testing
Precision: 0.9281980567308247, Recall:0.9281980567308247, F1:0.9281980567308247
