In [1]:
from preprocessing import *
from crf import *
import os

# Project root: the parent of the current working directory
# (os.path.abspath("") resolves to the directory the kernel runs in).
ROOT_DIR = os.path.dirname(os.path.abspath(""))

# Load the training token file through the project's `load_tokens` helper.
train_data_tokens = load_tokens(os.path.join(ROOT_DIR, "data", "training_data_tokens.json"))

# Read the raw training data inside a context manager so the file handle is
# closed deterministically (the bare `json.load(open(...))` form leaked it),
# and pin the encoding to UTF-8 to match how the same file is read by the
# evaluation cell further down, instead of depending on the locale default.
with open(os.path.join(ROOT_DIR, "data", "training_data.json"), "r", encoding="utf8") as _f:
	train_data = json.load(_f)

  return torch._C._cuda_getDeviceCount() > 0


In [3]:
# Build the BIO tags from the raw training data and its tokens, then
# persist them as JSON next to the other training artefacts.
bio_tags_path = os.path.join(ROOT_DIR, "data", "training_data_bio.json")
train_data_bio = create_bio_tags(train_data, train_data_tokens)
with open(bio_tags_path, "w") as bio_file:
	json.dump(train_data_bio, bio_file)

In [4]:
# Location handed to the CRF wrapper as its `model_path`.
crf_model_path = os.path.join(ROOT_DIR, "models", "crf_0_0.crfsuite")

# Settings forwarded to the CRF wrapper as `trainer_params`.
# NOTE(review): c1/c2/max_iterations look like the usual crfsuite trainer
# options (L1/L2 penalties, iteration cap), while padding/before_lim/after_lim
# are presumably project-specific context-window settings; confirm in crf.py.
crf_trainer_params = dict(
	c1=1e-3,
	c2=1e-1,
	max_iterations=50,
	padding=True,
	before_lim=6,
	after_lim=1,
)

model = CRF(model_path=crf_model_path, trainer_params=crf_trainer_params, verbose=False)

In [4]:
# Precompute POS tags for the training tokens: read the token file and
# write the result to the training POS file that the later training and
# prediction cells pass in as their POS path.
training_tokens_path = os.path.join(ROOT_DIR, "data", "training_data_tokens.json")
training_pos_path = os.path.join(ROOT_DIR, "data", "training_data_pos.json")
precompute_pos(tokens_path=training_tokens_path, pos_path=training_pos_path)

100%|██████████| 254/254 [00:31<00:00,  8.16it/s]


In [5]:
# Train the CRF on the training set: tokens, BIO labels and the
# precomputed POS tags are all read from <ROOT_DIR>/data.
data_dir = os.path.join(ROOT_DIR, "data")
model.train(
	train_tokens_path=os.path.join(data_dir, "training_data_tokens.json"),
	train_labels_path=os.path.join(data_dir, "training_data_bio.json"),
	train_pos_path=os.path.join(data_dir, "training_data_pos.json"),
)

100%|██████████| 254/254 [00:00<00:00, 3208.50it/s]


Training model...
Model trained


In [7]:
# Run the trained model over the training set and save its predictions.
# The precomputed training POS file is supplied through `pos_path`.
training_process_args = dict(
	data_path=os.path.join(ROOT_DIR, "data", "training_data.json"),
	tokens_path=os.path.join(ROOT_DIR, "data", "training_data_tokens.json"),
	save_path=os.path.join(ROOT_DIR, "data", "training_data_predictions_crf.json"),
	pos_path=os.path.join(ROOT_DIR, "data", "training_data_pos.json"),
)
model.process(**training_process_args)

100%|██████████| 254/254 [00:03<00:00, 79.58it/s]


In [8]:
from eval import EvalOfficial

# Score the training-set predictions against the gold training data
# using the project's official metric implementation.
training_gold_path = os.path.join(ROOT_DIR, "data", 'training_data.json')
training_pred_path = os.path.join(ROOT_DIR, "data", 'training_data_predictions_crf.json')

with open(training_gold_path, 'r', encoding='utf8') as _f:
	train_data = json.load(_f)

with open(training_pred_path, 'r', encoding='utf8') as _f:
	train_data_predictions = json.load(_f)

metric = EvalOfficial()
# `calc` takes the predictions first and the gold data second.
p, r, f1 = metric.calc(train_data_predictions, train_data)
print("Training", f'Precision: {p}, Recall:{r}, F1:{f1}', sep="\n")

Training
Precision: 0.9797242276351673, Recall:0.9797242276351673, F1:0.9797242276351673


In [9]:
# Run the trained model over the held-out test set and save its predictions.
# NOTE(review): unlike the training-set call, no `pos_path` is passed here;
# presumably POS tags are then computed on the fly. Confirm in crf.py.
test_process_args = dict(
	data_path=os.path.join(ROOT_DIR, "data", "test_data.json"),
	tokens_path=os.path.join(ROOT_DIR, "data", "test_data_tokens.json"),
	save_path=os.path.join(ROOT_DIR, "data", "test_data_predictions_crf.json"),
)
model.process(**test_process_args)

100%|██████████| 64/64 [00:10<00:00,  6.29it/s]


In [10]:
# Score the test-set predictions against the gold test data using the
# project's official metric implementation.
test_gold_path = os.path.join(ROOT_DIR, "data", 'test_data.json')
test_pred_path = os.path.join(ROOT_DIR, "data", 'test_data_predictions_crf.json')

with open(test_gold_path, 'r', encoding='utf8') as _f:
	test_data = json.load(_f)

with open(test_pred_path, 'r', encoding='utf8') as _f:
	test_data_predictions = json.load(_f)

metric = EvalOfficial()
# `calc` takes the predictions first and the gold data second.
p, r, f1 = metric.calc(test_data_predictions, test_data)
print("Testing", f'Precision: {p}, Recall:{r}, F1:{f1}', sep="\n")

Testing
Precision: 0.9595855897995074, Recall:0.9595855897995074, F1:0.9595855897995074


### Experimentation

In [1]:
from eval import *
from preprocessing import *
import os

# Project root: the parent of the current working directory.
ROOT_DIR = os.path.dirname(os.path.abspath(""))

# Configuration for the CRF evaluation harness. Both token sets are rebuilt
# rather than loaded from disk (load_existing_* are False).
eval_crf_config = dict(
	save_dir=os.path.join(ROOT_DIR, "temp"),
	results_dir=os.path.join(ROOT_DIR, "data"),
	train_data_path=os.path.join(ROOT_DIR, "data", 'training_data.json'),
	eval_data_path=os.path.join(ROOT_DIR, "data", 'test_data.json'),
	load_existing_train_tokens=False,
	load_existing_eval_tokens=False,
	lemmatize=False,
	remove_punctuation=True,
	replace_numbers=None,
	verbose=True,
)

# NOTE: the name `eval` shadows the Python builtin; it is kept because the
# following cell calls `eval.evaluate(...)` on this object.
eval = EvalCRF(**eval_crf_config)

Creating evaluation tokens...


100%|██████████| 64/64 [00:21<00:00,  2.99it/s]
100%|██████████| 64/64 [00:00<00:00, 66.40it/s]


Creating training tokens...


100%|██████████| 254/254 [01:12<00:00,  3.50it/s]
100%|██████████| 254/254 [00:03<00:00, 72.74it/s]


Precomputing training POS tags...


100%|██████████| 254/254 [01:00<00:00,  4.18it/s]


Precomputing evaluation POS tags...


100%|██████████| 64/64 [00:17<00:00,  3.61it/s]


Loading NLP models...


In [6]:
# Trainer settings for this experiment; they are unpacked as keyword
# arguments into the evaluation run below.
trainer_params = dict(
	c1=1e-3,
	c2=1e-1,
	max_iterations=50,
	padding=True,
	before_lim=6,
	after_lim=1,
)
# Kept as the cell's last expression so the returned metrics are displayed.
eval.evaluate(**trainer_params)

Instantiating CRF...
Training CRF...


100%|██████████| 254/254 [00:00<00:00, 2429.36it/s]


Training model...
Model trained
Predicting...


100%|██████████| 64/64 [00:00<00:00, 73.66it/s]


Calculating metrics...


{'precision': 0.9622705694039893,
 'recall': 0.9622705694039893,
 'f1': 0.9622705694039893,
 'time': 0.983}