In [5]:
# Notebook configuration: every later cell reads these three names.
#
# Dataset language. Selects the data directory ../NER/<Language> and is embedded
# in the saved model and prediction file names. Swap the two lines to switch.
# Language = "English"
Language = "Chinese"
# Forwarded unchanged to data_process() as its `mode` argument.
mode = "train"
# Index of the CRF hyper-parameter preset used by the training cell
# (0 = L-BFGS, 1 = averaged perceptron, 2 = not implemented yet);
# it is also passed to sent2features().
param_num = 1

In [6]:
from pathlib import Path
import sys

# Make the directory above the notebook importable so the project packages
# imported below resolve (presumably `Part1` and `NER` live there - confirm).
# Guarded so that re-running this cell does not keep appending duplicates.
_project_root = str(Path.cwd().parent)
if _project_root not in sys.path:
    sys.path.append(_project_root)

from Part1.dataprocess import data_process, set_log, combine_data
from sklearn_crf import sent2features


def _split_features_labels(dataset):
    """Turn an iterable of (sentence, labels) pairs into two parallel lists.

    Returns ``(features, labels)`` where ``features[i]`` is the result of
    ``sent2features`` for the i-th sentence (using the notebook-level
    ``Language`` and ``param_num``) and ``labels[i]`` is its label sequence.
    """
    features = [sent2features(tokens, Language, param_num) for tokens, _ in dataset]
    labels = [tags for _, tags in dataset]
    return features, labels


# Project logging helper; what `None` selects is not visible here - TODO confirm.
set_log(None)
train_data, valid_data, test_data = data_process(f"../NER/{Language}", mode=mode)

x_train, y_train = _split_features_labels(train_data)
x_valid, y_valid = _split_features_labels(valid_data)

2024-06-01 16:47:42,165 P79816 INFO train dataset size: 3820
2024-06-01 16:47:42,166 P79816 INFO valid dataset size: 462


In [7]:
import pickle
from sklearn_crfsuite import CRF

# Hyper-parameter presets for the CRF, keyed by `param_num`.
#   0 -> L-BFGS with c1 = c2 = 0.01, at most 200 iterations
#   1 -> averaged perceptron ("ap"), at most 300 iterations
#   2 -> reserved; not implemented yet
crf_presets = {
    0: dict(algorithm="lbfgs", c1=0.01, c2=0.01, max_iterations=200),
    1: dict(algorithm="ap", max_iterations=300),
}

if param_num == 2:
    raise NotImplementedError("CRF preset 2 has not been implemented yet")
if param_num not in crf_presets:
    # Fail loudly instead of falling through: without this check `crf_model`
    # would be undefined (NameError) or, on a warm kernel, a stale model from an
    # earlier run would be re-fitted and saved under the wrong preset number.
    raise ValueError(
        f"Unknown param_num {param_num!r}; expected one of {sorted(crf_presets)} or 2"
    )

# Options shared by every preset are set here; the preset supplies the rest.
crf_model = CRF(
    all_possible_transitions=True,
    verbose=True,
    **crf_presets[param_num],
)

crf_model.fit(x_train, y_train)

# Persist the fitted model; the file name encodes both language and preset.
with open(f"crf_{Language}{param_num}.pkl", "wb") as f:
    pickle.dump(crf_model, f)

# Tag the validation sentences and write the predictions out for scoring.
y_pred = crf_model.predict(x_valid)
combined_data = combine_data([sentence for sentence, _ in valid_data], y_pred)

# NOTE: this name is not keyed by `param_num`, so runs with different presets
# overwrite each other's predictions. The evaluation cell reads `output_file`.
output_file = f"output_{Language}.txt"
with open(output_file, "w", encoding="utf-8") as file:
    file.write(combined_data)

loading training data to CRFsuite: 100%|██████████| 3820/3820 [00:01<00:00, 3075.47it/s]



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 160528
Seconds required: 0.504

Averaged perceptron
max_iterations: 300
epsilon: 0.000000

Iter 1   time=0.12  loss=445.35   feature_norm=474.25
Iter 2   time=0.10  loss=204.26   feature_norm=597.26
Iter 3   time=0.10  loss=138.04   feature_norm=673.72
Iter 4   time=0.10  loss=104.13   feature_norm=732.69
Iter 5   time=0.09  loss=70.30    feature_norm=778.48
Iter 6   time=0.10  loss=62.60    feature_norm=815.15
Iter 7   time=0.09  loss=46.13    feature_norm=846.00
Iter 8   time=0.09  loss=55.17    feature_norm=872.61
Iter 9   time=0.09  loss=42.56    feature_norm=897.66
Iter 10  time=0.09  loss=24.37    feature_norm=919.90
Iter 11  time=0.09  loss=23.68    feature_norm=939.63
Iter 12  time=0.09  loss=25.50    feature_norm=957.25
Iter 13  time=0.09  loss=14.56    feature_norm=973.35
Iter 14  time=0.0

In [8]:
from NER.check import check

# Score the predictions written by the training cell against the gold
# validation annotations for the selected language.
gold_file = f"../NER/{Language}/validation.txt"
report = check(language=Language, gold_path=gold_file, my_path=output_file)

              precision    recall  f1-score   support

      B-NAME     0.9806    0.9902    0.9854       102
      M-NAME     0.9867    0.9867    0.9867        75
      E-NAME     0.9806    0.9902    0.9854       102
      S-NAME     1.0000    1.0000    1.0000         8
      B-CONT     1.0000    1.0000    1.0000        33
      M-CONT     1.0000    1.0000    1.0000        64
      E-CONT     1.0000    1.0000    1.0000        33
      S-CONT     0.0000    0.0000    0.0000         0
       B-EDU     0.9722    0.9906    0.9813       106
       M-EDU     0.9620    1.0000    0.9806       177
       E-EDU     0.9541    0.9811    0.9674       106
       S-EDU     0.0000    0.0000    0.0000         0
     B-TITLE     0.9187    0.9187    0.9187       689
     M-TITLE     0.9050    0.9270    0.9158      1479
     E-TITLE     0.9841    0.9855    0.9848       689
     S-TITLE     0.0000    0.0000    0.0000         0
       B-ORG     0.9484    0.9502    0.9493       522
       M-ORG     0.9375    