In [None]:
import re
import spacy
import numpy as np
import pandas as pd
import json
from tqdm import tqdm
import pickle
from sklearn.model_selection import KFold
from evaluate import run_eval
import itertools

In [None]:
# Load data: read the pickled `sample2values` object that later cells split
# into cross-validation folds.
# NOTE(review): pickle.load executes arbitrary code embedded in the file, so
# only load a sample2values.pkl that comes from a trusted source.
basedir="./"
# NOTE: despite the names, `fout` / `fw` refer to an *input* file that is
# opened for reading ("rb").
fout=f"{basedir}/sample2values.pkl"
with open(fout, "rb") as fw:
    sample2values=pickle.load(fw)

In [None]:
# 设置参数字典
parameters={"c": [10, 15, 25, 35, 40, 45, 50],
            "f": [1, 2],
           }
paras=gen_grid_paras(parameters)

# 拆分数据集: 五折交叉验证
kf = KFold(n_splits=5, shuffle=True, random_state=42)
sv=list(sample2values.items())

results={}
for train_set, test_set in kf.split(sv):
    # 训练、测试数据文件生成
    trainDatas=[sv[i] for i in train_set]
    testDatas=[sv[i] for i in test_set]
    print("{} trainDatas, {} testDatas".format(len(trainDatas), len(testDatas)))
    with open(f"{basedir}/trainDatas.pkl", "wb") as fw:
        pickle.dump(trainDatas, fw)
    with open(f"{basedir}/testDatas.pkl", "wb") as fw:
        pickle.dump(testDatas, fw)

    tocrf(trainDatas, f"{basedir}/train-crf.data", hyphen="\t", labelName="LABEL")
    tocrf(testDatas, f"{basedir}/test-crf.data", hyphen="\t", labelName="LABEL")
    
    # 模型训练
    for p in paras.values():
        content=" ".join([f"-{k} {v}" for k,v in p.items()])
        cmd_train=f"crf_learn -p 32 {content} template train.data crfpp_model.bin\ncrf_test -m crfpp_model.bin test.data > test.result"
        with open("run.sh", "w") as fw2:
            fw2.write(cmd_train)
        !sh run.sh
        # 在测试数据集上评价结果
        f1=run_eval("./test.result", head=2, verbose=True)
        print("{}, f1={}".format(content, f1))
        if content not in results.keys():
            results[content]=[f1]
        else:
            results[content].append(f1)
    print("\n\n")

In [None]:
# Rank the evaluated settings: for every key in `results`, keep the raw list
# of scores alongside their mean, then print the table sorted by the mean in
# descending order.
fresults = {}
for setting, fold_scores in results.items():
    fresults[setting] = {"f1": fold_scores, "mean_f1": np.mean(fold_scores)}

# One row per setting (the transpose turns the outer keys into the index).
ranking_unsorted = pd.DataFrame(fresults).T
ranking = ranking_unsorted.sort_values("mean_f1", ascending=False)
print(ranking)