In [None]:
# # install dependencies
# !pip install datasets dashscope openai requests retrying numpy func_timeout bert_score transformers

In [None]:
# LLMs under evaluation; each name is the prefix of a "<model>_<mode>_dic.json" result file.
model_list = [
    "ernie-turbo",
    "chatglm2_6b_32k",
    "qwen-turbo",
    "baichuan2-7b-chat-v1",
    "gpt-3.5-turbo",
    "gpt-4",
    "gpt-4-1106-preview",
]

# Prompting modes; each one is the middle part of the result file name.
mode_list = [
    "without_info",
    "with_whole_song",
    "with_rf",
]

# Short labels, one per entry of mode_list (not referenced by the cells visible here).
line_list = [
    "wo_info",
    "w_song",
    "w_rf",
]

# Header and alignment row of the Markdown results table; score rows are appended later.
res_table_s = (
    "|LLM|Method|Precision|Recall|F1|\n"
    "|:---:|:---:|:---:|:---:|:---:|\n"
)

In [None]:
import os
import json
from bert_score import score
import numpy as np

skip_flag=True  # whether to skip the samples, which the api platform refuses to give answers for due to the safety system

# Score every "<model>_<mode>_dic.json" result file with bert_score, save the
# per-sample precision/recall/F1 to "<model>_<mode>_bertscore.npz" and append one
# row of mean scores to the Markdown table in res_table_s.
# NOTE: res_table_s is initialised in an earlier cell, so re-running only this
# cell appends every row a second time; re-run that cell first.
for model in model_list:
    for mode in mode_list:
        file_path=model+"_"+mode+"_dic.json"

        # Placeholders used when there is nothing to score (file missing or no
        # usable samples). The means MUST be reset on every iteration: otherwise a
        # missing file raises NameError on the first iteration, or silently reuses
        # the previous file's means in its table row.
        Precision="00.0000"
        Recall="00.0000"
        F1="00.0000"
        Precision_mean=0.0
        Recall_mean=0.0
        F1_mean=0.0

        if not os.path.exists(file_path):
            print("Not found: "+file_path)
        else:
            print("For: "+file_path)
            predictions=[]
            references=[]

            with open(file_path,"r",encoding='utf-8') as f:
                dic=json.load(f)

            for value in dic.values():
                pred=value["pred"]
                label=value["label"]
                if skip_flag and pred=="inappropriate error":
                    print("skip")
                    continue
                # make sure the pred ends with "。"; endswith() is also safe for
                # an empty prediction, where pred[-1] would raise IndexError
                if not pred.endswith("。"):
                    pred=pred+"。"
                predictions.append(pred)
                references.append(label)

            if predictions:
                Precision, Recall, F1 = score(predictions, references, lang='zh', rescale_with_baseline=True,batch_size=128)
                Precision_mean=float(Precision.mean())
                Recall_mean=float(Recall.mean())
                F1_mean=float(F1.mean())
            else:
                # every sample was skipped (or the file held none): keep the placeholders
                print("No samples to score: "+file_path)

        npz_file_name=f"{model}_{mode}_bertscore.npz"
        np.savez(npz_file_name, Precision=Precision, Recall=Recall, F1=F1)

        res_table_s=res_table_s+"|*"+model+"*|*"+mode+"*|"+"%0.4f|%0.4f|%0.4f|\n"%(Precision_mean,Recall_mean,F1_mean)

In [None]:
# Use print() rather than a bare expression so the newlines are rendered and the
# output can be pasted directly as a Markdown table.
print(res_table_s)# markdown format

|LLM|Method|Precision|Recall|F1|
|:---:|:---:|:---:|:---:|:---:|
|*ernie-turbo*|*without_info*|-0.0350|0.1568|0.0511|
|*ernie-turbo*|*with_whole_song*|0.2472|0.5765|0.3895|
|*ernie-turbo*|*with_rf*|0.3600|0.6528|0.4864|
|*chatglm2_6b_32k*|*without_info*|0.0466|0.1787|0.1066|
|*chatglm2_6b_32k*|*with_whole_song*|0.2361|0.4606|0.3335|
|*chatglm2_6b_32k*|*with_rf*|0.4650|0.6477|0.5436|
|*qwen-turbo*|*without_info*|0.2331|0.2150|0.2208|
|*qwen-turbo*|*with_whole_song*|0.7673|0.8041|0.7804|
|*qwen-turbo*|*with_rf*|0.8600|0.8251|0.8386|
|*baichuan2-7b-chat-v1*|*without_info*|0.1755|0.2012|0.1857|
|*baichuan2-7b-chat-v1*|*with_whole_song*|0.4635|0.6324|0.5371|
|*baichuan2-7b-chat-v1*|*with_rf*|0.6567|0.7272|0.6851|
|*gpt-3.5-turbo*|*without_info*|0.2201|0.1983|0.2061|
|*gpt-3.5-turbo*|*with_whole_song*|0.8031|0.7812|0.7884|
|*gpt-3.5-turbo*|*with_rf*|0.8110|0.7484|0.7758|
|*gpt-4*|*without_info*|0.2426|0.2377|0.2376|
|*gpt-4*|*with_whole_song*|0.8405|0.8587|0.8464|
|*gpt-4*|*with_rf*|0.8865|0.8643|0.8732|
|*gpt-4-1106-preview*|*without_info*|0.2345|0.2061|0.2179|
|*gpt-4-1106-preview*|*with_whole_song*|0.8411|0.8117|0.8231|
|*gpt-4-1106-preview*|*with_rf*|0.8230|0.7678|0.7921|