In [1]:
import random
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score
import joblib
import pickle

import logging
from Lib.DataLoader import DataLoader
from Lib.Diagnoser import Diagnoser

# Value interpolated into the file name of the saved full-feature GBDT model
# (presumably the accuracy that model reached — confirm).
full_acc = 0.8633

# Disease label vocabulary. Position i in this list is position i of every
# multi-hot label vector used in this notebook.
# NOTE: the name is a misspelling of "diseases"; it is kept because other
# cells reference it.
dieases = [
    "急性阑尾炎",
    "急性胰腺炎",
    "肠梗阻",
    "异位妊娠",
    "急性胆管炎",
    "急性胆囊炎",
    "上尿路结石",
    "卵巢囊肿",
    "消化道穿孔",
]

# Single-diagnosis predictor; the two paths look like a saved scaler and a
# saved GBDT model — see Lib.Diagnoser for the actual loading logic.
dg = Diagnoser(
    r'output/models/diagnose/scaler_gbdt_全特征.pkl',
    r'output/models/diagnose/gbdt_全特征_%s.m' % full_acc,
)

In [2]:
# Configure the root logger so that only ERROR-level (and above) records are emitted.
logging.basicConfig(level=logging.ERROR)

def load_multilbl_data(file_path, num_fields, separator, skip_title, shuffle=True):
    """Load a multi-label data file and split each row into features, labels and ids.

    All arguments are forwarded unchanged to ``DataLoader.load_data_lines``.

    Row layout applied here:
      * columns 0-1  -> record identifiers, returned as read
      * columns 2-10 -> label flags, converted to ``int``
      * columns 11+  -> feature values, converted to ``int``

    Returns:
        tuple: ``(X, y, ids)`` — three lists with one entry per loaded row.
    """
    id_cols = slice(0, 2)
    label_cols = slice(2, 11)
    feature_cols = slice(11, None)

    rows = DataLoader().load_data_lines(
        file_path,
        num_fields=num_fields,
        separator=separator,
        skip_title=skip_title,
        shuffle=shuffle,
    )

    def as_ints(cells):
        return list(map(int, cells))

    X = [as_ints(row[feature_cols]) for row in rows]
    y = [as_ints(row[label_cols]) for row in rows]
    ids = [row[id_cols] for row in rows]
    return X, y, ids

In [3]:
from paddlenlp import Taskflow
from pprint import pprint
from paddlenlp.transformers import AutoTokenizer
import sys

# Extend the import path with the local UIE fine-tuning directory. This must run
# before the two imports below; `model` and `predict` are presumably modules in
# that directory — TODO confirm.
sys.path.append('../../paddlenlp/uie_fitting')

from model import UIE
from predict import process_text

# Load the tokenizer and the UIE model from the local ./model_best directory.
tokenizer = AutoTokenizer.from_pretrained('./model_best')
model = UIE.from_pretrained('./model_best')

# Prompt handed to process_text: the fixed prefix followed by the comma-joined
# disease names in square brackets.
prompt = '预测疾病[%s]' % ','.join(dieases)
# Earlier Taskflow-based alternative, left disabled.
# ie = Taskflow("information_extraction", schema=schema, task_path='./model_best', device_id=0)

  resample=Image.BILINEAR,
  resample=Image.NEAREST,
  resample=Image.BICUBIC,
  resample=Image.BICUBIC,
[32m[2022-11-03 17:07:30,588] [    INFO][0m - We are using <class 'paddlenlp.transformers.ernie.tokenizer.ErnieTokenizer'> to load './model_best'.[0m


In [68]:
# Column count passed to the loader for each row of the data file.
num_fields = 184

# The separator is written as the '\t' escape rather than a literal tab
# character, so it stays visible and cannot be silently converted to spaces by
# an editor. shuffle=False keeps the rows in file order, so sample indices
# printed by later cells refer to stable rows.
X_test, y_test, ids_test = load_multilbl_data(
    r'data/uie/人机_全特征_多诊断.txt',
    num_fields=num_fields,
    separator='\t',
    skip_title=True,
    shuffle=False,
)

# Show the first sample as a sanity check: features, label vector, identifiers.
print(X_test[0])
print(y_test[0])
print(ids_test[0])

[0, 0, 1, 0, 2, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 2, 1, 2, 0, 0, 0, 0, 0, 36, 2, 1, 2, 0, 2, 2, 2, 2, 2, 0, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 2, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 2, 2, 2, 2, 2, 2, 0, 0, 0, 1, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
[0, 0, 1, 0, 0, 0, 0, 0, 0]
['10261670', '20220119']


In [54]:
def merge_results(r1_dict, r2_dict, r1_delta, r2_delta=0.9, r1_floor=0):
    """Merge the single-diagnosis and multi-diagnosis predictions for one sample.

    Decision rules, in order:
      1. No single-diagnosis labels: keep the multi-diagnosis labels whose
         probability is at least ``r2_delta``; if none qualify, keep all of them.
      2. No multi-diagnosis labels: keep all single-diagnosis labels.
      3. Any single-diagnosis probability is at least ``r1_delta``: trust the
         single-diagnosis model and keep exactly its labels.
      4. Otherwise: keep labels predicted by both models, plus
         single-diagnosis-only labels with probability >= ``r1_floor``, plus
         multi-diagnosis-only labels with probability >= ``r2_delta``.

    Args:
        r1_dict: ``{label: probability}`` from the single-diagnosis model.
        r2_dict: ``{label: probability}`` from the multi-diagnosis model.
        r1_delta: confidence at which the single-diagnosis result wins outright.
        r2_delta: minimum probability for a multi-diagnosis-only label
            (default 0.9, the previously hard-coded value).
        r1_floor: minimum probability for a single-diagnosis-only label in
            rule 4 (default 0, the previously hard-coded value).

    Returns:
        list: merged labels without duplicates. The order follows the insertion
        order of the input dicts, so it is reproducible across runs instead of
        depending on set iteration order.
    """
    if not r1_dict:
        confident = [label for label, prob in r2_dict.items() if prob >= r2_delta]
        # Fall back to everything the multi-diagnosis model said rather than
        # returning no diagnosis at all.
        return confident if confident else list(r2_dict)

    if not r2_dict:
        return list(r1_dict)

    if any(prob >= r1_delta for prob in r1_dict.values()):
        return list(r1_dict)

    merged = [label for label in r1_dict if label in r2_dict]
    merged += [
        label for label, prob in r1_dict.items()
        if label not in r2_dict and prob >= r1_floor
    ]
    merged += [
        label for label, prob in r2_dict.items()
        if label not in r1_dict and prob >= r2_delta
    ]
    return merged
    

In [56]:
def text_result_to_labels(results, labels=None):
    """Convert per-sample lists of predicted label names into multi-hot vectors.

    Args:
        results: iterable with one iterable of label names per sample.
        labels: ordered label vocabulary. Defaults to the notebook-level
            ``dieases`` list, which is only looked up when this argument is
            omitted.

    Returns:
        list[list[int]]: one vector per sample, with 1 at the vocabulary
        position of every predicted name and 0 elsewhere.

    Raises:
        ValueError: if a predicted name is not in the vocabulary.
    """
    vocab = dieases if labels is None else labels
    encoded = []
    for names in results:
        row = [0] * len(vocab)
        for name in names:
            try:
                row[vocab.index(name)] = 1
            except ValueError:
                # Same exception type as before, but say which label was
                # unexpected instead of the generic "x is not in list".
                raise ValueError(
                    f"unknown label {name!r}; expected one of {list(vocab)}"
                ) from None
        encoded.append(row)
    return encoded
    

def union_pred(X_test, r1_delta):
    """Run the combined (single-diagnosis + multi-diagnosis) prediction.

    Only the ``X_test`` argument is iterated. The function no longer zips it
    with the notebook-level ``y_test`` / ``ids_test`` lists, which were unused
    here and would silently truncate the output if their length differed.

    Args:
        X_test: iterable of feature vectors, one per sample.
        r1_delta: threshold forwarded to ``merge_results``.

    Returns:
        list: one list of predicted label names per sample, in input order.
    """
    results = []
    for X in X_test:
        # Single-diagnosis model: collect {label name: probability} from the
        # entries returned for this one sample.
        pred = dg.predict_batch([X], pred_num=None, prob_delta=0.5, out_format='dict')[0]
        result1 = {item['name']: item['prob'] for item in pred}

        # Multi-diagnosis model: the feature vector is serialised as text,
        # joined with a full-width comma, and passed with the shared prompt.
        text = '，'.join([str(x) for x in X])
        result2 = process_text(model, tokenizer, [text], prompt)[0]

        results.append(merge_results(result1, result2, r1_delta))

    return results

def single_pred(X_test):
    """Run the single-diagnosis model alone (baseline).

    Only the ``X_test`` argument is iterated. The function no longer zips it
    with the notebook-level ``y_test`` / ``ids_test`` lists, which were unused
    here and would silently truncate the output if their length differed.

    Args:
        X_test: iterable of feature vectors, one per sample.

    Returns:
        list: one list of predicted label names per sample, in input order.
    """
    results = []
    for X in X_test:
        pred = dg.predict_batch([X], pred_num=None, prob_delta=0.5, out_format='dict')[0]
        results.append([item['name'] for item in pred])

    return results

def calc_score(results, y_test):
    """Print accuracy and micro/macro F1 for predicted label vectors.

    Args:
        results: predicted label vectors, one per sample.
        y_test: gold label vectors in the same order.

    Note:
        For multi-label indicator input, scikit-learn documents
        ``accuracy_score`` as subset accuracy: a sample counts only when its
        whole label vector matches.
    """
    exact_match = accuracy_score(y_test, results)
    micro_f1, macro_f1 = (
        f1_score(y_test, results, average=mode) for mode in ('micro', 'macro')
    )
    print(f"Accuracy Score = {exact_match}")
    print(f"F1 Score (Micro) = {micro_f1}")
    print(f"F1 Score (Macro) = {macro_f1}")

In [69]:
# Combined prediction with the single-diagnosis confidence threshold set to
# 0.999, converted to multi-hot vectors and scored against the gold labels.
results1 = text_result_to_labels(union_pred(X_test, r1_delta=0.999))
calc_score(results1, y_test)

Accuracy Score = 0.8741721854304636
F1 Score (Micro) = 0.9292307692307693
F1 Score (Macro) = 0.9275322891646307


### 人机比较结果（以下各小节标题中的数值为 union_pred 的 r1_delta 阈值）：
### 基线：单诊断
Accuracy Score = 0.9006622516556292
F1 Score (Micro) = 0.9155844155844156
F1 Score (Macro) = 0.8993278401332572

#### 0.5
Accuracy Score = 0.9072847682119205
F1 Score (Micro) = 0.9190938511326862
F1 Score (Macro) = 0.9051479459533629

#### 0.7
Accuracy Score = 0.9072847682119205
F1 Score (Micro) = 0.923076923076923
F1 Score (Macro) = 0.9136326117392036

#### 0.9
Accuracy Score = 0.8874172185430463
F1 Score (Micro) = 0.9177215189873418
F1 Score (Macro) = 0.9113110671369573

#### 0.999
Accuracy Score = 0.8741721854304636
F1 Score (Micro) = 0.9292307692307693
F1 Score (Macro) = 0.9275322891646307

In [None]:
# 测试集比较结果：
# 基线：单诊断
# Accuracy Score = 0.7990115321252059
# F1 Score (Micro) = 0.8453922315308453
# F1 Score (Macro) = 0.8192913387716643

# 0.5
# Accuracy Score = 0.800658978583196
# F1 Score (Micro) = 0.8462709284627092
# F1 Score (Macro) = 0.8199911944715201

# 0.7
# Accuracy Score = 0.7990115321252059
# F1 Score (Micro) = 0.8500376789751318
# F1 Score (Macro) = 0.8221702109216127

# 0.9
# Accuracy Score = 0.7973640856672158
# F1 Score (Micro) = 0.8547904191616768
# F1 Score (Macro) = 0.8281414487293165

# 0.999
# Accuracy Score = 0.7990115321252059
# F1 Score (Micro) = 0.8602779809802488
# F1 Score (Macro) = 0.836322123848968

In [63]:
# Baseline: predictions of the single-diagnosis model alone, converted to
# multi-hot vectors and scored against the gold labels.
results2 = text_result_to_labels(single_pred(X_test))
calc_score(results2, y_test)

Accuracy Score = 0.7990115321252059
F1 Score (Micro) = 0.8453922315308453
F1 Score (Macro) = 0.8192913387716643


In [22]:
# List every sample where the combined prediction (results1) and the
# single-diagnosis prediction (results2) disagree, together with the gold
# vector and whether each prediction matches it exactly.
for idx, (combined, single, gold) in enumerate(zip(results1, results2, y_test)):
    if combined == single:
        continue
    print(idx, combined, single, gold, combined == gold, single == gold)
    

7 [1, 0, 0, 0, 0, 0, 0, 1, 0] [0, 0, 0, 0, 0, 0, 0, 1, 0] [0, 0, 0, 0, 0, 0, 0, 1, 0] False True
8 [0, 1, 0, 0, 1, 0, 0, 0, 0] [0, 1, 0, 0, 0, 0, 0, 0, 0] [0, 0, 0, 0, 1, 0, 0, 0, 0] False False
17 [0, 1, 0, 0, 0, 1, 0, 0, 0] [0, 1, 0, 0, 0, 0, 0, 0, 0] [0, 0, 0, 0, 0, 1, 0, 0, 0] False False
27 [0, 0, 0, 0, 1, 0, 0, 0, 1] [0, 0, 0, 0, 1, 0, 0, 0, 0] [0, 0, 0, 0, 0, 0, 0, 0, 1] False False
28 [0, 1, 0, 0, 1, 0, 0, 0, 0] [0, 1, 0, 0, 0, 0, 0, 0, 0] [0, 1, 0, 0, 0, 0, 0, 0, 0] False True
32 [0, 1, 0, 0, 0, 1, 0, 0, 0] [0, 1, 0, 0, 0, 0, 0, 0, 0] [0, 1, 0, 0, 0, 0, 0, 0, 0] False True
33 [0, 0, 0, 0, 0, 0, 0, 0, 1] [0, 0, 0, 0, 0, 0, 0, 0, 0] [0, 0, 0, 0, 0, 0, 0, 0, 1] True False
36 [0, 1, 0, 0, 0, 1, 0, 0, 0] [0, 1, 0, 0, 0, 0, 0, 0, 0] [0, 0, 0, 0, 1, 1, 0, 0, 0] False False
59 [0, 0, 0, 0, 0, 1, 1, 0, 0] [0, 0, 0, 0, 0, 1, 0, 0, 0] [0, 0, 0, 0, 0, 1, 0, 0, 0] False True
67 [0, 0, 0, 0, 1, 1, 0, 0, 0] [0, 0, 0, 0, 1, 0, 0, 0, 0] [0, 0, 0, 0, 1, 0, 0, 0, 0] False True
71 [0, 1, 0, 0, 1,

In [6]:
# Per-sample trace: print both model outputs, the merged result and the gold
# labels for every sample.
for idx, (X, y, ids) in enumerate(zip(X_test, y_test, ids_test)):
    print(idx + 1)

    # Single-diagnosis model: {label name: probability}
    pred = dg.predict_batch([X], pred_num=None, prob_delta=0.5, out_format='dict')[0]
    result1 = {item['name']: item['prob'] for item in pred}
    print(result1)

    # Multi-diagnosis model: the feature vector is serialised as text and
    # passed with the shared prompt.
    text = '，'.join([str(x) for x in X])
    result2 = process_text(model, tokenizer, [text], prompt)[0]
    pprint(result2)

    # merge_results requires the r1_delta threshold; calling it with two
    # arguments raises TypeError. 0.999 is the value used by the evaluation
    # cell that calls union_pred.
    result12 = merge_results(result1, result2, r1_delta=0.999)
    print(result12)

    # Gold standard: names of the labels flagged 1 in y. The comprehension
    # keeps its own index so the outer sample counter `idx` is not overwritten.
    labels = [dieases[pos] for pos, flag in enumerate(y) if flag == 1]
    print(labels)

    print('')

# # print(pred)
# for r, y in zip(pred, y_test):
#     s = ''
#     for r_ in r:
#         s_item = r_['name'] + '\t' + str(r_['prob'])
#         s = s + ',' + s_item if s != '' else s_item
#     print(dieases[y] + '\t' + s)
# # test_and_save(pred, y_test, ids_test, '人机_全特征')

1
{'肠梗阻': 0.9999999659165436}
{'肠梗阻': 0.9993325471878052}
['肠梗阻']
['肠梗阻']

2
{'消化道穿孔': 0.9039818226991675}
{'消化道穿孔': 0.9988742470741272}
['消化道穿孔']
['消化道穿孔']

3
{'上尿路结石': 0.9999990173352235}
{'上尿路结石': 0.9961837530136108}
['上尿路结石']
['上尿路结石']

4
{'肠梗阻': 0.9999987672145347}
{'肠梗阻': 0.996227502822876}
['肠梗阻']
['肠梗阻']

5
{'急性胰腺炎': 0.9999996776973803}
{'急性胆囊炎': 0.9681075811386108,
 '急性胆管炎': 0.7468574047088623,
 '急性胰腺炎': 0.9958971738815308}
['急性胰腺炎']
['急性胰腺炎']

6
{'急性胆管炎': 0.9999943033888808}
{'急性胆囊炎': 0.9890077710151672, '急性胆管炎': 0.7449440956115723}
['急性胆管炎']
['急性胆管炎']

7
{'上尿路结石': 0.9999900399410319}
{'上尿路结石': 0.9985754489898682}
['上尿路结石']
['上尿路结石']

8
{'卵巢囊肿': 0.5681908988861422}
{'急性阑尾炎': 0.9826538562774658}
['卵巢囊肿', '急性阑尾炎']
['卵巢囊肿']

9
{'急性胰腺炎': 0.9799711337775067}
{'急性胆囊炎': 0.6619626879692078,
 '急性胆管炎': 0.9819936752319336,
 '急性胰腺炎': 0.9957678318023682}
['急性胰腺炎', '急性胆管炎']
['急性胆管炎']

10
{'急性胰腺炎': 0.9999999995637072}
{'急性胰腺炎': 0.9991081357002258}
['急性胰腺炎']
['急性胰腺炎']

11
{'急性阑尾炎': 0.9951487

{'上尿路结石': 0.9987021684646606}
['上尿路结石']
['上尿路结石']

93
{'上尿路结石': 0.9999999873962526}
{'上尿路结石': 0.9976317286491394}
['上尿路结石']
['上尿路结石']

94
{'急性胆管炎': 0.9999998239779972}
{'急性胆管炎': 0.998458981513977}
['急性胆管炎']
['急性胆管炎']

95
{'上尿路结石': 0.9999996349339133}
{'上尿路结石': 0.9950928092002869}
['上尿路结石']
['上尿路结石']

96
{'上尿路结石': 0.9999999965819537}
{'上尿路结石': 0.9985950589179993}
['上尿路结石']
['上尿路结石']

97
{'上尿路结石': 0.9999999641471333}
{'上尿路结石': 0.9983420372009277}
['上尿路结石']
['上尿路结石']

98
{'急性胆囊炎': 0.9999971288566943}
{'急性胆囊炎': 0.9934564828872681}
['急性胆囊炎']
['急性胆囊炎']

99
{'上尿路结石': 0.9999999741897593}
{'上尿路结石': 0.9983702898025513}
['上尿路结石']
['上尿路结石']

100
{'上尿路结石': 0.9999938294821048}
{'上尿路结石': 0.9978406429290771}
['上尿路结石']
['上尿路结石']

101
{'上尿路结石': 0.999999998201579}
{'上尿路结石': 0.9985671639442444}
['上尿路结石']
['上尿路结石']

102
{'卵巢囊肿': 0.9972872014935007}
{'卵巢囊肿': 0.9681112170219421}
['卵巢囊肿']
['卵巢囊肿']

103
{'急性胆囊炎': 0.9999856011941085}
{'急性胆囊炎': 0.9963624477386475}
['急性胆囊炎']
['急性胆囊炎']

104
{'急性胰腺炎': 0.99796893796