In [16]:
from simpletransformers.classification import (
    MultiLabelClassificationModel, MultiLabelClassificationArgs
)
import pandas as pd
import numpy
import logging
from sklearn.model_selection import train_test_split

# Keep the "transformers" logger at WARNING and above, while everything else
# logs from INFO upwards through the root configuration.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
logging.basicConfig(level=logging.INFO)

In [30]:
# Load the spreadsheet into pandas
df = pd.read_excel("./data/信立泰 高血压-诊断定义.xlsx")
# Strip every character outside the range U+4E00..U+9FA5 from all string cells
df = df.replace(r'[^\u4e00-\u9fa5]', "", regex=True)
# Assign the filled column back instead of calling fillna(inplace=True) on the
# `df['原始诊断']` selection: the chained in-place form operates on a temporary
# object, which pandas warns about and which leaves `df` unchanged under
# Copy-on-Write.
df['原始诊断'] = df['原始诊断'].fillna("")

In [42]:
# Build the training frame

# Keep only the rows whose "高血压" cell is filled in, i.e. rows that were labelled
labeled_rows = df.loc[df["高血压"].notna()]
# Everything from the fourth column onwards is used as a label column
df_label = labeled_rows.iloc[:, 3:].astype("int")
train_df = pd.DataFrame(
    {
        "text": labeled_rows["原始诊断"],
        # collapse the label columns into a single list per row
        "labels": df_label.values.tolist(),
    }
)
# Number of positive rows per label
df_label.sum()

高血压     194
冠心病     126
糖尿病      54
血脂异常     59
卒中       42
慢性肾病     55
心力衰竭     27
高尿酸      24
dtype: int64

In [19]:
# Split the labelled rows into a training part and a 20% evaluation part,
# with a fixed seed.
# NOTE(review): train_df is both the input and the output of this cell, so
# running the cell a second time splits the already reduced training set again.
train_df, eval_df = train_test_split(
    train_df,
    test_size=0.2,
    random_state=1,
)
(train_df, eval_df)

(                          text                    labels
 93                          心悸  [0, 0, 0, 0, 0, 0, 0, 0]
 23                     不稳定性心绞痛  [0, 1, 0, 0, 0, 0, 0, 0]
 42081  头晕高血压病级极高危房性期前收缩房性早搏脑梗死  [1, 0, 0, 0, 1, 0, 0, 0]
 13                     高血压高脂血症  [1, 0, 0, 1, 0, 0, 0, 0]
 90                          胸闷  [0, 0, 0, 0, 0, 0, 0, 0]
 ...                        ...                       ...
 4206        冠状动脉粥样硬化性心脏病原发性高血压  [1, 1, 0, 0, 0, 0, 0, 0]
 72                         脑出血  [0, 0, 0, 0, 1, 0, 0, 0]
 42178          慢性鼻炎高血压呼吸道感染关节炎  [1, 0, 0, 0, 0, 0, 0, 0]
 884                   高血压病心房颤动  [1, 0, 0, 0, 0, 0, 0, 0]
 37                        高血压级  [1, 0, 0, 0, 0, 0, 0, 0]
 
 [320 rows x 2 columns],
                                  text                    labels
 42180                 冠心病支架术后高血压房颤糖尿病  [1, 1, 1, 0, 0, 0, 0, 0]
 125          肾移植术后高血压骨质疏松粒细胞减少贫血肝损害胃炎  [1, 0, 0, 0, 0, 1, 0, 0]
 42110                    冠心病高血压病前列腺增生  [1, 1, 0, 0, 0, 0, 0, 0]
 42121           

In [20]:
# Optional model arguments: 50 training epochs, and allow the output
# directory to be overwritten.
training_options = {
    "num_train_epochs": 50,
    "overwrite_output_dir": True,
}
model_args = MultiLabelClassificationArgs(**training_options)

In [32]:
# Create the multi-label classifier (8 labels) from the weights in "./outputs"
# NOTE(review): "./outputs" looks like a previously fine-tuned checkpoint
# directory rather than a stock pretrained model - confirm that it exists
# before running this cell on a fresh checkout.
model = MultiLabelClassificationModel(
    model_type="bert",
    model_name="./outputs",
    num_labels=8,
    args=model_args,
)

In [None]:
# Fine-tune the model on the training split
model.train_model(train_df=train_df)

In [33]:
# Evaluate the model on the held-out split and show the metrics
eval_outputs = model.eval_model(eval_df)
result, model_outputs, wrong_predictions = eval_outputs
result


INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_128_0_80
Running Evaluation: 100%|██████████| 10/10 [00:00<00:00, 40.29it/s]


{'LRAP': 0.9655208333333333, 'eval_loss': 0.13957007676362992}

In [43]:
# Spot-check the model on rows that have no human label
df_nolabel = df[df["高血压"].isna()]  # rows whose "高血压" cell is empty
# Draw at most 10 diagnosis texts. n is capped so that fewer than 10 unlabelled
# rows does not raise, and the seed is fixed so the same rows are drawn each run.
pred_text = (
    df_nolabel["原始诊断"]
    .sample(n=min(10, len(df_nolabel)), random_state=1)
    .tolist()
)
if pred_text:
    predictions, raw_outputs = model.predict(pred_text)
else:
    # Nothing to predict: skip the model call entirely
    predictions, raw_outputs = [], []

# Replace every predicted 1 with the name of its label column; other entries
# are kept as they are. A new list is built instead of mutating the model
# output in place.
labels = df_label.columns.values.tolist()
predictions = [
    [labels[i] if flag == 1 else flag for i, flag in enumerate(pred)]
    for pred in predictions
]

pred_text, predictions

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_128_0_10
100%|██████████| 2/2 [00:00<00:00, 37.84it/s]


(['便秘反流性食管炎高血压关节痛过敏性鼻炎变应性鼻炎消化性溃疡行动不便动脉粥样硬化并高脂血症高胆固醇血症冠心病失眠',
  '糖尿病慢性病高血压慢性病胸痛',
  '高甘油三酯血症高尿酸血症高血压肾炎',
  '慢性心功能不全居民',
  '二尖瓣疾病',
  '冠心病高血压便秘',
  '动脉粥样硬化并高脂血症高血压颈动脉狭窄麻木脑供血不全头晕晕动症',
  '高血压病前列腺增生骨痛高脂血症高血压',
  '腕和手损伤',
  '冠状动脉粥样硬化性心脏病型糖尿病伴有并发症'],
    高血压  冠心病  糖尿病  血脂异常   卒中  慢性肾病  心力衰竭  高尿酸
 0  高血压  冠心病  NaN  血脂异常  NaN   NaN   NaN  NaN
 1  高血压  NaN  糖尿病   NaN  NaN   NaN   NaN  NaN
 2  高血压  NaN  NaN  血脂异常  NaN   NaN   NaN  高尿酸
 3  NaN  冠心病  NaN   NaN  NaN   NaN   NaN  NaN
 4  NaN  NaN  NaN   NaN  NaN   NaN   NaN  NaN
 5  高血压  冠心病  NaN   NaN  NaN   NaN   NaN  NaN
 6  高血压  冠心病  NaN  血脂异常   卒中   NaN   NaN  NaN
 7  高血压  NaN  NaN  血脂异常  NaN   NaN   NaN  NaN
 8  NaN  NaN  NaN   NaN  NaN   NaN   NaN  NaN
 9  NaN  冠心病  糖尿病   NaN  NaN   NaN   NaN  NaN)

In [None]:
# Label the first 1000 rows: keep the human labels where they exist and use
# the model for the remaining rows.
labels = df_label.columns.values.tolist()
df_head = df.head(1000)

# Only rows with a diagnosis text but without a human label go to the model.
needs_model = (df_head["原始诊断"].notna() & df_head["高血压"].isna()).tolist()
batch_text = [
    text for text, flag in zip(df_head["原始诊断"].tolist(), needs_model) if flag
]
# One batched predict() call instead of one call per row.
if batch_text:
    predictions, raw_outputs = model.predict(batch_text)
else:
    predictions, raw_outputs = [], []
# Predictions come back in the order of batch_text, so they are consumed
# sequentially while walking the rows below.
model_rows = iter(predictions)

list_info = []
list_pred = []
for (index, row), use_model in zip(df_head.iterrows(), needs_model):
    if pd.isna(row["原始诊断"]):  # missing diagnosis: every label is 0
        list_pred.append([0] * len(labels))
    elif use_model:  # no human label: take the model output for this row
        list_pred.append(next(model_rows))
    else:  # otherwise keep the human label
        list_pred.append(row[3:].tolist())
    list_info.append(row[:3])

df_info = pd.DataFrame(list_info, columns=["序号", "原始诊断", "处方张数"])
# Reuse df_info's index so that concat pairs the rows up by position even when
# df's index is not 0..n-1.
df_pred = pd.DataFrame(list_pred, columns=labels, index=df_info.index)
df_combined = pd.concat([df_info, df_pred], axis=1)
df_combined

In [None]:
# Run the model over every diagnosis text in the data set
# NOTE(review): every row is sent to the model, including rows that already
# carry a human label - confirm that discarding those labels is intended.
pred_text = df["原始诊断"].tolist()
predictions, raw_outputs = model.predict(pred_text)
# Show only a short preview; echoing the complete lists would write one entry
# per input row into the notebook output.
pred_text[:5], predictions[:5]

In [41]:
# Put the first three columns of df next to the predicted labels and export
# the result as a CSV file (UTF-8 with BOM).
labels = df_label.columns.tolist()
df_pred = pd.DataFrame(data=predictions, columns=labels)
df_info = df.iloc[:, :3]
df_combined = pd.concat(objs=[df_info, df_pred], axis=1)
df_combined.to_csv(
    "./labeled_data.csv",
    index=False,
    encoding="utf_8_sig",
)
df_combined

Unnamed: 0,序号,原始诊断,处方张数,高血压,冠心病,糖尿病,血脂异常,卒中,慢性肾病,心力衰竭,高尿酸
0,1,高血压,81364,1,0,0,0,0,0,0,0
1,2,无诊断,57580,0,0,0,0,0,0,0,0
2,3,高血压病,31515,1,0,0,0,0,0,0,0
3,4,冠状动脉粥样硬化性心脏病,26092,0,1,0,0,0,0,0,0
4,5,冠心病,8964,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
183247,123601,房缺高血压,1,1,0,0,0,0,0,0,0
183248,176201,咳嗽脑动脉血栓形成引起的脑梗死,1,0,0,0,0,1,0,0,0
183249,119590,动脉粥样硬化并高脂血症反复发作低血糖高血压冠心病糖尿病硒缺乏叶酸缺乏,1,1,1,1,1,0,0,0,0
183250,108127,冠状动脉粥样硬化性心脏病不稳定型心绞痛慢性支气管炎急性加重期重症肺炎型呼吸衰竭肺栓塞,1,0,1,0,0,0,0,1,0
