In [1]:
!pip install transformers
!pip install ipywidgets

Collecting transformers
  Downloading transformers-4.13.0-py3-none-any.whl (3.3 MB)
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp38-cp38-win_amd64.whl (2.0 MB)
Installing collected packages: tokenizers, sacremoses, huggingface-hub, transformers
Successfully installed huggingface-hub-0.2.1 sacremoses-0.0.46 tokenizers-0.10.3 transformers-4.13.0


In [2]:
# Configure logging (only report errors)
import logging
logging.basicConfig(level=logging.ERROR)
# Import TFBertForSequenceClassification and BertTokenizer from transformers
from transformers import TFBertForSequenceClassification,BertTokenizer
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


ModuleNotFoundError: No module named 'tensorflow'

In [3]:
def convert_example_to_feature(review):
    """Tokenize one text into fixed-length BERT inputs.

    Relies on the notebook-level globals ``tokenizer`` and ``max_length``,
    which must be defined before this function is called.

    Args:
        review: The text to encode.

    Returns:
        The result of ``tokenizer.encode_plus`` for ``review``. Callers in this
        notebook read its ``input_ids``, ``token_type_ids`` and
        ``attention_mask`` entries.
    """
    return tokenizer.encode_plus(
        review,
        add_special_tokens=True,  # add [CLS] and [SEP]
        max_length=max_length,  # maximum sequence length fed to BERT
        # `pad_to_max_length=True` is deprecated; `padding="max_length"` is its
        # documented replacement and pads with [PAD] up to `max_length`.
        padding="max_length",
        truncation=True,  # cut texts longer than `max_length`
        return_attention_mask=True,
        # requested explicitly because encode_examples reads this key
        return_token_type_ids=True,
    )

def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):
    """Pack one example's inputs into a ``(features, label)`` pair.

    Args:
        input_ids: Stored under the ``"input_ids"`` key.
        attention_masks: Stored under the ``"attention_mask"`` key.
        token_type_ids: Stored under the ``"token_type_ids"`` key.
        label: Returned unchanged as the second element.

    Returns:
        A 2-tuple of the feature dictionary and ``label``.
    """
    features = dict(
        input_ids=input_ids,
        token_type_ids=token_type_ids,
        attention_mask=attention_masks,
    )
    return features, label


In [13]:
def encode_examples(ds, limit=-1):
    """Build the dataset the model consumes from a table of examples.

    Each text in the ``"Abstract"`` column is tokenized with
    ``convert_example_to_feature`` and paired with the value in the ``"y"``
    column.

    Args:
        ds: pandas DataFrame with ``"Abstract"`` (text) and ``"y"`` (label)
            columns.
        limit: If greater than 0, only the first ``limit`` rows are encoded;
            otherwise every row is used.

    Returns:
        A ``tf.data.Dataset`` built with ``from_tensor_slices`` and mapped
        through ``map_example_to_dict``, so each element is a
        ``(features_dict, label)`` pair.
    """
    if limit > 0:
        # DataFrame.take() selects rows by an array of positions, not by a row
        # count (unlike tf.data.Dataset.take); head() keeps the first `limit` rows.
        ds = ds.head(limit)

    input_ids_list = []
    token_type_ids_list = []
    attention_mask_list = []
    label_list = []

    # Iterate over the two needed columns directly instead of iterrows(),
    # which builds a Series for every row.
    for review, label in zip(ds["Abstract"], ds["y"]):
        bert_input = convert_example_to_feature(review)

        input_ids_list.append(bert_input['input_ids'])
        token_type_ids_list.append(bert_input['token_type_ids'])
        attention_mask_list.append(bert_input['attention_mask'])
        label_list.append([label])  # each label is wrapped in a one-element list

    # The tuple order must match the parameter order of map_example_to_dict.
    dataset = tf.data.Dataset.from_tensor_slices(
        (input_ids_list, attention_mask_list, token_type_ids_list, label_list)
    )
    return dataset.map(map_example_to_dict)


In [14]:
def split_dataset(df, test_size=0.2, random_state=42, stratify_col="Type"):
    """Split a table into stratified train and test subsets.

    Args:
        df: Table to split; must contain the ``stratify_col`` column.
        test_size: Passed to ``train_test_split`` as ``test_size``.
        random_state: Passed to ``train_test_split`` as ``random_state`` so
            the split is repeatable.
        stratify_col: Name of the column whose values are passed to
            ``train_test_split`` as ``stratify``.

    Returns:
        A ``(train_set, test_set)`` tuple.
    """
    train_set, test_set = train_test_split(
        df,
        stratify=df[stratify_col],
        test_size=test_size,
        random_state=random_state,
    )
    return train_set, test_set


In [16]:
if __name__ == '__main__':

    # Parameters
    data_path = "Leukemia.csv"  # path to the data file
    model_path = "bert-base-uncased"  # model name or path; downloading it beforehand is recommended

    max_length = 64
    batch_size = 64
    learning_rate = 2e-5
    number_of_epochs = 10
    class_names = ["AML", "ALL", "CML", "CLL"]  # position in this list is the integer label
    num_classes = len(class_names)  # number of classes, derived so it cannot drift from the list

    # Load the data
    df_raw = pd.read_csv(data_path)
    abstract = df_raw['Abstract']
    label = df_raw['Type']

    # Convert the class names to integer labels (new column "y")
    df_label = pd.DataFrame({"Type": class_names, "y": list(range(num_classes))})
    df_raw = pd.merge(df_raw, df_label, on="Type", how="left")
    # A left merge leaves NaN in "y" for any Type missing from class_names;
    # fail loudly instead of training on NaN labels.
    unknown_types = df_raw.loc[df_raw["y"].isna(), "Type"].unique()
    if len(unknown_types) > 0:
        raise ValueError(f"Unknown values in 'Type' column: {list(unknown_types)}")

    # Split the data
    train_data, test_data = split_dataset(df_raw)

    print(len(train_data))
    print(len(test_data))
    print(test_data.keys())

    # Tokenizer (read as a global by convert_example_to_feature)
    tokenizer = BertTokenizer.from_pretrained(model_path)
    # Training dataset. The shuffle buffer covers the whole training set; a
    # buffer smaller than the dataset only shuffles within a sliding window.
    ds_train_encoded = encode_examples(train_data).shuffle(len(train_data)).batch(batch_size)

    # Test dataset (not shuffled, so predictions stay aligned with test_data rows)
    ds_test_encoded = encode_examples(test_data).batch(batch_size)

    # Initialize the model
    model = TFBertForSequenceClassification.from_pretrained(model_path, num_labels=num_classes)

    # Use Adam as the optimizer
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08, clipnorm=1)
    # Use cross-entropy as the loss; from_logits=True means the loss is fed raw logits
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
    model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
    # Train the model
    bert_history = model.fit(ds_train_encoded, epochs=number_of_epochs, validation_data=ds_test_encoded)
    # Evaluate on the test set
    print("# evaluate test_set:", model.evaluate(ds_test_encoded))
    

160
40
Index(['Type', 'Authors', 'Title', 'Year', 'Electronic-resource-num',
       'Abstract', 'Periodical', 'Paper_id', 'y'],
      dtype='object')


All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
# evaluate test_set: [0.9307827949523926, 0.7250000238418579]


In [17]:
from sklearn.metrics import confusion_matrix

# Get the model's predictions for the test set
res = model.predict(ds_test_encoded)
pre = np.argmax(res["logits"], axis=1)  # index of the highest logit per example
true = test_data['y']
# Build the confusion matrix (rows: true class, columns: predicted class).
# Passing labels explicitly keeps the matrix num_classes x num_classes even
# when a class is absent from both the true and predicted labels.
cf = confusion_matrix(true, pre, labels=list(range(num_classes)))
print(cf)

[[8 0 0 2]
 [0 8 1 1]
 [6 0 4 0]
 [1 0 0 9]]
