In [28]:
import os
import datasets
import evaluate
import numpy as np
import pandas as pd
from transformers import AutoModelForSequenceClassification, BertTokenizerFast
from transformers import TrainingArguments, Trainer

In [29]:
# hyperparameter

OUTPUT_PATH = "./checkpoints"
MODEL_PATH = "./bert-base-chinese"
LEARNING_RATE = 2e-5
BATCH_SIZE = 20
EPOCH = 5
WEIGHT_DECAY = 0.02
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [30]:
# functions

def Map_Process_dataset2tensor(dataset):
    return tokenizer(dataset['text_a'], padding='max_length', truncation = True)


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


# load model

model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH, num_labels=2)
tokenizer = BertTokenizerFast.from_pretrained(MODEL_PATH)
tokenizer.model_max_length = 200
# read data
train_data = pd.read_csv('data/train.tsv', sep='\t')
valid_data = pd.read_csv('data/dev.tsv', sep='\t')
test_data = pd.read_csv('data/test.tsv', sep='\t')
train_ds = datasets.Dataset.from_pandas(train_data)
dev_ds = datasets.Dataset.from_pandas(valid_data)
test_ds = datasets.Dataset.from_pandas(test_data)

# preprocess data
train_ds.rename_column('label', 'labels')
dev_ds.rename_column('label', 'labels')
test_ds.rename_column('label', 'labels')
ds = datasets.DatasetDict({
    "train": train_ds,
    "dev": dev_ds,
    "test": test_ds
})
tkz_ds = ds.map(Map_Process_dataset2tensor, batched=True)

pt_ds = tkz_ds.with_format('torch')

# prepare fine-tuning

metric = evaluate.load("accuracy")

if not os.path.exists(OUTPUT_PATH):
    os.mkdir(OUTPUT_PATH)

training_args = TrainingArguments(
    output_dir=OUTPUT_PATH,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCH,
    weight_decay=WEIGHT_DECAY,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)
task_evaluator = evaluate.evaluator("text-classification")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset = pt_ds['train'],
    eval_dataset = pt_ds['dev'],
    tokenizer = tokenizer,
    compute_metrics = compute_metrics,
)


Some weights of the model checkpoint at ./bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint 

In [None]:
# train
trainer.train()

# test part

In [31]:
# load model
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from sklearn.metrics import classification_report
model = AutoModelForSequenceClassification.from_pretrained("checkpoints/checkpoint_best")
tokenizer = AutoTokenizer.from_pretrained("checkpoints/checkpoint_best")
tokenizer.model_max_length = 200
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

# receive input function

def Infer(input_review):
    encoded_input = tokenizer.encode(input_review, return_tensors='pt',max_length =200)
    encoded_input = encoded_input.to(device)
    model.to(device)
    result = model(encoded_input)['logits']
    label = torch.argmax(result, dim=-1)
    return int(label)

# define function to evaluate the loaded model

def evaluate_model(model, tokenizer, dataset):
    ds = dataset.map(Map_Process_dataset2tensor, batched=True)
    pt_ds = ds.with_format('torch')
    trainer = Trainer(
        model=model,
        args=training_args,
        eval_dataset = pt_ds['test'],
        tokenizer = tokenizer,
        compute_metrics = compute_metrics,
    )
    result = trainer.evaluate()
    return result

cuda:0


In [32]:
# evaluate the loaded model
evaluate_model(model, tokenizer, ds)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 60/60 [01:28<00:00,  1.48s/it]


{'eval_loss': 0.15233935415744781,
 'eval_accuracy': 0.9541666666666667,
 'eval_runtime': 90.0711,
 'eval_samples_per_second': 13.323,
 'eval_steps_per_second': 0.666}

In [33]:
result_list = []
label_list = []

x_test, y_test = pt_ds['test']['text_a'], pt_ds['test']['label']

In [34]:
for i in range(len(x_test)):
    result = Infer(x_test[i])
    result_list.append(result)
    label_list.append(y_test[i])
    if (i+1) % 100 == 0:
        print("已完成{}条".format(i+1))

from sklearn.metrics import accuracy_score
accuracy_score(label_list, result_list)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


已完成100条
已完成200条
已完成300条
已完成400条
已完成500条
已完成600条
已完成700条
已完成800条
已完成900条
已完成1000条
已完成1100条
已完成1200条


0.9541666666666667

In [35]:
print(classification_report(label_list, result_list, digits=4))

              precision    recall  f1-score   support

           0     0.9559    0.9510    0.9534       592
           1     0.9525    0.9572    0.9549       608

    accuracy                         0.9542      1200
   macro avg     0.9542    0.9541    0.9542      1200
weighted avg     0.9542    0.9542    0.9542      1200



# 实现classification_report

In [36]:
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score

def evaluate_result(result_list, label_list):
    acc = accuracy_score(label_list, result_list)
    f1_macro = f1_score(label_list, result_list, average='macro')
    recall_macro = recall_score(label_list, result_list, average='macro')
    precision_macro = precision_score(label_list, result_list, average='macro')
    f1_weighted = f1_score(label_list, result_list, average='weighted')
    recall_weighted = recall_score(label_list, result_list, average='weighted')
    precision_weighted = precision_score(label_list, result_list, average='weighted')

    TP = 0
    TN = 0
    FP = 0
    FN = 0
    for i in range(len(result_list)):
        if result_list[i] == 1 and label_list[i] == 1:
            TP += 1
        elif result_list[i] == 0 and label_list[i] == 0:
            TN += 1
        elif result_list[i] == 1 and label_list[i] == 0:
            FP += 1
        elif result_list[i] == 0 and label_list[i] == 1:
            FN += 1

    # calculate precision, recall, f1-score, support of each class
    precision_0 = TN / (TN + FP)
    recall_0 = TN / (TN + FN)
    f1_0 = 2 * precision_0 * recall_0 / (precision_0 + recall_0)
    support_0 = TN + FN

    precision_1 = TP / (TP + FN)
    recall_1 = TP / (TP + FP)
    f1_1 = 2 * precision_1 * recall_1 / (precision_1 + recall_1)
    support_1 = TP + FP
    
    # print result like this
    #         precision   recall   f1-score   support

    # 负面       0.73      0.92      0.81      2444
    # 正面       0.96      0.84      0.90      5322

    # accuracy                       0.87      7766
    # macro avg     0.84      0.88      0.85      7766
    # weighted avg  0.89      0.87      0.87      7766

    print("\t\tprecision\trecall\t\tf1-score\tsupport")
    print("负面\t\t%.5f\t\t%.5f\t\t%.5f\t\t%d" % (precision_0, recall_0, f1_0, support_0))
    print("正面\t\t%.5f\t\t%.5f\t\t%.5f\t\t%d" % (precision_1, recall_1, f1_1, support_1))
    print("\naccuracy\t\t\t\t\t%.5f\t\t%d" % (acc,len(result_list)))
    print("macro avg\t%.5f\t\t%.5f\t\t%.5f\t\t%d" % (precision_macro, recall_macro, f1_macro, len(result_list)))
    print("weighted avg\t%.5f\t\t%.5f\t\t%.5f\t\t%d" % (precision_weighted, recall_weighted, f1_weighted, len(result_list)))

In [37]:
evaluate_result(result_list, label_list)

		precision	recall		f1-score	support
负面		0.95101		0.95586		0.95343		589
正面		0.95724		0.95254		0.95488		611

accuracy					0.95417		1200
macro avg	0.95420		0.95413		0.95416		1200
weighted avg	0.95417		0.95417		0.95416		1200
