In [3]:
import time
import torch
import argparse
import numpy as np
import torch.nn as nn
from tqdm.auto import tqdm
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


In [5]:
class BertClassifier(nn.Module):
    def __init__(self,
                 pretrained: str,
                 num_classes=3,
                 pooling_output_layer=-1):
        super(BertClassifier, self).__init__()
        D_in, H, D_out = 768, 768, num_classes
        self.bert = BertModel.from_pretrained(pretrained)
        self.classifier = nn.Sequential(nn.Linear(D_in, H), nn.Tanh(), nn.Linear(H, D_out))
        self.dropout = nn.Dropout(0.1)
        self.pooling_output_layer = pooling_output_layer

    def forward(self, input_ids, attention_mask, output_hidden_states=True):
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask,
                            output_hidden_states=output_hidden_states)
        sentence_embeddings = outputs[1]
        sentence_embeddings = self.dropout(sentence_embeddings)
        logits = self.classifier(sentence_embeddings.to(device))
        return logits

In [13]:
save_model_path="save_model"
final_model_path="final_model"
max_sequence_length=128
batch_size=64
epochs=3,
warmup_steps=2000
lr=3e-5
max_grad_norm=1.0
log_step=100

In [14]:
class MyDataset(Dataset):
    def __init__(self, text_list, label_list, tokenizer, max_sequence_len):
        self.input_ids = []
        self.token_type_ids = []
        self.attention_mask = []
        self.label_list = label_list
        self.len = len(label_list)
        for text in tqdm(text_list):
            text = text[:max_sequence_len - 2]
            title_ids = tokenizer.encode_plus(text, padding='max_length', max_length=max_sequence_len)
            self.input_ids.append(title_ids['input_ids'])
            self.attention_mask.append(title_ids["attention_mask"])

    def __getitem__(self, index):
        tmp_input_ids = self.input_ids[index]
        tmp_attention_mask = self.attention_mask[index]
        tmp_label = self.label_list[index]
        output = {"input_ids": torch.tensor(tmp_input_ids).to(device),
                  "attention_mask": torch.tensor(tmp_attention_mask).to(device)}
        return output, tmp_label

    def __len__(self):
        return self.len

In [15]:
def data_loader(x_list, y_list, tokenizer, max_sequence_len, batch_size, shuffle):
    dataset = MyDataset(x_list, y_list, tokenizer, max_sequence_len)
    dataloader = DataLoader(dataset=dataset,
                            batch_size=batch_size,
                            shuffle=shuffle)
    return dataloader

In [16]:
def load_model(num_labels):
    model = BertClassifier("bert-base-uncased", num_labels)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    return model, tokenizer

In [17]:
def compute_acc(logits, label):
    predicted_class_id = torch.tensor([w.argmax().item() for w in logits])
    return float((predicted_class_id == label).float().sum()) / label.shape[0]

In [18]:
import pandas as pd
label_dict = {1: 0, 2: 0, 3: 1, 4: 2, 5: 2}
def load_raw_data(data_path):
    # 读取原始数据，原始数据很多，抽取前5万行训练，5000行验证
    train_x, train_y = [], []
    eval_x, eval_y = [], []
    df = pd.read_csv(data_path)
    for idx in range(len(df)):
        if idx < 1300000:
            train_x.append(df['review_text'][idx])
            train_y.append(label_dict[int(df['rating'][idx])])
        else:
            eval_x.append(df['review_text'][idx])
            eval_y.append(label_dict[int(df['rating'][idx])])
    print(len(train_x), len(train_y))
    print(len(eval_x), len(eval_y))
    return train_x, train_y, eval_x, eval_y

In [19]:
def train(args, model, dataloader, device):
    num_training_steps = epochs * len(dataloader)
    optimizer = Adam(model.parameters(), lr=lr)
    model.to(device)
    model.train()
    batch_steps = 0
    loss_fct = nn.CrossEntropyLoss()
    for epoch in range(epochs):
        for batch, label in dataloader:
            batch_steps += 1
            logits = model(**batch)
            acc = compute_acc(logits, label)
            loss = loss_fct(logits.view(-1, 2).to(device), label.view(-1).to(device))
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            optimizer.zero_grad()
            if batch_steps % log_step == 0:
                print("train epoch {}/{}, batch {}/{}, loss {}, acc {}".format(
                    epoch + 1, args.epochs,
                    batch_steps,
                    num_training_steps,
                    loss,
                    acc))
    torch.save(model, 'model_final.pth')

In [22]:
model, tokenizer = load_model(num_labels=3)
train_x, train_y, eval_x, eval_y = load_raw_data("./data/review.csv")
print("训练数据", len(train_x))
print("验证数据", len(eval_x))
train_dataloader = data_loader(train_x,
                               train_y,
                               tokenizer,
                               max_sequence_length,
                               batch_size,
                               True)
train(args, model, train_dataloader, device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


1300000 1300000
187747 187747
训练数据 1300000
验证数据 187747


NameError: name 'args' is not defined