In [4]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW
from torch.utils.data import DataLoader, Dataset
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
import pandas as pd

In [5]:
# OPTION
####################################################
# User-tunable settings; every name below is read by later cells.

# CSV loaded with pd.read_csv; later code accesses a `label` column on it.
data_path = 'data/weibo_senti_100k.csv'
# Label id -> display name ('负向' = negative, '正向' = positive).
# len(moods) is also used as the classifier's num_labels.
moods = {0: '负向', 1: '正向'}

# Directory passed to from_pretrained() for both the tokenizer and the model.
param_path = './bert-base-chinese'
# Directory the fine-tuned model is written to via save_pretrained() after each epoch.
save_path = './bert-param'
# Device string; converted to a torch.device before the model is built.
device = 'cpu'
# Learning rate handed to the optimizer.
lr = 1e-5
# Number of training epochs (passed to train() as num_epochs).
epoch = 10
# Batch size used by both the training and the evaluation DataLoader.
batch_size = 128


In [6]:
# BEGIN
####################################################
# Read the corpus and report the total row count, then one count per class.
pd_all = pd.read_csv(data_path)
print('数目（总体）：%d' % len(pd_all))

for label, mood in moods.items():
    # Summing the boolean mask counts the rows carrying this label.
    print('数目（{}): {}'.format(mood, int((pd_all.label == label).sum())))


# Turn the configured device string into a torch.device, then load the
# tokenizer and a classification model with one output per entry in `moods`.
device = torch.device(device)
model_name = param_path
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(moods),
)
model = model.to(device)


class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup.

    Args:
        texts: Indexable, sized collection of raw strings.
        labels: Indexable, sized collection of labels aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, padding='max_length',
            truncation=True, max_length=..., return_tensors='pt')`` that returns
            a mapping containing ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length each sequence is padded / truncated to. Defaults to
            128, the value that used to be hard-coded.

    Raises:
        ValueError: If ``texts`` and ``labels`` have different lengths.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Fail early: a length mismatch would otherwise only surface as an
        # IndexError (or silently ignored labels) in the middle of an epoch.
        if len(texts) != len(labels):
            raise ValueError(
                'texts and labels must have the same length, got {} and {}'.format(
                    len(texts), len(labels)
                )
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return it as a dict of tensors.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (the tokenizer
            outputs flattened to 1-D) and ``'labels'`` (the label as a tensor).
        """
        encoding = self.tokenizer(
            self.texts[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )
        item = {
            # Flatten so the DataLoader stacks samples into a 2-D batch rather
            # than keeping the tokenizer's extra leading dimension.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[idx]),
        }
        return item
    
    
def test(model, testloader):
    """Evaluate ``model`` on ``testloader`` and print the metrics.

    Prints the accuracy followed by sklearn's classification report.
    Tensors are moved to the module-level ``device`` before the forward pass.

    Args:
        model: Module whose forward output exposes ``.logits``.
        testloader: Iterable of batches, each a dict with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.

    The model's train/eval mode is restored on exit, so calling this between
    training epochs does not leave dropout disabled for the next epoch.
    """
    was_training = model.training
    model.eval()
    y = []
    y_pred = []
    # No gradients are needed for evaluation; skipping autograd bookkeeping
    # saves memory and time.
    with torch.no_grad():
        # Wrap the loader itself (not iter(loader)) so tqdm can show a total.
        for batch in tqdm(testloader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels']

            # Labels are not passed: only the logits are used here, so there
            # is no reason to make the model compute a loss.
            output = model(input_ids, attention_mask=attention_mask)
            predicted_label = torch.argmax(output.logits, dim=1).tolist()

            y += labels.tolist()
            y_pred += predicted_label

    # Put the model back into whatever mode the caller had it in.
    model.train(was_training)

    accuracy = accuracy_score(y, y_pred)
    print(f"Accuracy: {accuracy}")
    print(classification_report(y, y_pred))

        
        
    

def train(model, dataloader, testloader, num_epochs=10, lr=1e-5):
    """Fine-tune ``model``, evaluating and saving a checkpoint after every epoch.

    Args:
        model: Module whose forward output exposes ``.loss`` when called with
            ``labels`` and which supports ``save_pretrained``.
        dataloader: Iterable of training batches (dicts with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors).
        testloader: Iterable of evaluation batches, forwarded to ``test``.
        num_epochs: Number of passes over ``dataloader``.
        lr: Learning rate for the AdamW optimizer.

    Tensors are moved to the module-level ``device``; checkpoints are written
    to the module-level ``save_path``.
    """
    # NOTE(review): transformers' AdamW is reported as deprecated in recent
    # releases; consider torch.optim.AdamW - confirm against the installed version.
    optimizer = AdamW(model.parameters(), lr=lr)
    for epoch in range(num_epochs):
        # Re-enter training mode every epoch: test() puts the model in eval
        # mode, so setting it once before the loop would leave dropout
        # disabled from the second epoch onwards.
        model.train()
        # Wrap the loader itself (not iter(loader)) so tqdm can show a total.
        for batch in tqdm(dataloader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print('epoch {}'.format(epoch+1))
        test(model, testloader)
        model.save_pretrained(save_path)
        print('保存到：{}'.format(save_path))
            
        

    

# Work on a 1000-row subsample to keep the run short. The seed matches the one
# used for the train/test split so that a rerun draws the same rows.
s = pd_all.sample(1000, random_state=42)
# Positional column access, as before: column 0 is taken as the label and
# column 1 as the text - TODO confirm against the CSV header.
texts = s.iloc[:, 1].tolist()
labels = s.iloc[:, 0].tolist()

X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)


dataset = CustomDataset(X_train, y_train, tokenizer)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)


test_dataset = CustomDataset(X_test, y_test, tokenizer)
# Evaluation metrics do not depend on sample order, so the test loader is
# left unshuffled to keep evaluation deterministic.
testloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

train(model, dataloader, testloader, num_epochs=epoch, lr=lr)

数目（总体）：119988
数目（负向): 59995
数目（正向): 59993


Some weights of the model checkpoint at ./bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint 