In [1]:
import torch  
from torch.utils.data import Dataset, DataLoader  
from transformers import BertTokenizer, BertForMaskedLM  
import pandas as pd  
import os  

# 1. 加载分词器  
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')  
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# 2. 数据加载与预处理  
class TextDataset(Dataset):  
    def __init__(self, sentences, labels, tokenizer, max_len=128):  
        self.sentences = sentences  
        self.labels = labels  
        self.tokenizer = tokenizer  
        self.max_len = max_len  

    def __getitem__(self, idx):  
        encoding = self.tokenizer(  
            self.sentences[idx],  
            max_length=self.max_len,  
            padding='max_length',  
            truncation=True,  
            return_tensors='pt'  
        )  
        return {  
            'input_ids': encoding['input_ids'].squeeze(),  
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)  
        }  

    def __len__(self):  
        return len(self.sentences)  

# 3. 加载数据  
df = pd.read_csv('sentences_and_labels.csv')  

sentences = df['Sentence'].tolist()  
labels = df['Label'].tolist()  

# 4. 初始化模型（使用预训练BERT）  
model = BertForMaskedLM.from_pretrained('bert-base-uncased')  

# 5. 微调分类任务（添加分类头）  
from transformers import BertForSequenceClassification  
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)  

# 6. 训练循环（示例）  
dataset = TextDataset(sentences, labels, tokenizer)  
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)  

optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)  
for epoch in range(10):  
    total_loss = 0  
    for batch in dataloader:  
        optimizer.zero_grad()  
        outputs = model(  
            input_ids=batch['input_ids'].to(device),  
            labels=batch['labels'].to(device)  
        )  
        loss = outputs.loss  
        loss.backward()  
        optimizer.step()  
        total_loss += loss.item()  
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(dataloader):.4f}")  

# 保存模型
model_save_path = 'saved_model.pth'
torch.save(model.state_dict(), model_save_path)
print(f"模型已保存到 {model_save_path}")

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it f

Epoch 1, Loss: 0.6167
Epoch 2, Loss: 0.1247
Epoch 3, Loss: 0.0186
Epoch 4, Loss: 0.0095
Epoch 5, Loss: 0.0063
Epoch 6, Loss: 0.0046
Epoch 7, Loss: 0.0035
Epoch 8, Loss: 0.0028
Epoch 9, Loss: 0.0022
Epoch 10, Loss: 0.0018
模型已保存到 saved_model.pth


In [11]:
# 设置设备
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 加载分词器和模型
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# 加载保存的模型权重
model_path = 'saved_model.pth'
model.load_state_dict(torch.load(model_path, map_location=device))
model.to(device)
model.eval()  # 设置为评估模式

# 定义情感标签映射
sentiment_labels = {
    0: "负面",
    1: "积极"
}

def predict_text(text: str, max_length: int = 128) -> dict:
    """
    使用加载的BERT模型对文本进行分类预测
    
    Args:
        text: 待预测的文本
        max_length: 输入文本的最大长度
        
    Returns:
        包含预测结果的字典，包含类别索引、置信度和情感标签
    """
    # 对输入文本进行分词和编码
    inputs = tokenizer(
        text, 
        return_tensors='pt', 
        max_length=max_length, 
        padding='max_length', 
        truncation=True
    )
    
    # 将输入数据移至设备
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    # 模型推理 (不计算梯度，提高性能)
    with torch.no_grad():
        outputs = model(**inputs)
    
    # 获取预测结果
    logits = outputs.logits
    probabilities = torch.softmax(logits, dim=1)
    confidence, predicted_class = torch.max(probabilities, dim=1)
    
    return {
        'text': text,
        'predicted_class': predicted_class.item(),
        'sentiment': sentiment_labels[predicted_class.item()],  # 添加情感标签
        'confidence': confidence.item(),
        'probabilities': probabilities.cpu().numpy()[0].tolist()
    }

# 示例：对单个文本进行预测
sample_text = "This movie is good!"
result = predict_text(sample_text)

print(f"文本: {result['text']}")
print(f"预测类别: {result['predicted_class']} ({result['sentiment']})")
print(f"置信度: {result['confidence']:.4f}")
print(f"各类别概率: {result['probabilities']}")    

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


文本: This movie is good!
预测类别: 0 (负面)
置信度: 0.7286
各类别概率: [0.7285577654838562, 0.2714422345161438]
