### **数据集处理**

In [8]:
from datasets import load_dataset

dataset = load_dataset("../imdb")


In [3]:
from transformers import BertTokenizer, BertForSequenceClassification

# 加载预训练的Tokenizer和BERT模型
tokenizer = BertTokenizer.from_pretrained("../bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("../bert-base-uncased", num_labels=2,from_tf=True)

All TF 2.0 model weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.


In [9]:
# 定义一个预处理函数
def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, padding=True, max_length=512)

# 对训练集和验证集进行处理
tokenized_datasets = dataset.map(preprocess_function, batched=True)

tokenized_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [10]:
from torch.utils.data import DataLoader

# 创建DataLoader
train_dataloader = DataLoader(tokenized_datasets['train'], batch_size=8, shuffle=True)
test_dataloader = DataLoader(tokenized_datasets['test'], batch_size=8)


In [6]:
from transformers import AdamW
from torch.optim.lr_scheduler import StepLR

# 设置优化器
optimizer = AdamW(model.parameters(), lr=2e-5)

# 学习率调度器
scheduler = StepLR(optimizer, step_size=2, gamma=0.1)




In [7]:
import torch
from tqdm import tqdm

# 训练模型
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
epochs = 3

for epoch in range(epochs):
    model.train()
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{epochs}")

    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()

        # 前向传播
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        # 反向传播
        loss.backward()
        optimizer.step()

        progress_bar.set_postfix(loss=loss.item())
        
    # 更新学习率
    scheduler.step()

  attn_output = torch.nn.functional.scaled_dot_product_attention(
Epoch 1/3: 100%|██████████| 3125/3125 [3:48:01<00:00,  4.38s/it, loss=0.111]  ]  
Epoch 2/3: 100%|██████████| 3125/3125 [3:18:56<00:00,  3.82s/it, loss=0.00986]  
Epoch 3/3: 100%|██████████| 3125/3125 [3:53:18<00:00,  4.48s/it, loss=0.00378]   


In [9]:
# 保存调优后的模型和Tokenizer
model.save_pretrained("./finetuned_bert_model")
tokenizer.save_pretrained("./finetuned_bert_model")

('./finetuned_bert_model\\tokenizer_config.json',
 './finetuned_bert_model\\special_tokens_map.json',
 './finetuned_bert_model\\vocab.txt',
 './finetuned_bert_model\\added_tokens.json')

In [5]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# 加载调优过的模型和Tokenizer
model = BertForSequenceClassification.from_pretrained("../finetuned_bert_model")
tokenizer = BertTokenizer.from_pretrained("../finetuned_bert_model")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [11]:
from sklearn.metrics import accuracy_score
from tqdm import tqdm  # 导入tqdm

model.eval()  # 切换为评估模式
predictions, labels = [], []

# 使用tqdm包装test_dataloader以显示进度条
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating", ncols=100, leave=True):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        label = batch['label'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=-1)

        predictions.extend(preds.cpu().numpy())
        labels.extend(label.cpu().numpy())

# 计算准确率
accuracy = accuracy_score(labels, predictions)
print(f"Accuracy: {accuracy:.4f}")

  attn_output = torch.nn.functional.scaled_dot_product_attention(
Evaluating: 100%|███████████████████████████████████████████████| 3125/3125 [11:56<00:00,  4.36it/s]

Accuracy: 0.9401





In [12]:
def classify_input(input_text):
    model.eval()  # 切换为评估模式
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        prediction = torch.argmax(logits, dim=-1).item()

    return "positive" if prediction == 1 else "negative"

# 例子: 用户自定义输入
input_text = input("请输入文本")
prediction = classify_input(input_text)
print(f"result: {prediction}")

result: negative
