In [1]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer
from torch.utils.data import DataLoader, random_split
from transformers import AutoModelForSequenceClassification, AdamW

batch_size = 16

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 加载数据
data_dir = '../data/'
train_fname = os.path.join(data_dir, 'traindata.csv')
dev_fname = os.path.join(data_dir, 'devdata.csv')
colum_names = ['Polarity', 'AspectCategory', 'Term', 'Offsets', 'Sentence']

train_data = pd.read_csv(train_fname, delimiter='\t', header=None, names=colum_names)
dev_data = pd.read_csv(dev_fname, delimiter='\t', header=None, names = colum_names)

In [3]:
# 将数据转换为 Hugging Face datasets 的 Dataset 对象
train_dataset = Dataset.from_pandas(train_data)
dev_dataset = Dataset.from_pandas(dev_data)

In [4]:
# 加载分词器
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [5]:
# 定义预处理函数
def preprocess_function(examples):
    # 分离出术语的起始和结束位置
    start_offsets, end_offsets = zip(*[(int(offset.split(':')[0]), int(offset.split(':')[1]))
                                       for offset in examples['Offsets']])
    # 提取术语及其在句子中的上下文
    term_contexts = [sentence[start:end] for sentence, start, end in zip(examples['Sentence'], start_offsets, end_offsets)]
    # 对句子进行编码，确保同时考虑术语和上下文
    encoded_inputs = tokenizer(examples['Sentence'], term_contexts, truncation=True, padding='max_length', max_length=40)
    # 将极性标签转换为整数形式
    polarity_to_id = {'positive': 2, 'neutral': 1, 'negative': 0}
    encoded_inputs['labels'] = [polarity_to_id[p] for p in examples['Polarity']]

    return encoded_inputs

In [6]:
# 应用预处理
train_dataset = train_dataset.map(preprocess_function, batched=True)
dev_dataset = dev_dataset.map(preprocess_function, batched=True)

Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1503/1503 [00:00<00:00, 16736.86 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 376/376 [00:00<00:00, 17096.78 examples/s]


In [7]:
def convert_to_tensors(dataset):
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    return dataset

In [8]:
train_dataset = convert_to_tensors(train_dataset)
dev_dataset = convert_to_tensors(dev_dataset)

In [9]:
# 数据加载器
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size)

In [10]:
# 定义预训练的BERT模型名字和设备
new_dropout_rate = 0.2 

model_name = "bert-base-uncased"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)
model.dropout.p = new_dropout_rate

model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [11]:
# 定义优化器
optimizer = optim.AdamW(model.parameters(), lr=2e-6)

In [12]:
# 定义损失函数
criterion = nn.CrossEntropyLoss()

In [13]:
# 训练
def train(train_loader, dev_loader, model, optimizer, criterion, device, epochs):
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        total_acc =0
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # 梯度清零
            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            loss.backward()
            optimizer.step()
            total_loss += loss.item()

            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
            total_acc += (predictions == labels).sum().item()

        avg_loss = total_loss / len(train_loader)
        avg_accuracy = total_acc / len(train_loader.dataset)
        print(f'Epoch {epoch+1}/{epochs} - Training Loss: {avg_loss:.4f}, Training Accuracy: {avg_accuracy:.4f}')

        evaluate(model, dev_loader, device)

def evaluate(model, dev_loader, device):
    model.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0
    with torch.no_grad():
        for batch in dev_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_eval_loss += loss.item()

            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
            total_eval_accuracy += (predictions == labels).sum().item()

    avg_loss = total_eval_loss / len(dev_loader)
    accuracy = total_eval_accuracy / len(dev_loader.dataset)
    print(f'Validation - Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}')
            

In [14]:
train(train_loader, dev_loader, model, optimizer, criterion, device, epochs=10)

Epoch 1/10 - Training Loss: 0.8017, Training Accuracy: 0.6953
Validation - Loss: 0.6973, Accuracy: 0.7048
Epoch 2/10 - Training Loss: 0.6608, Training Accuracy: 0.7279
Validation - Loss: 0.6182, Accuracy: 0.7261
Epoch 3/10 - Training Loss: 0.5599, Training Accuracy: 0.7911
Validation - Loss: 0.5294, Accuracy: 0.8085
Epoch 4/10 - Training Loss: 0.4892, Training Accuracy: 0.8343
Validation - Loss: 0.4807, Accuracy: 0.8351
Epoch 5/10 - Training Loss: 0.4344, Training Accuracy: 0.8663
Validation - Loss: 0.4569, Accuracy: 0.8378
Epoch 6/10 - Training Loss: 0.3827, Training Accuracy: 0.8776
Validation - Loss: 0.4391, Accuracy: 0.8511
Epoch 7/10 - Training Loss: 0.3543, Training Accuracy: 0.8869
Validation - Loss: 0.4437, Accuracy: 0.8564
Epoch 8/10 - Training Loss: 0.3277, Training Accuracy: 0.8982
Validation - Loss: 0.4366, Accuracy: 0.8564
Epoch 9/10 - Training Loss: 0.3007, Training Accuracy: 0.9088
Validation - Loss: 0.4386, Accuracy: 0.8484
Epoch 10/10 - Training Loss: 0.2862, Training 