In [11]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# 1. 读取数据
data = pd.read_csv("/root/commandDetect/randomForest/cmd2technique2.csv")

# 2. 分割数据集
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# 3. 加载BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("/root/commandDetect/randomForest/bert-base-uncased")

# 4. 预处理和准备数据
max_length = 128  # 适当选择一个最大长度
batch_size = 32

def preprocess_text(text):
    inputs = tokenizer(text, padding='max_length', truncation=True, max_length=max_length, return_tensors="pt")
    return inputs

train_data['inputs'] = train_data['Command'].apply(preprocess_text)
test_data['inputs'] = test_data['Command'].apply(preprocess_text)

train_input_ids = torch.cat(train_data['inputs'].apply(lambda x: x['input_ids']).apply(lambda x: torch.tensor(x)).tolist())
train_attention_mask = torch.cat(train_data['inputs'].apply(lambda x: x['attention_mask']).apply(lambda x: torch.tensor(x)).tolist())

label_encoder = LabelEncoder()
train_data['Technique'] = label_encoder.fit_transform(train_data['Technique'])
train_labels = torch.tensor(train_data['Technique'])

test_input_ids = torch.cat(test_data['inputs'].apply(lambda x: x['input_ids']).values)
test_attention_mask = torch.cat(test_data['inputs'].apply(lambda x: x['attention_mask']).values)

test_data['Technique'] = label_encoder.transform(test_data['Technique'])
test_labels = torch.tensor(test_data['Technique'])

# 5. 创建数据加载器
train_dataset = TensorDataset(train_input_ids, train_attention_mask, train_labels)
test_dataset = TensorDataset(test_input_ids, test_attention_mask, test_labels)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

# 6. 加载BERT模型
num_classes = len(label_encoder.classes_)
model = BertForSequenceClassification.from_pretrained("/root/commandDetect/randomForest/bert-base-uncased", num_labels=num_classes)

# 7. 设置优化器
optimizer = AdamW(model.parameters(), lr=2e-5)

# 8. 训练模型
num_epochs = 5  # 适当选择训练的轮数

model.train()

for epoch in range(num_epochs):
    total_loss = 0
    for batch in train_dataloader:
        input_ids, attention_mask, labels = batch

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_dataloader)}")

# 9. 评估模型
model.eval()

true_labels = []
predicted_labels = []

for batch in test_dataloader:
    input_ids, attention_mask, labels = batch
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predicted_class = torch.argmax(outputs.logits, dim=1).tolist()
        true_labels.extend(labels.tolist())
        predicted_labels.extend(predicted_class)

# 10. 计算评估指标
accuracy = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels, average='weighted')
recall = recall_score(true_labels, predicted_labels, average='weighted')
f1 = f1_score(true_labels, predicted_labels, average='weighted')

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")


TypeError: cat(): argument 'tensors' (position 1) must be tuple of Tensors, not numpy.ndarray