In [31]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset, random_split

torch.cuda.empty_cache()
# 设置一些超参数
BATCH_SIZE = 32
EPOCHS = 3
LEARNING_RATE = 2e-5
MAX_SEQ_LENGTH = 512  # 您可以根据您的数据集和模型选择合适的最大序列长度

# 定义BERT模型和标记化器
BERT_PATH = '/root/commandDetect/randomForest/bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_PATH)

# 加载数据
data_path = '/root/commandDetect/randomForest/222.csv'
df = pd.read_csv(data_path)
df = df.dropna(subset=['Command', 'Technique'])
# 编码标签
label_map = {label: idx for idx, label in enumerate(df['Technique'].unique())}
df['Technique'] = df['Technique'].map(label_map)
NUM_CLASSES = len(label_map)

# 创建数据集
input_ids = []
attention_masks = []
labels = []

for text, label in zip(df['Command'], df['Technique']):
    # 截断文本以适应最大序列长度
    encoded_text = tokenizer.encode_plus(text, add_special_tokens=True, padding='max_length', max_length=MAX_SEQ_LENGTH, return_attention_mask=True, return_tensors='pt')
    input_ids.append(encoded_text['input_ids'])
    attention_masks.append(encoded_text['attention_mask'])
    labels.append(label)

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

dataset = TensorDataset(input_ids, attention_masks, labels)

# 划分训练集、验证集和测试集
train_size = int(0.8 * len(dataset))
val_size = (len(dataset) - train_size) // 2
test_size = len(dataset) - train_size - val_size

train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

# 创建数据加载器
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)


# 定义BERT分类模型
model = BertForSequenceClassification.from_pretrained(BERT_PATH, num_labels=NUM_CLASSES)

# 定义优化器和损失函数
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
criterion = torch.nn.CrossEntropyLoss()

# 训练模型
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 内存不够，咋样分隔都不够
device = torch.device("cpu") 
model.to(device)

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        torch.cuda.empty_cache()

    print(f"Epoch {epoch+1} - Average Loss: {total_loss / len(train_dataloader)}")

# 评估模型
model.eval()
y_true = []
y_pred = []

for batch in test_dataloader:
    input_ids, attention_mask, labels = batch
    input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        y_pred.extend(torch.argmax(logits, dim=1).tolist())
        y_true.extend(labels.tolist())

# 报告分类性能
print(classification_report(y_true, y_pred))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /root/commandDetect/randomForest/bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 - Average Loss: 2.7112604777018228
Epoch 2 - Average Loss: 2.5004400147332086
Epoch 3 - Average Loss: 2.3516721460554333
              precision    recall  f1-score   support

           0       1.00      0.20      0.33         5
           1       0.40      1.00      0.57         8
           2       0.00      0.00      0.00         3
           4       0.00      0.00      0.00         1
           6       0.00      0.00      0.00         5
           7       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         1
          13       0.00      0.00      0.00         2
          14       0.00      0.00      0.00         5
          16       0.14      1.00      0.25         2

    accuracy                           0.31        35
   macro avg       0.13      0.18      0.10        35
weighted avg       0.24      0.31      0.19        35



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [36]:
from transformers import BertTokenizer

BERT_PATH = '/root/commandDetect/randomForest/bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_PATH)
print(tokenizer.tokenize('nc -lvpn 2223'))

example_text = 'I will watch Memento tonight'
bert_input = tokenizer(example_text,padding='max_length', 
                       max_length = 10, 
                       truncation=True,
                       return_tensors="pt")
# ------- bert_input ------
print(bert_input['input_ids'])
print(bert_input['token_type_ids'])
print(bert_input['attention_mask'])


['nc', '-', 'l', '##v', '##p', '##n', '222', '##3']
tensor([[  101,  1045,  2097,  3422,  2033, 23065,  3892,   102,     0,     0]])
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])


In [40]:
open('slpp.py', 'w').write(open('/root/commandDetect/xgb/slp/slp2.py', 'r').read())

from slpp import ShellTokenizer

X={}
t = ShellTokenizer(verbose=True)
X["tokenized"], X["counter"] = t.tokenize(['nc -lvnp 2223'])
print(X)

{'tokenized': [['nc', '-lvnp', '2223']], 'counter': Counter({'nc': 1, '-lvnp': 1, '2223': 1})}


In [44]:
from transformers import PreTrainedTokenizerFast

class CustomShellTokenizer(PreTrainedTokenizerFast):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def tokenize(self, text):
        # 使用您的ShellTokenizer来分词
        t = ShellTokenizer(verbose=True)
        tokens, _ = t.tokenize(text)
        return tokens

In [46]:
from transformers import PreTrainedTokenizerFast, BertTokenizer, BertModel

BERT_PATH = '/root/commandDetect/randomForest/bert-base-uncased'
# 加载BERT模型
model = BertModel.from_pretrained(BERT_PATH)

# 创建自定义ShellTokenizer实例
custom_shell_tokenizer = CustomShellTokenizer.from_pretrained(BERT_PATH)

# 将自定义tokenizer与BERT模型关联
model.set_input_embeddings(custom_shell_tokenizer)
model.set_output_embeddings(custom_shell_tokenizer)

# 使用自定义tokenizer进行分词
text = "nc -lvnp 2223"
tokens = custom_shell_tokenizer.tokenize(text)
print(tokens)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'CustomShellTokenizer'.


TypeError: cannot assign '__main__.CustomShellTokenizer' as child module 'word_embeddings' (torch.nn.Module or None expected)