In [1]:
import pandas as pd
import numpy as np

# Load the pre-split attack-vector data: the X_* files hold the vulnerability
# descriptions and the y_* files hold the matching CVSS v3 attack-vector labels.
X_train = pd.read_csv('./cvss_2022_2024_X_train-attack-vector.csv')
y_train = pd.read_csv('./cvss_2022_2024_y_train-attack-vector.csv')

X_test = pd.read_csv('./cvss_2022_2024_X_test-attack-vector.csv')
y_test = pd.read_csv('./cvss_2022_2024_y_test-attack-vector.csv')

# Features and labels live in separate files and are paired purely by row
# position, so fail early if the row counts ever drift apart.
assert len(X_train) == len(y_train), "train features/labels row count mismatch"
assert len(X_test) == len(y_test), "test features/labels row count mismatch"

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

# Class balance as a fraction of each split; dropna=False keeps missing labels
# visible in the counts instead of silently dropping them.
print(y_train['cvssv3_attack_vector'].value_counts(dropna=False) / y_train.shape[0])
print(y_test['cvssv3_attack_vector'].value_counts(dropna=False) / y_test.shape[0])

# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line after each report.
X_train.info()
y_train.info()

(142192, 1)
(35548, 1)
(142192, 1)
(35548, 1)
cvssv3_attack_vector
ADJACENT_NETWORK    0.250225
NETWORK             0.250077
PHYSICAL            0.249887
LOCAL               0.249810
Name: count, dtype: float64
cvssv3_attack_vector
LOCAL               0.250760
PHYSICAL            0.250450
NETWORK             0.249691
ADJACENT_NETWORK    0.249100
Name: count, dtype: float64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142192 entries, 0 to 142191
Data columns (total 1 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   english_description  142192 non-null  object
dtypes: object(1)
memory usage: 1.1+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142192 entries, 0 to 142191
Data columns (total 1 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   cvssv3_attack_vector  142192 non-null  object
dtypes: object(1)
memory usage: 1.1+ MB
None


In [2]:
# Name of the target column shared by the y_train / y_test frames.
label_column_name = "cvssv3_attack_vector"
train_labels = y_train[label_column_name]
test_labels = y_test[label_column_name]


from sklearn.preprocessing import LabelEncoder

# The string -> integer mapping is learned from the training labels only and
# then reused unchanged for the test labels.
le = LabelEncoder().fit(train_labels)
print(le.classes_)

NUM_CLASSES = len(le.classes_)
print(NUM_CLASSES)

encoded_train_labels = le.transform(train_labels)
encoded_test_labels = le.transform(test_labels)

# Show the first ten raw labels next to their integer codes, then confirm that
# features and labels have matching lengths in both splits.
print(train_labels.head(10), encoded_train_labels[:10])
print(*(len(part) for part in (X_train, train_labels, X_test, test_labels)))

['ADJACENT_NETWORK' 'LOCAL' 'NETWORK' 'PHYSICAL']
4
0               LOCAL
1    ADJACENT_NETWORK
2               LOCAL
3            PHYSICAL
4            PHYSICAL
5               LOCAL
6             NETWORK
7    ADJACENT_NETWORK
8            PHYSICAL
9               LOCAL
Name: cvssv3_attack_vector, dtype: object [1 0 1 3 3 1 2 0 3 1]
142192 142192 35548 35548


In [3]:
from transformers import BertTokenizerFast

# Checkpoint identifier the tokenizer is loaded from.
tokenizer_checkpoint = 'prajjwal1/bert-small'
tokenizer = BertTokenizerFast.from_pretrained(tokenizer_checkpoint)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def _encode_descriptions(frame):
    """Tokenize the ``english_description`` column of ``frame``.

    Both splits go through this one helper so they are guaranteed to use the
    same tokenizer settings (truncation and padding on, max_length=128).
    """
    descriptions = frame["english_description"].tolist()
    return tokenizer(descriptions, truncation=True, padding=True, max_length=128)


train_encodings = _encode_descriptions(X_train)
test_encodings = _encode_descriptions(X_test)

In [5]:
import torch

class CVEDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized descriptions with their labels.

    Each sample is a dict holding one tensor per tokenizer output field plus
    the raw text label, the integer-encoded label and the original description.
    """

    def __init__(self, X, encodings, labels, encoded_labels):
        """Store the per-sample sources; all are indexed by the same position.

        Args:
            X: frame with an ``english_description`` column.
            encodings: mapping of field name -> per-sample sequences
                (anything exposing ``.items()``).
            labels: text labels; must provide ``.tolist()``.
            encoded_labels: integer labels, indexable by position.
        """
        self.encoded_labels = encoded_labels
        self.labels = labels.tolist()
        self.encodings = encodings
        self.texts = X["english_description"].tolist()

    def __len__(self):
        """Number of samples, taken from the text-label list."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Build the sample dict for position ``idx``."""
        sample = {}
        # Tokenizer fields go in first so the key order stays: encodings,
        # then text label, encoded label and raw description.
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['text_labels'] = self.labels[idx]
        sample['encoded_labels'] = torch.tensor(self.encoded_labels[idx])
        sample['vulnerability_description'] = self.texts[idx]
        return sample


In [6]:
# Wrap each split so a DataLoader can serve samples from it.
train_dataset = CVEDataset(
    X=X_train,
    encodings=train_encodings,
    labels=train_labels,
    encoded_labels=encoded_train_labels,
)
test_dataset = CVEDataset(
    X=X_test,
    encodings=test_encodings,
    labels=test_labels,
    encoded_labels=encoded_test_labels,
)

In [7]:
# Display the first training sample to eyeball what the dataset returns
# (token tensors, both label forms and the raw description).
train_dataset[0]

{'input_ids': tensor([  101,  1000, 18106,  9353,  3217, 14479,  8068,  4617,  2570,  1012,
         25604,  1012,  2297,  2475,  1006,  1998,  3041,  1007,  1010,  2322,
          1012,  4002,  2629,  1012, 19988, 22022,  1006,  1998,  3041,  1007,
          1998,  2459,  1012,  5890,  2475,  1012, 22060, 24594,  1006,  1998,
          3041,  1007,  2024,  5360,  2011,  2019,  2041,  1011,  1997,  1011,
         19202,  3191, 18130,  2043, 11968,  7741,  1037, 19275,  5371,  1010,
          2029,  2071,  2765,  1999,  1037,  3191,  2627,  1996,  2203,  1997,
          2019, 11095,  3638,  3252,  1012,  2019, 17346,  2071, 21155,  2023,
         18130,  2000, 15389,  3642,  1999,  1996,  6123,  1997,  1996,  2783,
          5310,  1012, 14427,  1997,  2023,  3277,  5942,  5310,  8290,  1999,
          2008,  1037,  6778,  2442,  2330,  1037, 24391,  5371,  1012,  1000,
           102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

In [8]:
from transformers import BertForSequenceClassification

# Load the checkpoint with a classification head that has one output per
# label class found by the encoder (NUM_CLASSES).
classifier_checkpoint = 'prajjwal1/bert-small'
model = BertForSequenceClassification.from_pretrained(
    classifier_checkpoint,
    num_labels=NUM_CLASSES,
)
model

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 512, padding_idx=0)
      (position_embeddings): Embedding(512, 512)
      (token_type_embeddings): Embedding(2, 512)
      (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-3): 4 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=512, out_features=512, bias=True)
              (key): Linear(in_features=512, out_features=512, bias=True)
              (value): Linear(in_features=512, out_features=512, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=512, out_features=512, bias=True)
              (LayerNorm): LayerNorm((512,), eps=1e-1

In [9]:
# Freeze every parameter of the base model so gradient updates only reach
# whatever sits outside it.
for frozen_param in model.base_model.parameters():
    frozen_param.requires_grad = False

# List what is still trainable, with its current values, as a visual check.
still_trainable = [
    (param_name, param_value)
    for param_name, param_value in model.named_parameters()
    if param_value.requires_grad
]
for param_name, param_value in still_trainable:
    print(param_name, param_value.data)

classifier.weight tensor([[-0.0314, -0.0097, -0.0061,  ...,  0.0090, -0.0013,  0.0197],
        [-0.0002, -0.0158,  0.0058,  ...,  0.0057, -0.0043,  0.0264],
        [ 0.0054, -0.0179,  0.0141,  ...,  0.0263, -0.0139, -0.0144],
        [-0.0172,  0.0377,  0.0102,  ...,  0.0009,  0.0093,  0.0172]])
classifier.bias tensor([0., 0., 0., 0.])


In [10]:
from torch.utils.data import DataLoader
# torch.optim.AdamW replaces transformers.AdamW, which is deprecated and no
# longer shipped by recent transformers releases.
from torch.optim import AdamW
import torch.nn.functional as F

# Stage 1: train only the parameters that currently have requires_grad=True
# (the base model was frozen in the previous cell, leaving the classifier head).
TRAIN_BATCH_SIZE = 16
HEAD_EPOCHS = 3
HEAD_LEARNING_RATE = 5e-5

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)

train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True)

# Only trainable parameters are handed to the optimizer. eps and weight_decay
# are set explicitly to the values transformers.AdamW used by default, because
# torch.optim.AdamW has different defaults (eps=1e-8, weight_decay=0.01).
optim = AdamW(
    [p for p in model.parameters() if p.requires_grad],
    lr=HEAD_LEARNING_RATE,
    eps=1e-6,
    weight_decay=0.0,
)

# Loss history: one entry per batch, and one sample-weighted mean per epoch.
training_loss_batch = []
training_loss_epoch = []

for epoch in range(HEAD_EPOCHS):
    model.train()
    training_loss = 0
    num_correct = 0
    num_examples = 0
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['encoded_labels'].to(device)
        # Passing labels makes the model compute the classification loss itself.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optim.step()

        # Read the scalar once; each .item() call forces a device sync.
        batch_loss = loss.item()
        training_loss_batch.append(batch_loss)
        # Weight by batch size so a smaller final batch is averaged correctly.
        training_loss += batch_loss * input_ids.size(0)
        # Softmax is monotonic, so the argmax of the raw logits is the same class.
        predictions = outputs.logits.argmax(dim=1)
        num_correct += (predictions == labels).sum().item()
        num_examples += labels.size(0)
    training_loss /= len(train_loader.dataset)
    training_loss_epoch.append(training_loss)

    print('Epoch: {}, Training Loss: {}, Training Accuracy = {}'.format(epoch, training_loss, num_correct / num_examples))

  



Epoch: 0, Training Loss: 1.1583875328439877, Training Accuracy = 0.5215061325531676
Epoch: 1, Training Loss: 1.0265454620817078, Training Accuracy = 0.5901808821874648
Epoch: 2, Training Loss: 0.9861498570246356, Training Accuracy = 0.6073900078766737


In [11]:
# Re-enable gradients on the base model so the following training run updates
# the whole network rather than only the classifier head.
for param in model.base_model.parameters():
    param.requires_grad = True

# Replaces a no-op loop (commented-out print + pass) with an actual check that
# nothing was left frozen.
assert all(p.requires_grad for p in model.parameters()), "some parameters are still frozen"

In [12]:
from torch.utils.data import DataLoader
# torch.optim.AdamW replaces transformers.AdamW, which is deprecated and no
# longer shipped by recent transformers releases.
from torch.optim import AdamW
import torch.nn.functional as F

# Stage 2: fine-tune every parameter that has requires_grad=True (the base
# model was unfrozen in the previous cell, so that is the whole network).
TRAIN_BATCH_SIZE = 16
FINETUNE_EPOCHS = 3
FINETUNE_LEARNING_RATE = 5e-5

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)

train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True)

# A fresh optimizer is built here so it tracks the newly trainable parameters.
# eps and weight_decay are set explicitly to the values transformers.AdamW used
# by default, because torch.optim.AdamW has different defaults
# (eps=1e-8, weight_decay=0.01).
optim = AdamW(
    [p for p in model.parameters() if p.requires_grad],
    lr=FINETUNE_LEARNING_RATE,
    eps=1e-6,
    weight_decay=0.0,
)

# NOTE: these lists are re-created, so after this cell they hold the loss
# history of this stage only.
training_loss_batch = []
training_loss_epoch = []

for epoch in range(FINETUNE_EPOCHS):
    model.train()
    training_loss = 0
    num_correct = 0
    num_examples = 0
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['encoded_labels'].to(device)
        # Passing labels makes the model compute the classification loss itself.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optim.step()

        # Read the scalar once; each .item() call forces a device sync.
        batch_loss = loss.item()
        training_loss_batch.append(batch_loss)
        # Weight by batch size so a smaller final batch is averaged correctly.
        training_loss += batch_loss * input_ids.size(0)
        # Softmax is monotonic, so the argmax of the raw logits is the same class.
        predictions = outputs.logits.argmax(dim=1)
        num_correct += (predictions == labels).sum().item()
        num_examples += labels.size(0)
    training_loss /= len(train_loader.dataset)
    training_loss_epoch.append(training_loss)

    print('Epoch: {}, Training Loss: {}, Training Accuracy = {}'.format(epoch, training_loss, num_correct / num_examples))

  

Epoch: 0, Training Loss: 0.17524454202803993, Training Accuracy = 0.9388854506582649
Epoch: 1, Training Loss: 0.08279674382779084, Training Accuracy = 0.9726144930797794
Epoch: 2, Training Loss: 0.06153090472359961, Training Accuracy = 0.9803575447282548


In [13]:
# Write the trained model to disk. Only the model is saved by this cell: the
# tokenizer and the LabelEncoder's class order are not persisted here, so a
# consumer of this directory needs them from elsewhere.
model.save_pretrained('../models/bert-small-vulnerability_attack_vector-classification')

In [14]:
# Evaluate on the held-out split; no shuffling is needed for aggregate metrics.
test_loader = DataLoader(test_dataset, batch_size=16)

model.eval()
num_correct = 0
num_examples = 0
test_loss = 0
# Evaluation needs no gradients: without no_grad() every forward pass would
# build and hold an autograd graph, wasting memory and time.
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['encoded_labels'].to(device)
        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        # Weight by batch size so a smaller final batch is averaged correctly.
        test_loss += outputs.loss.item() * input_ids.size(0)
        # Softmax is monotonic, so the argmax of the raw logits is the same class.
        predictions = outputs.logits.argmax(dim=1)
        num_correct += (predictions == labels).sum().item()
        num_examples += labels.size(0)
test_loss /= len(test_loader.dataset)


print('Test Loss: {}, Test Accuracy = {}'.format(test_loss, num_correct / num_examples))

Test Loss: 0.06995268233049547, Test Accuracy = 0.9790986834702374
