## Importing Libraries

In [1]:
pip install transformers

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install tensorflow

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [3]:
from transformers import TFBertModel,  BertConfig, BertTokenizerFast
# Keras building blocks from tensorflow.keras
from tensorflow.keras.layers import Input, Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical
# Pandas for data import 
import pandas as pd
from sklearn.model_selection import train_test_split

## Import data from csv

In [4]:
# Read the CSV file (path is relative to the notebook's working directory).
csv_path = 'NVD_2020_CVSSV3_train.csv'
data = pd.read_csv(csv_path)

## Shape of Data

In [5]:
# Display the (rows, columns) dimensions of the loaded frame.
data.shape

(17783, 13)

## Removing Null Values 

In [6]:
# Drop every row that contains at least one missing value; axis=0 and
# how="any" are the defaults. `how` and `thresh` are mutually exclusive:
# recent pandas releases raise TypeError when both are passed, even when
# thresh is given as None, so neither is spelled out here.
data = data.dropna()

## Checking Null Values

In [7]:
# Count the missing values left in each column after the dropna step.
data.isnull().sum()

CVE_ID                   0
PublishTime              0
ModifyTime               0
Report                   0
CVSSV3                   0
AttackVector             0
AttackComplexity         0
PrivilegesRequired       0
UserInteraction          0
Scope                    0
ConfidentialityImpact    0
IntegrityImpact          0
AvailabilityImpact       0
dtype: int64

## Select required columns

In [8]:
# Keep the report text plus the eight label columns the model is trained on.
selected_columns = [
    'Report',
    'AttackVector',
    'AttackComplexity',
    'PrivilegesRequired',
    'UserInteraction',
    'Scope',
    'ConfidentialityImpact',
    'IntegrityImpact',
    'AvailabilityImpact',
]
data = data[selected_columns]

In [9]:
# Preview the first rows of the reduced frame.
data.head()

Unnamed: 0,Report,AttackVector,AttackComplexity,PrivilegesRequired,UserInteraction,Scope,ConfidentialityImpact,IntegrityImpact,AvailabilityImpact
0,In getProcessRecordLocked of ActivityManagerSe...,LOCAL,LOW,LOW,NONE,UNCHANGED,HIGH,HIGH,HIGH
1,"In ih264d_init_decoder of ih264d_api.c, there ...",NETWORK,LOW,NONE,REQUIRED,UNCHANGED,HIGH,HIGH,HIGH
2,"In onCreate of InstallStart.java, there is a p...",LOCAL,HIGH,LOW,REQUIRED,UNCHANGED,HIGH,HIGH,HIGH
3,In generateCrop of WallpaperManagerService.jav...,LOCAL,LOW,LOW,NONE,UNCHANGED,NONE,NONE,HIGH
4,In btm_read_remote_ext_features_complete of bt...,LOCAL,LOW,HIGH,NONE,UNCHANGED,HIGH,HIGH,HIGH


## Remove rows whose label value occurs only once (such classes can't be split)

In [10]:
# Process the label columns one after another: each pass keeps only the rows
# whose value in that column occurs more than once in the current frame.
label_columns = (
    'AttackVector', 'AttackComplexity', 'PrivilegesRequired', 'UserInteraction',
    'Scope', 'ConfidentialityImpact', 'IntegrityImpact', 'AvailabilityImpact',
)
for label in label_columns:
    data = data.groupby(label).filter(lambda group: len(group) > 1)

## Convert the target columns to categorical dtype (in place, same column names)

In [11]:
# Re-type every label column as a pandas Categorical, overwriting the column.
label_columns = (
    'AttackVector', 'AttackComplexity', 'PrivilegesRequired', 'UserInteraction',
    'Scope', 'ConfidentialityImpact', 'IntegrityImpact', 'AvailabilityImpact',
)
for label in label_columns:
    data[label] = pd.Categorical(data[label])

## Encode the categorical targets as integer codes

In [12]:
label_columns = (
    'AttackVector', 'AttackComplexity', 'PrivilegesRequired', 'UserInteraction',
    'Scope', 'ConfidentialityImpact', 'IntegrityImpact', 'AvailabilityImpact',
)

# Remember which original label each integer code stands for. Once the
# categorical columns are overwritten with their codes below, this mapping is
# the only way to turn a predicted class index back into a readable label:
# label_classes[column][code] -> original label value.
label_classes = {label: list(data[label].cat.categories) for label in label_columns}

# Replace every categorical column by its integer codes.
for label in label_columns:
    data[label] = data[label].cat.codes

## Split into train and test 

In [13]:
# Hold out 20% of the rows for testing; from here on `data` is the training
# split. A fixed random_state makes the shuffle, and therefore every number
# computed downstream, reproducible across kernel restarts.
data, data_test = train_test_split(data, test_size=0.2, random_state=42)

## Using the BERT model

In [14]:
# Checkpoint name shared by the config, the tokenizer and the model.
model_name = 'bert-base-uncased'
# Max length of tokens
max_length = 100
# Load the checkpoint's config with output_hidden_states overridden to False.
config = BertConfig.from_pretrained(model_name, output_hidden_states=False)
# Tokenizer and model are both loaded from the same checkpoint with that config.
tokenizer = BertTokenizerFast.from_pretrained(model_name, config=config)
transformer_model = TFBertModel.from_pretrained(model_name, config=config)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [15]:
# Names of the eight label columns; each one gets its own classification head.
label_columns = (
    'AttackVector', 'AttackComplexity', 'PrivilegesRequired', 'UserInteraction',
    'Scope', 'ConfidentialityImpact', 'IntegrityImpact', 'AvailabilityImpact',
)

# Take the first layer of the pretrained model to embed it in a Keras model.
bert = transformer_model.layers[0]
# Build your model input: a fixed-width sequence of max_length token ids.
input_ids = Input(shape=(max_length,), name='input_ids', dtype='int32')
inputs = {'input_ids': input_ids}
# Element 1 of the layer output is used as the sentence representation
# (the model summary lists it as pooler_output).
bert_model = bert(inputs)[1]
dropout = Dropout(config.hidden_dropout_prob, name='pooled_output')
# NOTE(review): training=False is hard-wired into this call, so the dropout
# layer looks inactive even during fit() - confirm that this is intended.
pooled_output = dropout(bert_model, training=False)

# Then build your model output: one Dense head of raw logits per label column.
# The head width is the highest integer code found in either split plus one,
# so it no longer depends on which classes happened to land in the training
# split (counting distinct training values yields a head that is too narrow
# whenever a class is missing from `data`).
outputs = {}
for label in label_columns:
    n_classes = int(max(data[label].max(), data_test[label].max())) + 1
    outputs[label] = Dense(
        units=n_classes,
        kernel_initializer=TruncatedNormal(stddev=config.initializer_range),
        name=label,
    )(pooled_output)

# And combine it all in a model object
model = Model(inputs=inputs, outputs=outputs, name='BERT_MultiLabel_MultiClass')
# Take a look at the model
model.summary()

Model: "BERT_MultiLabel_MultiClass"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 100)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  109482240   ['input_ids[0][0]']              
                                thPoolingAndCrossAt                                               
                                tentions(last_hidde                                               
                                n_state=(None, 100,                                               
                                 768),                                                            
                                 pooler_output=(Non                      

In [16]:
# NOTE(review): this cell repeats the opening statements of the previous cell
# verbatim (main layer, input tensor, pooled output). The names are rebound
# here and are consumed by the head-building cell that follows.
# Load the MainLayer
bert = transformer_model.layers[0]
# Build your model input: a fixed-width sequence of max_length token ids.
input_ids = Input(shape=(max_length,), name='input_ids', dtype='int32')
inputs = {'input_ids': input_ids}
# Load the Transformers BERT model as a layer in a Keras model;
# element 1 of its output is kept as the sentence representation.
bert_model = bert(inputs)[1]
dropout = Dropout(config.hidden_dropout_prob, name='pooled_output')
# NOTE(review): training=False is hard-wired into this call - confirm intent.
pooled_output = dropout(bert_model, training=False)

In [19]:
# Names of the eight label columns; each one gets its own classification head.
label_columns = (
    'AttackVector', 'AttackComplexity', 'PrivilegesRequired', 'UserInteraction',
    'Scope', 'ConfidentialityImpact', 'IntegrityImpact', 'AvailabilityImpact',
)

# Then build your model output: one Dense head of raw logits per label column.
# The head width is the highest integer code found in either split plus one,
# so it no longer depends on which classes happened to land in the training
# split (counting distinct training values yields a head that is too narrow
# whenever a class is missing from `data`).
outputs = {}
for label in label_columns:
    n_classes = int(max(data[label].max(), data_test[label].max())) + 1
    outputs[label] = Dense(
        units=n_classes,
        kernel_initializer=TruncatedNormal(stddev=config.initializer_range),
        name=label,
    )(pooled_output)

# And combine it all in a model object
model = Model(inputs=inputs, outputs=outputs, name='BERT_MultiLabel_MultiClass')
# Take a look at the model
model.summary()

Model: "BERT_MultiLabel_MultiClass"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 100)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  109482240   ['input_ids[0][0]']              
                                thPoolingAndCrossAt                                               
                                tentions(last_hidde                                               
                                n_state=(None, 100,                                               
                                 768),                                                            
                                 pooler_output=(Non                      

In [20]:

# Set an optimizer: Adam with gradient-norm clipping at 1.0.
optimizer = Adam(learning_rate=5e-05, epsilon=1e-08, clipnorm=1.0)


In [21]:
# Set loss and metrics: every output head gets its own loss object (the heads
# emit raw logits, hence from_logits=True) and its own accuracy metric.
head_names = (
    'AttackVector', 'AttackComplexity', 'PrivilegesRequired', 'UserInteraction',
    'Scope', 'ConfidentialityImpact', 'IntegrityImpact', 'AvailabilityImpact',
)
loss = {head: CategoricalCrossentropy(from_logits=True) for head in head_names}
metric = {head: CategoricalAccuracy('accuracy') for head in head_names}

In [22]:
# Attach the optimizer and the per-head loss / metric dictionaries to the model.
model.compile(optimizer=optimizer, loss=loss, metrics=metric)

In [23]:
# Ready output data for the model
label_columns = (
    'AttackVector', 'AttackComplexity', 'PrivilegesRequired', 'UserInteraction',
    'Scope', 'ConfidentialityImpact', 'IntegrityImpact', 'AvailabilityImpact',
)

# One-hot encode each label column. num_classes is taken from the width of the
# model head with the same name, so labels and logits always have the same
# shape; without it the width is inferred from the highest code present in
# this split and can come out too narrow.
y_train = {
    label: to_categorical(data[label], num_classes=model.get_layer(label).units)
    for label in label_columns
}

# Individual names used by the fit cell.
y_AttackVector = y_train['AttackVector']
y_AttackComplexity = y_train['AttackComplexity']
y_PrivilegesRequired = y_train['PrivilegesRequired']
y_UserInteraction = y_train['UserInteraction']
y_Scope = y_train['Scope']
y_ConfidentialityImpact = y_train['ConfidentialityImpact']
y_IntegrityImpact = y_train['IntegrityImpact']
y_AvailabilityImpact = y_train['AvailabilityImpact']

In [24]:
# Tokenize the input (takes some time)
# padding='max_length' pads every report to exactly max_length tokens, which
# is the fixed width declared for the model's input_ids. padding=True would
# pad only to the longest report in this call and can yield a narrower tensor.
# NOTE(review): no attention mask is produced or fed to the model, so padding
# positions are presumably treated like real tokens - confirm this is intended.
x = tokenizer(
    text=data['Report'].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=False,
    verbose=True)

In [25]:
# Fit the model: token ids in, one one-hot target array per output head.
train_inputs = {'input_ids': x['input_ids']}
train_targets = {
    'AttackVector': y_AttackVector,
    'AttackComplexity': y_AttackComplexity,
    'PrivilegesRequired': y_PrivilegesRequired,
    'UserInteraction': y_UserInteraction,
    'Scope': y_Scope,
    'ConfidentialityImpact': y_ConfidentialityImpact,
    'IntegrityImpact': y_IntegrityImpact,
    'AvailabilityImpact': y_AvailabilityImpact,
}
history = model.fit(
    x=train_inputs,
    y=train_targets,
    validation_split=0.2,  # fraction of the training data held out for validation
    batch_size=64,
    epochs=1,  # can be increased
)



In [26]:
# Ready test data
label_columns = (
    'AttackVector', 'AttackComplexity', 'PrivilegesRequired', 'UserInteraction',
    'Scope', 'ConfidentialityImpact', 'IntegrityImpact', 'AvailabilityImpact',
)

# One-hot encode each label column. num_classes is taken from the width of the
# model head with the same name: if the highest class code is absent from the
# test split, an inferred width would be narrower than the model output and
# evaluation would fail on a shape mismatch.
y_test = {
    label: to_categorical(data_test[label], num_classes=model.get_layer(label).units)
    for label in label_columns
}

# Individual names used by the evaluation cell.
test_y_AttackVector = y_test['AttackVector']
test_y_AttackComplexity = y_test['AttackComplexity']
test_y_PrivilegesRequired = y_test['PrivilegesRequired']
test_y_UserInteraction = y_test['UserInteraction']
test_y_Scope = y_test['Scope']
test_y_ConfidentialityImpact = y_test['ConfidentialityImpact']
test_y_IntegrityImpact = y_test['IntegrityImpact']
test_y_AvailabilityImpact = y_test['AvailabilityImpact']

# Tokenize the test reports. padding='max_length' pads every report to exactly
# max_length tokens, the fixed width declared for the model's input_ids;
# padding=True would pad only to the longest report in this call.
test_x = tokenizer(
    text=data_test['Report'].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=False,
    verbose=True)

In [27]:
# Run evaluation on the held-out split: token ids in, one target array per head.
test_inputs = {'input_ids': test_x['input_ids']}
test_targets = {
    'AttackVector': test_y_AttackVector,
    'AttackComplexity': test_y_AttackComplexity,
    'PrivilegesRequired': test_y_PrivilegesRequired,
    'UserInteraction': test_y_UserInteraction,
    'Scope': test_y_Scope,
    'ConfidentialityImpact': test_y_ConfidentialityImpact,
    'IntegrityImpact': test_y_IntegrityImpact,
    'AvailabilityImpact': test_y_AvailabilityImpact,
}
model_eval = model.evaluate(x=test_inputs, y=test_targets)



In [28]:
import pickle

# Serialize the trained model to disk. The context manager guarantees that the
# file is flushed and closed even if pickling fails part-way; a bare open()
# passed straight to pickle.dump never closes the handle explicitly.
# NOTE(review): pickle files must only be loaded from trusted sources and are
# tied to the library versions that wrote them; consider model.save() or
# model.save_weights() if a more portable artifact is needed.
with open('BERT_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers\dense
......vars
.........0
.........1
...layers\dense_1
......vars
.........0
.........1
...layers\dense_2
......vars
.........0
.........1
...layers\dense_3
......vars
.........0
.........1
...layers\dense_4
......vars
.........0
.........1
...layers\dense_5
......vars
.........0
.........1
...layers\dense_6
......vars
.........0
.........1
...layers\dense_7
......vars
.........0
.........1
...layers\dropout
......vars
...layers\input_layer
......vars
...layers\tf_bert_main_layer
......vars
...layers\tf_bert_main_layer\embeddings
......vars
.........0
.........1
.........2
...layers\tf_bert_main_layer\embeddings\LayerNorm
......vars
.........0
.........1
...layers\tf_bert_main_layer\embeddings\dropout
......vars
...layers\tf_bert_main_layer\encoder
......vars
...layers\tf_bert_main_layer\encoder\layer\tf_bert_layer
......vars
...layers\tf_bert_main_layer\encoder\layer\tf_bert_layer\attention
......vars
...lay

Keras model archive saving:
File Name                                             Modified             Size
config.json                                    2023-01-13 23:25:41        11704
metadata.json                                  2023-01-13 23:25:41           64
variables.h5                                   2023-01-13 23:25:45   1314725528


In [None]:
# Predict on the tokenized test reports, feeding the input by name exactly as
# the fit and evaluate calls do.
pred = model.predict({'input_ids': test_x['input_ids']})

  4/112 [>.............................] - ETA: 10:30

In [32]:
# Show the raw predictions: a dict holding one array of per-class scores
# (unnormalized logits) for each output head.
pred

{'AttackVector': array([[-1.6996812e+00, -5.5840063e-01,  2.4909909e+00, -2.2022693e+00],
        [-1.1215954e+00,  1.5591439e+00,  4.9483830e-01, -1.8439174e+00],
        [-8.3919853e-01, -1.1848016e+00,  2.6322248e+00, -1.9242529e+00],
        ...,
        [-5.7215440e-01, -8.8021493e-01,  1.9782079e+00, -1.8016477e+00],
        [-1.5345405e+00,  1.8403761e+00, -1.1109561e-03, -1.7896570e+00],
        [-2.0842028e+00, -1.9360473e-02,  2.5931647e+00, -2.4230523e+00]],
       dtype=float32),
 'AttackComplexity': array([[-1.7356235 ,  1.3559531 ],
        [-1.558967  ,  1.3829057 ],
        [-1.0033826 ,  0.82682097],
        ...,
        [-1.2945329 ,  0.87769836],
        [-1.1651653 ,  1.6839353 ],
        [-2.0303905 ,  1.7151369 ]], dtype=float32),
 'PrivilegesRequired': array([[-0.9982106 , -0.12668854,  1.0910358 ],
        [-1.5292466 , -0.01373485,  1.1084751 ],
        [-0.81958926,  0.02669806,  1.2004863 ],
        ...,
        [-0.96528286, -0.667322  ,  1.5912975 ],
      