In [1]:
import pandas as pd

In [3]:
# Loading test data
test_df = pd.read_csv('../data_processed/model_data/test_data.csv')
val_df = pd.read_csv('../data_processed/model_data/val_data.csv')
test_df = pd.concat([test_df, val_df], ignore_index=True)
test_df['label'] = test_df['label'].apply(lambda x: [x])
X_test = test_df['text'].values.tolist()
y_test = test_df['label'].values.tolist()

In [4]:
import pickle
#mlb_path = 'models/mlb.pkl'
mlb_path = '../models/mlb.pkl'
mlb = pickle.load(open(mlb_path, 'rb'))

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [7]:
y_test = mlb.transform(y_test)

In [8]:
from torch import cuda

In [9]:
device = 'cuda' if cuda.is_available() else 'cpu'

In [11]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [12]:
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [15]:
model = AutoModelForSequenceClassification.from_pretrained('../models/distilbert_model')

In [16]:
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [17]:
import torch

In [20]:
test_data = test_df['init_text'].values.tolist()
batch_size = 100
model.eval()
fin_outputs = []

with torch.no_grad():
    for idx in range(0, len(test_data), batch_size):

        batch = test_data[idx : min(len(test_data), idx+batch_size)]

        encoded = tokenizer.batch_encode_plus(batch,max_length=512, padding='max_length', truncation=True)
        encoded = {key:torch.LongTensor(value) for key, value in encoded.items()}
        ids = encoded['input_ids'].to(device)
        mask = encoded['attention_mask'].to(device)

        outputs = model(input_ids=ids, attention_mask=mask)[0]
        fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())


In [21]:
import numpy as np

In [22]:
y_pred = np.copy(fin_outputs)
# выставляю порог в 0.5, как и для других моделей (по дефолту)
THRESHOLD = 0.5
y_pred = np.where(y_pred > THRESHOLD, 1.0, 0.0)

In [25]:
import sklearn.metrics

In [27]:
print('Exact Match Ratio: {0}'.format(sklearn.metrics.accuracy_score(y_test, y_pred, normalize=True, sample_weight=None)))

print('Hamming loss: {0}'.format(sklearn.metrics.hamming_loss(y_test, y_pred)))

Exact Match Ratio: 0.7224094972885827
Hamming loss: 0.02146782940055694


In [28]:
print('Recall micro: {0}'.format(sklearn.metrics.precision_score(y_true=y_test, y_pred=y_pred, average='samples')))

print('Precision micro: {0}'.format(sklearn.metrics.recall_score(y_true=y_test, y_pred=y_pred, average='samples')))
# считает общее число True Positives (TP), False Positives (FP) and False Negatives (FN) (не для каждого класса)
print('F1 Measure micro: {0}'.format(sklearn.metrics.f1_score(y_true=y_test, y_pred=y_pred, average='micro')))

Recall micro: 0.7262201377693097
Precision micro: 0.7300307782500366
F1 Measure micro: 0.7727572431447077


  _warn_prf(average, modifier, msg_start, len(result))


In [29]:
f1_scores = sklearn.metrics.f1_score(y_test, y_pred, average=None)
precision_scores = sklearn.metrics.precision_score(y_test, y_pred, average=None)
recall_scores = sklearn.metrics.recall_score(y_test, y_pred, average=None)
df_accuracy = pd.DataFrame({"label": mlb.classes_,
                            # "auc": auc_scores,
                            'f1-score': f1_scores,
                            'precision': precision_scores,
                            'recall': recall_scores})
#df_accuracy = df_accuracy.sort_values('f1-score')[::-1]


In [30]:
df_accuracy

Unnamed: 0,label,f1-score,precision,recall
0,CASB,0.722662,0.752475,0.695122
1,EDR,0.689362,0.735027,0.649038
2,MDR,0.797277,0.848631,0.751783
3,NDR,0.863568,0.883436,0.844575
4,NGFW,0.834395,0.861842,0.808642
5,SASE,0.679153,0.780899,0.600865
6,SIEM,0.638801,0.713028,0.578571
7,SOAR,0.786936,0.853288,0.730159
8,anti-counterfeit,0.879464,0.87426,0.884731
9,application_control,0.890182,0.921687,0.860759


In [32]:
df_accuracy.to_csv('results/nn_classes.csv', index=False)