## CVE Predictor Engine
The goal of this notebook is to start fresh and implement new techniques like *conditional entropy sorting*
to reduce feature dimensionality.

### Part 1: GPU Training / Benchmarking

Before implementing any new techniques, I want to benchmark the training process to compare
CPU training vs GPU training with an `Intel Arc A370M` on my HP Spectre x16.

Results:
- CPU, dimension 23228, `AV`: 70 sec per 100 epochs

----

#### Training attempt with max vector contributions

Note to self: each entry in `cves` has an array called `source_data`;
this is my true raw data. A `source_data` entry is included only if it has all of the following:
1. `cve_id`
1. `description`
1. `scores.[].vector`

The parent description should be copied to each of the vectors in `scores`.

**Important:** some of this code is copy-pasted from the other notebook and may diverge slightly.

In [None]:
# Load the Intel driver environment; this is required for `import torch`
# to work.
# An alternative that was looked at: the VS Code setting
#   "jupyter.runStartupCommands": []
# NOTE: the load_dotenv call below doesn't work for this purpose. The only
# thing that works is to source the env files *before* starting VS Code, so
# that VS Code inherits the variables...
import dotenv

dotenv.load_dotenv(override=True)


In [None]:
import json
import pandas as pd
import torch
import sys
import pickle
import os
import numpy as np

def load_cves(
    root: str = "../data/cve",
    subdirs: tuple = ("2017", "2018", "2019", "2020", "2021", "2022", "2023"),
):
    """Load all CVE JSON files, indexed by cve_id.

    Args:
        root: directory that contains one sub-directory per entry of
            ``subdirs``.
        subdirs: names of the sub-directories to scan, in the order they
            should appear in the result.

    Returns:
        dict mapping the file name without its ``.json`` suffix to the parsed
        JSON document. Entries are inserted sub-directory by sub-directory and,
        within a sub-directory, in sorted file-name order.

    Raises:
        FileNotFoundError: if ``root`` or one of the sub-directories is missing.
        json.JSONDecodeError: if a ``.json`` file is not valid JSON.
    """
    cves = {}
    for subdir in subdirs:
        path = os.path.join(root, subdir)
        # os.listdir returns entries in arbitrary order; sort so the dict (and
        # any positional train/test split derived from it) is reproducible.
        for file in sorted(os.listdir(path)):
            # skip stray non-JSON files (editor backups, .DS_Store, ...)
            if not file.endswith(".json"):
                continue
            # JSON is UTF-8 by spec; don't depend on the platform default
            with open(os.path.join(path, file), encoding="utf-8") as f:
                cves[file.removesuffix(".json")] = json.load(f)
    return cves


def construct_training_set(cves: dict):
    """Flatten the CVE corpus into one training example per usable score.

    Scans every ``source_data`` element of every CVE. An element contributes
    only if it has a non-empty ``description`` and a non-empty ``scores``
    list; within it, only scores that carry a ``vector`` are kept. Elements or
    CVEs that lack these fields are skipped instead of raising ``KeyError``.

    Each example is the score dict with the parent element's description
    added under ``"description"``. If a score dict has its own
    ``"description"`` key, the score's value wins (same precedence as before).

    Args:
        cves: mapping of cve_id to the parsed CVE document.

    Returns:
        list of dicts, in corpus iteration order.
    """
    examples = []
    for cve_data in cves.values():
        # tolerate CVEs with a missing or null source_data array
        for sd in cve_data.get("source_data") or []:
            description = sd.get("description")
            scores = sd.get("scores")
            if not description or not scores:
                continue
            examples.extend(
                {"description": description} | score
                for score in scores
                # a score without a vector has no label to train on
                if score.get("vector")
            )
    return examples

In [None]:
pkl_path = "../cves.pkl"

# Parsed-corpus cache: reuse the pickle when it exists, otherwise parse the
# JSON files once and write the pickle for the next run.
if os.path.isfile(pkl_path):
    # NOTE: unpickling can execute arbitrary code; pkl_path must only ever
    # point at a cache file written by this notebook.
    with open(pkl_path, "rb") as f:
        cves = pickle.load(f)
else:
    # can take a few seconds
    cves = load_cves()
    with open(pkl_path, "wb") as f:
        pickle.dump(cves, f)

# sys.getsizeof(cves) would only count the dict's own container, not the CVE
# records it references, so report the entry count and the pickle size instead.
print(f"{len(cves)} cves, {os.path.getsize(pkl_path) / 1e6} mb on disk")

In [None]:
# One row per example returned by construct_training_set; the dict keys of
# the examples become the columns.
df = pd.DataFrame(construct_training_set(cves))
# bare expression: render the frame as this cell's output
df

In [None]:
import logging
from cve_engine.cvss_data import CVSS_BASE_METRICS
from cve_engine.data_processing import (
    clean_cvss_vector,
    desc_preprocess,
    vec_parse_metric,
    create_bow,
)

# Root logging config: DEBUG threshold, lines rendered as
# "[LEVEL   ] (logger.name) message".
logging.basicConfig(
    level=logging.DEBUG,
    format="[%(levelname)-8s] (%(name)s) %(message)s",
)
# The data-processing module's logger is raised to INFO.
logging.getLogger("cve_engine.data_processing").setLevel(logging.INFO)
# Logger used by the notebook's own cells.
log = logging.getLogger(__name__)


def extract_cvss_vector_components(df: pd.DataFrame, vector: pd.Series):
    """Add one column per CVSS base metric to ``df``, parsed from ``vector``.

    For every key of ``CVSS_BASE_METRICS`` a column of that name is assigned
    on ``df`` holding ``vec_parse_metric(v, metric)`` for each non-null ``v``
    in ``vector``. Null vectors are dropped before parsing, so the assignment
    leaves those rows without a parsed value.

    ``df`` is modified in place and also returned.
    """
    present = vector.dropna()
    for metric in CVSS_BASE_METRICS:
        # extra positional args are forwarded after the series value
        df[metric] = present.apply(vec_parse_metric, args=(metric,))
    return df

# Build the derived columns on df, one step at a time. The statement order
# fixes both the log order and the column order of df.
log.info("cleaning cvss vectors")
# NOTE(review): clean_cvss_vector presumably yields a null value for vectors
# it cannot clean; the later dropna on "vector_clean" relies on that. Confirm.
df["vector_clean"] = df["vector"].apply(clean_cvss_vector)
log.info("processing descriptions")
# preprocessed description text; this column is what the model is fed later
df["processed_desc"] = df["description"].apply(desc_preprocess)
log.info("extracting cvss vector components")
# adds one column per CVSS_BASE_METRICS key, parsed from the cleaned vector
df = extract_cvss_vector_components(df, df["vector_clean"])


In [None]:
# only this compact version is used going forward
df_clean = df.dropna(subset=["vector_clean"]).copy()
df_clean

In [None]:
from sklearn.preprocessing import LabelEncoder


# Integer-encode every CVSS base metric into a "<metric>_Y" column.
# The fitted encoders are kept (keyed by metric name) because the integer
# class ids in Y / the model predictions can only be mapped back to the
# original metric values through encoder.classes_ / inverse_transform.
label_encoders = {}
for metric in CVSS_BASE_METRICS.keys():
    encoder = LabelEncoder()
    df_clean[metric + "_Y"] = encoder.fit_transform(df_clean[metric])
    label_encoders[metric] = encoder

# Label matrix: one row per example, one column per metric, in
# CVSS_BASE_METRICS key order.
Y_np = df_clean[[metric + "_Y" for metric in CVSS_BASE_METRICS.keys()]].values
Y = torch.from_numpy(Y_np)

Y.shape


In [None]:
# Positional split, no shuffling: the first `train_split` fraction of rows is
# the training set, the remainder is the test set.
train_split = 0.8
i = int(train_split * len(Y))

# .iloc because df_clean's index has gaps after the dropna; X and Y must be
# cut at the same row position.
X_train_raw = df_clean["processed_desc"].iloc[:i]
X_test_raw = df_clean["processed_desc"].iloc[i:]
Y_train = Y[:i]
Y_test = Y[i:]

# X_train_np is computed here only to inspect its shape; the actual X_train is
# constructed just before training. bow_vec is handed to the model later.
bow_vec, X_train_np = create_bow(X_train_raw.to_list())
X_train_np.shape, Y_train.shape


In [None]:
from cve_engine.engine import CVEEngineModel

cvem = CVEEngineModel()

# Flip to True to reuse the most recently saved models instead of training.
load = False
if not load:
    # fresh model built around the bag-of-words vectorizer fit on the train split
    cvem.new_model(bow_vec)
    # this crashes every time (at least on my spectre; not sure about other machines)
    # cvem.optimize_intel_ipex()
    cvem.display_parameters()
    # ~2 min for 100 epochs
    # with cuda: much faster!  30 min for full training run
    cvem.train_all(X_train_raw.to_numpy(), Y_train)
    # alternative trainer that also takes the test split:
    # cvem.train_all_v2(X_train_raw.to_numpy(), Y_train, X_test_raw.to_numpy(), Y_test)
    cvem.save_models_full()
else:
    cvem.load_latest_models()
    cvem.display_parameters()

In [None]:
# Run the model on the held-out descriptions. predict returns a pair:
# `pred` is compared against Y_test below, `cs` holds the confidence scores.
pred, cs = cvem.predict(X_test_raw.to_numpy())
pred, cs

In [None]:
# Fraction of test rows predicted correctly, averaged over rows (axis=0), so
# one value per label column; values are in [0, 1], not percent.
# NOTE(review): assumes pred has the same shape as Y_test — confirm.
np.mean(Y_test.numpy() == pred, axis=0)

In [None]:
# Average confidence score over the test rows (mean along axis 0).
# NOTE(review): presumably one value per CVSS metric — confirm cs's layout.
np.mean(cs, axis=0)