## CVE Predictor Engine
The goal of this notebook is to start fresh and implement new techniques like *conditional entropy sorting*
to reduce feature dimensionality.

### Part 1: GPU Training / Benchmarking

Before implementing any new techniques, I want to benchmark the training process to compare
CPU training vs GPU training with an `Intel Arc A370M` on my HP Spectre x16.

----

#### Training attempt with max vector contributions

Note to self: each entry in `cves` has an array called `source_data`;
this is my true raw data. A `source_data` entry is included only if it has all of the following:
1. `cve_id`
1. `description`
1. `scores.[].vector`

The parent description should be copied to each of the vectors in `scores`.

**Important:** some of this code is copy-pasta from the other notebook and may diverge slightly.

In [None]:
import os
import json
import pandas as pd

def load_cves() -> dict:
    """Load every CVE JSON document under ``../data/cve``, indexed by cve_id.

    The yearly sub-directories ``2017`` through ``2023`` are scanned. Each
    ``*.json`` file is parsed and stored under its file name with the
    ``.json`` suffix removed.

    Returns:
        A dict mapping the file stem (the cve_id) to the parsed JSON content.

    Raises:
        FileNotFoundError: if one of the year directories does not exist.
        json.JSONDecodeError: if a ``.json`` file is not valid JSON.
    """
    cves = {}
    for subdir in ("2017", "2018", "2019", "2020", "2021", "2022", "2023"):
        path = os.path.join("../data/cve", subdir)
        # os.listdir returns entries in arbitrary order; sort so the resulting
        # dict (and any positional train/test split built from it) is reproducible.
        for file in sorted(os.listdir(path)):
            # Skip stray entries (checkpoint dirs, OS metadata files, ...) that
            # would otherwise make open()/json.load() fail.
            if not file.endswith(".json"):
                continue
            # JSON is UTF-8 by specification; do not depend on the locale default.
            with open(os.path.join(path, file), encoding="utf-8") as f:
                cves[file.removesuffix(".json")] = json.load(f)
    return cves

def construct_training_set(cves: dict) -> list[dict]:
    """Flatten CVE records into one training example per score.

    Scans every ``source_data`` element of every CVE. An element is used only
    when it carries both a ``description`` and ``scores``; the element's
    description is then coupled with each of its score dicts.

    Args:
        cves: mapping of cve_id to CVE record; each record must have a
            ``source_data`` list of dicts.

    Returns:
        A list of dicts, each being a score dict with the parent
        ``description`` added. Because the score is merged on the right-hand
        side, a score that has its own ``description`` key keeps that value.

    Raises:
        KeyError: if a CVE record has no ``source_data`` key.
    """
    examples = []
    for cve_data in cves.values():
        for sd in cve_data["source_data"]:
            # Entries lacking either field cannot yield a (description, score)
            # pair; skip them instead of failing with KeyError on "description".
            if "scores" not in sd or "description" not in sd:
                continue
            description = sd["description"]
            examples.extend(
                {"description": description} | score for score in sd["scores"]
            )
    return examples

In [None]:
# Read all CVE JSON files from disk into one dict indexed by cve_id.
# Can take a few seconds.
cves = load_cves()

In [None]:
# One row per (description, score) pair: a "description" column plus
# whatever keys the individual score dicts carry.
df_x = pd.DataFrame(construct_training_set(cves))

In [None]:
import logging

# Set the "cve_engine.data_processing" logger to INFO so that its
# info-level messages are not dropped by the logger's own level.
logging.getLogger("cve_engine.data_processing").setLevel(logging.INFO)
from cve_engine.cvss_data import CVSS_BASE_METRICS
from cve_engine.data_processing import (
    clean_cvss_vector,
    desc_preprocess,
    vec_parse_metric,
    create_bow,
)


def extract_cvss_vector_components(df: pd.DataFrame, vector: pd.Series) -> pd.DataFrame:
    """Add one column per CVSS base metric to ``df``, parsed from ``vector``.

    For every metric in ``CVSS_BASE_METRICS`` a column named after the metric
    is written to ``df``, holding ``vec_parse_metric(v, metric)`` for each
    non-missing entry ``v`` of ``vector``.

    Args:
        df: frame to receive the metric columns; it is modified in place.
        vector: series of CVSS vectors; missing entries are skipped.

    Returns:
        The same ``df`` object that was passed in, for call chaining.
    """
    # Dropping missing vectors does not depend on the metric, so do it once.
    # Rows whose vector is missing receive NaN through index alignment when
    # the shorter series is assigned as a column.
    present = vector.dropna()
    for metric in CVSS_BASE_METRICS:
        df[metric] = present.apply(lambda v: vec_parse_metric(v, metric))
    return df

# Cleaned CVSS vector for every row.
# NOTE(review): presumably clean_cvss_vector yields a missing value for
# unusable vectors, since rows with a missing "vector_clean" are dropped
# later on -- confirm in cve_engine.data_processing.
cleaned_vectors = df_x["vector"].apply(clean_cvss_vector)
df_x["vector_clean"] = cleaned_vectors

# Preprocessed description text; this is the model input further down.
preprocessed_descriptions = df_x["description"].apply(desc_preprocess)
df_x["processed_desc"] = preprocessed_descriptions

# Expand the cleaned vector into one column per CVSS base metric.
df_x = extract_cvss_vector_components(df_x, df_x["vector_clean"])

# Persist the full (unfiltered) frame for inspection outside the notebook.
df_x.to_csv("../df_x.csv")

In [None]:
# Only this compact version is used going forward: keep just the rows that
# have a cleaned vector, as an independent copy of df_x.
has_clean_vector = df_x["vector_clean"].notna()
df_x_clean = df_x[has_clean_vector].copy()

In [None]:
import torch
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Integer-encode every CVSS base metric into a "<metric>_Y" column.
# A fresh LabelEncoder per metric keeps each metric's class indices
# independent of the other metrics.
for metric in CVSS_BASE_METRICS:
    encoder = LabelEncoder()
    df_x_clean[metric + "_Y"] = encoder.fit_transform(df_x_clean[metric])

# Stack the encoded columns into a label matrix: one row per example,
# one column per metric, in CVSS_BASE_METRICS order.
Y_np = df_x_clean[[metric + "_Y" for metric in CVSS_BASE_METRICS]].values
Y = torch.from_numpy(Y_np)

Y.shape


In [None]:
# split the data and create Y matrices
# The split is positional (no shuffling): the first `train_split` fraction
# of rows is used for training, the remainder for testing.
train_split = 0.8
i = int(train_split * len(Y))
X_train_raw, X_test_raw = df_x_clean["processed_desc"][:i], df_x_clean["processed_desc"][i:]
Y_train, Y_test = Y[:i], Y[i:]

# compute X_train_np just so we can examine the shape;
# the actual X_train will be constructed just before training
bow_vec, X_train_np = create_bow(X_train_raw.to_list())
X_train_np.shape, Y_train.shape


In [None]:
from cve_engine.engine import CVEEngineModel

# Engine object used for model creation and training in the cells below.
cvem = CVEEngineModel()


In [None]:
# Create a new model from the `bow_vec` object that create_bow returned for
# the training descriptions, then display the model's parameters.
# NOTE(review): presumably the model's input size is derived from bow_vec's
# vocabulary -- confirm in cve_engine.engine.
cvem.new_model(bow_vec)
cvem.display_parameters()


In [None]:
# Train on the preprocessed description strings (as a NumPy array) and the
# encoded label tensor. The text itself is passed here, not a BoW matrix.
# NOTE(review): presumably the engine vectorizes the text internally and
# trains across all CVSS metrics -- confirm in cve_engine.engine.
cvem.train_all(X_train_raw.to_numpy(), Y_train)