### Using sklearn to preprocess CVE data

[Working With Text Data — scikit-learn 1.3.2 documentation](https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html)

In [None]:
import json
import pandas as pd
import torch
import sys
import pickle
import os
import numpy as np

import logging
from cve_engine.cvss_data import CVSS_BASE_METRICS
from cve_engine.data_processing import (
    clean_cvss_vector,
    desc_preprocess,
    vec_parse_metric,
    create_bow,
)

# Configure the root logger at DEBUG; each record is rendered as
# "[LEVELNAME] (logger.name) message", with the level name left-justified
# and padded to 8 characters.
logging.basicConfig(
    format="[%(levelname)-8s] (%(name)s) %(message)s",
    level=logging.DEBUG,
)
log = logging.getLogger(__name__)
# Raise the threshold for the data_processing logger to INFO so its DEBUG
# records are dropped while other loggers keep emitting DEBUG.
logging.getLogger("cve_engine.data_processing").setLevel(logging.INFO)


def load_cves(
    data_root: str = "../data/cve",
    years=("2017", "2018", "2019", "2020", "2021", "2022", "2023"),
) -> dict:
    """Load every CVE JSON file under ``data_root``, indexed by cve_id.

    Args:
        data_root: Directory containing one sub-directory per year.
        years: Names of the year sub-directories to scan, in order.

    Returns:
        A dict mapping each file's name without its ``.json`` suffix
        (used as the cve_id) to the parsed JSON content of that file.

    Raises:
        FileNotFoundError: If a requested year directory does not exist.
        json.JSONDecodeError: If a ``.json`` file does not contain valid JSON.
    """
    cves = {}
    for subdir in years:
        path = os.path.join(data_root, subdir)
        # Sort so the resulting dict order does not depend on the
        # filesystem's directory-listing order.
        for file in sorted(os.listdir(path)):
            # Skip stray non-JSON entries (editor backups, .DS_Store, ...)
            # instead of failing while trying to parse them.
            if not file.endswith(".json"):
                continue
            # JSON is UTF-8 by specification; do not rely on the locale default.
            with open(os.path.join(path, file), encoding="utf-8") as f:
                cves[file.removesuffix(".json")] = json.load(f)
    return cves


def construct_training_set(cves: dict):
    """
    Flatten the CVE records into one training example per score.

    Every entry of a CVE's "source_data" list that carries a "scores" key
    contributes one dict per score: the entry's "description" merged with
    the score's own fields (the score's fields win on a key clash).
    Entries without "scores" are ignored.
    """
    rows = []
    for record in cves.values():
        for source in record["source_data"]:
            if "scores" in source:
                for score in source["scores"]:
                    rows.append({"description": source["description"]} | score)
    return rows

In [None]:
pkl_path = "../cves.pkl"

# Reuse the cached pickle when it exists; otherwise parse the raw JSON files
# once and cache the result for later runs.
# NOTE(review): pickle.load can execute arbitrary code from the file. This is
# only safe as long as the pickle is always produced locally by this notebook.
if os.path.isfile(pkl_path):
    with open(pkl_path, "rb") as f:
        cves = pickle.load(f)
else:
    # can take a few seconds
    cves = load_cves()
    with open(pkl_path, "wb") as f:
        pickle.dump(cves, f)

# sys.getsizeof(cves) is shallow: it measures only the dict's own table, not
# the records it references. Report the size of the pickle on disk instead.
print(f"{os.path.getsize(pkl_path) / 1e6} mb")

In [None]:
# Build the working frame from the list of example dicts returned by
# construct_training_set (one row per dict).
df = pd.DataFrame(construct_training_set(cves))
df  # bare expression: displayed as the cell's output

In [None]:

def extract_cvss_vector_components(df: pd.DataFrame, vector: pd.Series) -> pd.DataFrame:
    """Add one column per CVSS base metric to ``df``, parsed from ``vector``.

    ``df`` is modified in place and also returned for convenience.

    Missing entries in ``vector`` are dropped before parsing, so
    ``vec_parse_metric`` is never called on them. On assignment pandas
    aligns on the index, so rows of ``df`` with no parsed value receive
    NaN in the new columns.

    Args:
        df: Frame that receives the new metric columns.
        vector: Series of CVSS vector values to parse.

    Returns:
        The same ``df`` object, with one added (or overwritten) column for
        each name in ``CVSS_BASE_METRICS``.
    """
    # Loop-invariant: filter out the missing vectors once, not once per metric.
    present = vector.dropna()
    for metric in CVSS_BASE_METRICS:
        # The lambda is consumed immediately by apply(), so capturing the
        # loop variable here is safe.
        df[metric] = present.apply(lambda v: vec_parse_metric(v, metric))
    return df

# Derive the working columns on df, in order: cleaned vector, preprocessed
# description, then the per-metric columns parsed from the cleaned vector.
log.info("cleaning cvss vectors")
# NOTE(review): clean_cvss_vector presumably yields a missing value for
# vectors it cannot clean (rows with a missing vector_clean are filtered out
# further down) — confirm against cve_engine.data_processing.
df["vector_clean"] = df["vector"].apply(clean_cvss_vector)
log.info("processing descriptions")
df["processed_desc"] = df["description"].apply(desc_preprocess)
log.info("extracting cvss vector components")
# Must run after vector_clean has been created: that column is the input.
df = extract_cvss_vector_components(df, df["vector_clean"])

In [None]:
# From here on only rows that have a cleaned CVSS vector are used.
# .copy() makes df_clean an independent frame rather than a slice of df.
df_clean = df.loc[df["vector_clean"].notna()].copy()
df_clean