### Using sklearn to preprocess CVE data

[Working With Text Data — scikit-learn 1.3.2 documentation](https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html)

In [None]:
import json
import logging
import os
import pickle
import sys

import numpy as np
import pandas as pd
import torch
import sklearn.pipeline
import sklearn.feature_extraction.text
import sklearn.linear_model
import sklearn.model_selection
import sklearn.metrics
import matplotlib.pyplot as plt

from cve_engine.cvss_data import CVSS_BASE_METRICS
from cve_engine.data_processing import (
    clean_cvss_vector,
    create_bow,
    desc_preprocess,
    vec_parse_metric,
)

# Root logger: DEBUG and above, rendered as "[LEVEL   ] (logger name) message".
logging.basicConfig(
    level=logging.DEBUG,
    format="[%(levelname)-8s] (%(name)s) %(message)s",
)
log = logging.getLogger(__name__)

# Cap these loggers at INFO so they stay quieter than the DEBUG root level.
for _capped_logger in ("cve_engine.data_processing", "matplotlib"):
    logging.getLogger(_capped_logger).setLevel(logging.INFO)


def load_cves(
    data_dir: str = "../data/cve",
    years=("2017", "2018", "2019", "2020", "2021", "2022", "2023"),
) -> dict:
    """Load all CVE records from disk, indexed by cve_id.

    Args:
        data_dir: Directory that holds one sub-directory per year.
        years: Names of the year sub-directories to read.

    Returns:
        A dict mapping each file name, minus its ".json" suffix, to the
        parsed JSON content of that file.

    Raises:
        FileNotFoundError: If one of the year directories does not exist
            (raised by ``os.listdir``).
        json.JSONDecodeError: If a ".json" file is not valid JSON.
    """
    cves = {}
    for subdir in years:
        path = os.path.join(data_dir, subdir)
        for file in os.listdir(path):
            # Ignore directory entries that are not JSON documents
            # (editor/OS droppings, checkpoint folders, ...).
            if not file.endswith(".json"):
                continue
            # JSON is UTF-8; do not depend on the platform's locale encoding.
            with open(os.path.join(path, file), encoding="utf-8") as f:
                cves[file.removesuffix(".json")] = json.load(f)
    return cves


def construct_training_set(cves: dict):
    """
    Flatten the CVE records into one training example per score.

    Every entry of a record's "source_data" list that carries a "scores"
    key contributes one dict per score: the entry's "description" merged
    with the score's own fields. Entries without "scores" are skipped.
    """
    return [
        {"description": source["description"]} | score
        for record in cves.values()
        for source in record["source_data"]
        if "scores" in source
        for score in source["scores"]
    ]

In [None]:
# Pickle cache of the parsed CVE corpus, so the JSON tree is only walked
# when the cache file does not exist yet.
pkl_path = "../cves.pkl"

if os.path.isfile(pkl_path):
    # NOTE: pickle.load can execute arbitrary code from the file; only load a
    # cache file that this notebook wrote itself.
    with open(pkl_path, "rb") as f:
        cves = pickle.load(f)
else:
    # can take a few seconds
    cves = load_cves()
    with open(pkl_path, "wb") as f:
        pickle.dump(cves, f)

# sys.getsizeof(cves) is shallow (it measures only the dict's own table, not
# the records it references), so report the entry count and the size of the
# cache file on disk instead.
print(f"{len(cves)} CVEs, {os.path.getsize(pkl_path) / 1e6} mb pickled")

In [None]:
# One row per (description, score) pair produced by construct_training_set.
df = pd.DataFrame(construct_training_set(cves))
# Bare expression: renders the frame as the cell output.
df

In [None]:
def extract_cvss_vector_components(df: pd.DataFrame, vector: pd.Series):
    """Add one column per CVSS base metric, parsed from the vector values.

    For every key in CVSS_BASE_METRICS a column of that name is written to
    ``df``, holding ``vec_parse_metric(v, key)`` for each non-null entry of
    ``vector``. Rows whose vector is null are left as NaN by pandas' index
    alignment on assignment.

    Args:
        df: Frame to extend. It is modified in place.
        vector: Vector values; expected to be aligned with ``df``'s index.

    Returns:
        The same ``df`` object that was passed in.
    """
    # The null filter does not depend on the metric, so apply it only once.
    present = vector.dropna()
    for metric in CVSS_BASE_METRICS:
        # Pass the metric through `args` rather than closing over the loop variable.
        df[metric] = present.apply(vec_parse_metric, args=(metric,))
    return df

# Derive the working columns from the raw "vector" and "description" columns.
log.info("cleaning cvss vectors")
df["vector_clean"] = df["vector"].apply(clean_cvss_vector)
log.info("processing descriptions")
# NOTE(review): "processed_desc" is not read again in this notebook (the model
# further down is trained on the raw "description" column) — confirm intended.
df["processed_desc"] = df["description"].apply(desc_preprocess)
log.info("extracting cvss vector components")
# Adds one column per key of CVSS_BASE_METRICS, parsed from "vector_clean".
df = extract_cvss_vector_components(df, df["vector_clean"])

#### Imputation

In [None]:
# only this compact version is used going forward
df_clean = df.dropna(subset="vector_clean").copy()
# remove descriptions with REJECT in them
df_clean.drop(df_clean.index[df_clean["description"].str.contains("REJECT")], inplace=True)
df_clean.drop(df_clean.index[df_clean["description"].apply(str.lower).str.contains("no description is available for this cve")], inplace=True)
df_clean.drop_duplicates(subset="cve_id", inplace=True)
# note it may also be prudent to dedupe on description,
# but I'll leave this for now considering the CVE_IDs are different
df_clean.describe()

In [None]:
# Key of the df_clean column used as the classification target in the cells below.
# NOTE(review): "S" is presumably the CVSS Scope metric — confirm against CVSS_BASE_METRICS.
metric = "S"

In [None]:
# Features are the raw description texts, targets are the selected metric column.
# 30% of the rows are held out for testing; random_state is fixed so the
# split is the same on every run.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    df_clean["description"], df_clean[metric], test_size=0.3, random_state=9
)
# Bare expression: shows the four split shapes as the cell output.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Text classification pipeline: token counts -> TF-IDF transform -> SGD-trained
# linear classifier. The step names ("vect", "tfidf", "clf") are what the grid
# search's "<step>__<param>" keys refer to.
text_clf = sklearn.pipeline.Pipeline(
    [
        ("vect", sklearn.feature_extraction.text.CountVectorizer()),
        ("tfidf", sklearn.feature_extraction.text.TfidfTransformer()),
        (
            "clf",
            sklearn.linear_model.SGDClassifier(
                # loss="hinge",
                # penalty="l2",
                alpha=1e-5,
                # NOTE(review): random_state is left unset, so repeated fits
                # may differ — confirm this is intended.
                # random_state=42,
                max_iter=50,
                # NOTE(review): with tol=None the tolerance-based stopping
                # criterion looks disabled, i.e. training runs for max_iter
                # epochs — confirm against the SGDClassifier documentation.
                tol=None,
            ),
        ),
    ]
)

In [None]:
# Fit all pipeline steps (vectorizer, TF-IDF, classifier) on the training split.
text_clf.fit(X_train, y_train)

In [None]:
# Classification report of the baseline pipeline on the held-out test split.
print(sklearn.metrics.classification_report(y_test, text_clf.predict(X_test)))

In [None]:
# Confusion matrix of the baseline pipeline on the held-out test split.
ax = plt.gca()

# Pin the matrix rows/columns to the classifier's own class order. Without
# `labels`, confusion_matrix derives the classes from y_test and the
# predictions, which can disagree with `display_labels` when a class occurs
# in neither.
class_labels = text_clf.classes_
sklearn.metrics.ConfusionMatrixDisplay(
    confusion_matrix=sklearn.metrics.confusion_matrix(
        y_test, text_clf.predict(X_test), labels=class_labels
    ),
    display_labels=class_labels,
).plot(ax=ax, cmap=plt.cm.PuRd)

# Turn grid lines off on the matrix axes.
ax.grid(False)

In [None]:
# Hyper-parameter grid, keyed as "<pipeline step>__<parameter>":
# 3 n-gram ranges x 3 alpha values = 9 candidate settings.
param_grid = {
    "vect__ngram_range": [(1, 1), (1, 2), (1, 3)],
    "clf__alpha": (1e-4, 1e-5, 1e-6),
    # "clf__tol": (1e-3, None),
    # "clf__loss": ("hinge", "squared_hinge"),
    # "clf__max_iter": (5, 10, 50),
}

# Exhaustive search over the grid with 5-fold cross-validation on the
# training split, ranking candidates by balanced accuracy.
# n_jobs=-1 requests all available processors; verbose=1 prints progress.
gs = sklearn.model_selection.GridSearchCV(
    text_clf,
    param_grid,
    scoring="balanced_accuracy",
    cv=5,
    n_jobs=-1,
    verbose=1,
)
gs.fit(X_train, y_train)

In [None]:
# Bare expression: shows the parameter setting the grid search selected as best.
gs.best_params_

In [None]:
# Evaluate the grid search's refit best estimator on the held-out test split.
# Predict once and reuse the result instead of re-running the pipeline for
# every metric.
y_pred_gs = gs.predict(X_test)
print(sklearn.metrics.classification_report(y_test, y_pred_gs))
print(sklearn.metrics.accuracy_score(y_test, y_pred_gs))

In [None]:
# Confusion matrix of the grid search's best estimator on the held-out test split.
ax = plt.gca()

# Take the class labels from the grid search itself (the model that produces
# these predictions) and pin the matrix rows/columns to that order, so the
# matrix and `display_labels` cannot disagree.
class_labels = gs.classes_
sklearn.metrics.ConfusionMatrixDisplay(
    confusion_matrix=sklearn.metrics.confusion_matrix(
        y_test, gs.predict(X_test), labels=class_labels
    ),
    display_labels=class_labels,
).plot(ax=ax, cmap=plt.cm.PuRd)

# Turn grid lines off on the matrix axes.
ax.grid(False)

Next steps:
- look up how to deal with class imbalance in the training data
- understand the classification report and decide on a CV scoring metric that makes sense
- record an initial set of best params for each metric (e.g. for "C", the (1, 3) n-gram range was preferred)
- understand multi-metric evaluation in grid search: [3.2. Tuning the hyper-parameters of an estimator — scikit-learn 1.3.2 documentation](https://scikit-learn.org/stable/modules/grid_search.html#specifying-multiple-metrics-for-evaluation)

In [None]:
# Scratch arithmetic: the share of 519 in 519 + 99.
# NOTE(review): the counts look like they were read off a confusion matrix or
# classification report above — confirm which cells they came from.
519/(519+99)

In [None]:
# Scratch comparison of different means of the pair (x, 1 - x).
from statistics import harmonic_mean

x = 0.1
data = [x, 1-x]

# Harmonic mean, followed by the geometric mean computed two ways: exp of the
# mean log, and the square root of the product (the same quantity here because
# data has exactly two elements).
harmonic_mean(data), np.exp(np.log(data).mean()), np.prod(data)**0.5

----

## Further EDA

In [None]:
import matplotlib.pyplot as plt

plt.style.use("seaborn-v0_8")

# Long format: one row per (metric key, category value) observation.
df_melted = df_clean[list(CVSS_BASE_METRICS.keys())].melt(
    var_name="metric_key", value_name="category"
)

# Count table: one row per metric key, one column per category value.
# Combinations that never occur are NaN after unstack().
df_grouped = df_melted.groupby(["metric_key", "category"]).size().unstack()
# Relabel the rows with the `name` attribute of each CVSS_BASE_METRICS entry.
df_grouped.index = df_grouped.index.map(
    {k: v.name for k, v in CVSS_BASE_METRICS.items()}
)

ax = df_grouped.plot(kind="bar", stacked=True)
plt.ylabel("Category counts")
plt.xlabel("CVSS Metric")
plt.title("CVSS Metric Category Values")

# Write each category's label into the middle of its segment of the stacked bar.
for bar_position, (_, counts) in enumerate(df_grouped.iterrows()):
    stack_height = 0

    # Iterate the row's (label, value) pairs directly so the lookup also works
    # when a column label is not a string.
    for category, count in counts.items():
        # Categories that do not occur for this metric have no segment.
        if np.isnan(count):
            continue

        ax.text(
            bar_position,
            stack_height + (count / 2),
            str(category),
            ha="center",
            va="center",
        )

        stack_height += count

# The in-bar labels replace the legend.
ax.legend().remove()
plt.tight_layout()
plt.savefig("../stacks_big_dataset.png", dpi=500)
