<a href="https://colab.research.google.com/github/anismegharbi/dev.support/blob/main/projet_ai_sec_cvs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip -q install datasets scikit-learn pandas numpy

from datasets import load_dataset
import pandas as pd
import numpy as np

# Download the CIRCL vulnerability-scores dataset and materialise both of its
# splits as pandas DataFrames.
ds = load_dataset("CIRCL/vulnerability-scores")
train_df, test_df = (ds[split].to_pandas() for split in ("train", "test"))

# Last expression of the cell: display (rows, columns) of each split.
(train_df.shape, test_df.shape)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/145M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/16.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/585421 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/65047 [00:00<?, ? examples/s]

((585421, 9), (65047, 9))

In [None]:
import pandas as pd

def keep_v31_scored(df):
    """Return the rows of ``df`` that carry a usable CVSS v3.1 score.

    Steps, applied in order:
      1. keep only rows whose ``cvss_v3_1`` is not null (more consistent labels);
      2. add a numeric ``score`` column parsed from ``cvss_v3_1`` -- v3.1 is
         the ONLY score used; values that cannot be parsed become NaN;
      3. drop rows missing ``description`` or ``score``;
      4. drop rows whose score is not strictly positive.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the ``cvss_v3_1`` and ``description`` columns.

    Returns
    -------
    pandas.DataFrame
        A new, independent frame; ``df`` itself is not modified.
    """
    scored = df[df["cvss_v3_1"].notna()].copy()
    scored["score"] = pd.to_numeric(scored["cvss_v3_1"], errors="coerce")

    # Safety cleaning
    scored = scored.dropna(subset=["description", "score"])

    # The trailing .copy() matters: without it the boolean-filtered frame can
    # be flagged as a slice of its parent, and adding new columns to it later
    # may raise pandas' SettingWithCopyWarning.
    return scored[scored["score"] > 0].copy()


# The same cleaning is applied to both splits so they stay comparable.
train_df = keep_v31_scored(train_df)
test_df = keep_v31_scored(test_df)

def to_label(s):
    """Map a numeric CVSS score to its severity bucket.

    Buckets (upper bound exclusive): below 4.0 -> "Low", below 7.0 ->
    "Medium", below 9.0 -> "High"; anything else -> "Critical".
    """
    # Walk the buckets from lowest to highest; the first upper bound that the
    # score falls below decides the label.
    for upper_bound, severity in ((4.0, "Low"), (7.0, "Medium"), (9.0, "High")):
        if s < upper_bound:
            return severity
    return "Critical"

# Attach the severity bucket derived from `score` to every row of both splits.
for frame in (train_df, test_df):
    frame["label"] = frame["score"].apply(to_label)

# Report the size and the class balance (proportions, 3 decimals) per split.
for heading, frame in (("Train:", train_df), ("\nTest:", test_df)):
    print(heading, frame.shape)
    print(frame["label"].value_counts(normalize=True).round(3))


Train: (308201, 11)
label
Medium      0.470
High        0.378
Critical    0.116
Low         0.037
Name: proportion, dtype: float64

Test: (34054, 11)
label
Medium      0.468
High        0.378
Critical    0.118
Low         0.036
Name: proportion, dtype: float64


In [None]:
BORDER = 0.20   # strong filter.
thresholds = [4.0, 7.0, 9.0]

# Absolute slack used when comparing a distance against BORDER. Scores carry
# one decimal, so this is far below any real difference between two scores but
# well above binary floating-point rounding error.
FLOAT_TOL = 1e-9

def not_borderline(score):
    """Return True when ``score`` is clearly inside a severity band.

    A score is "borderline" when it lies within ``BORDER`` (inclusive) of any
    value in ``thresholds``; borderline scores return False.

    The comparison adds ``FLOAT_TOL`` because a plain ``abs(score - t) > BORDER``
    is inconsistent in binary floating point: ``abs(4.2 - 4.0)`` evaluates
    slightly above 0.2 while ``abs(9.2 - 9.0)`` evaluates slightly below it, so
    scores exactly ``BORDER`` away were kept near 4.0 and 7.0 but dropped near
    9.0. With the tolerance they are treated as borderline at every threshold.
    """
    return all(abs(score - t) > BORDER + FLOAT_TOL for t in thresholds)

# Step 1: discard rows whose score sits too close to a severity boundary.
train_clear = train_df["score"].apply(not_borderline)
test_clear = test_df["score"].apply(not_borderline)
train_df = train_df[train_clear].copy()
test_df = test_df[test_clear].copy()

# Step 2: also remove very short descriptions (often low information).
train_long_enough = train_df["description"].str.len() >= 40
test_long_enough = test_df["description"].str.len() >= 40
train_df = train_df[train_long_enough].copy()
test_df = test_df[test_long_enough].copy()

# Report the size and the class balance (proportions, 3 decimals) per split.
for heading, frame in (("Train:", train_df), ("\nTest:", test_df)):
    print(heading, frame.shape)
    print(frame["label"].value_counts(normalize=True).round(3))


Train: (269004, 11)
label
Medium      0.531
High        0.311
Critical    0.116
Low         0.041
Name: proportion, dtype: float64

Test: (29722, 11)
label
Medium      0.530
High        0.311
Critical    0.118
Low         0.041
Name: proportion, dtype: float64


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report

# Fixed seed so that repeated runs of this cell give the same fitted model.
SEED = 42

# Features are the raw description text; targets are the severity labels.
X_train = train_df["description"]
y_train = train_df["label"]
X_test  = test_df["description"]
y_test  = test_df["label"]

# TF-IDF over unigrams + bigrams feeding a linear SVM.
model = Pipeline([
    ("tfidf", TfidfVectorizer(
        lowercase=True,
        stop_words="english",
        ngram_range=(1,2),
        min_df=3,       # ignore terms seen in fewer than 3 documents
        max_df=0.9,     # ignore terms present in more than 90% of documents
        sublinear_tf=True
    )),
    ("svm", LinearSVC(
        class_weight="balanced",   # re-weight classes inversely to frequency
        C=2.0,
        # LinearSVC's solver uses a random number generator; pin it so the
        # reported metrics are reproducible.
        random_state=SEED
    ))
])

model.fit(X_train, y_train)
pred = model.predict(X_test)

# NOTE: test_df was filtered by earlier cells (borderline scores and short
# descriptions removed), so these metrics describe that filtered subset.
acc = accuracy_score(y_test, pred)
print("Accuracy:", acc)
print(classification_report(y_test, pred))


Accuracy: 0.8779018908552587
              precision    recall  f1-score   support

    Critical       0.82      0.87      0.85      3519
        High       0.85      0.85      0.85      9253
         Low       0.75      0.79      0.77      1206
      Medium       0.92      0.90      0.91     15744

    accuracy                           0.88     29722
   macro avg       0.83      0.85      0.84     29722
weighted avg       0.88      0.88      0.88     29722



In [None]:
import numpy as np

# Pull the two fitted steps back out of the pipeline.
tfidf, svm = (model.named_steps[step] for step in ("tfidf", "svm"))

feature_names = np.array(tfidf.get_feature_names_out())
classes = svm.classes_

# LinearSVC is one-vs-rest here: row k of coef_ holds the weights for class k.
for row, severity in enumerate(classes):
    weights = svm.coef_[row]
    # Indices of the 25 largest weights, i.e. the terms pushing hardest
    # toward this class (argsort is ascending, so the strongest comes last).
    strongest = np.argsort(weights)[-25:]
    print("\n========================")
    print("Severity:", severity)
    print("Top keywords:")
    print(", ".join(feature_names[strongest]))



Severity: Critical
Top keywords:
disputed vulnerability, network file, traversal information, twitch player, deserialization, rsvpmaker 10, 20240902, corrupt process, exists puneethreddyhc, icms, attempts github, automate allows, fi firmware, information cause, power platform, source git, affected disclosure, remote authentication, using malicious, upload title, crafted overflow, overflow data, integrity availability, read delete, 896

Severity: High
Top keywords:
authenticated sql, remotely exploit, protocol remote, vulnerability zone, v2 stack, additional execution, result takeover, campcodes employee, access include, takeover mysql, score confidentiality, current process, able gain, fix buffer, local privilege, privilege ue, 13 attacker, execution execution, fix double, results stored, local escalation, context current, virtualbox cvss, uaf, fix use

Severity: Low
Top keywords:
consumption upgrading, server weak, v14 teamcenter, dereference parsing, sureforms wordpress, conduct los

In [None]:
# Classify one free-text description through the `src.predict` command-line
# module, loading the model from models/svm_tfidf.joblib.
# NOTE(review): none of the cells shown here create `src/predict.py` or write
# `models/svm_tfidf.joblib`; presumably this is run from inside the
# cve-severity-classifier project directory with a previously saved model -- confirm.
!python -m src.predict --model models/svm_tfidf.joblib --text "identifier patch, favorites, difficult exploit, overflow zoom, leads cros , validation untrusted, bluetooth additional, advancecomp v2, path intel, takeover mysql, defenderfirewall, access inject, app root, injection exploit, exploitation product, scripting vulnerability, forgerypossible, forgery attack, injection identifier, local denial, privilege execution, xss."


Medium


In [None]:
# Copy the whole project directory into Google Drive, then list the copy to
# verify it.
# NOTE(review): assumes Drive is already mounted at /content/drive; no mount
# step is visible in the cells shown here -- confirm.
!cp -r /content/cve-severity-classifier /content/drive/MyDrive/
!ls -lah /content/drive/MyDrive/cve-severity-classifier



total 24K
-rw------- 1 root root  122 Dec 31 21:34 .gitignore
-rw------- 1 root root  232 Dec 31 21:34 Makefile
drwx------ 2 root root 4.0K Dec 31 21:34 models
drwx------ 2 root root 4.0K Dec 31 21:34 notebooks
-rw------- 1 root root 1.9K Dec 31 21:34 README.md
-rw------- 1 root root  111 Dec 31 21:34 requirements.txt
drwx------ 2 root root 4.0K Dec 31 21:34 results
drwx------ 3 root root 4.0K Dec 31 21:34 src
drwx------ 2 root root 4.0K Dec 31 21:34 tests


In [None]:
# Create the backup folders on Drive (-p: no error if they already exist).
!mkdir -p /content/drive/MyDrive/cve_project_backup/models
!mkdir -p /content/drive/MyDrive/cve_project_backup/results

# Back up the trained model, overwriting any previous copy (-f).
!cp -f /content/cve-severity-classifier/models/svm_tfidf.joblib /content/drive/MyDrive/cve_project_backup/models/
# Best-effort copy of result files: `2>/dev/null || true` hides cp's error
# output and forces success, so a failed copy (for example an empty results/
# directory, where the `*` glob matches nothing) is silent.
# NOTE(review): the recorded listing of the results backup is empty ("total 0")
# -- confirm whether result files were expected here.
!cp -f /content/cve-severity-classifier/results/* /content/drive/MyDrive/cve_project_backup/results/ 2>/dev/null || true

# List both backup folders to verify what was copied.
!ls -lh /content/drive/MyDrive/cve_project_backup/models
!ls -lh /content/drive/MyDrive/cve_project_backup/results


total 26M
-rw------- 1 root root 26M Dec 31 21:35 svm_tfidf.joblib
total 0
