In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import numpy as np

# Read the dataset from the CSV sitting next to the notebook.
df = pd.read_csv("vulns.csv")

# Show the first rows, then how many rows fall under each value of the
# `business_critical` column.
preview_rows = df.head()
label_counts = df["business_critical"].value_counts()
print(preview_rows)
print(label_counts)


   cvss3  epss_score  kev_flag  exploit_available  internet_facing  \
0    9.8        0.92         1                  1                1   
1    8.8        0.75         1                  1                1   
2    7.5        0.61         0                  1                1   
3    9.1        0.85         1                  1                0   
4    8.2        0.40         0                  1                0   

   asset_criticality  asset_role_server  age_days  business_critical  
0                  5                  1         5                  1  
1                  4                  1        12                  1  
2                  4                  1        20                  1  
3                  5                  1         8                  1  
4                  4                  1        35                  0  
business_critical
0    11
1     9
Name: count, dtype: int64


In [20]:
# Predictor columns, in the order they are handed to the model.
# The list is named because the coefficient cell further down labels the
# fitted coefficients with `features`; without this definition that cell
# only works if the name is left over from an earlier kernel session.
features = [
    "cvss3",
    "epss_score",
    "kev_flag",
    "exploit_available",
    "internet_facing",
    "asset_criticality",
    "asset_role_server",
    "age_days",
]

# Design matrix (one column per entry in `features`) and the target column.
X = df[features]
y = df["business_critical"]

In [21]:
# Hold out 25% of the rows; `stratify=y` keeps the class proportions of `y`
# in both the training and the test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

# Two-step model: standardize every feature, then fit a class-weighted
# logistic regression on the scaled values.
scaler = StandardScaler()
classifier = LogisticRegression(
    class_weight="balanced",
    solver="lbfgs",
    max_iter=1000,
    random_state=42,
)
pipeline = Pipeline([("scaler", scaler), ("model", classifier)])

# Fit on the training split only; the fitted pipeline is the cell's output.
pipeline.fit(X_train, y_train)


In [22]:
# Coefficients of the fitted logistic regression (first and only row of
# `coef_`). The model step sits behind the scaler in the pipeline, so each
# value is expressed per standardized unit of its feature.
coef = pipeline.named_steps["model"].coef_[0]

# Label the coefficients with the columns of `X`: the pipeline was fit on
# `X_train`, a row subset of `X`, so the column order is the same. Using
# `X.columns` avoids relying on a separate `features` name that no earlier
# cell defines.
importance = (
    pd.Series(coef, index=X.columns)
      .sort_values(ascending=False)
)

print("Feature importance (log-odds):")
print(importance)

Feature importance (log-odds):
epss_score           0.766557
asset_criticality    0.741832
internet_facing      0.538985
kev_flag             0.501325
cvss3                0.394330
exploit_available    0.293909
asset_role_server    0.123922
age_days            -0.534475
dtype: float64


In [23]:
# Exponentiate the log-odds coefficients to turn them into odds ratios.
# The pipeline standardizes its inputs before the model step, so each ratio
# describes a change of one standard deviation in the feature, not one raw unit.
odds_ratios = importance.pipe(np.exp)

print("Odds ratios:")
print(odds_ratios)


Odds ratios:
epss_score           2.152343
asset_criticality    2.099778
internet_facing      1.714267
kev_flag             1.650908
cvss3                1.483389
exploit_available    1.341662
asset_role_server    1.131928
age_days             0.585977
dtype: float64


In [24]:
# Score every row of `X` with the fitted pipeline and keep column 1 of the
# `predict_proba` output. Note that `X` contains the rows the pipeline was
# trained on as well as the held-out ones.
positive_proba = pipeline.predict_proba(X)[:, 1]
df["business_critical_probability"] = positive_proba

# Ten highest-scoring rows, shown next to a few of the input columns.
ranking_columns = [
    "cvss3",
    "epss_score",
    "internet_facing",
    "asset_criticality",
    "business_critical_probability",
]
ranked = df[ranking_columns].sort_values(
    "business_critical_probability", ascending=False
)
ranked.head(10)


Unnamed: 0,cvss3,epss_score,internet_facing,asset_criticality,business_critical_probability
0,9.8,0.92,1,5,0.991306
18,8.9,0.88,1,5,0.98685
10,8.6,0.7,1,5,0.976395
3,9.1,0.85,0,5,0.959055
1,8.8,0.75,1,4,0.949325
15,8.4,0.68,1,4,0.934772
11,9.0,0.81,0,4,0.892983
2,7.5,0.61,1,4,0.704608
14,7.9,0.63,1,4,0.701072
9,7.2,0.55,1,3,0.3562


In [25]:
# Side-by-side view of the `business_critical` column and the predicted
# probability for the first ten rows.
comparison_columns = ["business_critical", "business_critical_probability"]
df[comparison_columns].head(10)

Unnamed: 0,business_critical,business_critical_probability
0,1,0.991306
1,1,0.949325
2,1,0.704608
3,1,0.959055
4,0,0.319148
5,0,0.118903
6,0,0.009823
7,0,0.001234
8,0,0.030493
9,0,0.3562
