In [None]:
%cd ~/Documents/cvd-predictor/
from sklearnex import patch_sklearn
patch_sklearn()
import polars as pl
from CVD.utils import correlation_matrix, get_metrics, plot_feature_importances
from CVD.utils.encode import encode_data
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from plotly.graph_objs._figure import Figure
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report

In [None]:
# One-time CSV -> parquet conversion, kept for reference:
#   pl.read_csv("data/intermediate/heart_cdc_2023.csv").write_parquet(
#       "data/intermediate/heart_cdc_2023.parquet")
RAW_PARQUET = "data/intermediate/heart_cdc_2023.parquet"
CLEANED_PARQUET = "data/intermediate/heart_cdc_2023_cleaned.parquet"

# Load the parquet file, run it through the project's encode_data helper, and
# discard every row that still contains a null.
df: pl.DataFrame = encode_data(pl.read_parquet(RAW_PARQUET)).drop_nulls()

# Persist the cleaned frame.
df.write_parquet(CLEANED_PARQUET)

In [None]:
# Build the correlation figure with the project helper and display it.
# NOTE(review): the meaning of the ["Sex"] argument is defined by
# CVD.utils.correlation_matrix — confirm against that helper.
fig: Figure = correlation_matrix(df, ["Sex"])
fig.show()

In [None]:
# Separate the predictors from the "CVD" target column.
X: pl.DataFrame = df.drop(["CVD"])
y: pl.Series = df["CVD"]

# Split BEFORE scaling: the held-out rows must not contribute to the scaler's
# mean/std, otherwise test-set statistics leak into training and the reported
# metrics become optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training fold only, then apply that same fitted
# transform to both folds. StandardScaler returns a NumPy array by default, so
# the results are wrapped back into polars frames to keep the column names.
scaler = StandardScaler()
feature_names: list[str] = X.columns
X_train = pl.DataFrame(
    scaler.fit_transform(X_train), schema=feature_names, orient="row"
)
X_test = pl.DataFrame(
    scaler.transform(X_test), schema=feature_names, orient="row"
)

# Show the class counts of the training fold (the split is stratified on y).
y_train.value_counts()

In [None]:
# Shared settings: one seed for every stochastic estimator and one ensemble
# size for all tree-based models.
RANDOM_STATE = 42
N_ESTIMATORS = 100

# Candidate classifiers, keyed by the label that is later handed to get_metrics.
models: dict = {
    "Logistic Regression": LogisticRegression(
        max_iter=10000, random_state=RANDOM_STATE
    ),
    "Naive Bayes": GaussianNB(),
    # probability=True is needed so that SVC exposes predict_proba.
    "SVM": SVC(probability=True, random_state=RANDOM_STATE),
    "Random Forest": RandomForestClassifier(
        n_estimators=N_ESTIMATORS, random_state=RANDOM_STATE
    ),
    "LightGBM": LGBMClassifier(
        n_estimators=N_ESTIMATORS, random_state=RANDOM_STATE
    ),
    "XGBoost": XGBClassifier(
        n_estimators=N_ESTIMATORS,
        eval_metric="logloss",
        random_state=RANDOM_STATE,
    ),
}

In [None]:
# Probability cut-off at or above which a sample is labelled 1.
DECISION_THRESHOLD = 0.5

# Fit each candidate on the training fold, score it on the held-out fold, and
# collect one get_metrics record per model.
results: list[dict] = []
for name, model in models.items():
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the second entry in
    # model.classes_; thresholding it yields hard 0/1 predictions.
    y_pred_prob: np.ndarray = model.predict_proba(X_test)[:, 1]
    y_pred_binary: np.ndarray = (y_pred_prob >= DECISION_THRESHOLD).astype(int)

    # Optional diagnostics — uncomment while debugging:
    # print(classification_report(y_test, y_pred_binary))
    # plot_feature_importances(model, X, 15, name)

    results.append(get_metrics(y_test, y_pred_binary, name))

In [None]:
# Gather the per-model records into a single polars table; the bare expression
# on the last line makes the notebook display it.
results_df: pl.DataFrame = pl.DataFrame(results)
results_df