In [None]:
# Switch to the project root so the relative `data/...` paths used below resolve
# (presumably this is also what makes the local `CVD` package importable).
# NOTE(review): this path is specific to one machine; adjust it when running elsewhere.
%cd ~/Documents/cvd-predictor/
# Apply the sklearnex patch. The call is placed ahead of every `sklearn` import
# in this cell; keep that order (presumably required for the patched estimators
# to be picked up — confirm against the sklearnex documentation).
from sklearnex import patch_sklearn
patch_sklearn()
import pickle
from pathlib import Path

import numpy as np
import polars as pl
from lightgbm import LGBMClassifier
from plotly.graph_objs._figure import Figure
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier

from CVD.utils import correlation_matrix, get_metrics, plot_feature_importances
from CVD.utils.encode import encode_data

In [None]:
# Locations of the intermediate dataset (CSV source and its Parquet copy),
# relative to the current working directory.
csv_path = "data/intermediate/heart_cdc_2023.csv"
parquet_path = "data/intermediate/heart_cdc_2023.parquet"

# One-time CSV -> Parquet conversion. It runs only when the Parquet copy is
# missing, so a fresh checkout survives "Restart Kernel -> Run All" while
# later runs skip the conversion and reuse the existing file.
if not Path(parquet_path).exists():
    pl.read_csv(csv_path).write_parquet(parquet_path)

In [None]:
# Read the Parquet copy, pass it through the project's `encode_data` helper,
# and discard every row that still contains a null afterwards.
df: pl.DataFrame = encode_data(pl.read_parquet(parquet_path)).drop_nulls()
df
# Optional: persist the cleaned frame for reuse.
# df.write_parquet("data/intermediate/heart_cdc_2023_cleaned.parquet")

In [None]:
# Build a correlation figure for the cleaned frame with the project helper and
# display it.
# NOTE(review): what the ["Sex"] argument selects depends on the signature of
# `correlation_matrix`, which is not visible in this notebook — confirm.
fig: Figure = correlation_matrix(df, ["Sex"])
fig.show()

In [None]:
X: pl.DataFrame = df.drop(["CVD"])
y: pl.Series = df["CVD"]

scaler = StandardScaler()
X[X.columns] = scaler.fit_transform(X.to_numpy())

# pickle.dump(scaler, open("data/training/Scaler.pkl", "wb"))

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
y_train.value_counts()

In [None]:
# Candidate classifiers, keyed by the display name used when reporting results.
# The commented-out entries are alternative baselines that are currently
# disabled.
models = {
    # "Logistic Regression": LogisticRegression(max_iter=10000, random_state=42),
    # "Naive Bayes": GaussianNB(),
    # "SVM": SVC(probability=True),  # SVM with probability
    # "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "LightGBM": LGBMClassifier(
        n_estimators=1_000, random_state=42, n_jobs=-1
    ),
    "XGBoost": XGBClassifier(
        n_estimators=1_000,
        eval_metric="map",
        random_state=42,
        tree_method="hist",
        # NOTE(review): this entry asks XGBoost to train on a CUDA device, so
        # it presumably needs a GPU-enabled environment — confirm before
        # running elsewhere.
        device="cuda",
        verbosity=2,
    ),
    "MLP": MLPClassifier(
        hidden_layer_sizes=(64, 32),
        activation="relu",
        solver="adam",
        alpha=1e-4,
        batch_size="auto",
        # No `learning_rate` schedule is set here: scikit-learn only honours
        # that option for solver="sgd", so it has no effect with Adam.
        max_iter=200,
        random_state=42,
        verbose=True,
    ),
}

In [None]:
# Probability cut-off at or above which a sample is labelled positive.
DECISION_THRESHOLD = 0.5

# Fit every configured model, score it on the hold-out set, and collect one
# metrics record per model. `results` is rebuilt from scratch on each run, so
# re-executing this cell never duplicates rows.
results: list[dict] = []
for name, model in models.items():
    model.fit(X_train, y_train)
    # Second column of predict_proba, i.e. the second entry of the model's
    # `classes_` (presumably CVD == 1 — verify against the target encoding).
    y_pred_prob: np.ndarray = model.predict_proba(X_test)[:, 1]
    y_pred_binary: np.ndarray = (y_pred_prob >= DECISION_THRESHOLD).astype(int)
    # Label each report so it can be attributed to its model even when the
    # verbose training logs are interleaved with the output.
    print(f"=== {name} ===")
    print(classification_report(y_test, y_pred_binary))
    results.append(get_metrics(y_test, y_pred_binary, name))
    # pickle.dump(model, open(f"data/training/{name}.pkl", "wb"))
    # plot_feature_importances(model, X, 15, name)

In [None]:
# Collect the per-model metric records into one table and display it.
results_df = pl.DataFrame(results)
results_df
# Scores noted from an earlier run (the metric is not stated here — TODO confirm):
# 82.64 LightGBM
# 79.3 XGBoost