In [None]:
%cd ~/Documents/cvd-predictor/
from sklearnex import patch_sklearn
patch_sklearn()
import polars as pl
from sklearn.calibration import LabelEncoder
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import plotly.graph_objects as go
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report
from CVD.utils import get_metrics, plot_feature_importances

sns.set_theme()

In [None]:
# One-off CSV -> parquet conversion, kept for reference (left disabled):
#   pl.read_csv("data/intermediate/heart_cdc_2023.csv").write_parquet("data/intermediate/heart_cdc_2023.parquet")

# Recoding tables for the raw survey codes. Lookups go through `dict.get`, so a
# code that is absent from a table yields None.
SMOKER: dict[int, int] = {1: 1, 2: 1, 3: 0, 4: 0}
BIN: dict[int, int] = {1: 1, 2: 0}
# Inverted variant of BIN (1 -> 0, 2 -> 1). The original note says the flip is
# "for performance reasons"; it is also applied to PhysicalActivities below.
ASTHMA: dict[int, int] = {1: 0, 2: 1}

# Which recoding table is applied to which column.
RECODINGS: dict[str, dict[int, int]] = {
    "SmokerStatus": SMOKER,
    "AlcoholDrinkers": BIN,
    "HadKidneyDisease": BIN,
    "HadDepressiveDisorder": BIN,
    "HadCOPD": BIN,
    "HadSkinCancer": BIN,
    "HadAsthma": ASTHMA,
    "PhysicalActivities": ASTHMA,
}

recode_exprs: list[pl.Expr] = [
    pl.col(column).map_elements(table.get, return_dtype=pl.Int8)
    for column, table in RECODINGS.items()
]

# BMI = weight (kg) / height (m) squared, appended as a new column.
bmi_expr: pl.Expr = (
    pl.col("WeightInKilograms") / (pl.col("HeightInMeters") ** 2)
).alias("BMI")

# Load -> recode + derive BMI -> drop every row that still contains a null,
# then persist the cleaned frame for later use.
df: pl.DataFrame = (
    pl.read_parquet("data/intermediate/heart_cdc_2023.parquet")
    .with_columns(*recode_exprs, bmi_expr)
    .drop_nulls()
)
df.write_parquet("data/intermediate/heart_cdc_2023_cleaned.parquet")

In [None]:
# Feature correlation heatmap (all columns of `df` except "Sex").
#
# Only non-numeric columns are label-encoded. Running LabelEncoder over a
# numeric column replaces each value with the index of that value among the
# sorted unique values, so continuous features such as BMI would be correlated
# on dense-rank codes instead of on their real values.
# `label_encoders` therefore holds one fitted encoder per *encoded* column only.
label_encoders: dict = {}
copy: pl.DataFrame = df.drop(["Sex"])

encoded_columns: list[pl.Series] = []
for col in copy.columns:
    values: np.ndarray = copy[col].to_numpy()
    if np.issubdtype(values.dtype, np.number):
        # Already numeric: usable for correlation as-is.
        continue
    le = LabelEncoder()
    encoded_columns.append(pl.Series(col, le.fit_transform(values)))
    label_encoders[col] = le

if encoded_columns:
    # Replace the encoded columns in one pass; same-named Series overwrite the
    # existing columns in place, so column order is unchanged.
    copy = copy.with_columns(encoded_columns)

# Pairwise correlation between all remaining columns, plus a 2-decimal text
# label for every cell of the heatmap.
correlation_matrix: np.ndarray = copy.corr().to_numpy()
formatted_text: list[list[str]] = [[f"{value:.2f}" for value in row] for row in correlation_matrix]

fig = go.Figure(data=go.Heatmap(
    z=correlation_matrix,
    x=copy.columns,
    y=copy.columns,
    colorscale="rdbu",
    zmin=-1,
    zmax=1,
    reversescale=True,
    text=formatted_text,
    texttemplate="%{text}",
    colorbar=dict(title="Correlation")
))

fig.update_layout(
    title="Feature Correlation Heatmap",
    xaxis=dict(tickangle=45, title="Features"),
    yaxis=dict(title="Features"),
    width=900,
    height=900
)

fig.show()

In [None]:
X: pl.DataFrame = df.drop(["CVD"])
y: pl.Series = df["CVD"]

scaler = StandardScaler()
X[X.columns] = scaler.fit_transform(X)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
y_train.value_counts()

In [None]:
# Candidate classifiers keyed by display name. Insertion order is the order in
# which they are later trained and reported. Every estimator that accepts a
# seed uses random_state=42.
_model_specs: list[tuple] = [
    ("Logistic Regression", LogisticRegression(random_state=42, max_iter=10000)),
    ("Naive Bayes", GaussianNB()),
    ("SVM", LinearSVC(dual=False, random_state=42)),
    ("Random Forest", RandomForestClassifier(random_state=42, n_estimators=100)),
    ("LightGBM", LGBMClassifier(random_state=42, n_estimators=100)),
    ("XGBoost", XGBClassifier(random_state=42, n_estimators=100, eval_metric="logloss")),
]
models: dict = dict(_model_specs)

In [None]:
# Train every candidate on the training split, predict on the held-out split,
# and collect one metrics record per model (produced by CVD.utils.get_metrics).
# Optional diagnostics, disabled in the original:
#   print(classification_report(y_test, y_pred))
#   plot_feature_importances(model, X, 15, name)
results: list[dict] = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred: np.ndarray = model.predict(X_test)
    metrics = get_metrics(y_test, y_pred, name)
    results.append(metrics)

In [None]:
# Collect the per-model metric records into a single table and display it
# (bare last expression -> rich notebook rendering).
results_df: pl.DataFrame = pl.DataFrame(data=results)
results_df