In [None]:
# Optional speed-up: route supported scikit-learn estimators through the
# Intel Extension for Scikit-learn. Comment this cell out on a machine that is
# already fast enough, or where the package is not installed.
# Install once with: %pip install scikit-learn-intelex
# Keep this cell ahead of the scikit-learn imports so the patch applies to them.
from sklearnex import patch_sklearn
patch_sklearn()

In [None]:
# Work from the project directory so the relative "data/..." paths used later resolve.
# NOTE(review): this path is machine-specific; adjust it when running elsewhere.
%cd ~/Documents/cvd-predictor
import polars as pl
from sklearn.calibration import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from imblearn.combine import SMOTEENN  # SMOTE + Edited Nearest Neighbors
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import accuracy_score, classification_report,f1_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import confusion_matrix, precision_score, recall_score, roc_auc_score

# Apply seaborn's default theme to the figures drawn later in the notebook.
sns.set_theme()

In [None]:
# One-time conversion from the CSV export to Parquet, kept for reference:
# df = pl.read_csv("data/intermediate/heart_2023.csv")
# df.write_parquet("data/intermediate/heart_2023.parquet")
# Load the Parquet copy of the dataset into a polars DataFrame.
df = pl.read_parquet("data/intermediate/heart_2023.parquet")

In [None]:
# Inspect the rows whose CVD label equals 0.
_cvd_is_zero = pl.col("CVD") == 0
df.filter(_cvd_is_zero)

In [None]:
# Binary code for the "Sex" column. The two labels must map to different
# values; when both map to 1 the feature is constant and carries no signal.
Sex: dict[str, int] = {"Male": 1, "Female": 0}

# Ordinal score for the "GeneralHealth" answers: a higher number means a
# better rating ("Poor" = 1 up to "Excellent" = 5).
_health_levels_best_first = ("Excellent", "Very good", "Good", "Fair", "Poor")
GeneralHealth: dict[str, int] = dict(
    zip(_health_levels_best_first, range(len(_health_levels_best_first), 0, -1))
)

# Replace the -1 placeholder in the two "days" columns with the mean of the
# remaining (non -1) values of the same column.
# NOTE(review): the mean is taken over the whole dataset, i.e. before any
# train/test split is made.
for _days_col in ("PhysicalHealthDays", "MentalHealthDays"):
    mean_value = df.filter(df[_days_col] != -1)[_days_col].mean()
    df = df.with_columns(
        pl.when(df[_days_col] == -1)
        .then(mean_value)
        .otherwise(df[_days_col])
        .alias(_days_col)
    )

# Ordinal code for the "LastCheckupTime" answers: 1 = within the past year,
# rising to 4 = five or more years ago.
_checkup_intervals = (
    "Within past year (anytime less than 12 months ago)",
    "Within past 2 years (1 year but less than 2 years ago)",
    "Within past 5 years (2 years but less than 5 years ago)",
    "5 or more years ago",
)
LastCheckupTime: dict[str, int] = {
    label: code for code, label in enumerate(_checkup_intervals, start=1)
}

# Shared binary code for every Yes/No answer column.
YesOrNo: dict[str, int] = dict(No=0, Yes=1)


# Code for the "SmokerStatus" answers, from daily smoker (1) to never smoked (4).
_smoker_levels = (
    "Current smoker - now smokes every day",
    "Current smoker - now smokes some days",
    "Former smoker",
    "Never smoked",
)
SmokerStatus: dict[str, int] = {
    label: code for code, label in enumerate(_smoker_levels, start=1)
}

# Code for the "ECigaretteUsage" answers.
# NOTE(review): the numbers do not follow usage intensity ("every day" = 2 sits
# between "never" = 1 and "some days" = 3), so they behave as arbitrary category
# ids rather than an ordered scale - confirm whether an ordinal code was intended.
_ecig_levels = (
    "Never used e-cigarettes in my entire life",
    "Use them every day",
    "Use them some days",
    "Not at all (right now)",
)
ECigaretteUsage: dict[str, int] = {
    label: code for code, label in enumerate(_ecig_levels, start=1)
}

# Ordinal code for the age bracket: 1 = "Age 18 to 24" ... 13 = "Age 80 or older".
# Brackets are five years wide except the first (18-24) and the open-ended last one.
_bracket_starts = [18, *range(25, 81, 5)]  # 18, 25, 30, ..., 75, 80
AgeCategory: dict[str, int] = {
    f"Age {start} to {next_start - 1}": code
    for code, (start, next_start) in enumerate(
        zip(_bracket_starts, _bracket_starts[1:]), start=1
    )
}
# The last bracket has no upper bound, so it is added by hand.
AgeCategory["Age 80 or older"] = len(AgeCategory) + 1

# Derive BMI as weight (kg) divided by height (m) squared and store it as "BMI".
_bmi_expr = pl.col("WeightInKilograms") / (pl.col("HeightInMeters") ** 2)
df = df.with_columns(_bmi_expr.alias("BMI"))

In [None]:
# Encode every categorical answer column as an Int8 code, keeping column names.
# An answer that is not a key of its dictionary becomes null (dict.get -> None).
# NOTE(review): this cell is not re-runnable on its own output - the integer
# codes it produces are not keys of the dictionaries and would all turn to null.
_yes_no_columns = (
    "PhysicalActivities",
    "HadAsthma",
    "HadSkinCancer",
    "HadCOPD",
    "HadDepressiveDisorder",
    "HadKidneyDisease",
    "HadArthritis",
    "HadDiabetes",
    "AlcoholDrinkers",
    "HaveHighCholesterol",
)
_codes_by_column: dict[str, dict[str, int]] = {
    "AgeCategory": AgeCategory,
    "Sex": Sex,
    "GeneralHealth": GeneralHealth,
    "LastCheckupTime": LastCheckupTime,
    "ECigaretteUsage": ECigaretteUsage,
    "SmokerStatus": SmokerStatus,
    **{column: YesOrNo for column in _yes_no_columns},
}

df: pl.DataFrame = df.with_columns(
    # Numeric columns are passed through unchanged.
    pl.col(
        "HeightInMeters",
        "WeightInKilograms",
        "BMI",
        "PhysicalHealthDays",
        "MentalHealthDays",
    ),
    *[
        pl.col(column).map_elements(codes.get, return_dtype=pl.Int8).alias(column)
        for column, codes in _codes_by_column.items()
    ],
)

In [None]:
# Preview the frame after the categorical columns have been encoded.
df

In [None]:
# Hand the data over from polars to pandas and discard every row that has a
# missing value in any column. This includes answers the encoding dictionaries
# did not recognise, because those were mapped to null.
# NOTE(review): `df` changes type here (polars -> pandas), so this cell cannot
# be re-run on its own without re-running the cells above it.
df: pd.DataFrame = df.to_pandas().dropna(axis="index", how="any")
df.info()

In [None]:
# Separate the target column from the feature matrix.
_target_col = "CVD"
y = df[_target_col]
X = df.drop(columns=[_target_col])

# Cast any remaining object-dtype feature columns to str.
# NOTE(review): columns touched here are still non-numeric, and the next cell
# passes all of X to StandardScaler - confirm no object columns remain.
categorical_cols: pd.Index = X.select_dtypes(include=["object"]).columns
X = X.astype({column: str for column in categorical_cols})

In [None]:
# Split first, so the held-out rows never influence a fitted preprocessing step.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardise with statistics learned from the training rows only, then apply
# the same transform to the test rows. Fitting the scaler on all of X before
# the split would leak the test rows' means and variances into training.
# X itself is left unscaled; only the train/test frames are standardised.
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)

# Rebalance the training rows only (SMOTE + Edited Nearest Neighbours); the
# test set keeps its original class distribution.
smote_enn = SMOTEENN(sampling_strategy="minority", random_state=42)
X_train_resampled, y_train_resampled = smote_enn.fit_resample(X_train, y_train)

In [None]:
def get_metrics(y_true, y_pred, model_name, y_score=None):
    """Collect the evaluation metrics of one binary classifier in a dict.

    Args:
        y_true: Ground-truth labels.
        y_pred: Hard class predictions.
        model_name: Display name stored under the "Model" key.
        y_score: Optional continuous scores for the positive class, e.g.
            ``predict_proba(X)[:, 1]`` or ``decision_function(X)``. When
            given, ROC AUC is computed from these scores. When omitted, ROC
            AUC is computed from ``y_pred`` as before, which reduces the ROC
            curve to a single operating point.

    Returns:
        dict with the keys "Model", "Accuracy", "Precision", "Recall",
        "F1 Score", "ROC AUC" and "Confusion Matrix".
    """
    # ROC AUC is a ranking metric: prefer continuous scores when the caller has them.
    auc_input = y_pred if y_score is None else y_score
    return {
        "Model": model_name,
        "Accuracy": accuracy_score(y_true, y_pred),
        # zero_division=1: precision is reported as 1 when nothing is predicted positive.
        "Precision": precision_score(y_true, y_pred, zero_division=1),
        "Recall": recall_score(y_true, y_pred),
        "F1 Score": f1_score(y_true, y_pred),
        "ROC AUC": roc_auc_score(y_true, auc_input),
        "Confusion Matrix": confusion_matrix(y_true, y_pred),
    }

# Candidate classifiers, keyed by the display name used in the results table.
# Every seeded estimator shares one seed; the tree ensembles share one size.
# Note that "SVM" is a linear SVM (LinearSVC), not the kernel SVC.
_SEED = 42
_N_TREES = 100
models = {
    "Logistic Regression": LogisticRegression(max_iter=10000, random_state=_SEED),
    "Naive Bayes": GaussianNB(),
    "SVM": LinearSVC(dual=False, random_state=_SEED),
    "Random Forest": RandomForestClassifier(n_estimators=_N_TREES, random_state=_SEED),
    "LightGBM": LGBMClassifier(n_estimators=_N_TREES, random_state=_SEED),
    "XGBoost": XGBClassifier(
        n_estimators=_N_TREES, eval_metric="logloss", random_state=_SEED
    ),
}

In [None]:
results = []
for name, model in models.items():
    # Train on the rebalanced training data, evaluate on the untouched test split.
    model.fit(X_train_resampled, y_train_resampled)
    y_pred = model.predict(X_test)
    metrics = get_metrics(y_test, y_pred, name)

    # ROC AUC needs a continuous ranking score; hard 0/1 labels collapse the
    # curve to a single point. Use the probability of the second class, or the
    # decision function for estimators without predict_proba (LinearSVC).
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_test)[:, 1]
    else:
        y_score = model.decision_function(X_test)
    metrics["ROC AUC"] = roc_auc_score(y_test, y_score)

    results.append(metrics)

In [None]:
# Tabulate the per-model metric dicts: one row per model, one column per key.
results_df = pd.DataFrame(data=results)
results_df

In [None]:
# label_encoders: dict = {}
# df_copy = df.copy()
# df_copy.drop(["Sex"], axis=1, inplace=True)
# for col in df_copy.columns:
#     le = LabelEncoder()
#     df_copy[col] = le.fit_transform(df_copy[col])
#     label_encoders[col] = le

# plt.figure(figsize=(20, 20))
# sns.heatmap(df_copy.corr(), annot=True, cmap="coolwarm", fmt=".2f")
# plt.title("Feature Correlation Heatmap")
# plt.show()

In [None]:
# Round-trip the pandas frame through polars to pick columns by dtype, then
# convert the selection back to pandas.
# NOTE(review): only Float64 and Int64 columns are kept; the categorical
# encodings were created as Int8, so they are excluded unless the pandas
# conversion widened them - confirm this is intended.
_numeric_dtypes = (pl.Float64, pl.Int64)
df_temp = pl.DataFrame(df)
df_num = df_temp.select(pl.col(*_numeric_dtypes)).to_pandas()
df_num

In [None]:
# df_num.hist(figsize=(16, 20), bins=40, xlabelsize=6, ylabelsize=6)

In [None]:
# fig, axes = plt.subplots(nrows=len(df_num.columns) // 2, ncols=2, figsize=(13, 10))

# for idx, column in enumerate(df_num.columns):
#     if column == "CVD":  # Skip the CVD column
#         continue

#     row_idx = idx // 2
#     col_idx = idx % 2

#     sns.kdeplot(
#         df.filter(pl.col("CVD") == 1).select(column).to_series(),
#         alpha=0.5,
#         fill=True,
#         color="#000CEB",
#         label="CVD",
#         ax=axes[row_idx, col_idx],
#     )
#     sns.kdeplot(
#         df.filter(pl.col("CVD") == 0).select(column).to_series(),
#         alpha=0.5,
#         fill=True,
#         color="#97B9F4",
#         label="Normal",
#         ax=axes[row_idx, col_idx],
#     )

#     axes[row_idx, col_idx].set_xlabel(column)
#     axes[row_idx, col_idx].set_ylabel("Frequency")
#     axes[row_idx, col_idx].set_title(f"{column} Distribution over Heart Disease")
#     axes[row_idx, col_idx].legend()

# plt.tight_layout()
# plt.show()