In [2]:
from ucimlrepo import fetch_ucirepo
# fetch dataset
heart_disease = fetch_ucirepo(id=45)

In [2]:
import pandas as pd
df = pd.read_csv("heart.csv")
print(df.head())  # Print the first few rows
print(df.info())  # Print data types and missing values
print(df.describe()) # Print statistics


   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   52    1   0       125   212    0        1      168      0      1.0      2   
1   53    1   0       140   203    1        0      155      1      3.1      0   
2   70    1   0       145   174    0        1      125      1      2.6      0   
3   61    1   0       148   203    0        1      161      0      0.0      2   
4   62    0   0       138   294    1        1      106      0      1.9      1   

   ca  thal  target  
0   2     3       0  
1   0     3       0  
2   0     3       0  
3   1     3       0  
4   3     2       0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1025 non-null   int64  
 1   sex       1025 non-null   int64  
 2   cp        1025 non-null   int64  
 3   trestbps  1025 non-null   int64  
 4   chol      1025 non-null   int64  
 

In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix,
                             classification_report, roc_curve, ConfusionMatrixDisplay)
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
import joblib
import warnings
import mlflow
import mlflow.xgboost
import matplotlib.pyplot as plt
import seaborn as sns
warnings.filterwarnings('ignore')

# Set up MLflow
mlflow.set_experiment("Heart Disease Prediction - Simplified")

# 1. Data Loading and Preprocessing
def load_and_preprocess_data(data_path="heart.csv"):
    """Loads, preprocesses, and splits the heart disease dataset.  Simplified version.
    Args:
        data_path (str, optional): Path to the CSV file. Defaults to "heart.csv".

    Returns:
        tuple: X_train, X_test, y_train, y_test, feature_names
    """
    print("Loading and preprocessing data...")
    df = pd.read_csv(data_path)

    # Handle missing values using imputation
    imputer = SimpleImputer(strategy='mean')  # Use 'mean', 'median', or 'most_frequent'
    df.iloc[:, :] = imputer.fit_transform(df)

    # Separate features and target variable
    X = df[['age', 'sex', 'cp', 'trestbps', 'chol', 'thalach']]  # Key features only
    y = df['target']

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Feature scaling
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    feature_names = list(X.columns)  # Store feature names
    return X_train, X_test, y_train, y_test, feature_names

# 2. Model Training (Simplified)
def train_model(X_train, y_train, X_val, y_val, feature_names):
    """Trains an XGBoost model with MLflow logging. Simplified version.

    Args:
        X_train (array): Training features.
        y_train (array): Training labels.

    Returns:
        XGBClassifier: Trained XGBoost model.
    """
    with mlflow.start_run() as run:
        # Define XGBoost model
        model = XGBClassifier(objective='binary:logistic',
                              eval_metric='auc',
                              use_label_encoder=False,
                              random_state=42)

        # Train model
        print("Training model...")
        model.fit(X_train, y_train)

        # Log parameters and metrics to MLflow
        mlflow.log_metric("train_roc_auc", roc_auc_score(y_train, model.predict_proba(X_train)[:, 1])) # Add training ROC
        mlflow.log_metric("val_roc_auc", roc_auc_score(y_val, model.predict_proba(X_val)[:, 1])) # Log validation metric


       # Log the model, including signature
        input_example = X_train[:1]  # Use the first training sample as an example
        mlflow.xgboost.log_model(
            xgb_model=model,
            artifact_path="xgboost-model",
            input_example=input_example
            )
        print(f"MLflow Run ID: {run.info.run_id}")
    return model

# 3. Model Evaluation and Validation
def evaluate_model(model, X_test, y_test, feature_names):
    """Evaluates the trained model and logs metrics to MLflow.

    Args:
        model (XGBClassifier): Trained XGBoost model.
        X_test (array): Testing features.
        y_test (array): Testing labels.
    """
    print("\nEvaluating model...")
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)

    # Log metrics to MLflow
    mlflow.log_metric("test_accuracy", accuracy)
    mlflow.log_metric("test_roc_auc", auc)

    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:")
    print(cm)
    #Log confusion matrix as artifact
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot()
    plt.title('Confusion Matrix')
    mlflow.log_figure(plt.gcf(), "confusion_matrix.png")
    plt.close()

    # Classification report
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # ROC Curve
    fpr, tpr, thresholds = roc_curve(y_test, y_proba)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f"AUC = {auc:.2f}")
    plt.plot([0, 1], [0, 1], linestyle='--')
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curve")
    plt.legend()
    mlflow.log_figure(plt.gcf(), "roc_curve.png")
    plt.close()

    print(f"Accuracy: {accuracy:.4f}, AUC: {auc:.4f}")

# 4. Prediction Function (Simplified)
def predict_heart_disease(model, new_data, imputer, scaler, feature_names):
    """Predicts heart disease probability for new data.

    Args:
        model (XGBClassifier): Trained XGBoost model.
        new_data (dict): New patient data (dictionary format).
        feature_names (list): list of feature names

    Returns:
        tuple: Probability of heart disease, Predicted class (0 or 1).
    """
    print("\nPredicting for new data...")

    # 1. Ensure correct feature order is passed.
    new_data_df = pd.DataFrame([new_data], columns=feature_names)

    # 2. Transform new data using the fitted imputer and scaler
    new_data_imputed = imputer.transform(new_data_df)
    new_data_scaled = scaler.transform(new_data_imputed)


    pred_proba = model.predict_proba(new_data_scaled)[:, 1]
    pred_class = 1 if pred_proba >= 0.5 else 0  # Threshold of 0.5

    return pred_proba[0], pred_class

# 5. Main Execution Block
if __name__ == "__main__":
    # Enable autologging
    mlflow.xgboost.autolog()

    #Data loading and preprocessing
    X_train, X_test, y_train, y_test, feature_names = load_and_preprocess_data("heart.csv")

    # Create validation set
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)

   # Train the imputer and scaler on the training data
    imputer = SimpleImputer(strategy='mean')
    imputer.fit(X_train)
    X_train = imputer.transform(X_train)
    X_test = imputer.transform(X_test)  # Apply to test data

    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)  # Apply to test data

    # Train model
    model = train_model(X_train, y_train, X_val, y_val, feature_names)

    # Evaluate model
    evaluate_model(model, X_test, y_test, feature_names)

    # Example Prediction - data needs to be a dictionary
    new_patient_data = {
        'age': 52,
        'sex': 1,
        'cp': 0,
        'trestbps': 125,
        'chol': 212,
        'thalach': 168 #Removed some columns as they were useless
    }

    #Need to specify feature_names for the sample data.
    probability, prediction = predict_heart_disease(model, new_patient_data, imputer, scaler, feature_names) # Pass imputer and scaler
    print(f"\nPrediction Results:")
    print(f"  Probability of Heart Disease: {probability:.4f}")
    print(f"  Prediction: {'Heart Disease Present' if prediction == 1 else 'No Heart Disease'}")


2025/03/19 08:31:35 INFO mlflow.tracking.fluent: Experiment with name 'Heart Disease Prediction - Simplified' does not exist. Creating a new experiment.


Loading and preprocessing data...
Training model...
MLflow Run ID: 2e42cbb1357d4ede8138d1671dd7afaa

Evaluating model...
Confusion Matrix:
[[ 96   4]
 [  0 105]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.96      0.98       100
           1       0.96      1.00      0.98       105

    accuracy                           0.98       205
   macro avg       0.98      0.98      0.98       205
weighted avg       0.98      0.98      0.98       205

Accuracy: 0.9805, AUC: 1.0000

Predicting for new data...

Prediction Results:
  Probability of Heart Disease: 0.1401
  Prediction: No Heart Disease
