# Heart Attack - Kaggle competition V 5.0  
## XGBoost with probability estimation, optimised for precision
### Author: Aniko Maraz, PhD

Note: This is the final version of the improved model, currently running in production: https://fake-heart-attack.streamlit.app/
This version is optimised for **precision** (not accuracy as required on Kaggle). 

Further info and versions in my Git Repo: https://github.com/anikomaraz/heart_attack_kaggle

# Imports

In [None]:
import sys
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (
    StandardScaler,
    MinMaxScaler,
    RobustScaler,
    OneHotEncoder,
)

from sklearn.metrics import accuracy_score, classification_report

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

import jupyter_black

%load_ext jupyter_black

## DATA: GET AND EXPLORE

In [None]:
df_raw_train = pd.read_csv("../data/train.csv")

## PREPROCESSING PIPELINE

In [None]:
# def function to split blood pressure data (current format: 129/90)
def split_blood_pressure(df):
    df[["Systolic", "Diastolic"]] = df["Blood Pressure"].str.split("/", expand=True)
    df["Systolic"] = pd.to_numeric(df["Systolic"])
    df["Diastolic"] = pd.to_numeric(df["Diastolic"])
    df.drop(columns=["Blood Pressure"], inplace=True)


# split cholesterol according to sample mean
cholesterol_sample_mean = df_raw_train["Cholesterol"].mean()


def split_cholesterol_sample(df):
    df["Cholesterol_sample_split"] = np.where(
        df["Cholesterol"] > cholesterol_sample_mean, 1, 0
    )


# create the new variables
df = df_raw_train.copy()

split_blood_pressure(df=df)
split_cholesterol_sample(df=df)

### Define features

In [None]:
# Defining the features and the target
X = df.drop(columns="Heart Attack Risk")
y = df["Heart Attack Risk"]

# Opt-in continuous and categorical variables
continuous_vars = [
    "Age",
    "Heart Rate",
    "Exercise Hours Per Week",
    "Stress Level",
    "Sedentary Hours Per Day",
    "Income",
    "BMI",
    "Triglycerides",
    "Physical Activity Days Per Week",
    "Sleep Hours Per Day",
    "Systolic",
    "Diastolic",
]

categorical_vars = [
    "Diabetes",
    "Family History",
    "Obesity",
    "Alcohol Consumption",
    "Previous Heart Problems",
    "Medication Use",
    "Cholesterol_sample_split",
    "Sex",
    "Continent",
    "Diet",
    "Hemisphere",
]

X_selected = X[continuous_vars + categorical_vars]

### Create preprocessing pipeline and train/test data

In [None]:
# Define preprocessing steps for continuous and categorical features
num_transformer = MinMaxScaler()
cat_transformer = OneHotEncoder(drop="first")

preproc_basic = ColumnTransformer(
    transformers=[
        ("num", num_transformer, continuous_vars),
        ("cat", cat_transformer, categorical_vars),
    ],
    remainder="passthrough",
)

# Create pipelines for SVC
svm_pipe = make_pipeline(
    preproc_basic, SVC(class_weight="balanced", random_state=6, probability=True)
)

# Train-Test split
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.3, random_state=6
)

## TRAIN AND TUNE THRESHOLD FOR PRECISION

In [None]:
# Fit the pipeline
xgb_pipe.fit(X_train, y_train)

# Get predicted probabilities for the training set
train_probs = xgb_pipe.predict_proba(X_train)[:, 1]

# Evaluate thresholds
thresholds = np.linspace(0.4, 0.6, 50)  # Adjust the range to focus on more balanced thresholds
best_threshold = None
best_precision = 0.0

for threshold in thresholds:
    # Convert probabilities to binary predictions based on the threshold
    train_predictions = (train_probs > threshold).astype(int)

    # Evaluate precision
    precision = precision_score(y_train, train_predictions)

    # Check if this threshold gives better precision
    if precision > best_precision:
        best_precision = precision
        best_threshold = threshold

# Print the best threshold found
print(f"Best threshold: {best_threshold} with precision score: {best_precision}")

## APPLY THE BEST THRESHOLD TO THE TEST SET

In [None]:
# Predict probabilities for the test set
test_probs = xgb_pipe.predict_proba(X_test)[:, 1]
test_predictions = (test_probs > best_threshold).astype(int)

# Evaluate precision on the test set with the tuned threshold
test_precision = precision_score(y_test, test_predictions)
print(f"Test set precision with tuned threshold: {test_precision}")

## PREDICT ON KAGGLE TEST SET AND SAVE SUBMISSION

In [None]:
# Predict probabilities for the Kaggle test set
kaggle_test_probs = xgb_pipe.predict_proba(X_df_kaggle_test_selected)[:, 1]

# Apply the best threshold to Kaggle test set predictions
kaggle_test_predictions = (kaggle_test_probs > best_threshold).astype(int)

In [None]:
# Prepare submission dataframe
df_kaggle_test = pd.read_csv("../data/test.csv")
df_kaggle_predicted_V5 = {
    "Patient ID": df_kaggle_test["Patient ID"],
    "Heart Attack Risk": kaggle_test_predictions,
}
df_kaggle_predicted_V5_xgb_precision = pd.DataFrame(df_kaggle_predicted_V5)

# Save submission to CSV
df_kaggle_predicted_V5_xgb_precision.to_csv(
    "../submission/df_kaggle_predicted_V5_xgb_precision.csv", index=False
)

# Number of cases in the unseen Kaggle test set
len(df_kaggle_test)

Best threshold: 0.5306122448979592 with precision score: 0.9994206257242179
Test set precision with tuned threshold: 0.3554987212276215


1753

In [16]:
sum(kaggle_test_predictions)

339