In [3]:
# Heart Attack - Kaggle competition V 4.0

# Author: Aniko Maraz, PhD

# Imports
import sys
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (
    StandardScaler,
    MinMaxScaler,
    RobustScaler,
    OneHotEncoder,
)

from sklearn.metrics import accuracy_score, classification_report

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

import jupyter_black

%load_ext jupyter_black

## DATA: GET AND EXPLORE

df_raw_train = pd.read_csv("../data/train.csv")

## PREPROCESSING PIPELINE


# def function to split blood pressure data (current format: 129/90)
def split_blood_pressure(df):
    df[["Systolic", "Diastolic"]] = df["Blood Pressure"].str.split("/", expand=True)
    df["Systolic"] = pd.to_numeric(df["Systolic"])
    df["Diastolic"] = pd.to_numeric(df["Diastolic"])
    df.drop(columns=["Blood Pressure"], inplace=True)


# split cholesterol according to sample mean
cholesterol_sample_mean = df_raw_train["Cholesterol"].mean()


def split_cholesterol_sample(df):
    df["Cholesterol_sample_split"] = np.where(
        df["Cholesterol"] > cholesterol_sample_mean, 1, 0
    )


# create the new variables
df = df_raw_train.copy()

split_blood_pressure(df=df)
split_cholesterol_sample(df=df)

### Define features

# Defining the features and the target
X = df.drop(columns="Heart Attack Risk")
y = df["Heart Attack Risk"]

# Opt-in continuous and categorical variables
continuous_vars = [
    "Age",
    "Heart Rate",
    "Exercise Hours Per Week",
    "Stress Level",
    "Sedentary Hours Per Day",
    "Income",
    "BMI",
    "Triglycerides",
    "Physical Activity Days Per Week",
    "Sleep Hours Per Day",
    "Systolic",
    "Diastolic",
]

categorical_vars = [
    "Diabetes",
    "Family History",
    "Obesity",
    "Alcohol Consumption",
    "Previous Heart Problems",
    "Medication Use",
    "Cholesterol_sample_split",
    "Sex",
    "Continent",
    "Diet",
    "Hemisphere",
]

X_selected = X[continuous_vars + categorical_vars]

### Create preprocessing pipeline and train/test data

# Define preprocessing steps for continuous and categorical features
num_transformer = MinMaxScaler()
cat_transformer = OneHotEncoder(drop="first")

preproc_basic = ColumnTransformer(
    transformers=[
        ("num", num_transformer, continuous_vars),
        ("cat", cat_transformer, categorical_vars),
    ],
    remainder="passthrough",
)

# Create pipelines for SVC
logreg_pipe = make_pipeline(preproc_basic, LogisticRegression(random_state=6))

# Train-Test split
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.3, random_state=6
)

## FIT and EVALUATE pipeline with competing classification models

# Fit the pipeline
logreg_pipe.fit(X_train, y_train)

# Get cross-validated accuracy
cv_score = cross_val_score(
    logreg_pipe, X_train, y_train, cv=5, scoring="accuracy"
).mean()
print(f"Cross-validated accuracy for logreg_pipe: {cv_score}")

# Fit preprocessing on the entire dataset
X_train_preprocessed = preproc_basic.fit_transform(X_train)

# Convert the transformed data to a DataFrame
X_train_preprocessed_df = pd.DataFrame(
    X_train_preprocessed,
    columns=continuous_vars
    + list(
        preproc_basic.named_transformers_["cat"].get_feature_names_out(categorical_vars)
    ),
)

## PREPROCESS INPUT DATA

df_kaggle_test = pd.read_csv("../data/test.csv")  # read in test data provided by Kaggle

# preprocess input data
df_kaggle_test = df_kaggle_test.copy()

split_blood_pressure(df=df_kaggle_test)
split_cholesterol_sample(df=df_kaggle_test)

X_df_kaggle_test_selected = df_kaggle_test[continuous_vars + categorical_vars]

# Create SVM pipeline with best parameters
# best_params = {"C": 0.0001, "kernel": "linear", "gamma": "scale", "class_weight": None}

logreg_pipe = Pipeline(
    [
        ("preprocessor", preproc_basic),
        ("classifier", LogisticRegression(random_state=6)),
    ]
)

## FIT

# Train the SVM model on the entire preprocessed training dataset
logreg_pipe.fit(X_selected, y)

## PREDICT

# Get predicted probabilities for the training set
train_probs = logreg_pipe.predict_proba(X_train)[:, 1]

# Evaluate thresholds
thresholds = np.linspace(0.1, 0.9, 50)  # Test 50 thresholds from 0.1 to 0.9
best_threshold = None
best_score = 0.0

for threshold in thresholds:
    # Convert probabilities to binary predictions based on the threshold
    train_predictions = (train_probs > threshold).astype(int)

    # Evaluate accuracy
    score = accuracy_score(y_train, train_predictions)

    # Check if this threshold is better
    if score > best_score:
        best_score = score
        best_threshold = threshold

# Print the best threshold found
print(f"Best threshold: {best_threshold} with accuracy score: {best_score}")

# Apply the best threshold to the test set predictions
test_probs = logreg_pipe.predict_proba(X_test)[:, 1]
test_predictions = (test_probs > best_threshold).astype(int)

# Evaluate test set accuracy with the tuned threshold
test_accuracy = accuracy_score(y_test, test_predictions)
print(f"Test set accuracy with tuned threshold: {test_accuracy}")

# Predict probabilities for the Kaggle test set
kaggle_test_probs = logreg_pipe.predict_proba(X_df_kaggle_test_selected)[:, 1]

# Apply the best threshold to Kaggle test set predictions
kaggle_test_predictions = (kaggle_test_probs > best_threshold).astype(int)

# Prepare submission dataframe
df_kaggle_test = pd.read_csv("../data/test.csv")
df_kaggle_predicted_V5 = {
    "Patient ID": df_kaggle_test["Patient ID"],
    "Heart Attack Risk": kaggle_test_predictions,
}
df_kaggle_predicted_V5_logreg = pd.DataFrame(df_kaggle_predicted_V5)

# Save submission to CSV
df_kaggle_predicted_V5_logreg.to_csv(
    "../submission/df_kaggle_predicted_V5_logreg.csv", index=False
)

# Number of cases in the unseen Kaggle test set
len(df_kaggle_test)

The jupyter_black extension is already loaded. To reload it, use:
  %reload_ext jupyter_black
Cross-validated accuracy for logreg_pipe: 0.6425514510942116
Best threshold: 0.4755102040816327 with accuracy score: 0.6427552476054615
Test set accuracy with tuned threshold: 0.6428911079410367


1753

In [4]:
sum(kaggle_test_predictions)

0