In [1]:
import pandas as pd
import numpy as np
import matplotlib
from matplotlib import pylab as plt

# Load the raw 2022 heart-disease survey extract and report its dimensions.
df = pd.read_csv('data/heart_2022.csv')
n_rows, n_cols = df.shape  # shape is (rows, columns)
print("Number of Columns: ", n_cols)
print("Number of Rows: ", n_rows)

Number of Columns:  40
Number of Rows:  246022


In [2]:
from sklearn.model_selection import train_test_split

# Separate the label column from the predictors.
target_col = 'HadHeartAttack'
y = df[target_col]
X = df.drop(columns=target_col)  # every column except the target


In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, MinMaxScaler

# Ordinal features mapped to their category levels, listed in the order the
# encoder should number them (first level -> 0). Keeping feature and levels
# side by side guarantees the two derived lists below stay aligned.
ordinal_levels = {
    'GeneralHealth': ['Poor', 'Fair', 'Good', 'Very good', 'Excellent'],
    'LastCheckupTime': [
        '5 or more years ago',
        'Within past 5 years (2 years but less than 5 years ago)',
        'Within past 2 years (1 year but less than 2 years ago)',
        'Within past year (anytime less than 12 months ago)',
    ],
    'RemovedTeeth': ['All', '6 or more, but not all', '1 to 5', 'None of them'],
    'SmokerStatus': [
        'Current smoker - now smokes every day',
        'Current smoker - now smokes some days',
        'Former smoker',
        'Never smoked',
    ],
    'ECigaretteUsage': [
        'Use them every day',
        'Use them some days',
        'Not at all (right now)',
        'Never used e-cigarettes in my entire life',
    ],
    'AgeCategory': [
        'Age 18 to 24', 'Age 25 to 29', 'Age 30 to 34', 'Age 35 to 39',
        'Age 40 to 44', 'Age 45 to 49', 'Age 50 to 54', 'Age 55 to 59',
        'Age 60 to 64', 'Age 65 to 69', 'Age 70 to 74', 'Age 75 to 79',
        'Age 80 or older',
    ],
}
ordinal_ftrs = list(ordinal_levels)           # feature names, in dict order
ordinal_cats = list(ordinal_levels.values())  # matching level lists

# Continuous features that get rescaled with MinMaxScaler
minmax_ftrs = [
    'PhysicalHealthDays',
    'MentalHealthDays',
    'SleepHours',
    'HeightInMeters',
    'WeightInKilograms',
    'BMI',
]

# Binary features: object-dtype columns with exactly two distinct values,
# excluding the target itself.
text_columns = [name for name in df.columns if df[name].dtype == 'object']
binary_columns = [
    name for name in text_columns
    if name != 'HadHeartAttack' and df[name].nunique() == 2
]

# Everything not claimed by one of the groups above (and not the target)
# falls through to one-hot encoding.
already_assigned = (
    set(binary_columns) | set(ordinal_ftrs) | set(minmax_ftrs) | {'HadHeartAttack'}
)
remaining_onehot_ftrs = [name for name in df.columns if name not in already_assigned]

# One transformer per feature group; the order here fixes the column order of
# the transformed matrix.
binary_encoder = OrdinalEncoder()
ordinal_encoder = OrdinalEncoder(categories=ordinal_cats)
onehot_encoder = OneHotEncoder(handle_unknown='ignore')
minmax_scaler = MinMaxScaler()

preprocessor_fixed = ColumnTransformer(
    transformers=[
        ('binary', binary_encoder, binary_columns),
        ('ord', ordinal_encoder, ordinal_ftrs),
        ('onehot', onehot_encoder, remaining_onehot_ftrs),
        ('minmax', minmax_scaler, minmax_ftrs),
    ]
)

# Pipeline wrapper holding only the preprocessing step (no estimator attached)
clf_fixed = Pipeline([('preprocessor', preprocessor_fixed)])


In [1]:
from sklearn.model_selection import ParameterGrid
import numpy as np

# Number of repetitions to preview
nr_states = 5

# Show the seed derived from each repetition index (42 * index)
seeds = [42 * idx for idx in range(nr_states)]
for seed in seeds:
    print('Random state', seed)


Random state 0
Random state 42
Random state 84
Random state 126
Random state 168


In [5]:
from sklearn.svm import SVC
from sklearn.model_selection import ParameterGrid, train_test_split
from sklearn.metrics import fbeta_score
import numpy as np

# Grid search for an SVC, repeated over several random train/val/test splits.
# Relies on `df` (raw data frame) and `clf_fixed` (preprocessing pipeline)
# created by earlier cells.
y_target = df['HadHeartAttack']
X_features = df.loc[:, df.columns != 'HadHeartAttack']

# Reduce the dataset to a stratified 20% subsample of its original size
X_features, _, y_target, _ = train_test_split(
    X_features, y_target, train_size=0.2, stratify=y_target, random_state=42
)


nr_states = 5
test_scores = np.zeros(nr_states)
final_models = []

# The hyperparameter grid does not depend on the random state, so it is built
# once instead of on every iteration.
param_grid = {
    'C': [1e-2, 1e-1, 1e0, 1e1, 1e2],  # Regularization parameter
    'gamma': ['scale', 'auto']
}
search_grid = ParameterGrid(param_grid)

# String labels -> integer codes expected by the binary F2 computation
target_codes = {'No': 0, 'Yes': 1}

# Loop through the different random states
for i in range(nr_states):
    print('Random state', i + 1)
    split_seed = 42 * i  # seed shared by both splits and the SVC below

    # First split: 60% training, 40% held out
    X_train, X_other, y_train, y_other = train_test_split(
        X_features, y_target, train_size=0.6, stratify=y_target, random_state=split_seed
    )

    # Second split: held-out part divided evenly into validation and test
    X_val, X_test, y_val, y_test = train_test_split(
        X_other, y_other, train_size=0.5, stratify=y_other, random_state=split_seed
    )

    # Fit the preprocessing on the training split only, then apply it to the
    # validation and test splits.
    X_train_prep = clf_fixed.fit_transform(X_train)
    X_val_prep = clf_fixed.transform(X_val)
    X_test_prep = clf_fixed.transform(X_test)

    # Encode the target variable
    y_train_encoded = y_train.map(target_codes)
    y_val_encoded = y_val.map(target_codes)
    y_test_encoded = y_test.map(target_codes)

    # Train and validation scores, one slot per grid point
    train_score = np.zeros(len(search_grid))
    val_score = np.zeros(len(search_grid))
    # `models[p]` always corresponds to grid point p; a failed fit is stored as
    # None so the list never falls out of step with `val_score`.
    models = []

    # Loop through all hyperparameter combinations
    for p, params in enumerate(search_grid):
        print('   ', params)

        try:
            # NOTE(review): probability=True adds an internal cross-validated
            # calibration step that `.predict` does not need. It is kept in
            # case the saved models are later used with `predict_proba`;
            # drop it if they are not. random_state makes that step repeatable.
            clf_svc = SVC(
                **params,
                probability=True,
                class_weight='balanced',
                random_state=split_seed,
            )
            clf_svc.fit(X_train_prep, y_train_encoded)  # Fit the model

            # Calculate train and validation F2 scores
            y_train_pred = clf_svc.predict(X_train_prep)
            train_score[p] = fbeta_score(
                y_train_encoded, y_train_pred, beta=2, average='binary'
            )
            y_val_pred = clf_svc.predict(X_val_prep)
            val_score[p] = fbeta_score(
                y_val_encoded, y_val_pred, beta=2, average='binary'
            )
            print('   Train F2:', train_score[p], 'Validation F2:', val_score[p])

        except ValueError as e:
            print(f"Skipped params due to error: {params}\nError: {e}")
            train_score[p] = -1  # Mark invalid combinations
            val_score[p] = -1
            clf_svc = None  # placeholder keeps `models` aligned with the grid

        models.append(clf_svc)

    # Pick the grid point with the highest validation F2 score
    best_idx = int(np.argmax(val_score))
    if models[best_idx] is None:
        # Only reachable when every combination failed to fit/score
        raise RuntimeError(
            f"No SVC configuration could be fitted for random state {i + 1}"
        )
    print('Best model parameters:', search_grid[best_idx])
    print('Corresponding validation F2 score:', np.max(val_score))

    # Collect and save the best model
    final_models.append(models[best_idx])

    # Calculate and save the test F2 score
    y_test_pred = final_models[-1].predict(X_test_prep)
    test_scores[i] = fbeta_score(
        y_test_encoded, y_test_pred, beta=2, average='binary'
    )
    print('Test F2 score:', test_scores[i])

# Print the average test F2 score across all random states
print('Average Test F2 Score:', np.mean(test_scores))



Random state 1
    {'C': 0.01, 'gamma': 'scale'}
   Train F2: 0.4594017094017094 Validation F2: 0.4355400696864111
    {'C': 0.01, 'gamma': 'auto'}
   Train F2: 0.4561248126447745 Validation F2: 0.43354943273906
    {'C': 0.1, 'gamma': 'scale'}
   Train F2: 0.5155529953917051 Validation F2: 0.4905335628227194
    {'C': 0.1, 'gamma': 'auto'}
   Train F2: 0.5149162239476911 Validation F2: 0.48851978505129456
    {'C': 1.0, 'gamma': 'scale'}
   Train F2: 0.5462822458270106 Validation F2: 0.4941713127217435
    {'C': 1.0, 'gamma': 'auto'}
   Train F2: 0.5400919305413687 Validation F2: 0.4992339121552605
    {'C': 10.0, 'gamma': 'scale'}
   Train F2: 0.6342697609492235 Validation F2: 0.48332019963225636
    {'C': 10.0, 'gamma': 'auto'}
   Train F2: 0.6055674518201285 Validation F2: 0.4858404780462458
    {'C': 100.0, 'gamma': 'scale'}
   Train F2: 0.8121390211774242 Validation F2: 0.4213397999393756
    {'C': 100.0, 'gamma': 'auto'}
   Train F2: 0.7603864734299517 Validation F2: 0.434027777

In [11]:
import numpy as np

# Hard-coded F2 scores, one per repetition
svc_array = np.array([
    0.5171974522292994,
    0.5061609084319884,
    0.5031367628607277,
    0.5092824887104868,
    0.5073547743704812,
])

# Summary statistics (std uses NumPy's default ddof=0, i.e. population std)
mean = svc_array.mean()
std_dev = svc_array.std()

print(f"Support Vector Classifier Mean: {mean}")
print(f"Support Vector Classifier Standard Deviation: {std_dev}")


Support Vector Classifier Mean: 0.5086264773205966
Support Vector Classifier Standard Deviation: 0.004727053733944148


In [2]:
# Baseline F2 score for a trivial classifier that predicts the POSITIVE class
# for every row. That is what the values below encode: no false negatives, and
# every negative row counted as a false positive. (Predicting the majority /
# negative class for every row would instead give zero true positives, hence
# recall 0 and F2 0.)
n_total = 246022     # total number of rows in the dataset
n_positive = 13435   # presumably the number of 'Yes' rows — confirm against df

# Confusion-matrix entries expressed as fractions of all rows
false_positives = (n_total - n_positive) / n_total
false_negatives = 0
true_positives = n_positive / n_total

# Precision equals the positive-class prevalence; recall is 1 because nothing
# is ever predicted negative.
precision_baseline = true_positives / (true_positives + false_positives)
recall_baseline = true_positives / (true_positives + false_negatives)

# F-beta with beta = 2 weights recall more heavily than precision
beta = 2
f2_baseline = (1 + beta**2) * (precision_baseline * recall_baseline) / (
    beta**2 * precision_baseline + recall_baseline
)

# Display the results as the cell output
precision_baseline, recall_baseline, f2_baseline



(0.054608937412101356, 1.0, 0.22409444826228805)

#### 