In [61]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import os

In [28]:
cwd = os.path.join(os.getcwd())
input_csv = os.path.join(cwd, "../dataset/inputs.csv")
label_csv = os.path.join(cwd, "../dataset/labels.csv")

In [29]:
features = pd.read_csv(input_csv)
labels = pd.read_csv(label_csv)

In [30]:
X = features.drop(columns=['PatientID'])
y = labels['HadHeartAttack']

In [49]:
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns
print(f"There are {len(numerical_features)} numerical features:\n", numerical_features)
print(f"There are {len(categorical_features)} categorical features:\n", categorical_features)

There are 24 numerical features:
 Index(['HeightInMeters', 'WeightInKilograms', 'BMI', 'HadAngina', 'HadStroke',
       'HadAsthma', 'HadSkinCancer', 'HadCOPD', 'HadDepressiveDisorder',
       'HadKidneyDisease', 'HadArthritis', 'DeafOrHardOfHearing',
       'BlindOrVisionDifficulty', 'DifficultyConcentrating',
       'DifficultyWalking', 'DifficultyDressingBathing', 'DifficultyErrands',
       'ChestScan', 'AlcoholDrinkers', 'HIVTesting', 'FluVaxLast12',
       'PneumoVaxEver', 'HighRiskLastYear', 'CovidPos'],
      dtype='object')
There are 9 categorical features:
 Index(['State', 'Sex', 'GeneralHealth', 'AgeCategory', 'HadDiabetes',
       'SmokerStatus', 'ECigaretteUsage', 'RaceEthnicityCategory',
       'TetanusLast10Tdap'],
      dtype='object')


In [54]:
# numerical features are kept as number, categorical or one-hot-encoded
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

In [64]:
X_preprocessed = preprocessor.fit_transform(X)

In [90]:
categorical_feature_names = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features).tolist()
print(f"There are {len(categorical_feature_names)} after one-hot-encoding.")
print(f"The names of the categorical features are:\n", categorical_feature_names)

There are 95 after one-hot-encoding.
The names of the categorical features are:
 ['State_Alabama', 'State_Alaska', 'State_Arizona', 'State_Arkansas', 'State_California', 'State_Colorado', 'State_Connecticut', 'State_Delaware', 'State_District of Columbia', 'State_Florida', 'State_Georgia', 'State_Guam', 'State_Hawaii', 'State_Idaho', 'State_Illinois', 'State_Indiana', 'State_Iowa', 'State_Kansas', 'State_Kentucky', 'State_Louisiana', 'State_Maine', 'State_Maryland', 'State_Massachusetts', 'State_Michigan', 'State_Minnesota', 'State_Mississippi', 'State_Missouri', 'State_Montana', 'State_Nebraska', 'State_Nevada', 'State_New Hampshire', 'State_New Jersey', 'State_New Mexico', 'State_New York', 'State_North Carolina', 'State_North Dakota', 'State_Ohio', 'State_Oklahoma', 'State_Oregon', 'State_Pennsylvania', 'State_Puerto Rico', 'State_Rhode Island', 'State_South Carolina', 'State_South Dakota', 'State_Tennessee', 'State_Texas', 'State_Utah', 'State_Vermont', 'State_Virgin Islands', 'Sta

In [84]:
# Apply PCA
pca = PCA()
X_pca = pca.fit_transform(X_preprocessed)

# Calculate cumulative explained variance
cumulative_variance = pca.explained_variance_ratio_.cumsum()

# Determine the number of components to capture 80% variance
n_components_80 = next(i for i, v in enumerate(cumulative_variance) if v >= 0.80) + 1
print(f"Number of components to retain 80% variance: {n_components_80}")

# Reduce to the top n_components_80 components
pca = PCA(n_components=n_components_80)
X_reduced = pca.fit_transform(X_preprocessed)


Number of components to retain 80% variance: 22


In [88]:
# Get feature names from the preprocessor
numerical_feature_names = numerical_features.tolist()
categorical_feature_names = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features).tolist()
all_feature_names = numerical_feature_names + categorical_feature_names

# Create a DataFrame for PCA loadings
loading_matrix = pd.DataFrame(
    pca.components_,
    columns=all_feature_names,
    index=[f'PC{i+1}' for i in range(pca.n_components_)]
)

top_features = set()

for i in range(n_components_80):
    top_features.update(
        loading_matrix.iloc[i].abs().sort_values(ascending=False).head(5).index
    )

print(f"Selected features: {list(top_features)}")


Selected features: ['DifficultyErrands', 'HadArthritis', 'HIVTesting', 'Sex_Female', 'HadAsthma', 'DifficultyConcentrating', 'HeightInMeters', 'HadSkinCancer', 'HadCOPD', 'BlindOrVisionDifficulty', 'Sex_Male', 'HadKidneyDisease', 'HadDepressiveDisorder', 'ChestScan', 'DifficultyDressingBathing', 'BMI', 'AlcoholDrinkers', 'HighRiskLastYear', 'HadAngina', 'CovidPos', 'PneumoVaxEver', 'DeafOrHardOfHearing', 'WeightInKilograms', 'FluVaxLast12', 'DifficultyWalking', 'HadStroke']


In [89]:
# List of original categorical columns (from your data)
original_categorical_features = categorical_features.tolist()

# Create a mapping of one-hot-encoded feature names back to original column names
def map_to_original_feature(feature_name):
    for cat_feature in original_categorical_features:
        if feature_name.startswith(cat_feature + "_"):
            return cat_feature
    return feature_name  # Return numerical features or already non-encoded names

# Map the top features back to their original names
mapped_features = list(set(map(map_to_original_feature, top_features)))
print(mapped_features)
print(len(mapped_features))

['DifficultyErrands', 'HadArthritis', 'HIVTesting', 'HadAsthma', 'DifficultyConcentrating', 'HeightInMeters', 'HadSkinCancer', 'HadCOPD', 'BlindOrVisionDifficulty', 'HadKidneyDisease', 'HadDepressiveDisorder', 'ChestScan', 'DifficultyDressingBathing', 'Sex', 'BMI', 'AlcoholDrinkers', 'HighRiskLastYear', 'HadAngina', 'CovidPos', 'PneumoVaxEver', 'DeafOrHardOfHearing', 'WeightInKilograms', 'FluVaxLast12', 'DifficultyWalking', 'HadStroke']
25


In [None]:
# test using random forest classifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Using the components for 80% variance
print(f"Number of components: {n_components_80}")

# Use the optimal number of components for training and testing
pca = PCA(n_components=n_components_80)
X_reduced_final = pca.fit_transform(X_preprocessed)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_reduced_final, y, test_size=0.2, random_state=42)

# Train the final model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate the final model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Final model accuracy with {n_components_80} components: {accuracy:.2f}")

Number of components: 22
