In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.decomposition import PCA
import os

In [2]:
# Create a directory for saving visualizations
os.makedirs('visualizations', exist_ok=True)

# Function to save figures
def save_figure(fig, filename):
    fig.savefig(os.path.join('visualizations', filename))
    plt.close(fig)

In [3]:
# 1. Read the data
df = pd.read_csv(r'C:\Users\iamaa\OneDrive\Documents\GitHub\heartDiseaseRiskAssessment\heart_2022_with_nans.csv')

In [4]:
print("Data Shape:", df.shape)

Data Shape: (445132, 40)


In [5]:
print("\nData Info:")
df.info()


Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 445132 entries, 0 to 445131
Data columns (total 40 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   State                      445132 non-null  object 
 1   Sex                        445132 non-null  object 
 2   GeneralHealth              443934 non-null  object 
 3   PhysicalHealthDays         434205 non-null  float64
 4   MentalHealthDays           436065 non-null  float64
 5   LastCheckupTime            436824 non-null  object 
 6   PhysicalActivities         444039 non-null  object 
 7   SleepHours                 439679 non-null  float64
 8   RemovedTeeth               433772 non-null  object 
 9   HadHeartAttack             442067 non-null  object 
 10  HadAngina                  440727 non-null  object 
 11  HadStroke                  443575 non-null  object 
 12  HadAsthma                  443359 non-null  object 
 13  HadSkinCancer    

In [6]:
print("\nData Description:")
print(df.describe())


Data Description:
       PhysicalHealthDays  MentalHealthDays     SleepHours  HeightInMeters  \
count       434205.000000     436065.000000  439679.000000   416480.000000   
mean             4.347919          4.382649       7.022983        1.702691   
std              8.688912          8.387475       1.502425        0.107177   
min              0.000000          0.000000       1.000000        0.910000   
25%              0.000000          0.000000       6.000000        1.630000   
50%              0.000000          0.000000       7.000000        1.700000   
75%              3.000000          5.000000       8.000000        1.780000   
max             30.000000         30.000000      24.000000        2.410000   

       WeightInKilograms            BMI  
count      403054.000000  396326.000000  
mean           83.074470      28.529842  
std            21.448173       6.554889  
min            22.680000      12.020000  
25%            68.040000      24.130000  
50%            80.740000  

In [7]:
print("\nMissing Values:")
print(df.isnull().sum())


Missing Values:
State                            0
Sex                              0
GeneralHealth                 1198
PhysicalHealthDays           10927
MentalHealthDays              9067
LastCheckupTime               8308
PhysicalActivities            1093
SleepHours                    5453
RemovedTeeth                 11360
HadHeartAttack                3065
HadAngina                     4405
HadStroke                     1557
HadAsthma                     1773
HadSkinCancer                 3143
HadCOPD                       2219
HadDepressiveDisorder         2812
HadKidneyDisease              1926
HadArthritis                  2633
HadDiabetes                   1087
DeafOrHardOfHearing          20647
BlindOrVisionDifficulty      21564
DifficultyConcentrating      24240
DifficultyWalking            24012
DifficultyDressingBathing    23915
DifficultyErrands            25656
SmokerStatus                 35462
ECigaretteUsage              35660
ChestScan                    56046
Rac

In [8]:
# Print column names and data types
print("\nColumn Names and Data Types:")
print(df.dtypes)


Column Names and Data Types:
State                         object
Sex                           object
GeneralHealth                 object
PhysicalHealthDays           float64
MentalHealthDays             float64
LastCheckupTime               object
PhysicalActivities            object
SleepHours                   float64
RemovedTeeth                  object
HadHeartAttack                object
HadAngina                     object
HadStroke                     object
HadAsthma                     object
HadSkinCancer                 object
HadCOPD                       object
HadDepressiveDisorder         object
HadKidneyDisease              object
HadArthritis                  object
HadDiabetes                   object
DeafOrHardOfHearing           object
BlindOrVisionDifficulty       object
DifficultyConcentrating       object
DifficultyWalking             object
DifficultyDressingBathing     object
DifficultyErrands             object
SmokerStatus                  object
ECigaret

In [9]:
# Identify the target variable
target_variable = 'HadHeartAttack'
print(f"\nTarget Variable '{target_variable}' Data Type:", df[target_variable].dtype)
print(f"\nUnique values in '{target_variable}':", df[target_variable].unique())


Target Variable 'HadHeartAttack' Data Type: object

Unique values in 'HadHeartAttack': ['No' 'Yes' nan]


In [10]:
# 2. Visualize original data
# Correlation heatmap for numeric features
numeric_features = df.select_dtypes(include=[np.number]).columns
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(df[numeric_features].corr(), annot=False, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Correlation Heatmap of Numerical Features')
save_figure(fig, 'correlation_heatmap_original.png')
plt.show()

In [11]:
# Distribution of target variable
fig, ax = plt.subplots(figsize=(8, 6))
df[target_variable].value_counts().plot(kind='bar', ax=ax)
ax.set_title(f'Distribution of {target_variable} Cases')
ax.set_xlabel(target_variable)
ax.set_ylabel('Count')
save_figure(fig, 'target_distribution.png')
plt.show()

In [12]:
# 3. Preprocess the data
# Identify numeric and categorical columns
numeric_features = df.select_dtypes(include=[np.number]).columns
categorical_features = df.select_dtypes(include=['object']).columns

# Ensure target variable is not in feature lists
if target_variable in numeric_features:
    numeric_features = numeric_features.drop(target_variable)
if target_variable in categorical_features:
    categorical_features = categorical_features.drop(target_variable)

# Create preprocessing pipelines
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', RobustScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create a preprocessing pipeline
preprocessing_pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit and transform the data
X = df.drop(target_variable, axis=1)
y = df[target_variable]

# Convert target variable to numeric if it's categorical
if y.dtype == 'object':
    y = pd.get_dummies(y, drop_first=True).iloc[:, 0]

X_preprocessed = preprocessing_pipeline.fit_transform(X)

# Get feature names after preprocessing
onehot_encoder = preprocessor.named_transformers_['cat'].named_steps['onehot']
cat_feature_names = onehot_encoder.get_feature_names_out(categorical_features)
feature_names = list(numeric_features) + list(cat_feature_names)

# Convert to DataFrame for easier handling
df_preprocessed = pd.DataFrame(X_preprocessed.toarray(), columns=feature_names)
df_preprocessed[target_variable] = y

# Display the column names (attributes) used in the post-processed data
print("Attributes used in the post-processed data:")
df_preprocessed


Attributes used in the post-processed data:


Unnamed: 0,PhysicalHealthDays,MentalHealthDays,SleepHours,HeightInMeters,WeightInKilograms,BMI,State_Alabama,State_Alaska,State_Arizona,State_Arkansas,...,"TetanusLast10Tdap_Yes, received tetanus shot, but not Tdap",TetanusLast10Tdap_missing,HighRiskLastYear_No,HighRiskLastYear_Yes,HighRiskLastYear_missing,CovidPos_No,CovidPos_Tested positive using home test without a health professional,CovidPos_Yes,CovidPos_missing,HadHeartAttack
0,0.0,0.000000,0.5,0.017937,0.061559,0.036340,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,False
1,0.0,0.000000,-0.5,-0.666667,-0.588159,-0.260606,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,False
2,0.5,0.684517,-1.0,-0.866667,-0.784356,-0.406061,1.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,False
3,0.0,0.000000,0.0,-0.333333,-0.784356,-0.756061,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,False
4,0.5,0.000000,1.0,-0.866667,-1.195765,-0.987879,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
445127,0.0,0.684517,-0.5,-0.333333,-0.509939,-0.403030,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,False
445128,0.5,0.456345,0.0,0.000000,0.058773,0.056061,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,False
445129,7.5,6.845175,-1.0,0.000000,-1.372083,-1.675758,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,False
445130,0.0,0.000000,-1.0,0.866667,1.175886,0.645455,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,True


In [13]:
# 4. Visualize preprocessed data
# Correlation heatmap of preprocessed data
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(df_preprocessed.corr(), annot=False, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Correlation Heatmap of Preprocessed Features')
save_figure(fig, 'correlation_heatmap_preprocessed.png')
plt.show()

In [14]:
# 5. Split the data
X_train, X_test, y_train, y_test = train_test_split(df_preprocessed.drop(target_variable, axis=1), df_preprocessed[target_variable], test_size=0.25, random_state=45)

In [15]:
# 6. Train and evaluate models
# Logistic Regression
lr_model = LogisticRegression(random_state=42)
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)
lr_accuracy = accuracy_score(y_test, lr_pred)

print("\nLogistic Regression Results:")
print("Accuracy:", lr_accuracy)
print("\nClassification Report:")
print(classification_report(y_test, lr_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, lr_pred))


Logistic Regression Results:
Accuracy: 0.9461463116558684

Classification Report:
              precision    recall  f1-score   support

       False       0.96      0.99      0.97    104989
        True       0.55      0.25      0.34      6294

    accuracy                           0.95    111283
   macro avg       0.75      0.62      0.66    111283
weighted avg       0.93      0.95      0.94    111283


Confusion Matrix:
[[103729   1260]
 [  4733   1561]]


In [16]:
# Random Forest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_pred)

print("\nRandom Forest Results:")
print("Accuracy:", rf_accuracy)
print("\nClassification Report:")
print(classification_report(y_test, rf_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, rf_pred))


Random Forest Results:
Accuracy: 0.9464967694975872

Classification Report:
              precision    recall  f1-score   support

       False       0.95      0.99      0.97    104989
        True       0.59      0.17      0.26      6294

    accuracy                           0.95    111283
   macro avg       0.77      0.58      0.62    111283
weighted avg       0.93      0.95      0.93    111283


Confusion Matrix:
[[104261    728]
 [  5226   1068]]


In [17]:
# Compare feature importance (for Random Forest)
feature_importance = pd.DataFrame({'feature': X_train.columns, 'importance': rf_model.feature_importances_})
feature_importance = feature_importance.sort_values('importance', ascending=False)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=feature_importance.head(20), ax=ax)
ax.set_title('Top 20 Feature Importance (Random Forest)')
save_figure(fig, 'feature_importance.png')
plt.show()

In [18]:
# Apply PCA after preprocessing
n_components = 10  # Adjust the number of components or set to None to keep all components with variance explained
pca = PCA(n_components=n_components)

# Fit PCA on the preprocessed data (excluding target variable)
X_pca = pca.fit_transform(df_preprocessed.drop(target_variable, axis=1))

# Display explained variance ratios for each component
print("\nExplained Variance Ratios by PCA Components:")
print(pca.explained_variance_ratio_)



Explained Variance Ratios by PCA Components:
[0.23229727 0.1133136  0.06505329 0.06240663 0.03913342 0.03052855
 0.02282342 0.02160606 0.01834263 0.01602356]

PCA-Transformed Data (first few rows):
        PC1       PC2       PC3       PC4       PC5       PC6       PC7  \
0 -1.482430 -0.337281  0.016872 -0.551503 -0.656689 -0.312154  0.198048   
1 -1.541383 -0.212033 -0.864492 -0.749205 -0.373818 -1.195581 -1.010707   
2 -0.854133  0.188091 -1.149555 -0.443943 -0.112956 -1.365193 -0.692408   
3 -1.412349 -0.435456 -1.168436 -0.568296 -1.490527  0.858088 -0.381260   
4 -1.218443 -0.553269 -1.798268 -0.755508 -0.587452 -0.020735  0.391304   

        PC8       PC9      PC10  HadHeartAttack  
0  0.897981 -0.578438 -0.023269           False  
1  0.984390 -0.920374 -0.573150           False  
2 -0.280108 -0.801381 -0.141610           False  
3 -0.093023 -0.294754  0.123212           False  
4  0.697714  0.386159 -0.346730           False  

Logistic Regression Results (PCA):
Accuracy: 0.94

In [None]:
# Convert PCA results to DataFrame
df_pca = pd.DataFrame(X_pca, columns=[f'PC{i+1}' for i in range(X_pca.shape[1])])
df_pca[target_variable] = df_preprocessed[target_variable]

print("\nPCA-Transformed Data (first few rows):")
print(df_pca.head())

# Split the PCA-transformed data
X_train_pca, X_test_pca, y_train, y_test = train_test_split(df_pca.drop(target_variable, axis=1), df_pca[target_variable], test_size=0.25, random_state=45)

In [None]:
# Train and evaluate models using PCA-transformed data
# Logistic Regression
lr_model_pca = LogisticRegression(random_state=42)
lr_model_pca.fit(X_train_pca, y_train)
lr_pred_pca = lr_model_pca.predict(X_test_pca)
lr_accuracy_pca = accuracy_score(y_test, lr_pred_pca)

print("\nLogistic Regression Results (PCA):")
print("Accuracy:", lr_accuracy_pca)
print("\nClassification Report:")
print(classification_report(y_test, lr_pred_pca))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, lr_pred_pca))

In [None]:
# Random Forest using PCA-transformed data
rf_model_pca = RandomForestClassifier(random_state=42)
rf_model_pca.fit(X_train_pca, y_train)
rf_pred_pca = rf_model_pca.predict(X_test_pca)
rf_accuracy_pca = accuracy_score(y_test, rf_pred_pca)

print("\nRandom Forest Results (PCA):")
print("Accuracy:", rf_accuracy_pca)
print("\nClassification Report:")
print(classification_report(y_test, rf_pred_pca))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, rf_pred_pca))

In [None]:
# Visualize explained variance ratios of PCA components
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=[f'PC{i+1}' for i in range(len(pca.explained_variance_ratio_))], y=pca.explained_variance_ratio_, ax=ax)
ax.set_title('Explained Variance Ratio by PCA Components')
ax.set_ylabel('Explained Variance Ratio')
save_figure(fig, 'pca_explained_variance.png')
plt.show()