In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.preprocessing import LabelEncoder

In [2]:
# Read the cleaned survey extract that every later cell transforms.
cleaned_csv_path = './dataset/Heart_cleaned.csv'
df = pd.read_csv(cleaned_csv_path)

In [3]:
pd.set_option('display.max_columns', None)  # No limit on number of columns
pd.set_option('display.width', None)        # No width limit for wrapping
pd.set_option('display.max_rows', 500)     # No limit on number of rows (if needed)
pd.set_option('future.no_silent_downcasting', True)  # Opt-in to future behavior

In [4]:
# Show the dtype of every column as loaded, before any feature engineering.
df.dtypes

State                                           object
Sex                                             object
GeneralHealth                                   object
PhysicalHealthDays                             float64
MentalHealthDays                               float64
LastCheckupTime                                 object
PhysicalActivities                              object
SleepHours                                     float64
RemovedTeeth                                    object
HadHeartAttack                                  object
HadAngina                                       object
HadStroke                                       object
HadAsthma                                       object
HadSkinCancer                                   object
HadCOPD                                         object
HadDepressiveDisorder                           object
HadKidneyDisease                                object
HadArthritis                                    object
HadDiabete

## Feature engineering

Chronic conditions + impairments + Comorbidity

In [5]:
# ------------------------------ Shared yes/no encoder ------------------------------
def _encode_yes_no(frame, columns, answer_codes):
    """Return ``frame[columns]`` with survey answers replaced by integer codes.

    Every value is converted to a string, stripped and lower-cased, then looked
    up in ``answer_codes``. The strings '0' and '1' are accepted as well, so
    re-running the cell on columns that are already encoded leaves them as they
    are.

    An answer with no entry in the lookup becomes missing rather than being
    left behind as text. When that happens a warning names the affected
    columns and the result uses the nullable ``Int64`` dtype; when every answer
    is recognised the result is cast to plain ``int``.
    """
    import warnings

    lookup = {'0': 0, '1': 1, **answer_codes}
    encoded = frame[columns].apply(
        lambda col: col.astype(str).str.strip().str.lower().map(lookup)
    )

    unmapped_columns = [name for name in columns if encoded[name].isna().any()]
    if unmapped_columns:
        warnings.warn(f"Unrecognised answers left as missing in: {unmapped_columns}")
        return encoded.astype('Int64')
    return encoded.astype(int)


# ------------------------------ Chronic Conditions ------------------------------
chronic_conditions = ['HadHeartAttack', 'HadStroke', 'HadAngina', 'HadAsthma', 'HadSkinCancer', 
                      'HadCOPD', 'HadDiabetes', 'HadDepressiveDisorder', 'HadArthritis', 'HadKidneyDisease']

# Only a plain 'yes' counts as having the condition; the two qualified
# diabetes answers are coded 0.
df[chronic_conditions] = _encode_yes_no(
    df,
    chronic_conditions,
    {'yes': 1, 'no': 0,
     'no, pre-diabetes or borderline diabetes': 0,
     'yes, but only during pregnancy (female)': 0},
)

# Number of chronic conditions reported per row
df['ChronicConditionCount'] = df[chronic_conditions].sum(axis=1)


# ------------------------------------ Impairments ------------------------------------
impairments = ['DeafOrHardOfHearing', 'BlindOrVisionDifficulty', 'DifficultyWalking', 'DifficultyDressingBathing', 'DifficultyErrands', 'DifficultyConcentrating']

df[impairments] = _encode_yes_no(df, impairments, {'yes': 1, 'no': 0})

# Number of impairments reported per row
df['ImpairmentCount'] = df[impairments].sum(axis=1)


# ------------------------------------ Comorbidity ------------------------------------
# Chronic conditions and impairments counted together
comorbidity_count = chronic_conditions + impairments
df['ComorbidityCount'] = df[comorbidity_count].sum(axis=1)


Sex, Age, Race Classifications

In [6]:
# ------------------------------ create AgeCategory_numeric ------------------------------
# Age bands listed youngest to oldest; a band's 1-based position is its numeric code.
age_band_labels = [
    'age 18 to 24', 'age 25 to 29', 'age 30 to 34', 'age 35 to 39', 'age 40 to 44',
    'age 45 to 49', 'age 50 to 54', 'age 55 to 59', 'age 60 to 64', 'age 65 to 69',
    'age 70 to 74', 'age 75 to 79', 'age 80 or older',
]
age_band_codes = {label: position for position, label in enumerate(age_band_labels, start=1)}
df['AgeCategory_numeric'] = df['AgeCategory'].map(age_band_codes)

# Define the function to classify age categories based on the numeric value
def map_age_status(age_score):
    """Translate a numeric age-band code into a coarse life-stage label.

    Codes 1-2 give 'young', 3-6 give 'middle-aged' and 7-13 give
    'older-adult'. Any other value gives 'unknown'.
    """
    life_stages = (
        ('young', (1, 2)),
        ('middle-aged', (3, 4, 5, 6)),
        ('older-adult', (7, 8, 9, 10, 11, 12, 13)),
    )
    for stage_label, band_codes in life_stages:
        if age_score in band_codes:
            return stage_label
    return 'unknown'

# Coarse life-stage label for each row, derived from the numeric age band
df['AgeCategory_Classification'] = df['AgeCategory_numeric'].apply(map_age_status)

# Ordinal code for the life-stage label ('unknown' has no entry in this lookup)
life_stage_codes = {'young': 1, 'middle-aged': 2, 'older-adult': 3}
df['AgeCategory_Classification_numeric'] = df['AgeCategory_Classification'].map(life_stage_codes)

# Only after the labels above are derived is the numeric age band turned into
# an ordered categorical.
df['AgeCategory_numeric'] = pd.Categorical(df['AgeCategory_numeric'], ordered=True)
# TODO: check this again



# ------------------------------ create Sex_numeric ------------------------------
sex_codes = {'male': 1, 'female': 2}
df['Sex_numeric'] = df['Sex'].map(sex_codes)



# ------------------------------ create RaceEthnicityCategory_numeric ------------------------------
race_ethnicity_codes = {
    'white': 1,
    'black': 2,
    'multiracial': 3,
    'others': 4,
    'hispanic': 5,
}
df['RaceEthnicityCategory_numeric'] = df['RaceEthnicityCategory'].map(race_ethnicity_codes)

Misc Classifications + Numerics

In [7]:
# ------------------------------ create BMI_classification ------------------------------
def classify_calculated_bmi(bmi):
    """Bucket a BMI value into one of this notebook's five weight-class labels.

    Bands (lower bound inclusive, upper bound exclusive):
        bmi < 15           -> 'anorexic'
        15   <= bmi < 18.5 -> 'underweight'
        18.5 <= bmi < 25   -> 'normal weight'
        25   <= bmi < 35   -> 'overweight'
        bmi >= 35          -> 'morbid obesity'

    A missing BMI returns NaN so the label stays missing.
    """
    # NaN fails every comparison below, so without this guard it would fall
    # through to the last band and be labelled 'morbid obesity'.
    if pd.isna(bmi):
        return np.nan

    if bmi < 15:
        return 'anorexic'
    if bmi < 18.5:
        return 'underweight'
    if bmi < 25:
        return 'normal weight'
    if bmi < 35:
        return 'overweight'
    return 'morbid obesity'

# Weight-class label per row
df['CalculatedBMI_Classification'] = df['CalculatedBMI'].apply(classify_calculated_bmi)

# Integer code per weight-class label
bmi_class_codes = {
    'anorexic': 0,
    'underweight': 1,
    'normal weight': 2,
    'overweight': 3,
    'morbid obesity': 4,
}
df['CalculatedBMI_Classification_numeric'] = df['CalculatedBMI_Classification'].map(bmi_class_codes)


# ------------------------------ create PhysicalHealthDays_classification ------------------------------
def classify_physical_health_days(physicalhealthdays):
    """Label a count of poor-physical-health days.

    Bands:
        0            -> 'excellent'
        (0, 3]       -> 'good'
        (3, 7]       -> 'fair'
        (7, 14]      -> 'poor'
        anything else -> 'very poor'

    The bands are contiguous, so a fractional value such as 3.5 goes to the
    band that contains it. Whole-number inputs get the same label as with the
    earlier integer-only bands. A missing value returns NaN.
    """
    # NaN fails every comparison below; without this guard it would be
    # labelled 'very poor'.
    if pd.isna(physicalhealthdays):
        return np.nan

    if physicalhealthdays == 0:
        return 'excellent'
    if 0 < physicalhealthdays <= 3:
        return 'good'
    if 3 < physicalhealthdays <= 7:
        return 'fair'
    if 7 < physicalhealthdays <= 14:
        return 'poor'
    # Values above 14 (and negative values) end up here.
    return 'very poor'

# Label per row
df['PhysicalHealthDays_Classification'] = df['PhysicalHealthDays'].apply(classify_physical_health_days)

# Numeric score per label.
# NOTE(review): the scores are not evenly spaced (0, 1, 2, 8, 10) - confirm this weighting is intended.
physical_health_scores = {'excellent': 0, 'good': 1, 'fair': 2, 'poor': 8, 'very poor': 10}
df['PhysicalHealthDays_Classification_numeric'] = (
    df['PhysicalHealthDays_Classification'].map(physical_health_scores)
)


# ------------------------------ create MentalHealthDays_classification ------------------------------
def classify_mental_health_days(mentalhealthdays):
    """Label a count of poor-mental-health days.

    Bands:
        0            -> 'excellent'
        (0, 3]       -> 'good'
        (3, 7]       -> 'fair'
        (7, 14]      -> 'poor'
        anything else -> 'very poor'

    The bands are contiguous, so a fractional value such as 3.5 goes to the
    band that contains it. Whole-number inputs get the same label as with the
    earlier integer-only bands. A missing value returns NaN.
    """
    # NaN fails every comparison below; without this guard it would be
    # labelled 'very poor'.
    if pd.isna(mentalhealthdays):
        return np.nan

    if mentalhealthdays == 0:
        return 'excellent'
    if 0 < mentalhealthdays <= 3:
        return 'good'
    if 3 < mentalhealthdays <= 7:
        return 'fair'
    if 7 < mentalhealthdays <= 14:
        return 'poor'
    # Values above 14 (and negative values) end up here.
    return 'very poor'

# Label per row
df['MentalHealthDays_Classification'] = df['MentalHealthDays'].apply(classify_mental_health_days)

# Numeric score per label.
# NOTE(review): the scores are not evenly spaced (0, 1, 2, 8, 10) - confirm this weighting is intended.
mental_health_scores = {'excellent': 0, 'good': 1, 'fair': 2, 'poor': 8, 'very poor': 10}
df['MentalHealthDays_Classification_numeric'] = (
    df['MentalHealthDays_Classification'].map(mental_health_scores)
)

# ------------------------------ create SleepHour_classification ------------------------------
# create SleepHour_Classification
def classify_sleephour(sleephour, comorbidity_count):
    """Label nightly sleep duration, taking the comorbidity count into account.

    Bands:
        sleephour < 4        -> 'critically low'
        4 <= sleephour < 7   -> 'low'
        7 <= sleephour <= 9  -> 'optimal'
        9 < sleephour <= 12  -> 'optimal' when comorbidity_count > 2,
                                otherwise 'excessive'
        sleephour > 12       -> 'excessive'

    The conditional band starts right above 9, so a fractional value between
    9 and 10 follows the same comorbidity rule as 10-12 hours. A missing
    ``sleephour`` returns NaN so the label stays missing.
    """
    # NaN fails every comparison below; without this guard it would be
    # labelled 'excessive'.
    if pd.isna(sleephour):
        return np.nan

    if sleephour < 4:
        return 'critically low'
    if sleephour < 7:
        return 'low'
    if sleephour <= 9:
        return 'optimal'
    # Above 9 hours: only up to 12 hours, and only with more than two
    # comorbidities, is the duration still labelled optimal.
    if sleephour <= 12 and comorbidity_count > 2:
        return 'optimal'
    return 'excessive'


def _sleep_label_for_row(row):
    """Classify one row's sleep duration using that row's comorbidity count."""
    return classify_sleephour(row['SleepHours'], row['ComorbidityCount'])


df['SleepHours_Classification'] = df.apply(_sleep_label_for_row, axis=1)

# Integer code per sleep label
sleep_label_codes = {'critically low': 0, 'low': 1, 'optimal': 2, 'excessive': 3}
df['SleepHours_Classification_numeric'] = df['SleepHours_Classification'].map(sleep_label_codes)



# ------------------------------ LastCheckupTime ------------------------------
# Category orders, lowest level first. A value's position in its list becomes its code.
LastCheckupTime_order = [
    'within past year (anytime less than 12 months ago)',
    'within past 2 years (1 year but less than 2 years ago)',
    'within past 5 years (2 years but less than 5 years ago)',
    '5 or more years ago',
]

# ------------------------------ RemovedTeeth ------------------------------
RemovedTeeth_order = ['none of them', '1 to 5', '6 or more, but not all', 'all']

# ------------------------------ SmokerStatus ------------------------------
SmokerStatus_order = [
    'never smoked',
    'former smoker',
    'current smoker - now smokes some days',
    'current smoker - now smokes every day',
]

# ------------------------------ ECigaretteUsage ------------------------------
ECigaretteUsage_order = [
    'never used e-cigarettes in my entire life',
    'not at all (right now)',
    'use them some days',
    'use them every day',
]

# ------------------------------ TetanusLast10Tdap ------------------------------
TetanusLast10Tdap_order = [
    'no, did not receive any tetanus shot in the past 10 years',
    'yes, received tetanus shot but not sure what type',
    'yes, received tetanus shot, but not tdap',
    'yes, received tdap',
]

# Replace each column with the integer codes of an ordered categorical built
# from its list above. Values that are not in the list get the code -1.
ordinal_level_orders = {
    'LastCheckupTime': LastCheckupTime_order,
    'RemovedTeeth': RemovedTeeth_order,
    'SmokerStatus': SmokerStatus_order,
    'ECigaretteUsage': ECigaretteUsage_order,
    'TetanusLast10Tdap': TetanusLast10Tdap_order,
}
for ordinal_column, ordered_levels in ordinal_level_orders.items():
    df[ordinal_column] = pd.Categorical(
        df[ordinal_column], categories=ordered_levels, ordered=True
    ).codes




# ---------------------------------------- Misc ----------------------------------------
# Define the order for 'Yes'/'No' columns
# 'no' -> 0, 'yes' -> 1; any other value gets the code -1.
yes_no_order = ['no', 'yes']

yes_no_columns = ['AlcoholDrinkers', 'HIVTesting', 'FluVaxLast12', 'PneumoVaxEver', 'HighRiskLastYear', 'CovidPos']

# ChestScan and PhysicalActivities get exactly the same treatment as the listed columns.
for flag_column in yes_no_columns + ['ChestScan', 'PhysicalActivities']:
    normalized_answers = df[flag_column].str.strip().str.lower()
    df[flag_column] = pd.Categorical(
        normalized_answers, categories=yes_no_order, ordered=True
    ).codes


In [8]:
# Sanity check: how many rows ended up without a sleep label?
sleep_labels = df['SleepHours_Classification']
missing_values = sleep_labels.isna().sum()
print(f"Number of missing values in 'SleepHours_Classification': {missing_values}")

Number of missing values in 'SleepHours_Classification': 0


GeneralHealth Category adjustments.

In [9]:
# ------------------------------ create simplified_GeneralHealth ------------------------------
# Five-level self-rated health as a 1-5 score
general_health_scores = {'poor': 1, 'fair': 2, 'good': 3, 'very good': 4, 'excellent': 5}
df['GeneralHealth_numeric'] = df['GeneralHealth'].map(general_health_scores)

# Scores 1-2 (poor, fair) collapse to 'below average'; 3-5 to 'above average'.
generalhealth_mapping = {
    score: ('below average' if score <= 2 else 'above average')
    for score in general_health_scores.values()
}
df['GeneralHealth_Classification'] = df['GeneralHealth_numeric'].map(generalhealth_mapping)

# Binary flag for the collapsed label
general_health_flags = {'above average': 1, 'below average': 0}
df['GeneralHealth_Classification_numeric'] = df['GeneralHealth_Classification'].map(general_health_flags)

In [10]:
# Show the column dtypes again now that the encoded and derived columns exist.
df.dtypes

State                                            object
Sex                                              object
GeneralHealth                                    object
PhysicalHealthDays                              float64
MentalHealthDays                                float64
LastCheckupTime                                    int8
PhysicalActivities                                 int8
SleepHours                                      float64
RemovedTeeth                                       int8
HadHeartAttack                                    int64
HadAngina                                         int64
HadStroke                                         int64
HadAsthma                                         int64
HadSkinCancer                                     int64
HadCOPD                                           int64
HadDepressiveDisorder                             int64
HadKidneyDisease                                  int64
HadArthritis                                    

In [11]:
# Write the engineered frame to disk without the index column.
preprocessed_csv_path = './dataset/Heart_preprocessed.csv'
df.to_csv(preprocessed_csv_path, index=False)