In [1]:
# Setup for a LightGBM (LGBM) classifier: imports and raw-data loading.
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import *
from lightgbm import LGBMClassifier, early_stopping
import matplotlib.pyplot as plt


# Load the raw dataset once; later cells work on copies of `origin`.
DATA_FILE = 'heart_2022_with_nans.csv'
origin = pd.read_csv(DATA_FILE)

In [2]:
# Multicollinearity screen: one-hot encode the categorical columns
# (pd.get_dummies) and list every pair of distinct features whose absolute
# correlation is at least CORR_THRESHOLD.
# Finding: BMI and WeightInKilograms are highly correlated. However, training
# the model with and without WeightInKilograms showed little difference in
# model performance or permutation importance.
CORR_THRESHOLD = 0.7

# 'State' is left out of the screen. drop() already returns a new frame, so
# `origin` is not modified and no explicit copy is needed.
df_corr = pd.get_dummies(origin.drop(columns='State'), drop_first=True)
corr_matrix = df_corr.corr()

# Long format: one row per (Feature_1, Feature_2) cell of the matrix. The
# matrix is symmetric, so each pair appears once in each order.
corr_pairs = corr_matrix.abs().stack().reset_index()
corr_pairs.columns = ['Feature_1', 'Feature_2', 'Correlation']

# Exclude the diagonal by name instead of testing `Correlation != 1`: a value
# test would also hide two *different* features that are perfectly collinear,
# and it depends on the self-correlation being exactly 1.0 in floating point.
is_distinct_pair = corr_pairs['Feature_1'] != corr_pairs['Feature_2']
corr_pairs[is_distinct_pair & (corr_pairs['Correlation'] >= CORR_THRESHOLD)]

Unnamed: 0,Feature_1,Feature_2,Correlation
281,WeightInKilograms,BMI,0.859557
349,BMI,WeightInKilograms,0.859557


In [3]:
# Share of each target value over all rows; dropna=False keeps the NaN share
# visible. The target is imbalanced and contains NaN.
origin.loc[:, 'HadHeartAttack'].value_counts(dropna=False, normalize=True)

No     0.936709
Yes    0.056406
NaN    0.006886
Name: HadHeartAttack, dtype: float64

In [4]:
# Show every column name of the raw frame as a plain Python list.
list(origin.columns)

['State',
 'Sex',
 'GeneralHealth',
 'PhysicalHealthDays',
 'MentalHealthDays',
 'LastCheckupTime',
 'PhysicalActivities',
 'SleepHours',
 'RemovedTeeth',
 'HadHeartAttack',
 'HadAngina',
 'HadStroke',
 'HadAsthma',
 'HadSkinCancer',
 'HadCOPD',
 'HadDepressiveDisorder',
 'HadKidneyDisease',
 'HadArthritis',
 'HadDiabetes',
 'DeafOrHardOfHearing',
 'BlindOrVisionDifficulty',
 'DifficultyConcentrating',
 'DifficultyWalking',
 'DifficultyDressingBathing',
 'DifficultyErrands',
 'SmokerStatus',
 'ECigaretteUsage',
 'ChestScan',
 'RaceEthnicityCategory',
 'AgeCategory',
 'HeightInMeters',
 'WeightInKilograms',
 'BMI',
 'AlcoholDrinkers',
 'HIVTesting',
 'FluVaxLast12',
 'PneumoVaxEver',
 'TetanusLast10Tdap',
 'HighRiskLastYear',
 'CovidPos']

In [5]:
# Separate the predictors from the target and group the predictor names by
# dtype: object columns are treated as categorical, all others as numerical.
target = 'HadHeartAttack'
df = origin.copy()  # work on a copy so `origin` stays untouched
X = df.drop(columns=[target])
categoryFeatures = list(X.select_dtypes(include=['object']).columns)
numericalFeatures = list(X.select_dtypes(exclude=['object']).columns)

In [6]:
# Reorder the columns: all categorical features first, then the numerical ones.
X = X.loc[:, categoryFeatures + numericalFeatures]

In [7]:
# Print the dtype and non-null count of every predictor column.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 445132 entries, 0 to 445131
Data columns (total 39 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   State                      445132 non-null  object 
 1   Sex                        445132 non-null  object 
 2   GeneralHealth              443934 non-null  object 
 3   LastCheckupTime            436824 non-null  object 
 4   PhysicalActivities         444039 non-null  object 
 5   RemovedTeeth               433772 non-null  object 
 6   HadAngina                  440727 non-null  object 
 7   HadStroke                  443575 non-null  object 
 8   HadAsthma                  443359 non-null  object 
 9   HadSkinCancer              441989 non-null  object 
 10  HadCOPD                    442913 non-null  object 
 11  HadDepressiveDisorder      442320 non-null  object 
 12  HadKidneyDisease           443206 non-null  object 
 13  HadArthritis               44

In [8]:
# Number of missing values in PhysicalHealthDays. The column is selected by
# label: with the categorical-first column order it sits at position 33, but a
# label stays correct if the column order or the feature set changes.
X['PhysicalHealthDays'].isna().sum()

10927

In [9]:
# Summary statistics for BMI, selected by label rather than by its position
# (38, the last column) so the cell does not depend on the column order.
X['BMI'].describe()

count    396326.000000
mean         28.529842
std           6.554889
min          12.020000
25%          24.130000
50%          27.440000
75%          31.750000
max          99.640000
Name: BMI, dtype: float64

In [10]:
# Relative frequency of each distinct BMI value (NaN included), ordered by
# value. BMI is selected by label rather than by position 38 so the cell does
# not depend on the column order.
X['BMI'].value_counts(normalize=True, dropna=False).sort_index()

12.02    0.000002
12.05    0.000002
12.06    0.000002
12.11    0.000007
12.15    0.000002
           ...   
97.43    0.000002
97.65    0.000011
99.34    0.000002
99.64    0.000002
NaN      0.109644
Name: BMI, Length: 3986, dtype: float64

In [11]:
# Display the names of the non-object (numerical) predictor columns.
numericalFeatures

['PhysicalHealthDays',
 'MentalHealthDays',
 'SleepHours',
 'HeightInMeters',
 'WeightInKilograms',
 'BMI']