1. [Train Test Split](#Train-Test-Split)
2. [Feature Engineering](#Feature-Engineering)
   - [Feature Scaling](#Feature-Scaling)
   - [Feature Encoding](#Feature-Encoding)
   - [Missing Data](#Missing-Data)

In [4]:
import pandas as pd
from ycimpute.imputer import knnimput
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import train_test_split

from warnings import filterwarnings
filterwarnings('ignore')


In [5]:
heart_diases = pd.read_csv("../data/raw/heart_disease.csv")
df = heart_diases.copy()

## Train Test Split 

In [6]:
X = df.drop(["Heart Disease Status"] , axis = 1)
y = df["Heart Disease Status"]
X_train , X_test,y_train , y_test = train_test_split(X , y , test_size = 0.20 , random_state = 42)

In [7]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(8000, 20) (2000, 20) (8000,) (2000,)


In [8]:
X_train.head(2)

Unnamed: 0,Age,Gender,Blood Pressure,Cholesterol Level,Exercise Habits,Smoking,Family Heart Disease,Diabetes,BMI,High Blood Pressure,Low HDL Cholesterol,High LDL Cholesterol,Alcohol Consumption,Stress Level,Sleep Hours,Sugar Consumption,Triglyceride Level,Fasting Blood Sugar,CRP Level,Homocysteine Level
9254,78.0,Male,140.0,289.0,High,Yes,Yes,No,25.086206,Yes,Yes,No,Low,High,9.024681,Medium,222.0,137.0,14.481422,11.320909
1561,64.0,Male,143.0,203.0,Low,Yes,Yes,Yes,23.791361,Yes,Yes,No,High,Medium,6.543419,Medium,188.0,111.0,6.485267,15.761419


In [9]:
X_test.head(2)

Unnamed: 0,Age,Gender,Blood Pressure,Cholesterol Level,Exercise Habits,Smoking,Family Heart Disease,Diabetes,BMI,High Blood Pressure,Low HDL Cholesterol,High LDL Cholesterol,Alcohol Consumption,Stress Level,Sleep Hours,Sugar Consumption,Triglyceride Level,Fasting Blood Sugar,CRP Level,Homocysteine Level
6252,38.0,Male,169.0,156.0,Medium,Yes,No,Yes,35.303549,No,No,Yes,Medium,High,4.213721,Medium,321.0,158.0,9.778347,17.400056
4684,77.0,Female,174.0,280.0,Medium,Yes,Yes,No,29.792178,Yes,No,Yes,,Medium,5.081128,Medium,312.0,80.0,14.370456,8.72418


In [10]:
y_train.head()

9254    Yes
1561     No
1670     No
6087     No
6669     No
Name: Heart Disease Status, dtype: object

In [11]:
y_test.head()

6252    No
4684    No
1731    No
4742    No
4521    No
Name: Heart Disease Status, dtype: object

In [12]:
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))

Heart Disease Status
No     0.798375
Yes    0.201625
Name: proportion, dtype: float64
Heart Disease Status
No     0.8065
Yes    0.1935
Name: proportion, dtype: float64


## Feature Engineering

### Feature Scaling

In [13]:
numeric_cols = X_train.select_dtypes(include=["float64", "int64"]).columns.tolist()

scaler = StandardScaler()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

### Feature Encoding 

In [14]:
categorical_cols = X_train.select_dtypes(include=["object"]).columns.tolist()
pref = ["Gen","Exercise","Smok","Family","Dia","HighB","LowHDL","HighLDL","Alch","Stress","Sugar"]

X_train = pd.get_dummies(data = X_train ,columns =  categorical_cols, prefix = pref,drop_first = True)
X_test = pd.get_dummies(data = X_test ,columns = categorical_cols ,prefix = pref,drop_first = True)

bool_columns = X_train.select_dtypes(include = "bool").columns
X_train[bool_columns] = X_train[bool_columns].astype(int)

bool_columns = X_test.select_dtypes(include = "bool").columns
X_test[bool_columns] = X_test[bool_columns].astype(int)

In [15]:
X_train.head(2)

Unnamed: 0,Age,Blood Pressure,Cholesterol Level,BMI,Sleep Hours,Triglyceride Level,Fasting Blood Sugar,CRP Level,Homocysteine Level,Gen_Male,...,Dia_Yes,HighB_Yes,LowHDL_Yes,HighLDL_Yes,Alch_Low,Alch_Medium,Stress_Low,Stress_Medium,Sugar_Low,Sugar_Medium
9254,1.587617,-0.553458,1.460896,-0.627637,1.165436,-0.32961,0.722793,1.617575,-0.260051,1,...,0,1,1,0,1,0,0,0,0,1
1561,0.817468,-0.382678,-0.510937,-0.832741,-0.249128,-0.718604,-0.379149,-0.225335,0.764507,1,...,1,1,1,0,0,0,0,1,0,1


In [16]:
X_test.head(2)

Unnamed: 0,Age,Blood Pressure,Cholesterol Level,BMI,Sleep Hours,Triglyceride Level,Fasting Blood Sugar,CRP Level,Homocysteine Level,Gen_Male,...,Dia_Yes,HighB_Yes,LowHDL_Yes,HighLDL_Yes,Alch_Low,Alch_Medium,Stress_Low,Stress_Medium,Sugar_Low,Sugar_Medium
6252,-0.612808,1.097412,-1.588567,0.990795,-1.577285,0.803048,1.612823,0.533636,1.14259,1,...,1,0,0,1,0,1,0,0,0,1
4684,1.532606,1.382044,1.254541,0.117791,-1.082777,0.700079,-1.693003,1.592,-0.859194,0,...,0,1,0,1,0,0,0,1,0,1


### Missing Data

In [17]:
X_train.isnull().sum()

Age                    25
Blood Pressure         16
Cholesterol Level      25
BMI                    16
Sleep Hours            22
Triglyceride Level     22
Fasting Blood Sugar    12
CRP Level              18
Homocysteine Level     16
Gen_Male                0
Exercise_Low            0
Exercise_Medium         0
Smok_Yes                0
Family_Yes              0
Dia_Yes                 0
HighB_Yes               0
LowHDL_Yes              0
HighLDL_Yes             0
Alch_Low                0
Alch_Medium             0
Stress_Low              0
Stress_Medium           0
Sugar_Low               0
Sugar_Medium            0
dtype: int64

In [18]:
(X_train.isnull().sum()/ len(df)) * 100

Age                    0.25
Blood Pressure         0.16
Cholesterol Level      0.25
BMI                    0.16
Sleep Hours            0.22
Triglyceride Level     0.22
Fasting Blood Sugar    0.12
CRP Level              0.18
Homocysteine Level     0.16
Gen_Male               0.00
Exercise_Low           0.00
Exercise_Medium        0.00
Smok_Yes               0.00
Family_Yes             0.00
Dia_Yes                0.00
HighB_Yes              0.00
LowHDL_Yes             0.00
HighLDL_Yes            0.00
Alch_Low               0.00
Alch_Medium            0.00
Stress_Low             0.00
Stress_Medium          0.00
Sugar_Low              0.00
Sugar_Medium           0.00
dtype: float64

In [19]:
X_test.isnull().sum()

Age                     4
Blood Pressure          3
Cholesterol Level       5
BMI                     6
Sleep Hours             3
Triglyceride Level      4
Fasting Blood Sugar    10
CRP Level               8
Homocysteine Level      4
Gen_Male                0
Exercise_Low            0
Exercise_Medium         0
Smok_Yes                0
Family_Yes              0
Dia_Yes                 0
HighB_Yes               0
LowHDL_Yes              0
HighLDL_Yes             0
Alch_Low                0
Alch_Medium             0
Stress_Low              0
Stress_Medium           0
Sugar_Low               0
Sugar_Medium            0
dtype: int64

In [20]:
(X_test.isnull().sum()/ len(df)) * 100

Age                    0.04
Blood Pressure         0.03
Cholesterol Level      0.05
BMI                    0.06
Sleep Hours            0.03
Triglyceride Level     0.04
Fasting Blood Sugar    0.10
CRP Level              0.08
Homocysteine Level     0.04
Gen_Male               0.00
Exercise_Low           0.00
Exercise_Medium        0.00
Smok_Yes               0.00
Family_Yes             0.00
Dia_Yes                0.00
HighB_Yes              0.00
LowHDL_Yes             0.00
HighLDL_Yes            0.00
Alch_Low               0.00
Alch_Medium            0.00
Stress_Low             0.00
Stress_Medium          0.00
Sugar_Low              0.00
Sugar_Medium           0.00
dtype: float64

In [21]:
(df.isnull().sum()/ len(df)) * 100

Age                      0.29
Gender                   0.19
Blood Pressure           0.19
Cholesterol Level        0.30
Exercise Habits          0.25
Smoking                  0.25
Family Heart Disease     0.21
Diabetes                 0.30
BMI                      0.22
High Blood Pressure      0.26
Low HDL Cholesterol      0.25
High LDL Cholesterol     0.26
Alcohol Consumption     25.86
Stress Level             0.22
Sleep Hours              0.25
Sugar Consumption        0.30
Triglyceride Level       0.26
Fasting Blood Sugar      0.22
CRP Level                0.26
Homocysteine Level       0.20
Heart Disease Status     0.00
dtype: float64

**Overview of Missing Data**:

A detailed analysis of missing data was conducted on the dataset. The findings indicate that:

* The missing data percentage ranges between 0.19% and 0.30% for most features.

* The 'Alcohol Consumption' column has a significantly higher missing data percentage (25.86%), which is approximately one-fourth of the dataset.

* No evident correlation was found between missing values and other variables.

* There are no outliers in the dataset that could distort the missing data imputation process.

**Strategy for Handling Missing Data**:

1. Dropping 'Alcohol Consumption' Column:

* The missing rate is excessively high (~26%), making it impractical to impute.

* Retaining this column and applying imputation could introduce significant biases.

* Therefore, the best approach is to remove this column from the dataset.

2. Imputing Missing Values for Other Features:

* Given the low missing percentages (0.12% - 0.25%), imputation is preferred over deletion.

* KNN Imputer will be used for missing value imputation because:

    * The dataset has no outliers, making KNN a suitable method.

    * Methods like mean/median imputation could distort feature distributions.

    * Regression-based imputation might be excessive for a medium-sized dataset.

    * KNN Imputer leverages feature similarity to generate realistic missing value replacements.

**Final Decision**:

* 'Alcohol Consumption' will be removed from the dataset. KNN Imputer will be used to fill missing values in all other columns.

* This strategy ensures data integrity while minimizing information loss, thus maintaining the quality of the dataset for machine learning applications.

In [22]:
X_train= X_train.drop(["Alch_Low","Alch_Medium"] , axis = 1)
X_test= X_test.drop(["Alch_Low","Alch_Medium"], axis = 1)

In [23]:
X_train_filled = knnimput.KNN(k= 5).complete(X_train.values)
X_train = pd.DataFrame(X_train_filled ,columns = X_train.columns)

X_test_filled = knnimput.KNN(k= 5).complete(X_test.values)
X_test = pd.DataFrame(X_test_filled ,columns = X_train.columns)

Imputing row 1/8000 with 0 missing, elapsed time: 8.952
Imputing row 101/8000 with 0 missing, elapsed time: 8.952
Imputing row 201/8000 with 0 missing, elapsed time: 8.953
Imputing row 301/8000 with 0 missing, elapsed time: 8.953
Imputing row 401/8000 with 0 missing, elapsed time: 8.953
Imputing row 501/8000 with 0 missing, elapsed time: 8.953
Imputing row 601/8000 with 0 missing, elapsed time: 8.953
Imputing row 701/8000 with 0 missing, elapsed time: 8.953
Imputing row 801/8000 with 0 missing, elapsed time: 8.954
Imputing row 901/8000 with 0 missing, elapsed time: 8.954
Imputing row 1001/8000 with 0 missing, elapsed time: 8.954
Imputing row 1101/8000 with 0 missing, elapsed time: 8.954
Imputing row 1201/8000 with 0 missing, elapsed time: 8.954
Imputing row 1301/8000 with 0 missing, elapsed time: 8.954
Imputing row 1401/8000 with 0 missing, elapsed time: 8.954
Imputing row 1501/8000 with 0 missing, elapsed time: 8.955
Imputing row 1601/8000 with 0 missing, elapsed time: 8.955
Imputing 

In [24]:
print(X_test.isnull().sum())
print("----------------------------------------")
print(X_train.isnull().sum())

Age                    0
Blood Pressure         0
Cholesterol Level      0
BMI                    0
Sleep Hours            0
Triglyceride Level     0
Fasting Blood Sugar    0
CRP Level              0
Homocysteine Level     0
Gen_Male               0
Exercise_Low           0
Exercise_Medium        0
Smok_Yes               0
Family_Yes             0
Dia_Yes                0
HighB_Yes              0
LowHDL_Yes             0
HighLDL_Yes            0
Stress_Low             0
Stress_Medium          0
Sugar_Low              0
Sugar_Medium           0
dtype: int64
----------------------------------------
Age                    0
Blood Pressure         0
Cholesterol Level      0
BMI                    0
Sleep Hours            0
Triglyceride Level     0
Fasting Blood Sugar    0
CRP Level              0
Homocysteine Level     0
Gen_Male               0
Exercise_Low           0
Exercise_Medium        0
Smok_Yes               0
Family_Yes             0
Dia_Yes                0
HighB_Yes            

In [26]:
X_train.to_csv("../data/proccessed/X_train.csv", index=False)
X_test.to_csv("../data/proccessed/X_test.csv", index=False)
y_train.to_csv("../data/proccessed/y_train.csv", index=False)
y_test.to_csv("../data/proccessed/y_test.csv", index=False)