# Heart Failure Clinical Records Data Pre-processing

We analyzed the **Heart Failure Clinical Records Data** to gather insights about the missing data (if any), handle the missing values and cleaning it.

In [1]:
# Import necessary libraries
import pandas as pd

In [80]:
# Load the heart failure clinical records dataset
heart_data = pd.read_csv('../data/external/heart_failure_clinical_records_dataset.csv', low_memory=False)

In [81]:
# Preview the first few rows of the dataset
print("\nHeart Failure Clinical Records Data Preview:")
heart_data.head()


Heart Failure Clinical Records Data Preview:


Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [None]:
# Summary statistics for numerical columns
print("\nHeart Failure Clinical Records Data Description:")
heart_data.describe()


Heart Failure Clinical Records Data Description:


Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
count,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0
mean,60.833893,0.431438,581.839465,0.41806,38.083612,0.351171,263358.029264,1.39388,136.625418,0.648829,0.32107,130.26087,0.32107
std,11.894809,0.496107,970.287881,0.494067,11.834841,0.478136,97804.236869,1.03451,4.412477,0.478136,0.46767,77.614208,0.46767
min,40.0,0.0,23.0,0.0,14.0,0.0,25100.0,0.5,113.0,0.0,0.0,4.0,0.0
25%,51.0,0.0,116.5,0.0,30.0,0.0,212500.0,0.9,134.0,0.0,0.0,73.0,0.0
50%,60.0,0.0,250.0,0.0,38.0,0.0,262000.0,1.1,137.0,1.0,0.0,115.0,0.0
75%,70.0,1.0,582.0,1.0,45.0,1.0,303500.0,1.4,140.0,1.0,1.0,203.0,1.0
max,95.0,1.0,7861.0,1.0,80.0,1.0,850000.0,9.4,148.0,1.0,1.0,285.0,1.0


In [83]:
# Check the structure of the dataset
print("\nHeart Failure Clinical Records Data Info:")
heart_data.info()


Heart Failure Clinical Records Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       299 non-null    float64
 1   anaemia                   299 non-null    int64  
 2   creatinine_phosphokinase  299 non-null    int64  
 3   diabetes                  299 non-null    int64  
 4   ejection_fraction         299 non-null    int64  
 5   high_blood_pressure       299 non-null    int64  
 6   platelets                 299 non-null    float64
 7   serum_creatinine          299 non-null    float64
 8   serum_sodium              299 non-null    int64  
 9   sex                       299 non-null    int64  
 10  smoking                   299 non-null    int64  
 11  time                      299 non-null    int64  
 12  DEATH_EVENT               299 non-null    int64  
dtypes: float64(3), int64(1

In [84]:
# Checking for missing values in each column
heart_data_missing_values = heart_data.isnull().sum()

In [85]:
# Percentage of missing values in each column
heart_data_missing_percentage = (heart_data_missing_values / len(heart_data)) * 100

In [86]:
# Display missing values and their percentages
print("\nMissing Values in Heart Failure Clinical Records Dataset:")
heart_data_missing_values


Missing Values in Heart Failure Clinical Records Dataset:


age                         0
anaemia                     0
creatinine_phosphokinase    0
diabetes                    0
ejection_fraction           0
high_blood_pressure         0
platelets                   0
serum_creatinine            0
serum_sodium                0
sex                         0
smoking                     0
time                        0
DEATH_EVENT                 0
dtype: int64

In [87]:
print("\nMissing Percentage in Heart Failure Clinical Records Dataset:")
heart_data_missing_percentage


Missing Percentage in Heart Failure Clinical Records Dataset:


age                         0.0
anaemia                     0.0
creatinine_phosphokinase    0.0
diabetes                    0.0
ejection_fraction           0.0
high_blood_pressure         0.0
platelets                   0.0
serum_creatinine            0.0
serum_sodium                0.0
sex                         0.0
smoking                     0.0
time                        0.0
DEATH_EVENT                 0.0
dtype: float64

In [88]:
# Option 1: Dropping rows with missing values
heart_data_dropped = heart_data.dropna()

# Shape after dropping rows with missing values
print("Shape of Heart Data after dropping missing values:", heart_data_dropped.shape)

Shape of Heart Data after dropping missing values: (299, 13)


In [89]:
# Option 2: Filling missing values with mean for numeric columns and mode for categorical columns

heart_data_filled_mean = heart_data.fillna(heart_data.mean(numeric_only=True)) # Fill numeric columns with mean
heart_data_filled_mode = heart_data.apply(lambda x: x.fillna(x.mode()[0]) if x.dtype == 'O' else x) # Fill categorical columns with mode

# Display first few rows after filling missing values
print("Heart Failure Clinical Records Data after filling missing values (mean/mode):")
heart_data_filled_mean.head()

Heart Failure Clinical Records Data after filling missing values (mean/mode):


Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [90]:
# Use forward fill to propagate the last valid observation forward
heart_data_ffill = heart_data.ffill()

# Use backward fill to propagate the next valid observation backward
heart_data_bfill = heart_data.bfill()

# Display the first few rows of the DataFrame after backward filling
print("Heart Failure Clinical Records Data after Backward Fill:")
heart_data_bfill.head()

Heart Failure Clinical Records Data after Backward Fill:


Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [91]:
# Filter out non-numeric columns for correlation calculation
heart_data_numeric = heart_data.select_dtypes(include=[float, int])

# Print original correlation (before filling missing values)
print("\nHeart Failure Clinical Records Data Original Correlation:")
heart_data_numeric.corr()


Heart Failure Clinical Records Data Original Correlation:


Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
age,1.0,0.088006,-0.081584,-0.101012,0.060098,0.093289,-0.052354,0.159187,-0.045966,0.06543,0.018668,-0.224068,0.253729
anaemia,0.088006,1.0,-0.190741,-0.012729,0.031557,0.038182,-0.043786,0.052174,0.041882,-0.094769,-0.10729,-0.141414,0.06627
creatinine_phosphokinase,-0.081584,-0.190741,1.0,-0.009639,-0.04408,-0.07059,0.024463,-0.016408,0.05955,0.079791,0.002421,-0.009346,0.062728
diabetes,-0.101012,-0.012729,-0.009639,1.0,-0.00485,-0.012732,0.092193,-0.046975,-0.089551,-0.15773,-0.147173,0.033726,-0.001943
ejection_fraction,0.060098,0.031557,-0.04408,-0.00485,1.0,0.024445,0.072177,-0.011302,0.175902,-0.148386,-0.067315,0.041729,-0.268603
high_blood_pressure,0.093289,0.038182,-0.07059,-0.012732,0.024445,1.0,0.049963,-0.004935,0.037109,-0.104615,-0.055711,-0.196439,0.079351
platelets,-0.052354,-0.043786,0.024463,0.092193,0.072177,0.049963,1.0,-0.041198,0.062125,-0.12512,0.028234,0.010514,-0.049139
serum_creatinine,0.159187,0.052174,-0.016408,-0.046975,-0.011302,-0.004935,-0.041198,1.0,-0.189095,0.00697,-0.027414,-0.149315,0.294278
serum_sodium,-0.045966,0.041882,0.05955,-0.089551,0.175902,0.037109,0.062125,-0.189095,1.0,-0.027566,0.004813,0.08764,-0.195204
sex,0.06543,-0.094769,0.079791,-0.15773,-0.148386,-0.104615,-0.12512,0.00697,-0.027566,1.0,0.445892,-0.015608,-0.004316


In [92]:
# Fill missing values in the numeric data (if any) with the mean
heart_data_filled_mean = heart_data_numeric.fillna(heart_data_numeric.mean())

# Print correlation after filling missing values
print("\nHeart Failure Clinical Records Data After Filling Missing Values (Mean) Correlation:")
heart_data_filled_mean.corr()


Heart Failure Clinical Records Data After Filling Missing Values (Mean) Correlation:


Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
age,1.0,0.088006,-0.081584,-0.101012,0.060098,0.093289,-0.052354,0.159187,-0.045966,0.06543,0.018668,-0.224068,0.253729
anaemia,0.088006,1.0,-0.190741,-0.012729,0.031557,0.038182,-0.043786,0.052174,0.041882,-0.094769,-0.10729,-0.141414,0.06627
creatinine_phosphokinase,-0.081584,-0.190741,1.0,-0.009639,-0.04408,-0.07059,0.024463,-0.016408,0.05955,0.079791,0.002421,-0.009346,0.062728
diabetes,-0.101012,-0.012729,-0.009639,1.0,-0.00485,-0.012732,0.092193,-0.046975,-0.089551,-0.15773,-0.147173,0.033726,-0.001943
ejection_fraction,0.060098,0.031557,-0.04408,-0.00485,1.0,0.024445,0.072177,-0.011302,0.175902,-0.148386,-0.067315,0.041729,-0.268603
high_blood_pressure,0.093289,0.038182,-0.07059,-0.012732,0.024445,1.0,0.049963,-0.004935,0.037109,-0.104615,-0.055711,-0.196439,0.079351
platelets,-0.052354,-0.043786,0.024463,0.092193,0.072177,0.049963,1.0,-0.041198,0.062125,-0.12512,0.028234,0.010514,-0.049139
serum_creatinine,0.159187,0.052174,-0.016408,-0.046975,-0.011302,-0.004935,-0.041198,1.0,-0.189095,0.00697,-0.027414,-0.149315,0.294278
serum_sodium,-0.045966,0.041882,0.05955,-0.089551,0.175902,0.037109,0.062125,-0.189095,1.0,-0.027566,0.004813,0.08764,-0.195204
sex,0.06543,-0.094769,0.079791,-0.15773,-0.148386,-0.104615,-0.12512,0.00697,-0.027566,1.0,0.445892,-0.015608,-0.004316


In [None]:
# Print standard deviation before filling missing values
print("\nHeart Data Standard Deviation Before:")
heart_data_numeric.std(numeric_only=True)


Heart Data Standard Deviation Before:


age                            11.894809
anaemia                         0.496107
creatinine_phosphokinase      970.287881
diabetes                        0.494067
ejection_fraction              11.834841
high_blood_pressure             0.478136
platelets                   97804.236869
serum_creatinine                1.034510
serum_sodium                    4.412477
sex                             0.478136
smoking                         0.467670
time                           77.614208
DEATH_EVENT                     0.467670
dtype: float64

In [94]:
# Print standard deviation after filling missing values
print("\nHeart Data Standard Deviation After Filling Missing Values (Mean):")
heart_data_filled_mean.std(numeric_only=True)


Heart Data Standard Deviation After Filling Missing Values (Mean):


age                            11.894809
anaemia                         0.496107
creatinine_phosphokinase      970.287881
diabetes                        0.494067
ejection_fraction              11.834841
high_blood_pressure             0.478136
platelets                   97804.236869
serum_creatinine                1.034510
serum_sodium                    4.412477
sex                             0.478136
smoking                         0.467670
time                           77.614208
DEATH_EVENT                     0.467670
dtype: float64

In [17]:
import pandas as pd
import numpy as np

# Load the heart failure dataset
heart_data = pd.read_csv('../data/external/heart_failure_clinical_records_dataset.csv', low_memory=False)

# Display the first few rows
print("Heart Failure Clinical Records Dataset:")
heart_data.head()

Heart Failure Clinical Records Dataset:


Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [18]:
# Introducing null values at random positions in numeric columns
numeric_cols = heart_data.select_dtypes(include='number').columns
for col in numeric_cols:
    heart_data.loc[np.random.choice(heart_data.index, size=5, replace=False), col] = np.nan

# Introducing random negative values in numeric columns
for col in numeric_cols:
    neg_indices = np.random.choice(heart_data.index, size=3, replace=False)
    heart_data.loc[neg_indices, col] = -np.abs(heart_data.loc[neg_indices, col])

# Save the modified dataset for further use
file_path = '../data/processed/heart_failure_dataset_with_errors.csv'
heart_data.to_csv(file_path, index=False)

# Display the first few rows of the dataset with null and negative values
print("\nHeart Dataset with Introduced Errors:")
heart_data.head()


Heart Dataset with Introduced Errors:


Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0.0,582.0,0.0,20.0,1.0,265000.0,1.9,130.0,1.0,0.0,4.0,1.0
1,55.0,0.0,7861.0,0.0,38.0,0.0,263358.03,1.1,136.0,1.0,0.0,6.0,1.0
2,65.0,0.0,146.0,0.0,20.0,0.0,162000.0,1.3,129.0,1.0,1.0,7.0,1.0
3,50.0,1.0,111.0,0.0,20.0,0.0,210000.0,1.9,137.0,1.0,0.0,7.0,1.0
4,65.0,1.0,160.0,1.0,20.0,0.0,327000.0,2.7,116.0,0.0,0.0,8.0,1.0


In [19]:
# Adding a Total column
heart_data['Total'] = heart_data[numeric_cols].sum(axis=1, skipna=True)
print("\nDataset with Total Column:")
heart_data.head()


Dataset with Total Column:


Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT,Total
0,75.0,0.0,582.0,0.0,20.0,1.0,265000.0,1.9,130.0,1.0,0.0,4.0,1.0,265815.9
1,55.0,0.0,7861.0,0.0,38.0,0.0,263358.03,1.1,136.0,1.0,0.0,6.0,1.0,271457.13
2,65.0,0.0,146.0,0.0,20.0,0.0,162000.0,1.3,129.0,1.0,1.0,7.0,1.0,162371.3
3,50.0,1.0,111.0,0.0,20.0,0.0,210000.0,1.9,137.0,1.0,0.0,7.0,1.0,210329.9
4,65.0,1.0,160.0,1.0,20.0,0.0,327000.0,2.7,116.0,0.0,0.0,8.0,1.0,327374.7


In [20]:
# Pivot Table
pivot_table = heart_data.pivot_table(index='sex', columns='DEATH_EVENT', values='Total', aggfunc='sum', fill_value=0)
print("\nPivot Table - Total by Sex and Death Event:")
pivot_table


Pivot Table - Total by Sex and Death Event:


DEATH_EVENT,0.0,1.0
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
-1.0,264310.5,264198.9
0.0,19317800.0,8015160.0
1.0,30771280.0,14475110.0


In [21]:
# Melt Pivot Table
melted_df = pivot_table.reset_index().melt(id_vars='sex', var_name='Death Event', value_name='Total')
print("\nMelted DataFrame:")
melted_df


Melted DataFrame:


Unnamed: 0,sex,Death Event,Total
0,-1.0,0.0,264310.5
1,0.0,0.0,19317800.0
2,1.0,0.0,30771280.0
3,-1.0,1.0,264198.9
4,0.0,1.0,8015160.0
5,1.0,1.0,14475110.0


In [22]:
# Replace negative values with NaN
heart_data[numeric_cols] = heart_data[numeric_cols].apply(lambda x: np.where(x < 0, np.nan, x))

print("\nDataset after handling bad data (negative values replaced with NaN):")
heart_data.head()


Dataset after handling bad data (negative values replaced with NaN):


Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT,Total
0,75.0,0.0,582.0,0.0,20.0,1.0,265000.0,1.9,130.0,1.0,0.0,4.0,1.0,265815.9
1,55.0,0.0,7861.0,0.0,38.0,0.0,263358.03,1.1,136.0,1.0,0.0,6.0,1.0,271457.13
2,65.0,0.0,146.0,0.0,20.0,0.0,162000.0,1.3,129.0,1.0,1.0,7.0,1.0,162371.3
3,50.0,1.0,111.0,0.0,20.0,0.0,210000.0,1.9,137.0,1.0,0.0,7.0,1.0,210329.9
4,65.0,1.0,160.0,1.0,20.0,0.0,327000.0,2.7,116.0,0.0,0.0,8.0,1.0,327374.7


In [23]:
# Fill NaN values with column medians
heart_data[numeric_cols] = heart_data[numeric_cols].apply(lambda x: x.fillna(x.median()), axis=0)

print("\nDataset after filling NaN values with column medians:")
heart_data.head()


Dataset after filling NaN values with column medians:


Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT,Total
0,75.0,0.0,582.0,0.0,20.0,1.0,265000.0,1.9,130.0,1.0,0.0,4.0,1.0,265815.9
1,55.0,0.0,7861.0,0.0,38.0,0.0,263358.03,1.1,136.0,1.0,0.0,6.0,1.0,271457.13
2,65.0,0.0,146.0,0.0,20.0,0.0,162000.0,1.3,129.0,1.0,1.0,7.0,1.0,162371.3
3,50.0,1.0,111.0,0.0,20.0,0.0,210000.0,1.9,137.0,1.0,0.0,7.0,1.0,210329.9
4,65.0,1.0,160.0,1.0,20.0,0.0,327000.0,2.7,116.0,0.0,0.0,8.0,1.0,327374.7


In [24]:
# Grouping data
grouped_df = heart_data.groupby(['sex', 'DEATH_EVENT']).agg({'Total': 'sum'}).reset_index()

# Create a pivot table
pivot_df = grouped_df.pivot(index='sex', columns='DEATH_EVENT', values='Total').fillna(0)

print("\nPivot Table Based on Sex and Death Event:")
pivot_df


Pivot Table Based on Sex and Death Event:


DEATH_EVENT,0.0,1.0
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,19925010.0,8015160.0
1.0,32640660.0,15278800.0


In [25]:
# Stack Pivot Table
stacked_pivot = pivot_df.stack()
print("\nStacked Pivot Table:")
stacked_pivot


Stacked Pivot Table:


sex  DEATH_EVENT
0.0  0.0            1.992501e+07
     1.0            8.015160e+06
1.0  0.0            3.264066e+07
     1.0            1.527880e+07
dtype: float64