In [17]:
import pandas as pd
import numpy as np

# Load the heart failure dataset
heart_data = pd.read_csv('../data/external/heart_failure_clinical_records_dataset.csv', low_memory=False)

# Display the first few rows
print("Heart Failure Clinical Records Dataset:")
heart_data.head()

Heart Failure Clinical Records Dataset:


Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [18]:
# Introducing null values at random positions in numeric columns
numeric_cols = heart_data.select_dtypes(include='number').columns
for col in numeric_cols:
    heart_data.loc[np.random.choice(heart_data.index, size=5, replace=False), col] = np.nan

# Introducing random negative values in numeric columns
for col in numeric_cols:
    neg_indices = np.random.choice(heart_data.index, size=3, replace=False)
    heart_data.loc[neg_indices, col] = -np.abs(heart_data.loc[neg_indices, col])

# Save the modified dataset for further use
file_path = '../data/processed/heart_failure_dataset_with_errors.csv'
heart_data.to_csv(file_path, index=False)

# Display the first few rows of the dataset with null and negative values
print("\nHeart Dataset with Introduced Errors:")
heart_data.head()


Heart Dataset with Introduced Errors:


Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0.0,582.0,0.0,20.0,1.0,265000.0,1.9,130.0,1.0,0.0,4.0,1.0
1,55.0,0.0,7861.0,0.0,38.0,0.0,263358.03,1.1,136.0,1.0,0.0,6.0,1.0
2,65.0,0.0,146.0,0.0,20.0,0.0,162000.0,1.3,129.0,1.0,1.0,7.0,1.0
3,50.0,1.0,111.0,0.0,20.0,0.0,210000.0,1.9,137.0,1.0,0.0,7.0,1.0
4,65.0,1.0,160.0,1.0,20.0,0.0,327000.0,2.7,116.0,0.0,0.0,8.0,1.0


In [19]:
# Adding a Total column
heart_data['Total'] = heart_data[numeric_cols].sum(axis=1, skipna=True)
print("\nDataset with Total Column:")
heart_data.head()


Dataset with Total Column:


Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT,Total
0,75.0,0.0,582.0,0.0,20.0,1.0,265000.0,1.9,130.0,1.0,0.0,4.0,1.0,265815.9
1,55.0,0.0,7861.0,0.0,38.0,0.0,263358.03,1.1,136.0,1.0,0.0,6.0,1.0,271457.13
2,65.0,0.0,146.0,0.0,20.0,0.0,162000.0,1.3,129.0,1.0,1.0,7.0,1.0,162371.3
3,50.0,1.0,111.0,0.0,20.0,0.0,210000.0,1.9,137.0,1.0,0.0,7.0,1.0,210329.9
4,65.0,1.0,160.0,1.0,20.0,0.0,327000.0,2.7,116.0,0.0,0.0,8.0,1.0,327374.7


In [20]:
# Pivot Table
pivot_table = heart_data.pivot_table(index='sex', columns='DEATH_EVENT', values='Total', aggfunc='sum', fill_value=0)
print("\nPivot Table - Total by Sex and Death Event:")
pivot_table


Pivot Table - Total by Sex and Death Event:


DEATH_EVENT,0.0,1.0
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
-1.0,264310.5,264198.9
0.0,19317800.0,8015160.0
1.0,30771280.0,14475110.0


In [21]:
# Melt Pivot Table
melted_df = pivot_table.reset_index().melt(id_vars='sex', var_name='Death Event', value_name='Total')
print("\nMelted DataFrame:")
melted_df


Melted DataFrame:


Unnamed: 0,sex,Death Event,Total
0,-1.0,0.0,264310.5
1,0.0,0.0,19317800.0
2,1.0,0.0,30771280.0
3,-1.0,1.0,264198.9
4,0.0,1.0,8015160.0
5,1.0,1.0,14475110.0


In [22]:
# Replace negative values with NaN
heart_data[numeric_cols] = heart_data[numeric_cols].apply(lambda x: np.where(x < 0, np.nan, x))

print("\nDataset after handling bad data (negative values replaced with NaN):")
heart_data.head()


Dataset after handling bad data (negative values replaced with NaN):


Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT,Total
0,75.0,0.0,582.0,0.0,20.0,1.0,265000.0,1.9,130.0,1.0,0.0,4.0,1.0,265815.9
1,55.0,0.0,7861.0,0.0,38.0,0.0,263358.03,1.1,136.0,1.0,0.0,6.0,1.0,271457.13
2,65.0,0.0,146.0,0.0,20.0,0.0,162000.0,1.3,129.0,1.0,1.0,7.0,1.0,162371.3
3,50.0,1.0,111.0,0.0,20.0,0.0,210000.0,1.9,137.0,1.0,0.0,7.0,1.0,210329.9
4,65.0,1.0,160.0,1.0,20.0,0.0,327000.0,2.7,116.0,0.0,0.0,8.0,1.0,327374.7


In [23]:
# Fill NaN values with column medians
heart_data[numeric_cols] = heart_data[numeric_cols].apply(lambda x: x.fillna(x.median()), axis=0)

print("\nDataset after filling NaN values with column medians:")
heart_data.head()


Dataset after filling NaN values with column medians:


Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT,Total
0,75.0,0.0,582.0,0.0,20.0,1.0,265000.0,1.9,130.0,1.0,0.0,4.0,1.0,265815.9
1,55.0,0.0,7861.0,0.0,38.0,0.0,263358.03,1.1,136.0,1.0,0.0,6.0,1.0,271457.13
2,65.0,0.0,146.0,0.0,20.0,0.0,162000.0,1.3,129.0,1.0,1.0,7.0,1.0,162371.3
3,50.0,1.0,111.0,0.0,20.0,0.0,210000.0,1.9,137.0,1.0,0.0,7.0,1.0,210329.9
4,65.0,1.0,160.0,1.0,20.0,0.0,327000.0,2.7,116.0,0.0,0.0,8.0,1.0,327374.7


In [24]:
# Grouping data
grouped_df = heart_data.groupby(['sex', 'DEATH_EVENT']).agg({'Total': 'sum'}).reset_index()

# Create a pivot table
pivot_df = grouped_df.pivot(index='sex', columns='DEATH_EVENT', values='Total').fillna(0)

print("\nPivot Table Based on Sex and Death Event:")
pivot_df


Pivot Table Based on Sex and Death Event:


DEATH_EVENT,0.0,1.0
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,19925010.0,8015160.0
1.0,32640660.0,15278800.0


In [25]:
# Stack Pivot Table
stacked_pivot = pivot_df.stack()
print("\nStacked Pivot Table:")
stacked_pivot


Stacked Pivot Table:


sex  DEATH_EVENT
0.0  0.0            1.992501e+07
     1.0            8.015160e+06
1.0  0.0            3.264066e+07
     1.0            1.527880e+07
dtype: float64