In [158]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [159]:
# Read the heart-disease records from the CSV file into a DataFrame.
file_path_new = 'heart.csv'
heart_df = pd.read_csv(filepath_or_buffer=file_path_new)

In [160]:
# Preview the first ten raw records to sanity-check the load.
display(
    heart_df.head(n=10)
)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
5,39,M,NAP,120,339,0,Normal,170,N,0.0,Up,0
6,45,F,ATA,130,237,0,Normal,170,N,0.0,Up,0
7,54,M,ATA,110,208,0,Normal,142,N,0.0,Up,0
8,37,M,ASY,140,207,0,Normal,130,Y,1.5,Flat,1
9,48,F,ATA,120,284,0,Normal,120,N,0.0,Up,0


In [161]:
# Tally the missing entries in each column of the raw frame.
nan_values = heart_df.isna().sum(axis=0)
print(nan_values)

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64


In [162]:
# How many rows are exact repeats of an earlier row?
# (the first occurrence of each row is not counted as a duplicate)
num_duplicates = heart_df.duplicated(keep='first').sum()

print(f"Number of duplicate rows: {num_duplicates}")


Number of duplicate rows: 0


In [163]:
# Column dtypes, non-null counts and memory footprint of the raw frame.
heart_df.info(verbose=None)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [164]:
# Work on an independent (deep) copy so the raw frame stays untouched.
heart_df_cleaned = heart_df.copy(deep=True)

In [165]:
# Fill missing categorical values with the column's most frequent value (mode).
# The earlier isna() check printed 0 for every column, so on the current data
# this is a no-op safeguard.
categorical_columns = ['Sex', 'ChestPainType', 'RestingECG', 'ST_Slope','ExerciseAngina']  # Columns that need encoding
for column in categorical_columns:
    heart_df_cleaned[column] = heart_df_cleaned[column].fillna(heart_df_cleaned[column].mode()[0])

# Encode categorical features as integer codes using LabelEncoder.
# A separate encoder is fitted for each column and stored in `label_encoders`
# (keyed by column name). Re-fitting a single shared encoder would keep only
# the last column's classes, making the other columns' codes impossible to
# map back to their original labels via `classes_` / `inverse_transform`.
# NOTE(review): LabelEncoder yields arbitrary integer codes; if a downstream
# model treats them as ordered magnitudes, one-hot encoding may be a better
# fit for the nominal columns -- confirm against the modelling notebook.
label_encoders = {}
for column in categorical_columns:
    # `label_encoder` is rebound on every pass, so after the loop it still
    # refers to the encoder fitted on the last column, as before.
    label_encoder = LabelEncoder()
    heart_df_cleaned[column] = label_encoder.fit_transform(heart_df_cleaned[column])
    label_encoders[column] = label_encoder


In [166]:
# Show the first ten records after encoding, under a short heading.
print("\nFirst 10 rows of the cleaned dataset:")
display(
    heart_df_cleaned.head(n=10)
)


First 10 rows of the cleaned dataset:


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,1,140,289,0,1,172,0,0.0,2,0
1,49,0,2,160,180,0,1,156,0,1.0,1,1
2,37,1,1,130,283,0,2,98,0,0.0,2,0
3,48,0,0,138,214,0,1,108,1,1.5,1,1
4,54,1,2,150,195,0,1,122,0,0.0,2,0
5,39,1,2,120,339,0,1,170,0,0.0,2,0
6,45,0,1,130,237,0,1,170,0,0.0,2,0
7,54,1,1,110,208,0,1,142,0,0.0,2,0
8,37,1,0,140,207,0,1,130,1,1.5,1,1
9,48,0,1,120,284,0,1,120,0,0.0,2,0


In [167]:
# Re-inspect dtypes and non-null counts now that the categorical columns
# have been replaced by integer codes.
print("\nDataFrame Info after cleaning:")
heart_df_cleaned.info(verbose=None)


DataFrame Info after cleaning:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    int64  
 2   ChestPainType   918 non-null    int64  
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    int64  
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    int64  
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    int64  
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(11)
memory usage: 86.2 KB


In [168]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per column.
# NOTE(review): the rendered summary reports a minimum of 0 for RestingBP and
# Cholesterol; that looks like a placeholder for missing measurements rather
# than a real reading -- confirm and handle before modelling.
print("\nStatistical summary for numeric columns:")
display(
    heart_df_cleaned.describe()
)


Statistical summary for numeric columns:


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,0.78976,0.781046,132.396514,198.799564,0.233115,0.989107,136.809368,0.404139,0.887364,1.361656,0.553377
std,9.432617,0.407701,0.956519,18.514154,109.384145,0.423046,0.631671,25.460334,0.490992,1.06657,0.607056,0.497414
min,28.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0,0.0,-2.6,0.0,0.0
25%,47.0,1.0,0.0,120.0,173.25,0.0,1.0,120.0,0.0,0.0,1.0,0.0
50%,54.0,1.0,0.0,130.0,223.0,0.0,1.0,138.0,0.0,0.6,1.0,1.0
75%,60.0,1.0,2.0,140.0,267.0,0.0,1.0,156.0,1.0,1.5,2.0,1.0
max,77.0,1.0,3.0,200.0,603.0,1.0,2.0,202.0,1.0,6.2,2.0,1.0


In [170]:
# Verify that no missing values remain in the cleaned frame.
nan_values = heart_df_cleaned.isna().sum(axis=0)
print(nan_values)

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64


In [169]:
# Persist the cleaned, encoded frame as CSV; the row index is not written.
cleaned_file_path_new = 'heart_cleaned.csv'  # adjust the destination when running locally
heart_df_cleaned.to_csv(path_or_buf=cleaned_file_path_new, index=False)
