In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder

# 📌 1️⃣ - Load the data
# Load the data from a CSV file (replace 'heart_disease.csv' with your actual file)
df = pd.read_csv('/content/drive/MyDrive/project_dm/Cardiovascular_Disease_Dataset.csv')

# 🔍 Display some samples from the dataset to see initial values
display(df.head())

# 📌 2️⃣ - Handling missing values
# 🔹 Check for missing values in the dataset
missing_values = df.isnull().sum()
print("Missing values per column:\n", missing_values)

# 🔹 Fill missing values (Imputation) based on data type
for column in df.columns:
    if df[column].dtype == 'object':  # If the data is categorical
        df[column].fillna(df[column].mode()[0], inplace=True)  # Replace with the most frequent value
    else:
        df[column].fillna(df[column].median(), inplace=True)  # Replace with the median for numerical data

# 📌 3️⃣ - Encoding categorical variables
label_encoders = {}
for column in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le  # Store the encoding for later use

# 📌 4️⃣ - Normalize numerical data
scaler = StandardScaler()
numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns

# 🔹 Apply standard scaling to numerical data
for col in numeric_columns:
    df[col] = scaler.fit_transform(df[[col]])

# 📌 5️⃣ - Verify cleaning and display final sample
display(df.head())

# 📌 6️⃣ - Save the cleaned data to a new file
df.to_csv('heart_disease_cleaned.csv', index=False)
print("✅ The cleaned data has been successfully saved!")


Unnamed: 0,patientid,age,gender,chestpain,restingBP,serumcholestrol,fastingbloodsugar,restingrelectro,maxheartrate,exerciseangia,oldpeak,slope,noofmajorvessels,target
0,103368,53,1,2,171,0,0,1,147,0,5.3,3,3,1
1,119250,40,1,0,94,229,0,1,115,0,3.7,1,1,0
2,119372,49,1,2,133,142,0,0,202,1,5.0,1,0,0
3,132514,43,1,0,138,295,1,1,153,0,3.2,2,2,1
4,146211,31,1,1,199,0,0,2,136,0,5.3,3,2,1


Missing values per column:
 patientid            0
age                  0
gender               0
chestpain            0
restingBP            0
serumcholestrol      0
fastingbloodsugar    0
restingrelectro      0
maxheartrate         0
exerciseangia        0
oldpeak              0
slope                0
noofmajorvessels     0
target               0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].median(), inplace=True)  # Replace with the median for numerical data


Unnamed: 0,patientid,age,gender,chestpain,restingBP,serumcholestrol,fastingbloodsugar,restingrelectro,maxheartrate,exerciseangia,oldpeak,slope,noofmajorvessels,target
0,-1.708555,0.210464,0.554247,1.070663,0.642833,-2.352717,-0.648425,0.327384,0.044567,-0.996008,1.507245,1.45535,1.819678,0.850963
1,-1.703068,-0.517591,0.554247,-1.028677,-1.928098,-0.622817,-0.648425,0.327384,-0.89184,-0.996008,0.576955,-0.53828,-0.227204,-1.175139
2,-1.703025,-0.013553,0.554247,1.070663,-0.625938,-1.280028,-0.648425,-0.97176,1.654017,1.004008,1.332816,-0.53828,-1.250645,-1.175139
3,-1.698485,-0.349578,0.554247,-1.028677,-0.458995,-0.124243,1.542199,0.327384,0.220143,-0.996008,0.286239,0.458535,0.796237,0.850963
4,-1.693753,-1.021629,0.554247,0.020993,1.577717,-2.352717,-0.648425,1.626528,-0.277323,-0.996008,1.507245,1.45535,0.796237,0.850963


✅ The cleaned data has been successfully saved!
