In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np

# Preprocessing cell: load heart.csv, report missing values / dtypes /
# duplicates, drop exact duplicate rows, print summary statistics, and write
# the result to preprocessed_heart.csv. Both paths are relative to the
# notebook's working directory.

# Load the dataset
dataset_path = "heart.csv"  # Ensure this file is in the project directory
data = pd.read_csv(dataset_path)

# Step 1: Check for missing values
missing_counts = data.isnull().sum()
print("Missing Values in Each Column:")
print(missing_counts)
if missing_counts.any():
    # Nothing below fills or drops NaNs unless a strategy is uncommented, so
    # say so explicitly instead of silently saving incomplete rows.
    print("Warning: missing values detected; enable one of the handling strategies in this cell.")

# Handling missing values (if any)
# Uncomment one of the following approaches if you find missing values.
# (fillna(method=...) is deprecated in recent pandas; use ffill() instead.
#  mean() needs numeric_only=True if any column is non-numeric.)
# data = data.ffill()                               # Forward fill
# data = data.fillna(data.mean(numeric_only=True))  # Replace with column mean
# data = data.dropna()                              # Drop rows with missing values

# Step 2: Check data types and encode categorical variables
print("\nData Types:")
print(data.dtypes)

# Convert categorical variables into dummy variables (if applicable).
# NOTE: with no `columns=` argument, get_dummies only encodes object/string/
# category columns, so on an all-numeric frame it changes nothing. Categories
# stored as integer codes must be listed explicitly, e.g. (assuming cp and
# thal are category codes -- confirm against the dataset description):
# data = pd.get_dummies(data, columns=["cp", "thal"], drop_first=True)

# Step 3: Identify duplicate rows (optional)
duplicates = data.duplicated().sum()
print(f"\nNumber of Duplicate Rows: {duplicates}")
if duplicates > 0:
    # Rebuild the index after dropping rows so labels stay a contiguous
    # 0..n-1 range; otherwise .loc and .iloc disagree in later cells.
    data = data.drop_duplicates().reset_index(drop=True)
    print("Duplicates removed.")

# Step 4: Summary statistics
print("\nSummary Statistics:")
print(data.describe())

# Save the preprocessed data for further use (index=False: the row index
# carries no information and is not written to the file).
preprocessed_path = "preprocessed_heart.csv"
data.to_csv(preprocessed_path, index=False)
print(f"\nPreprocessed data saved to: {preprocessed_path}")


Missing Values in Each Column:
age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

Data Types:
age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal          int64
target        int64
dtype: object

Number of Duplicate Rows: 1
Duplicates removed.

Summary Statistics:
             age         sex          cp    trestbps        chol         fbs  \
count  302.00000  302.000000  302.000000  302.000000  302.000000  302.000000   
mean    54.42053    0.682119    0.963576  131.602649  246.500000    0.149007   
std      9.04797    0.466426    1.032044   17.563394   51.753489    0.356686   
min     29.00000    0.000000    0.000000   94.000000  126.00000