In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [4]:
# Load the heart dataset from a local CSV file into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run on another machine until it is edited.
heart_csv_path = "C:/Users/lenovo/Downloads/Heart.csv"
heart_data = pd.read_csv(heart_csv_path)

In [5]:
# Step 1: Data Cleaning
# Fills missing feature values and removes duplicate rows, rebinding
# `heart_data` to the cleaned frame. Every column is cast to float, which is
# also what imputing the whole frame at once produced.
print("Original Data Shape:", heart_data.shape)

# Handling missing values
# A mean-imputed label would be a fraction rather than a class, so rows that
# have no label are dropped instead of being filled in.
heart_data = heart_data.dropna(subset=['target']).reset_index(drop=True).astype(float)

# Continuous measurements are filled with the column mean. Every other feature
# is treated as a discrete code and filled with its most frequent value,
# because a mean would invent a category that does not exist in the data.
# NOTE(review): the continuous/discrete split mirrors the columns this notebook
# scales later; confirm it against the dataset's data dictionary.
continuous_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
discrete_cols = [
    c for c in heart_data.columns
    if c not in continuous_cols and c != 'target'
]

# NOTE(review): both imputers are fitted on the full dataset, i.e. before the
# train/test split, so held-out rows influence the fill values.
imputer = SimpleImputer(strategy='mean')
heart_data[continuous_cols] = imputer.fit_transform(heart_data[continuous_cols])

categorical_imputer = SimpleImputer(strategy='most_frequent')
heart_data[discrete_cols] = categorical_imputer.fit_transform(heart_data[discrete_cols])

# Removing duplicates
# Rebinding instead of inplace=True; the surviving rows keep their index labels.
heart_data = heart_data.drop_duplicates()

# Checking for duplicates and null values after cleaning
print("Data Shape after Cleaning:", heart_data.shape)
print("Null values after cleaning:\n", heart_data.isnull().sum())

Original Data Shape: (303, 14)
Data Shape after Cleaning: (302, 14)
Null values after cleaning:
 age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64


In [6]:
# Step 3: Data Transformation
# Re-code the categorical columns as consecutive integer labels. A single
# encoder object is re-fitted for each column in turn, so afterwards it only
# holds the classes of the last column processed ('thal').
label_encoder = LabelEncoder()
for categorical_col in ('sex', 'cp', 'thal'):
    encoded_values = label_encoder.fit_transform(heart_data[categorical_col])
    heart_data[categorical_col] = encoded_values

# Standardise the continuous columns with StandardScaler.
# NOTE(review): the scaler is fitted on every row, before any train/test split
# is made later in the notebook.
numerical_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(heart_data[numerical_cols])
heart_data[numerical_cols] = scaled_values

In [7]:
# Step 4: Error Correcting
# Removing any outlier values (using IQR method)
#
# All fences are computed from the same snapshot of the data and applied in a
# single pass. Filtering inside a per-column loop would compute the quartiles
# of later columns on rows already trimmed by earlier ones, making the result
# depend on the order of `numerical_cols`.
numeric_view = heart_data[numerical_cols]

# Per-column quartiles; each of these is a Series indexed by column name.
Q1 = numeric_view.quantile(0.25)
Q3 = numeric_view.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# A row is kept only if every numerical column lies inside its own fences.
# Comparisons against NaN are False, so rows with a missing value are dropped.
within_bounds = (
    numeric_view.ge(lower_bound, axis=1) & numeric_view.le(upper_bound, axis=1)
).all(axis=1)

# Filtering out the outliers
heart_data = heart_data[within_bounds]

print("Data Shape after Error Correction:", heart_data.shape)

Data Shape after Error Correction: (283, 14)


In [8]:
# Step 5: Data Model Building
# Separate the label column from the predictor columns.
y = heart_data['target']
X = heart_data.drop(columns='target')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train a 100-tree random forest; fit() returns the fitted estimator itself.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict on the held-out rows and summarise the quality of the predictions.
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("\nModel Accuracy:", test_accuracy)
print("Classification Report:\n", test_report)


Model Accuracy: 0.8070175438596491
Classification Report:
               precision    recall  f1-score   support

         0.0       0.83      0.74      0.78        27
         1.0       0.79      0.87      0.83        30

    accuracy                           0.81        57
   macro avg       0.81      0.80      0.80        57
weighted avg       0.81      0.81      0.81        57

