In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score




In [2]:
# Read the processed Cleveland heart-disease file straight from the UCI URL.
# Column labels are supplied explicitly through `names`, and "?" entries are
# parsed as missing values (NaN).
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
column_names = [
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
    "thalach", "exang", "oldpeak", "slope", "ca", "thal", "target",
]
heart_data = pd.read_csv(
    url,
    names=column_names,
    na_values="?",
)



In [3]:
# Discard every row that contains at least one missing value.
heart_data = heart_data.dropna(axis=0, how="any")

# The label is the "target" column; all remaining columns are the features.
# NOTE(review): the UCI description suggests `target` takes values 0-4, which
# would make this a multi-class problem -- confirm that is intended.
y = heart_data["target"]
X = heart_data.drop(columns="target")

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)



In [5]:
# Learn the scaling parameters from the training split only, then apply that
# same fitted scaler to both splits so no test-set statistics are used.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Candidate models keyed by display name. Insertion order determines the order
# in which they are trained and reported. Every estimator that accepts a
# `random_state` receives the same seed; K-Nearest Neighbors is constructed
# without one.
model_seed = 42
classifiers = {
    "Random Forest": RandomForestClassifier(random_state=model_seed),
    "Gradient Boosting": GradientBoostingClassifier(random_state=model_seed),
    "Support Vector Machine": SVC(random_state=model_seed),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Logistic Regression": LogisticRegression(random_state=model_seed),
}



In [6]:
# Fit each candidate on the scaled training split, predict on the scaled test
# split, and store the resulting accuracy under the model's display name.
results = {}
for name, clf in classifiers.items():
    # `fit` returns the estimator itself, so prediction can be chained.
    y_pred = clf.fit(X_train_scaled, y_train).predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy
    print(f"{name}: Accuracy = {accuracy:.4f}")



Random Forest: Accuracy = 0.6000
Gradient Boosting: Accuracy = 0.5333
Support Vector Machine: Accuracy = 0.6500
K-Nearest Neighbors: Accuracy = 0.6000
Logistic Regression: Accuracy = 0.6167


In [7]:
# Tabulate the per-model accuracies (one row per algorithm) and print them
# ranked from best to worst.
results_df = pd.DataFrame(
    {
        "Algorithm": list(results.keys()),
        "Accuracy": list(results.values()),
    }
)
ranked_results = results_df.sort_values(by="Accuracy", ascending=False)
print("\nComparison of Algorithm Accuracies:")
print(ranked_results)


Comparison of Algorithm Accuracies:
                Algorithm  Accuracy
2  Support Vector Machine  0.650000
4     Logistic Regression  0.616667
0           Random Forest  0.600000
3     K-Nearest Neighbors  0.600000
1       Gradient Boosting  0.533333
