In [1]:
!pip install imblearn



In [None]:
# Core
import pandas as pd
import numpy as np

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Preprocessing
from sklearn.preprocessing import (
    StandardScaler,
    MinMaxScaler,
    LabelEncoder
)
from sklearn.impute import SimpleImputer

# Feature transformation
from sklearn.decomposition import PCA

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Model selection & evaluation
from sklearn.model_selection import (
    train_test_split,
    cross_val_score
)
from sklearn.metrics import (
    classification_report,
    f1_score, accuracy_score, precision_score, recall_score, f1_score
)

# Imbalanced data
from imblearn.over_sampling import SMOTE

# Extra datasets
from sklearn.datasets import load_iris

from scipy.stats import zscore 
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA


In [None]:
# Load the raw dataset from the CSV file in the working directory.
df = pd.read_csv("heart_disease_prediction_classification.csv")

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit an extra "None" line.
df.info()
print(df.describe())
print(df.columns)


In [None]:
# Remove outliers using Z-score

# A row is dropped when any numeric feature lies this many standard deviations
# (or more) away from that feature's mean.
Z_THRESHOLD = 3

# Numeric feature columns only. The target label is excluded so that the class
# label itself can never cause a row to be discarded.
numeric_cols = (
    df.select_dtypes(include=np.number)
    .columns
    .drop("HeartDisease", errors="ignore")
)

# Compute absolute Z-scores column by column. nan_policy="omit" ignores missing
# values when estimating mean/std; without it a single NaN turns the whole
# column into NaN and every row would fail the filter below.
z_scores = np.abs(
    zscore(
        df[numeric_cols].to_numpy(dtype=float, na_value=np.nan),
        nan_policy="omit",
    )
)

# A NaN z-score (missing value, or a zero-variance column) is not evidence of
# an outlier, so such entries are treated as inliers.
is_inlier = (z_scores < Z_THRESHOLD) | np.isnan(z_scores)

# Keep only the rows in which every numeric feature is an inlier.
df = df[is_inlier.all(axis=1)]

print("Shape after outlier removal:", df.shape)


In [None]:
# Feature and Target

# The "HeartDisease" column is the label (y); every other column of df is
# kept as a feature (X).
y = df["HeartDisease"]
X = df.drop(columns=["HeartDisease"])



In [None]:
# Encoding

# Columns that are expanded into indicator (dummy) columns.
categorical_cols = [
    "Sex",
    "ChestPainType",
    "RestingECG",
    "ExerciseAngina",
    "ST_Slope",
]

# One-hot encode the listed columns. drop_first=True omits the first level of
# each column, so a column with k categories yields k-1 indicator columns.
X = pd.get_dummies(
    data=X,
    columns=categorical_cols,
    drop_first=True,
)

In [None]:
# Display the feature matrix to inspect the columns produced by the encoding step.
X

In [None]:
# Train Test Split

# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
# stratify=y keeps the class proportions of the target the same in the train
# and test splits, so the per-class metrics reported later (precision, recall,
# F1) are not skewed by an unlucky split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)



In [None]:
# Apply Scaling

# The scaler is fitted on the training split only; the statistics it learned
# there are then reused to transform both the training and the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Build classification models (SVM, Logistic Regression, Random Forest)

# Estimators to compare, keyed by the name used in the printed report.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier(random_state=42),
}

# One summary dict per model, appended in evaluation order.
results = []

for name, model in models.items():
    # Fit on the scaled training split, then predict the scaled test split.
    y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)

    # Every metric compares the same test labels with the same predictions.
    accuracy, precision, recall, f1 = (
        score(y_test, y_pred)
        for score in (accuracy_score, precision_score, recall_score, f1_score)
    )

    # Record the scores for this model.
    summary = {"Model": name}
    summary["Accuracy"] = accuracy
    summary["Precision"] = precision
    summary["Recall"] = recall
    summary["F1-Score"] = f1
    results.append(summary)

    # Report: a blank line, the model name, then one metric per line.
    print(
        f"\n{name}",
        f"Accuracy : {accuracy}",
        f"Precision: {precision}",
        f"Recall   : {recall}",
        f"F1-Score : {f1}",
        sep="\n",
    )

In [None]:
# Apply PCA

# Project the scaled features onto 5 principal components. The projection is
# fitted on the training split only and then applied to the test split.
# random_state pins the outcome in case scikit-learn's automatic solver choice
# picks a randomized SVD, keeping the reported numbers reproducible; 42 matches
# the seed used for the split and the random forest.
pca = PCA(n_components=5, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

print("Explained Variance Ratio (PCA):", pca.explained_variance_ratio_)

# One summary dict per model, appended in evaluation order.
results_pca = []

for name, model in models.items():
    # Refit the estimator on the PCA-reduced training data and predict the
    # PCA-reduced test data.
    y_pred = model.fit(X_train_pca, y_train).predict(X_test_pca)

    # Every metric compares the same test labels with the same predictions.
    accuracy, precision, recall, f1 = (
        score(y_test, y_pred)
        for score in (accuracy_score, precision_score, recall_score, f1_score)
    )

    # Record the scores for this model.
    summary = {"Model": name}
    summary["Accuracy"] = accuracy
    summary["Precision"] = precision
    summary["Recall"] = recall
    summary["F1-Score"] = f1
    results_pca.append(summary)

    # Report: a blank line, the model name, then one metric per line.
    print(
        f"\n{name} (After PCA)",
        f"Accuracy : {accuracy}",
        f"Precision: {precision}",
        f"Recall   : {recall}",
        f"F1-Score : {f1}",
        sep="\n",
    )

In [None]:
# Apply LDA

# Supervised projection onto a single discriminant axis. It is fitted with the
# training labels and then applied unchanged to the test split.
lda = LDA(n_components=1)
X_train_lda = lda.fit_transform(X_train_scaled, y_train)
X_test_lda = lda.transform(X_test_scaled)

# One summary dict per model, appended in evaluation order.
results_lda = []

for name, model in models.items():
    # Refit the estimator on the LDA-projected training data and predict the
    # LDA-projected test data.
    y_pred = model.fit(X_train_lda, y_train).predict(X_test_lda)

    # Every metric compares the same test labels with the same predictions.
    accuracy, precision, recall, f1 = (
        score(y_test, y_pred)
        for score in (accuracy_score, precision_score, recall_score, f1_score)
    )

    # Record the scores for this model.
    summary = {"Model": name}
    summary["Accuracy"] = accuracy
    summary["Precision"] = precision
    summary["Recall"] = recall
    summary["F1-Score"] = f1
    results_lda.append(summary)

    # Report: a blank line, the model name, then one metric per line.
    print(
        f"\n{name} (After LDA)",
        f"Accuracy : {accuracy}",
        f"Precision: {precision}",
        f"Recall   : {recall}",
        f"F1-Score : {f1}",
        sep="\n",
    )