In [None]:
import numpy as np
import pandas as pd
import joblib

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

# Seed for NumPy's global RNG; the same constant is reused as `random_state`
# elsewhere in this file so repeated runs draw the same random numbers.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Read the dataset from the current working directory.
df = pd.read_csv("heart.csv")

# Replace each categorical column with LabelEncoder integer codes. One fitted
# encoder per column is kept in `encoders` (keyed by column name) so the same
# mapping can be reapplied later.
categorical_features = ["sex", "cp", "fbs", "restecg", "exang", "slope"]
encoders = {column: LabelEncoder() for column in categorical_features}

for col, le in encoders.items():
    df[col] = le.fit_transform(df[col])

# The "target" column is the label; every remaining column is a feature.
y = df["target"]
X = df.drop(columns=["target"])

# Column names as an array so they can be indexed with a boolean mask.
feature_names = X.columns.to_numpy()

# Genetic Algorithm – Fitness Function

def fitness_function(chromosome, X, y):
    """Score a feature subset by cross-validated random-forest accuracy.

    Args:
        chromosome: NumPy array; positions equal to 1 mark the columns of
            ``X`` (by position) that are kept.
        X: DataFrame of candidate features (indexed with ``.iloc``).
        y: Series of labels aligned row-for-row with ``X``.

    Returns:
        ``0`` when no column is selected, otherwise the mean accuracy over a
        5-fold stratified, shuffled cross-validation.
    """
    active_columns = np.nonzero(chromosome == 1)[0]
    if active_columns.size == 0:
        # An empty subset cannot be trained on; give it the worst score.
        return 0

    subset = X.iloc[:, active_columns]
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

    def _fold_accuracy(fit_rows, eval_rows):
        # A fresh forest per fold, trained on one part and scored on the other.
        forest = RandomForestClassifier(
            n_estimators=100,
            random_state=RANDOM_SEED
        )
        forest.fit(subset.iloc[fit_rows], y.iloc[fit_rows])
        predicted = forest.predict(subset.iloc[eval_rows])
        return accuracy_score(y.iloc[eval_rows], predicted)

    return np.mean(
        [_fold_accuracy(fit_rows, eval_rows)
         for fit_rows, eval_rows in folds.split(subset, y)]
    )

# Genetic Algorithm

def genetic_algorithm(X, y, generations=50, pop_size=20, mutation_rate=0.1,
                      fitness_fn=None):
    """Search for a good binary feature mask with a simple genetic algorithm.

    Each individual is a 0/1 vector with one entry per column of ``X``.
    Every generation, the four fittest individuals are combined by majority
    vote into one offspring, the offspring is randomly bit-flipped, and it
    replaces the weakest individual if it scores strictly higher.

    Randomness comes from NumPy's global RNG, so seed it beforehand for
    reproducible results.

    Args:
        X: Feature table; only ``X.shape[1]`` is read here, the object itself
            is passed through to the fitness function.
        y: Labels, passed through to the fitness function.
        generations: Maximum number of generations to run.
        pop_size: Number of individuals in the population.
        mutation_rate: Per-bit probability of flipping an offspring bit.
        fitness_fn: Optional callable ``(chromosome, X, y) -> score``.
            Defaults to the module-level ``fitness_function``.

    Returns:
        The highest-scoring chromosome found (1-D integer array of 0/1).
    """
    if fitness_fn is None:
        fitness_fn = fitness_function

    num_features = X.shape[1]
    population = np.random.randint(2, size=(pop_size, num_features))

    # Score every individual once. Afterwards only a newly inserted offspring
    # needs scoring, so scores are cached instead of recomputed per generation.
    fitness = np.array(
        [fitness_fn(ch, X, y) for ch in population], dtype=float
    )

    for _ in range(generations):
        # Reorder the individuals AND their scores together (best first) so
        # that fitness[i] always belongs to population[i].
        order = np.argsort(fitness)[::-1]
        population = population[order]
        fitness = fitness[order]

        # Converged: best and worst individuals score almost the same.
        if fitness[0] - fitness[-1] < 0.01:
            break

        # Majority vote across the top four parents (a 2-2 tie yields 0).
        parents = population[:4]
        offspring = (parents.mean(axis=0) > 0.5).astype(int)

        # Flip each bit independently with probability `mutation_rate`.
        mutation_mask = np.random.rand(num_features) < mutation_rate
        offspring[mutation_mask] = 1 - offspring[mutation_mask]

        # Replace the current worst individual only if the offspring beats it.
        offspring_fitness = fitness_fn(offspring, X, y)
        if offspring_fitness > fitness[-1]:
            population[-1] = offspring
            fitness[-1] = offspring_fitness

    # The last inserted offspring may be the best, so pick by score rather
    # than by position.
    return population[np.argmax(fitness)]

# Train-test split
#
# Hold out the test rows BEFORE feature selection and scaling. Running the
# genetic search or fitting the scaler on the full dataset lets test data
# influence the model and inflates the reported metrics.

X_train_full, X_test_full, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=RANDOM_SEED
)

# Feature selection on the training rows only

best_chromosome = genetic_algorithm(X_train_full, y_train)
selected_features = feature_names[best_chromosome == 1]

if selected_features.size == 0:
    raise ValueError("Genetic algorithm selected no features.")

print("Selected Features:", selected_features.tolist())

# Explicit copies: assigning scaled columns into a slice of `X` triggers
# pandas' SettingWithCopyWarning and may not modify what was intended.
X_selected = X[selected_features].copy()
X_train = X_train_full[selected_features].copy()
X_test = X_test_full[selected_features].copy()

# Scale only the numeric columns that survived feature selection; indexing
# with a column the search dropped would raise a KeyError.
numeric_features = ["age", "trestbps", "oldpeak"]
scaled_features = [col for col in numeric_features if col in X_train.columns]

scaler = StandardScaler()
if scaled_features:
    # Fit on the training rows, then apply the same statistics everywhere.
    X_train[scaled_features] = scaler.fit_transform(X_train[scaled_features])
    X_test[scaled_features] = scaler.transform(X_test[scaled_features])
    X_selected[scaled_features] = scaler.transform(X_selected[scaled_features])

# Train final model

rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=RANDOM_SEED
)

rf_model.fit(X_train, y_train)

# Evaluation

y_pred = rf_model.predict(X_test)
y_prob = rf_model.predict_proba(X_test)[:, 1]  # probability of class 1

print("Accuracy :", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall   :", recall_score(y_test, y_pred))
print("ROC-AUC  :", roc_auc_score(y_test, y_prob))

# Persist the model together with everything needed to preprocess new data.
# NOTE: the scaler was fitted only on `scaled_features`; if that list was
# empty the saved scaler is unfitted and must not be applied.
joblib.dump(rf_model, "rf_heart_model.pkl")
joblib.dump(scaler, "scaler_selected.pkl")
joblib.dump(selected_features.tolist(), "features.pkl")
joblib.dump(encoders, "encoders.pkl")

print("Model and artifacts saved successfully.")

Selected Features: ['age', 'sex', 'trestbps', 'fbs', 'exang', 'oldpeak', 'ca', 'thal']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_selected[numeric_features] = scaler.fit_transform(


Accuracy : 1.0
Precision: 1.0
Recall   : 1.0
ROC-AUC  : 1.0
Model and artifacts saved successfully.
