# Random Forest+Modified GA

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score

# Set random seed for reproducibility
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Load dataset
df = pd.read_csv('heart.csv')

# Encode categorical features
categorical_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'target']
le = LabelEncoder()
for feature in categorical_features:
    df[feature] = le.fit_transform(df[feature])

# Scale numerical features
numerical_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Separate features and target
X = df.drop(columns=['target']).values
y = df['target'].values

# Fitness function using Stratified K-Fold Cross-Validation
def fitness_function(chromosome, X, y):
    selected_features = np.where(chromosome == 1)[0]
    if len(selected_features) == 0:
        return 0  # Avoid empty feature sets

    X_selected = X[:, selected_features]
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
    accuracy_scores = []

    for train_idx, val_idx in skf.split(X_selected, y):
        X_train, X_val = X_selected[train_idx], X_selected[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)
        model.fit(X_train, y_train)
        accuracy_scores.append(accuracy_score(y_val, model.predict(X_val)))

    return np.mean(accuracy_scores)

# Modified Genetic Algorithm for Feature Selection
def modified_genetic_algorithm(X, y, num_generations=50, population_size=20, mutation_rate=0.1):
    num_features = X.shape[1]
    population = np.random.randint(2, size=(population_size, num_features))

    for generation in range(num_generations):
        fitness_scores = np.array([fitness_function(chrom, X, y) for chrom in population])
        sorted_indices = np.argsort(fitness_scores)[::-1]
        population = population[sorted_indices]

        if fitness_scores[0] - fitness_scores[-1] <= 0.01:  # Convergence check
            break

        parents = population[:4]  # Select top 4 parents
        offspring = np.mean(parents, axis=0) > 0.5  # Crossover strategy
        offspring = np.array(offspring, dtype=int)  # Convert boolean to int

        # Mutation
        mutation_mask = np.random.rand(*offspring.shape) < mutation_rate
        offspring[mutation_mask] = 1 - offspring[mutation_mask]

        if fitness_function(offspring, X, y) > fitness_scores[-1]:
            population[-1] = offspring  # Replace worst individual

    return population[0]

# Run Modified Genetic Algorithm
best_chromosome = modified_genetic_algorithm(X, y)
selected_features = np.where(best_chromosome == 1)[0]

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED, stratify=y)

# Train Random Forest with selected features
rf_model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)
rf_model.fit(X_train[:, selected_features], y_train)

# Test the Model
y_pred = rf_model.predict(X_test[:, selected_features])
test_accuracy_rf = accuracy_score(y_test, y_pred)

# Output Results
print("Best Feature Selection:", best_chromosome)
print("Test Accuracy (Random Forest):", test_accuracy_rf)
from sklearn.metrics import precision_score, recall_score, roc_auc_score

# Test the Model
y_pred = rf_model.predict(X_test[:, selected_features])
y_proba = rf_model.predict_proba(X_test[:, selected_features])[:, 1]  # For AUC-ROC

# Evaluation Metrics
test_precision_rf = precision_score(y_test, y_pred)
test_recall_rf = recall_score(y_test, y_pred)
test_auc_rf = roc_auc_score(y_test, y_proba)

# Output Results
print("Test Precision (Random Forest):", test_precision_rf)
print("Test Recall (Random Forest):", test_recall_rf)
print("Test AUC-ROC (Random Forest):", test_auc_rf)


Best Feature Selection: [1 1 0 1 0 1 0 0 1 1 0 1 1]
Test Accuracy (Random Forest): 1.0
Test Precision (Random Forest): 1.0
Test Recall (Random Forest): 1.0
Test AUC-ROC (Random Forest): 1.0


# XGBoost+Modified GA

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Set a fixed random seed for reproducibility
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Load dataset
df = pd.read_csv('heart.csv')

# Encode categorical features
categorical_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'target']
le = LabelEncoder()
for feature in categorical_features:
    df[feature] = le.fit_transform(df[feature])

# Scale numerical features
numerical_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Define features and target variable
X = df.drop(columns=['target']).values
y = df['target'].values

# Fitness function using Stratified K-Fold Cross-Validation
def fitness_function(chromosome, X, y):
    selected_features = np.where(chromosome == 1)[0]
    if len(selected_features) == 0:
        return 0  # Avoid empty feature sets

    X_selected = X[:, selected_features]
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
    accuracy_scores = []

    for train_idx, val_idx in skf.split(X_selected, y):
        X_train, X_val = X_selected[train_idx], X_selected[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        model = XGBClassifier(n_estimators=100, max_depth=3, min_child_weight=3, random_state=RANDOM_SEED)
        model.fit(X_train, y_train)
        accuracy_scores.append(accuracy_score(y_val, model.predict(X_val)))

    return np.mean(accuracy_scores)  # Return average accuracy

# Modified Genetic Algorithm for Feature Selection
def modified_genetic_algorithm(X, y, num_generations=50, population_size=20, mutation_rate=0.1):
    num_features = X.shape[1]
    population = np.random.randint(2, size=(population_size, num_features))

    for generation in range(num_generations):
        fitness_scores = np.array([fitness_function(chrom, X, y) for chrom in population])
        sorted_indices = np.argsort(fitness_scores)[::-1]
        population = population[sorted_indices]

        if fitness_scores[0] - fitness_scores[-1] <= 0.01:  # Convergence check
            break

        parents = population[:4]  # Select top 4 parents
        offspring = np.mean(parents, axis=0) > 0.5  # Crossover strategy
        offspring = np.array(offspring, dtype=int)

        # Mutation
        mutation_mask = np.random.rand(*offspring.shape) < mutation_rate
        offspring[mutation_mask] = 1 - offspring[mutation_mask]

        if fitness_function(offspring, X, y) > fitness_scores[-1]:
            population[-1] = offspring  # Replace worst individual

    return population[0]  # Return best chromosome

# Run Modified Genetic Algorithm
best_chromosome = modified_genetic_algorithm(X, y)
selected_features = np.where(best_chromosome == 1)[0]

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED, stratify=y)

# Train XGBoost with selected features
xgb_model = XGBClassifier(
    n_estimators=100, max_depth=3, min_child_weight=3, random_state=RANDOM_SEED
)

# Train the model (NO EARLY STOPPING)
xgb_model.fit(X_train[:, selected_features], y_train)

# Test the Model
y_pred = xgb_model.predict(X_test[:, selected_features])
test_accuracy_xgb = accuracy_score(y_test, y_pred)

# Output Results
print("Best Feature Selection:", best_chromosome)
print("Test Accuracy (XGBoost):", test_accuracy_xgb)
from sklearn.metrics import precision_score, recall_score, roc_auc_score

# Test the Model
y_pred = xgb_model.predict(X_test[:, selected_features])
y_proba = xgb_model.predict_proba(X_test[:, selected_features])[:, 1]  # For AUC-ROC

# Evaluation Metrics
test_precision_xgb = precision_score(y_test, y_pred)
test_recall_xgb = recall_score(y_test, y_pred)
test_auc_xgb = roc_auc_score(y_test, y_proba)

# Output Results
print("Test Precision (XGBoost):", test_precision_xgb)
print("Test Recall (XGBoost):", test_recall_xgb)
print("Test AUC-ROC (XGBoost):", test_auc_xgb)

Best Feature Selection: [1 1 1 1 1 1 1 1 0 0 1 1 1]
Test Accuracy (XGBoost): 0.9902439024390244
Test Precision (XGBoost): 0.9813084112149533
Test Recall (XGBoost): 1.0
Test AUC-ROC (XGBoost): 1.0


# Logistic Regression+Modified GA

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Set a fixed random seed for reproducibility
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Load dataset
df = pd.read_csv('heart.csv')

# Encode categorical features
categorical_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'target']
le = LabelEncoder()
for feature in categorical_features:
    df[feature] = le.fit_transform(df[feature])

# Scale numerical features
numerical_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Define features and target variable
X = df.drop(columns=['target']).values
y = df['target'].values

# Fitness function using Stratified K-Fold Cross-Validation
def fitness_function(chromosome, X, y):
    selected_features = np.where(chromosome == 1)[0]
    if len(selected_features) == 0:
        return 0  # Avoid empty feature sets

    X_selected = X[:, selected_features]
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
    accuracy_scores = []

    for train_idx, val_idx in skf.split(X_selected, y):
        X_train, X_val = X_selected[train_idx], X_selected[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        model = LogisticRegression(max_iter=500, random_state=RANDOM_SEED)
        model.fit(X_train, y_train)
        accuracy_scores.append(accuracy_score(y_val, model.predict(X_val)))

    return np.mean(accuracy_scores)  # Return average accuracy

# Modified Genetic Algorithm for Feature Selection
def modified_genetic_algorithm(X, y, num_generations=50, population_size=20, mutation_rate=0.1):
    num_features = X.shape[1]
    population = np.random.randint(2, size=(population_size, num_features))

    for generation in range(num_generations):
        fitness_scores = np.array([fitness_function(chrom, X, y) for chrom in population])
        sorted_indices = np.argsort(fitness_scores)[::-1]
        population = population[sorted_indices]

        if fitness_scores[0] - fitness_scores[-1] <= 0.01:  # Convergence check
            break

        parents = population[:4]  # Select top 4 parents
        offspring = np.mean(parents, axis=0) > 0.5  # Crossover strategy
        offspring = np.array(offspring, dtype=int)

        # Mutation
        mutation_mask = np.random.rand(*offspring.shape) < mutation_rate
        offspring[mutation_mask] = 1 - offspring[mutation_mask]

        if fitness_function(offspring, X, y) > fitness_scores[-1]:
            population[-1] = offspring  # Replace worst individual

    return population[0]  # Return best chromosome

# Run Modified Genetic Algorithm
best_chromosome = modified_genetic_algorithm(X, y)
selected_features = np.where(best_chromosome == 1)[0]

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED, stratify=y)

# Train Logistic Regression with selected features
logreg_model = LogisticRegression(max_iter=500, random_state=RANDOM_SEED)

# Train the model (NO EARLY STOPPING)
logreg_model.fit(X_train[:, selected_features], y_train)

# Test the Model
y_pred = logreg_model.predict(X_test[:, selected_features])
test_accuracy_logreg = accuracy_score(y_test, y_pred)

# Output Results
print("Best Feature Selection:", best_chromosome)
print("Test Accuracy (Logistic Regression):", test_accuracy_logreg)
from sklearn.metrics import precision_score, recall_score, roc_auc_score

# Test the Model
y_pred = logreg_model.predict(X_test[:, selected_features])
y_proba = logreg_model.predict_proba(X_test[:, selected_features])[:, 1]  # For AUC-ROC

# Evaluation Metrics
test_precision_logreg = precision_score(y_test, y_pred)
test_recall_logreg = recall_score(y_test, y_pred)
test_auc_logreg = roc_auc_score(y_test, y_proba)

# Output Results
print("Test Precision (Logistic Regression):", test_precision_logreg)
print("Test Recall (Logistic Regression):", test_recall_logreg)
print("Test AUC-ROC (Logistic Regression):", test_auc_logreg)

Best Feature Selection: [0 1 1 1 1 0 1 0 0 1 1 1 1]
Test Accuracy (Logistic Regression): 0.8536585365853658
Test Precision (Logistic Regression): 0.8205128205128205
Test Recall (Logistic Regression): 0.9142857142857143
Test AUC-ROC (Logistic Regression): 0.9152380952380952


# KNN+Modified GA

In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Set a fixed random seed for reproducibility
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Load dataset
df = pd.read_csv('heart.csv')

# Encode categorical features
categorical_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'target']
le = LabelEncoder()
for feature in categorical_features:
    df[feature] = le.fit_transform(df[feature])

# Scale numerical features
numerical_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Define features and target variable
X = df.drop(columns=['target']).values
y = df['target'].values

# Fitness function using Stratified K-Fold Cross-Validation
def fitness_function(chromosome, X, y):
    selected_features = np.where(chromosome == 1)[0]
    if len(selected_features) == 0:
        return 0  # Avoid empty feature sets

    X_selected = X[:, selected_features]
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
    accuracy_scores = []

    for train_idx, val_idx in skf.split(X_selected, y):
        X_train, X_val = X_selected[train_idx], X_selected[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        model = KNeighborsClassifier(n_neighbors=5)  # Using KNN
        model.fit(X_train, y_train)
        accuracy_scores.append(accuracy_score(y_val, model.predict(X_val)))

    return np.mean(accuracy_scores)  # Return average accuracy

# Modified Genetic Algorithm for Feature Selection
def modified_genetic_algorithm(X, y, num_generations=50, population_size=20, mutation_rate=0.1):
    num_features = X.shape[1]
    population = np.random.randint(2, size=(population_size, num_features))

    for generation in range(num_generations):
        fitness_scores = np.array([fitness_function(chrom, X, y) for chrom in population])
        sorted_indices = np.argsort(fitness_scores)[::-1]
        population = population[sorted_indices]

        if fitness_scores[0] - fitness_scores[-1] <= 0.01:  # Convergence check
            break

        parents = population[:4]  # Select top 4 parents
        offspring = np.mean(parents, axis=0) > 0.5  # Crossover strategy
        offspring = np.array(offspring, dtype=int)

        # Mutation
        mutation_mask = np.random.rand(*offspring.shape) < mutation_rate
        offspring[mutation_mask] = 1 - offspring[mutation_mask]

        if fitness_function(offspring, X, y) > fitness_scores[-1]:
            population[-1] = offspring  # Replace worst individual

    return population[0]  # Return best chromosome

# Run Modified Genetic Algorithm
best_chromosome = modified_genetic_algorithm(X, y)
selected_features = np.where(best_chromosome == 1)[0]

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED, stratify=y)

# Train KNN with selected features
knn_model = KNeighborsClassifier(n_neighbors=5)

# Train the model (NO EARLY STOPPING)
knn_model.fit(X_train[:, selected_features], y_train)

# Test the Model
y_pred = knn_model.predict(X_test[:, selected_features])
test_accuracy_knn = accuracy_score(y_test, y_pred)

# Output Results
print("Best Feature Selection:", best_chromosome)
print("Test Accuracy (KNN):", test_accuracy_knn)
from sklearn.metrics import precision_score, recall_score, roc_auc_score

# Evaluation Metrics
y_proba = knn_model.predict_proba(X_test[:, selected_features])[:, 1]  # For AUC-ROC
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
auc_roc = roc_auc_score(y_test, y_proba)

# Output Results
print("Test Precision (KNN):", precision)
print("Test Recall (KNN):", recall)
print("Test AUC-ROC (KNN):", auc_roc)

Best Feature Selection: [1 1 0 1 0 1 0 0 1 1 0 1 1]
Test Accuracy (KNN): 0.8731707317073171
Test Precision (KNN): 0.8761904761904762
Test Recall (KNN): 0.8761904761904762
Test AUC-ROC (KNN): 0.9644761904761905


# SVM+Modified GA

In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Set random seed for reproducibility
random_seed = 42
np.random.seed(random_seed)

# Load dataset
df = pd.read_csv('heart.csv')

# Encode categorical features
categorical_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'target']
le = LabelEncoder()
for feature in categorical_features:
    df[feature] = le.fit_transform(df[feature])

# Scale numerical features
numerical_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Split features and target variable
X = df.drop(columns=['target']).values  # Feature matrix
y = df['target'].values  # Target variable

# Fitness function using Stratified K-Fold Cross-Validation
def fitness_function(chromosome, X, y):
    selected_features = np.where(chromosome == 1)[0]
    if len(selected_features) == 0:
        return 0  # Avoid empty feature sets

    X_selected = X[:, selected_features]  # Select features based on chromosome
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_seed)
    accuracy_scores = []

    for train_idx, val_idx in skf.split(X_selected, y):
        X_train, X_val = X_selected[train_idx], X_selected[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        model = SVC(kernel='linear', random_state=random_seed)  # Using SVM
        model.fit(X_train, y_train)
        accuracy_scores.append(accuracy_score(y_val, model.predict(X_val)))

    return np.mean(accuracy_scores)  # Return average accuracy

# Modified Genetic Algorithm for Feature Selection
def genetic_algorithm(X, y, num_generations=50, population_size=20, mutation_rate=0.1):
    num_features = X.shape[1]
    population = np.random.randint(2, size=(population_size, num_features))

    for generation in range(num_generations):
        fitness_scores = np.array([fitness_function(chrom, X, y) for chrom in population])
        sorted_indices = np.argsort(fitness_scores)[::-1]
        population = population[sorted_indices]

        if fitness_scores[0] - fitness_scores[-1] <= 0.01:  # Convergence check
            break

        parents = population[:4]  # Select top 4 parents
        offspring = np.mean(parents, axis=0) > 0.5  # Crossover strategy
        offspring = np.array(offspring, dtype=int)  # Convert boolean to int

        # Mutation
        mutation_mask = np.random.rand(*offspring.shape) < mutation_rate
        offspring[mutation_mask] = 1 - offspring[mutation_mask]

        if fitness_function(offspring, X, y) > fitness_scores[-1]:
            population[-1] = offspring  # Replace worst individual

    return population[0]  # Return best chromosome

# Run Modified Genetic Algorithm
best_chromosome = genetic_algorithm(X, y)
selected_features = np.where(best_chromosome == 1)[0]

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_seed, stratify=y)

# Train SVM with selected features
svm_model = SVC(kernel='linear', random_state=random_seed)

# Train the model (NO EARLY STOPPING)
svm_model.fit(X_train[:, selected_features], y_train)

# Test the Model
y_pred = svm_model.predict(X_test[:, selected_features])
test_accuracy_svm = accuracy_score(y_test, y_pred)

# Output Results
print("Best Feature Selection:", best_chromosome)
print("Test Accuracy (SVM):", test_accuracy_svm)
from sklearn.metrics import precision_score, recall_score, roc_auc_score

# Evaluation Metrics
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# SVM does not support predict_proba by default, so use decision_function + normalization
decision_scores = svm_model.decision_function(X_test[:, selected_features])
# Normalize decision function to [0,1] for AUC-ROC
normalized_scores = (decision_scores - decision_scores.min()) / (decision_scores.max() - decision_scores.min())
auc_roc = roc_auc_score(y_test, normalized_scores)

# Output Results
print("Test Precision (SVM):", precision)
print("Test Recall (SVM):", recall)
print("Test AUC-ROC (SVM):", auc_roc)

Best Feature Selection: [1 1 1 1 1 1 1 1 0 0 1 1 1]
Test Accuracy (SVM): 0.8341463414634146
Test Precision (SVM): 0.7795275590551181
Test Recall (SVM): 0.9428571428571428
Test AUC-ROC (SVM): 0.9102857142857143


# Decision Tree+Modified GA

In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Set random seed for reproducibility
random_seed = 42
np.random.seed(random_seed)

# Load dataset
df = pd.read_csv('heart.csv')

# Encode categorical features
categorical_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'target']
le = LabelEncoder()
for feature in categorical_features:
    df[feature] = le.fit_transform(df[feature])

# Scale numerical features
numerical_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Split features and target variable
X = df.drop(columns=['target']).values  # Feature matrix
y = df['target'].values  # Target variable

# Fitness function using Stratified K-Fold Cross-Validation
def fitness_function(chromosome, X, y):
    selected_features = np.where(chromosome == 1)[0]
    if len(selected_features) == 0:
        return 0  # Avoid empty feature sets

    X_selected = X[:, selected_features]  # Select features based on chromosome
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_seed)
    accuracy_scores = []

    for train_idx, val_idx in skf.split(X_selected, y):
        X_train, X_val = X_selected[train_idx], X_selected[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        model = DecisionTreeClassifier(random_state=random_seed)  # Using Decision Tree
        model.fit(X_train, y_train)
        accuracy_scores.append(accuracy_score(y_val, model.predict(X_val)))

    return np.mean(accuracy_scores)  # Return average accuracy

# Genetic Algorithm for Feature Selection
def genetic_algorithm(X, y, num_generations=50, population_size=20, mutation_rate=0.1):
    num_features = X.shape[1]
    population = np.random.randint(2, size=(population_size, num_features))

    for generation in range(num_generations):
        fitness_scores = np.array([fitness_function(chrom, X, y) for chrom in population])
        sorted_indices = np.argsort(fitness_scores)[::-1]
        population = population[sorted_indices]

        if fitness_scores[0] - fitness_scores[-1] <= 0.01:  # Convergence check
            break

        parents = population[:4]  # Select top 4 parents
        offspring = np.mean(parents, axis=0) > 0.5  # Crossover strategy
        offspring = np.array(offspring, dtype=int)  # Convert boolean to int

        # Mutation
        mutation_mask = np.random.rand(*offspring.shape) < mutation_rate
        offspring[mutation_mask] = 1 - offspring[mutation_mask]

        if fitness_function(offspring, X, y) > fitness_scores[-1]:
            population[-1] = offspring  # Replace worst individual

    return population[0]  # Return best chromosome

# Run Genetic Algorithm
best_chromosome = genetic_algorithm(X, y)
selected_features = np.where(best_chromosome == 1)[0]

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_seed, stratify=y)

# Train Decision Tree with selected features
dt_model = DecisionTreeClassifier(random_state=random_seed)
dt_model.fit(X_train[:, selected_features], y_train)

# Test the Model
y_pred = dt_model.predict(X_test[:, selected_features])
test_accuracy_dt = accuracy_score(y_test, y_pred)

# Output Results
print("Best Feature Selection:", best_chromosome)
print("Test Accuracy (Decision Tree):", test_accuracy_dt)
from sklearn.metrics import precision_score, recall_score, roc_auc_score

# Evaluation Metrics
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# For AUC-ROC, use predict_proba if available
if hasattr(dt_model, "predict_proba"):
    y_proba = dt_model.predict_proba(X_test[:, selected_features])[:, 1]
    auc_roc = roc_auc_score(y_test, y_proba)
else:
    auc_roc = roc_auc_score(y_test, y_pred)  # Fallback in case predict_proba is not supported

# Output Results
print("Test Precision (Decision Tree):", precision)
print("Test Recall (Decision Tree):", recall)
print("Test AUC-ROC (Decision Tree):", auc_roc)

Best Feature Selection: [1 1 1 1 0 1 1 0 1 0 0 1 0]
Test Accuracy (Decision Tree): 1.0
Test Precision (Decision Tree): 1.0
Test Recall (Decision Tree): 1.0
Test AUC-ROC (Decision Tree): 1.0


# Naive Bayes+Modified GA

In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Set random seed for reproducibility
random_seed = 42
np.random.seed(random_seed)

# Load dataset
df = pd.read_csv('heart.csv')

# Encode categorical features
categorical_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'target']
le = LabelEncoder()
for feature in categorical_features:
    df[feature] = le.fit_transform(df[feature])

# Scale numerical features
numerical_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Split features and target variable
X = df.drop(columns=['target']).values  # Feature matrix
y = df['target'].values  # Target variable

# Fitness function using Stratified K-Fold Cross-Validation
def fitness_function(chromosome, X, y):
    selected_features = np.where(chromosome == 1)[0]
    if len(selected_features) == 0:
        return 0  # Avoid empty feature sets

    X_selected = X[:, selected_features]  # Select features based on chromosome
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_seed)
    accuracy_scores = []

    for train_idx, val_idx in skf.split(X_selected, y):
        X_train, X_val = X_selected[train_idx], X_selected[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        model = GaussianNB()  # Using Naïve Bayes
        model.fit(X_train, y_train)
        accuracy_scores.append(accuracy_score(y_val, model.predict(X_val)))

    return np.mean(accuracy_scores)  # Return average accuracy

# Genetic Algorithm for Feature Selection
def genetic_algorithm(X, y, num_generations=50, population_size=20, mutation_rate=0.1):
    num_features = X.shape[1]
    population = np.random.randint(2, size=(population_size, num_features))

    for generation in range(num_generations):
        fitness_scores = np.array([fitness_function(chrom, X, y) for chrom in population])
        sorted_indices = np.argsort(fitness_scores)[::-1]
        population = population[sorted_indices]

        if fitness_scores[0] - fitness_scores[-1] <= 0.01:  # Convergence check
            break

        parents = population[:4]  # Select top 4 parents
        offspring = np.mean(parents, axis=0) > 0.5  # Crossover strategy
        offspring = np.array(offspring, dtype=int)  # Convert boolean to int

        # Mutation
        mutation_mask = np.random.rand(*offspring.shape) < mutation_rate
        offspring[mutation_mask] = 1 - offspring[mutation_mask]

        if fitness_function(offspring, X, y) > fitness_scores[-1]:
            population[-1] = offspring  # Replace worst individual

    return population[0]  # Return best chromosome

# Run Genetic Algorithm
best_chromosome = genetic_algorithm(X, y)
selected_features = np.where(best_chromosome == 1)[0]

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_seed, stratify=y)

# Train Naïve Bayes with selected features
nb_model = GaussianNB()
nb_model.fit(X_train[:, selected_features], y_train)

# Test the Model
y_pred = nb_model.predict(X_test[:, selected_features])
test_accuracy_nb = accuracy_score(y_test, y_pred)

# Output Results
print("Best Feature Selection:", best_chromosome)
print("Test Accuracy (Naïve Bayes):", test_accuracy_nb)
from sklearn.metrics import precision_score, recall_score, roc_auc_score

# Evaluation Metrics
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# AUC-ROC using predicted probabilities
y_proba = nb_model.predict_proba(X_test[:, selected_features])[:, 1]
auc_roc = roc_auc_score(y_test, y_proba)

# Output Results
print("Test Precision (Naïve Bayes):", precision)
print("Test Recall (Naïve Bayes):", recall)
print("Test AUC-ROC (Naïve Bayes):", auc_roc)

Best Feature Selection: [0 1 1 1 1 0 1 0 0 1 1 1 1]
Test Accuracy (Naïve Bayes): 0.8341463414634146
Test Precision (Naïve Bayes): 0.8034188034188035
Test Recall (Naïve Bayes): 0.8952380952380953
Test AUC-ROC (Naïve Bayes): 0.9137142857142858


# LightGBM+Modified GA

In [8]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")
# Set random seed for reproducibility
random_seed = 42
np.random.seed(random_seed)

# Load dataset
df = pd.read_csv('heart.csv')

# Encode categorical features
categorical_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'target']
le = LabelEncoder()
for feature in categorical_features:
    df[feature] = le.fit_transform(df[feature])

# Scale numerical features
numerical_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Split features and target variable
X = df.drop(columns=['target']).values  # Feature matrix
y = df['target'].values  # Target variable

# Fitness function using Stratified K-Fold Cross-Validation
def fitness_function(chromosome, X, y):
    selected_features = np.where(chromosome == 1)[0]
    if len(selected_features) == 0:
        return 0  # Avoid empty feature sets

    X_selected = X[:, selected_features]  # Select features based on chromosome
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_seed)
    accuracy_scores = []

    for train_idx, val_idx in skf.split(X_selected, y):
        X_train, X_val = X_selected[train_idx], X_selected[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        model = lgb.LGBMClassifier(n_estimators=100, max_depth=5, random_state=random_seed, verbose=-1)
        model.fit(X_train, y_train)
        accuracy_scores.append(accuracy_score(y_val, model.predict(X_val)))

    return np.mean(accuracy_scores)  # Return average accuracy

# Genetic Algorithm for Feature Selection
def genetic_algorithm(X, y, num_generations=50, population_size=20, mutation_rate=0.1):
    num_features = X.shape[1]
    population = np.random.randint(2, size=(population_size, num_features))

    for generation in range(num_generations):
        fitness_scores = np.array([fitness_function(chrom, X, y) for chrom in population])
        sorted_indices = np.argsort(fitness_scores)[::-1]
        population = population[sorted_indices]

        if fitness_scores[0] - fitness_scores[-1] <= 0.01:  # Convergence check
            break

        parents = population[:4]  # Select top 4 parents
        offspring = np.mean(parents, axis=0) > 0.5  # Crossover strategy
        offspring = np.array(offspring, dtype=int)  # Convert boolean to int

        # Mutation
        mutation_mask = np.random.rand(*offspring.shape) < mutation_rate
        offspring[mutation_mask] = 1 - offspring[mutation_mask]

        if fitness_function(offspring, X, y) > fitness_scores[-1]:
            population[-1] = offspring  # Replace worst individual

    return population[0]  # Return best chromosome

# Run Genetic Algorithm
best_chromosome = genetic_algorithm(X, y)
selected_features = np.where(best_chromosome == 1)[0]

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_seed, stratify=y)

# Train LightGBM with selected features
lgb_model = lgb.LGBMClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=random_seed, verbose=-1)
lgb_model.fit(X_train[:, selected_features], y_train)

# Test the Model
y_pred = lgb_model.predict(X_test[:, selected_features])
test_accuracy_lgb = accuracy_score(y_test, y_pred)

# Output Results
print("Best Feature Selection:", best_chromosome)
print("Test Accuracy (LightGBM):", test_accuracy_lgb)
from sklearn.metrics import precision_score, recall_score, roc_auc_score

# Evaluation Metrics
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# AUC-ROC using predicted probabilities
y_proba = lgb_model.predict_proba(X_test[:, selected_features])[:, 1]
auc_roc = roc_auc_score(y_test, y_proba)

# Output Results
print("Test Precision (LightGBM):", precision)
print("Test Recall (LightGBM):", recall)
print("Test AUC-ROC (LightGBM):", auc_roc)

Best Feature Selection: [1 1 1 1 1 1 1 1 0 0 1 1 1]
Test Accuracy (LightGBM): 1.0
Test Precision (LightGBM): 1.0
Test Recall (LightGBM): 1.0
Test AUC-ROC (LightGBM): 1.0


# MLP+Modified GA

In [10]:
import numpy as np
import pandas as pd
import tensorflow as tf
import random
import os
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
import warnings

# Enable warnings
warnings.filterwarnings("default")

# Set fixed random seed for full reproducibility
RANDOM_SEED = 42
os.environ['PYTHONHASHSEED'] = str(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

# Ensure TensorFlow operates deterministically
tf.config.experimental.enable_op_determinism()

# Load dataset
df = pd.read_csv('heart.csv')

# Encode categorical features
categorical_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'target']
le = LabelEncoder()
for feature in categorical_features:
    df[feature] = le.fit_transform(df[feature])

# Scale numerical features
numerical_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Define features and target variable
X = df.drop(columns=['target']).values.astype(np.float32)
y = df['target'].values.astype(np.int32)

# Fitness function using Stratified K-Fold CV with Logistic Regression
def fitness_function(chromosome, X, y):
    selected_features = np.where(chromosome == 1)[0]
    if len(selected_features) == 0:
        return 0  # Avoid empty feature sets

    X_selected = X[:, selected_features]
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
    accuracy_scores = []

    for train_idx, val_idx in skf.split(X_selected, y):
        X_train, X_val = X_selected[train_idx], X_selected[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        # Logistic Regression for fast evaluation
        model = LogisticRegression(max_iter=100, solver='liblinear', random_state=RANDOM_SEED)
        model.fit(X_train, y_train)
        accuracy_scores.append(accuracy_score(y_val, model.predict(X_val)))

    return np.mean(accuracy_scores)

# Optimized Genetic Algorithm for Faster Feature Selection
def fast_genetic_algorithm(X, y, num_generations=20, population_size=10, mutation_rate=0.05):
    num_features = X.shape[1]
    population = np.random.randint(2, size=(population_size, num_features))

    for generation in range(num_generations):
        fitness_scores = np.array([fitness_function(chrom, X, y) for chrom in population])
        sorted_indices = np.argsort(fitness_scores)[::-1]
        population = population[sorted_indices]

        # Early stopping if fitness doesn't improve significantly
        if fitness_scores[0] - fitness_scores[-1] <= 0.005:
            break

        # Select top 2 parents
        parents = population[:2]

        # Create new offspring using uniform crossover
        offspring = (parents[0] + parents[1]) // 2
        offspring = np.array(offspring, dtype=int)

        # Apply mutation
        mutation_mask = np.random.rand(*offspring.shape) < mutation_rate
        offspring[mutation_mask] = 1 - offspring[mutation_mask]

        if fitness_function(offspring, X, y) > fitness_scores[-1]:
            population[-1] = offspring  # Replace worst individual

    return population[0]  # Return best chromosome

# Run Optimized GA for Feature Selection
best_chromosome = fast_genetic_algorithm(X, y)
selected_features = np.where(best_chromosome == 1)[0]

# Train/Test Split (70-30) using the best-selected features
X_train, X_test, y_train, y_test = train_test_split(X[:, selected_features], y, test_size=0.2, random_state=RANDOM_SEED, stratify=y)

# Build Final MLP Model
mlp_model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

mlp_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train MLP Model (Reduced Epochs for Speed)
mlp_model.fit(X_train, y_train, epochs=5, batch_size=32, verbose=1)

# Test the Model
y_pred = (mlp_model.predict(X_test) > 0.5).astype(int)
test_accuracy_mlp = accuracy_score(y_test, y_pred)

# Output Results
print("Best Feature Selection:", best_chromosome)
print("Selected Feature Count:", len(selected_features))
print("Test Accuracy (MLP with 70-30 split):", test_accuracy_mlp)
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix

# Predict the labels for the test set
y_pred = (mlp_model.predict(X_test) > 0.5).astype(int)

# Calculate metrics
test_accuracy_mlp = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
print("Precision:", precision)
print("Recall:", recall)
print("AUC-ROC:", roc_auc)


Epoch 1/5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6688 - loss: 0.6337   
Epoch 2/5
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8004 - loss: 0.4591 
Epoch 3/5
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8309 - loss: 0.4063 
Epoch 4/5
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8580 - loss: 0.3657 
Epoch 5/5
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8487 - loss: 0.3607 
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step 
Best Feature Selection: [1 1 1 1 1 0 1 0 1 1 0 1 0]
Selected Feature Count: 9
Test Accuracy (MLP with 70-30 split): 0.8195121951219512
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
Precision: 0.7698412698412699
Recall: 0.9238095238095239
AUC-ROC: 0.8169047619047619


# CNN+Modified GA

In [11]:
import numpy as np
import pandas as pd
import tensorflow as tf
import random
import os
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
import warnings

# Enable warnings
warnings.filterwarnings("default")

# Set fixed random seed for full reproducibility
RANDOM_SEED = 42
os.environ['PYTHONHASHSEED'] = str(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

# Ensure TensorFlow operates deterministically
tf.config.experimental.enable_op_determinism()

# Load dataset
df = pd.read_csv('heart.csv')

# Encode categorical features
categorical_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'target']
le = LabelEncoder()
for feature in categorical_features:
    df[feature] = le.fit_transform(df[feature])

# Scale numerical features
numerical_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Define features and target variable
X = df.drop(columns=['target']).values.astype(np.float32)
y = df['target'].values.astype(np.int32)

# Fitness function using Stratified K-Fold CV with Logistic Regression
def fitness_function(chromosome, X, y):
    selected_features = np.where(chromosome == 1)[0]
    if len(selected_features) == 0:
        return 0  # Avoid empty feature sets

    X_selected = X[:, selected_features]
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)  # Fewer splits for speed
    accuracy_scores = []

    for train_idx, val_idx in skf.split(X_selected, y):
        X_train, X_val = X_selected[train_idx], X_selected[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        # Logistic Regression for fast evaluation
        model = LogisticRegression(max_iter=100, solver='liblinear', random_state=RANDOM_SEED)
        model.fit(X_train, y_train)
        accuracy_scores.append(accuracy_score(y_val, model.predict(X_val)))

    return np.mean(accuracy_scores)

# Optimized Genetic Algorithm for Faster Feature Selection
def fast_genetic_algorithm(X, y, num_generations=20, population_size=10, mutation_rate=0.05):
    num_features = X.shape[1]
    population = np.random.randint(2, size=(population_size, num_features))

    for generation in range(num_generations):
        fitness_scores = np.array([fitness_function(chrom, X, y) for chrom in population])
        sorted_indices = np.argsort(fitness_scores)[::-1]
        population = population[sorted_indices]

        # Early stopping if fitness doesn't improve significantly
        if fitness_scores[0] - fitness_scores[-1] <= 0.005:
            break

        # Select top 2 parents
        parents = population[:2]

        # Create new offspring using uniform crossover
        offspring = (parents[0] + parents[1]) // 2
        offspring = np.array(offspring, dtype=int)

        # Apply mutation
        mutation_mask = np.random.rand(*offspring.shape) < mutation_rate
        offspring[mutation_mask] = 1 - offspring[mutation_mask]

        if fitness_function(offspring, X, y) > fitness_scores[-1]:
            population[-1] = offspring  # Replace worst individual

    return population[0]  # Return best chromosome

# Run Optimized GA for Feature Selection
best_chromosome = fast_genetic_algorithm(X, y)
selected_features = np.where(best_chromosome == 1)[0]

# Train/Test Split (70-30) using the best-selected features
X_train, X_test, y_train, y_test = train_test_split(X[:, selected_features], y, test_size=0.2, random_state=RANDOM_SEED, stratify=y)

# Build Final CNN Model
cnn_model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train CNN Model (Reduced Epochs for Speed)
cnn_model.fit(X_train, y_train, epochs=5, batch_size=32, verbose=1)

# Test the Model
y_pred = (cnn_model.predict(X_test) > 0.5).astype(int)
test_accuracy_cnn = accuracy_score(y_test, y_pred)

# Output Results
print("Best Feature Selection:", best_chromosome)
print("Selected Feature Count:", len(selected_features))
print("Test Accuracy (CNN with 80-20 split):", test_accuracy_cnn)
from sklearn.metrics import precision_score, recall_score, roc_auc_score

# Evaluation Metrics
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# AUC-ROC using predicted probabilities
y_proba = cnn_model.predict(X_test)  # Get probabilities from CNN
auc_roc = roc_auc_score(y_test, y_proba)

# Output Results
print("Test Precision (CNN):", precision)
print("Test Recall (CNN):", recall)
print("Test AUC-ROC (CNN):", auc_roc)

Epoch 1/5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5217 - loss: 0.7049   
Epoch 2/5
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 943us/step - accuracy: 0.7583 - loss: 0.5833
Epoch 3/5
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8078 - loss: 0.4845 
Epoch 4/5
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8080 - loss: 0.4387 
Epoch 5/5
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8108 - loss: 0.4254 
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 
Best Feature Selection: [1 1 1 1 1 0 1 0 1 1 0 1 0]
Selected Feature Count: 9
Test Accuracy (CNN with 80-20 split): 0.775609756097561
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
Test Precision (CNN): 0.7521367521367521
Test Recall (CNN): 0.8380952380952381
Test AUC-ROC (CNN): 0.8846666666666668


# LSTM+Modified GA

In [12]:
import numpy as np
import pandas as pd
import tensorflow as tf
import random
import os
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
import warnings

# Enable warnings
warnings.filterwarnings("default")

# Set fixed random seed for full reproducibility
RANDOM_SEED = 42
os.environ['PYTHONHASHSEED'] = str(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

# Ensure TensorFlow operates deterministically
tf.config.experimental.enable_op_determinism()

# Load dataset
df = pd.read_csv('heart.csv')

# Encode categorical features
categorical_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'target']
le = LabelEncoder()
for feature in categorical_features:
    df[feature] = le.fit_transform(df[feature])

# Scale numerical features
numerical_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Define features and target variable
X = df.drop(columns=['target']).values.astype(np.float32)
y = df['target'].values.astype(np.int32)

# Fitness function using Stratified K-Fold CV with Logistic Regression
def fitness_function(chromosome, X, y):
    selected_features = np.where(chromosome == 1)[0]
    if len(selected_features) == 0:
        return 0  # Avoid empty feature sets

    X_selected = X[:, selected_features]
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
    accuracy_scores = []

    for train_idx, val_idx in skf.split(X_selected, y):
        X_train, X_val = X_selected[train_idx], X_selected[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        # Logistic Regression for fast evaluation
        model = LogisticRegression(max_iter=100, solver='liblinear', random_state=RANDOM_SEED)
        model.fit(X_train, y_train)
        accuracy_scores.append(accuracy_score(y_val, model.predict(X_val)))

    return np.mean(accuracy_scores)

# Optimized Genetic Algorithm for Faster Feature Selection
def fast_genetic_algorithm(X, y, num_generations=20, population_size=10, mutation_rate=0.05):
    num_features = X.shape[1]
    population = np.random.randint(2, size=(population_size, num_features))

    for generation in range(num_generations):
        fitness_scores = np.array([fitness_function(chrom, X, y) for chrom in population])
        sorted_indices = np.argsort(fitness_scores)[::-1]
        population = population[sorted_indices]

        # Early stopping if fitness doesn't improve significantly
        if fitness_scores[0] - fitness_scores[-1] <= 0.005:
            break

        # Select top 2 parents
        parents = population[:2]

        # Create new offspring using uniform crossover
        offspring = (parents[0] + parents[1]) // 2
        offspring = np.array(offspring, dtype=int)

        # Apply mutation
        mutation_mask = np.random.rand(*offspring.shape) < mutation_rate
        offspring[mutation_mask] = 1 - offspring[mutation_mask]

        if fitness_function(offspring, X, y) > fitness_scores[-1]:
            population[-1] = offspring  # Replace worst individual

    return population[0]  # Return best chromosome

# Run Optimized GA for Feature Selection
best_chromosome = fast_genetic_algorithm(X, y)
selected_features = np.where(best_chromosome == 1)[0]

# Train/Test Split (70-30) using the best-selected features
X_train, X_test, y_train, y_test = train_test_split(X[:, selected_features], y, test_size=0.2, random_state=RANDOM_SEED, stratify=y)

# Reshape input for LSTM
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

# Build Final LSTM Model
lstm_model = Sequential([
    LSTM(64, activation='relu', input_shape=(X_train.shape[1], 1)),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train LSTM Model (Reduced Epochs for Speed)
lstm_model.fit(X_train, y_train, epochs=5, batch_size=32, verbose=1)

# Test the Model
y_pred = (lstm_model.predict(X_test) > 0.5).astype(int)
test_accuracy_lstm = accuracy_score(y_test, y_pred)

# Output Results
print("Best Feature Selection:", best_chromosome)
print("Selected Feature Count:", len(selected_features))
print("Test Accuracy (LSTM with 80-20 split):", test_accuracy_lstm)
from sklearn.metrics import precision_score, recall_score, roc_auc_score

# Evaluation Metrics
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# AUC-ROC using predicted probabilities
y_proba = lstm_model.predict(X_test)  # Get probabilities from LSTM
auc_roc = roc_auc_score(y_test, y_proba)

# Output Results
print("Test Precision (LSTM):", precision)
print("Test Recall (LSTM):", recall)
print("Test AUC-ROC (LSTM):", auc_roc)


Epoch 1/5


  super().__init__(**kwargs)


[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5497 - loss: 0.6893
Epoch 2/5
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6961 - loss: 0.6193
Epoch 3/5
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7433 - loss: 0.5441
Epoch 4/5
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7651 - loss: 0.5260
Epoch 5/5
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7406 - loss: 0.5181
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Best Feature Selection: [1 1 1 1 1 0 1 0 1 1 0 1 0]
Selected Feature Count: 9
Test Accuracy (LSTM with 80-20 split): 0.775609756097561
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
Test Precision (LSTM): 0.736
Test Recall (LSTM): 0.8761904761904762
Test AUC-ROC (LSTM): 0.8353333333333334
