In [1]:
# ============================================================================
# CELL 1: IMPORT LIBRARIES
# ============================================================================
# Run this cell first to import all required libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, confusion_matrix, 
                            ConfusionMatrixDisplay, roc_curve, auc,
                            f1_score, recall_score, precision_score, roc_auc_score)
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

print("✓ All libraries imported successfully!")
print(f"  - Pandas version: {pd.__version__}")
print(f"  - NumPy version: {np.__version__}")
print(f"  - XGBoost version: {xgb.__version__}")

✓ All libraries imported successfully!
  - Pandas version: 2.3.3
  - NumPy version: 2.4.0
  - XGBoost version: 3.1.2


In [2]:
# ============================================================================
# CELL 2: CONFIGURE PATHS AND PARAMETERS
# ============================================================================
# Update the DATA_PATH to point to your CSV file

# UPDATE THIS PATH TO YOUR CSV FILE LOCATION:
DATA_PATH = 'heart.csv'  # or 'heart_disease_data01.csv'

# For Windows: DATA_PATH = r'C:\Users\YourName\Documents\heart.csv'
# For Mac/Linux: DATA_PATH = '/home/username/Documents/heart.csv'

# SMO Parameters
N_MONKEYS = 30
N_ITERATIONS = 20  # Start with 20 for testing, increase to 100 for final run

# Hyperparameter bounds
bounds = np.array([
    [10, 100],    # rf_n_estimators
    [1, 20],      # rf_max_depth
    [10, 100],    # xgb_n_estimators
    [1, 20],      # xgb_max_depth
    [0.01, 0.3],  # xgb_learning_rate
    [2, 50]       # rf_min_samples_split
])

print("✓ Configuration set!")
print(f"  - Data path: {DATA_PATH}")
print(f"  - Number of monkeys: {N_MONKEYS}")
print(f"  - Number of iterations: {N_ITERATIONS}")

✓ Configuration set!
  - Data path: heart.csv
  - Number of monkeys: 30
  - Number of iterations: 20


In [3]:
# ============================================================================
# CELL 3: LOAD AND PREVIEW DATA
# ============================================================================
# Load the dataset and check if it's loaded correctly

print("Loading dataset...")
print("-" * 80)

column_names = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 
               'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 
               'ca', 'thal', 'target']

try:
    # Try loading with headers first
    data_test = pd.read_csv(DATA_PATH, nrows=5)
    
    # Check if file has headers
    if 'target' in data_test.columns or 'age' in data_test.columns:
        print("✓ File has headers, loading with headers...")
        data = pd.read_csv(DATA_PATH)
    else:
        print("✓ File has no headers, loading with custom column names...")
        data = pd.read_csv(DATA_PATH, header=None, names=column_names)
    
    print(f"✓ Data loaded successfully!")
    print(f"  - Shape: {data.shape}")
    print(f"  - Columns: {list(data.columns)}")
    print("\nFirst 5 rows:")
    print(data.head())
    
    print("\nData types:")
    print(data.dtypes)
    
    print("\nMissing values:")
    print(data.isnull().sum())
    
except FileNotFoundError:
    print(f"✗ ERROR: File not found at '{DATA_PATH}'")
    print("Please check the file path and try again.")
except Exception as e:
    print(f"✗ ERROR: {e}")

Loading dataset...
--------------------------------------------------------------------------------
✓ File has headers, loading with headers...
✓ Data loaded successfully!
  - Shape: (303, 14)
  - Columns: ['age', 'sex', 'chest_pain_type', 'resting_bp', 'cholestoral', 'fasting_blood_sugar', 'restecg', 'max_hr', 'exang', 'oldpeak', 'slope', 'num_major_vessels', 'thal', 'target']

First 5 rows:
   age  sex  chest_pain_type  resting_bp  cholestoral  fasting_blood_sugar  \
0   63    1                3         145          233                    1   
1   37    1                2         130          250                    0   
2   41    0                1         130          204                    0   
3   56    1                1         120          236                    0   
4   57    0                0         120          354                    0   

   restecg  max_hr  exang  oldpeak  slope  num_major_vessels  thal  target  
0        0     150      0      2.3      0                 

In [4]:
# ============================================================================
# CELL 4: DATA PREPROCESSING
# ============================================================================
# Clean and prepare the data

print("\nPreprocessing data...")
print("-" * 80)

# Replace '?' with NaN
data.replace('?', pd.NA, inplace=True)

# Convert all columns to numeric
for col in data.columns:
    data[col] = pd.to_numeric(data[col], errors='coerce')

print(f"Before dropping NaN - Shape: {data.shape}")

# Drop rows with missing values
data.dropna(inplace=True)

print(f"After dropping NaN - Shape: {data.shape}")

# Check target distribution
print(f"\nTarget distribution:")
print(data['target'].value_counts())

# One-hot encoding for categorical features
categorical_columns = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

# Only encode columns that exist in the dataset
categorical_columns = [col for col in categorical_columns if col in data.columns]

print(f"\nCategorical columns to encode: {categorical_columns}")

data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

print(f"After encoding - Shape: {data.shape}")
print(f"Final columns: {list(data.columns)}")

# Define features and target
X = data.drop('target', axis=1)
y = data['target'].apply(lambda x: 1 if x > 0 else 0)

print(f"\n✓ Preprocessing complete!")
print(f"  - Features shape: {X.shape}")
print(f"  - Target shape: {y.shape}")
print(f"  - Class distribution: {y.value_counts().to_dict()}")


Preprocessing data...
--------------------------------------------------------------------------------
Before dropping NaN - Shape: (303, 14)
After dropping NaN - Shape: (303, 14)

Target distribution:
target
1    165
0    138
Name: count, dtype: int64

Categorical columns to encode: ['sex', 'restecg', 'exang', 'slope', 'thal']
After encoding - Shape: (303, 18)
Final columns: ['age', 'chest_pain_type', 'resting_bp', 'cholestoral', 'fasting_blood_sugar', 'max_hr', 'oldpeak', 'num_major_vessels', 'target', 'sex_1', 'restecg_1', 'restecg_2', 'exang_1', 'slope_1', 'slope_2', 'thal_1', 'thal_2', 'thal_3']

✓ Preprocessing complete!
  - Features shape: (303, 17)
  - Target shape: (303,)
  - Class distribution: {1: 165, 0: 138}


In [5]:
# ============================================================================
# CELL 5: TRAIN-TEST SPLIT
# ============================================================================
# Split data into training and testing sets

print("\nSplitting data...")
print("-" * 80)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

print(f"✓ Data split complete!")
print(f"  - Training set: {X_train.shape[0]} samples")
print(f"  - Test set: {X_test.shape[0]} samples")
print(f"  - Number of features: {X_train.shape[1]}")
print(f"\nTraining set class distribution:")
print(y_train.value_counts())
print(f"\nTest set class distribution:")
print(y_test.value_counts())


Splitting data...
--------------------------------------------------------------------------------
✓ Data split complete!
  - Training set: 212 samples
  - Test set: 91 samples
  - Number of features: 17

Training set class distribution:
target
1    115
0     97
Name: count, dtype: int64

Test set class distribution:
target
1    50
0    41
Name: count, dtype: int64


In [None]:
 ============================================================================
# CELL 6: DEFINE SPIDER MONKEY CLASS AND FUNCTIONS
# ============================================================================
# Define all the functions needed for SMO

print("\nDefining Spider Monkey Optimization functions...")
print("-" * 80)

class SpiderMonkey:
    """Represents a spider monkey with position and fitness."""
    def __init__(self, position, fitness):
        self.position = position
        self.fitness = fitness

print("✓ SpiderMonkey class defined")

def evaluate_fitness(position, X_train, X_test, y_train, y_test):
    """
    Evaluate fitness of a position (hyperparameter combination).
    Returns negative accuracy for minimization.
    """
    try:
        # Decode position to hyperparameters
        rf_n_estimators = int(position[0])
        rf_max_depth = int(position[1])
        xgb_n_estimators = int(position[2])
        xgb_max_depth = int(position[3])
        xgb_learning_rate = position[4]
        rf_min_samples_split = int(position[5])
        
        # Define the base models
        base_models = [
            ('rf', RandomForestClassifier(
                n_estimators=rf_n_estimators,
                max_depth=rf_max_depth,
                min_samples_split=rf_min_samples_split,
                random_state=42,
                n_jobs=-1
            )),
            ('xgb', xgb.XGBClassifier(
                n_estimators=xgb_n_estimators,
                max_depth=xgb_max_depth,
                learning_rate=xgb_learning_rate,
                random_state=42,
                eval_metric='logloss',
                n_jobs=-1,
                verbosity=0
            ))
        ]
        
        # Define the meta-model
        meta_model = LogisticRegression(max_iter=1000)
        
        # Create the stacking classifier
        stacking_clf = StackingClassifier(
            estimators=base_models,
            final_estimator=meta_model,
            cv=5,
            n_jobs=-1
        )
        
        # Train and predict
        stacking_clf.fit(X_train, y_train)
        y_pred = stacking_clf.predict(X_test)
        
        # Calculate accuracy
        accuracy = accuracy_score(y_test, y_pred)
        
        return -accuracy  # Negative for minimization
    
    except Exception as e:
        print(f"Error in evaluate_fitness: {e}")
        return 0  # Return worst fitness on error

print("✓ evaluate_fitness() function defined")

def initialize_spider_monkeys(n_monkeys, bounds, X_train, X_test, y_train, y_test):
    """Initialize spider monkey population."""
    monkeys = []
    print("Initializing spider monkeys...")
    for i in range(n_monkeys):
        position = np.random.uniform(bounds[:, 0], bounds[:, 1])
        fitness = evaluate_fitness(position, X_train, X_test, y_train, y_test)
        monkeys.append(SpiderMonkey(position, fitness))
        if (i + 1) % 10 == 0:
            print(f"  Initialized {i + 1}/{n_monkeys} monkeys")
    return monkeys

print("✓ initialize_spider_monkeys() function defined")

def update_position(monkey, global_best, local_best, bounds, X_train, X_test, y_train, y_test):
    """Update spider monkey position."""
    new_position = (
        monkey.position + 
        np.random.uniform(-1, 1, size=bounds.shape[0]) * (local_best.position - monkey.position) +
        np.random.uniform(-1, 1, size=bounds.shape[0]) * (global_best.position - monkey.position)
    )
    
    new_position = np.clip(new_position, bounds[:, 0], bounds[:, 1])
    new_fitness = evaluate_fitness(new_position, X_train, X_test, y_train, y_test)
    
    if new_fitness < monkey.fitness:
        monkey.position = new_position
        monkey.fitness = new_fitness

print("✓ update_position() function defined")

print("\n" + "=" * 80)
print("ALL FUNCTIONS SUCCESSFULLY DEFINED!")
print("=" * 80)
print("\nDefined functions:")
print("  1. SpiderMonkey (class)")
print("  2. evaluate_fitness()")
print("  3. initialize_spider_monkeys()")
print("  4. update_position()")
print("\nYou can now proceed to Cell 7 for testing!")
print("=" * 80)

In [6]:
# ============================================================================
# CELL 7: TEST SINGLE EVALUATION (SANITY CHECK)
# ============================================================================
# Test if a single fitness evaluation works

print("\nTesting single fitness evaluation...")
print("-" * 80)

# Create a random position
test_position = np.array([50, 10, 50, 5, 0.1, 10])

print(f"Test hyperparameters:")
print(f"  RF: n_estimators={int(test_position[0])}, max_depth={int(test_position[1])}, min_samples_split={int(test_position[5])}")
print(f"  XGB: n_estimators={int(test_position[2])}, max_depth={int(test_position[3])}, learning_rate={test_position[4]:.4f}")

try:
    test_fitness = evaluate_fitness(test_position, X_train, X_test, y_train, y_test)
    test_accuracy = -test_fitness
    print(f"\n✓ Test evaluation successful!")
    print(f"  - Test Accuracy: {test_accuracy:.4f}")
    print("\nIf you see this message, your functions are working correctly!")
    print("You can proceed to Cell 8 to run the full optimization.")
except Exception as e:
    print(f"✗ Test evaluation failed: {e}")
    import traceback
    print("\nFull error traceback:")
    traceback.print_exc()


Testing single fitness evaluation...
--------------------------------------------------------------------------------
Test hyperparameters:
  RF: n_estimators=50, max_depth=10, min_samples_split=10
  XGB: n_estimators=50, max_depth=5, learning_rate=0.1000
✗ Test evaluation failed: name 'evaluate_fitness' is not defined

Full error traceback:


Traceback (most recent call last):
  File "/var/folders/jg/v0bbfb4d6z31kv3sqwt32tr40000gn/T/ipykernel_19425/3405926086.py", line 17, in <module>
    test_fitness = evaluate_fitness(test_position, X_train, X_test, y_train, y_test)
                   ^^^^^^^^^^^^^^^^
NameError: name 'evaluate_fitness' is not defined


In [7]:
# ============================================================================
# CELL 8: RUN SPIDER MONKEY OPTIMIZATION
# ============================================================================
# Main optimization loop

print("\n" + "=" * 80)
print("STARTING SPIDER MONKEY OPTIMIZATION")
print("=" * 80)

# Initialize population
monkeys = initialize_spider_monkeys(N_MONKEYS, bounds, X_train, X_test, y_train, y_test)
global_best = min(monkeys, key=lambda monkey: monkey.fitness)

print(f"\n✓ Initial best accuracy: {-global_best.fitness:.4f}")

accuracies = []

# Main optimization loop
print("\nOptimization progress:")
print("-" * 80)

for iteration in range(N_ITERATIONS):
    # Find local best
    local_best = min(monkeys, key=lambda monkey: monkey.fitness)
    
    # Update all monkeys
    for monkey in monkeys:
        update_position(monkey, global_best, local_best, bounds, X_train, X_test, y_train, y_test)
    
    # Update global best
    current_best = min(monkeys, key=lambda monkey: monkey.fitness)
    if current_best.fitness < global_best.fitness:
        global_best = current_best
    
    # Track accuracy
    current_accuracy = -global_best.fitness
    accuracies.append(current_accuracy)
    
    # Print progress
    print(f'Iteration {iteration + 1:3d}/{N_ITERATIONS}: Best Accuracy = {current_accuracy:.4f}')

print("-" * 80)
print(f"✓ Optimization complete!")

best_position = global_best.position
best_fitness = -global_best.fitness

print(f"\nBEST HYPERPARAMETERS FOUND:")
print(f"  Random Forest:")
print(f"    - n_estimators: {int(best_position[0])}")
print(f"    - max_depth: {int(best_position[1])}")
print(f"    - min_samples_split: {int(best_position[5])}")
print(f"  XGBoost:")
print(f"    - n_estimators: {int(best_position[2])}")
print(f"    - max_depth: {int(best_position[3])}")
print(f"    - learning_rate: {best_position[4]:.4f}")
print(f"\nBest Accuracy: {best_fitness:.4f}")


STARTING SPIDER MONKEY OPTIMIZATION


NameError: name 'initialize_spider_monkeys' is not defined

In [None]:
# ============================================================================
# CELL 9: PLOT ACCURACY OVER ITERATIONS
# ============================================================================
# Visualize optimization progress

plt.figure(figsize=(12, 6))
plt.plot(range(1, N_ITERATIONS + 1), accuracies, marker='o', 
         linestyle='-', color='blue', linewidth=2, markersize=6)
plt.title('Accuracy over Iterations during SMO', fontsize=16, fontweight='bold')
plt.xlabel('Iteration', fontsize=12)
plt.ylabel('Accuracy', fontsize=12)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print("✓ Accuracy plot displayed")

In [None]:
# ============================================================================
# CELL 10: TRAIN FINAL MODEL WITH BEST HYPERPARAMETERS
# ============================================================================
# Retrain the model with optimized hyperparameters

print("\nRetraining model with best hyperparameters...")
print("-" * 80)

# Create models with best hyperparameters
base_models = [
    ('rf', RandomForestClassifier(
        n_estimators=int(best_position[0]),
        max_depth=int(best_position[1]),
        min_samples_split=int(best_position[5]),
        random_state=42,
        n_jobs=-1
    )),
    ('xgb', xgb.XGBClassifier(
        n_estimators=int(best_position[2]),
        max_depth=int(best_position[3]),
        learning_rate=best_position[4],
        random_state=42,
        eval_metric='logloss',
        n_jobs=-1,
        verbosity=0
    ))
]

meta_model = LogisticRegression(max_iter=1000)

stacking_clf = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5,
    n_jobs=-1
)

# Train the final model
stacking_clf.fit(X_train, y_train)

# Make predictions
y_pred = stacking_clf.predict(X_test)
y_pred_proba = stacking_clf.predict_proba(X_test)[:, 1]

print("✓ Final model trained!")


In [None]:
# ============================================================================
# CELL 11: CALCULATE ALL METRICS
# ============================================================================
# Compute and display all evaluation metrics

print("\nFINAL MODEL EVALUATION METRICS:")
print("=" * 80)

# Calculate metrics
final_accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

print(f"Accuracy:  {final_accuracy:.4f}")
print(f"F1 Score:  {f1:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"Precision: {precision:.4f}")
print(f"ROC-AUC:   {roc_auc:.4f}")
print("=" * 80)

In [None]:
# ============================================================================
# CELL 12: PLOT CONFUSION MATRIX
# ============================================================================
# Visualize the confusion matrix

cm = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(8, 6))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['No Disease', 'Disease'])
disp.plot(cmap='Blues', values_format='d')
plt.title('Confusion Matrix', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

print("✓ Confusion matrix displayed")


In [None]:
# ============================================================================
# CELL 13: PLOT ROC CURVE
# ============================================================================
# Visualize the ROC curve

fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc_val = auc(fpr, tpr)

plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (AUC = {roc_auc_val:.4f})')
plt.plot([0, 1], [0, 1], color='grey', lw=2, linestyle='--', label='Random Classifier')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=12)
plt.ylabel('True Positive Rate', fontsize=12)
plt.title('Receiver Operating Characteristic (ROC) Curve', fontsize=16, fontweight='bold')
plt.legend(loc="lower right", fontsize=11)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print("✓ ROC curve displayed")

In [None]:
# ============================================================================
# CELL 14: SUMMARY AND FINAL OUTPUT
# ============================================================================
# Display final summary

print("\n" + "=" * 80)
print("EXECUTION COMPLETED SUCCESSFULLY!")
print("=" * 80)

print("\nSUMMARY:")
print(f"  Dataset: {DATA_PATH}")
print(f"  Total samples: {len(X)}")
print(f"  Training samples: {len(X_train)}")
print(f"  Test samples: {len(X_test)}")
print(f"  Features: {X_train.shape[1]}")
print(f"\n  SMO Iterations: {N_ITERATIONS}")
print(f"  Population Size: {N_MONKEYS}")
print(f"\n  Final Test Accuracy: {final_accuracy:.4f}")
print(f"  F1 Score: {f1:.4f}")
print(f"  ROC-AUC: {roc_auc:.4f}")

print("\n" + "=" * 80)