In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Load the dataset
data = load_breast_cancer()
X, y = data.data, data.target
feature_names = data.feature_names

In [3]:
feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [4]:
X.shape

(569, 30)

In [5]:
y.shape

(569,)

In [None]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [6]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((398, 30), (171, 30), (398,), (171,))

In [7]:
# Define the fitness function
def fitness_function(feature_subset):
    """
    Evaluate the performance of a Random Forest model using the selected features.
    :param feature_subset: Binary array indicating which features are selected (1 = selected, 0 = not selected).
    :return: Accuracy of the model (higher is better).
    """
    # Select only the features marked as 1
    X_train_selected = X_train[:, feature_subset == 1]
    X_test_selected = X_test[:, feature_subset == 1]
    
    # Train a Random Forest classifier
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train_selected, y_train)
    
    # Evaluate the model on the test set
    y_pred = model.predict(X_test_selected)
    accuracy = accuracy_score(y_test, y_pred)
    
    return accuracy

In [11]:
feature_subset = np.ones(30)
feature_subset

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [12]:
fitness_function(feature_subset)

0.9707602339181286

In [13]:
feature_subset[15:] = 0
feature_subset

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [14]:
fitness_function(feature_subset)

0.9532163742690059

In [15]:
# PSO parameters
num_particles = 20       # Number of particles in the swarm
max_iterations = 50      # Maximum number of iterations
w = 0.7                  # Inertia weight
c1 = 1.5                 # Cognitive coefficient
c2 = 1.5                 # Social coefficient
num_features = X.shape[1]  # Total number of features

In [16]:
num_features

30

In [17]:
# Initialize the swarm
positions = np.random.randint(0, 2, size=(num_particles, num_features))  # Binary positions (0 or 1)
velocities = np.random.uniform(-1, 1, size=(num_particles, num_features))

In [18]:
positions

array([[1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1,
        1, 0, 0, 1, 0, 0, 1, 1],
       [1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0,
        1, 0, 0, 1, 1, 1, 0, 1],
       [1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0,
        1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1,
        0, 0, 0, 1, 0, 0, 1, 0],
       [0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1,
        1, 0, 0, 1, 0, 1, 1, 0],
       [1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0,
        0, 1, 1, 1, 0, 1, 0, 0],
       [1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1,
        1, 1, 1, 1, 1, 0, 1, 1],
       [0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1,
        0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
        0, 0, 0, 1, 1, 1, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 1,

In [19]:
positions.shape

(20, 30)

In [20]:
# Initialize personal best positions and fitness values
p_best_positions = positions.copy()
p_best_fitness = np.array([fitness_function(pos) for pos in positions])

In [21]:
# Initialize global best position and fitness value
g_best_position = p_best_positions[np.argmax(p_best_fitness)]
g_best_fitness = np.max(p_best_fitness)

In [22]:
# PSO algorithm
for iteration in range(max_iterations):
    for i in range(num_particles):
        # Update velocity (sigmoid transformation ensures binary positions)
        r1, r2 = np.random.rand(), np.random.rand()
        velocities[i] = (w * velocities[i] +
                         c1 * r1 * (p_best_positions[i] - positions[i]) +
                         c2 * r2 * (g_best_position - positions[i]))
        
        # Update position (binary decision based on sigmoid)
        sigmoid = 1 / (1 + np.exp(-velocities[i]))
        positions[i] = (np.random.rand(num_features) < sigmoid).astype(int)
        
        # Evaluate fitness
        fitness = fitness_function(positions[i])
        
        # Update personal best
        if fitness > p_best_fitness[i]:
            p_best_fitness[i] = fitness
            p_best_positions[i] = positions[i]
        
        # Update global best
        if fitness > g_best_fitness:
            g_best_fitness = fitness
            g_best_position = positions[i]
    
    # Print progress
    print(f"Iteration {iteration + 1}: Best Fitness = {g_best_fitness:.4f}")

Iteration 1: Best Fitness = 0.9883
Iteration 2: Best Fitness = 0.9942
Iteration 3: Best Fitness = 0.9942
Iteration 4: Best Fitness = 0.9942
Iteration 5: Best Fitness = 0.9942
Iteration 6: Best Fitness = 0.9942
Iteration 7: Best Fitness = 0.9942
Iteration 8: Best Fitness = 0.9942
Iteration 9: Best Fitness = 0.9942
Iteration 10: Best Fitness = 0.9942
Iteration 11: Best Fitness = 0.9942
Iteration 12: Best Fitness = 0.9942
Iteration 13: Best Fitness = 0.9942
Iteration 14: Best Fitness = 0.9942
Iteration 15: Best Fitness = 0.9942
Iteration 16: Best Fitness = 0.9942
Iteration 17: Best Fitness = 0.9942
Iteration 18: Best Fitness = 0.9942
Iteration 19: Best Fitness = 0.9942
Iteration 20: Best Fitness = 0.9942
Iteration 21: Best Fitness = 0.9942
Iteration 22: Best Fitness = 0.9942
Iteration 23: Best Fitness = 0.9942
Iteration 24: Best Fitness = 0.9942
Iteration 25: Best Fitness = 0.9942
Iteration 26: Best Fitness = 0.9942
Iteration 27: Best Fitness = 0.9942
Iteration 28: Best Fitness = 0.9942
I

In [23]:
# Output the result
print("\nOptimization Complete:")
selected_features = np.where(g_best_position == 1)[0]
print(f"Selected Features: {selected_features}")
print(f"Feature Names: {[feature_names[i] for i in selected_features]}")
print(f"Best Fitness (Accuracy): {g_best_fitness:.4f}")


Optimization Complete:
Selected Features: [ 0  2  6  8  9 10 16 17 18 22 23 25 26 28 29]
Feature Names: [np.str_('mean radius'), np.str_('mean perimeter'), np.str_('mean concavity'), np.str_('mean symmetry'), np.str_('mean fractal dimension'), np.str_('radius error'), np.str_('concavity error'), np.str_('concave points error'), np.str_('symmetry error'), np.str_('worst perimeter'), np.str_('worst area'), np.str_('worst compactness'), np.str_('worst concavity'), np.str_('worst symmetry'), np.str_('worst fractal dimension')]
Best Fitness (Accuracy): 0.9942


In [25]:
# Compare model performance before and after feature selection
# Before feature selection
model_full = RandomForestClassifier(random_state=42)
model_full.fit(X_train, y_train)
y_pred_full = model_full.predict(X_test)
accuracy_full = accuracy_score(y_test, y_pred_full)
accuracy_full

0.9707602339181286

In [26]:
# After feature selection
X_train_selected = X_train[:, g_best_position == 1]
X_test_selected = X_test[:, g_best_position == 1]
model_selected = RandomForestClassifier(random_state=42)
model_selected.fit(X_train_selected, y_train)
y_pred_selected = model_selected.predict(X_test_selected)
accuracy_selected = accuracy_score(y_test, y_pred_selected)
accuracy_selected

0.9766081871345029

In [27]:
print("\nModel Performance Comparison:")
print(f"Accuracy (All Features): {accuracy_full:.4f}")
print(f"Accuracy (Selected Features): {accuracy_selected:.4f}")


Model Performance Comparison:
Accuracy (All Features): 0.9708
Accuracy (Selected Features): 0.9766
