### Feature Selection Using Particle Swarm Optimization (PSO) Algorithm

##### Ignore Convergence Warning

In [1]:
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.simplefilter( "ignore", category=ConvergenceWarning)

##### Import Statements

In [2]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss

##### Load Dataset

In [3]:
cancer_data = load_breast_cancer()

df = pd.DataFrame( cancer_data[ "data"], columns=cancer_data[ "feature_names"])
df[ "target"] = cancer_data[ "target"]

print( df.head())

   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0.1980              0.10430         0.1809   

   mean fractal dimension  ...  worst texture  worst perimeter  worst area  \
0             

##### Construct Train and Test Set

In [4]:
X = df[ cancer_data[ "feature_names"]].values
y = df[ "target"].values

X_train, X_test, y_train, y_test = train_test_split( X, y)

print( f"Number of Samples in Train Set: { len( y_train)}")
print( f"Number of Samples in Test Set: { len( y_test)}")

Number of Samples in Train Set: 426
Number of Samples in Test Set: 143


##### Create The Model, Train it with All Features and Evaluate it

In [5]:
model = LogisticRegression()

model.fit( X_train, y_train)

y_pred = model.predict( X_test)

test_accuracy = accuracy_score( y_test, y_pred)
test_loss = log_loss( y_test, y_pred)

print( f"Loss: { test_loss}")
print( f"Accuracy: { test_accuracy}")

Loss: 2.7725887222397807
Accuracy: 0.9230769230769231


##### Fitness Function

In [6]:
def fitness( X_train, y_train, X_test, y_test, p):
	features = np.nonzero( p)[ 0]

	selected_X_train = X_train[ :, features]
	selected_X_test = X_test[ :, features]

	model = LogisticRegression()

	model.fit( selected_X_train, y_train)
	
	y_pred = model.predict( selected_X_test)

	accuracy = accuracy_score( y_test, y_pred)
	loss = log_loss( y_test, y_pred)

	return loss, accuracy

##### PSO Function

In [7]:
def pso( X_train, y_train, X_test, y_test, w=0.7, c1=1, c2=1, n_iter=5):
	
	# dimensions
	_, d = X_train.shape

	# number of particles
	pn = d

	# maximum range of search space for each dimension
	domain = 1

	# position of particles. initial seed within [ 0, domain]
	p = domain * np.random.rand( pn, d)

	# make postions binary
	p[ p < 0.5] = 0
	p[ p >= 0.5] = 1
	
	# initial velocity
	v = np.zeros( ( pn, d))

	# best position of particle initiated with p ( initial position)
	pbest = np.copy( p)

	# value of fit function for each particle
	f = np.array( [ fitness( X_train, y_train, X_test, y_test, particle)[ 0] for particle in p])

	# best value of fit function for each particle
	# initiated with f ( initial value of fit function for each particle)
	fpbest = np.copy( f)

	# best value of fit function among all particles
	# and its corresponding position
	fgbest, gbest = f.min(), p[ f.argmin()]

	# variable for recording history of fgbest (best value of fit function)
	history = np.zeros( n_iter)

	# iterate n_iter time
	for i in range( n_iter):

		# recording best value of fit function
		history[ i] = fgbest

		# generate two matrices of random numbers in range [ -1, 1] and shape of ( pn, d) ( for each particle and for each dimension)
		r1 = np.random.rand( pn, d)
		r2 = np.random.rand( pn, d)

		# calculate velocity of particles in each dimenstion
		v = ( w * v) + ( r1 * c1 * ( pbest - p)) + ( r2 * c2 * ( gbest - p))

		# update position of particle
		p = p + v

		# keep values in range and make it binary
		p[ p < 0.5] = 0
		p[ p >= 0.5] = 1

		f = np.array( [ fitness( X_train, y_train, X_test, y_test, particle)[ 0] for particle in p])

		# find indices of particles with better value of fit function than their previous value
		indices = np.nonzero( f < fpbest)

		# update value of fit function for particles that found a better value
		fpbest[ indices] = f[ indices]

		# update position of particles that found a better value
		pbest[ indices] = p[ indices]
		
		# if f.min() < fgbest:
		# update best value and its corresponding position
		fgbest, gbest = f.min(), p[ f.argmin()]

	# acquire loss and accuracy from gbest
	loss, accuracy = fitness( X_train, y_train, X_test, y_test, gbest)

	return history, gbest, loss, accuracy


##### Driver

In [8]:
_, gbest, loss, accuracy = pso( X_train, y_train, X_test, y_test)

print( f"Loss: { loss}")
print( f"Accuracy: { accuracy}")
print( f"Number of Selected Features: { np.count_nonzero( gbest)} out of { len( gbest)}")

Loss: 1.2602676010180827
Accuracy: 0.965034965034965
Number of Selected Features: 16 out of 30
