# The application of genetic algorithm for feature selection. 
Firstly, the logistic regression is applied for breast cancer data and the performance is noted. 
Secondly, genetic algorithm is applied  for selecting features of the breast cancer data which results in increase of performance. 

In [2]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot
%matplotlib inline

In [3]:
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [4]:
cancer=load_breast_cancer()
datafr = pd.DataFrame(cancer['data'],columns=cancer['feature_names'])
label=cancer["target"]

In [7]:
Xtrain, Xtest, ytrain, ytest = train_test_split(datafr, 
                                                    label, test_size=0.30, 
                                                    random_state=101)

In [15]:
#training a logistics regression model
logmodel = LogisticRegression()
logmodel.fit(Xtrain, ytrain)
predictions = logmodel.predict(Xtest)
print("The accuracy of the model is "+ str(accuracy_score(ytest, predictions)))

The accuracy of the model is 0.935672514619883




In [27]:
#defining various steps required for the genetic algorithm
def init_of_pop(size, numFeat):
    pop = []
    for i in range( size ):
        chromosome = np.ones(numFeat, dtype = np.bool)
        chromosome[ :int( 0.3*numFeat )] = False
        np.random.shuffle(chromosome)
        pop.append(chromosome)
    return pop

def fit_score( pop ):
    scores = []
    for chromosome in pop:
        logmodel.fit( Xtrain.iloc[:,chromosome], ytrain )
        preds = logmodel.predict( Xtest.iloc[:,chromosome] )
        scores.append( accuracy_score( ytest, preds ))
    scores, pop = np.array( scores ), np.array( pop ) 
    inds = np.argsort( scores )
    return list( scores[inds][::-1]), list( pop[inds,:][::-1] )

def selection( popAfterFit, numParents ):
    popNextgen = []
    for i in range(numParents):
        popNextgen.append(popAfterFit[i])
    return popNextgen

def crossover( popAfterSel ):
    popNextgen=popAfterSel
    for i in range(len(popAfterSel)):
        child=popAfterSel[i]
        child[3:7]=popAfterSel[(i+1)%len(popAfterSel)][3:7]
        popNextgen.append(child)
    return popNextgen

def mutation(popAfterCross, mutationRate):
    popNextgen = []
    for i in range( 0, len(popAfterCross ) ):
        chromosome = popAfterCross[i]
        for j in range(len(chromosome)):
            if random.random() < mutationRate:
                chromosome[j]= not chromosome[j]
        popNextgen.append(chromosome)
    #print(popNextgen)
    return popNextgen

def generations(size, numFeat, numParents, mutationRate, nGen, Xtrain,
                                   Xtest,  ytrain, ytest):
    bestChromo= []
    bestScore= []
    popNextgen=init_of_pop( size, numFeat)
    for i in range( nGen ):
        scores, popAfterFit = fit_score(popNextgen)
        print(scores[:2])
        popAfterSel = selection(popAfterFit, numParents)
        popAfterCross = crossover(popAfterSel)
        popNextgen = mutation(popAfterCross,mutationRate)
        bestChromo.append( popAfterFit[0] )
        bestScore.append( scores[0] )
    return bestChromo, bestScore

In [41]:
chromo,score=generations(size=200, numFeat=30, numParents=100, mutationRate=0.07,
                     nGen=38, Xtrain=Xtrain, Xtest=Xtest, ytrain=ytrain, ytest=ytest )









[0.9532163742690059, 0.9532163742690059]










[0.9590643274853801, 0.9590643274853801]












[0.9532163742690059, 0.9532163742690059]










[0.9532163742690059, 0.9532163742690059]










[0.9532163742690059, 0.9532163742690059]










[0.9590643274853801, 0.9590643274853801]










[0.9532163742690059, 0.9532163742690059]










[0.9590643274853801, 0.9590643274853801]








[0.9590643274853801, 0.9590643274853801]








[0.9532163742690059, 0.9532163742690059]










[0.9532163742690059, 0.9532163742690059]








[0.9590643274853801, 0.9590643274853801]










[0.9532163742690059, 0.9532163742690059]








[0.9649122807017544, 0.9649122807017544]








[0.9532163742690059, 0.9532163742690059]










[0.9590643274853801, 0.9590643274853801]








[0.9532163742690059, 0.9532163742690059]








[0.9532163742690059, 0.9532163742690059]








[0.9532163742690059, 0.9532163742690059]








[0.9532163742690059, 0.9532163742690059]








[0.9590643274853801, 0.9590643274853801]








[0.9590643274853801, 0.9590643274853801]










[0.9590643274853801, 0.9590643274853801]








[0.9590643274853801, 0.9590643274853801]










[0.9532163742690059, 0.9532163742690059]








[0.9590643274853801, 0.9590643274853801]








[0.9590643274853801, 0.9590643274853801]










[0.9532163742690059, 0.9532163742690059]








[0.9532163742690059, 0.9532163742690059]








[0.9590643274853801, 0.9590643274853801]








[0.9532163742690059, 0.9532163742690059]










[0.9590643274853801, 0.9590643274853801]








[0.9590643274853801, 0.9590643274853801]








[0.9590643274853801, 0.9590643274853801]








[0.9532163742690059, 0.9532163742690059]








[0.9532163742690059, 0.9532163742690059]








[0.9532163742690059, 0.9532163742690059]








[0.9532163742690059, 0.9532163742690059]


In [42]:
logmodel.fit(Xtrain.iloc[:,chromo[-1]], ytrain)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [43]:
predictions = logmodel.predict(Xtest.iloc[:,chromo[-1]])

In [44]:
print("Accuracy score after genetic algorithm is= "+ str(accuracy_score( ytest, predictions )))

Accuracy score after genetic algorithm is= 0.9532163742690059
