## Importing the required libraries

In [4]:
import random
import warnings
from random import randint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
def split(df,label):
    """Split features and labels into train and test parts.

    Holds out 25% of the rows for testing and uses a fixed ``random_state``
    of 42, so repeated calls on the same inputs give the same partition.

    Parameters
    ----------
    df : features passed straight to ``train_test_split``.
    label : targets passed straight to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train, X_test, Y_train, Y_test)``.
    """
    # train_test_split returns a list; callers of this helper get a tuple.
    return tuple(train_test_split(df, label, test_size=0.25, random_state=42))

from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, cross_val_score

# Each display name is declared next to the estimator it labels, then split
# into the two parallel lists the rest of the notebook reads:
# classifiers[i] is the name of models[i].
_named_models = [
    ('LinearSVM',        svm.SVC(kernel='linear')),
    ('RadialSVM',        svm.SVC(kernel='rbf')),
    ('Logistic',         LogisticRegression(max_iter = 1000)),
    ('RandomForest',     RandomForestClassifier(n_estimators=200, random_state=0)),
    ('AdaBoost',         AdaBoostClassifier(random_state = 0)),
    ('DecisionTree',     DecisionTreeClassifier(random_state=0)),
    ('KNeighbors',       KNeighborsClassifier()),
    ('GradientBoosting', GradientBoostingClassifier(random_state=0)),
]

classifiers = [name for name, _ in _named_models]
models = [estimator for _, estimator in _named_models]


def acc_score(df,label):
    """Rank every estimator in ``models`` by hold-out accuracy.

    The data is partitioned once with ``split``; each estimator in the
    module-level ``models`` list is fitted on the training part (the
    estimator objects themselves are fitted, not copies) and scored on the
    test part with ``accuracy_score``.

    Parameters
    ----------
    df : feature table handed to ``split``.
    label : target values handed to ``split``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Classifier`` (names from ``classifiers``) and ``Accuracy``,
        sorted by accuracy in descending order with a fresh 0..n-1 index.
    """
    X_train, X_test, Y_train, Y_test = split(df, label)

    accuracies = []
    for estimator in models:
        estimator.fit(X_train, Y_train)
        accuracies.append(accuracy_score(Y_test, estimator.predict(X_test)))

    ranking = pd.DataFrame({"Classifier": classifiers, "Accuracy": accuracies})
    ranking = ranking.sort_values(by="Accuracy", ascending=False)
    return ranking.reset_index(drop=True)

def plot(score,x,y,c = "b"):
    """Draw accuracy per generation as a seaborn point plot.

    Parameters
    ----------
    score : sequence of accuracy values, one per generation, in order.
    x : lower limit of the y-axis.
    y : upper limit of the y-axis.
    c : colour passed to ``sns.pointplot`` (default ``"b"``).

    The generation labels on the x-axis are 1..len(score), so the function
    works for any number of generations rather than exactly five.
    """
    gen = list(range(1, len(score) + 1))
    plt.figure(figsize=(6,4))
    ax = sns.pointplot(x=gen, y=score,color = c )
    ax.set(xlabel="Generation", ylabel="Accuracy")
    # Note the parameter naming: x and y here are the y-axis bounds.
    ax.set(ylim=(x,y))



## Genetic algorithm functions

In [6]:
def initilization_of_population(size,n_feat):
    """Create the starting population of random feature masks.

    Every chromosome is a boolean array of length ``n_feat`` in which
    ``int(0.3 * n_feat)`` entries are False and the rest are True, placed at
    random positions by ``np.random.shuffle`` (NumPy's global random state).

    Parameters
    ----------
    size : number of chromosomes to create.
    n_feat : length of each chromosome.

    Returns
    -------
    list of numpy.ndarray
        ``size`` boolean arrays of shape ``(n_feat,)``.
    """
    n_disabled = int(0.3*n_feat)
    population = []
    for _ in range(size):
        # Start with the False entries at the front, then randomise positions.
        chromosome = np.concatenate((np.zeros(n_disabled, dtype=bool),
                                     np.ones(n_feat - n_disabled, dtype=bool)))
        np.random.shuffle(chromosome)
        population.append(chromosome)
    return population

def fitness_score(population, X_train=None, X_test=None, Y_train=None, Y_test=None):
    """Score each chromosome by the test accuracy of a random forest.

    For every boolean mask in ``population`` a
    ``RandomForestClassifier(n_estimators=200, random_state=0)`` is fitted on
    the masked columns of ``X_train`` and evaluated on the same columns of
    ``X_test`` with ``accuracy_score``.

    Parameters
    ----------
    population : sequence of boolean masks, one per chromosome, all of the
        same length.
    X_train, X_test : feature tables indexed with ``.iloc[:, mask]``.
    Y_train, Y_test : matching targets.
        Any of these four left as ``None`` is read from the module-level
        variable of the same name, which is how the function behaves when it
        is called with ``population`` only.

    Returns
    -------
    (list, list)
        Accuracies sorted from best to worst, and the chromosomes in the
        same order. Both lists are empty for an empty population.

    Raises
    ------
    NameError
        If a split argument is omitted and no module-level variable of that
        name exists.
    """
    # Nothing to evaluate; also avoids 2-D indexing into an empty 1-D array below.
    if len(population) == 0:
        return [], []

    # Resolve omitted split arguments from the notebook namespace.
    notebook_scope = globals()
    try:
        X_train, X_test, Y_train, Y_test = (
            notebook_scope[name] if value is None else value
            for name, value in (("X_train", X_train), ("X_test", X_test),
                                ("Y_train", Y_train), ("Y_test", Y_test))
        )
    except KeyError as missing:
        raise NameError(f"name {missing} is not defined") from None

    forest = RandomForestClassifier(n_estimators=200, random_state=0)
    scores = []
    for chromosome in population:
        forest.fit(X_train.iloc[:,chromosome],Y_train)
        predictions = forest.predict(X_test.iloc[:,chromosome])
        scores.append(accuracy_score(Y_test,predictions))

    scores, population = np.array(scores), np.array(population)
    # argsort is ascending; reversing both arrays puts the best chromosome first.
    inds = np.argsort(scores)
    return list(scores[inds][::-1]), list(population[inds,:][::-1])


def selection(pop_after_fit,n_parents):
    """Keep the first ``n_parents`` chromosomes of a ranked population.

    Parameters
    ----------
    pop_after_fit : chromosomes ordered best-first.
    n_parents : how many leading entries to keep.

    Returns
    -------
    list
        A new list holding the same chromosome objects (not copies).

    Raises
    ------
    IndexError
        If ``n_parents`` exceeds the number of chromosomes available.
    """
    # Explicit indexing (rather than slicing) so an oversized n_parents raises.
    return [pop_after_fit[rank] for rank in range(n_parents)]


def crossover(pop_after_sel):
    """Extend the selected parents with one child per consecutive pair.

    Parents are paired as (0, 1), (2, 3), ... Each child takes the first half
    of the first parent and the second half of the second parent, with the cut
    at ``len(first) // 2``.

    Parameters
    ----------
    pop_after_sel : sequence of chromosomes (arrays of equal length).

    Returns
    -------
    list
        A new list: all parents in their original order (same objects),
        followed by the children. The input sequence is left unmodified.
        With an odd number of parents the last one has no partner and simply
        carries over without producing a child.
    """
    # Work on a shallow copy so the caller's list does not grow.
    pop_nextgen = list(pop_after_sel)
    # Stopping at len - 1 skips an unpaired trailing parent instead of
    # indexing past the parents.
    for i in range(0, len(pop_after_sel) - 1, 2):
        first, second = pop_after_sel[i], pop_after_sel[i+1]
        cut = len(first)//2
        pop_nextgen.append(np.concatenate((first[:cut], second[cut:])))
    return pop_nextgen


def mutation(pop_after_cross,mutation_rate,n_feat):
    """Return mutated copies of every chromosome.

    For each chromosome, ``int(mutation_rate * n_feat)`` positions are drawn
    with ``randint(0, n_feat - 1)`` and the bit at each drawn position is
    inverted. Positions are drawn with replacement, so a position drawn twice
    ends up unchanged.

    Parameters
    ----------
    pop_after_cross : sequence of boolean arrays.
    mutation_rate : fraction of ``n_feat`` that sets the number of draws.
    n_feat : chromosome length; draws fall in ``[0, n_feat - 1]``.

    Returns
    -------
    list
        New arrays, one per input chromosome, in the same order. The input
        arrays are not modified.
    """
    mutation_range = int(mutation_rate*n_feat)
    pop_next_gen = []
    for chromosome in pop_after_cross:
        # Copy first: the incoming arrays may still be referenced elsewhere
        # (for example as the recorded best chromosome of a generation).
        mutated = chromosome.copy()
        for _ in range(mutation_range):
            pos = randint(0,n_feat-1)
            mutated[pos] = not mutated[pos]
        pop_next_gen.append(mutated)
    return pop_next_gen
# Example arguments: df=X, label=y, size=80, n_feat=number of columns in X, n_parents=64, mutation_rate=0.2, n_gen=5
def generations(df,label,size,n_feat,n_parents,mutation_rate,n_gen,X_train,
                                   X_test, Y_train, Y_test):
    """Run the genetic feature-selection loop for ``n_gen`` generations.

    Each generation: score the population with ``fitness_score``, print and
    record the best result, keep the top ``n_parents`` chromosomes, add
    children via ``crossover`` and pass everything through ``mutation`` to
    form the next population.

    Parameters
    ----------
    df, label : not used inside this function.
    size : number of chromosomes in the initial population.
    n_feat : chromosome length (number of candidate features).
    n_parents : number of top-ranked chromosomes kept each generation.
    mutation_rate : forwarded to ``mutation``.
    n_gen : number of generations to run.
    X_train, X_test, Y_train, Y_test : accepted but not forwarded;
        ``fitness_score`` is called with the population only, so it evaluates
        against the notebook-level variables of the same names. Keep the two
        in sync.

    Returns
    -------
    (list, list)
        ``best_chromo``: the top-scoring mask of each generation, and
        ``best_score``: its accuracy, both in generation order.
    """
    best_chromo= []
    best_score= []
    population_nextgen=initilization_of_population(size,n_feat)
    for i in range(n_gen):
        scores, pop_after_fit = fitness_score(population_nextgen)
        print('Best score in generation',i+1,':',scores[:1])
        # Record the winner before any later step runs, and as a copy: the
        # following steps may modify chromosome arrays in place, which would
        # leave a mask that no longer matches the recorded score.
        best_chromo.append(pop_after_fit[0].copy())
        best_score.append(scores[0])
        pop_after_sel = selection(pop_after_fit,n_parents)
        pop_after_cross = crossover(pop_after_sel)
        population_nextgen = mutation(pop_after_cross,mutation_rate,n_feat)
    return best_chromo,best_score

## Running on a dataset

In [8]:
# Load the dataset from onlyAlpha.csv (relative path, resolved against the
# current working directory) and display it.
data = pd.read_csv("onlyAlpha.csv")
data

Unnamed: 0,Fp1-LE_alpha,F3-LE_alpha,C3-LE_alpha,P3-LE_alpha,O1-LE_alpha,F7-LE_alpha,T3-LE_alpha,T5-LE_alpha,Fz-LE_alpha,Fp2-LE_alpha,...,C4-LE_alpha,P4-LE_alpha,O2-LE_alpha,F8-LE_alpha,T4-LE_alpha,T6-LE_alpha,Cz-LE_alpha,Pz-LE_alpha,A2-A1_alpha,class
0,11.19,7.75,13.34,11.89,6.69,8.49,6.46,7.86,8.31,11.32,...,4.65,6.09,8.18,10.04,2.77,4.52,8.54,9.02,7.85,0
1,14.20,9.54,13.86,16.57,11.09,12.98,6.73,11.64,11.53,14.67,...,9.20,13.80,14.05,16.58,4.62,8.93,11.43,12.49,9.74,0
2,10.63,9.36,6.75,10.46,8.80,6.55,4.70,7.26,11.13,8.55,...,5.91,7.11,7.81,6.04,2.65,4.40,10.43,9.40,8.79,0
3,31.42,20.69,49.08,52.89,17.34,16.68,14.99,25.63,23.69,28.18,...,16.37,27.07,23.36,18.37,5.11,13.56,26.13,36.55,7.69,0
4,11.02,9.65,33.14,34.99,15.42,8.30,10.13,17.58,10.45,11.47,...,10.73,19.07,25.06,8.55,2.64,11.44,16.71,26.95,6.90,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1136,28.04,29.33,26.85,18.75,16.63,21.65,24.31,16.02,32.90,28.56,...,29.19,27.08,20.64,31.56,42.09,27.00,32.34,24.29,43.66,1
1137,31.86,39.36,49.55,37.09,18.11,23.69,43.28,34.24,38.53,34.67,...,34.00,28.34,18.99,29.04,26.23,19.37,39.78,30.92,21.07,1
1138,30.12,21.56,17.70,17.86,13.41,13.71,13.71,14.87,20.09,25.51,...,18.09,15.37,14.16,14.65,10.93,11.75,19.92,16.35,4.84,1
1139,19.72,31.87,35.32,25.38,19.03,20.42,35.75,17.27,22.80,16.07,...,21.05,21.22,18.09,10.88,12.91,14.43,25.57,22.33,9.83,1


In [9]:
# Separate the target column ('class') from the feature columns.
X = data.drop(columns='class')
y = data['class']
X

Unnamed: 0,Fp1-LE_alpha,F3-LE_alpha,C3-LE_alpha,P3-LE_alpha,O1-LE_alpha,F7-LE_alpha,T3-LE_alpha,T5-LE_alpha,Fz-LE_alpha,Fp2-LE_alpha,F4-LE_alpha,C4-LE_alpha,P4-LE_alpha,O2-LE_alpha,F8-LE_alpha,T4-LE_alpha,T6-LE_alpha,Cz-LE_alpha,Pz-LE_alpha,A2-A1_alpha
0,11.19,7.75,13.34,11.89,6.69,8.49,6.46,7.86,8.31,11.32,6.99,4.65,6.09,8.18,10.04,2.77,4.52,8.54,9.02,7.85
1,14.20,9.54,13.86,16.57,11.09,12.98,6.73,11.64,11.53,14.67,9.19,9.20,13.80,14.05,16.58,4.62,8.93,11.43,12.49,9.74
2,10.63,9.36,6.75,10.46,8.80,6.55,4.70,7.26,11.13,8.55,8.85,5.91,7.11,7.81,6.04,2.65,4.40,10.43,9.40,8.79
3,31.42,20.69,49.08,52.89,17.34,16.68,14.99,25.63,23.69,28.18,21.88,16.37,27.07,23.36,18.37,5.11,13.56,26.13,36.55,7.69
4,11.02,9.65,33.14,34.99,15.42,8.30,10.13,17.58,10.45,11.47,9.06,10.73,19.07,25.06,8.55,2.64,11.44,16.71,26.95,6.90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1136,28.04,29.33,26.85,18.75,16.63,21.65,24.31,16.02,32.90,28.56,31.99,29.19,27.08,20.64,31.56,42.09,27.00,32.34,24.29,43.66
1137,31.86,39.36,49.55,37.09,18.11,23.69,43.28,34.24,38.53,34.67,36.63,34.00,28.34,18.99,29.04,26.23,19.37,39.78,30.92,21.07
1138,30.12,21.56,17.70,17.86,13.41,13.71,13.71,14.87,20.09,25.51,17.11,18.09,15.37,14.16,14.65,10.93,11.75,19.92,16.35,4.84
1139,19.72,31.87,35.32,25.38,19.03,20.42,35.75,17.27,22.80,16.07,17.89,21.05,21.22,18.09,10.88,12.91,14.43,25.57,22.33,9.83


In [10]:
# Baseline: hold-out accuracy of every estimator in `models` using all columns of X.
score1 = acc_score(X,y)
score1

Unnamed: 0,Classifier,Accuracy
0,KNeighbors,0.891608
1,RandomForest,0.888112
2,GradientBoosting,0.888112
3,AdaBoost,0.870629
4,Logistic,0.867133
5,LinearSVM,0.86014
6,DecisionTree,0.828671
7,RadialSVM,0.646853


In [11]:
# Seed both random sources the genetic algorithm draws from (np.random.shuffle
# for the initial population, random.randint for mutation) so that re-running
# this cell reproduces the same result.
GA_SEED = 42
random.seed(GA_SEED)
np.random.seed(GA_SEED)

# NOTE(review): `logmodel` is not read by the GA functions defined above
# (fitness_score builds its own forest); kept in case later cells use it.
logmodel = RandomForestClassifier(n_estimators=200, random_state=0)
# fitness_score reads these four names from the notebook namespace.
X_train,X_test, Y_train, Y_test = split(X,y)
chromo_df_bc,score_bc=generations(X,y,size=80,n_feat=X.shape[1],n_parents=64,mutation_rate=0.20,n_gen=5,
                         X_train = X_train,X_test = X_test,Y_train = Y_train,Y_test = Y_test)
# X.shape[1] is the number of columns (candidate features) in X.
# size = number of chromosomes in the initial population.
# n_parents = number of top-ranked chromosomes kept each generation; crossover
#             then adds one child per pair of parents.

Best score in generation 1 : [0.8951048951048951]
Best score in generation 2 : [0.8986013986013986]
Best score in generation 3 : [0.9055944055944056]
Best score in generation 4 : [0.8951048951048951]
Best score in generation 5 : [0.9020979020979021]


In [20]:
# Boolean feature masks returned by generations(), one entry per generation.
chromo_df_bc

[array([False,  True,  True, False,  True, False,  True,  True, False,
         True, False,  True, False, False,  True,  True,  True, False,
        False, False]),
 array([False,  True, False, False,  True, False, False, False,  True,
         True,  True,  True, False,  True,  True,  True,  True, False,
         True,  True]),
 array([False, False, False, False,  True,  True,  True, False,  True,
         True,  True,  True, False,  True,  True, False,  True, False,
        False,  True]),
 array([False,  True,  True,  True, False, False, False,  True,  True,
        False, False,  True,  True,  True,  True,  True,  True,  True,
        False, False]),
 array([ True,  True,  True,  True, False, False,  True, False, False,
        False,  True, False, False,  True, False,  True,  True,  True,
         True, False])]

In [22]:
# Best accuracy recorded for each generation, in generation order.
score_bc

[0.8951048951048951,
 0.8986013986013986,
 0.9055944055944056,
 0.8951048951048951,
 0.9020979020979021]