In [1]:
import copy
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

from problem_types import Problem, Individual
from WHH import WHH
from BBO import BBO
from DE import DE
from FireFly import FireFly
from HS import HS
from PSO import PSO
from SA import SA
from IWO import IWO
from TLBO import TLBO

In [2]:
# reading data

# Feature tables: the first spreadsheet column becomes the index and the
# 'sample_id' column is discarded so only feature columns remain.
X_train = pd.read_excel("Data/Xtrain.xlsx", index_col=0).drop(columns=['sample_id'])
X_test = pd.read_excel("Data/Xtest.xlsx", index_col=0).drop(columns=['sample_id'])

# Label tables.
y_train = pd.read_excel("Data/ytrain.xlsx", index_col=0)
y_test = pd.read_excel("Data/ytest.xlsx", index_col=0)

# Remember the feature names now: scaling below replaces the DataFrames with
# plain NumPy arrays that no longer carry column labels.
columns = X_train.columns.to_numpy()

# Fit the scaler on the training features only and reuse it for the test set.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# A position entry strictly greater than this value marks its feature as selected.
THRESHOLD = 0.9

In [3]:
# cost function

def FS(x):
    """Feature-selection cost of a candidate position vector.

    Entries of ``x`` strictly greater than ``THRESHOLD`` mark the selected
    feature columns. An ``SVC`` with default settings is fitted on those
    columns of ``X_train`` and scored on the same columns of ``X_test``.

    Parameters
    ----------
    x : array-like supporting ``x > THRESHOLD`` and ``.size``
        Candidate position (presumably a 1-D NumPy array with one entry per
        feature -- confirm against the callers).

    Returns
    -------
    ``1`` when no entry exceeds the threshold, otherwise ``1 - (acc - g)``
    where ``acc`` is the classifier score and ``g`` is the fraction of
    selected entries. Lower is better.
    """
    mask = x > THRESHOLD
    n_selected = sum(1 for flag in mask if flag)

    # Nothing selected: no model can be trained, so return the fixed cost 1.
    if not n_selected:
        return 1

    classifier = SVC()
    classifier.fit(X_train[:, mask], y_train.values.ravel())
    accuracy = classifier.score(X_test[:, mask], y_test)

    # Penalise large subsets by subtracting the selected fraction from the score.
    selected_ratio = n_selected / mask.size
    return 1 - (accuracy - selected_ratio)

In [4]:
# initialize problem object

def problem_init(algs):
    """Build and return the ``Problem`` object shared by every algorithm.

    Parameters
    ----------
    algs
        Not used by this function; kept in the signature for existing callers.

    Returns
    -------
    Problem
        Instance with search-space bounds, budget, cost function and the
        bookkeeping fields filled in.
    """
    # One decision variable per column of the (already scaled) training matrix.
    n_features = X_train.shape[1]
    max_iterations = 10

    prob = Problem()

    # Search space.
    prob.nVar = n_features
    prob.VarSize = n_features
    prob.VarMin = 0
    prob.VarMax = 1

    # Budget.
    prob.MaxIt = max_iterations
    prob.nPop = 10

    # Objective.
    prob.CostFunction = FS

    # Bookkeeping for the best/worst solutions and per-iteration cost traces.
    prob.BestSol = Individual(0)
    prob.GlobalBest = Individual(n_features)
    prob.BestCostMaxiter = np.zeros(max_iterations)
    prob.BestCost = np.zeros(max_iterations)
    prob.WorstCost = 0

    return prob

In [16]:
# initialize algorithm's list

def algs_set():
    """Instantiate every optimisation algorithm used in the comparison.

    Each class is constructed with its position in the list as the only
    argument.

    Returns
    -------
    list
        Algorithm instances in the order SA, WHH, BBO, DE, FireFly, HS, PSO,
        TLBO, IWO.
    """
    algorithm_classes = (SA, WHH, BBO, DE, FireFly, HS, PSO, TLBO, IWO)
    return [cls(index) for index, cls in enumerate(algorithm_classes)]


def algs_set_parameters(problem, algs):
    """Call ``set_parameters(problem)`` on every algorithm, in list order.

    Parameters
    ----------
    problem
        Object handed unchanged to each algorithm's ``set_parameters``.
    algs
        Iterable of algorithm instances exposing ``set_parameters``.
    """
    for optimizer in algs:
        configure = optimizer.set_parameters
        configure(problem)

In [13]:
# initialize population

def pop_init(problem):
    """Create, evaluate and sort the initial population.

    Every individual gets a uniformly random position inside
    ``[problem.VarMin, problem.VarMax]``, a zero velocity, and its cost from
    ``problem.CostFunction``. Its personal best is initialised to that
    position and cost. ``problem.GlobalBest`` is updated in place whenever an
    individual beats the best cost recorded so far.

    Parameters
    ----------
    problem
        Problem description providing ``nVar``, ``nPop``, ``VarMin``,
        ``VarMax``, ``VarSize``, ``CostFunction`` and ``GlobalBest``.

    Returns
    -------
    numpy.ndarray
        The individuals sorted by ascending ``Cost`` (best first).
    """
    pop = np.array([Individual(problem.nVar) for _ in range(problem.nPop)])

    for individual in pop:
        individual.Position = np.random.uniform(
            problem.VarMin, problem.VarMax, problem.VarSize)
        individual.Velocity = np.zeros(problem.VarSize)
        individual.Cost = problem.CostFunction(individual.Position)

        # Store a copy: sharing the array with Position would let any later
        # in-place change of Position silently rewrite the personal best.
        individual.Best_Position = individual.Position.copy()
        individual.Best_Cost = individual.Cost

        # Compare against the running global best. The previous code compared
        # against problem.BestSol.Cost, which never changes inside this loop,
        # so a worse individual could overwrite a better one.
        # Assumes a fresh Individual has a Cost that compares as "worst"
        # (e.g. +inf), as the earlier BestSol comparison already did.
        if individual.Best_Cost < problem.GlobalBest.Cost:
            problem.GlobalBest.Position = individual.Best_Position.copy()
            problem.GlobalBest.Cost = individual.Best_Cost

    pop = np.array(sorted(pop, key=lambda a: a.Cost))
    return pop

In [18]:
# Seed NumPy's global RNG first: pop_init draws the initial positions from
# np.random, so everything below must run after this line, in this order.
np.random.seed(43)
algs = algs_set()
problem = problem_init(algs)
algs_set_parameters(problem, algs)
pop = pop_init(problem)
# pop_init returns the population sorted by ascending Cost, so the first
# element is the best individual and the last one is the worst.
problem.BestSol = copy.deepcopy(pop[0])
problem.WorstCost = pop[-1].Cost

In [19]:
# run

# Pristine snapshot of the initial population; every algorithm starts from a
# fresh deep copy of it so the runs do not influence each other.
main_pop = copy.deepcopy(pop)

# Iterate over the instances directly. The previous version printed
# `alg.__class__.__name__` inside `for i in range(len(algs))` although `alg`
# was never defined at notebook scope, which raises NameError on a fresh kernel.
for alg in algs:
    alg_name = alg.__class__.__name__
    print(alg_name, ":")

    alg_pop = copy.deepcopy(main_pop)
    problem.BestSol = copy.deepcopy(main_pop[0])
    # NOTE(review): only BestSol is reset here; other fields of `problem`
    # (GlobalBest, WorstCost, BestCost, ...) carry over between algorithms.
    # Confirm this is intended.
    for _ in range(problem.MaxIt):
        newpop = copy.deepcopy(alg_pop)
        alg.run(problem, alg_pop, newpop)

    # Decode the best position into a feature mask and summary numbers.
    selected = problem.BestSol.Position > THRESHOLD
    selected_count = int(np.count_nonzero(selected))
    g = selected_count / selected.size
    # Inverts the cost definition f = 1 - (acc - g) used by FS.
    acc = 1 + g - problem.BestSol.Cost

    print("Best Cost:", problem.BestSol.Cost)
    print("Accuracy:", acc)
    print("Selected Num:", selected_count)
    print("Selected:", columns[selected])
    print()

    # save result of whh
    if alg_name == "WHH":
        x = pd.read_excel("Data/Xtrain.xlsx")
        x.to_excel('selected-features.xlsx',
                   columns=columns[selected])

SA :
Best Cost: 0.06922487304781066
Accuracy: 0.9647887323943662
Selected Num: 5
Selected: ['Guanidineacetic acid_pos-087' 'NMN_pos-162'
 'N-Acetyl-D-glucosamine 6-phosphate_neg-061' 'Succinate_neg-079'
 'malate_neg-096']

WHH :
Best Cost: 0.0689853406151193
Accuracy: 0.971830985915493
Selected Num: 6
Selected: ['1-Methyladenosine_pos-001' 'Carnitine-C18_pos-051'
 'Guanidineacetic acid_pos-087' 'Hypoxanthine_pos-096' 'NMN_pos-162'
 'Succinate_neg-079']

BBO :
Best Cost: 0.18118233208776469
Accuracy: 0.8732394366197184
Selected Num: 8
Selected: ['5-Hydroxyindoleacetic acid_pos-021' 'Guanosine_pos-089'
 "S-Methyl-5'-thioadenosine_pos-143" 'Tryptophan_pos-154' 'Uracil_pos-156'
 'NMN_pos-162' 'Pyroglutamic acid_neg-072' 'Ribose 5-phosphate_neg-074']

DE :
Best Cost: 0.13821021366292996
Accuracy: 0.9366197183098592
Selected Num: 11
Selected: ['Asparagine_pos-040' 'Carnitine-C5_pos-054' 'Creatinine_pos-063'
 'N-Acetyl-aspartic acid_pos-111' 'Ornithine_pos-124'
 "S-Methyl-5'-thioadenosine_pos