In [139]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from numpy.random import rand
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
import time

In [140]:
def error_rate(features, target, x, opts):
    # parameters
    k     = opts['k']
    cv    = opts['cv']

    kf = KFold(n_splits=cv, shuffle=True, random_state=2)

    total_error = 0

    for train_index, test_index in kf.split(features):
      X_train, X_test = features[train_index], features[test_index]
      y_train, y_test = target[train_index], target[test_index]

      # Number of instances
      num_train = np.size(X_train, 0)
      num_test  = np.size(X_test, 0)
      
      # Define selected features
      xtrain = X_train[:, x==1]
      ytrain = y_train.reshape(num_train)
      xtest  = X_test[:, x==1]
      ytest  = y_test.reshape(num_test)

      # Training
      knn = KNeighborsClassifier(n_neighbors=k)
      knn.fit(xtrain, ytrain)

      # Prediction
      ypred = knn.predict(xtest)
      acc   = np.sum(ytest == ypred) / num_test
      error = 1 - acc

      total_error = total_error + error
      
    return total_error/cv

In [141]:
def Fun(features, target, x, opts):
    # Parameters
    alpha    = 0.99
    beta     = 1 - alpha
    # Original feature size
    max_feat = len(x)
    # Number of selected features
    num_feat = np.sum(x == 1)
    # Solve if no feature selected
    if num_feat == 0:
        cost  = 1
    else:
        # Get error rate
        error = error_rate(features, target, x, opts)
        # Objective function
        cost  = alpha * error + beta * (num_feat / max_feat)
        
    return cost

In [142]:
def init_position(lb, ub, N, dim):
    X = np.zeros([N, dim], dtype='float')
    R = rand()
    u = 1 + R
    for d in range(dim):
        X[0,d] = lb[0,d] + (ub[0,d] - lb[0,d]) * rand()
    for i in range(1, N):
        for d in range(dim):
            if X[i-1,d] <= 0.5:
                X[i,d] = u * X[i-1,d]
            else:
                X[i,d] = u * (1 - X[i-1,d])
    
    return X

In [143]:
def binary_conversion(X, thres, N, dim):
    Xbin = np.zeros([N, dim], dtype='int')
    for i in range(N):
        # print("Max:Min", max(X[i,:]), min(X[i,:]))
        Min = min(X[i,:])
        Max = max(X[i,:])
        for d in range(dim):
            Xi = (X[i,d] - Min) / (Max - Min)
            xi = 1/(1 + np.exp(-Xi))
            if xi > rand():
            # if X[i,d] > 0.5:
                Xbin[i,d] = 0
            else:
                Xbin[i,d] = 1
    
    return Xbin

In [144]:
def boundary(x, lb, ub):
    if x < lb:
        x = lb
    if x > ub:
        x = ub
    
    return x

In [145]:
def gwo(features, target, opts):
    # Parameters
    ub    = 1
    lb    = 0
    thres = rand()    # NEED TO CHANGE

    N        = opts['N']
    max_iter = opts['T']
    if 'w' in opts:
        w    = opts['w']
    if 'c1' in opts:
        c1   = opts['c1']
    if 'c2' in opts:
        c2   = opts['c2'] 
    
    # Dimension
    dim = np.size(features, 1)
    if np.size(lb) == 1:
        ub = ub * np.ones([1, dim], dtype='float')
        lb = lb * np.ones([1, dim], dtype='float')
        
    # Initialize position
    X     = init_position(lb, ub, N, dim)

    # Binary conversion
    Xbin  = binary_conversion(X, thres, N, dim)

    # Fitness at first iteration
    fit    = np.zeros([N, 1], dtype='float')
    Xalpha = np.zeros([1, dim], dtype='float')
    Xbeta  = np.zeros([1, dim], dtype='float')
    Xdelta = np.zeros([1, dim], dtype='float')
    Falpha = float('inf')
    Fbeta  = float('inf')
    Fdelta = float('inf')

    for i in range(N):
      fit[i,0] = Fun(features, target, Xbin[i,:], opts)
      if fit[i,0] < Falpha:
        Xalpha[0,:] = X[i,:]
        Falpha      = fit[i,0]

      if fit[i,0] < Fbeta and fit[i,0] > Falpha:
        Xbeta[0,:]  = X[i,:]
        beta       = fit[i,0]
            
      if fit[i,0] < Fdelta and fit[i,0] > Fbeta and fit[i,0] > Falpha:
        Xdelta[0,:] = X[i,:]
        Fdelta      = fit[i,0]

    # Pre
    curve = np.zeros([1, max_iter], dtype='float')
    t     = 0

    curve[0,t] = Falpha.copy()
    print("Iteration:", t + 1)
    print("Best (GWO):", curve[0,t])
    t += 1
    
    while t < max_iter:
        # Coefficient decreases linearly from 2 to 0 
        a = 2 - t * (2 / max_iter)
        
        for i in range(N):
            for d in range(dim):
                # Parameter C (3.4)
                C1     = 2 * rand()
                C2     = 2 * rand()
                C3     = 2 * rand()
                # Compute Dalpha, Dbeta & Ddelta (3.5)
                Dalpha = abs(C1 * Xalpha[0,d] - X[i,d]) 
                Dbeta  = abs(C2 * Xbeta[0,d] - X[i,d])
                Ddelta = abs(C3 * Xdelta[0,d] - X[i,d])
                # Parameter A (3.3)
                A1     = 2 * a * rand() - a
                A2     = 2 * a * rand() - a
                A3     = 2 * a * rand() - a
                # Compute X1, X2 & X3 (3.6) 
                X1     = Xalpha[0,d] - A1 * Dalpha
                X2     = Xbeta[0,d] - A2 * Dbeta
                X3     = Xdelta[0,d] - A3 * Ddelta
                # Update wolf (3.7)
                X[i,d] = (X1 + X2 + X3) / 3                
                # Boundary
                X[i,d] = boundary(X[i,d], lb[0,d], ub[0,d])

        # Binary conversion
        Xbin  = binary_conversion(X, thres, N, dim)
        
        # Fitness
        for i in range(N):
            fit[i,0] = Fun(features, target, Xbin[i,:], opts)
            if fit[i,0] < Falpha:
                Xalpha[0,:] = X[i,:]
                Falpha      = fit[i,0]
                
            if fit[i,0] < Fbeta and fit[i,0] > Falpha:
                Xbeta[0,:]  = X[i,:]
                Fbeta       = fit[i,0]
                
            if fit[i,0] < Fdelta and fit[i,0] > Fbeta and fit[i,0] > Falpha:
                Xdelta[0,:] = X[i,:]
                Fdelta      = fit[i,0]
        
        curve[0,t] = Falpha.copy()
        print("Iteration:", t + 1)
        print("Best (GWO):", curve[0,t])
        t += 1
    
                
    # Best feature subset
    Gbin       = binary_conversion(Xalpha, thres, 1, dim) 
    Gbin       = Gbin.reshape(dim)
    pos        = np.asarray(range(0, dim))    
    sel_index  = pos[Gbin == 1]
    num_feat   = len(sel_index)
    # Create dictionary
    gwo_data = {'sf': sel_index, 'c': curve, 'nf': num_feat}
    
    return gwo_data 

In [146]:
column_headers = ['Type', 'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash', 'Magnesium', 'Total phenols',
                  'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins', 'Color Intensity',
                   'Hue', 'OD280/OD315 of diluted wines', 'Proline']

In [147]:
# #####################################################################
# #######         WINE
# #####################################################################

# column_headers = ['Type', 'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash', 'Magnesium', 'Total phenols',
#                   'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins', 'Color Intensity',
#                    'Hue', 'OD280/OD315 of diluted wines', 'Proline']
# data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data', sep=",", header=None)
# # data = data.values
# data.dropna(inplace=True)
# dataArray = data.to_numpy()

# print("Data: ", data)

# features  = np.asarray(dataArray[:, 1:])
# target = np.asarray(dataArray[:, 0])

In [148]:
# #####################################################################
# #######         BREAST CANCER
# #####################################################################

# data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data', header=None)
# data.columns = ['Sample code', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape',
#                 'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',
#                 'Normal Nucleoli', 'Mitoses','Class']

# data = data.drop(['Sample code'],axis=1)
# # data = data.values
# print(data.shape[0])
# data = data.replace("?",np.NaN)
# print(data.isnull().values.any())
# data.dropna(inplace=True)
# print(data.shape[0])
# data['Bare Nuclei'] = pd.to_numeric(data['Bare Nuclei'])
# dataArray = data.to_numpy()

# print("Data: ", data)

# features  = np.asarray(dataArray[:, 1:-1])
# target = np.asarray(dataArray[:, -1])

In [149]:
#####################################################################
#######         ZOO
#####################################################################

data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/zoo/zoo.data', header=None)
# data = data.values
data = data.loc[:, data.columns != 0]
data = data.astype(float)
print(data.shape[0])
print(data.isnull().values.any())
data.dropna(inplace=True)
print(data.shape[0])
dataArray = data.to_numpy()

print("Data: ", data)

features  = np.asarray(dataArray[:, 0:-1])
target = np.asarray(dataArray[:, -1])

101
False
101
Data:        1    2    3    4    5    6    7    8    9    10   11   12   13   14  \
0    1.0  0.0  0.0  1.0  0.0  0.0  1.0  1.0  1.0  1.0  0.0  0.0  4.0  0.0   
1    1.0  0.0  0.0  1.0  0.0  0.0  0.0  1.0  1.0  1.0  0.0  0.0  4.0  1.0   
2    0.0  0.0  1.0  0.0  0.0  1.0  1.0  1.0  1.0  0.0  0.0  1.0  0.0  1.0   
3    1.0  0.0  0.0  1.0  0.0  0.0  1.0  1.0  1.0  1.0  0.0  0.0  4.0  0.0   
4    1.0  0.0  0.0  1.0  0.0  0.0  1.0  1.0  1.0  1.0  0.0  0.0  4.0  1.0   
..   ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
96   1.0  0.0  0.0  1.0  0.0  0.0  0.0  1.0  1.0  1.0  0.0  0.0  2.0  1.0   
97   1.0  0.0  1.0  0.0  1.0  0.0  0.0  0.0  0.0  1.0  1.0  0.0  6.0  0.0   
98   1.0  0.0  0.0  1.0  0.0  0.0  1.0  1.0  1.0  1.0  0.0  0.0  4.0  1.0   
99   0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0   
100  0.0  1.0  1.0  0.0  1.0  0.0  0.0  0.0  1.0  1.0  0.0  0.0  2.0  1.0   

      15   16   17  
0    0.0  1.0  1.0  
1    0.0  1.

In [150]:
# #####################################################################
# #######         SONAR
# #####################################################################

# data = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data', header=None)
# # data = data.values
# mymap = {'R': 1, 'M': 2}
# data = data.applymap(lambda s: mymap.get(s) if s in mymap else s)

# print(data.shape[0])
# print(data.isnull().values.any())
# data.dropna(inplace=True)
# print(data.shape[0])
# dataArray = data.to_numpy()

# print("Data: ", data)

# features  = np.asarray(dataArray[:, 0:-1])
# target = np.asarray(dataArray[:, -1])

In [151]:
# #####################################################################
# #######         LYMPHOGRAPHY
# #####################################################################

# data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/lymphography/lymphography.data', header=None)
# # data = data.values

# print(data.shape[0])
# print(data.isnull().values.any())
# data.dropna(inplace=True)
# print(data.shape[0])
# dataArray = data.to_numpy()

# print("Data: ", data)

# features  = np.asarray(dataArray[:, 1:])
# target = np.asarray(dataArray[:, 0])

In [152]:
# #####################################################################
# #######         CONGRESS
# #####################################################################

# data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/voting-records/house-votes-84.data', header=None)
# # data = data.values
# mymap = {'n': 0, 'y': 1, 'democrat': 1, 'republican': 2}
# data = data.applymap(lambda s: mymap.get(s) if s in mymap else s)

# print(data.shape[0])
# data = data.replace("?",np.NaN)
# print(data.isnull().values.any())
# data.dropna(inplace=True)
# print(data.shape[0])
# dataArray = data.to_numpy()

# print("Data: ", data)

# features  = np.asarray(dataArray[:, 1:])
# target = np.asarray(dataArray[:, 0])

In [153]:
# #####################################################################
# #######         IONOSPHERE
# #####################################################################

# data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/ionosphere/ionosphere.data', header=None)
# # data = data.values
# mymap = {'g': 1, 'b': 2}
# data = data.applymap(lambda s: mymap.get(s) if s in mymap else s)

# print(data.shape[0])
# data = data.replace("?",np.NaN)
# print(data.isnull().values.any())
# data.dropna(inplace=True)
# print(data.shape[0])
# dataArray = data.to_numpy()

# print("Data: ", data)

# features  = np.asarray(dataArray[:, 0:-1])
# target = np.asarray(dataArray[:, -1])

In [154]:
# #####################################################################
# #######         CHESS
# #####################################################################

# data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/chess/king-rook-vs-king-pawn/kr-vs-kp.data', header=None)
# # data = data.values
# mymap = {'f': 0., 't': 1., 'n': 0., 'w': 1., 'b': 2., 'l': 0., 'g': 1., 'won': 1., 'nowin': 2.}
# data = data.applymap(lambda s: mymap.get(s) if s in mymap else s)

# print(data.shape[0])
# data = data.replace("?",np.NaN)
# print(data.isnull().values.any())
# data.dropna(inplace=True)
# print(data.shape[0])
# dataArray = data.to_numpy()

# print("Data: ", data)

# features  = np.asarray(dataArray[:, 0:-1])
# target = np.asarray(dataArray[:, -1])

In [155]:
# #####################################################################
# #######         TIC-TAC-TOE
# #####################################################################

# data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/tic-tac-toe/tic-tac-toe.data', header=None)
# # data = data.values
# mymap = {'x': 1., 'o': 2., 'b': 3., 'positive': 1., 'negative': 2.}
# data = data.applymap(lambda s: mymap.get(s) if s in mymap else s)

# print(data.shape[0])
# data = data.replace("?",np.NaN)
# print(data.isnull().values.any())
# data.dropna(inplace=True)
# print(data.shape[0])
# dataArray = data.to_numpy()

# print("Data: ", data)

# features  = np.asarray(dataArray[:, 0:-1])
# target = np.asarray(dataArray[:, -1])

In [156]:
# #####################################################################
# #######         WDBC
# #####################################################################

# data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data', header=None)
# # data = data.values
# mymap = {'M': 1., 'B': 2.}
# data = data.applymap(lambda s: mymap.get(s) if s in mymap else s)
# data = data.loc[:, data.columns != 0]

# print(data.shape[0])
# data = data.replace("?",np.NaN)
# print(data.isnull().values.any())
# data.dropna(inplace=True)
# print(data.shape[0])
# dataArray = data.to_numpy()

# print("Data: ", data)

# features  = np.asarray(dataArray[:, 1:])
# target = np.asarray(dataArray[:, 0])

In [157]:
print("Feature: ", features)

Feature:  [[1. 0. 0. ... 0. 0. 1.]
 [1. 0. 0. ... 1. 0. 1.]
 [0. 0. 1. ... 1. 0. 0.]
 ...
 [1. 0. 0. ... 1. 0. 1.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 1. 1. ... 1. 0. 0.]]


In [158]:
print("Target: ", target)

Target:  [1. 1. 4. 1. 1. 1. 1. 4. 4. 1. 1. 2. 4. 7. 7. 7. 2. 1. 4. 1. 2. 2. 1. 2.
 6. 5. 5. 1. 1. 1. 6. 1. 1. 2. 4. 1. 1. 2. 4. 6. 6. 2. 6. 2. 1. 1. 7. 1.
 1. 1. 1. 6. 5. 7. 1. 1. 2. 2. 2. 2. 4. 4. 3. 1. 1. 1. 1. 1. 1. 1. 1. 2.
 7. 4. 1. 1. 3. 7. 2. 2. 3. 7. 4. 2. 1. 7. 4. 2. 6. 5. 3. 3. 4. 1. 1. 2.
 1. 6. 1. 7. 2.]


In [159]:
num_runs = 20    # number of independent runs
k        = 5     # k-value in KNN
N        = 5     # number of particles
T        = 20    # maximum number of iterations
cv       = 10    # K-fold cross-validation
opts     = {'k':k, 'N':N, 'T':T, 'cv': cv}
dim      = features.shape[1]

In [160]:
acc_arr          = []
feature_size_arr = []
time_arr         = []
fitness          = []
run_count        = 0

while run_count < num_runs:
  start_time = time.time()
  run_count += 1
  print("Run ", run_count)
  print("-----------------------------------")
  fmdl = gwo(features, target, opts)
  sf   = fmdl['sf']
  
  if sf.size == 0:
    sf = np.arange(dim)
    
  print("SF", sf)

  kf = KFold(n_splits=cv, shuffle=True, random_state=2)

  total_Acc = 0

  for train_index, test_index in kf.split(features):
    X_train, X_test = features[train_index], features[test_index]
    y_train, y_test = target[train_index], target[test_index]

    # Number of instances
    num_train = np.size(X_train, 0)
    num_test  = np.size(X_test, 0)
    
    # Define selected features
    xtrain = X_train[:, sf]
    ytrain = y_train.reshape(num_train)
    xtest  = X_test[:, sf]
    ytest  = y_test.reshape(num_test)

    # Training
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(xtrain, ytrain)

    # Prediction
    ypred = knn.predict(xtest)
    acc   = np.sum(ytest == ypred) / num_test

    total_Acc = total_Acc + acc

  time_arr.append(time.time() - start_time)

  Accuracy = 100 * (total_Acc/cv)
  print("Accuracy:", Accuracy)
  acc_arr.append(Accuracy)

  num_feat = fmdl['nf']
  print("Feature Size:", num_feat)
  feature_size_arr.append(num_feat)

  fitt = fmdl['c'][0].min()
  print("Fitness", fitt)
  fitness.append(fitt)
  print("--------------------------------------------------")

Run  1
-----------------------------------
Iteration: 1
Best (GWO): 0.12317500000000002
Iteration: 2
Best (GWO): 0.12317500000000002
Iteration: 3
Best (GWO): 0.12317500000000002
Iteration: 4
Best (GWO): 0.12317500000000002
Iteration: 5
Best (GWO): 0.12317500000000002
Iteration: 6
Best (GWO): 0.12317500000000002
Iteration: 7
Best (GWO): 0.12317500000000002
Iteration: 8
Best (GWO): 0.12317500000000002
Iteration: 9
Best (GWO): 0.12317500000000002
Iteration: 10
Best (GWO): 0.12317500000000002
Iteration: 11
Best (GWO): 0.12317499999999997
Iteration: 12
Best (GWO): 0.093475
Iteration: 13
Best (GWO): 0.093475
Iteration: 14
Best (GWO): 0.093475
Iteration: 15
Best (GWO): 0.093475
Iteration: 16
Best (GWO): 0.093475
Iteration: 17
Best (GWO): 0.093475
Iteration: 18
Best (GWO): 0.0923
Iteration: 19
Best (GWO): 0.0923
Iteration: 20
Best (GWO): 0.0923
SF [ 4  7  8 12]
Accuracy: 82.0909090909091
Feature Size: 4
Fitness 0.0923
--------------------------------------------------
Run  2
------------------

In [161]:
print("Average Accuracy: ", np.mean(acc_arr))
print("Average fitness: ", np.mean(fitness))
print("Average number of features selected: ", np.mean(feature_size_arr))
print("Average time taken: ", np.mean(time_arr), "seconds")

Average Accuracy:  77.19090909090907
Average fitness:  0.07769499999999997
Average number of features selected:  5.9
Average time taken:  1.5966955423355103 seconds
