In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Load the training and test splits from their CSV files.
train_df = pd.read_csv('BCD_Train.csv')
test_df = pd.read_csv('BCD_Test.csv')

# 'Diagnosis' is the classification label; the remaining columns are the
# feature matrix.
y_train = train_df['Diagnosis']
X_train = train_df.drop(columns='Diagnosis')

y_test = test_df['Diagnosis']
X_test = test_df.drop(columns='Diagnosis')

def employedBee(X_train, y_train, X_test, y_test, selected_features, n_employed):
    """Employed-bee phase: explore the neighbourhood of ``selected_features``.

    Runs ``n_employed`` trials. Each trial draws ``n_employed`` positions (with
    replacement) among the currently selected features, re-draws each of them
    as kept (1) or dropped (0) with equal probability, and scores the resulting
    subset with a 3-nearest-neighbour classifier fitted on ``X_train`` and
    evaluated on ``X_test``.

    Args:
        X_train, X_test: DataFrames (indexed with ``.iloc``) whose columns line
            up with the entries of the mask.
        y_train, y_test: class labels for the two frames.
        selected_features: 1-D NumPy 0/1 mask, one entry per column.
        n_employed: number of trials and number of positions drawn per trial.

    Returns:
        ``(best_features, best_accuracy)``. When no candidate could be scored
        (no selected feature to start from, or ``n_employed`` is 0) a copy of
        ``selected_features`` and an accuracy of ``0`` are returned.
    """
    best_features = selected_features.copy()
    best_accuracy = 0

    # Positions are drawn only among features that are currently on, so this
    # phase can drop features but never add new ones.  The input mask is not
    # modified inside the loop, so the lookup is done once.
    active_positions = np.flatnonzero(selected_features == 1)
    if active_positions.size == 0:
        # np.random.choice raises ValueError on an empty population.
        return best_features, best_accuracy

    for _ in range(n_employed):
        drawn_positions = np.random.choice(active_positions, size=n_employed, replace=True)

        candidate_features = selected_features.copy()
        for position in drawn_positions:
            candidate_features[position] = 1 if np.random.rand() < 0.5 else 0

        column_mask = candidate_features == 1
        if not column_mask.any():
            # Every feature was dropped; a classifier cannot be fitted on
            # zero columns, so this candidate is skipped.
            continue

        # Score the candidate subset.
        knn_model = KNeighborsClassifier(n_neighbors=3)
        knn_model.fit(X_train.iloc[:, column_mask], y_train)
        y_pred = knn_model.predict(X_test.iloc[:, column_mask])
        accuracy = accuracy_score(y_test, y_pred)

        # Keep the best candidate seen so far.
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_features = candidate_features.copy()

    return best_features, best_accuracy


def onlookerBee(X_train, y_train, X_test, y_test, employed_features, n_onlookers):
    """Onlooker-bee phase: try random sub-selections of the employed solution.

    Runs ``n_onlookers`` trials. Each trial draws ``n_onlookers`` column
    indices (with replacement) among the features switched on in
    ``employed_features``, builds a mask containing only the drawn columns, and
    scores it with a 3-nearest-neighbour classifier fitted on ``X_train`` and
    evaluated on ``X_test``.

    Args:
        X_train, X_test: DataFrames (indexed with ``.iloc``) whose columns line
            up with the entries of the mask.
        y_train, y_test: class labels for the two frames.
        employed_features: 1-D NumPy 0/1 mask, one entry per column.
        n_onlookers: number of trials and number of indices drawn per trial.

    Returns:
        ``(best_features, best_accuracy)`` where ``best_features`` is an
        integer 0/1 mask. When no candidate could be scored (no selected
        feature, or ``n_onlookers`` is 0) a copy of ``employed_features`` and
        an accuracy of ``0`` are returned.
    """
    best_features = employed_features.copy()
    best_accuracy = 0

    # Sample column *indices* of the selected features.  Sampling the mask
    # itself would yield the values 0/1, which used as indices can only ever
    # switch on the first two columns.
    active_positions = np.flatnonzero(employed_features == 1)
    if active_positions.size == 0:
        return best_features, best_accuracy

    for _ in range(n_onlookers):
        drawn_positions = np.random.choice(active_positions, size=n_onlookers, replace=True)

        # Integer mask so callers can keep using its entries for indexing.
        candidate_features = np.zeros(X_train.shape[1], dtype=int)
        candidate_features[drawn_positions] = 1
        column_mask = candidate_features == 1

        # Score the candidate subset.
        knn_model = KNeighborsClassifier(n_neighbors=3)
        knn_model.fit(X_train.iloc[:, column_mask], y_train)
        y_pred = knn_model.predict(X_test.iloc[:, column_mask])
        accuracy = accuracy_score(y_test, y_pred)

        # Keep the best candidate seen so far.
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_features = candidate_features.copy()

    return best_features, best_accuracy

    
def _random_feature_mask(num_features):
    """Return a random integer 0/1 mask with at least one feature switched on."""
    mask = np.random.choice([0, 1], size=num_features)
    if num_features > 0 and not mask.any():
        # An all-zero mask selects no columns and cannot be scored.
        mask[np.random.randint(num_features)] = 1
    return mask


# main function
def feature_selection_ABC(X_train, y_train, X_test, y_test, n_iterations=100, n_employed=10, n_onlookers=10, limit=5):
    """Select a feature subset with a simplified Artificial Bee Colony search.

    Each iteration runs the employed phase around the current solution, the
    onlooker phase around the employed result, keeps the better of the two,
    and additionally scores a randomly thinned copy of that winner with a
    3-nearest-neighbour classifier. After ``limit`` consecutive iterations
    without a new best, a scout replaces the current solution with a fresh
    random mask.

    NOTE: every score is measured on ``(X_test, y_test)``, so the returned
    subset is tuned on that data; pass a validation split here if a true
    hold-out estimate is needed afterwards.

    Args:
        X_train, X_test: DataFrames (indexed with ``.iloc``) with identical
            column layout.
        y_train, y_test: class labels for the two frames.
        n_iterations: number of search iterations.
        n_employed: forwarded to ``employedBee``.
        n_onlookers: forwarded to ``onlookerBee``.
        limit: non-improving iterations tolerated before the scout restarts
            the search from a random mask.

    Returns:
        ``(best_features, best_accuracy)``: the best integer 0/1 mask found
        and its accuracy (``0`` if nothing was scored).
    """
    num_features = X_train.shape[1]

    # Random, non-empty starting solution.
    current_features = _random_feature_mask(num_features)

    best_features = current_features.copy()
    best_accuracy = 0
    stagnation = 0

    for _ in range(n_iterations):
        employed_features, employed_accuracy = employedBee(X_train, y_train, X_test, y_test, current_features, n_employed)
        onlooker_features, onlooker_accuracy = onlookerBee(X_train, y_train, X_test, y_test, employed_features, n_onlookers)

        if employed_accuracy > onlooker_accuracy:
            winner_features, winner_accuracy = employed_features, employed_accuracy
        else:
            winner_features, winner_accuracy = onlooker_features, onlooker_accuracy

        improved = False

        # The phase winner was already scored; do not throw that result away.
        if winner_accuracy > best_accuracy:
            best_accuracy = winner_accuracy
            best_features = np.array(winner_features, dtype=int)
            improved = True

        # Randomly keep or drop each selected feature of the winner.  Work on
        # an integer copy: the phases' arrays must not be mutated, and a float
        # mask cannot be relied on for indexing.
        mutated_features = np.array(winner_features, dtype=int)
        for position in np.flatnonzero(mutated_features == 1):
            mutated_features[position] = 1 if np.random.rand() < 0.5 else 0

        column_mask = mutated_features == 1
        if column_mask.any():
            # Score the thinned subset (skipped when nothing is left).
            knn_model = KNeighborsClassifier(n_neighbors=3)
            knn_model.fit(X_train.iloc[:, column_mask], y_train)
            y_pred = knn_model.predict(X_test.iloc[:, column_mask])
            accuracy = accuracy_score(y_test, y_pred)

            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_features = mutated_features.copy()
                improved = True

        if improved:
            stagnation = 0
            current_features = best_features.copy()
        else:
            stagnation += 1
            # scout bee: abandon a solution that stopped improving
            if stagnation >= limit:
                current_features = _random_feature_mask(num_features)
                stagnation = 0

    print("best features", best_features)
    return best_features, best_accuracy

# Run the search; the result is a (feature mask, accuracy) tuple.
selected_features_ABC = feature_selection_ABC(X_train, y_train, X_test, y_test)
chosen_columns = selected_features_ABC[0] == 1

# Refit a 3-nearest-neighbour classifier on the chosen columns only.
knn_model_final = KNeighborsClassifier(n_neighbors=3)
knn_model_final.fit(X_train.iloc[:, chosen_columns], y_train)

# Score the refitted model on the test split.
y_pred_final = knn_model_final.predict(X_test.iloc[:, chosen_columns])
accuracy_final = accuracy_score(y_test, y_pred_final)
print("Final accuracy:", accuracy_final)





best features [1 1 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 0 0 1 0 0 1 0 0 1 0]
Final accuracy: 0.9436619718309859
