In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


In [2]:
def initialize_bats(pop_size, dim):
    return np.random.rand(pop_size, dim)

In [3]:
def update_position(position, velocity):
    return position + velocity

In [18]:
def bat_algorithm(objective_function, pop_size=10, max_iterations=100, loudness=0.5, pulse_rate=0.5):
    dim = 10  # The objective function takes 'x' and 'y', so subtract 1 for dimensionality
    
    bats = initialize_bats(pop_size, dim)
    velocities = np.zeros((pop_size, dim))

    fitness = np.apply_along_axis(objective_function, 1, bats)
    best_index = np.argmin(fitness)
    best_solution = bats[best_index]

    for iteration in range(max_iterations):
        current_loudness = loudness * (1 - np.exp(-pulse_rate * iteration))

        for i in range(pop_size):
            frequency = 0.5
            velocities[i] = velocities[i] + (bats[i] - best_solution) * frequency
            bats[i] = update_position(bats[i], velocities[i])

            if np.random.rand() > current_loudness:
                bats[i] = best_solution + 0.001 * np.random.randn(dim)

        new_fitness = np.apply_along_axis(objective_function, 1, bats)
        new_best_index = np.argmin(new_fitness)
        
        if new_fitness[new_best_index] < fitness[best_index]:
            best_solution = bats[new_best_index]
            best_index = new_best_index

    return best_solution, fitness[best_index]

In [30]:
def objective_function(features):
    # Features is a binary vector indicating selected features
    selected_features = [X_train.columns[i] for i, selected in enumerate(features) if selected]
    X_train_selected = X_train[selected_features]
    X_test_selected = X_test[selected_features]
    
    classifier.fit(X_train_selected, y_train)
    y_pred = classifier.predict(X_test_selected)
    
    return 1 - accuracy_score(y_test, y_pred)  # We minimize the error rate, so 1 - accuracy is used

In [20]:
data = pd.read_csv('diabetes.csv')

In [21]:
X = data.drop('Outcome', axis=1)  # Replace 'target_column_name' with the actual target column name
y = data['Outcome'] 

In [22]:
data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [24]:
print("Size of train set:", X_train.shape[0])
print("Size of test set:", X_test.shape[0])

Size of train set: 537
Size of test set: 231


In [25]:
if X_train.empty or X_test.empty:
    raise ValueError("Empty train or test data after splitting")

In [26]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

In [27]:
if np.isnan(X_train).any() or np.isinf(X_train).any():
    raise ValueError("NaN or infinite values found in standardized train data")

In [28]:
classifier = RandomForestClassifier()

In [29]:
best_features, best_fitness = bat_algorithm(objective_function)

AttributeError: 'numpy.ndarray' object has no attribute 'columns'