In [12]:
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score
import pandas as pd

In [2]:
# Lookup table from a scoring name (as passed to bat_algorithm's `scoring`
# argument) to the scikit-learn univariate scoring function it refers to.
scoring_functions = dict(
    chi2=chi2,
    f_classif=f_classif,
    mutual_info_classif=mutual_info_classif,
)

In [3]:
# Load the dataset from a CSV located relative to the working directory.
# The frame must contain an 'income' column, which is used as the target below.
data = pd.read_csv('adult_cleaned_final.csv')

In [4]:
# Feature matrix: every column except the target, with categorical columns
# expanded into one-hot indicator columns.
X = pd.get_dummies(data.drop(columns='income')).values

# Binary target: 1 where income equals ' >50K', otherwise 0.
# NOTE(review): the label literal starts with a space — confirm it matches the
# values stored in the CSV, otherwise y will be all zeros.
y = data['income'].eq(' >50K').astype(int).values

In [5]:
# Standardise every column of X. The fitted scaler is kept in `scaler`, and X
# is replaced by its standardised version.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

In [6]:
# Hyperparameters read by bat_algorithm as module-level globals.

# Number of bats in the swarm and number of sweeps over the whole swarm.
n_bats, n_iterations = 20, 100

# Lower / upper bound of the interval each bat's frequency is drawn from.
min_freq, max_freq = 0, 2

# alpha: factor a bat's loudness is multiplied by when it accepts a move.
# gamma: rate constant in the exponential pulse-rate update.
alpha = gamma = 0.9

In [13]:
def bat_algorithm(X, y, n_features, model, scoring, random_state=None):
    """Search for a good feature subset with a binary bat algorithm.

    Each bat is a 0/1 mask over the columns of ``X``. A mask's fitness is the
    mean 5-fold cross-validated accuracy of ``model`` on the selected columns
    after rescaling and a ``SelectKBest`` filter capped at ``n_features``.

    The swarm hyperparameters ``n_bats``, ``n_iterations``, ``min_freq``,
    ``max_freq``, ``alpha`` and ``gamma`` are read from module-level globals,
    and the scoring function is looked up in ``scoring_functions``.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_total_features)
        Feature matrix.
    y : ndarray of shape (n_samples,)
        Class labels.
    n_features : int
        Upper bound on the number of columns kept by ``SelectKBest`` when a
        mask is evaluated.
    model : estimator
        Classifier passed to ``cross_val_score``.
    scoring : str
        Key into ``scoring_functions``. ``'chi2'`` switches the rescaling to
        ``MinMaxScaler`` because chi2 requires non-negative inputs.
    random_state : int or None, default None
        Seed for a private ``RandomState``. ``None`` uses NumPy's global
        generator, so existing ``np.random.seed`` calls still take effect.

    Returns
    -------
    ndarray of shape (n_total_features,)
        Integer 0/1 mask of the best feature subset found.

    Raises
    ------
    KeyError
        If ``scoring`` is not a key of ``scoring_functions``.
    """
    # np.random and RandomState expose the same randint()/rand() API.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    # Resolve the scoring function up front so a bad name fails immediately.
    score_func = scoring_functions[scoring]
    n_total_features = X.shape[1]

    positions = rng.randint(0, 2, size=(n_bats, n_total_features))
    velocities = np.zeros((n_bats, n_total_features))
    loudness = np.ones(n_bats)
    # Reference pulse rates; the current rate is always re-derived from these
    # so that it cannot collapse to zero through repeated multiplication.
    base_pulse_rate = np.ones(n_bats)
    pulse_rate = base_pulse_rate.copy()

    def fitness(solution):
        """Return the mean CV accuracy for a 0/1 mask (0.0 for an empty mask)."""
        selected_features = np.where(solution == 1)[0]
        if len(selected_features) == 0:
            return 0.0
        X_selected = X[:, selected_features]

        # chi2 requires non-negative features, hence the [0, 1] rescaling.
        scaler = MinMaxScaler() if scoring == 'chi2' else StandardScaler()
        X_selected = scaler.fit_transform(X_selected)

        # NOTE(review): scaler and selector are fitted on all rows before the
        # cross-validation split, so the CV score is slightly optimistic.
        selector = SelectKBest(score_func, k=min(len(selected_features), n_features))
        X_selected = selector.fit_transform(X_selected, y)
        scores = cross_val_score(model, X_selected, y, cv=5, scoring='accuracy')
        return np.mean(scores)

    # Score every bat once and cache the result; cross-validation is far too
    # expensive to repeat for an unchanged position on every iteration.
    fitness_values = np.array([fitness(position) for position in positions], dtype=float)
    best_index = int(np.argmax(fitness_values))
    # Copy so that later in-place updates of positions[...] cannot silently
    # change the recorded best solution.
    best_position = positions[best_index].copy()
    best_fitness = fitness_values[best_index]

    for t in range(n_iterations):
        for i in range(n_bats):
            frequency = min_freq + (max_freq - min_freq) * rng.rand()
            velocities[i] += (positions[i] - best_position) * frequency

            # V-shaped transfer function: the larger |velocity|, the more
            # likely the bit flips. Zero velocity leaves the bit untouched,
            # which keeps every candidate strictly binary.
            flip_probability = np.abs((2 / np.pi) * np.arctan((np.pi / 2) * velocities[i]))
            flips = rng.rand(n_total_features) < flip_probability
            new_position = np.where(flips, 1 - positions[i], positions[i])

            if rng.rand() > pulse_rate[i]:
                # Local search: a small binary random walk around the best
                # mask, with a step that shrinks with the swarm's loudness.
                local_flip_probability = alpha * np.mean(loudness) / n_total_features
                flips = rng.rand(n_total_features) < local_flip_probability
                # Always perturb at least one bit so the candidate is new.
                flips[rng.randint(n_total_features)] = True
                new_position = np.where(flips, 1 - best_position, best_position)

            new_fitness = fitness(new_position)

            if new_fitness > fitness_values[i] and rng.rand() < loudness[i]:
                positions[i] = new_position
                fitness_values[i] = new_fitness
                loudness[i] *= alpha
                # t + 1 keeps the rate above zero on the very first sweep.
                pulse_rate[i] = base_pulse_rate[i] * (1 - np.exp(-gamma * (t + 1)))

            if new_fitness > best_fitness:
                best_position = new_position.copy()
                best_fitness = new_fitness

    return best_position

In [8]:
# Classifiers to evaluate, keyed by the display name used in `results`.
models = {}
models['KNN'] = KNeighborsClassifier()
# max_iter is raised to 1000 for the logistic-regression solver.
models['Logistic Regression'] = LogisticRegression(max_iter=1000)

In [14]:
# Hold out the test rows BEFORE running feature selection. Selecting features
# on the full data set lets the search see the rows that are later used for
# scoring, which inflates the reported accuracy. The split is the same for
# every model (fixed random_state), so it is done once outside the loop.
X_train_full, X_test_full, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

results = {}
for model_name, model in models.items():
    # The feature-subset search only ever sees the training rows.
    best_features = bat_algorithm(
        X_train_full, y_train, n_features=10, model=model, scoring='chi2'
    )
    selected_features_indices = np.where(best_features == 1)[0]

    # Restrict both splits to the chosen columns.
    X_train = X_train_full[:, selected_features_indices]
    X_test = X_test_full[:, selected_features_indices]

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Accuracy on rows that took no part in feature selection or fitting.
    accuracy = accuracy_score(y_test, y_pred)
    results[model_name] = accuracy

In [None]:
# Show the test-split accuracy recorded for each model name.
print(results)