In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, accuracy_score
from sklearn.feature_selection import VarianceThreshold
from scipy.stats import mode

In [31]:
data = pd.read_csv('adult_cleaned_final.csv')

In [32]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,24,Private,369667,Some-college,10,Never-married,Other-service,Unmarried,White,Female,0,0,40,United-States,<=50K


In [3]:
y = data['income']
data = data.drop('income', axis=1)

In [23]:
y.head()

0    <=50K
1    <=50K
2     >50K
3     >50K
4    <=50K
Name: income, dtype: object

In [4]:
data = pd.get_dummies(data, drop_first=True)

In [24]:
data.head()

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,25,226802,7,0,0,40,False,True,False,False,...,False,False,False,False,False,False,False,True,False,False
1,38,89814,9,0,0,50,False,True,False,False,...,False,False,False,False,False,False,False,True,False,False
2,28,336951,12,0,0,40,True,False,False,False,...,False,False,False,False,False,False,False,True,False,False
3,44,160323,10,7688,0,40,False,True,False,False,...,False,False,False,False,False,False,False,True,False,False
4,24,369667,10,0,0,40,False,True,False,False,...,False,False,False,False,False,False,False,True,False,False


In [5]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(data)

In [26]:
print(X_scaled)

[[-1.16088612  0.52721629 -1.32098034 ...  0.30262079 -0.04512937
  -0.02092762]
 [-0.07017698 -1.04506802 -0.49672077 ...  0.30262079 -0.04512937
  -0.02092762]
 [-0.90918401  1.79145508  0.73966858 ...  0.30262079 -0.04512937
  -0.02092762]
 ...
 [ 0.09762442 -0.30407848 -0.49672077 ...  0.30262079 -0.04512937
  -0.02092762]
 [ 1.60783707 -0.33235912 -0.49672077 ...  0.30262079 -0.04512937
  -0.02092762]
 [ 1.10443285  1.2287805  -0.49672077 ...  0.30262079 -0.04512937
  -0.02092762]]


In [6]:
selector = VarianceThreshold(threshold=0.1)
X_reduced = selector.fit_transform(X_scaled)

In [27]:
print(X_reduced)

[[-1.16088612  0.52721629 -1.32098034 ...  0.30262079 -0.04512937
  -0.02092762]
 [-0.07017698 -1.04506802 -0.49672077 ...  0.30262079 -0.04512937
  -0.02092762]
 [-0.90918401  1.79145508  0.73966858 ...  0.30262079 -0.04512937
  -0.02092762]
 ...
 [ 0.09762442 -0.30407848 -0.49672077 ...  0.30262079 -0.04512937
  -0.02092762]
 [ 1.60783707 -0.33235912 -0.49672077 ...  0.30262079 -0.04512937
  -0.02092762]
 [ 1.10443285  1.2287805  -0.49672077 ...  0.30262079 -0.04512937
  -0.02092762]]


In [7]:
le = LabelEncoder()
y_encoded = le.fit_transform(y)

In [28]:
print(y_encoded)

[0 0 1 ... 1 0 1]


In [8]:
def evaluate_kmeans(features):
    selected_features = np.where(features > 0.5, 1, 0)
    X_selected = X_reduced[:, selected_features == 1]
    
    if X_selected.shape[1] == 0:
        return 1.0
    
    kmeans = KMeans(n_clusters=2, random_state=42)
    kmeans.fit(X_selected)
    labels = kmeans.labels_
    
    score = silhouette_score(X_selected, labels)
    return -score

In [9]:
def bat_algorithm(dim, lb, ub, n=20, max_gen=50, alpha=0.9, gamma=0.9, Qmin=0, Qmax=2):
    bats = np.random.uniform(lb, ub, (n, dim))
    velocities = np.zeros((n, dim))
    frequencies = np.zeros(n)
    fitness = np.array([evaluate_kmeans(b) for b in bats])
    best_bat = bats[np.argmin(fitness)]
    best_fitness = np.min(fitness)
    
    for gen in range(max_gen):
        for i in range(n):
            frequencies[i] = Qmin + (Qmax - Qmin) * np.random.rand()
            velocities[i] += (bats[i] - best_bat) * frequencies[i]
            solution = bats[i] + velocities[i]
            solution = np.clip(solution, lb, ub)
            
            if np.random.rand() > alpha:
                solution = best_bat + 0.001 * np.random.randn(dim)
            
            new_fitness = evaluate_kmeans(solution)
            
            if new_fitness < fitness[i] and np.random.rand() < gamma:
                bats[i] = solution
                fitness[i] = new_fitness
                
                if new_fitness < best_fitness:
                    best_bat = solution
                    best_fitness = new_fitness
                    
    return best_bat, best_fitness

In [10]:
dim = X_reduced.shape[1]
lb = [0] * dim
ub = [1] * dim

In [11]:
best_pos_bat, best_fit_bat = bat_algorithm(dim, lb, ub)

  super()._check_params_vs_input(X, default_n_init=10)
[WinError 2] The system cannot find the file specified
  File "c:\Users\DT User3\AppData\Local\Programs\Python\Python310\lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
  File "c:\Users\DT User3\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 501, in run
    with Popen(*popenargs, **kwargs) as process:
  File "c:\Users\DT User3\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 966, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\DT User3\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 1435, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_in

In [12]:
selected_features_bat = np.where(best_pos_bat > 0.5, 1, 0)

In [13]:
print("Bat selected features:", selected_features_bat)

Bat selected features: [0 0 0 0 1 0 1 0 1 0 1 0 0 0 1 1 1 1 0 0 0 1 0 0 0 0 0 1 1 0 1 1 0 0 0 1 0
 1 1 0 1 1 1 1 0 0 0 1 1 1 0 0 1 0 1 0 0 1 0 0 1 0 0 1 0 0 0 1 1 0 1 0 0 0
 1 1 0 1 1 1 0 1 1 0 0 0 1 1 0 1 0 0 1 1]


In [14]:
print("Best silhouette score (Bat):", -best_fit_bat)

Best silhouette score (Bat): 0.5510138293813686


In [15]:
X_selected = X_reduced[:, selected_features_bat == 1]
kmeans = KMeans(n_clusters=2, random_state=42)
kmeans.fit(X_selected)
labels = kmeans.labels_

  super()._check_params_vs_input(X, default_n_init=10)


In [29]:
print(labels)

[1 1 1 ... 1 1 1]


In [16]:
def map_clusters_to_labels(labels, y):
    labels = labels.astype(int)
    mapped_labels = np.zeros_like(labels)
    for i in range(np.max(labels) + 1):
        mask = (labels == i)
        mapped_labels[mask] = pd.Series(y[mask]).mode()[0]
    return mapped_labels

In [17]:
mapped_labels = map_clusters_to_labels(labels, y_encoded)

In [30]:
print(mapped_labels)

[0 0 0 ... 0 0 0]


In [18]:
accuracy = accuracy_score(y_encoded, mapped_labels)
print("Accuracy (Bat):", accuracy)

Accuracy (Bat): 0.7379299562226391
