In [1]:
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score
import pandas as pd

In [19]:
data = pd.read_csv('adult_cleaned_final.csv')

In [20]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,24,Private,369667,Some-college,10,Never-married,Other-service,Unmarried,White,Female,0,0,40,United-States,<=50K


In [3]:
X = pd.get_dummies(data.drop('income', axis=1)).values
y = data['income'].str.strip().apply(lambda x: 1 if x == '>50K' else 0).values

In [16]:
print(y)

[0 0 1 ... 1 0 1]


In [4]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [17]:
print(X)

[[-1.16088612  0.52721629 -1.32098034 ...  0.30262079 -0.04512937
  -0.02092762]
 [-0.07017698 -1.04506802 -0.49672077 ...  0.30262079 -0.04512937
  -0.02092762]
 [-0.90918401  1.79145508  0.73966858 ...  0.30262079 -0.04512937
  -0.02092762]
 ...
 [ 0.09762442 -0.30407848 -0.49672077 ...  0.30262079 -0.04512937
  -0.02092762]
 [ 1.60783707 -0.33235912 -0.49672077 ...  0.30262079 -0.04512937
  -0.02092762]
 [ 1.10443285  1.2287805  -0.49672077 ...  0.30262079 -0.04512937
  -0.02092762]]


In [5]:
model = KNeighborsClassifier()

In [6]:
scoring_functions = {
    'chi2': chi2,
    'f_classif': f_classif,
    'mutual_info_classif': mutual_info_classif
}

In [7]:
population_size = 20
n_generations = 50
mutation_rate = 0.01
crossover_rate = 0.8

In [8]:
def genetic_algorithm(X, y, n_features, model, scoring):
    n_samples, n_total_features = X.shape

    population = np.random.randint(0, 2, size=(population_size, n_total_features))

    def fitness(solution):
        selected_features = np.where(solution == 1)[0]
        if len(selected_features) == 0:
            return 0
        X_selected = X[:, selected_features]
        
        if scoring == 'chi2':
            scaler = MinMaxScaler()
        else:
            scaler = StandardScaler()
        
        X_selected = scaler.fit_transform(X_selected)
        selector = SelectKBest(scoring_functions[scoring], k=min(len(selected_features), n_features))
        X_selected = selector.fit_transform(X_selected, y)
        scores = cross_val_score(model, X_selected, y, cv=5, scoring='accuracy')
        return np.mean(scores)

    def selection(population, fitnesses):
        selected_indices = np.random.choice(np.arange(population_size), size=population_size, replace=True, p=fitnesses/fitnesses.sum())
        return population[selected_indices]

    def crossover(parent1, parent2):
        if np.random.rand() < crossover_rate:
            crossover_point = np.random.randint(1, n_total_features-1)
            child1 = np.concatenate([parent1[:crossover_point], parent2[crossover_point:]])
            child2 = np.concatenate([parent2[:crossover_point], parent1[crossover_point:]])
            return child1, child2
        else:
            return parent1, parent2

    def mutate(chromosome):
        for i in range(n_total_features):
            if np.random.rand() < mutation_rate:
                chromosome[i] = 1 - chromosome[i]
        return chromosome

    best_solution = population[0]
    best_fitness = fitness(best_solution)

    for generation in range(n_generations):
        fitnesses = np.array([fitness(individual) for individual in population])
        best_gen_index = np.argmax(fitnesses)
        if fitnesses[best_gen_index] > best_fitness:
            best_solution = population[best_gen_index]
            best_fitness = fitnesses[best_gen_index]

        selected_population = selection(population, fitnesses)
        new_population = []

        for i in range(0, population_size, 2):
            parent1 = selected_population[i]
            parent2 = selected_population[i+1]
            child1, child2 = crossover(parent1, parent2)
            new_population.append(mutate(child1))
            new_population.append(mutate(child2))

        population = np.array(new_population)
        print(f'Iteration {generation+1} finished')

    return best_solution

In [9]:
best_features = genetic_algorithm(X, y, n_features=10, model=model, scoring='chi2')

Iteration 1 finished
Iteration 2 finished
Iteration 3 finished
Iteration 4 finished
Iteration 5 finished
Iteration 6 finished
Iteration 7 finished
Iteration 8 finished
Iteration 9 finished
Iteration 10 finished
Iteration 11 finished
Iteration 12 finished
Iteration 13 finished
Iteration 14 finished
Iteration 15 finished
Iteration 16 finished
Iteration 17 finished
Iteration 18 finished
Iteration 19 finished
Iteration 20 finished
Iteration 21 finished
Iteration 22 finished
Iteration 23 finished
Iteration 24 finished
Iteration 25 finished
Iteration 26 finished
Iteration 27 finished
Iteration 28 finished
Iteration 29 finished
Iteration 30 finished
Iteration 31 finished
Iteration 32 finished
Iteration 33 finished
Iteration 34 finished
Iteration 35 finished
Iteration 36 finished
Iteration 37 finished
Iteration 38 finished
Iteration 39 finished
Iteration 40 finished
Iteration 41 finished
Iteration 42 finished
Iteration 43 finished
Iteration 44 finished
Iteration 45 finished
Iteration 46 finish

In [10]:
selected_features_indices = np.where(best_features == 1)[0]
X_selected = X[:, selected_features_indices]

In [18]:
print(X_selected)

[[-1.16088612  0.52721629 -0.14818923 ... -0.04407355 -0.03494283
  -0.02092762]
 [-0.07017698 -1.04506802 -0.14818923 ... -0.04407355 -0.03494283
  -0.02092762]
 [-0.90918401  1.79145508 -0.14818923 ... -0.04407355 -0.03494283
  -0.02092762]
 ...
 [ 0.09762442 -0.30407848 -0.14818923 ... -0.04407355 -0.03494283
  -0.02092762]
 [ 1.60783707 -0.33235912 -0.14818923 ... -0.04407355 -0.03494283
  -0.02092762]
 [ 1.10443285  1.2287805   2.07774495 ... -0.04407355 -0.03494283
  -0.02092762]]


In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

[WinError 2] The system cannot find the file specified
  File "c:\Users\DT User3\AppData\Local\Programs\Python\Python310\lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
  File "c:\Users\DT User3\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 501, in run
    with Popen(*popenargs, **kwargs) as process:
  File "c:\Users\DT User3\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 966, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\DT User3\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 1435, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,


In [12]:
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.8012820512820513
