In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [4]:
data = pd.read_csv('adult_cleaned_final.csv')

In [5]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,24,Private,369667,Some-college,10,Never-married,Other-service,Unmarried,White,Female,0,0,40,United-States,<=50K


In [6]:
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

In [11]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,2,226802,1,7,4,6,3,2,1,0,0,40,38,0
1,38,2,89814,10,9,2,4,0,4,1,0,0,50,38,0
2,28,1,336951,6,12,2,10,0,4,1,0,0,40,38,1
3,44,2,160323,13,10,2,6,0,2,1,7688,0,40,38,1
4,24,2,369667,13,10,4,7,4,4,0,0,0,40,38,0


In [12]:
X = data.drop('income', axis=1).values
y = data['income'].values

In [13]:
print(X)

[[    25      2 226802 ...      0     40     38]
 [    38      2  89814 ...      0     50     38]
 [    28      1 336951 ...      0     40     38]
 ...
 [    40      2 154374 ...      0     40     38]
 [    58      2 151910 ...      0     40     38]
 [    52      3 287927 ...      0     40     38]]


In [14]:
print(y)

[0 0 1 ... 1 0 1]


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
def initialize_particles(n_particles, dimensions):
    particles = np.random.rand(n_particles, dimensions)
    velocities = np.random.rand(n_particles, dimensions) * 0.1
    p_best = particles.copy()
    return particles, velocities, p_best

In [7]:
def fitness_function(particle, X, y):
    condition = X.dot(particle[:-1]) + particle[-1] > 0
    predictions = np.where(condition, 1, 0)
    return accuracy_score(y, predictions)

In [8]:
def update_particles(particles, velocities, p_best, g_best, w, c1, c2):
    r1, r2 = np.random.rand(), np.random.rand()
    velocities = w * velocities + c1 * r1 * (p_best - particles) + c2 * r2 * (g_best - particles)
    particles += velocities
    return particles, velocities

In [9]:
def pso(X, y, n_particles=30, dimensions=15, max_iter=100, w=0.5, c1=1.5, c2=1.5):
    particles, velocities, p_best = initialize_particles(n_particles, dimensions)
    p_best_scores = np.array([fitness_function(p, X, y) for p in p_best])
    g_best = p_best[np.argmax(p_best_scores)]

    for i in range(max_iter):
        for j in range(n_particles):
            fitness = fitness_function(particles[j], X, y)
            if fitness > p_best_scores[j]:
                p_best[j] = particles[j]
                p_best_scores[j] = fitness
                if fitness > fitness_function(g_best, X, y):
                    g_best = particles[j]

        particles, velocities = update_particles(particles, velocities, p_best, g_best, w, c1, c2)
        print(f'Iteration {i+1}/{max_iter}, Best Fitness: {fitness_function(g_best, X, y)}')

    return g_best

In [10]:
best_particle = pso(X_train, y_train)

Iteration 1/100, Best Fitness: 0.2623905565978737
Iteration 2/100, Best Fitness: 0.2623905565978737
Iteration 3/100, Best Fitness: 0.2623905565978737
Iteration 4/100, Best Fitness: 0.2623905565978737
Iteration 5/100, Best Fitness: 0.2623905565978737
Iteration 6/100, Best Fitness: 0.2623905565978737
Iteration 7/100, Best Fitness: 0.2623905565978737
Iteration 8/100, Best Fitness: 0.2623905565978737
Iteration 9/100, Best Fitness: 0.2623905565978737
Iteration 10/100, Best Fitness: 0.2623905565978737
Iteration 11/100, Best Fitness: 0.2623905565978737
Iteration 12/100, Best Fitness: 0.2623905565978737
Iteration 13/100, Best Fitness: 0.2623905565978737
Iteration 14/100, Best Fitness: 0.739524702939337
Iteration 15/100, Best Fitness: 0.7461303939962477
Iteration 16/100, Best Fitness: 0.7739993746091307
Iteration 17/100, Best Fitness: 0.2623905565978737
Iteration 18/100, Best Fitness: 0.764618511569731
Iteration 19/100, Best Fitness: 0.7736866791744841
Iteration 20/100, Best Fitness: 0.76500938

In [11]:
print("Best Particle:", best_particle)

Best Particle: [ 0.74336325  1.08725951 -0.00931012  0.3327871  -0.00876104  0.63560729
  0.88583276  0.00846424 -0.04748852  0.37119862  0.40431623  1.01760678
  1.20868088 -0.16451377  0.78840997]


In [12]:
condition = X_test.dot(best_particle[:-1]) + best_particle[-1] > 0
y_pred = np.where(condition, 1, 0)
accuracy = accuracy_score(y_test, y_pred)

In [13]:
print("Test Accuracy:", accuracy)

Test Accuracy: 0.7732958098811757
