In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from scipy.stats import uniform
import matplotlib.pyplot as plt
import random

In [2]:
data = pd.read_csv('adult_cleaned_final.csv')

In [4]:
X = data.drop('income', axis=1)

In [6]:
y = data['income'].apply(lambda x: 1 if x == '>50K' else 0)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
numerical_features = ['age', 'fnlwgt', 'educational-num', 'capital-gain', 'capital-loss', 'hours-per-week']
categorical_features = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country']

In [10]:
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

In [11]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [12]:
initial_clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', SVC(kernel='rbf'))
])

In [13]:
initial_clf.fit(X_train, y_train)
y_pred_initial = initial_clf.predict(X_test)
print("Inicijalna preciznost:", accuracy_score(y_test, y_pred_initial))

Inicijalna preciznost: 0.8453721075672295


In [14]:
initial_best = [1.5227581792019662, 0.06450025916751723]

In [15]:
lb = [max(0.1, initial_best[0] - 0.5), max(0.001, initial_best[1] - 0.01)]
ub = [initial_best[0] + 0.5, initial_best[1] + 0.01]
random_seed = 54

In [16]:
def objective_function(params, iteration_seed):
    C, gamma = params
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', SVC(kernel='rbf', C=C, gamma=gamma, random_state=iteration_seed))
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    return accuracy_score(y_test, y_pred)

In [17]:
def ga(pop_size, generations, lb, ub, mutation_rate, initial_best, random_seed):
    np.random.seed(random_seed)
    random.seed(random_seed)
    dim = 2

    def create_individual():
        return np.random.uniform(lb, ub, dim)

    def mutate(individual):
        if np.random.rand() < mutation_rate:
            mutation_idx = np.random.randint(0, dim)
            individual[mutation_idx] = np.random.uniform(lb[mutation_idx], ub[mutation_idx])
        return individual

    def crossover(parent1, parent2):
        crossover_idx = np.random.randint(0, dim)
        child1 = np.concatenate([parent1[:crossover_idx], parent2[crossover_idx:]])
        child2 = np.concatenate([parent2[:crossover_idx], parent1[crossover_idx:]])
        return child1, child2

    population = np.array([create_individual() for _ in range(pop_size)])
    population[0] = initial_best
    print(population)
    scores = np.array([objective_function(ind, random.randint(0, 10000)) for ind in population])

    history = []

    for gen in range(generations):
        new_population = []
        for _ in range(pop_size // 2):
            parents = population[np.argsort(-scores)[:2]]
            child1, child2 = crossover(parents[0], parents[1])
            new_population.extend([mutate(child1), mutate(child2)])

        population = np.array(new_population)
        scores = np.array([objective_function(ind, random.randint(0, 10000)) for ind in population])

        print(f"GA Iteracija: {gen+1}")
        best_idx = np.argmax(scores)
        best_individual = population[best_idx]
        best_score = scores[best_idx]
        history.append(best_score)

    return best_individual, best_score, history

In [18]:
pop_size = 10
generations = 30
mutation_rate = 0.1

In [19]:
best_params_ga, best_score_ga, history_ga = ga(pop_size, generations, lb, ub, mutation_rate, initial_best, random_seed)

[[1.52275818 0.06450026]
 [1.20763487 0.06486591]
 [1.03136363 0.07387898]
 [1.82413944 0.0696465 ]
 [1.69424248 0.0550934 ]
 [1.41432021 0.06259614]
 [1.21238919 0.06984237]
 [1.63948132 0.07302574]
 [1.34788221 0.05956335]
 [1.7429761  0.07443941]]
GA Iteracija: 1
GA Iteracija: 2
GA Iteracija: 3
GA Iteracija: 4
GA Iteracija: 5
GA Iteracija: 6
GA Iteracija: 7
GA Iteracija: 8
GA Iteracija: 9
GA Iteracija: 10
GA Iteracija: 11
GA Iteracija: 12
GA Iteracija: 13
GA Iteracija: 14
GA Iteracija: 15
GA Iteracija: 16
GA Iteracija: 17
GA Iteracija: 18
GA Iteracija: 19
GA Iteracija: 20
GA Iteracija: 21
GA Iteracija: 22
GA Iteracija: 23
GA Iteracija: 24
GA Iteracija: 25
GA Iteracija: 26
GA Iteracija: 27
GA Iteracija: 28
GA Iteracija: 29
GA Iteracija: 30


In [20]:
print(f"GA najbolje parametre: C = {best_params_ga[0]}, gamma = {best_params_ga[1]}")
print(f"GA najbolji rezultat: {best_score_ga}")

GA najbolje parametre: C = 1.8241394352184273, gamma = 0.07443940842727098
GA najbolji rezultat: 0.8488117573483427
