# Genetic Algorithms (GA)

In [None]:
#isntall necessary libraries
#pip install deap in the terminal


In [1]:
#import necessary libraries
import pandas as pd
import random
import numpy as np
from deap import base, creator, tools
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split



In [2]:
df = pd.read_csv(
    'C:/Lecture Notes/Degree/Sem 8/TCI - Computational Intelligence/new_project/cleaned_smart_bin_data.csv'
)

# Keep bin_id separately
bin_ids = df['bin_id']

In [3]:
# Convert 3-class label to binary
# High (2) -> 1 (important)
# Low + Medium (0,1) -> 0 (not important)
y = (df['priority_label'] == 2).astype(int)

X = df.drop(['priority_label'], axis=1)

X_train, X_test, y_train, y_test, bin_id_train, bin_id_test = train_test_split(
    X, y, bin_ids,
    test_size=0.3,
    random_state=42,
    stratify=y
)

print("Train:", X_train.shape)
print("Test:", X_test.shape)

Train: (81, 6)
Test: (36, 6)


In [4]:
def discretize_fill(x):
    if x < 60:
        return 0   # Low
    elif x < 85:
        return 1   # Medium
    else:
        return 2   # High

def discretize_battery(x):
    if x < 30:
        return 0
    elif x < 70:
        return 1
    else:
        return 2

def discretize_time(x):
    if x < 0.3:
        return 0   # Recent
    elif x < 0.7:
        return 1   # Moderate
    else:
        return 2   # Long

In [5]:
# GA REPRESENTATION

creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

# Rule structure:
# [fill, battery, overflow, time, status, priority, score]
def random_rule():
    return [
        random.choice([-1, 0, 1, 2]),  # fill
        random.choice([-1, 0, 1, 2]),  # battery
        random.choice([-1, 0, 1]),     # overflow
        random.choice([-1, 0, 1, 2]),  # time
        random.choice([-1, 0, 1, 2]),  # status
        random.randint(0, 1),          # priority (0/1)
        random.uniform(0, 1)           # score
    ]

def init_individual(n_rules=6):
    return creator.Individual([random_rule() for _ in range(n_rules)])

In [6]:
#  RULE MATCHING & INFERENCE

def match_rule(rule, row):
    conditions = [
        (rule[0], discretize_fill(row['fill_level_percent'])),
        (rule[1], discretize_battery(row['battery_level_percent'])),
        (rule[2], row['overflow_alert']),
        (rule[3], discretize_time(row['time_since_last_reading_norm'])),
        (rule[4], row['bin_status_encoded'])
    ]

    for rule_val, data_val in conditions:
        if rule_val != -1 and rule_val != data_val:
            return False

    return True


def ga_infer(row, rules):
    matched = [r for r in rules if match_rule(r, row)]

    if not matched:
        return 0, 0.0

    # Important (priority = 1) rules
    important_rules = [r for r in matched if r[5] == 1]

    if important_rules:
        best = max(important_rules, key=lambda r: r[6])
        return 1, best[6]
    else:
        # Not important but still give weak score
        best_low = max(matched, key=lambda r: r[6])
        return 0, best_low[6] * 0.33


In [7]:
# FITNESS FUNCTION (BINARY F1)
from sklearn.metrics import f1_score
def evaluate(individual, X, y):
    preds = []

    for _, row in X.iterrows():
        p, _ = ga_infer(row, individual)
        preds.append(p)

    f1 = f1_score(y, preds, average='binary', zero_division=0)

    # Penalize too many don't-cares
    dont_care_penalty = sum(rule.count(-1) for rule in individual) * 0.001

    return (f1 - dont_care_penalty,)

In [8]:
toolbox = base.Toolbox()

toolbox.register("individual", init_individual)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

toolbox.register("evaluate", evaluate, X=X_train, y=y_train)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutShuffleIndexes, indpb=0.1)
toolbox.register("select", tools.selTournament, tournsize=3)

In [9]:
# TRAIN GA
population = toolbox.population(n=50)
NGEN = 30

for gen in range(NGEN):
    offspring = toolbox.select(population, len(population))
    offspring = list(map(toolbox.clone, offspring))

    for c1, c2 in zip(offspring[::2], offspring[1::2]):
        if random.random() < 0.7:
            toolbox.mate(c1, c2)
            del c1.fitness.values, c2.fitness.values

    for mutant in offspring:
        if random.random() < 0.2:
            toolbox.mutate(mutant)
            del mutant.fitness.values

    invalid = [ind for ind in offspring if not ind.fitness.valid]
    for ind, fit in zip(invalid, map(toolbox.evaluate, invalid)):
        ind.fitness.values = fit

    population[:] = offspring

best_ind = tools.selBest(population, 1)[0]


In [10]:
# EVALUATION

y_pred = []
scores = []

for _, row in X_test.iterrows():
    p, s = ga_infer(row, best_ind)
    y_pred.append(p)
    scores.append(s)

print("GA Accuracy:", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


GA Accuracy: 0.8055555555555556
[[21  4]
 [ 3  8]]
              precision    recall  f1-score   support

           0       0.88      0.84      0.86        25
           1       0.67      0.73      0.70        11

    accuracy                           0.81        36
   macro avg       0.77      0.78      0.78        36
weighted avg       0.81      0.81      0.81        36



In [12]:
#print top 10 bins with the highest priority
results = X_test.copy()
results['bin_id'] = bin_id_test.values
results['priority'] = y_pred
results['score'] = scores

top_10 = results.sort_values(
    by='score',
    ascending=False
).head(10)

print(top_10[['bin_id', 'priority', 'score']])

      bin_id  priority     score
25   BIN-026         1  0.796330
2    BIN-003         1  0.796330
30   BIN-031         1  0.796330
94   BIN-147         1  0.796330
78   BIN-131         1  0.796330
81   BIN-134         1  0.796330
43   BIN-045         1  0.796330
84   BIN-137         1  0.258869
75   BIN-128         1  0.258869
113  BIN-216         1  0.133438
