In [219]:
import numpy as np
import pandas as pd
import operator as op
import random as rd
from copy import deepcopy
from anytree import Node, RenderTree, AsciiStyle, PreOrderIter
from anytree.cachedsearch import findall
from sklearn.metrics.cluster import v_measure_score
from pyclustering.cluster.kmeans import kmeans
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
from pyclustering.utils.metric import distance_metric, type_metric

In [2]:
# DataFrame and associated variables
pd.set_option('display.max_rows', None)
df = pd.read_csv('glass_train.csv')
# The last CSV column is treated as the class label of each row.
cluster_column = df.columns.values[-1]
# Number of distinct labels; used later as the number of k-means centers.
cluster_count = df[cluster_column].nunique()
# Feature-only frame (label column removed); this is what gets clustered.
df_unclass = df.drop([cluster_column], axis=1)
# Number of feature columns; determines how many 'a'/'b' terminals are generated.
df_attr = len(df_unclass.columns)

In [3]:
# Definindo operadores e terminais
def div0(a, b):
    """Protected division: return a / b, or 1 when the divisor is zero."""
    if b == 0:
        return 1
    return a / b
# Binary operators usable in internal tree nodes, keyed by the symbol stored in the node name.
ops = {'+': op.add, '-': op.sub, '*': op.mul, '/': div0, 'max': max, 'min': min}
nonterminals = list(ops)
# Terminals: '<i>a' reads feature i of the first row, '<i>b' reads feature i of
# the second row, and 'c' is a placeholder that random_terminal() replaces
# with a random constant.
terminals = [str(i) + side for side in ('a', 'b') for i in range(df_attr)]
terminals.append('c')
def random_constant(min_=-1000, max_=1000):
    """Draw a random float between min_ and max_ using random.uniform."""
    return rd.uniform(min_, max_)
def get_terminal_value(term, row1, row2):
    """Resolve a terminal symbol to a number for a pair of rows.

    Parameters:
        term: either a feature reference such as '3a' / '3b' (feature index
            followed by 'a' for row1 or 'b' for row2), or a numeric constant
            given as a float/int or as its string form.
        row1, row2: indexable sequences of feature values.

    Returns:
        The referenced feature value, or the constant converted to float.
    """
    # Constants are produced as floats by random_constant(); normalise to a
    # string first so that indexing the last character cannot fail on them.
    term = str(term)
    suffix = term[-1]
    # Constant
    if suffix != 'a' and suffix != 'b':
        return float(term)
    # Regular terminal: everything before the suffix is the feature index.
    index = int(term[:-1])
    return row1[index] if suffix == 'a' else row2[index]
def random_terminal():
    """Pick a random terminal; the 'c' placeholder is replaced by a fresh random constant."""
    picked = rd.choice(terminals)
    if picked == 'c':
        return random_constant()
    return picked
def random_nonterminal():
    """Pick a random operator symbol from `nonterminals`."""
    return rd.choice(nonterminals)

In [4]:
# Criação de árvores para população inicial
# Depth at which create_tree() stops expanding and forces a terminal.
max_h = 7

def create_tree(h = 0, full = False):
    """Build a random expression tree and return its root node.

    Parameters:
        h: depth of the node being created (0 for the root).
        full: when True, every node above depth `max_h` is an operator
            ("full" method); otherwise each node is a terminal or an operator
            with equal probability ("grow" method).

    Returns:
        An anytree Node whose name is an operator symbol (with two children)
        or a terminal (feature reference string or float constant).
    """
    # Depth limit reached: only a terminal is allowed here.
    if h >= max_h:
        return Node(random_terminal())

    # The coin is only tossed in grow mode; a non-zero toss selects an operator.
    if full or rd.randint(0,1) != 0:
        value = random_nonterminal()
    else:
        value = random_terminal()

    node = Node(value)
    if value in nonterminals:
        # Operators are binary: attach the left subtree first, then the right.
        for _ in range(2):
            subtree = create_tree(h = h+1, full = full)
            subtree.parent = node
    return node

# Imprime uma árvore
def print_tree(t):
    """Print tree `t` one node per line, prefixed by anytree's branch drawing."""
    for row in RenderTree(t):
        print("%s%s" % (row.pre, row.node.name))
        
# Função que calcula a distância entre dois pontos do dataframe com a árvore
def evaluate_tree(t, row1, row2):
    """Evaluate the expression rooted at `t` for a pair of rows.

    Operator nodes apply their function from `ops` to the values of their
    first and second child. Leaves ending in 'a' / 'b' read the indexed
    feature from row1 / row2; any other leaf is parsed as a float constant.
    """
    label = str(t.name)

    if label in nonterminals:
        lhs = evaluate_tree(t.children[0], row1, row2)
        rhs = evaluate_tree(t.children[1], row1, row2)
        return ops[label](lhs, rhs)

    suffix = label[-1]
    if suffix == 'a':
        return row1[int(label[:-1])]
    if suffix == 'b':
        return row2[int(label[:-1])]
    # Neither 'a' nor 'b': the leaf holds a numeric constant.
    return float(label)

# Tree currently used as the distance function; fitness() assigns it before clustering.
evaluated_tree = None
def evaluate(row1, row2):
    """Two-argument distance callback: evaluate the module-level `evaluated_tree` on the two rows."""
    return evaluate_tree(evaluated_tree, row1, row2)

In [5]:
# Initialise the centers a single time so that fitness results do not change between calls
# Drop a leftover 'pred' column (added by fitness()) so it is not used as a feature.
df_unclass = df_unclass.drop(['pred'], axis=1, errors='ignore')
init_centers = kmeans_plusplus_initializer(df_unclass, cluster_count).initialize()

# Calcula fitness usando kmeans e v measure em cima da função de um indivíduo
def fitness(t):
    """Score individual `t` by how well k-means with `t` as distance recovers the labels.

    Runs k-means on `df_unclass` from the shared `init_centers`, using the
    tree as a user-defined metric. Each resulting cluster is labelled with
    its most frequent true class, and the v-measure between the true labels
    and those predictions is the fitness.

    Side effects: sets the module-level `evaluated_tree`, rewrites the 'pred'
    column of the module-level `df_unclass`, and stores the score on
    `t.fitness`.

    Returns:
        The v-measure score.
    """
    global df_unclass
    global evaluated_tree
    # evaluate() reads this global, which is how the metric reaches the tree.
    evaluated_tree = t

    # Discard predictions left over from a previous call.
    df_unclass = df_unclass.drop(['pred'], axis=1, errors='ignore')

    # Cluster with the tree as distance; centers are the shared module-level
    # ones rather than re-drawn here.
    tree_metric = distance_metric(type_metric.USER_DEFINED, func = evaluate)
    model = kmeans(df_unclass, init_centers, metric = tree_metric, itermax = 20)
    model.process()

    # Every member of a cluster is predicted as the cluster's majority class.
    # NOTE(review): members are used positionally with df.iloc but as labels
    # with df_unclass.loc; this presumably relies on a default 0..n-1 index - confirm.
    for members in model.get_clusters():
        majority_class = df.iloc[members].groupby(cluster_column).size().idxmax()
        df_unclass.loc[members, 'pred'] = majority_class

    # Compare predictions with the true labels.
    score = v_measure_score(df[cluster_column], df_unclass['pred'])
    t.fitness = score
    return score

In [None]:
# Scratch check: build one random tree with the default (grow) method and display it.
t = create_tree()
print_tree(t)
# create_tree() never sets a parent on the node it returns at the top level.
print(t.parent)

In [414]:
# Genetic operators and associated variables
# NOTE(review): presumably the per-individual probabilities of applying
# crossover and mutation; they are not read anywhere in the visible cells - confirm.
crossover_prob = 0.9
mutation_prob = 0.05

# Seleciona um nó aleatório de uma árvore
def random_node(t):
    """Return one node of tree `t` chosen at random among all nodes in pre-order (root included)."""
    return rd.choice(list(PreOrderIter(t)))

def add_left_child(t, c):
    """Insert `c` as the first child of `t`, keeping the existing children after it."""
    siblings = list(t.children)
    siblings.insert(0, c)
    t.children = siblings
def add_right_child(t, c):
    """Append `c` as the last child of `t`, keeping the existing children before it."""
    siblings = list(t.children)
    siblings.append(c)
    t.children = siblings
def replace_left_child(t, c):
    """Put `c` in the first child position of `t`.

    With two or more children the current first child is dropped; with fewer,
    nothing is dropped and `c` is simply placed in front of what is there.
    """
    kept = t.children[1:] if len(t.children) >= 2 else t.children
    t.children = (c,) + kept
def replace_right_child(t, c):
    """Put `c` in the last child position of `t`.

    With two or more children the current last child is dropped; with fewer,
    nothing is dropped and `c` is simply appended after what is there.
    """
    if len(t.children) < 2:
        t.children += (c,)
        return
    t.children = t.children[:-1] + (c,)

def is_left_child(t, c):
    """Return True when `c` has the same name as the first child of `t`.

    Returns False (instead of raising IndexError) when `t` has no children.
    Matching is by name, not identity, so any node sharing the first child's
    name is also reported as the left child.
    """
    kids = t.children
    return len(kids) >= 1 and c.name == kids[0].name
def is_right_child(t, c):
    """Return True when `c` has the same name as the second child of `t`.

    Returns False (instead of raising IndexError) when `t` has fewer than two
    children. Matching is by name, not identity, so any node sharing the
    second child's name is also reported as the right child.
    """
    kids = t.children
    return len(kids) >= 2 and c.name == kids[1].name

# Crossover
def crossover(t1_original, t2_original):
    """Subtree crossover: swap one randomly chosen subtree between two trees.

    Both inputs are deep-copied first, so the originals are not modified.
    One node is picked at random in each copy and the two subtrees rooted
    there trade places. When a picked node is the root of its tree, the
    other tree's subtree becomes the whole offspring.

    Parameters:
        t1_original, t2_original: root nodes of the parent trees.

    Returns:
        (t1, t2): the roots of the two offspring trees.

    Raises:
        IndexError: if a picked node cannot be found among its parent's
            children (not expected for a consistent tree).
    """
    t1, t2 = deepcopy(t1_original), deepcopy(t2_original)
    node1, node2 = random_node(t1), random_node(t2)
    parent1, parent2 = node1.parent, node2.parent

    def _slot(parent, node):
        # Position of `node` among `parent`'s children, or None for a root.
        # Matching is by identity: sibling nodes frequently share a name
        # (e.g. two '+' operators), so matching by name could select the
        # wrong sibling and leave an operator with a single child.
        if parent is None:
            return None
        for position, child in enumerate(parent.children):
            if child is node:
                return position
        raise IndexError('Couldnt find ' + str(node) + ' in children of ' + str(parent))

    # Record positions and sibling lists before anything is re-parented,
    # because attaching a node elsewhere removes it from its old parent.
    slot1, slot2 = _slot(parent1, node1), _slot(parent2, node2)
    siblings1 = list(parent1.children) if parent1 is not None else None
    siblings2 = list(parent2.children) if parent2 is not None else None

    if parent1 is None:
        t1 = node2
    else:
        siblings1[slot1] = node2

    if parent2 is None:
        t2 = node1
    else:
        siblings2[slot2] = node1

    # Detach both subtrees, then re-attach each at the other's old position.
    node1.parent = None
    node2.parent = None
    if parent1 is not None:
        parent1.children = siblings1
    if parent2 is not None:
        parent2.children = siblings2

    return t1, t2

In [405]:
# Hand-built tree x0 with two leaves, used to exercise crossover.
x0 = Node('x0')
x1 = Node('x1')
x2 = Node('x2', parent = x0)
# x2 is already attached, so prepending x1 gives the child order (x1, x2).
add_left_child(x0, x1)
print_tree(x0)

# Second hand-built tree: y0 -> (y1 -> (y2, y3), y4).
y0 = Node('y0')
y1 = Node('y1')
y2 = Node('y2')
y3 = Node('y3')
y4 = Node('y4')
add_left_child(y0, y1)
add_left_child(y1, y2)
add_right_child(y1, y3)
add_right_child(y0, y4)
print_tree(y0)

x0
├── x1
└── x2
y0
├── y1
│   ├── y2
│   └── y3
└── y4


In [412]:
# Cross the two hand-built trees; crossover() works on deep copies, so x0 and y0 stay intact.
new_x, new_y = crossover(x0, y0)

Node1: Node('/x0')
Node2: Node('/y0')
Parent1: --
None
Parent2: --
None


In [413]:
# Show both offspring produced by the crossover above.
print_tree(new_x)
print_tree(new_y)

y0
├── y1
│   ├── y2
│   └── y3
└── y4
x0
├── x1
└── x2


In [417]:
# First row of the raw frame as a plain list; the last value is the label column.
df.iloc[0].tolist()

[1.5159, 12.82, 3.52, 1.9, 72.86, 0.69, 7.97, 0.0, 0.0, 2.0]