In [21]:
import numpy as np
import pandas as pd
import operator as op
import random as rd
from anytree import Node, RenderTree, AsciiStyle
from anytree.cachedsearch import findall
from sklearn.metrics.cluster import v_measure_score
from pyclustering.cluster.kmeans import kmeans
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
from pyclustering.utils.metric import distance_metric, type_metric
import fastcache

In [130]:
# Scratch calculation of a ratio between two magnitudes.
# NOTE(review): looks like a refractive-index value over a silicon value — confirm, or delete this cell.
1.516/72.65

0.020867171369580176

In [181]:
# DataFrame and derived variables
pd.set_option('display.max_rows', None)
df = pd.read_csv('glass_train.csv')
# The CSV carries a saved row index that pandas loads as an "Unnamed: N" column
# (it shows up in the df_unclass display further down). A row number is not a
# glass attribute, so drop it before it reaches the clustering features and the
# terminal set. The default RangeIndex is kept, so positional and label-based
# row access keep agreeing.
df = df[[col for col in df.columns if not str(col).startswith('Unnamed')]]
# The last column holds the true class of each sample.
cluster_column = df.columns.values[-1]
# Number of distinct classes, used as the number of clusters.
cluster_count = df[cluster_column].nunique()
# Feature-only view (class column removed).
df_unclass = df.drop([cluster_column], axis=1)
# Number of feature columns; determines how many attribute terminals exist.
df_attr = len(df_unclass.columns)

In [45]:
# Operators and terminals
def div0(a, b):
    """Protected division: return ``a / b``, or 1 when ``b`` equals zero."""
    if b == 0:
        return 1
    return a / b
# Binary operators usable at internal nodes, keyed by the symbol stored in the tree.
ops = {
    '+': op.add,
    '-': op.sub,
    '*': op.mul,
    '/': div0,
    'max': max,
    'min': min,
}
# Operator symbols, in the dict's insertion order.
nonterminals = [*ops]
# Attribute terminals: "<index>a" reads from the first row, "<index>b" from the
# second; the trailing 'c' is a placeholder meaning "draw a random constant".
terminals = [
    str(i) + suffix
    for suffix in ('a', 'b')
    for i in range(df_attr)
] + ['c']
def random_constant(min_=-1000, max_=1000):
    """Return a float drawn with ``rd.uniform`` between ``min_`` and ``max_``."""
    drawn = rd.uniform(min_, max_)
    return drawn
def get_terminal_value(term, row1, row2):
    """Resolve a terminal symbol to a number for a pair of rows.

    Args:
        term: Either an attribute terminal such as ``'3a'`` or ``'3b'`` (an
            attribute index followed by ``a`` for ``row1`` or ``b`` for
            ``row2``), or a numeric constant given as a float or as its
            string form.
        row1: Indexable sequence read by ``a`` terminals.
        row2: Indexable sequence read by ``b`` terminals.

    Returns:
        The selected attribute value, or the constant converted to ``float``.
    """
    # random_terminal() hands constants back as floats, which cannot be
    # subscripted; convert to text first and only then look at the suffix.
    text = str(term)
    suffix = text[-1]
    # Constant
    if suffix != 'a' and suffix != 'b':
        return float(term)
    # Attribute terminal
    index = int(text[:-1])
    return row1[index] if suffix == 'a' else row2[index]
def random_terminal():
    """Pick a random terminal; the placeholder ``'c'`` is replaced by a fresh random constant."""
    picked = rd.choice(terminals)
    if picked == 'c':
        return random_constant()
    return picked
def random_nonterminal():
    """Pick one operator symbol uniformly at random from ``nonterminals``."""
    return rd.choice(nonterminals)

In [192]:
# Tree construction for the initial population
max_h = 7

def create_tree(h=0, full=False):
    """Build a random expression tree and return its root node.

    Args:
        h: Depth of the node being created (0 for the root).
        full: When true, use the "full" method (operators at every level until
            ``max_h`` is reached); otherwise use the "grow" method, where each
            node is a terminal or an operator with equal probability.

    Returns:
        An anytree ``Node`` whose name is an operator symbol (with exactly two
        children) or a terminal (no children).
    """
    # At the depth limit only terminals are allowed.
    if h >= max_h:
        return Node(random_terminal())

    if full:
        label = random_nonterminal()
    elif rd.randint(0, 1) == 0:
        label = random_terminal()
    else:
        label = random_nonterminal()

    root = Node(label)
    if label in nonterminals:
        # Every operator is binary: attach the left subtree, then the right one.
        for _ in range(2):
            create_tree(h=h + 1, full=full).parent = root
    return root
            
def print_tree(t):
    """Print the tree rooted at ``t``, one node per line, prefixed with the ``RenderTree`` branch drawing."""
    for prefix, _fill, item in RenderTree(t):
        print(prefix + str(item.name))
        
def evaluate_tree(t, row1, row2):
    """Recursively evaluate the expression tree rooted at ``t`` for a pair of rows.

    Args:
        t: Node whose name is an operator symbol, an attribute terminal
            (``'<index>a'`` / ``'<index>b'``) or a numeric constant.
        row1: Indexable sequence read by ``a`` terminals.
        row2: Indexable sequence read by ``b`` terminals.

    Returns:
        The numeric value of the expression.
    """
    symbol = str(t.name)

    if symbol not in nonterminals:
        suffix = symbol[-1]
        if suffix == 'a' or suffix == 'b':
            position = int(symbol[:-1])
            source = row1 if suffix == 'a' else row2
            return source[position]
        # Anything else is a numeric constant stored in the node name.
        return float(symbol)

    # Operator node: evaluate the left operand, then the right, then combine.
    left_value = evaluate_tree(t.children[0], row1, row2)
    right_value = evaluate_tree(t.children[1], row1, row2)
    return ops[symbol](left_value, right_value)

# Tree used by evaluate(); fitness() assigns it before clustering. Presumably
# kept global because the distance callback is only called with the two rows.
evaluated_tree = None

def evaluate(row1, row2):
    """Evaluate the current global ``evaluated_tree`` on ``row1`` and ``row2``."""
    return evaluate_tree(evaluated_tree, row1, row2)

In [228]:
# Remove any 'pred' column left by an earlier fitness() call so that it is not
# treated as a feature when the initial centers are drawn.
df_unclass = df_unclass.drop(columns=['pred'], errors='ignore')
# k-means++ seeding, computed once here and reused by fitness().
center_initializer = kmeans_plusplus_initializer(df_unclass, cluster_count)
init_centers = center_initializer.initialize()

def fitness(t):
    """Score tree ``t`` by using it as the distance function of a k-means run.

    The rows of ``df_unclass`` are clustered with ``evaluate`` as a
    user-defined metric, starting from the module-level ``init_centers``.
    Every resulting cluster is labelled with the most frequent true class of
    its members, and those labels are compared with the true classes.

    Side effects:
        Sets the global ``evaluated_tree`` to ``t`` and replaces the ``pred``
        column of the global ``df_unclass`` with the new predictions.

    Args:
        t: Root node of the expression tree to score.

    Returns:
        The v-measure between ``df[cluster_column]`` and the predicted labels.
    """
    global df_unclass
    global evaluated_tree
    evaluated_tree = t
    # First, discard predictions from a previous call so 'pred' is not
    # clustered as if it were a feature.
    df_unclass = df_unclass.drop(['pred'], axis=1, errors='ignore')
    # Then run k-means with the tree as the distance metric. The same
    # init_centers are reused on every call.
    mt = distance_metric(type_metric.USER_DEFINED, func = evaluate)
    kmeans_inst = kmeans(df_unclass, init_centers, metric = mt, itermax = 20)
    kmeans_inst.process()
    kmeans_clusters = kmeans_inst.get_clusters()
    # Label every cluster with the majority true class of its members.
    for members in kmeans_clusters:
        majority_label = df.iloc[members].groupby(cluster_column).size().idxmax()
        # `members` holds row positions (they are read with .iloc above), while
        # .loc is label-based: translate positions to index labels so the write
        # hits the same rows even if the frame does not have a 0..n-1 index.
        df_unclass.loc[df_unclass.index[members], 'pred'] = majority_label
    # Compare the predictions with the true classes using the v-measure.
    return v_measure_score(df[cluster_column], df_unclass['pred'])

In [213]:
# Build one sample tree with the "full" method and render it.
t = create_tree(h=0, full=True)
print_tree(t)

+
├── max
│   ├── *
│   │   ├── *
│   │   │   ├── +
│   │   │   │   ├── +
│   │   │   │   │   ├── *
│   │   │   │   │   │   ├── 2b
│   │   │   │   │   │   └── 6a
│   │   │   │   │   └── +
│   │   │   │   │       ├── 7a
│   │   │   │   │       └── 4a
│   │   │   │   └── min
│   │   │   │       ├── /
│   │   │   │       │   ├── 5a
│   │   │   │       │   └── 3a
│   │   │   │       └── /
│   │   │   │           ├── 8a
│   │   │   │           └── 7b
│   │   │   └── /
│   │   │       ├── -
│   │   │       │   ├── /
│   │   │       │   │   ├── 8b
│   │   │       │   │   └── 5a
│   │   │       │   └── max
│   │   │       │       ├── -30.724761362550566
│   │   │       │       └── 1a
│   │   │       └── *
│   │   │           ├── -
│   │   │           │   ├── 6a
│   │   │           │   └── 3a
│   │   │           └── -
│   │   │               ├── 7b
│   │   │               └── 7a
│   │   └── *
│   │       ├── +
│   │       │   ├── min
│   │       │   │   ├── +
│   │       │   │   │   ├── 5b
│   

In [231]:
# Score the sample tree; as a side effect fitness() writes a 'pred' column into df_unclass.
fitness(t)

0.029733008639916056

In [232]:
# Inspect the features together with the 'pred' column written by the last fitness() call.
df_unclass

Unnamed: 0,refractive_index,Sodium,Magnesium,Aluminum,Silicon,Potassium,Calcium,Barium,Iron,pred
0,1.5159,12.82,3.52,1.9,72.86,0.69,7.97,0.0,0.0,2.0
1,1.51934,13.64,3.54,0.75,72.65,0.16,8.89,0.15,0.24,2.0
2,1.51818,13.72,0.0,0.56,74.45,0.0,10.99,0.0,0.0,2.0
3,1.52081,13.78,2.28,1.43,71.99,0.49,9.85,0.0,0.17,2.0
4,1.5186,13.36,3.43,1.43,72.26,0.51,8.6,0.0,0.0,2.0
5,1.52068,13.55,2.09,1.67,72.18,0.53,9.57,0.27,0.17,5.0
6,1.51806,13.0,3.8,1.08,73.07,0.56,8.38,0.0,0.12,2.0
7,1.52213,14.21,3.82,0.47,71.77,0.11,9.57,0.0,0.0,2.0
8,1.5172,13.38,3.5,1.15,72.85,0.5,8.43,0.0,0.0,2.0
9,1.52152,13.12,3.58,0.9,72.2,0.23,9.82,0.0,0.16,2.0
