In [48]:
# Importando funções e bibliotecas
import numpy as np
import pandas as pd
import operator as op
import random as rd

In [44]:
# Load the dataset
df = pd.read_csv('glass_train.csv')
# Name of the column that is removed to build the unlabelled view below
# (presumably the class/cluster label -- confirm against the dataset description)
cluster_column = 'glass_type'
# Copy of the data without the cluster column; `df` itself is left untouched
df_unclassified = df.drop([cluster_column], axis=1)
# Bare expression: shows the frame as the cell output
df

Unnamed: 0,refractive_index,Sodium,Magnesium,Aluminum,Silicon,Potassium,Calcium,Barium,Iron,glass_type
0,1.51590,12.82,3.52,1.90,72.86,0.69,7.97,0.00,0.00,2
1,1.51934,13.64,3.54,0.75,72.65,0.16,8.89,0.15,0.24,3
2,1.51818,13.72,0.00,0.56,74.45,0.00,10.99,0.00,0.00,2
3,1.52081,13.78,2.28,1.43,71.99,0.49,9.85,0.00,0.17,2
4,1.51860,13.36,3.43,1.43,72.26,0.51,8.60,0.00,0.00,2
...,...,...,...,...,...,...,...,...,...,...
166,1.53125,10.73,0.00,2.10,69.81,0.58,13.30,3.15,0.28,2
167,1.51763,12.61,3.59,1.31,73.29,0.58,8.50,0.00,0.00,1
168,1.51588,13.12,3.41,1.58,73.26,0.07,8.39,0.00,0.19,2
169,1.51852,14.09,2.19,1.66,72.67,0.00,9.32,0.00,0.00,6


In [397]:
# Control variables for individuals
# Depth at which create_tree stops choosing functions and forces a terminal
max_depth = 7
# Number of columns left once the cluster column has been dropped
attr_count = len(df_unclassified.columns)
# Bare expression: shows the value as the cell output
attr_count

9

In [320]:
# Helper functions for the GP
def div0(a, b):
    """Protected division.

    Returns ``a / b``, except that a zero divisor yields 1 instead of
    raising ``ZeroDivisionError``.
    """
    return 1 if b == 0 else a/b

# Function set of the GP: maps each function symbol stored in a tree node to
# the callable that Tree.evaluate invokes with the two child values.
ops = {
    '+': op.add,
    '-': op.sub,
    '*': op.mul,
    '/': div0,  # protected division (returns 1 on a zero divisor)
    'max': max,
    'min': min,
}

# Draw a random constant (between -1000 and 1000 by default)
def get_random_constant(min_v = -1000, max_v = 1000):
    """Return a float drawn uniformly from the range ``min_v`` .. ``max_v``."""
    low, high = min_v, max_v
    return rd.uniform(low, high)

# Translate a terminal into a concrete value, given two dataset instances
def get_attr(terminal, inst1, inst2, n_attrs=None):
    """Resolve a tree terminal against a pair of dataset rows.

    A terminal is either a constant, which is returned unchanged, or a
    string of the form ``"<index><a|b>"`` (e.g. ``"2b"``, ``"5a"``): the
    attribute at position ``index`` of ``inst1`` (suffix ``a``) or ``inst2``
    (suffix ``b``).

    Args:
        terminal: constant value or attribute reference string.
        inst1: first row; a positionally indexable sequence or pandas Series.
        inst2: second row, same kind as ``inst1``.
        n_attrs: number of valid attribute positions. Defaults to the
            module-level ``attr_count`` when omitted.

    Returns:
        The constant itself, or the referenced attribute value.

    Raises:
        IndexError: if the attribute index is negative or not below the
            number of attributes.
    """
    t = str(terminal)
    inst = t[-1]
    # Constants do not end with the instance selector "a"/"b"
    if inst != 'a' and inst != 'b':
        return terminal
    # Attribute reference of the form "<index><a|b>"
    index = int(t[:-1])
    limit = attr_count if n_attrs is None else n_attrs
    # Negative positions are rejected too: they would silently read from the
    # end of the row (e.g. a trailing label column) instead of an attribute.
    if index < 0 or index >= limit:
        raise IndexError('Terminal value outside of row range')
    row = inst1 if inst == 'a' else inst2
    # A pandas Series with non-integer labels only accepts integer keys via a
    # deprecated positional fallback; use explicit positional access instead.
    if hasattr(row, 'iloc'):
        return row.iloc[index]
    return row[index]

# Build the list of all attribute terminals
def get_terminals():
    """Return every attribute terminal as a string.

    The list holds ``"0a" .. "<attr_count-1>a"`` followed by
    ``"0b" .. "<attr_count-1>b"``.
    """
    return [str(i) + suffix for suffix in ('a', 'b') for i in range(attr_count)]

# Build the lists of function symbols and terminals
funcs = list(ops.keys())
terms = get_terminals()
# 'c' is a placeholder entry: the random pickers below replace it with a
# freshly drawn constant instead of returning the literal 'c'
terms.append('c')
# Every symbol a grow-mode node may take: functions followed by terminals
components = funcs + terms

# Pick a random terminal
def get_random_terminal():
    """Return a random entry of ``terms``.

    When the placeholder ``'c'`` is drawn, a new random constant is returned
    in its place.
    """
    picked = rd.choice(terms)
    return get_random_constant() if picked == 'c' else picked

# Pick a random function
def get_random_function():
    """Return a random function symbol taken from ``funcs``."""
    picked = rd.choice(funcs)
    return picked

# Pick a random component (function or terminal)
def get_random_component():
    """Return a random entry of ``components``.

    When the placeholder ``'c'`` is drawn, a new random constant is returned
    in its place.
    """
    picked = rd.choice(components)
    return get_random_constant() if picked == 'c' else picked

In [243]:
# Binary tree representing a GP individual
class Tree:
    """Node of a binary expression tree.

    ``value`` holds a function symbol, an attribute terminal or a constant;
    ``left`` and ``right`` are the child nodes and ``parent`` the node above.
    """

    def __init__(self, parent = None):
        self.parent = parent
        self.value = None
        self.left = None
        self.right = None

    def evaluate(self, row1, row2):
        """Evaluate the subtree rooted at this node for a pair of rows."""
        # Terminal or constant: resolve it directly against the two rows
        if self.value not in funcs:
            return get_attr(self.value, row1, row2)
        # Function node: evaluate the left child, then the right one, and combine
        left_value = self.left.evaluate(row1, row2)
        right_value = self.right.evaluate(row1, row2)
        return ops[self.value](left_value, right_value)

    def print(self):
        """Draw the subtree rooted at this node with ``printBTree``."""
        def describe(n):
            return (str(n.value), n.left, n.right)
        return printBTree(self, describe)

# Build a random individual tree with the grow or the full method
def create_tree(depth = 0, full = False):
    """Recursively build a random expression tree.

    Args:
        depth: depth of the node being created (0 for the root).
        full: when true, use the "full" method (only functions are chosen
            until ``max_depth`` is reached); otherwise use "grow" (any
            component may be chosen at any depth).

    Returns:
        The root ``Tree`` node of the generated (sub)tree.
    """
    node = Tree()
    # At the maximum depth a terminal or constant is mandatory
    if depth >= max_depth:
        node.value = get_random_terminal()
        return node

    node.value = get_random_function() if full else get_random_component()

    # Only function nodes receive children. The check is made against `funcs`
    # rather than "not in terms": a drawn constant is a float that never
    # appears in `terms`, so the latter test would attach two children to
    # constant nodes that evaluate() never visits.
    if node.value in funcs:
        node.left = create_tree(depth + 1, full)
        node.left.parent = node
        node.right = create_tree(depth + 1, full)
        node.right.parent = node
    return node

In [467]:
# Smoke test: build a random tree with the full method and evaluate it on the
# first two rows of df
t = create_tree(full=True)
print('Evaluated with 0 and 1: ' + str(t.evaluate(df.iloc[0], df.iloc[1])))

Evaluated with 0 and 1: -0.7826893022280031


In [56]:
# Euclidean distance utilities (STILL UNSURE WHETHER/HOW TO APPLY THEM)
def euclidian_dist(point1, point2):
    """Return the Euclidean norm of ``point1 - point2``."""
    difference = point1 - point2
    return np.linalg.norm(difference)

# Neighbors kept per row: twice the rounded-up cube root of the number of rows in df
neighbor_count = int(np.ceil(np.cbrt(len(df)))) * 2

# Find the nearest neighbors of one df row based on Euclidean distance
def get_nearest_neighbors(row_index):
    """Return the index labels of the rows of ``df`` closest to one row.

    Args:
        row_index: position (as used by ``df.iloc``) of the reference row.

    Returns:
        List with the index labels of the ``neighbor_count`` rows that have
        the smallest Euclidean distance to the reference row, closest first.
        The reference row itself is excluded.
    """
    target = df.iloc[row_index]
    # Remove the reference row. It is read by position above, so translate the
    # position into its label before dropping to keep both lookups consistent.
    others = df.drop(df.index[row_index], axis=0)
    # NOTE(review): the distance uses every column of df, including the
    # cluster column -- confirm whether df_unclassified was intended here.
    others['Euclidian'] = others.apply(lambda x: euclidian_dist(x, target), axis=1)
    # Ascending order puts the smallest distances first; sorting in descending
    # order would return the farthest rows instead of the nearest ones.
    nearest = others.sort_values('Euclidian', ascending=True).head(neighbor_count)
    return list(nearest.index.values)

# Compute the nearest neighbors of every row of df
def calculate_nearest_neighbors():
    """Return one neighbor list per row of ``df``, in row order.

    Entry ``i`` is the result of ``get_nearest_neighbors(i)``.
    """
    return [get_nearest_neighbors(position) for position in range(len(df))]

# Compute the neighbor lists once; entry i belongs to the row at position i of df
nearest_neighbors = calculate_nearest_neighbors()

In [83]:
# Funcionalidade para imprimir árvore binária
import functools as fn

def printBTree(node, nodeInfo=None, inverted=False, isTop=True):
    """Render a binary tree as text, one node value per line segment.

    Args:
        node: root of the (sub)tree to render.
        nodeInfo: callable that receives a node and returns the tuple
            ``(value string, left child, right child)``; a falsy child is
            treated as absent. It must be supplied: it is called
            unconditionally, so the default ``None`` raises ``TypeError``.
        inverted: when true, the slash characters are swapped, the link
            underline becomes "¯" and the top-level call reverses the order
            of the lines.
        isTop: true for the outermost call; the recursive calls pass False.

    Returns:
        ``None`` for the top-level call, which prints the lines joined by
        newlines; the list of text lines for recursive calls.
    """

    # node value string and sub nodes
    stringValue, leftNode, rightNode = nodeInfo(node)

    stringValueWidth  = len(stringValue)

    # recurse to sub nodes to obtain line blocks on left and right
    leftTextBlock     = [] if not leftNode else printBTree(leftNode,nodeInfo,inverted,False)

    rightTextBlock    = [] if not rightNode else printBTree(rightNode,nodeInfo,inverted,False)

    # count common and maximum number of sub node lines
    commonLines       = min(len(leftTextBlock),len(rightTextBlock))
    subLevelLines     = max(len(rightTextBlock),len(leftTextBlock))

    # extend lines on shallower side to get same number of lines on both sides
    leftSubLines      = leftTextBlock  + [""] *  (subLevelLines - len(leftTextBlock))
    rightSubLines     = rightTextBlock + [""] *  (subLevelLines - len(rightTextBlock))

    # compute location of value or link bar for all left and right sub nodes
    #   * left node's value ends at line's width
    #   * right node's value starts after initial spaces
    leftLineWidths    = [ len(line) for line in leftSubLines  ]
    rightLineIndents  = [ len(line)-len(line.lstrip(" ")) for line in rightSubLines ]

    # top line value locations, will be used to determine position of current node & link bars
    firstLeftWidth    = (leftLineWidths   + [0])[0]
    firstRightIndent  = (rightLineIndents + [0])[0]

    # width of sub node link under node value (i.e. with slashes if any)
    # aims to center link bars under the value if value is wide enough
    #
    # ValueLine:    v     vv    vvvvvv   vvvvv
    # LinkLine:    / \   /  \    /  \     / \
    #
    linkSpacing       = min(stringValueWidth, 2 - stringValueWidth % 2)
    leftLinkBar       = 1 if leftNode  else 0
    rightLinkBar      = 1 if rightNode else 0
    minLinkWidth      = leftLinkBar + linkSpacing + rightLinkBar
    valueOffset       = (stringValueWidth - linkSpacing) // 2

    # find optimal position for right side top node
    #   * must allow room for link bars above and between left and right top nodes
    #   * must not overlap lower level nodes on any given line (allow gap of minSpacing)
    #   * can be offset to the left if lower subNodes of right node
    #     have no overlap with subNodes of left node
    minSpacing        = 2
    rightNodePosition = fn.reduce(lambda r,i: max(r,i[0] + minSpacing + firstRightIndent - i[1]), \
                                 zip(leftLineWidths,rightLineIndents[0:commonLines]), \
                                 firstLeftWidth + minLinkWidth)

    # extend basic link bars (slashes) with underlines to reach left and right
    # top nodes.
    #
    #        vvvvv
    #       __/ \__
    #      L       R
    #
    linkExtraWidth    = max(0, rightNodePosition - firstLeftWidth - minLinkWidth )
    rightLinkExtra    = linkExtraWidth // 2
    leftLinkExtra     = linkExtraWidth - rightLinkExtra

    # build value line taking into account left indent and link bar extension (on left side)
    valueIndent       = max(0, firstLeftWidth + leftLinkExtra + leftLinkBar - valueOffset)
    valueLine         = " " * max(0,valueIndent) + stringValue
    slash             = "\\" if inverted else  "/"
    backslash         = "/" if inverted else  "\\"
    uLine             = "¯" if inverted else  "_"

    # build left side of link line
    leftLink          = "" if not leftNode else ( " " * firstLeftWidth + uLine * leftLinkExtra + slash)

    # build right side of link line (includes blank spaces under top node value)
    rightLinkOffset   = linkSpacing + valueOffset * (1 - leftLinkBar)
    rightLink         = "" if not rightNode else ( " " * rightLinkOffset + backslash + uLine * rightLinkExtra )

    # full link line (will be empty if there are no sub nodes)
    linkLine          = leftLink + rightLink

    # will need to offset left side lines if right side sub nodes extend beyond left margin
    # can happen if left subtree is shorter (in height) than right side subtree
    leftIndentWidth   = max(0,firstRightIndent - rightNodePosition)
    leftIndent        = " " * leftIndentWidth
    indentedLeftLines = [ (leftIndent if line else "") + line for line in leftSubLines ]

    # compute distance between left and right sublines based on their value position
    # can be negative if leading spaces need to be removed from right side
    mergeOffsets      = [ len(line) for line in indentedLeftLines ]
    mergeOffsets      = [ leftIndentWidth + rightNodePosition - firstRightIndent - w for w in mergeOffsets ]
    mergeOffsets      = [ p if rightSubLines[i] else 0 for i,p in enumerate(mergeOffsets) ]

    # combine left and right lines using computed offsets
    #   * indented left sub lines
    #   * spaces between left and right lines
    #   * right sub line with extra leading blanks removed.
    mergedSubLines    = zip(range(len(mergeOffsets)), mergeOffsets, indentedLeftLines)
    mergedSubLines    = [ (i,p,line + (" " * max(0,p)) )       for i,p,line in mergedSubLines ]
    mergedSubLines    = [ line + rightSubLines[i][max(0,-p):]  for i,p,line in mergedSubLines ]

    # Assemble final result combining
    #  * node value string
    #  * link line (if any)
    #  * merged lines from left and right sub trees (if any)
    treeLines = [leftIndent + valueLine] + ( [] if not linkLine else [leftIndent + linkLine] ) + mergedSubLines

    # invert final result if requested
    treeLines = reversed(treeLines) if inverted and isTop else treeLines

    # return intermediate tree lines or print final result
    if isTop : print("\n".join(treeLines))
    else     : return treeLines