In [33]:
import skmultilearn
from skmultilearn import dataset
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, hamming_loss, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from skmultilearn.adapt import MLkNN

from deap import base, creator, gp, tools, algorithms
from itertools import combinations
import operator, math, random
import numpy as np
import pygraphviz as pgv
import sympy

In [2]:
def protectedDiv(left, right):
    try:
        return left / right
    except ZeroDivisionError:
        return 1

pset = gp.PrimitiveSet("MAIN", 1)
pset.addPrimitive(operator.add, 2)
pset.addPrimitive(operator.sub, 2)
pset.addPrimitive(operator.mul, 2)
pset.addPrimitive(protectedDiv, 2)

pset.renameArguments(ARG0='x')

In [27]:
e_train = skmultilearn.dataset.load_dataset('emotions', 'train')
e_test = skmultilearn.dataset.load_dataset('emotions', 'test')

e_train = skmultilearn.dataset.load_dataset('medical', 'train')

emotions:train - exists, not redownloading
emotions:test - exists, not redownloading
medical:train - exists, not redownloading


In [28]:
# X, y, names of attributes, names of labels
X_train = e_train[0].toarray() # 391, 72
y_train = e_train[1].toarray() # 391, 6

In [30]:
y_train_u = np.unique(y_train, axis=0) # 26 unique labels
y_train_u.shape

(61, 45)

In [35]:
creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
creator.create("Individual", gp.PrimitiveTree, fitness=creator.FitnessMin)

toolbox = base.Toolbox()
toolbox.register("expr", gp.genHalfAndHalf, pset=pset, min_=1, max_=2)
toolbox.register("individual", tools.initIterate, creator.Individual, toolbox.expr)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("compile", gp.compile, pset=pset)

def to_dec(label):
    # label to integer
    str_list = list(str(x) for x in label)
    str_bin = ''.join(str_list)
    return int(str_bin, 2)

def evalSymbReg(individual, labels):
    # transform the tree expression in a callable function
    func = toolbox.compile(expr=individual)
    loss = 0
    
    for i in range(len(labels)):
        label_l = []
        for j in range(len(labels)):
            if i != j:
                label_l.append([np.count_nonzero(labels[i] != labels[j]),
                                func(to_dec(labels[j]))])
        
        closest_orig = sorted(label_l)[:5] # closest original labels
        closest_new = sorted(label_l, key=lambda x: x[1])[:5] # closest new labels
        
        for l in closest_orig:
            if l not in closest_new:
                loss += 1
        
    return loss,

toolbox.register("evaluate", evalSymbReg, labels=y_train_u)
toolbox.register("select", tools.selTournament, tournsize=3)
toolbox.register("mate", gp.cxOnePoint)
toolbox.register("expr_mut", gp.genFull, min_=0, max_=2)
toolbox.register("mutate", gp.mutUniform, expr=toolbox.expr_mut, pset=pset)

toolbox.decorate("mate", gp.staticLimit(key=operator.attrgetter("height"), max_value=17))
toolbox.decorate("mutate", gp.staticLimit(key=operator.attrgetter("height"), max_value=17))

def main():
    random.seed(0)

    pop = toolbox.population(n=20)
    hof = tools.HallOfFame(1)

    stats_fit = tools.Statistics(lambda ind: ind.fitness.values)
    stats_size = tools.Statistics(len)
    mstats = tools.MultiStatistics(fitness=stats_fit, size=stats_size)
    mstats.register("avg", np.mean)
    mstats.register("std", np.std)
    mstats.register("min", np.min)
    mstats.register("max", np.max)

    pop, log = algorithms.eaSimple(pop, toolbox, 0.5, 0.1, 10, stats=mstats,
                                   halloffame=hof, verbose=True) # P(mate), P(mutate), ngen
    # print log
    return pop, log, hof # returns population, log, hall of fame

progs = main()

   	      	                    fitness                    	                      size                     
   	      	-----------------------------------------------	-----------------------------------------------
gen	nevals	avg  	gen	max	min	nevals	std    	avg	gen	max	min	nevals	std    
0  	20    	274.1	0  	411	193	20    	58.1428	4.8	0  	7  	3  	20    	1.98997
1  	15    	252.4	1  	301	193	15    	53.7293	4.7	1  	13 	1  	15    	2.77669
2  	11    	236.3	2  	411	193	11    	63.2496	5.2	2  	9  	3  	11    	2.18174
3  	10    	236.4	3  	411	193	10    	72.1224	5.2	3  	9  	3  	10    	1.77764
4  	16    	241.8	4  	411	193	16    	72.7115	5.7	4  	11 	1  	16    	2.47184
5  	13    	209.3	5  	411	193	13    	51.901 	6.7	5  	13 	3  	13    	2.91719
6  	20    	247.3	6  	411	193	20    	80.7026	6.2	6  	13 	3  	20    	2.78568
7  	11    	225.6	7  	411	193	11    	69.6853	7  	7  	13 	3  	11    	3.28634
8  	8     	193  	8  	193	193	8     	0      	6  	8  	11 	3  	8     	2.93258
9  	5     	203.9	9  	411	193	5     	

In [36]:
nodes, edges, labels = gp.graph(progs[2][0]) # hof

print(gp.PrimitiveTree(progs[2][0]))

g = pgv.AGraph()
g.add_nodes_from(nodes)
g.add_edges_from(edges)
g.layout(prog="dot")

for i in nodes:
    n = g.get_node(i)
    n.attr["label"] = labels[i]

g.draw("tree.pdf")

mul(mul(x, x), sub(x, x))
