1. load non-dominated semantics
2. figure out the region of each semantic based on its index
3. build each of the proposed methods
4. store its results (output per individual)
5. aggregate them 
6. plot charts
7. build tables


In [260]:
import numpy as np
import sys
import math

# Experiment grid. The driver loop combines one value from each of the four
# collections below with RESULTS_PATH to build a result directory of the form
# "<RESULTS_PATH>/<mode>/<strategy>/<k>/output-<dataset>".
EVOLUTION_MODES = ['gp', 'gsgp']
STRATEGIES = ['random', 'kmeans', 'kernel']
# Values 2, 3, 4 and 5 (the upper bound of range() is exclusive).
K = range(2,6)
DATASETS = ['bioavailability', 'ccn', 'ccun', 'concrete', 'energyCooling', 'energyHeating', 'keijzer-7', 
            'parkinsons', 'towerData', 'vladislavleva-1', 'wineRed', 'wineWhite', 'yacht']

# Root directory holding every experiment's output.
RESULTS_PATH = "/Volumes/externo/gsgp-mo/results"

In [342]:
# NOTE(review): this module-level dict is not read by any code visible in this
# chunk — confirm whether it is still needed.
individuals = {}

def parse_non_dominated_row(row):
    """Parse one line of a non-dominated semantics file.

    The line is split on ``,``. The first field is the execution number; each
    remaining field is split on ``;`` and the resulting table is converted to a
    float array (one row per comma-separated field).

    Returns:
        A ``(execution, semantics)`` tuple where ``execution`` is an ``int`` and
        ``semantics`` is a float ``numpy`` array.

    Raises:
        ValueError: if the first field is not an integer or a value is not a
            valid float.
    """
    columns = row.split(",")
    execution = int(columns[0])
    # The builtin ``float`` replaces the ``np.float`` alias, which was removed
    # from NumPy; both name the same 64-bit float type.
    semantics = np.array([column.split(";") for column in columns[1:]]).astype(float)
    return execution, semantics
    

def parse_non_dominated_files(path):
    """Iterate the training, validation and test semantics files in lockstep.

    One line is read from each of ``non_dominated_tr_semantics.csv``,
    ``non_dominated_val_semantics.csv`` and ``non_dominated_ts_semantics.csv``
    under ``path`` per step. Iteration stops as soon as any of the files is
    exhausted.

    Yields:
        ``(execution, training, validation, test)`` tuples, where the last three
        items are the arrays returned by ``parse_non_dominated_row``.

    Raises:
        Exception: if the three lines read in one step do not share the same
            execution number.
    """
    templates = ("%s/non_dominated_tr_semantics.csv",
                 "%s/non_dominated_val_semantics.csv",
                 "%s/non_dominated_ts_semantics.csv")

    handles = []
    try:
        for template in templates:
            handles.append(open(template % path))

        while True:
            rows = [handle.readline() for handle in handles]
            # readline() returns an empty string only at end of file.
            if not all(rows):
                break

            parsed = [parse_non_dominated_row(row) for row in rows]
            executions = [execution for execution, _ in parsed]

            if len(set(executions)) != 1:
                raise Exception("Lines in non dominated files under %s are not appearing in the same order." % path)

            yield (executions[0], *(semantics for _, semantics in parsed))
    finally:
        # Runs on normal exhaustion, on error, and when the generator is closed
        # early, so the handles are never leaked.
        for handle in handles:
            handle.close()
        
def non_dominated_ensemble(path, method):
    """Apply ``method`` to every execution found in the semantics files under ``path``.

    ``method`` is called as ``method(path, execution, training, validation, test)``.

    Returns:
        A list with one ``[execution, np.array(result)]`` entry per execution.
    """
    return [
        [run, np.array(method(path, run, tr_semantics, val_semantics, ts_semantics))]
        for run, tr_semantics, val_semantics, ts_semantics in parse_non_dominated_files(path)
    ]

def parse_groups(path, execution):
    """Read the group file of one execution.

    The file is ``<path>/groups-NN.txt`` where ``NN`` is ``execution + 1``
    zero-padded to two digits. Its first line is parsed as an integer; every
    following line is split on ``,`` and assigned to a partition according to
    its first field (``TRAINING``, ``VALIDATION`` or ``TEST``).

    Returns:
        ``(k, training, validation, test)`` where ``k`` is the integer from the
        first line and the other items are lists of split rows.
    """
    # The context manager closes the file even if reading fails.
    with open("%s/groups-%02d.txt" % (path, execution + 1)) as groups_file:
        content = groups_file.read().splitlines()

    k = int(content[0])
    groups = [row.split(',') for row in content[1:]]

    training = [group for group in groups if group[0] == "TRAINING"]
    validation = [group for group in groups if group[0] == "VALIDATION"]
    test = [group for group in groups if group[0] == "TEST"]

    return k, training, validation, test

def rmse(predicted, expected):
    """Return the root mean squared error between ``predicted`` and ``expected``."""
    residuals = predicted - expected
    return np.sqrt(np.mean(np.square(residuals)))

def bit_to_int(value):
    """Return the base-2 logarithm of ``value`` truncated to an integer.

    For a power of two this is the position of its single set bit
    (``1 -> 0``, ``2 -> 1``, ``4 -> 2``, ...).

    Raises:
        ValueError: if ``value`` is not positive.
    """
    # math.log2 is exact for powers of two, whereas math.log(value, 2) divides
    # two rounded logarithms and can land just below the true integer.
    return int(math.log2(value))
def ensemble_stacking(path, execution, training, validation, test):
    """Predict each test instance with the individual that performed best on its group.

    Group membership is read with ``parse_groups``: the second field of every
    group row is converted with ``bit_to_int`` to obtain the group index, and the
    last field of a validation row is the expected output.

    Returns:
        A list with one prediction per test instance.
    """
    k_groups, tr_groups, val_groups, ts_groups = parse_groups(path, execution)

    # measure error by group per each individual
    rmse_by_individual = []
    for candidate in validation:
        squared_by_group = {}
        for position, predicted in enumerate(candidate):
            record = val_groups[position]
            region = bit_to_int(int(record[1]))
            squared_by_group.setdefault(region, []).append((predicted - float(record[-1])) ** 2)

        rmse_by_individual.append(
            {region: np.sqrt(np.mean(squares)) for region, squares in squared_by_group.items()}
        )

    # get the best individual by group (-1 means no individual has an error for it)
    best_individual = {}
    for region in range(k_groups):
        winner, winner_error = -1, np.inf
        for position, region_errors in enumerate(rmse_by_individual):
            # Strict comparison keeps the first individual on ties.
            if region in region_errors and region_errors[region] < winner_error:
                winner, winner_error = position, region_errors[region]
        best_individual[region] = winner

    # predict the test semantics based on its groups
    stacked = []
    for position, fallback in enumerate(test[0]):
        chosen = best_individual[bit_to_int(int(ts_groups[position][1]))]
        # Without a winner for the group, use the first individual's prediction.
        stacked.append(fallback if chosen == -1 else test[chosen][position])

    return stacked

def ensemble_unweighted(path, execution, training, validation, test):
    """Average the test semantics of all individuals, instance by instance.

    Only ``test`` is used; the other parameters exist so the function shares the
    signature of the other ensemble builders.
    """
    per_individual = np.asarray(test)
    return per_individual.mean(axis=0)

def ensemble_weighted(path, execution, training, validation, test):
    """Combine the test semantics with rank-based weights from validation error.

    Individuals are ordered from highest to lowest validation RMSE; the one at
    zero-based position ``p`` receives weight ``(p + 1) / (1 + 2 + ... + n)``,
    so the lowest-error individual gets the largest weight.

    Returns:
        The weighted sum of the individuals' test semantics, one value per
        test instance.
    """
    k_groups, tr_groups, val_groups, ts_groups = parse_groups(path, execution)

    # The last field of the validation group rows holds the expected output.
    group_columns = list(zip(*val_groups))
    val_expected_y = np.array(group_columns[-1]).astype(float)

    val_errors = [rmse(candidate, val_expected_y) for candidate in validation]

    # Worst individual first, so the weight grows as the error shrinks.
    ranking = sorted(enumerate(val_errors), key=lambda pair: pair[1], reverse=True)

    # Sum of the ranks 1..n, used to normalise the weights.
    total_ranks = len(ranking) * (len(ranking) + 1) // 2

    weighted_rows = []
    for position, (member, _) in enumerate(ranking):
        weight = (position + 1) / total_ranks
        weighted_rows.append([weight * value for value in test[member]])

    return np.sum(weighted_rows, axis=0)



In [344]:
def save_as_csv(filename, execution, content):
    """Append one comma-separated line to ``filename``.

    The line starts with ``execution`` and is followed by every value of
    ``content`` converted to text.
    """
    with open(filename, 'a') as handle:
        fields = [str(execution)]
        fields.extend(np.array(content).astype(str))
        handle.write(",".join(fields) + "\n")
    
def save_ensembles(path, execution, training, validation, test):
    """Build the three ensembles for one execution and append each to its CSV file under ``path``."""
    builders = (
        ("%s/ensemble_stacking.csv", ensemble_stacking),
        ("%s/ensemble_unweighted.csv", ensemble_unweighted),
        ("%s/ensemble_weighted.csv", ensemble_weighted),
    )

    for template, build in builders:
        target = template % path
        prediction = build(path, execution, training, validation, test)
        save_as_csv(target, execution, prediction)

In [347]:
import os

# Rebuild the ensemble files of every (mode, strategy, k, dataset) combination.
for mode in EVOLUTION_MODES:
    for strategy in STRATEGIES:
        for k in K:
            for dataset in DATASETS:
                directory = "%s/%s/%s/%d/output-%s" % (RESULTS_PATH, mode, strategy, k, dataset)

                # save_as_csv appends, so stale files must go before rebuilding.
                for stale in ("%s/ensemble_stacking.csv",
                              "%s/ensemble_unweighted.csv",
                              "%s/ensemble_weighted.csv"):
                    try:
                        os.remove(stale % directory)
                    except FileNotFoundError:
                        # Nothing to clean up, e.g. on the first run.
                        pass

                print("Building new ensembles of %s" % directory)
                non_dominated_ensemble(directory, save_ensembles)

Building new ensembles of /Volumes/externo/gsgp-mo/results/gp/random/2/output-bioavailability
Building new ensembles of /Volumes/externo/gsgp-mo/results/gp/random/2/output-ccn
Building new ensembles of /Volumes/externo/gsgp-mo/results/gp/random/2/output-ccun
Building new ensembles of /Volumes/externo/gsgp-mo/results/gp/random/2/output-concrete
Building new ensembles of /Volumes/externo/gsgp-mo/results/gp/random/2/output-energyCooling
Building new ensembles of /Volumes/externo/gsgp-mo/results/gp/random/2/output-energyHeating
Building new ensembles of /Volumes/externo/gsgp-mo/results/gp/random/2/output-keijzer-7
Building new ensembles of /Volumes/externo/gsgp-mo/results/gp/random/2/output-parkinsons
Building new ensembles of /Volumes/externo/gsgp-mo/results/gp/random/2/output-towerData
Building new ensembles of /Volumes/externo/gsgp-mo/results/gp/random/2/output-vladislavleva-1
Building new ensembles of /Volumes/externo/gsgp-mo/results/gp/random/2/output-wineRed
Building new ensembles of

In [399]:
def read_ensemble(path):
    """Load an ensemble CSV file.

    Returns:
        A list with one float array per line of the file, each built from the
        line's comma-separated fields.
    """
    with open(path) as handle:
        rows = handle.readlines()

    return [np.array(row.split(",")).astype(float) for row in rows]
    
def to_file(filename, content):
    """Write ``content`` to ``filename`` as CSV, replacing any existing file.

    Args:
        filename: path of the file to (over)write.
        content: iterable of rows; each row is an iterable of values that are
            converted with ``str`` and joined with ``,``.
    """
    with open(filename, 'w') as f:
        # Values are stringified because str.join rejects numbers, and every
        # row gets its own line so the file can be read back row by row.
        f.writelines(",".join(map(str, row)) + "\n" for row in content)
    
def save_fitness(path):
    """Compute the test RMSE of every stored ensemble and write it under ``path``.

    Reads ``ensemble_stacking.csv``, ``ensemble_unweighted.csv`` and
    ``ensemble_weighted.csv`` from ``path``. The first value of each line is the
    execution number and the rest are the predictions. For each execution the
    expected outputs are taken from the last field of the test group rows
    returned by ``parse_groups``.

    One ``execution,rmse`` row per execution is written to
    ``ensemble_stacking_fitness.csv``, ``ensemble_unweighted_fitness.csv`` and
    ``ensemble_weighted_fitness.csv`` under ``path``.

    Raises:
        AssertionError: if the three ensemble files disagree on the number of
            lines, on an execution number, or on the number of predictions.
    """
    stacking = read_ensemble("%s/ensemble_stacking.csv" % path)
    unweighted = read_ensemble("%s/ensemble_unweighted.csv" % path)
    weighted = read_ensemble("%s/ensemble_weighted.csv" % path)

    stacking_fitness = []
    unweighted_fitness = []
    weighted_fitness = []

    assert len(stacking) == len(unweighted) == len(weighted)
    for stacking_row, unweighted_row, weighted_row in zip(stacking, unweighted, weighted):
        assert stacking_row[0] == unweighted_row[0] == weighted_row[0]

        execution = int(stacking_row[0])
        _, _, _, ts_groups = parse_groups(path, execution)
        expected_y = np.array(list(zip(*ts_groups))[-1]).astype(float)
        predicted_stacking_y = stacking_row[1:]
        predicted_unweighted_y = unweighted_row[1:]
        predicted_weighted_y = weighted_row[1:]

        # NOTE(review): diagnostic output for execution 15 only, kept as in the
        # original; it looks like a leftover from investigating one run.
        if execution == 15:
            print(list(map(lambda v: "%0.5f" % v,(predicted_unweighted_y - expected_y) ** 2)))
            print(rmse(predicted_unweighted_y, expected_y))

        assert len(expected_y) == len(predicted_stacking_y) == len(predicted_unweighted_y) == len(predicted_weighted_y)

        # Stored as text because the rows end up joined with ",".
        stacking_fitness.append([str(execution), str(rmse(predicted_stacking_y, expected_y))])
        unweighted_fitness.append([str(execution), str(rmse(predicted_unweighted_y, expected_y))])
        weighted_fitness.append([str(execution), str(rmse(predicted_weighted_y, expected_y))])

    # Written once, after every execution has been processed, and into ``path``.
    to_file("%s/ensemble_stacking_fitness.csv" % path, stacking_fitness)
    to_file("%s/ensemble_unweighted_fitness.csv" % path, unweighted_fitness)
    to_file("%s/ensemble_weighted_fitness.csv" % path, weighted_fitness)


# Kept outside the function body: calling save_fitness from within itself
# would recurse without end.
if __name__ == "__main__":
    save_fitness("/Volumes/externo/gsgp-mo/results/gp/kmeans/4/output-bioavailability")

['883.18403', '929.12202', '120.29337', '4630.60857', '1206.43641', '680.56512', '614.69577', '349.11055', '674.43007', '483.49089', '629.98983', '79.52365', '199.01077', '464.68329', '2228.70759', '260.06695', '3105.71712', '436.30796', '772.12821', '3258.42666', '1178.93434', '46.61816', '108.92026', '11068108464725.09961', '357.17828', '363.67051', '985.45619', '1035.78887', '2022.50991', '683.77210', '70.21144', '1720.20909', '438.36241', '181.23272', '670.92011', '1436.41521', '1830.59528', '801.33987', '644.93881', '1900.28235', '780.93739', '1918.50439', '1402.75076', '988.14297', '2611.85948', '36.87627', '196.18162', '10.79642', '4114.36372', '65.31099', '382.97912', '443.44045', '0.02047', '972.99919', '4805.04262', '1335.00139', '1656.75596', '372.41615', '1431.53167', '873.25068', '167.15149', '2272.10442', '0.50426', '23.22859', '600.41590', '1674.00177', '259.32876', '1618.13396', '121.06358', '114.90395', '708.15231', '2193.68148']
392076.1784214472
32.14647800085004
35.

In [349]:
# Ad-hoc check: run the stacking ensemble on a single result directory. The call
# returns one [execution, predictions] entry per execution.
non_dominated_ensemble("/Volumes/externo/gsgp-mo/results/gp/kmeans/2/output-bioavailability", ensemble_stacking)

[[5, array([ 71.84384658,  48.47269124,  40.8614952 ,  73.55960885,
          65.6137926 ,  97.220767  , 102.41719967,  69.16218501,
          47.63666291,  78.25315738,  97.97587745,  69.60480333,
          37.2254009 ,  63.56475714,  92.09441509,  69.42880556,
          71.42670816,  35.59734929,  68.52172778,  74.67292439,
         100.07869649,  57.57094527,  51.86991708,  69.82198377,
          64.06428637,  15.50408443,  41.30609545,  80.09582328,
          98.84988489,  84.07865633,  67.79663383,  39.27570883,
          62.13948215,  73.47704391, 109.212261  ,  73.56865741,
          31.36559043,  95.43876931,  39.72632345,  53.64092187,
          23.16295588,  69.89489279,  86.68744207,  71.51598889,
          31.73689581,  24.1666702 ,  97.38735215,  31.60463484,
          39.57993361, 105.81066474,  51.44340658,  52.80200092,
          64.42989717,  78.35002195, 120.31375258,  14.28821444,
          57.98117562,  74.85196771,  19.83233108,  13.44305049,
          95.95575251,