1. Load non-dominated semantics
2. Figure out the region of each semantic based on its index
3. Build each of the proposed methods
4. Store their results (output per individual)
5. Aggregate them
6. Plot charts
7. Build tables


In [260]:
import numpy as np
import sys
import math

# Experiment grid. A result directory is addressed further down as
# "<RESULTS_PATH>/<mode>/<strategy>/<k>/output-<dataset>".

# Evolution modes. NOTE(review): the driver loops visible in this section
# iterate over the literal ['gp'] instead of this list -- confirm whether
# 'gsgp' is meant to be processed too.
EVOLUTION_MODES = ['gp', 'gsgp']
# Strategy names used as one path component of a result directory.
STRATEGIES = ['random', 'kmeans', 'kernel']
# Values of k used as a path component: 2, 3, 4 and 5.
K = range(2,6)
# Dataset names; each one maps to an "output-<dataset>" directory.
DATASETS = ['bioavailability', 'ccn', 'ccun', 'concrete', 'energyCooling', 'energyHeating', 'keijzer-7', 
            'parkinsons', 'towerData', 'vladislavleva-1', 'wineRed', 'wineWhite', 'yacht']

# Root directory under which every result directory is looked up.
RESULTS_PATH = "/Volumes/externo/gsgp-mo/results"

In [477]:
# Module-level dict; it is not read by any function visible in this section.
individuals = {}

def parse_non_dominated_row(row):
    """Parse one line of a non-dominated semantics CSV file.

    The line has the form ``<execution>,<v;v;...>,<v;v;...>,...``: the first
    comma-separated column is the execution number and every remaining column
    is a ``;``-separated list of numbers.

    Args:
        row: One text line, with or without its trailing line terminator.

    Returns:
        A tuple ``(execution, values)`` where ``execution`` is an ``int`` and
        ``values`` is a float array holding one row per column after the first.
    """
    # Lines come from readline(), so drop the terminator before parsing.
    columns = row.strip().split(",")
    execution = int(columns[0])
    # The builtin float replaces the np.float alias, which NumPy 1.24 removed.
    values = np.array([column.split(";") for column in columns[1:]]).astype(float)
    return (execution, values)
    

def parse_non_dominated_files(path):
    """Iterate in lockstep over the three non-dominated semantics files in ``path``.

    One line is read from each of the training, validation and test files per
    step, and each line is parsed with ``parse_non_dominated_row``. Iteration
    stops as soon as any of the files has no more lines.

    Args:
        path: Directory containing ``non_dominated_tr_semantics.csv``,
            ``non_dominated_val_semantics.csv`` and
            ``non_dominated_ts_semantics.csv``.

    Yields:
        ``(execution, training, validation, test)``: the execution number
        shared by the three lines, followed by the parsed values of each file.

    Raises:
        ValueError: If the three lines read in one step do not carry the same
            execution number.
    """
    # The with-statement closes all three files even when the consumer stops
    # early or an exception propagates out of the generator.
    with open("%s/non_dominated_tr_semantics.csv" % path) as tr_file, \
            open("%s/non_dominated_val_semantics.csv" % path) as val_file, \
            open("%s/non_dominated_ts_semantics.csv" % path) as ts_file:
        while True:
            executions = []
            semantics = []
            for handle in (tr_file, val_file, ts_file):
                curr_row = handle.readline()
                if not curr_row:
                    # readline() returns '' only at end of file.
                    return

                curr_execution, curr_semantics = parse_non_dominated_row(curr_row)
                executions.append(curr_execution)
                semantics.append(curr_semantics)

            if len(set(executions)) != 1:
                raise ValueError(
                    "Lines in non dominated files under %s are not appearing in the same order." % path)

            yield (executions[0], *semantics)
        
def non_dominated_ensemble(path, method):
    """Apply ``method`` to every execution stored in the non-dominated files.

    Args:
        path: Result directory handed to ``parse_non_dominated_files`` and
            forwarded to ``method``.
        method: Callable invoked as
            ``method(path, execution, training, validation, test)``; it must
            return three sequences (training, validation and test results).

    Returns:
        A dict with the keys ``'training'``, ``'validation'`` and ``'test'``.
        Each value is a list of ``[execution, result_array]`` pairs, in the
        order the executions appear in the files.
    """
    output = {
        'training': [],
        'validation': [],
        'test': []
    }

    for execution, training, validation, test in parse_non_dominated_files(path):
        results = method(path, execution, training, validation, test)
        # Convert each part separately: the three parts may have different
        # lengths, and wrapping such a ragged tuple in a single np.array()
        # raises ValueError on NumPy >= 1.24.
        res_training, res_validation, res_test = (np.asarray(part) for part in results)
        output['training'].append([execution, res_training])
        output['validation'].append([execution, res_validation])
        output['test'].append([execution, res_test])

    return output

def parse_groups(path, execution):
    """Read the groups file of one execution and split its rows by partition.

    The file is ``<path>/groups-NN.txt`` where ``NN`` is ``execution + 1``
    zero-padded to two digits. Its first line is an integer ``k``; every other
    line is a comma-separated row whose first field is ``TRAINING``,
    ``VALIDATION`` or ``TEST``.

    Args:
        path: Directory containing the groups file.
        execution: Zero-based execution number.

    Returns:
        ``(k, training, validation, test)`` where ``k`` is an ``int`` and the
        other three are lists of rows (each row a list of strings) in file
        order. Rows with any other first field are dropped.
    """
    # The context manager closes the file even if reading fails.
    with open("%s/groups-%02d.txt" % (path, execution + 1)) as handle:
        content = handle.read().splitlines()

    k = int(content[0])
    groups = [row.split(',') for row in content[1:]]

    def rows_of(partition):
        return [group for group in groups if group[0] == partition]

    return k, rows_of("TRAINING"), rows_of("VALIDATION"), rows_of("TEST")

def rmse(predicted, expected):
    """Return the root of the mean squared difference between two arrays."""
    residual = predicted - expected
    mean_squared = np.mean(residual ** 2)
    return np.sqrt(mean_squared)

def bit_to_int(value):
    """Return the base-2 logarithm of ``value`` truncated to an ``int``.

    For a power of two ``2 ** n`` this is ``n``, i.e. the position of the
    single set bit.

    Args:
        value: A positive number; the callers in this file pass an ``int``.

    Returns:
        ``floor(log2(value))`` as an ``int`` for positive integers.

    Raises:
        ValueError: If ``value`` is zero or negative (raised by ``math.log``).
    """
    if isinstance(value, int) and value > 0:
        # Exact integer arithmetic: no floating-point logarithm whose rounding
        # error could push the result below the true exponent before truncation.
        return value.bit_length() - 1
    # Non-int or non-positive input keeps the original behaviour.
    return int(math.log(value, 2))
def _stack_by_group(semantics, groups, best_by_group):
    """Pick, for every sample, the output of the individual chosen for the sample's group."""
    stacked = []
    for index in range(len(semantics[0])):
        group = bit_to_int(int(groups[index][1]))
        best_index = best_by_group[group][0]
        # -1 means no individual was scored for this group on the validation
        # data; fall back to the first individual.
        source = semantics[0] if best_index == -1 else semantics[best_index]
        stacked.append(source[index])
    return stacked


def ensemble_stacking(path, execution, training, validation, test):
    """Combine the individuals by using, per group, the one with the lowest validation error.

    Steps:
      1. measure the validation RMSE of every individual separately per group;
      2. select the best individual of each group;
      3. build the training, validation and test outputs by taking, for every
         sample, the output of the individual selected for that sample's group.

    Args:
        path: Result directory containing the groups file.
        execution: Zero-based execution number (selects the groups file).
        training: Sequence of per-individual outputs on the training samples.
        validation: Same, for the validation samples.
        test: Same, for the test samples.

    Returns:
        A tuple ``(res_training, res_validation, res_test)`` of lists holding
        one value per sample.
    """
    k_groups, tr_groups, val_groups, ts_groups = parse_groups(path, execution)

    # The group and expected value of each validation sample do not depend on
    # the individual, so they are parsed once instead of once per individual.
    # Column 1 of a groups row holds the group as a power of two; the last
    # column holds the expected output.
    val_group_ids = [bit_to_int(int(row[1])) for row in val_groups]
    val_expected = [float(row[-1]) for row in val_groups]

    # Step 1: per-individual mapping of group -> RMSE on the validation samples.
    errors = []
    for ind in validation:
        squared_by_group = {}
        for index, semantic in enumerate(ind):
            squared_by_group.setdefault(val_group_ids[index], []).append(
                (semantic - val_expected[index]) ** 2)
        errors.append({group: np.sqrt(np.mean(values))
                       for group, values in squared_by_group.items()})

    # Step 2: (individual index, error) with the lowest error per group. The
    # strict comparison keeps the earliest individual on ties.
    lowest = {k: (-1, np.inf) for k in range(k_groups)}
    for index, err in enumerate(errors):
        for k in range(k_groups):
            if k in err and err[k] < lowest[k][1]:
                lowest[k] = (index, err[k])

    # Step 3: assemble the stacked output of every partition.
    res_training = _stack_by_group(training, tr_groups, lowest)
    res_validation = _stack_by_group(validation, val_groups, lowest)
    res_test = _stack_by_group(test, ts_groups, lowest)

    return (res_training, res_validation, res_test)

def ensemble_unweighted(path, execution, training, validation, test):
    """Average the outputs of all individuals with equal weight.

    ``path`` and ``execution`` are not used; they are accepted so the function
    can be called with the same arguments as the other ensemble builders.

    Returns:
        A tuple with the mean along axis 0 of ``training``, ``validation`` and
        ``test``, in that order.
    """
    partitions = (training, validation, test)
    return tuple(np.mean(partition, axis=0) for partition in partitions)

def ensemble_weighted(path, execution, training, validation, test):
    """Combine the individuals with rank-based weights from their validation error.

    Individuals are ordered from the highest to the lowest validation RMSE and
    the individual at position ``r`` (starting at 1) gets the weight
    ``r / (1 + 2 + ... + n)``, so the lowest error receives the largest weight.

    Args:
        path: Result directory containing the groups file.
        execution: Zero-based execution number (selects the groups file).
        training: Sequence of per-individual outputs on the training samples.
        validation: Same, for the validation samples.
        test: Same, for the test samples.

    Returns:
        A tuple with the weighted sums over individuals of ``training``,
        ``validation`` and ``test``.
    """
    _, _, val_groups, _ = parse_groups(path, execution)

    # Transposing the rows exposes the last column: the expected outputs.
    val_columns = list(zip(*val_groups))
    val_expected_y = np.array(val_columns[-1]).astype(float)
    val_errors = [rmse(outputs, val_expected_y) for outputs in validation]

    # Worst individual first, so the weight grows with the position.
    ranking = sorted(enumerate(val_errors), key=lambda pair: pair[1], reverse=True)
    coeficient = sum(range(1, len(ranking) + 1))

    res_training, res_validation, res_test = [], [], []
    for position, (ind_index, _) in enumerate(ranking, start=1):
        weight = position / coeficient
        res_training.append([weight * semantic for semantic in training[ind_index]])
        res_validation.append([weight * semantic for semantic in validation[ind_index]])
        res_test.append([weight * semantic for semantic in test[ind_index]])

    return (np.sum(res_training, axis=0), np.sum(res_validation, axis=0), np.sum(res_test, axis=0))

In [491]:
def save_as_csv(filename, execution, content):
    """Append one row ``execution,v1,v2,...`` to the CSV file ``filename``.

    Args:
        filename: File to append to; it is created when it does not exist.
        execution: Value written as the first column.
        content: Sequence of values written as the remaining columns, each one
            converted to text through a NumPy string cast.
    """
    # Format the row before opening the file, so a conversion error leaves the
    # file untouched.
    fields = [str(execution), *np.array(content).astype(str)]
    with open(filename, 'a') as f:
        # write() rather than writelines(): the row is one string, and
        # writelines() would iterate over it character by character.
        f.write(",".join(fields) + "\n")
    
def save_ensembles(path, execution, training, validation, test):
    """Append the individual fitness and the three ensemble outputs of one execution to CSV files.

    First the RMSE of every individual is appended to
    ``non_dominated_fitness_<split>.csv``. Then the outputs of the stacking,
    unweighted and weighted ensembles are appended, in that order, to
    ``ensemble_<method>_<split>.csv``. ``<split>`` is ``training``,
    ``validation`` or ``test``, and all files live under ``path``.

    Returns:
        Always ``([], [], [])``: three empty lists.
    """
    _, tr_groups, val_groups, ts_groups = parse_groups(path, execution)

    def expected_outputs(groups):
        # Transposing the rows exposes the last column: the expected outputs.
        return np.array(list(zip(*groups))[-1]).astype(float)

    expected_tr_y = expected_outputs(tr_groups)
    expected_val_y = expected_outputs(val_groups)
    expected_ts_y = expected_outputs(ts_groups)

    # RMSE of every individual on each split.
    fitness_by_split = (
        ("training", [rmse(arr, expected_tr_y) for arr in training]),
        ("validation", [rmse(arr, expected_val_y) for arr in validation]),
        ("test", [rmse(arr, expected_ts_y) for arr in test]),
    )
    for split, fitness in fitness_by_split:
        save_as_csv("%s/non_dominated_fitness_%s.csv" % (path, split), execution, fitness)

    # Each ensemble is computed and written before the next one is started.
    builders = (
        ("%s/ensemble_stacking_%s.csv", ensemble_stacking),
        ("%s/ensemble_unweighted_%s.csv", ensemble_unweighted),
        ("%s/ensemble_weighted_%s.csv", ensemble_weighted),
    )
    for template, build in builders:
        res_training, res_validation, res_test = build(path, execution, training, validation, test)
        results_by_split = (
            ("training", res_training),
            ("validation", res_validation),
            ("test", res_test),
        )
        for split, result in results_by_split:
            save_as_csv(template % (path, split), execution, result)

    return ([], [], [])
#non_dominated_ensemble('/Volumes/externo/gsgp-mo/results/gp/random/2/output-yacht', save_ensembles)

In [493]:
import os

# Rebuild the ensemble CSV files of every strategy / k / dataset result
# directory. Only the 'gp' mode is processed by this loop.
for mode in ['gp']:
    for strategy in STRATEGIES:
        for k in K:
            for dataset in DATASETS:
                directory = "%s/%s/%s/%d/output-%s" % (RESULTS_PATH, mode, strategy, k, dataset)
                
                # Files removed before rebuilding. save_as_csv opens its target
                # in append mode, so existing output has to be deleted first.
                # Several of these names are not written by any code visible in
                # this section (presumably left over from earlier runs).
                files = [
                    "%s/non_dominated_fitness_training.csv" % directory,
                    "%s/non_dominated_fitness_validation.csv" % directory,
                    "%s/non_dominated_fitness_test.csv" % directory,                    
                    "%s/training_ensemble_stacking.csv" % directory,
                    "%s/training_ensemble_unweighted.csv" % directory,
                    "%s/training_ensemble_weighted.csv" % directory,
                    "%s/validation_ensemble_stacking.csv" % directory,
                    "%s/validation_ensemble_unweighted.csv" % directory,
                    "%s/validation_ensemble_weighted.csv" % directory,
                    "%s/test_ensemble_stacking.csv" % directory,
                    "%s/test_ensemble_unweighted.csv" % directory,                    
                    "%s/test_ensemble_weighted.csv" % directory,                    
                    "%s/ensemble_stacking.csv" % directory,
                    "%s/ensemble_stacking_training.csv" % directory,
                    "%s/ensemble_stacking_validation.csv" % directory,
                    "%s/ensemble_stacking_test.csv" % directory,
                    "%s/ensemble_unweighted.csv" % directory,
                    "%s/ensemble_unweighted_training.csv" % directory,
                    "%s/ensemble_unweighted_validation.csv" % directory,
                    "%s/ensemble_unweighted_test.csv" % directory,
                    "%s/ensemble_weighted.csv" % directory,
                    "%s/ensemble_weighted_training.csv" % directory,
                    "%s/ensemble_weighted_validation.csv" % directory,
                    "%s/ensemble_weighted_test.csv" % directory,
                    "%s/training_ensemble_stacking_fitness.csv" % directory,
                    "%s/training_ensemble_unweighted_fitness.csv" % directory,
                    "%s/training_ensemble_weighted_fitness.csv" % directory,
                    "%s/validation_ensemble_stacking_fitness.csv" % directory,
                    "%s/validation_ensemble_unweighted_fitness.csv" % directory,
                    "%s/validation_ensemble_weighted_fitness.csv" % directory,
                    "%s/test_ensemble_stacking_fitness.csv" % directory,
                    "%s/test_ensemble_unweighted_fitness.csv" % directory,
                    "%s/test_ensemble_weighted_fitness.csv" % directory                                        
                ]
                
                for file in files:
                    if os.path.exists(file):
                        os.remove(file)
                
                print("Building new ensembles of %s" % directory)
                # save_ensembles writes its CSV files as a side effect; the
                # dict returned by non_dominated_ensemble is discarded.
                non_dominated_ensemble(directory, save_ensembles)

Building new ensembles of /Volumes/externo/gsgp-mo/results/gp/random/2/output-bioavailability
Building new ensembles of /Volumes/externo/gsgp-mo/results/gp/random/2/output-ccn
Building new ensembles of /Volumes/externo/gsgp-mo/results/gp/random/2/output-ccun
Building new ensembles of /Volumes/externo/gsgp-mo/results/gp/random/2/output-concrete
Building new ensembles of /Volumes/externo/gsgp-mo/results/gp/random/2/output-energyCooling
Building new ensembles of /Volumes/externo/gsgp-mo/results/gp/random/2/output-energyHeating
Building new ensembles of /Volumes/externo/gsgp-mo/results/gp/random/2/output-keijzer-7
Building new ensembles of /Volumes/externo/gsgp-mo/results/gp/random/2/output-parkinsons
Building new ensembles of /Volumes/externo/gsgp-mo/results/gp/random/2/output-towerData
Building new ensembles of /Volumes/externo/gsgp-mo/results/gp/random/2/output-vladislavleva-1
Building new ensembles of /Volumes/externo/gsgp-mo/results/gp/random/2/output-wineRed
Building new ensembles of

In [494]:
def read_ensemble(path):
    """Read a CSV file of numbers and return one float array per line."""
    with open(path) as handle:
        rows = handle.readlines()

    return [np.array(row.split(",")).astype(float) for row in rows]
    
def to_file(filename, content):
    """Overwrite ``filename`` with one ``<int>,<float>`` line per row of ``content``.

    Only the first two items of each row are written: the first as ``%d`` and
    the second as ``%f``.
    """
    with open(filename, 'w') as handle:
        lines = ["%d,%f\n" % (row[0], row[1]) for row in content]
        handle.writelines(lines)
    
def save_fitness(path, mode):
    """Compute the RMSE of the three stored ensembles for one data split and write it to CSV.

    Reads ``ensemble_stacking_<mode>.csv``, ``ensemble_unweighted_<mode>.csv``
    and ``ensemble_weighted_<mode>.csv`` under ``path``. For every row it
    compares the stored outputs with the expected outputs taken from the groups
    file of the same execution, then writes one ``execution,rmse`` line per row
    to ``<mode>_ensemble_<method>_fitness.csv``.

    Args:
        path: Result directory holding the ensemble and groups files.
        mode: ``'training'``, ``'validation'`` or ``'test'``.
    """
    stacking = read_ensemble("%s/ensemble_stacking_%s.csv" % (path, mode))
    unweighted = read_ensemble("%s/ensemble_unweighted_%s.csv" % (path, mode))
    weighted = read_ensemble("%s/ensemble_weighted_%s.csv" % (path, mode))

    stacking_fitness, unweighted_fitness, weighted_fitness = [], [], []

    assert len(stacking) == len(unweighted) == len(weighted)
    for index, stacking_row in enumerate(stacking):
        unweighted_row = unweighted[index]
        weighted_row = weighted[index]
        # The first column is the execution number; all three files must agree.
        assert stacking_row[0] == unweighted_row[0] == weighted_row[0]

        execution = int(stacking_row[0])
        _, tr_groups, val_groups, ts_groups = parse_groups(path, execution)

        groups_by_mode = {
            'training': tr_groups,
            'validation': val_groups,
            'test': ts_groups,
        }
        # An unknown mode leaves this as None.
        curr_groups = groups_by_mode.get(mode)

        # Transposing the rows exposes the last column: the expected outputs.
        expected_y = np.array(list(zip(*curr_groups))[-1]).astype(float)

        predicted_stacking_y = stacking_row[1:]
        predicted_unweighted_y = unweighted_row[1:]
        predicted_weighted_y = weighted_row[1:]

        assert len(expected_y) == len(predicted_stacking_y) == len(predicted_unweighted_y) == len(predicted_weighted_y)

        stacking_fitness.append([execution, rmse(predicted_stacking_y, expected_y)])
        unweighted_fitness.append([execution, rmse(predicted_unweighted_y, expected_y)])
        weighted_fitness.append([execution, rmse(predicted_weighted_y, expected_y)])

    to_file("%s/%s_ensemble_stacking_fitness.csv" % (path, mode), stacking_fitness)
    to_file("%s/%s_ensemble_unweighted_fitness.csv" % (path, mode), unweighted_fitness)
    to_file("%s/%s_ensemble_weighted_fitness.csv" % (path, mode), weighted_fitness)

In [495]:
# Compute the ensemble fitness files of every strategy / k / dataset result
# directory. Only the 'gp' mode is processed by this loop.
for mode in ['gp']:
    for strategy in STRATEGIES:
        for k in K:
            for dataset in DATASETS:
                directory = "%s/%s/%s/%d/output-%s" % (RESULTS_PATH, mode, strategy, k, dataset)
                
                # NOTE(review): save_fitness writes "<mode>_ensemble_*_fitness.csv"
                # (training_/validation_/test_ prefix), so the names removed here
                # are not the files it produces. Those files are opened in write
                # mode and overwritten anyway -- confirm whether this cleanup is
                # still needed.
                files = [
                    "%s/ensemble_stacking_fitness.csv" % directory,
                    "%s/ensemble_unweighted_fitness.csv" % directory,
                    "%s/ensemble_weighted_fitness.csv" % directory
                ]
                
                for file in files:
                    if os.path.exists(file):
                        os.remove(file)                
                
                print("Computing fitness of %s" % directory)
                
                # One pass per data split; each call writes three fitness files.
                save_fitness(directory, 'training')
                save_fitness(directory, 'validation')                
                save_fitness(directory, 'test')                

Computing fitness of /Volumes/externo/gsgp-mo/results/gp/random/2/output-bioavailability
Computing fitness of /Volumes/externo/gsgp-mo/results/gp/random/2/output-ccn
Computing fitness of /Volumes/externo/gsgp-mo/results/gp/random/2/output-ccun
Computing fitness of /Volumes/externo/gsgp-mo/results/gp/random/2/output-concrete
Computing fitness of /Volumes/externo/gsgp-mo/results/gp/random/2/output-energyCooling
Computing fitness of /Volumes/externo/gsgp-mo/results/gp/random/2/output-energyHeating
Computing fitness of /Volumes/externo/gsgp-mo/results/gp/random/2/output-keijzer-7
Computing fitness of /Volumes/externo/gsgp-mo/results/gp/random/2/output-parkinsons
Computing fitness of /Volumes/externo/gsgp-mo/results/gp/random/2/output-towerData
Computing fitness of /Volumes/externo/gsgp-mo/results/gp/random/2/output-vladislavleva-1
Computing fitness of /Volumes/externo/gsgp-mo/results/gp/random/2/output-wineRed
Computing fitness of /Volumes/externo/gsgp-mo/results/gp/random/2/output-wineWhi