In [260]:
import numpy as np
import math
from sklearn.manifold import TSNE

# --- Locations --------------------------------------------------------------
# Root folder the result files are read from, and folder the PDFs are saved to.
BASE_PATH = "/Volumes/externo/gsgp-mo/results/"
OUTPUT_PATH = "/Volumes/externo/assets"

# --- Experiment grid --------------------------------------------------------
# Strategy identifiers (used as a path component) and their display titles,
# kept in the same order so they can be indexed in parallel.
STRATEGIES = [
    'random',
    'kmeans',
    'kernel',
]
STRATEGIES_TITLES = [
    'Random',
    'K-Means',
    'Kernel K-Means',
]
# Values of k explored: 2, 3, 4 and 5.
K = range(2, 6)

# --- Palettes ---------------------------------------------------------------
COLORS = [
    "#5F6CAF",
    "#FFB677",
    "#F0134D",
]
STRATEGY_COLORS = [
    "#FE9801",
    "#F65C78",
    "#018383",
]

# --- Dataset names ----------------------------------------------------------
DATASETS = [
    'bioavailability',
    'ccn',
    'ccun',
    'concrete',
    'energyCooling',
    'energyHeating',
    'keijzer-7',
    'parkinsons',
    'towerData',
    'vladislavleva-1',
    'wineRed',
    'wineWhite',
    'yacht',
]

def bit_to_int(value):
    """Return the base-2 logarithm of ``value`` truncated to an integer.

    ``groups_colors`` applies this to the integer stored in each row's group
    column and uses the result as a region index. The name suggests that
    column holds a single-bit mask (1, 2, 4, ...) -- TODO confirm against the
    file format.

    Args:
        value: A positive number (Python or NumPy scalar).

    Returns:
        int: ``int(log2(value))``; for an exact power of two ``2 ** n`` this
        is ``n``.

    Raises:
        ValueError: If ``value`` is zero or negative.
    """
    # math.log(value, 2) is evaluated as log(value) / log(2); the quotient can
    # land just below the true integer and int() would then truncate to the
    # wrong index. math.log2 is exact for powers of two.
    return int(math.log2(value))

def parse_groups(path, execution = 0):
    """Read one ``groups-NN.txt`` file and split its rows by partition.

    The first line of the file is parsed as the integer ``k``. Every following
    line is split on commas and its first field selects the partition:
    ``TRAINING``, ``VALIDATION`` or ``TEST``. Lines whose first field is
    anything else (including blank lines) are ignored.

    Args:
        path: Directory that contains the groups files.
        execution: Zero-based execution index; the file opened is
            ``groups-%02d.txt`` formatted with ``execution + 1``.

    Returns:
        tuple: ``(k, training, validation, test)``. ``k`` is an int; the other
        three are lists of rows in file order, each row being a list of
        strings that still starts with the partition label.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If the file is empty.
        ValueError: If the first line is not an integer.
    """
    # The context manager releases the handle even if reading fails.
    with open("%s/groups-%02d.txt" % (path, execution + 1)) as handle:
        content = handle.read().splitlines()

    k = int(content[0])

    # Bucket the rows in a single pass instead of filtering the list three times.
    partitions = {"TRAINING": [], "VALIDATION": [], "TEST": []}
    for line in content[1:]:
        row = line.split(',')
        bucket = partitions.get(row[0])
        if bucket is not None:
            bucket.append(row)

    return k, partitions["TRAINING"], partitions["VALIDATION"], partitions["TEST"]

In [889]:
from sklearn.preprocessing import Normalizer
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

def embbed_x_and_y(dataset, execution = 0):
    """Compute a 2-D t-SNE embedding of a dataset's training rows.

    The training rows are always read from the ``gp/random/2`` results folder
    of ``dataset``, independent of the strategy or k being plotted.

    Args:
        dataset: Dataset name used to build the ``output-<dataset>`` folder.
        execution: Zero-based execution index forwarded to ``parse_groups``.

    Returns:
        tuple: ``(tsne, x1, x2, y)`` where ``tsne`` is the fitted ``TSNE``
        estimator, ``x1`` and ``x2`` are tuples holding the two embedded
        coordinates of every training row, and ``y`` is the last column of
        the rows after being passed through ``Normalizer`` as one sample.
    """
    path = "%s/%s/%s/%s/output-%s/" % (BASE_PATH, 'gp', "random", 2, dataset)
    _, training, _, _ = parse_groups(path, execution)

    # Drop the leading partition label; column 0 is then the group column.
    # A plain ndarray is used instead of the discouraged np.matrix class.
    rows = np.array([row[1:] for row in training])

    # NOTE(review): the slice 1: keeps the last column, which is also the
    # column used as y below, so the target takes part in the embedding.
    # If that is unintended the slice should be 1:-1 -- confirm before changing,
    # since it alters every generated figure.
    x = np.squeeze(rows[:, 1:].astype('float'))
    # Wrapped in a list so Normalizer sees y as a single sample (one row).
    y = [np.squeeze(rows[:, -1].astype('float'))]

    x_norm = Normalizer().fit(x).transform(x)
    y_norm = Normalizer().fit(y).transform(y)

    # NOTE(review): newer scikit-learn releases appear to rename `n_iter` to
    # `max_iter`; kept as-is to match the environment this notebook ran in.
    tsne = TSNE(n_components=2, n_iter=5000, perplexity=30.0, learning_rate=200.0, random_state=123456)
    x_embedded = tsne.fit_transform(x_norm)
    x1, x2 = zip(*x_embedded)

    return tsne, x1, x2, y_norm[0]

def groups_colors(dataset, strategy, k, execution = 0):
    """Map every training row to the colour, marker and label of its region.

    Args:
        dataset: Dataset name used to build the ``output-<dataset>`` folder.
        strategy: Strategy name used as a path component.
        k: Value of k used as a path component.
        execution: Zero-based execution index forwarded to ``parse_groups``.

    Returns:
        zip: An iterator yielding three tuples -- colours, markers and labels
        (``"Region N"``) -- each with one entry per training row, in file order.

    Raises:
        IndexError: If a row's region index is outside the five predefined
            colours/markers.
    """
    path = "%s/%s/%s/%s/output-%s/" % (BASE_PATH, 'gp', strategy, k, dataset)
    # The k stored in the file is not needed here; discarding it also avoids
    # overwriting the `k` argument.
    _, training, _, _ = parse_groups(path, execution)

    colors = ["red", "blue", "green", "orange", "purple"]
    marker = ["o", "v", "s", "*", "D"]

    # row[0] is the partition label and row[1] the group column. Reading it
    # directly (rather than via a squeezed matrix column) also works when
    # there is exactly one training row.
    regions = [bit_to_int(int(row[1])) for row in training]

    return zip(*[(colors[region], marker[region], "Region %s" % (region + 1))
                 for region in regions])
    
def plot_regions(dataset,):
    """Draw a grid of 3-D scatter plots of the regions found for ``dataset``.

    The grid has one row per entry in ``STRATEGIES`` and one column per value
    in ``K``. Every panel plots the same points -- the two t-SNE coordinates
    from ``embbed_x_and_y`` on the horizontal axes and the normalised target
    on the vertical axis -- coloured by the region ``groups_colors`` reports
    for that strategy and k. The markers returned by ``groups_colors`` are
    not used.

    Args:
        dataset: Dataset name.

    Returns:
        tuple: ``(tsne, f)`` -- the fitted ``TSNE`` estimator and the figure.
    """
    tsne, x1, x2, y = embbed_x_and_y(dataset)

    # Create the 3-D axes directly rather than creating 2-D axes and replacing
    # each one; this also removes the hand-computed subplot index, which used
    # len(STRATEGIES) + 1 as the row stride instead of len(K).
    f, ax = plt.subplots(len(STRATEGIES), len(K), figsize=(16, 8), squeeze=False,
                         subplot_kw={'projection': '3d', 'facecolor': 'white'})
    f.suptitle(dataset.upper(), fontsize=13, y=1)

    for row, strategy in enumerate(STRATEGIES):
        for col, k in enumerate(K):
            colors, _, labels = groups_colors(dataset, strategy, k)

            panel = ax[row, col]
            panel.set_xticks([])
            panel.set_yticks([])
            panel.set_zticks([])
            panel.grid(True)

            # Bucket the points by region label in one pass.
            grouped = {}
            for p_x1, p_x2, p_y, color, label in zip(x1, x2, y, colors, labels):
                grouped.setdefault(label, []).append((p_x1, p_x2, p_y, color))

            # Sorted so the legend entries come out in region order.
            for label in sorted(grouped):
                l_x1, l_x2, l_y, l_c = zip(*grouped[label])
                panel.scatter(l_x1, l_x2, l_y, color=l_c, label=label, alpha=0.3)

    # The titles are added once, after every panel exists. Inside the strategy
    # loop they were drawn once per row, stacking duplicate text.
    pad = 5

    # Set a top title of each column
    for curr_col, curr_ax in enumerate(ax[0]):
        curr_ax.annotate("K = %s" % K[curr_col], xy=(0.5, 1), xytext=(0, pad),
                         xycoords='axes fraction', textcoords='offset points',
                         size='large', ha='center', va='baseline')

    # Set a left title for each row
    for curr_row, curr_ax in enumerate(ax[:, 0]):
        curr_ax.annotate(STRATEGIES_TITLES[curr_row], xy=(0, 0.5),
                         xytext=(-curr_ax.yaxis.labelpad - pad, 0),
                         xycoords='axes fraction', textcoords='offset points',
                         size='large', ha='right', va='center', rotation=90)

    # Axis labels only on the bottom row (x, y) and the right-most column (z).
    for curr_ax in ax[-1]:
        curr_ax.set_xlabel('$x_1$')
        curr_ax.set_ylabel('$x_2$')

    for curr_ax in ax[:, -1]:
        curr_ax.set_zlabel('y', linespacing=0)

    # One shared legend, taken from the last panel of the first row.
    h, l = ax[0, -1].get_legend_handles_labels()
    f.legend(h, l, loc='lower center', ncol=5, labelspacing=0.)
    f.tight_layout(rect=[0, 0.05, 1.05, 0.9])

    return (tsne, f)

In [890]:
# Render and export the region figure of each selected dataset as a PDF,
# reporting the KL divergence of the fitted t-SNE embedding.
selected = ['bioavailability', 'concrete', 'wineRed']
for dataset in selected:
    print("Generating region asset for %s" % (dataset))
    tsne, f = plot_regions(dataset)
    f.savefig("%s/regions_%s.pdf" % (OUTPUT_PATH, dataset), bbox_inches='tight', pad_inches=0.5)
    print("KL Divergence: %.5f" % tsne.kl_divergence_)
    # Close exactly the figure that was just saved rather than whichever
    # figure happens to be current.
    plt.close(f)

Generating region asset for bioavailability
KL Divergence: 0.21132
Generating region asset for concrete
KL Divergence: 0.36555
Generating region asset for wineRed
KL Divergence: 0.55156


In [891]:
# Render and export the region figure of each selected dataset as a PDF,
# reporting the KL divergence of the fitted t-SNE embedding.
selected = ['towerData', 'parkinsons']
for dataset in selected:
    print("Generating region asset for %s" % (dataset))
    tsne, f = plot_regions(dataset)
    f.savefig("%s/regions_%s.pdf" % (OUTPUT_PATH, dataset), bbox_inches='tight', pad_inches=0.5)
    print("KL Divergence: %.5f" % tsne.kl_divergence_)
    # Close exactly the figure that was just saved rather than whichever
    # figure happens to be current.
    plt.close(f)

Generating region asset for towerData
KL Divergence: 0.37464
Generating region asset for parkinsons
KL Divergence: 0.58296
