You will need to install matplotlib, seaborn, pandas, and mpld3 to run this notebook.

In [None]:
# for macOS
# !pip3 install --user matplotlib seaborn pandas mpld3
!pip install matplotlib seaborn pandas mpld3

# Plotting Results

In [None]:
import argparse
import csv
import matplotlib.pyplot as plt
import glob
import os
import json
import seaborn as sns
import pandas as pd
from plot_fitness import *
import mpld3
from IPython import display

In [None]:
# Directory whose runs are loaded and plotted by the following cells.
leonhard_directory = "test_scaling_of_naive_Nov_14_115812"

In [None]:
# Load the runs stored under leonhard_directory.
# NOTE(review): extract_all_run_values is not defined in this notebook; presumably it is
# provided by the star import of plot_fitness — confirm.
experiments, dataframes = extract_all_run_values(leonhard_directory)
# last expression of the cell, so the notebook displays it
experiments

In [None]:
# Fitness over epochs for the first extracted experiment; only epochs
# greater than 150 are plotted.
fig, ax = plt.subplots()
sns.lineplot(data=dataframes[0].loc[dataframes[0]["epoch"] > 150],
             x="epoch", y="fitness", ax=ax)
ax.set_title(experiments[0])
ax

In [None]:
# Write the figure created in the previous cell to an SVG file.
fig.savefig("naive_n_32.svg")

In [None]:
# Fitness over epochs for the experiment at index 4; only epochs greater
# than 150 are plotted.
fig, ax = plt.subplots()
sns.lineplot(data=dataframes[4].loc[dataframes[4]["epoch"] > 150],
             x="epoch", y="fitness", ax=ax)
ax.set_title(experiments[4])
ax

In [None]:
# Write the figure created in the previous cell to an SVG file.
fig.savefig("naive_n_1.svg")

In [None]:

# directory name is needed

data_dir = "grid_search_island_Nov_15_163809"


# parse JSON to identify variable parameters

all_names = os.listdir(data_dir)

# every directory entry whose name contains ".json" is a candidate
json_filename = list(filter(lambda x: ".json" in x, all_names))

if len(json_filename) == 0:
    print("No JSON file specifying the experiment was found in directory {}.".format(data_dir))
    exit(1)
if len(json_filename) > 1:
    # only a warning: the first candidate is used below
    print("Found multiple JSON files ({}) in directory {}.".format(json_filename, data_dir))

json_filename = json_filename[0]


# the context manager closes the file even if parsing raises
try:
    with open(os.path.join(data_dir, json_filename), mode="r") as json_file:
        json_experiment = json.load(json_file)
except OSError:
    print("Failed to open file {}.".format(json_filename))
    exit(1)


# print the contents of the JSON file

# experiment name (expected to be a string)
json_experiment_name = json_experiment["name"]
assert type(json_experiment_name) is str

print("<name>")
print(json_experiment_name)

# number of repetitions of every job (expected to be an int)
json_repetitions = json_experiment["repetitions"]
assert type(json_repetitions) is int

print("<repetitions>")
print(json_repetitions)

# parameters shared by all jobs (expected to be a dictionary)
json_fixed_params = json_experiment["fixed_params"]
assert type(json_fixed_params) is dict

print("<fixed parameters>")
for key, value in json_fixed_params.items():
    print("{} {}".format(key, value))

# parameters varied by the grid search (expected to be a dictionary)
json_variable_params = json_experiment["variable_params"]
assert type(json_variable_params) is dict

print("<variable parameters>")
for key, value in json_variable_params.items():

    # value is a dictionary with a "type" string and a "list" of values;
    # only the type "list" is accepted here
    assert type(value["type"]) is str
    assert type(value["list"]) is list

    value_type = value["type"]
    assert value_type == "list"

    value_range = value["list"]

    print("{} {}".format(key, value_range))


# store the variable parameters
# (two parallel lists: the parameter names and, for each name, its list of values)

variable_params_names = list(json_variable_params)
variable_params_lists = [spec["list"] for spec in json_variable_params.values()]



In [None]:

# the idea is to get the value of all variable parameters from the file 'leonhard.log'
# which is stored inside each folder of the experiment (each folder contains a separate job)

logfile_name = "leonhard.log"

# list of all job directories (every sub-directory of data_dir)

job_dirs = [os.path.join(data_dir, f) for f in os.listdir(data_dir)
            if os.path.isdir(os.path.join(data_dir, f))]


for job_dir in job_dirs:

    print("<{}>".format(job_dir))

    logfile_path = os.path.join(job_dir, logfile_name)

    try:
        logfile = open(logfile_path, mode="r")
    except OSError:
        print("Failed to open {}.".format(logfile_path))
        exit(1)

    # the context manager closes the log once all parameters were looked up
    with logfile:

        for param_name in variable_params_names:

            param_found = False

            for line in logfile:
                for word in line.split():

                    # the word right after the parameter name is printed as its value
                    if param_found:
                        print("{} {}".format(param_name, word))
                        break

                    if param_name == word:
                        param_found = True

                # only the first line mentioning the parameter is considered
                if param_found:
                    break

            # reset the cursor to the beginning of the file
            logfile.seek(0)
        
        

In [None]:

# Debug output: print every repetition index from 0 to json_repetitions - 1.
for rep in range(json_repetitions):
    print(rep)



In [None]:

# create the big pandas tables

import sys

sys.path.append('../logging')
from process_log import Tags, Log, Epochs

tags = Tags("../logging/tags.hpp")

# deal with repetitions
# (cut everything from the last "_" on, so that all repetitions of a job
# share one stem; the stems are unique and sorted)

job_stems = sorted({job_path.rpartition("_")[0] for job_path in job_dirs})


# Build one pandas table for all jobs, repetitions and ranks.
# The per-rank frames are collected in a list and concatenated once at the
# end: DataFrame.append is not available in pandas >= 2.0 and copies the
# whole table on every call.

rank_frames = []


for job in job_stems:

    for repetition in range(json_repetitions):

        job_dir = job + "_" + str(repetition)

        logfile_path = os.path.join(job_dir, logfile_name)

        # use key value pairs to store the current values of the
        # variable parameters
        curr_variable_params = dict()

        try:
            logfile = open(logfile_path, mode="r")
        except OSError:
            print("Failed to open {}.".format(logfile_path))
            exit(1)

        # the context manager closes the log once all parameters were read
        with logfile:

            for param_name in variable_params_names:

                param_found = False

                for line in logfile:
                    for word in line.split():

                        # the word right after the parameter name is its value
                        if param_found:
                            curr_variable_params[param_name] = int(word)
                            break

                        if param_name == word:
                            param_found = True

                    if param_found:
                        break

                # reset the cursor to the beginning of the file
                logfile.seek(0)

        # aggregate rank data: every entry whose name contains ".bin" is read

        job_dir_contents = os.listdir(job_dir)
        job_dir_contents = list(filter(lambda x: ".bin" in x, job_dir_contents))

        for rank_data in job_dir_contents:  # corresponds to iteration over ranks

            # the rank is the second to last "_"-separated field of the file name
            rank = int(rank_data.split("_")[-2])
            log = Log(os.path.join(job_dir, rank_data), tags)
            epochs = Epochs(log, tags)

            df_curr_rank = pd.DataFrame(epochs.get_fitness_vs_time_dataframe(),
                                        columns=["fitness", "wall clock time", "epoch"])

            df_curr_rank["rank"] = rank  # varies always because of aggregation
            df_curr_rank["rep"] = repetition  # varies always because of confidence interval

            for key, value in curr_variable_params.items():  # variable parameters of grid search
                df_curr_rank[key] = value

            rank_frames.append(df_curr_rank)


# one pandas for all repetitions and ranks (empty if nothing was found)
df = pd.concat(rank_frames, ignore_index=True) if rank_frames else pd.DataFrame()


# combine all parameters of the grid search into a single column which
# can be used as a hue for seaborn

# column name: the parameter names joined by spaces
hue_grid_search = " ".join(variable_params_names)

# every row gets the tuple of its variable parameter values
df[hue_grid_search] = list(zip(*[df[variable_param] for
                                 variable_param in variable_params_names]))

# drop the individual parameter columns; with a single variable parameter the
# hue column carries that parameter's name and must not be dropped again
df = df.drop(columns=[variable_param for variable_param in variable_params_names
                      if variable_param != hue_grid_search])

df
    

In [None]:

# Store the combined table as a gzip-compressed CSV file.
# NOTE(review): the file name contains a stray "Ich" and a space, which looks like an
# editing accident — confirm the intended name.
df.to_csv("grid_searchIch .gz", compression="gzip")


In [None]:

# Take out rank variation: per (epoch, repetition, grid point) keep the
# minimum fitness over all ranks; the aggregated wall clock time is
# discarded again right away
new_df = (
    df.groupby(["epoch", "rep", hue_grid_search], as_index=False)
    .agg({"fitness": "min", "wall clock time": "max"})
    .drop(columns="wall clock time")
)
new_df


In [None]:
# first component of the grid points to keep
per = 200 # 1, 5, 10, 15, 20, 50, 100, 200

# keep only the rows whose grid point is (per, <one of the listed values>)
df_plot = new_df.copy()
df_plot = df_plot.loc[
    df_plot[hue_grid_search].isin([(per, second) for second in (1, 5, 25, 50, 75, 100, 250)])
]

df_plot

In [None]:
import matplotlib.ticker as ticker

# Fitness over epochs with one line per grid point; the axes are limited to
# a fixed window before the lines are drawn.
fig, ax = plt.subplots(figsize=(20, 10))
ax.set(xlim=(100, 1000), ylim=(7500, 11500))
sns.lineplot(data=df_plot, x="epoch", y="fitness", hue=hue_grid_search,
             legend="full", ax=ax)

fig.savefig("grid_search_200.svg")


In [None]:
import numpy as np
import math

# This is for the Island Grid Search



# truncation, truncation
# truncation, dejong
# universal, truncation
# universal, dejong
# tournament, truncation
# tournament, dejong

# combination of policies to plot
selection_string = "tournament" # truncation, universal, tournament
replacement_string = "dejong" # truncation, dejong

# positions of the two heat map parameters within the "_"-separated
# name of an experiment directory
name_idx_X = 6 # migration period
name_idx_Y = 7 # migration amount

best_known_solution = 7542
threshold_pc = 110 # set fitness threshold to 110% of best known

# absolute fitness threshold
threshold = (threshold_pc / 100) * best_known_solution


# throw out JSON and DS_Store (only names ending in "0" are kept)
tmp_dirs = [name for name in os.listdir(data_dir) if name.endswith("0")]

# keep the directories whose "_"-separated name contains both the
# selection and the replacement policy
experiment_dirs = list()

for tmp_dir in tmp_dirs:

    tmp_dir_name_list = tmp_dir.split("_")

    if selection_string != replacement_string:
        if selection_string in tmp_dir_name_list and replacement_string in tmp_dir_name_list:
            experiment_dirs.append(tmp_dir)

    # identical policy names have to show up twice in the directory name
    elif tmp_dir_name_list.count(replacement_string) == 2:
        experiment_dirs.append(tmp_dir)


num_experiments = len(experiment_dirs)
print(num_experiments)
assert num_experiments == 6 * 8

print(experiment_dirs)

# filter according to selection and replacement




# create a map for indexing the axes
# (sets guarantee that every parameter value is stored only once)
valuesX, valuesY = set(), set()

for exp_dir in experiment_dirs:

    # the parameter values are encoded in the name of the experiment directory
    name_fields = exp_dir.split("_")

    valuesX.add(int(name_fields[name_idx_X]))
    valuesY.add(int(name_fields[name_idx_Y]))


# sorted lists define the order along the axes
valuesX = sorted(valuesX)
valuesY = sorted(valuesY)

# create a value to index map
value_to_idx_map_x = {value: position for position, value in enumerate(valuesX)}
value_to_idx_map_y = {value: position for position, value in enumerate(valuesY)}


# one row per X value, one column per Y value
data_heat_map = np.zeros((len(valuesX), len(valuesY)))
#print(data_heat_map)
#print(len(valuesX))
    

# Fill the heat map: for every experiment count the iterations whose best
# fitness over all ranks is still above the threshold.
for exp_dir in experiment_dirs:

    # determine idx_x, idx_y based on val_x, val_y
    exp_dir_name_list = exp_dir.split("_")

    val_x = int(exp_dir_name_list[name_idx_X])
    val_y = int(exp_dir_name_list[name_idx_Y])

    idx_x = value_to_idx_map_x[val_x]
    idx_y = value_to_idx_map_y[val_y]


    # aggregate the CSV data
    # determine the index of the iteration where the threshold was reached

    path_curr_experiment = os.path.join(data_dir, exp_dir)

    # get CSV files (one file per rank)
    rank_CSVs = [name for name in os.listdir(path_curr_experiment) if name[-3:] == "csv"]

    if not rank_CSVs:
        raise FileNotFoundError("No rank CSV files found in {}.".format(path_curr_experiment))

    # one fitness series per rank, indexed by the iteration index "idx";
    # only the first row of every iteration is kept
    rank_fitness = []

    for rank_CSV in rank_CSVs:

        path_curr_rank_CSV = os.path.join(path_curr_experiment, rank_CSV)

        df_tmp = pd.read_csv(path_curr_rank_CSV, names=["idx", "fitness"])
        df_tmp = df_tmp.drop_duplicates(subset="idx", keep="first")

        rank_fitness.append(df_tmp.set_index("idx")["fitness"])

    # Aggregate rank data to get the best global individual for all iterations.
    # The ranks are aligned on "idx" instead of on the row labels left over
    # by drop_duplicates, which differ between ranks as soon as their
    # duplicated rows sit at different positions.
    df_aggregated = (
        pd.concat(rank_fitness, axis=1)
        .min(axis=1)
        .rename("fitness")
        .rename_axis("idx")
        .reset_index()
    )

    df_aggregated = df_aggregated.loc[df_aggregated["fitness"] > threshold]
    threshold_idx = len(df_aggregated.index) - 1  # threshold index

    data_heat_map[idx_x, idx_y] = threshold_idx

    
#fig, axes = plt.subplots(figsize=(10,10))

# Y tick labels: every Y value as a percentage of 500, rounded up
# (500 is presumably the population size — TODO confirm)
valuesY_pc = [str(math.ceil(val / 500 * 100)) + "%" for val in valuesY]


# the matrix is transposed so that X values run along the horizontal axis
heatmap_plot = sns.heatmap(data_heat_map.T,
                           xticklabels=valuesX, yticklabels=valuesY_pc,
                           vmin=0, vmax=800,
                           cmap=sns.cm.rocket_r)

plt.xlabel("migration period")
plt.ylabel("migration amount (relative)")
plt.yticks(rotation=0) # rotate y labels
plt.show()

# store the heat map
fig = heatmap_plot.get_figure()
fig.savefig("tournament_dejong_fc.eps")

# x is period
# y is size


#tp = np.zeros((len(valuesX), len(valuesY)))

#for i, val_x in enumerate(valuesX):
#    for j, val_y in enumerate(valuesY):
        
#        tp[i,j] = math.ceil((float(val_y) / float(val_x)) / float(500) * float(100))
        
        
#sns.heatmap(np.transpose(tp), 
#            xticklabels=valuesX, yticklabels=valuesY_pc,
#            vmin=0, vmax=20,
#            cmap=sns.cm.rocket_r)





In [None]:
#def param_to_list(param):
#    if param["type"] == "range":
#        start = param["min"]
#        end = param["max"]
#        step = 1 if "stride" not in param else param["stride"]
#        return list(range(start, end, step))
#    elif param["type"] == "list":
#        return param["list"]
    

#def create_folder_name():
#    return 0


#parameters = ["selection_policy", "topology", "replacement_policy", 
#              "migration_period", "migration_amount", "rank"]

#grid_parameters = ["migration_period", "migration_amount"]

#idxX = parameters.index(grid_parameters[0])
#idxY = parameters.index(grid_parameters[1])

#print(idxX)
#print(idxY)




# Read JSON and parse parameters -> use this to identify files

#all_names = os.listdir(directory_name)

# Validate JSON
#json_filename = list(filter(lambda x: ".json" in x, all_names))

#if len(json_filename) == 0:
#    print("No JSON file specifying the experiment was not fond in directory {}. ".format(directory_name))
#    exit(1)
#if len(json_filename) > 1:
#    print("Found multiple JSON files ({}) in directory {}.".format(json_filename, directory_name))

#json_filename = json_filename[0]


#try:
#    json_file = open(os.path.join(directory_name, json_filename), mode="r")    
#except OSError:
#    print("Failed to open file {}.".format(json_filename))
#    exit(1)
    
#experiment_spec = json.load(json_file)
#json_file.close()


#experiment_name = experiment_spec["name"]

#variable_params_names = list()
#value_lists = list()

#for param_name, param in experiment_spec["variable_params"].items():
#    param_list = param_to_list(param)
    
#    variable_params_names.append(param_name)
#    value_lists.append(param_list)


# debug
#print(variable_params_names)
#print(value_lists)


#for idx, element in enumerate(itertools.product(*value_lists)):
    
#    nam = "_".join(str(v) for v in element).replace(".", "").replace("-", "")
#    nam += "_" + str(0) # NO REPETITIONS
    
#    print(nam)
    
    
#variable_params_names.sort()
#print(variable_params_names)




# Provide names of exactly two parameters

# Create a 2D heatmap based thereon




#smallest = float('Inf')
#idxSmallest = -1





    
    



#for idx in range(0,1):
    
#    print(dataframes[idx])
    
#    print('------------------------')
    
#    df = dataframes[idx]
#    print(df['fitness'].min()) # best value overall
#    print(df['fitness'].idxmin()) # takeover index
    
#    if (df['fitness'].min() < smallest): # keep track of smallest value
#        smallest = df['fitness'].min()
#        idxSmallest = idx
    
#    df = df.loc[df['fitness'] > threshold]
    
#    print(len(df.index)-1) # threshold index
    
    
    
#print(smallest)
#print(idxSmallest)
    

#ax = sns.lineplot(x="epoch", y="fitness", hue="run", data=dataframes[idxSmallest])
#ax.set_title(experiments[idxSmallest])
#ax




