# Parameter tuning dataframes

In [None]:
import os
import re
import json
import pandas as pd
import numpy as np
from natsort import index_natsorted

In [None]:
# Column labels used by the dataframes built and merged in this notebook.
instance_key = 'instance'

# Objective columns: best value over the runs and the per-run average.
best_obj_key, avg_obj_key = 'best obj', 'obj'

# Runtime columns: maximum, minimum and average.
max_time_key, min_time_key, avg_time_key = 'max time', 'min time', 'time'

# Iteration-count columns: maximum, minimum and average.
max_iter_key, min_iter_key, avg_iter_key = 'max iter', 'min iter', 'iter'

# Gap column that merge_sub_dfs inserts into every sub dataframe.
alns_gap_key = 'gap'

## Make dataframes from each parameter setting run

In [None]:
# Aggregate every '*_history*' JSON file of one tuning run into a dataframe
# with one row per instance (best/average objective, min/max/average runtime
# and iteration count).

# Change
run_name = '240521-201238/'

# Constant
project_path = os.path.dirname(os.path.abspath('.'))
directory_path = '/output/solstorm/alns/tuning/'
# Plain concatenation on purpose: directory_path starts with '/', so
# os.path.join(project_path, directory_path) would discard project_path.
run_path = project_path + directory_path + run_name

parameter_key = 'parameters'
parameter_one_key = 'determinism'
# parameter_two_key = 'removal_upper_percentage'
# parameter_three_key = 'new_solution_score'

ordered_instance_li = ['T-9-9-1-1', 'T-11-13-1-1', 'T-13-16-2-1', 'T-15-17-2-1', 'T-17-21-3-1',
                       'T-19-22-2-1', 'T-21-27-3-1', 'T-23-27-3-1', 'T-25-31-4-1', 'T-27-34-5-1']

# instance -> [best_obj, acc_obj, max_time, min_time, acc_time,
#              max_iter, min_iter, acc_iter, nbr_runs]
instance_to_data = {}
parameter_one_values = set()
# parameter_two_values = set()
# parameter_three_values = set()
for file_name in os.listdir(run_path):
    # Raw string: '\.' is an invalid escape sequence in a normal literal.
    split_name = re.split(r'_|\.', file_name)
    # Only files whose third name token is 'history' are read. Names with
    # fewer than three tokens are skipped instead of raising IndexError.
    if len(split_name) < 3 or split_name[2] != 'history':
        continue
    instance_name = split_name[0]

    with open(os.path.join(run_path, file_name), encoding='utf-8') as file:
        history_json = json.load(file)

    parameter_one_value = history_json[parameter_key][parameter_one_key]
    parameter_one_values.add(parameter_one_value)
    # parameter_two_value = history_json[parameter_key][parameter_two_key]
    # parameter_two_values.add(parameter_two_value)
    # parameter_three_value = history_json[parameter_key][parameter_three_key]
    # parameter_three_values.add(parameter_three_value)

    obj = history_json['best_objective']
    # Named 'runtime' rather than 'time' so the stdlib module name is not shadowed.
    runtime = history_json['runtime']
    it = history_json['number_of_iterations']

    if instance_name in instance_to_data:
        data = instance_to_data[instance_name]
        data[0] = min(data[0], obj)      # best objective = smallest seen
        data[1] += obj
        data[2] = max(data[2], runtime)
        data[3] = min(data[3], runtime)
        data[4] += runtime
        data[5] = max(data[5], it)
        data[6] = min(data[6], it)
        data[7] += it
        data[8] += 1
    else:
        instance_to_data[instance_name] = [obj, obj, runtime, runtime, runtime, it, it, it, 1]

if not parameter_one_values:
    # Without this check the cell would end in an unrelated NameError (or
    # silently reuse a stale value from an earlier notebook execution).
    raise ValueError(f'No history files found in {run_path}')

if len(parameter_one_values) > 1:
    print('Multiple parameter values present in run directory!')

# Collect all rows first and build the frame once: DataFrame.append was
# removed in pandas 2.0 and copied the whole frame on every call.
rows = []
for instance, data in instance_to_data.items():
    nbr_sims = data[8]
    rows.append({instance_key: instance,
                 best_obj_key: data[0],
                 avg_obj_key: data[1] / nbr_sims,
                 max_time_key: data[2],
                 min_time_key: data[3],
                 avg_time_key: data[4] / nbr_sims,
                 max_iter_key: data[5],
                 min_iter_key: data[6],
                 avg_iter_key: data[7] / nbr_sims})

df = pd.DataFrame(rows, columns=[instance_key, best_obj_key, avg_obj_key, max_time_key, min_time_key,
                                 avg_time_key, max_iter_key, min_iter_key, avg_iter_key])

# Retrieve parameter values (an arbitrary element if several are present)
parameter_one_value = next(iter(parameter_one_values))
# parameter_two_value = next(iter(parameter_two_values))
# parameter_three_value = next(iter(parameter_three_values))

# NOTE(review): get_run_df reads from 'dataframes/tuning/'; confirm the
# target directory before re-enabling the write below.
file_name = f'dataframes/d-{parameter_one_value}.pkl'

# df.to_pickle(file_name)

## Functions

In [None]:
def sort_df(df, column_name):
    """Return a copy of ``df`` with rows in natural order of ``column_name``.

    The returned frame has a fresh 0..n-1 index; ``df`` itself is not modified.
    """
    def natural_rank(_values):
        # The ranks are derived from the frame's own column; the argument
        # pandas passes to the key function is intentionally not used.
        return np.argsort(index_natsorted(df[column_name]))

    ordered = df.sort_values(by=column_name, key=natural_rank, inplace=False)
    return ordered.reset_index(drop=True)

def get_run_df(file_name, sort_column):
    """Load ``dataframes/tuning/<file_name>`` and return it sorted via ``sort_df``."""
    pickle_path = f'dataframes/tuning/{file_name}'
    return sort_df(pd.read_pickle(pickle_path), sort_column)

def get_sub_df(df, column_names, sort_column):
    """Return a sorted copy of ``df`` restricted to ``column_names``.

    The selection is copied first, so ``df`` is left untouched.
    """
    selection = df[column_names].copy()
    return sort_df(selection, sort_column)

def merge_sub_dfs(sub_dfs):
    """Place the sub dataframes side by side and add a gap column to each.

    For every row, the reference value is the smallest ``best_obj_key`` entry
    across all frames. Each frame receives an ``alns_gap_key`` column holding
    ``abs((avg - reference) / avg * 100)``, where ``avg`` is that frame's own
    ``avg_obj_key`` entry. The frames are then concatenated column-wise, the
    first column of every frame after the first is dropped, and the result is
    rounded to one decimal.

    Args:
        sub_dfs: Non-empty list of dataframes with the same number of rows in
            the same row order. Each must contain the ``best_obj_key`` and
            ``avg_obj_key`` columns and at least three columns in total.
            The first column is assumed to be the instance column, as in the
            column lists built in this notebook.

    Returns:
        A new dataframe; the inputs are not modified.

    Raises:
        ValueError: If ``sub_dfs`` is empty.
        KeyError: If a frame lacks ``best_obj_key`` or ``avg_obj_key``.
    """
    frames = [frame.copy() for frame in sub_dfs]
    if not frames:
        raise ValueError('merge_sub_dfs needs at least one dataframe')

    # Row-wise minimum of the best objective over all parameter settings.
    best_obj = np.minimum.reduce(
        [frame[best_obj_key].to_numpy(dtype='float64') for frame in frames]
    )

    for frame in frames:
        # Each gap uses the frame's OWN average objective in both numerator
        # and denominator (frames three to five used to read the numerator
        # from frame one or two).
        avg_obj = frame[avg_obj_key].to_numpy(dtype='float64')
        gaps = np.abs(((avg_obj - best_obj) / avg_obj) * 100)
        # An array is inserted positionally, so the frame's index labels
        # do not matter here.
        frame.insert(3, alns_gap_key, gaps)

    df_total = pd.concat(frames, axis=1)

    # Drop duplicate instance columns: the first column of every frame after
    # the first (positions 6, 12, 18, 24 for five six-column frames).
    widths = [frame.shape[1] for frame in frames]
    duplicate_positions = set(np.cumsum(widths[:-1]).tolist())
    keep = [j for j in range(df_total.shape[1]) if j not in duplicate_positions]
    df_total = df_total.iloc[:, keep]

    return df_total.round(1)

## Parameter: Removal interval

In [None]:
# Rows of every frame are ordered by this column
sort_column = instance_key

# One pickled result frame per removal-interval setting
df_rp_1, df_rp_2, df_rp_3, df_rp_4, df_rp_5 = [
    get_run_df(pickle_name, sort_column)
    for pickle_name in ('rp-0.05-0.15.pkl', 'rp-0.15-0.3.pkl', 'rp-0.05-0.3.pkl',
                        'rp-0.15-0.5.pkl', 'rp-0.3-0.5.pkl')
]

# Columns kept from each run frame
columns = [instance_key, best_obj_key, avg_obj_key, avg_time_key, avg_iter_key]

# Sub frames, one per parameter setting
one, two, three, four, five = [
    get_sub_df(run_df, columns, sort_column)
    for run_df in (df_rp_1, df_rp_2, df_rp_3, df_rp_4, df_rp_5)
]

# Exchange the row-3 / column-1 value between settings three and four.
# NOTE(review): the reason for this swap is not visible here - confirm.
three_val, four_val = three.iat[3, 1].copy(), four.iat[3, 1].copy()
three.iat[3, 1], four.iat[3, 1] = four_val, three_val

df_total = merge_sub_dfs([one, two, three, four, five])

df_total

In [None]:
# Mean of the 'obj' column for each setting, one value per line
print(*(setting_df['obj'].mean() for setting_df in (one, two, three, four, five)), sep='\n')

Conclusion: Use interval 0.15 to 0.50

## Parameter: Scores

In [None]:
# Rows of every frame are ordered by this column
sort_column = instance_key

# One pickled result frame per score setting
df_sc_1, df_sc_2, df_sc_3, df_sc_4, df_sc_5 = [
    get_run_df(pickle_name, sort_column)
    for pickle_name in ('sc-33.0-9.0-13.0.pkl', 'sc-33.0-9.0-1.0.pkl', 'sc-33.0-9.0-9.0.pkl',
                        'sc-9.0-9.0-9.0.pkl', 'sc-9.0-9.0-1.0.pkl')
]

# Columns kept from each run frame
columns = [instance_key, best_obj_key, avg_obj_key, avg_time_key, avg_iter_key]

# Sub frames, one per parameter setting
one, two, three, four, five = [
    get_sub_df(run_df, columns, sort_column)
    for run_df in (df_sc_1, df_sc_2, df_sc_3, df_sc_4, df_sc_5)
]

df_total = merge_sub_dfs([one, two, three, four, five])

df_total

In [None]:
# Mean of the 'obj' column for each setting, one value per line
print(*(setting_df['obj'].mean() for setting_df in (one, two, three, four, five)), sep='\n')

Conclusion: Use 33.0, 9.0, 1.0

## Parameter: Reaction

In [None]:
# Rows of every frame are ordered by this column
sort_column = instance_key

# One pickled result frame per reaction setting
df_r_1, df_r_2, df_r_3, df_r_4, df_r_5 = [
    get_run_df(pickle_name, sort_column)
    for pickle_name in ('r-0.05.pkl', 'r-0.1.pkl', 'r-0.2.pkl', 'r-0.5.pkl', 'r-1.0.pkl')
]

# Columns kept from each run frame
columns = [instance_key, best_obj_key, avg_obj_key, avg_time_key, avg_iter_key]

# Sub frames, one per parameter setting
one, two, three, four, five = [
    get_sub_df(run_df, columns, sort_column)
    for run_df in (df_r_1, df_r_2, df_r_3, df_r_4, df_r_5)
]

df_total = merge_sub_dfs([one, two, three, four, five])

df_total

In [None]:
# Mean of the 'obj' column for each setting, one value per line
print(*(setting_df['obj'].mean() for setting_df in (one, two, three, four, five)), sep='\n')

Conclusion: Use 0.1

## Parameter: Noise control

In [None]:
# Rows of every frame are ordered by this column
sort_column = instance_key

# One pickled result frame per noise-control setting
df_nc_1, df_nc_2, df_nc_3, df_nc_4, df_nc_5 = [
    get_run_df(pickle_name, sort_column)
    for pickle_name in ('nc-0.0.pkl', 'nc-0.025.pkl', 'nc-0.125.pkl', 'nc-0.25.pkl', 'nc-0.5.pkl')
]

# Columns kept from each run frame
# NOTE(review): unlike the other sections, best_obj_key is not selected here,
# yet merge_sub_dfs looks that column up - confirm the intended column list.
columns = [instance_key, avg_obj_key, avg_time_key, avg_iter_key]

# Sub frames, one per parameter setting
one, two, three, four, five = [
    get_sub_df(run_df, columns, sort_column)
    for run_df in (df_nc_1, df_nc_2, df_nc_3, df_nc_4, df_nc_5)
]

df_total = merge_sub_dfs([one, two, three, four, five])

df_total

In [None]:
# Mean of the 'obj' column for each setting, one value per line
print(*(setting_df['obj'].mean() for setting_df in (one, two, three, four, five)), sep='\n')

Conclusion: Use 0.025

## Parameter: Determinism

In [None]:
# Rows of every frame are ordered by this column
sort_column = instance_key

# One pickled result frame per determinism setting
df_d_1, df_d_2, df_d_3, df_d_4, df_d_5 = [
    get_run_df(pickle_name, sort_column)
    for pickle_name in ('d-3.0.pkl', 'd-5.0.pkl', 'd-7.0.pkl', 'd-9.0.pkl', 'd-11.0.pkl')
]

# Columns kept from each run frame
# NOTE(review): unlike the first three sections, best_obj_key is not selected
# here, yet merge_sub_dfs looks that column up - confirm the intended column list.
columns = [instance_key, avg_obj_key, avg_time_key, avg_iter_key]

# Sub frames, one per parameter setting
one, two, three, four, five = [
    get_sub_df(run_df, columns, sort_column)
    for run_df in (df_d_1, df_d_2, df_d_3, df_d_4, df_d_5)
]

# Exchange the row-7 / column-1 value between settings two and three.
# NOTE(review): the reason for these swaps is not visible here - confirm.
two_val, three_val = two.iat[7, 1].copy(), three.iat[7, 1].copy()
two.iat[7, 1], three.iat[7, 1] = three_val, two_val

# Exchange the row-9 / column-1 value between settings five and three.
five_val, three_val = five.iat[9, 1].copy(), three.iat[9, 1].copy()
five.iat[9, 1], three.iat[9, 1] = three_val, five_val

df_total = merge_sub_dfs([one, two, three, four, five])

df_total

In [None]:
# Mean of the 'obj' column for each setting, one value per line
print(*(setting_df['obj'].mean() for setting_df in (one, two, three, four, five)), sep='\n')

Conclusion: Use 7.0