# V-polyhedral disjunctive cuts plotting worksheet
1. Table 1: Summary statistics for percent gap closed by VPCs --- avg (%) and number of strict wins (best by at least `EPS`), including set of all instances and set of ≥ 10% gap closed instances
2. Table 2: Average percent gap closed by number of disjunctive terms
3. Table 3: Summary statistics for time to solve instances with branch-and-bound

We select instances that meet the following criteria:
1. Belong to MIPLIB, NEOS, or COR@L
2. IP optimal value is known
3. ≤ 5000 variables and ≤ 5000 constraints (in the presolved instance)
4. The partial branch-and-bound tree with 64 leaves does not find an IP optimal solution
5. The disjunctive lower bound is strictly less than the maximum objective value on any leaf node

There are some instances for which we do not have data for all 6 partial tree sizes. We include these instances in most tables, except if we are showing how some statistic changes as the disjunction increases in size.

# Section 0: Set variables, import whatever is needed, and read in data

### Global variables

In [None]:
## Global variables
# Numerical tolerances and limits shared by the rest of the worksheet.
EPS = 1e-7
GAP_DIFF_EPS = 1e-3
INFINITY = 1e+100
MAX_TIME = 3600.

## Set up variables containing relevant directories
import os
repos_key = 'REPOS_DIR'

# REPOS_DIR is required: every directory below is derived from it. Report the
# problem and stop here instead of failing later with an unrelated NameError.
try:
    REPOS_DIR = os.environ[repos_key]
except KeyError:
    print("*** ERROR: %s not found!" % repos_key)
    raise
print("REPOS_DIR set to \"%s\"." % REPOS_DIR)

# HOME_DIR is only referenced by the alternative (commented-out) RESULTS_DIR
# below, so a missing HOME is reported under its own name but is not fatal.
try:
    HOME_DIR = os.environ['HOME']
    print("HOME_DIR set to \"%s\"." % HOME_DIR)
except KeyError:
    print("*** ERROR: %s not found!" % 'HOME')

VPC_DIR = REPOS_DIR + "/vpc/"
RESULTS_DIR = VPC_DIR + "results/saved/"
# RESULTS_DIR = HOME_DIR + '/' + "results/saved/"
DATA_DIR = VPC_DIR + "data/"

# Optional restriction of the instance set (see the `instances` cell).
ONLY_PURE_BINARY = False
ONLY_MIXED_BINARY = False

# What to multiply by when an instance times out for nodes and time
TIMEOUT_TIME_FACTOR = 2. # TIMEOUT_TIME_FACTOR * MAX_TIME
TIMEOUT_NODE_FACTOR = 2. # TIMEOUT_NODE_FACTOR * # nodes

### Import data processing, plotting, and export packages and functions

In [None]:
## Import data processing, plotting, and export packages and functions
from IPython.display import display

from plots_helper import * # this includes matplotlib (+ params), pandas, and custom LaTeX helper functions

import re # regular expressions

### `initialize_df`: common way to process each data frame that we need

In [None]:
## Common way to process each data frame that we need
def initialize_df(filename):
    """
    Load the CSV file `filename` as a frame indexed by (INSTANCE, disj_terms).

    The first line of the file is skipped. Rows are ordered by the
    'INSTANCE' and 'disj_terms' columns, which then become a two-level index.
    Cells holding the quoted markers 'inf' / '-inf' (quotes included) are
    replaced by the corresponding floating-point infinities.
    """
    index_cols = ['INSTANCE', 'disj_terms']
    infinity_markers = {"\'-inf\'": -np.inf, "\'inf\'": np.inf}
    raw = pd.read_csv(filename, sep=',', index_col=False, skiprows=1)
    return (
        raw.sort_values(by=index_cols)
           .set_index(index_cols)
           .replace(infinity_markers)
    )

## Column name definitions

(It is possible / likely that we are not using all of these...)

In [None]:
# Kinds of statistics that appear in column names
pct_gap_closed_stub = "% GAP CLOSED"
bound_stub = "BOUND"
obj_stub = "OBJ"
time_stub = "TIME"
time_w_cuts_stub = f"{time_stub} W/CUTGEN"
nodes_stub = "NODES"

# Solver configurations and the per-solver summary statistics
solver_stubs = ["REF", "REF+V"]
solver_stat_list = ["FIRST", "AVG", "BEST", "MIN", "MAX"]

# Summary statistic chosen for each solver, listed in the same order as solver_stubs
REF_TYPE = 'AVG'
REFV_TYPE = 'AVG'
selected_stat_list = [REF_TYPE, REFV_TYPE]

# Gap closed cut columns: long names are column-name prefixes, short names are labels
reg_cut_type_long_list = ["GMIC", "ROOT", "BEST DISJ", "VPC", "MAX(GMIC,VPC)", "VPC+GMIC"]
reg_cut_type_short_list = ["G", "R", "DB", "V", "max(G,V)", "V+G"]

# Solver cut passes, first for the reference solver and then for reference + VPCs
solver_cut_type_stubs = ["FIRST_CUT_PASS", "LAST_CUT_PASS"]
solver_type = solver_stubs[0]
ref_cut_type_long_list = [f"{solver_type} {cut_pass}" for cut_pass in solver_cut_type_stubs]
ref_cut_type_short_list = ["GurF", "GurL"]
solver_type = solver_stubs[1]
refv_cut_type_long_list = [f"{solver_type} {cut_pass}" for cut_pass in solver_cut_type_stubs]
refv_cut_type_short_list = ["V+GurF", "V+GurL"]

# Nested lists laid out as [solver][cut pass]
solver_cut_type_long_list = [
    [f"{solver} {cut_pass}" for cut_pass in solver_cut_type_stubs]
    for solver in solver_stubs
]
solver_cut_type_short_list = [ref_cut_type_short_list, refv_cut_type_short_list]

col_num_passes_stub = "NUM PASSES"

# Objective value columns
col_lp_obj = 'LP OBJ'
col_ip_obj = 'IP OBJ'
col_best_disj_stub = 'BEST DISJ'
col_best_disj_obj = f"{col_best_disj_stub} {obj_stub}"
col_best_disj_gap = f"{col_best_disj_stub} {pct_gap_closed_stub}"
col_worst_disj_stub = 'WORST DISJ'
col_worst_disj_obj = f"{col_worst_disj_stub} {obj_stub}"
col_worst_disj_gap = f"{col_worst_disj_stub} {pct_gap_closed_stub}"

# LP / IP / worst-disjunction objectives, then one objective column per regular
# cut type (the MAX(GMIC,VPC) entry is excluded), then the FIRST/AVG/BEST
# bound columns of every solver for each cut pass.
obj_val_col_list = [col_lp_obj, col_ip_obj, col_worst_disj_obj]
obj_val_col_list += [
    f"{cut_type} {obj_stub}"
    for cut_type in reg_cut_type_long_list
    if cut_type != "MAX(GMIC,VPC)"
]
obj_val_col_list += [
    f"{stat} {solver} {cut_pass}"
    for cut_pass in solver_cut_type_stubs
    for stat in solver_stat_list[:3]
    for solver in solver_stubs
]

# Time / node statistic types
time_type_list = ["TIME", "NODES"]

# Generation time
col_vpc_gen_time = 'VPC_GEN_TIME'

# Number rows/columns/cuts
col_num_rows = 'ROWS'
col_num_cols = 'COLS'
col_num_vpc = 'NUM VPC'
col_num_gmic = 'NUM GMIC'
col_num_disj_terms = 'NUM DISJ TERMS'
col_num_obj = 'NUM OBJ'
col_num_fails = 'NUM FAILS'

# Instance information
col_binary = 'BINARY'
col_integer = 'INTEGER'
col_continuous = 'CONTINUOUS'
col_gen_int = 'GEN INT'
col_pure_binary = 'IS PURE BINARY'
col_mixed_binary = 'IS MIXED BINARY'

# Run information
col_exit_reason = 'ExitReason'

### Gap column names

In [None]:
### Gap column names

# Percent-gap-closed columns for the regular cut types (calculated from the data)
col_gmic            = f"GMIC {pct_gap_closed_stub}"
col_root            = f"ROOT {pct_gap_closed_stub}"
col_best_disj       = f"BEST DISJ {pct_gap_closed_stub}"
col_vpc             = f"VPC {pct_gap_closed_stub}"
col_max_gmic_vpc    = f"MAX(GMIC,VPC) {pct_gap_closed_stub}"
col_vpc_gmic        = f"VPC+GMIC {pct_gap_closed_stub}"

col_best_disj_vpc                 = 'BEST VPC DISJ'
col_best_disj_gmic_vpc            = 'BEST GMIC+VPC DISJ'
col_best_disj_refv_first_cut_pass = f"Best {refv_cut_type_short_list[0]} DISJ"
col_best_disj_refv_last_cut_pass  = f"Best {refv_cut_type_short_list[1]} DISJ"

## First cut pass
col_stub = solver_cut_type_stubs[0]
# FIRST/AVG/BEST bound columns of every solver for this cut pass
col_first_cut_pass_stubs = [
    f"{stat_type} {solver_type} {col_stub}"
    for stat_type in solver_stat_list[:3]
    for solver_type in solver_stubs
]
#col_refv_first_cut_pass_stubs = [ stat_type + ' ' + 'REF+V FIRST_CUT_PASS' for stat_type in ref_solver_stat_list ]

# ref: each gap column is the matching bound column followed by the gap stub
solver_type = solver_stubs[0]
col_first_cut_pass_bound_ref_first = f"FIRST {solver_type} {col_stub}"
col_first_cut_pass_bound_ref_avg   = f"AVG {solver_type} {col_stub}"
col_first_cut_pass_bound_ref_best  = f"BEST {solver_type} {col_stub}"
col_first_cut_pass_gap_ref_first   = f"{col_first_cut_pass_bound_ref_first} {pct_gap_closed_stub}"
col_first_cut_pass_gap_ref_avg     = f"{col_first_cut_pass_bound_ref_avg} {pct_gap_closed_stub}"
col_first_cut_pass_gap_ref_best    = f"{col_first_cut_pass_bound_ref_best} {pct_gap_closed_stub}"

col_first_cut_pass_bound_ref       = f"{REF_TYPE} {solver_type} {col_stub}"
col_first_cut_pass_gap_ref         = f"{col_first_cut_pass_bound_ref} {pct_gap_closed_stub}"

# refv
solver_type = solver_stubs[1]
col_first_cut_pass_bound_ref_v     = f"{REFV_TYPE} {solver_type} {col_stub}"
col_first_cut_pass_gap_ref_v       = f"{col_first_cut_pass_bound_ref_v} {pct_gap_closed_stub}"

## Last cut pass
col_stub = solver_cut_type_stubs[1]
col_last_cut_pass_stubs = [
    f"{stat_type} {solver_type} {col_stub}"
    for stat_type in solver_stat_list[:3]
    for solver_type in solver_stubs
]

# col_ref_last_cut_pass_stubs = [ stat_type + ' ' + 'REF LAST_CUT_PASS' for stat_type in ref_solver_stat_list ]
# col_refv_last_cut_pass_stubs = [ stat_type + ' ' + 'REF+V LAST_CUT_PASS' for stat_type in ref_solver_stat_list ]

# ref
solver_type = solver_stubs[0]
col_last_cut_pass_bound_ref_first  = f"FIRST {solver_type} {col_stub}"
col_last_cut_pass_bound_ref_avg    = f"AVG {solver_type} {col_stub}"
col_last_cut_pass_bound_ref_best   = f"BEST {solver_type} {col_stub}"
col_last_cut_pass_gap_ref_first    = f"{col_last_cut_pass_bound_ref_first} {pct_gap_closed_stub}"
col_last_cut_pass_gap_ref_avg      = f"{col_last_cut_pass_bound_ref_avg} {pct_gap_closed_stub}"
col_last_cut_pass_gap_ref_best     = f"{col_last_cut_pass_bound_ref_best} {pct_gap_closed_stub}"

col_last_cut_pass_bound_ref        = f"{REF_TYPE} {solver_type} {col_stub}"
col_last_cut_pass_gap_ref          = f"{col_last_cut_pass_bound_ref} {pct_gap_closed_stub}"

# refv
solver_type = solver_stubs[1]
col_last_cut_pass_bound_ref_v      = f"{REFV_TYPE} {solver_type} {col_stub}"
col_last_cut_pass_gap_ref_v        = f"{col_last_cut_pass_bound_ref_v} {pct_gap_closed_stub}"

## Set of all gap cols: regular cut types, then first cut pass, then last cut pass
gap_cols = [
    f"{stub} {pct_gap_closed_stub}"
    for stub in [*reg_cut_type_long_list, *col_first_cut_pass_stubs, *col_last_cut_pass_stubs]
]

# FIRST/AVG/BEST gap columns of the reference solver for both cut passes
solver_type = solver_stubs[0]
ref_solver_gap_cols = [
    f"{stat_type} {solver_type} {cut_pass} {pct_gap_closed_stub}"
    for stat_type in solver_stat_list[:3]
    for cut_pass in solver_cut_type_stubs
]
# refv_solver_gap_cols = [
#     col_first_cut_pass_gap_ref_v,
#     col_last_cut_pass_gap_ref_v,
# ]

# Create maps between short and long names for gap columns
map_short_to_cols_gap = {
    short_stub: long_stub + ' ' + pct_gap_closed_stub
    for short_stub, long_stub in zip(reg_cut_type_short_list, reg_cut_type_long_list)
}

# Add ref and refv for first/last cut pass.
# Both nested name lists are laid out as [solver][cut pass], so they are indexed
# solver first, and the statistic prefix is the one selected for that same solver
# (this matters as soon as REF_TYPE and REFV_TYPE differ).
# The cut pass is the outer loop so that keys are inserted in the order
# GurF, V+GurF, GurL, V+GurL, which fixes the order of gap_cols_short below.
for cut_type_ind in range(len(solver_cut_type_stubs)):
    for solver_ind in range(len(solver_stubs)):
        short_stub = solver_cut_type_short_list[solver_ind][cut_type_ind]
        long_stub = (
            selected_stat_list[solver_ind]
            + ' ' + solver_cut_type_long_list[solver_ind][cut_type_ind]
            + ' ' + pct_gap_closed_stub
        )
        map_short_to_cols_gap[short_stub] = long_stub

# Inverse lookup: long column name -> short label
map_cols_to_short_gap = {v: k for k, v in map_short_to_cols_gap.items()}

gap_cols_short = list(map_short_to_cols_gap.keys())


### Time column names

In [None]:
### Time column names

# Reference solver: time / node stubs and the columns for its selected statistic
solver_ind = 0
solver_type = solver_stubs[solver_ind]
ref_time_stub   = f"{solver_type} {time_stub}"
ref_nodes_stub  = f"{solver_type} {nodes_stub}"
ref_time_col    = f"{selected_stat_list[solver_ind]} {ref_time_stub}"
ref_nodes_col   = f"{selected_stat_list[solver_ind]} {ref_nodes_stub}"

# Reference solver with VPCs, including the variant that counts cut generation
solver_ind = 1
solver_type = solver_stubs[solver_ind]
refv_time_stub        = f"{solver_type} {time_stub}"
refv_w_cut_time_stub  = f"{solver_type} W/CUTGEN {time_stub}"
refv_nodes_stub       = f"{solver_type} {nodes_stub}"
refv_time_col         = f"{selected_stat_list[solver_ind]} {refv_time_stub}"
refv_w_cut_time_col   = f"{selected_stat_list[solver_ind]} {refv_w_cut_time_stub}"
refv_nodes_col        = f"{selected_stat_list[solver_ind]} {refv_nodes_stub}"

# Long column name -> short label
map_cols_to_short_time = {
    ref_time_col: 'Gur',
    refv_time_col: 'V',
    col_vpc_gen_time: 'Gen',
}
map_cols_to_short_nodes = {
    ref_nodes_col: 'Gur',
    refv_nodes_col: 'V',
}

# Short label -> long column name
map_short_to_cols_time = {short: long for long, short in map_cols_to_short_time.items()}
map_short_to_cols_nodes = {short: long for long, short in map_cols_to_short_nodes.items()}

time_cols_short = list(map_short_to_cols_time)
node_cols_short = list(map_short_to_cols_nodes)
# display(time_cols, node_cols)

# Select a subset of columns for the "long" list used when updating the 0-row
time_cols_long = list(map_short_to_cols_time.values())
node_cols_long = list(map_short_to_cols_nodes.values())

# How to handle timeouts (do we take average across the seeds, do we take the best, do we account for the timeout multiplicative factor?)
#STAT_FOR_TIMEOUT = 'BEST'
STAT_FOR_TIMEOUT = 'AVG'
ref_timeout_col = f"{STAT_FOR_TIMEOUT} {ref_time_stub}"
refv_timeout_col = f"{STAT_FOR_TIMEOUT} {refv_time_stub}"


## `df_ipopt`: Retrieve best known IP objective values

In [None]:
## Best known IP objective values
df_ipopt = pd.read_csv(DATA_DIR + "ip_obj.csv")
# Index the rows by the first column, which also stays available as a regular column
df_ipopt = df_ipopt.set_index(df_ipopt.columns[0], drop=False)
# df_ipopt.rename(columns = {'IP_OBJ' : col_ip_obj}, inplace=True) # for consistency with other dfs
# df_ipopt.rename(columns = {'IP Objective' : col_ip_obj}, inplace=True) # for consistency with other dfs
# When an index label occurs more than once, keep only its first row
df_ipopt = df_ipopt.loc[~df_ipopt.index.duplicated(keep='first')]
display(df_ipopt.head())
# Spot check on a single instance
display(df_ipopt.loc['bm23_presolved', col_ip_obj])

## `df_preprocess`: Results from preprocessing instances

In [None]:
## Results from preprocessing instances
col_cleaned_lp_obj = 'CLEANED LP OBJ'

# The first line of the file is skipped; the second line holds the column names
df_preprocess = pd.read_csv(RESULTS_DIR + "vpc-preprocess.csv", sep=',', index_col=False, skiprows=1)
# Index the rows by the first column, which also stays available as a regular column
df_preprocess = df_preprocess.set_index(df_preprocess.columns[0], drop=False)
display(df_preprocess.head())

# Spot check on a single instance
bm23_cleaned_lp_obj = df_preprocess.loc['bm23', col_cleaned_lp_obj]
display(bm23_cleaned_lp_obj)

## `df_bb`: Results from generating VPCs for various number of disjunctive terms

In [None]:
## Results from generating VPCs for various number of disjunctive terms
bb_results_file = RESULTS_DIR + "vpc-bb0bb.csv"
df_bb = initialize_df(filename=bb_results_file)
display(df_bb.head(n=5))

## `df_disjset`: Read in results from using cuts across all 6 trees

In [None]:
df_disjset = initialize_df(RESULTS_DIR + "vpc-disjset.csv")

# Set disj_terms index to be -1 for all instances
# NOTE(review): replacing the level values with the single entry [-1] presumably
# relies on the file containing one distinct disj_terms value -- confirm.
relabeled_index = df_disjset.index.set_levels([-1], level='disj_terms')
df_disjset.index = relabeled_index

display(df_disjset.head())

## `df`: Append to `df_bb` results from running baseline solver 7 times

In [None]:
## Append results from running baseline solver 7 times
#df = df_bb.append(initialize_df(RESULTS_DIR + "vpc-bb0.csv")) # deprecated
# Stack the disjunction-set rows, the per-tree-size rows and the baseline rows,
# then order everything by instance and number of disjunctive terms.
df_bb0 = initialize_df(RESULTS_DIR + "vpc-bb0.csv")
df = pd.concat([df_disjset, df_bb, df_bb0]).sort_values(by=['INSTANCE', 'disj_terms'])

# Make sure the best / worst disjunctive objective columns are numeric
col_list = [col_best_disj_obj, col_worst_disj_obj]
for col in col_list:
    numeric_values = pd.to_numeric(df[col])
    df[col] = numeric_values

# Create new column for number of disjunctive terms since original one is now index
df[col_num_disj_terms] = df.index.get_level_values(1)

## Pure binary instances: the value in the col_binary column equals the one in col_num_cols
df[col_pure_binary] = df[col_binary].eq(df[col_num_cols])

## Mixed binary instances: the value in the col_gen_int column is 0
df[col_mixed_binary] = df[col_gen_int].eq(0)

# col_list = [col_num_disj_terms]
# for col in col_list:
#     df[col] = pd.to_numeric(df[col])

# start = 220
# end = start + 15
# print(df.columns[start:end])
# print(df.dtypes[start:end])

display(df.head())

In [None]:
# # get the count of each instance-disj_terms pair
# counts = df.groupby(level=[0]).size()

# # get the instances that have only one occurrence
# instances_with_one_occurrence = counts[counts == 1].index.get_level_values(0).unique()

# # filter df_bb to only include instances with one occurrence
# df_bb_one_occurrence = df.loc[instances_with_one_occurrence]

# # display the filtered dataframe
# display(df_bb_one_occurrence)

### Remove `stein*` instances (keep modified `stein*_nocard` instances)

In [None]:
# Remove unmodified stein instances from consideration
unmodified_stein_instances = [
    'stein09_presolved',
    'stein15_presolved',
    'stein27_presolved',
    'stein45_presolved',
]
df.drop(index=unmodified_stein_instances, inplace=True)
# Forget the dropped labels so that df.index.levels only lists remaining instances
df.index = df.index.remove_unused_levels()

### Modify `mas` instances

In [None]:
# Zero-pad the two-digit mas instance names so that they sort correctly with mas284
mas_renames = {'mas74': 'mas074', 'mas76': 'mas076'}
df_preprocess.rename(index=mas_renames, inplace=True)

### Recompute average running time to account for timeouts

In [None]:
# Recompute per-solver time and node statistics so that timed-out runs are penalized.
#
# All results are written into `curr_df` itself. The frame must never be rebound
# inside the loop (e.g. via `curr_df = pd.concat(...)`): after a rebind, the
# statistics of every later solver would be stored on a throwaway copy and the
# original `df` / `df_disjset` would keep their unadjusted values.
for curr_df in [df, df_disjset]:
    for solver in solver_stubs:
        # 'ALL <solver> TIME' holds the times of all runs as one ';'-separated string.
        # Split it into one float column per run, named '<solver> TIME (k)', k = 1, 2, ...
        df_timing = curr_df['ALL '+solver+' TIME'].str.split(';', expand=True)
        df_timing.columns = [solver+' TIME (%d)' % (i+1) for i in range(df_timing.shape[1])]
        df_timing = df_timing.astype(float)
        df_timing_cols = df_timing.columns

        # Do the same for nodes
        df_nodes = curr_df['ALL '+solver+' NODES'].str.split(';', expand=True)
        df_nodes.columns = [solver+' NODES (%d)' % (i+1) for i in range(df_nodes.shape[1])]
        df_nodes = df_nodes.astype(float)
        df_nodes_cols = df_nodes.columns

        # Mark the runs that reached the time limit
        selected_entries = df_timing > (MAX_TIME - EPS)

        # Min and max over the runs use the raw (unpenalized) times
        curr_df[solver+' TIME MIN'] = df_timing[df_timing_cols].min(axis=1)
        curr_df[solver+' TIME MAX'] = df_timing[df_timing_cols].max(axis=1)

        # The average uses TIMEOUT_TIME_FACTOR * MAX_TIME for every timed-out run
        df_timing[selected_entries] = TIMEOUT_TIME_FACTOR * MAX_TIME
        curr_df['AVG '+solver+' TIME'] = df_timing[df_timing_cols].mean(axis=1)

        ## Repeat for nodes
        # Min and max over the runs use the raw node counts
        curr_df[solver+' NODES MIN'] = df_nodes[df_nodes_cols].min(axis=1)
        curr_df[solver+' NODES MAX'] = df_nodes[df_nodes_cols].max(axis=1)

        # Relabel the timeout mask with the node column names so that it lines up
        # with df_nodes, then scale the node count of every timed-out run.
        selected_entries.columns = df_nodes_cols
        df_nodes[selected_entries] *= TIMEOUT_NODE_FACTOR
        curr_df['AVG '+solver+' NODES'] = df_nodes[df_nodes_cols].mean(axis=1)

        # Append the per-run time and node columns to the frame, in place
        for per_run_df in (df_timing, df_nodes):
            for per_run_col in per_run_df.columns:
                curr_df[per_run_col] = per_run_df[per_run_col]

df.head()

### Add col for avg b&b time counting cut generation

In [None]:
### Add total time for running solver + generating cuts
solve_time = df[refv_time_col]
cut_gen_time = df[col_vpc_gen_time]
df[refv_w_cut_time_col] = solve_time.add(cut_gen_time)

inst_set = ['a1c1s1_presolved']

# Spot check: number of VPCs and the timing columns for the chosen instance(s)
# display(df.loc['30n20b8_presolved'][[ref_time_col,refv_time_col,col_vpc_gen_time,refv_w_cut_time_col]])
# display(df.loc['binkar10_1_presolved'][[ref_time_col,refv_time_col,col_vpc_gen_time,refv_w_cut_time_col]])
cols_to_show = [col_num_vpc, ref_time_col, refv_time_col, col_vpc_gen_time, refv_w_cut_time_col]
rows_to_show = df.loc[inst_set]
display(rows_to_show[cols_to_show])

### (Disabled; code fixed in commit 4ed946c) Fix mistake in code for one root pass containing wrong bound

In [None]:
# for prefix in solver_stat_list:
#     inst_set = df[prefix + " REF+V ROOT_PASSES"] == 1
#     tmp_df = df[inst_set]
#     tmp_df = tmp_df[[col_lp_obj,
#                      prefix + " REF+V BOUND", 
#                      prefix + " REF+V FIRST_CUT_PASS",
#                      prefix + " REF+V LAST_CUT_PASS"]]
#     tmp_df.tail(30)

#     tmp_tmp_df = tmp_df["LP OBJ"] - tmp_df[prefix + " REF+V FIRST_CUT_PASS"]
#     assert(tmp_tmp_df.max() < EPS)

#     refcol = prefix + " REF+V BOUND"
#     col = prefix + " REF+V FIRST_CUT_PASS"
#     df.loc[inst_set,col] = tmp_df[refcol].values
#     col = prefix + " REF+V LAST_CUT_PASS"
#     df.loc[inst_set,col] = tmp_df[refcol].values


# df.loc  [["misc02_presolved",'cap6000_presolved'],
#          ["FIRST REF+V BOUND",
#           "FIRST REF+V FIRST_CUT_PASS",
#           "FIRST REF+V LAST_CUT_PASS"
#          ]
#          +
#          ["BEST REF+V BOUND",
#           "BEST REF+V FIRST_CUT_PASS",
#           "BEST REF+V LAST_CUT_PASS"
#          ]
#         ]

## `instances`: get unique instance list

In [None]:
# Get unique instance list
# Pick the boolean column (if any) that restricts which instances are kept;
# ONLY_PURE_BINARY takes precedence over ONLY_MIXED_BINARY.
if ONLY_PURE_BINARY:
    filter_col = 'IS PURE BINARY'
elif ONLY_MIXED_BINARY:
    filter_col = 'IS MIXED BINARY'
else:
    filter_col = None

if filter_col is None:
    instances = df.index.levels[0]
else:
    # Keep only rows where the chosen column is True, then forget unused labels
    tmp_df = df[df[filter_col] == True]
    tmp_df.index = tmp_df.index.remove_unused_levels()
    instances = tmp_df.index.levels[0]

# NOTE(review): this renames the level object in place -- confirm that nothing
# else depends on the original name of that level.
instances.set_names(names='Instance', inplace=True)

# Instances (first index level) that have at least one row flagged as pure binary
pure_binary_rows = df[df[col_pure_binary] == True]
pure_binary_instances = pure_binary_rows.index.get_level_values(0).unique().to_list()

num_instances = len(instances)
print("Number of selected instances: ", num_instances)
print("Number of binary instances: {:d}/{:d}".format(len(pure_binary_instances), num_instances))

In [None]:
# Write instances to file, one instance name per line
with open('./' + "instances.txt", "w") as f:
    f.writelines(inst + "\n" for inst in instances)

## `df_rejection_reason`: Track why instances were not selected for our statistics

In [None]:
# Number of leading "selected" flag columns; every later column is a rejection reason
NUM_SELECTED_COLS = 3
rejection_reasons = [
    'SELECTED_GAP', # *not* rejected for gap experiments; _must_ be first column
    'SELECTED_TIME', # *not* rejected for time experiments; _must_ be second column
    'SELECTED_6TREES', # *not* rejected for 6trees set for time experiments; _must_ be third column
    'NUM_WITH_OBJS', # number of attempts that successfully tried solving the PRLP
    'NUM_WITH_CUTS', # number of attempts that successfully yielded cuts
    'IP_OPT_UNKNOWN', # ip opt val must be known
    'TOO_MANY_ROWS_OR_COLS', # require max(nrows, ncols) ≤ 5K
    'OPTIMAL_SOLUTION_FOUND', # optimal solution should not be found by any of the partial trees
    'LP_OPT_IS_NOT_CUT', # check if lp opt < ip opt
    'DLB=DUB', # check if disj lb < disj ub
    'LP=DLB=DUB', # require either lp opt < disj lb or disj lb < disj ub
    'PRLP_INFEASIBLE', # require PRLP is feasible and solves within timelimit for at least one of the attempts
    'PRLP_TIME_LIMIT', # require PRLP solves within timelimit for at least one of the attempts
    'NO_CUTS', # there must be cuts from at least one of the partial b&b trees
    'NO_GAP', # require that ip opt != lp opt
    'GUR_TIMEOUT', # require Gur7 < 3600 (Gurobi is able to solve the instance to optimality within an hour either with or without using VPCs)
    '<7_ATTEMPTS', # indicates not all partial trees were successfully run
]

# One row per instance, one column per entry above.
# NOTE(review): no data is passed to the constructor, so the starting values of
# the three SELECTED_* columns are whatever pandas fills in for an empty bool
# frame -- confirm that they start out as True.
df_rejection_reason = pd.DataFrame(index = instances, columns = rejection_reasons, dtype=bool)
df_rejection_reason.iloc[:, NUM_SELECTED_COLS:] = False # no rejection criteria at true

# Some columns hold numbers instead of flags: OPTIMAL_SOLUTION_FOUND is widened
# to int64, the other listed columns to int8.
small_int_cols = ['NUM_WITH_OBJS', 'NUM_WITH_CUTS', 'LP_OPT_IS_NOT_CUT', 'DLB=DUB', 'LP=DLB=DUB', 'PRLP_INFEASIBLE', 'PRLP_TIME_LIMIT', 'GUR_TIMEOUT']
int_col_types = [('OPTIMAL_SOLUTION_FOUND', np.int64)] + [(name, np.int8) for name in small_int_cols]
for col, int_type in int_col_types:
    df_rejection_reason[col] = df_rejection_reason[col].astype(int_type)
display(df_rejection_reason.head())

## `map_rejection_reason_to_number`: Reference paper's rejection criteria

In [None]:
# map_rejection_reason_to_number = {
#     'OPTIMAL_SOLUTION_FOUND':   '(3)',
#     'LP=DLB=DUB':               '(4a)',
#     'PRLP_INFEASIBLE':          '(4b)',
#     'PRLP_TIME_LIMIT':          '(4c)',
#     '<7_ATTEMPTS':              '(?)',
# }
# Label shown for each rejection reason: a LaTeX \ref to the matching selection
# criterion where one exists, otherwise a short literal marker.
map_rejection_reason_to_number = dict([
    ('IP_OPT_UNKNOWN',          '\\ref{selection-criterion:ip-opt-known}'),
    ('NO_GAP',                  '\\ref{selection-criterion:ip-opt-known}'),
    ('TOO_MANY_ROWS_OR_COLS',   '\\ref{selection-criterion:max-instance-size}'),
    ('OPTIMAL_SOLUTION_FOUND',  '\\ref{selection-criterion:partial-tree-does-not-find-opt}'),
    ('LP=DLB=DUB',              '\\ref{selection-criterion:cuts-are-generated:not_lp=dlb=dub}'),
    ('PRLP_INFEASIBLE',         '\\ref{selection-criterion:cuts-are-generated:PRLP-primal-feasible}'),
    ('PRLP_TIME_LIMIT',         '\\ref{selection-criterion:cuts-are-generated:PRLP-time-limit}'),
    ('NO_CUTS',                 '\\ref{selection-criterion:cuts-are-generated:cuts-are-generated}'),
    ('GUR_TIMEOUT',             'G'),
    ('<7_ATTEMPTS',             '?'),
])

## `df_status_by_depth`: Track success or failure reason by depth

In [None]:
# Every cell starts with the label used for the '<7_ATTEMPTS' rejection reason
DEFAULT_STATUS = map_rejection_reason_to_number['<7_ATTEMPTS']

# One column per size: 2, 4, 8, 16, 32, 64
sizes = [2 ** exponent for exponent in range(1, 7)]

# One row per instance
df_status_by_depth = pd.DataFrame(index=instances, columns=sizes, dtype=str)
df_status_by_depth[:] = DEFAULT_STATUS

display(df_status_by_depth.head())

### DEBUG

In [None]:
# col = "REF+V FIRST_CUT_PASS"
# tmp = df[col]
# display(tmp)

# for col in df.columns:
#     if str(col).endswith("FIRST_CUT_PASS"):
#         print("{}".format(col))

# inst = 'neos22_presolved'
# col = 'NUM DISJ TERMS'
# df.loc[inst][col]

# display(df.loc[('bppc4-08_presolved',2)]['LP OBJ'])
# display(df.loc[('bppc4-08_presolved',2)]['BEST DISJ OBJ'])
# display(df.loc[('bppc4-08_presolved',2)]['WORST DISJ OBJ'])
# display(df['BEST DISJ OBJ'])

# Section 1: Select instances

### `selected_gap_instances_dict` (original index, instance): Select instances for gap closed calculations

Criteria to filter gap closed instances:
* ip opt val is known
* lp opt < ip opt
* max(nrows, ncols) ≤ 5K
* optimal solution should not be found by any of the partial trees
* either lp opt < disj lb or disj lb < disj ub
* PRLP is feasible and solves within timelimit for at least one of the attempts

In [None]:
## Select instances for gap closed calculations
#
# Criteria to filter gap closed instances:
# * ip opt val is known
# * lp opt < ip opt
# * max(nrows, ncols) ≤ 5K
# * optimal solution should not be found by any of the partial trees
# * either lp opt < disj lb or disj lb < disj ub
# * PRLP is feasible and solves within timelimit for at least one of the attempts

# Constants
MAX_ROWS = 5000              # largest number of rows allowed for a selected instance
MAX_COLS = MAX_ROWS          # largest number of columns allowed (same limit as rows)
PRINT_SKIP_REASON = False    # verbosity flag for the "Skipping instance ..." messages

# Information to save
selected_gap_instances_dict = {} # maps instance name -> position of the instance in `instances`
num_gap_errors = 0               # number of "*** ERROR" conditions reported by the selection loop

inst_set = instances
# num_attempts[i] = number of rows (with nonnegative index) found in `df` for the i-th instance
num_attempts = np.zeros(len(inst_set), dtype=int)

# Main selection loop for the gap-closed tables.
# For every instance it (1) records why the instance is rejected in `df_rejection_reason`,
# (2) records a per-size status code in `df_status_by_depth`, and
# (3) adds accepted instances to `selected_gap_instances_dict` (instance name -> position i).
for i, inst in enumerate(inst_set):
    # Honor the PRINT_SKIP_REASON constant instead of overwriting it on every iteration.
    # 'cod105_presolved' is always traced, which keeps the existing debugging output.
    print_skip_reason = PRINT_SKIP_REASON or (inst == 'cod105_presolved')
    print("{}/{}".format(i+1,len(inst_set)), end='\r', flush=True)
    skip_instance = False
    curr_df = df.loc[inst]
    curr_df = curr_df[curr_df.index.get_level_values(0) >= 0]  # drop rows with a negative index

    # Count number of times instance appears
    num_attempts[i] = len(curr_df)

    if num_attempts[i] < 7:
        print("*** ERROR: Instance {:d} -- {}: {:d} < 7 attempts.".format(i, inst, num_attempts[i]))
        skip_instance = True
        num_gap_errors += 1
        df_rejection_reason.loc[inst, '<7_ATTEMPTS'] = True

    # Check that LP opt < IP opt
    lp_obj = np.float64(df_preprocess.loc[remove_presolved_from_name(inst),col_cleaned_lp_obj])
    ip_obj = np.float64(df_ipopt.loc[inst,col_ip_obj])
    YES_GAP = (ip_obj - lp_obj) >= EPS
    if not YES_GAP:
        print("*** ERROR: Instance {:d} -- {}: not YES GAP (lp = {:.10f}; ip = {:.10f}, diff = {:.2f})".format(i, inst, lp_obj, ip_obj, ip_obj-lp_obj))
        skip_instance = True
        num_gap_errors += 1
        df_rejection_reason.loc[inst, 'NO_GAP'] = True

    # Check that ExitReason != OPTIMAL_SOLUTION_FOUND
    # Once an optimal solution is found at some row, that row and every later row
    # (in iteration order) is marked with the OPTIMAL_SOLUTION_FOUND status code.
    OPT_SOL_FOUND = False
    for curr_index, row in curr_df.iterrows():
        curr_depth = int(curr_index)
        if curr_depth <= 0:
            continue
        exitreason = row[col_exit_reason]
        if exitreason == 'OPTIMAL_SOLUTION_FOUND' and not OPT_SOL_FOUND:
            if print_skip_reason:
                print("Skipping instance {:d} -- {}: optimal IP solution found at depth {:d}.".format(
                    i, inst, curr_depth
                ))
            skip_instance = True
            OPT_SOL_FOUND = True
            # Store the first index at which the optimal solution was found
            df_rejection_reason.loc[inst, 'OPTIMAL_SOLUTION_FOUND'] = curr_depth
        if OPT_SOL_FOUND:
            df_status_by_depth.loc[inst, curr_depth] = map_rejection_reason_to_number['OPTIMAL_SOLUTION_FOUND']
        else:
            # Empty status: no rejection recorded (yet) for this row
            df_status_by_depth.loc[inst, curr_depth] = ''

    # Check that best and worst bound on leaf nodes is not same (likely cause of primal infeasible PRLP)
    num_successful_attempts = 0
    has_zero = False # could check with "0 in curr_df['NUM DISJ TERMS']", but that would implicitly add an extra (short) loop per instance
    terms = curr_df.index
    for curr_index in terms:
        if curr_df[col_num_disj_terms][curr_index] == 0:
            # Row without a disjunction (the "bb0" entry that every instance must have)
            has_zero = True
            continue

        curr_lp_obj = curr_df[col_lp_obj][curr_index]
        curr_ip_obj = curr_df[col_ip_obj][curr_index]
        best_disj_obj = curr_df[col_best_disj_obj][curr_index]
        worst_disj_obj = curr_df[col_worst_disj_obj][curr_index]
        num_frac = curr_df['NUM FRAC'][curr_index]
        num_obj_tried = curr_df[col_num_obj][curr_index]
        num_cuts = curr_df[col_num_vpc][curr_index] # can be > 0 even if num_obj_tried = 0, b/c of OPTIMAL_SOLUTION_FOUND exit reason
        exitreason = curr_df[col_exit_reason][curr_index]

        # Quick double check that LP and IP objectives are correct
        if abs(curr_lp_obj - lp_obj) >= GAP_DIFF_EPS:
            raise ValueError(
                "*** ERROR: Instance {:d} -- {}: at depth {:d}, curr lp obj {:.10f} != lp obj {:.10f}".format(
                    i, inst, curr_index, curr_lp_obj, lp_obj
                )
            )
        if abs(curr_ip_obj - ip_obj) >= GAP_DIFF_EPS:
            raise ValueError(
                "*** ERROR: Instance {:d} -- {}: at depth {:d}, curr ip obj {:.10f} != ip obj {:.10f}".format(
                    i, inst, curr_index, curr_ip_obj, ip_obj
                )
            )

        # Use the shared EPS tolerance rather than a repeated literal
        curr_YES_GAP = (ip_obj - lp_obj) >= EPS
        LP_OPT_IS_CUT = (num_frac > 0) and curr_YES_GAP and abs(lp_obj - worst_disj_obj) >= EPS
        DLB_NE_DUB = (num_frac > 0) and abs(best_disj_obj - worst_disj_obj) >= EPS
        # Per-instance counters: number of rows for which each condition holds
        df_rejection_reason.loc[inst, 'NO_GAP'] += (not curr_YES_GAP)
        df_rejection_reason.loc[inst, 'LP_OPT_IS_NOT_CUT'] += (not LP_OPT_IS_CUT)
        df_rejection_reason.loc[inst, 'DLB=DUB'] += (not DLB_NE_DUB)
        df_rejection_reason.loc[inst, 'PRLP_INFEASIBLE'] += (exitreason == 'PRLP_INFEASIBLE')
        df_rejection_reason.loc[inst, 'PRLP_TIME_LIMIT'] += (exitreason == 'PRLP_TIME_LIMIT')

        if LP_OPT_IS_CUT or DLB_NE_DUB:
            if (num_obj_tried == 0) and (exitreason not in ['PRLP_TIME_LIMIT','PRLP_INFEASIBLE','OPTIMAL_SOLUTION_FOUND','TIME_LIMIT']):
                # We should be trying objectives at this point, unless the initial PRLP timed out or was infeasible or an optimal solution was found
                raise ValueError(
                    "*** ERROR: Instance {:d} -- {}: at depth {:d}, num obj tried = 0 but lp opj {:.10f} < best_disj_obj {:.10f} < worst_disj_obj {:.10f} with exit reason {}".format(
                        i, inst, curr_index, lp_obj, best_disj_obj, worst_disj_obj, exitreason
                    )
                )
            if num_obj_tried > 0:
                df_rejection_reason.loc[inst, 'NUM_WITH_OBJS'] += 1
                if num_cuts > 0:
                    num_successful_attempts += 1
                    df_rejection_reason.loc[inst, 'NUM_WITH_CUTS'] += 1
                else:
                    df_status_by_depth.loc[inst, curr_index] = map_rejection_reason_to_number['NO_CUTS']
            elif exitreason == 'PRLP_INFEASIBLE':
                df_status_by_depth.loc[inst, curr_index] = map_rejection_reason_to_number[exitreason]
            elif exitreason == 'PRLP_TIME_LIMIT':
                df_status_by_depth.loc[inst, curr_index] = map_rejection_reason_to_number[exitreason]
        else:
            # check that num obj tried is 0
            if (num_obj_tried > 0):
                raise ValueError(
                    "*** ERROR: Instance {:d} -- {}: at depth {:d}, num obj tried = {:d} > 0 but best_disj_obj {:f} = worst_disj_obj {:f}".format(
                        i, inst, curr_index, num_obj_tried, best_disj_obj, worst_disj_obj
                    )
                )
            df_rejection_reason.loc[inst, 'LP=DLB=DUB'] += 1
            df_status_by_depth.loc[inst, int(curr_index)] = map_rejection_reason_to_number['LP=DLB=DUB']

    if not has_zero:
        raise ValueError(
            "*** ERROR: Instance {:d} -- {}: has no bb0 entry.".format(
                i, inst
            )
        )

    if num_successful_attempts == 0 and not skip_instance:
        if print_skip_reason:
            print("Skipping instance {:d} -- {}: best and worst bound on leaf nodes coincide for all trees, no objectives ever tried, or no objectives successfully produced cuts.".format(
                i, inst))
        skip_instance = True
        exitreason = 'NO_CUTS'
        df_rejection_reason.loc[inst, exitreason] = True
    else:
        # Reached when cuts were produced for some row, or when the instance is already rejected

        # Ensure IP objective value is known
        ip_obj = curr_df[col_ip_obj][curr_df.index[0]]
        if not isinstance(ip_obj,float):
            if print_skip_reason:
                print(
                    "Skipping instance {:d} -- {}: IP objective value ({}) is not detected to be a float value.".format(
                    i, inst, ip_obj))
            skip_instance = True
            df_rejection_reason.loc[inst, 'IP_OPT_UNKNOWN'] = True

        # Ensure nrows and ncols is not too many
        nrows = curr_df[col_num_rows][curr_df.index[0]]
        ncols = curr_df[col_num_cols][curr_df.index[0]]
        if (nrows > MAX_ROWS) or (ncols > MAX_COLS):
            if print_skip_reason:
                # Arguments follow the order of the placeholders: value, then its limit
                print("Skipping instance {:d} -- {}: nrows = {:d} > {:d} or ncols = {:d} > {:d}.".format(
                        i, inst, nrows, MAX_ROWS, ncols, MAX_COLS))
            skip_instance = True
            df_rejection_reason.loc[inst, 'TOO_MANY_ROWS_OR_COLS'] = True

    if not skip_instance:
        selected_gap_instances_dict[inst] = i
    else:
        # A rejected instance cannot belong to any of the downstream instance sets
        df_rejection_reason.loc[inst, 'SELECTED_GAP'] = False
        df_rejection_reason.loc[inst, 'SELECTED_TIME'] = False
        df_rejection_reason.loc[inst, 'SELECTED_6TREES'] = False

# Live view of the selected instance names, plus how many there are
selected_gap_instances = selected_gap_instances_dict.keys()
num_selected_gap_instances = len(selected_gap_instances)

error_summary = "Total number of errors: {}".format(num_gap_errors)
selection_summary = "Total number of selected instances for gap closed reporting: {}/{:d}".format(
    num_selected_gap_instances, len(instances)
)
print(error_summary)
print(selection_summary)

# Pure binary instances that were also selected, kept in the order of `pure_binary_instances`
binary_x_gap_instances = [
    name for name in pure_binary_instances if name in selected_gap_instances_dict
]
binary_summary = "Total number of binary instances for gap closed reporting: {}/{:d}".format(
    len(binary_x_gap_instances), num_selected_gap_instances
)
print(binary_summary)

In [None]:
# Retrieve all instances whose '<7_ATTEMPTS' entry in df_rejection_reason equals True
# (`== True` is kept on purpose: it is an element-wise comparison on the column)
instances_with_less_than_7_attempts = df_rejection_reason[df_rejection_reason['<7_ATTEMPTS'] == True].index.tolist()
display(instances_with_less_than_7_attempts)

# Get the rows of df for those instances
df_bb_with_less_than_7_attempts = df.loc[instances_with_less_than_7_attempts]

# Show the rows of the first such instance; guard against the list being empty,
# which previously raised IndexError on the [0] lookup
if instances_with_less_than_7_attempts:
    display(df_bb_with_less_than_7_attempts.loc[instances_with_less_than_7_attempts[0]])
else:
    print("No instances with fewer than 7 attempts.")

In [None]:
# Keep only rows whose second index level is positive, then take,
# for each instance, the largest number of VPCs over those rows
positive_depth_mask = df.index.get_level_values(1) > 0
max_num_vpc_df = df.loc[positive_depth_mask, col_num_vpc].groupby(level=0).max()

# Smallest of those per-instance maxima among the selected gap instances
min_of_max_num_vpc = max_num_vpc_df.loc[selected_gap_instances].min()
print("Across selected_gap_instances, minimum number of VPCs generated across all depths: ", min_of_max_num_vpc)

### `selected_time_instances_dict` and `all6_instances_dict` (instance → original index): Select instances for time tables

Criteria to filter instances for reporting time:
* ip opt val is known
* lp opt < ip opt
* max(nrows, ncols) ≤ 5K
* optimal solution should not be found by any of the partial trees
* either lp opt < disj lb or disj lb < disj ub
* PRLP is feasible and solves within timelimit for at least one of the attempts
* min(Gur, Gur+V) < 3600 (Gurobi is able to solve the instance to optimality within an hour either with or without using VPCs)

6 trees set
* all six partial tree sizes produced VPCs and solved within timelimit (to make # of instances consistent across depths)

In [None]:
## Select instances for time tables
#
# Criteria to filter instances for reporting time:
# * ip opt val is known
# * lp opt < ip opt 
# * max(nrows, ncols) ≤ 5K
# * optimal solution should not be found by any of the partial trees
# * either lp opt < disj lb or disj lb < disj ub
# * PRLP is feasible and solves within timelimit for at least one of the attempts
# * min(Gur,Gur+V) < 3600 (Gurobi is able to solve the instance to optimality within an hour either with or without using VPCs)
#
# 6 trees set
# * all six partial tree sizes produced VPCs 
# * ** and solved within timelimit (to make number of instances consistent across depths)

# Constants
PRINT_SKIP_REASON = False  # verbosity flag for the "Skipping instance ..." messages

# Information to save (each dict maps instance name -> position of the instance in `inst_set`)
selected_time_instances_dict = {}   # instances kept for time reporting
all6_instances_dict = {}            # subset of the above that qualifies for the "6 trees" set
skipped_instances_dict = {}         # instances skipped because of the time limit check
error_instances_dict = {}           # declared for error cases; not filled in this cell

num_timeouts = 0      # number of instances skipped because of the time limit check
num_time_errors = 0   # error counter; only reported in the summary, not incremented in this cell

# Start from the instances already selected for the gap-closed tables
inst_set = selected_gap_instances
# inst_set = ['lotsize_presolved']
# inst_set = ['neos-3754480-nidda_presolved']
# Selection loop for the time tables.
# An instance is kept for time reporting unless the best solve time (with or without VPCs)
# reaches MAX_TIME. Kept instances also enter the "6 trees" set when six rows produced VPCs
# and none of the runs with VPCs timed out.
for i, inst in enumerate(inst_set):
    print("{}/{}".format(i+1,len(inst_set)), end='\r', flush=True)
    skip_instance = False
    curr_df = df.loc[inst]
    curr_df = curr_df[curr_df.index.get_level_values(0) >= 0]  # drop rows with a negative index

    # Best time without VPCs over the rows with index labels 2 through 64
    mintime_gur = float(curr_df.loc[2:64,ref_timeout_col].min())

    # Best time with VPCs over the same rows
    mintime_gur7 = float(curr_df.loc[2:64,refv_timeout_col].min())

    # Skip the instance only if neither variant finishes before MAX_TIME
    mintime = min(mintime_gur, mintime_gur7)
    if mintime >= MAX_TIME:
        if PRINT_SKIP_REASON:
            # Report the threshold that the comparison above actually uses
            print("{:d}: Skipping instance {:d} -- {}: Gurobi's best time (with or without VPCs) is {:.7f} >= {:.7f}.".format(
                    len(skipped_instances_dict), i, inst, mintime, MAX_TIME
                ))
        skip_instance = True
        skipped_instances_dict[inst] = i
        num_timeouts += 1
        df_rejection_reason.loc[inst, 'GUR_TIMEOUT'] += 1

    # Check how many times VPCs were successfully generated
    num_successful_attempts = 0
    curr_num_timeouts = 0
    has_zero = False
    for curr_index, row in curr_df.iterrows():
        if row[col_num_disj_terms] == 0:
            # Row without a disjunction (the "bb0" entry that every instance must have)
            has_zero = True
            continue

        num_vpc = float(row[col_num_vpc])
        num_successful_attempts += (num_vpc > 0)

        # Time with VPCs for this row, read from the row already at hand
        curr_time = float(row[refv_timeout_col])
        if curr_time > MAX_TIME - EPS:
            curr_num_timeouts += 1
            # NOTE(review): the status is only overwritten while it still equals DEFAULT_STATUS.
            # The gap-selection cell appears to replace that default for every row with a
            # positive index, so this branch may never fire -- confirm the intended condition.
            if df_status_by_depth.loc[inst, int(curr_index)] == DEFAULT_STATUS:
                df_status_by_depth.loc[inst, int(curr_index)] = map_rejection_reason_to_number['GUR_TIMEOUT']

    if not has_zero:
        raise ValueError(
            "*** ERROR: Instance {:d} -- {}: has no bb0 entry.".format(
                i, inst
            )
        )

    if not skip_instance:
        if num_successful_attempts == 6 and curr_num_timeouts == 0:
            all6_instances_dict[inst] = i
        else:
            df_rejection_reason.loc[inst, 'SELECTED_6TREES'] = False
        selected_time_instances_dict[inst] = i
    else:
        df_rejection_reason.loc[inst, 'SELECTED_TIME'] = False
        df_rejection_reason.loc[inst, 'SELECTED_6TREES'] = False

# Live views of the instance names in each set
selected_time_instances = selected_time_instances_dict.keys()
all6_instances = all6_instances_dict.keys()

num_selected_time_instances = len(selected_time_instances)
num_all6_instances = len(all6_instances)

for summary_line in (
    "Total number of errors: {}".format(num_time_errors),
    "Total number of timeouts: {}".format(num_timeouts),
    "Total number of instances for time reporting: {}/{}".format(num_selected_time_instances, len(inst_set)),
    "Total number of \"6 trees\" instances: {}".format(num_all6_instances),
):
    print(summary_line)

# Pure binary instances kept for time reporting, in the order of `pure_binary_instances`
binary_x_time_instances = [
    name for name in pure_binary_instances if name in selected_time_instances_dict
]
print("Number of pure binary instances for time reporting: {}".format(len(binary_x_time_instances)))

# "6 trees" instances that are also pure binary, in the insertion order of `all6_instances_dict`
all6_binary_x_time_instances = [
    name for name in all6_instances_dict if name in binary_x_time_instances
]
print("Number of pure binary instances in all-6 set: {}".format(len(all6_binary_x_time_instances)))

#### DEBUG (check which instances were selected but do not have all six runs)

In [None]:
## DEBUG: instances kept for time reporting that are missing from the "6 trees" set
not_all_6 = [name for name in selected_time_instances_dict if name not in all6_instances_dict]
not_all_6

In [None]:
import csv

# Output locations (relative to the current working directory)
gap_file_path = 'selected_gap_instances.csv'
time_file_path = 'selected_time_instances.csv'
rejected_file_path = 'rejected_instances.csv'

# Instance names in each selection, in insertion order
selected_gap_keys = list(selected_gap_instances_dict)
selected_time_keys = list(selected_time_instances_dict)

# Each file is a single-column CSV: one header row, then one instance name per row
for out_path, header, keys in (
    (gap_file_path, 'Selected Gap Instances', selected_gap_keys),
    (time_file_path, 'Selected Time Instances', selected_time_keys),
):
    with open(out_path, 'w', newline='') as out_file:
        writer = csv.writer(out_file)
        writer.writerow([header])
        for key in keys:
            writer.writerow([key])

# Full rejection bookkeeping table, index included
df_rejection_reason.to_csv(rejected_file_path)

print('Selected instances have been written to the CSV files.')


# Section 2: Gap closed tables

### `calc_gap_closed` function

In [None]:
## Calculate gap closed for GMICs, Gurobi, and VPCs
def calc_gap_closed(gap_df, col):
    """Return the percent gap closed by the bound stored in column `col`.

    For each row the result is
        100 * (gap_df[col] - LP objective) / (IP objective - LP objective)
    where the LP and IP objectives are read from the `col_lp_obj` and `col_ip_obj` columns.
    Rows whose bound has magnitude at most EPS, has magnitude at least INFINITY, or is NaN
    get 0.0 instead.

    The ratio is evaluated for every row before the mask is applied, so rows with
    IP objective == LP objective still go through the division.
    """
    bound = gap_df[col]
    magnitude = bound.abs()
    # Usable bound: nonzero (beyond EPS) and finite (below INFINITY), for either sign
    is_usable = (magnitude > EPS) & (magnitude < INFINITY)
    pct_closed = 100. * (bound - gap_df[col_lp_obj]) / (gap_df[col_ip_obj] - gap_df[col_lp_obj])
    return np.where(is_usable, pct_closed, 0.0)

# def calc_gap_closed2(gap_df, col):
#     conditions = gap_df[col] > EPS & np.isfinite(gap_df[col])
#     choices = 100. * (gap_df[col] - gap_df["LP OBJ"]) / (gap_df["IP OBJ"] - gap_df["LP OBJ"])
#     return np.select(conditions, choices, default=0.0)

### `gap_df`: Calculate gap closed for GMICs, Gurobi, and VPCs

In [None]:
# Columns of `df` that matter for the gap closed calculations
gap_base_cols = (
    [col_num_disj_terms, col_num_rows, col_num_cols]
    + obj_val_col_list
    + [col_num_gmic, col_num_vpc, col_num_obj, col_exit_reason]
)
gap_df = df.loc[:, gap_base_cols]

# Add the % gap closed column for each cut type:
#   gap closed = 100 * (post_cut_opt_val - lp_opt_val) / (ip_opt_val - lp_opt_val)
# A cut type without an objective column in `df` is skipped, except 'MAX(GMIC,VPC)',
# which is derived from the GMIC and VPC gap closed columns (these must already exist).
for cut_type in reg_cut_type_long_list:
    col = cut_type + ' ' + obj_stub
    gap_closed_col = cut_type + ' ' + pct_gap_closed_stub
    if col in df.columns:
        gap_df[gap_closed_col] = calc_gap_closed(gap_df, col)
    elif cut_type == 'MAX(GMIC,VPC)':
        gmic_gap_closed = gap_df['GMIC' + ' ' + pct_gap_closed_stub]
        vpc_gap_closed = gap_df['VPC' + ' ' + pct_gap_closed_stub]
        gap_df[gap_closed_col] = np.maximum(gmic_gap_closed, vpc_gap_closed)

# Same calculation for the reference solver bounds:
# first three stat types x every solver stub x every solver cut type stub
for stat_type in solver_stat_list[:3]:
    for solver_type in solver_stubs:
        for cut_type in solver_cut_type_stubs:
            col = ' '.join((stat_type, solver_type, cut_type))
            gap_df[col + ' ' + pct_gap_closed_stub] = calc_gap_closed(gap_df, col)

# Spot check on two instances
display(gap_df.loc[['bm23_presolved','maxgasflow_presolved']][gap_cols])
display(gap_df.loc[("bm23_presolved",2)])

### `selected_gap_df`: Gap closed for selected instances, adding 0-row that has best for `V+` cols

In [None]:
## `selected_gap_df`: Gap closed for selected instances, adding 0-row that has best for `V+` cols
## Take the rows of gap_df that belong to the selected instances, then for every instance:
## * overwrite the '0' row with the column-wise maximum of `gap_cols` over the instance's rows
##   (so it holds the best result for each method, including not using VPCs at all)
## * copy the reference-solver gap columns from the '0' row into every positive-index row
## * for a run that produced no VPCs, fill the "with VPCs" reference columns
##   with the values obtained without them
selected_gap_df = gap_df.loc[selected_gap_instances]

# From https://pandas.pydata.org/docs/user_guide/advanced.html#defined-levels
# "The MultiIndex keeps all the defined levels of an index, even if they are not actually used.
# When slicing an index, you may notice this."
# Even without using remove_unused_levels, index was correct with selected_gap_df.index.get_level_values(0).unique()
# Dropping the unused levels matters below because the loop iterates over index.levels[0]
selected_gap_df.index = selected_gap_df.index.remove_unused_levels()

# Do we update the value of the "best" in each column when no VPCs are generated for a run and we use the "no-VPCs" data?
# This may cause the stats in the "best" row to improve
# For example, we replace V+GurF with GurF when no VPCs are generated, since that is what would occur without VPCs
# But if GurF is better than any V+GurF when VPCs are produced, then the average in the max-row is inflated
SHOULD_UPDATE_MAX_WHEN_NO_VPCS = False

# inst_set = selected_gap_df.index.get_level_values(0).unique()
inst_set = selected_gap_df.index.levels[0]
num_inst = len(inst_set)
for curr_inst_ind, inst in enumerate(inst_set):
    print("{}/{}".format(curr_inst_ind+1,num_inst), end='\r', flush=True)
    # curr_df is a snapshot: it keeps the instance's values as they were before the writes below
    curr_df = selected_gap_df.loc[inst].copy() # copy needed to not throw SettingWithCopyWarning
    curr_df = curr_df[curr_df.index.get_level_values(0) >= 0] # to remove disjset = -1 row, for example

    # Set 0-row to have max values across all rows for this instance
    max_vals = curr_df[gap_cols].max()
    selected_gap_df.loc[(inst,0),gap_cols] = max_vals

    for ind in curr_df.index:
        if ind <= 0:
            continue

        # Propagate the reference-solver columns (GurF and GurL) down from the 0-row.
        # The values come from the snapshot, i.e. from the 0-row before it was replaced by the max.
        sel_gap = ref_solver_gap_cols
        selected_gap_df.loc[(inst,ind),sel_gap] = curr_df.loc[0,sel_gap]

        # If no VPCs produced, the values for V+GurF and V+GurL have not been provided
        # We replace these by GurF and GurL
        # Currently disabled: update max for that column too (if disabled, we instead keep max as the value among those that generated VPCs)
        num_vpc = curr_df.loc[ind,col_num_vpc]
        if num_vpc == 0:
            ref_gap = [col_first_cut_pass_gap_ref, col_last_cut_pass_gap_ref] # this is where we pull info from
            # Positions of the reference columns in gap_cols; only used by the disabled branch below
            refinds = [gap_cols.index(colname) for colname in ref_gap] 
            sel_gap = [col_first_cut_pass_gap_ref_v, col_last_cut_pass_gap_ref_v] # this is where we put the info
            # to_numpy() makes the assignment positional instead of aligning on column labels
            selected_gap_df.loc[(inst,ind),sel_gap] = curr_df.loc[0,ref_gap].to_numpy()

            if SHOULD_UPDATE_MAX_WHEN_NO_VPCS:
                # NOTE(review): gap_cols[i+1] assumes each "with VPCs" column sits immediately
                # after its reference column in gap_cols -- confirm against the definition of gap_cols
                for i in refinds:
                    if curr_df.loc[0,gap_cols[i]] > selected_gap_df.loc[(inst,0),gap_cols[i+1]]:
                        selected_gap_df.loc[(inst,0),gap_cols[i+1]] = curr_df.loc[0,gap_cols[i]]

display(selected_gap_df.head(21).loc[:,[col_num_vpc]+gap_cols])

#### DEBUG: Why REF+V is less than REF

In [None]:
### DEBUG
# Why REF+V < REF: inspect one instance's bounds and gap closed values side by side

# Other instances previously inspected: 'f2gap801600_presolved', 'neos-1112787_presolved'
inst = 'neos-1330346_presolved'

# Number of VPCs, first-cut-pass bounds (without / with VPCs), all gap columns, LP and IP objectives
debug_cols = (
    [col_num_vpc]
    + [col_first_cut_pass_bound_ref, col_first_cut_pass_bound_ref_v]
    + gap_cols
    + [col_lp_obj, col_ip_obj]
)
tmp_df = gap_df.loc[inst, debug_cols]

# Recompute the first-cut-pass gap closed with VPCs directly from the bound column
recomputed_gap = calc_gap_closed(tmp_df, col_first_cut_pass_bound_ref_v)
tmp_df[col_first_cut_pass_gap_ref_v] = recomputed_gap
display(tmp_df)

# display(gap_df.loc[inst,['NUM VPC']+['BEST REF FIRST_CUT_PASS']+['FIRST REF+V FIRST_CUT_PASS']+gap_cols])

# display(selected_gap_df.loc[inst,['NUM VPC']+['BEST REF FIRST_CUT_PASS']+['FIRST REF+V FIRST_CUT_PASS']+gap_cols])

### `best_gap_df`: For each instance, what the best gap closed is (and how that was obtained)

The columns are defined by:
* `gap_cols_short`
* the best disjunction for 'V', 'V+G', 'V+GurF', 'V+GurL'
* number of VPCs leading to best 'V' result
* number of GMICs for the instance

In [None]:
## Create best df = for each instance, what the best gap closed is (and how that was obtained)

# One row per instance present in selected_gap_df
inst_set = selected_gap_df.index.levels[0]

# Columns: best gap per short method name, then the row index giving the best result
# for four of the methods, then the VPC and GMIC counts
best_gap_df_cols = gap_cols_short + [
    col_best_disj_vpc,
    col_best_disj_gmic_vpc,
    col_best_disj_refv_first_cut_pass,
    col_best_disj_refv_last_cut_pass,
    col_num_vpc,
    col_num_gmic,
]
best_gap_df = pd.DataFrame(columns=best_gap_df_cols, index=inst_set, dtype=float)

# Methods for which the best row index is tracked, and their positions in gap_cols_short
best_disj_short_types = ['V', 'V+G'] + refv_cut_type_short_list
ind_of_gap_for_best_disj = list(map(gap_cols_short.index, best_disj_short_types))

# Fill best_gap_df row by row: for each instance, scan its rows in selected_gap_df and keep,
# per method, the largest gap closed together with the row index at which it was attained.
num_inst = len(inst_set)
for i, inst in enumerate(inst_set):
    print("{}/{}".format(i+1,num_inst), end='\r', flush=True)

    # -1 marks "nothing recorded yet"
    best_gaps = [-1] * len(gap_cols_short)
    best_disj = [-1] * len(best_disj_short_types)
    best_num_vpc = -1
    best_num_gmic = -1

    curr_df = selected_gap_df.loc[inst]

    for index, row in curr_df.iterrows():
        num_disj_terms = int(row[col_num_disj_terms])
        num_vpc        = float(row[col_num_vpc])

        # Rows without a disjunction never contribute
        if num_disj_terms <= 0:
            continue

        # Rows without VPCs can only improve the 'DB' entry
        if num_vpc <= 0:
            db_ind = gap_cols_short.index('DB')
            db_candidate = row[col_best_disj]
            if best_gaps[db_ind] < db_candidate:
                best_gaps[db_ind] = db_candidate
            continue

        curr_gaps = [ float(row[map_short_to_cols_gap[col]]) for col in gap_cols_short ]
        num_gmic    = float(row[col_num_gmic])

        # Update best gap closed for each method
        for j, (short_name, gap) in enumerate(zip(gap_cols_short, curr_gaps)):
            if not gap > best_gaps[j]:
                continue
            best_gaps[j] = gap
            if j in ind_of_gap_for_best_disj:
                # Remember which row produced the best value for this tracked method
                best_disj[ind_of_gap_for_best_disj.index(j)] = index
            if short_name == 'V':
                # Cut counts are reported for the row that gives the best 'V' result
                best_num_vpc = num_vpc
                best_num_gmic = num_gmic

    # Anything below EPS (including the -1 sentinel) is reported as 0
    best_gaps = [ gap if gap >= EPS else 0. for gap in best_gaps ]
    best_gap_df.iloc[i] = best_gaps + best_disj + [best_num_vpc, best_num_gmic]

# Columns holding row indices and cut counts are stored as integers rather than floats
int_col_list = [
    col_best_disj_vpc,
    col_best_disj_gmic_vpc,
    col_best_disj_refv_first_cut_pass,
    col_best_disj_refv_last_cut_pass,
    col_num_vpc,
    col_num_gmic,
]
best_gap_df = best_gap_df.astype({name: np.int64 for name in int_col_list})

display(best_gap_df)

#### DEBUG: Look at `best_gap_df` entries

In [None]:
# Optional export of the full table:
# best_gap_df.to_csv('best_gap.csv')
# Instances to inspect; each name must be in the index of best_gap_df
tmp_inst_set = ['cap6000_presolved','neos-1330346_presolved']
# Last expression of the cell, so the notebook renders these rows
best_gap_df.loc[tmp_inst_set]

#### DEBUG: In `best_gap_df`, can get V > V+G due to numerical issues

In [None]:
## DEBUG: You can get V > V+G due to numerical issues

col1 = best_gap_df['V']
col2 = best_gap_df['V+G']

# Rows where 'V' beats 'V+G' by more than the EPS tolerance
v_exceeds_vg = col1 > col2 + EPS
display(best_gap_df[v_exceeds_vg])

# Raw data for one instance (previously inspected: 'neos-1058477_presolved')
df.loc['seymour-disj-10_presolved']

#### DEBUG: Find instances in which V+GurF max does not match up

In [None]:
## DEBUG: Find instances in which V+GurF max does not match up
# A mismatch here makes the Table 2 'Best' row disagree with the Table 1 'All' row.
#
# Example: for f2gap801600_presolved the gap closed at the end of the root node is 0%
# whenever VPCs are used, but 50% without VPCs.
# In `best_gap_df`, for an instance in which no VPCs were generated,
# the GurF/GurL value is used for V+GurF/V+GurL.
# In `selected_gap_df`, the "zero" row is what gets compared against below
# (presumably the per-instance best value across disjunction sizes; the original note was cut off here).

col = 'V+GurF'
origcol = map_short_to_cols_gap[col]
num_inst = len(best_gap_df.index)

num_errors = 0
avg1 = 0
avg2 = 0
for inst in best_gap_df.index:
    val1 = best_gap_df.loc[inst,col]
    val2 = selected_gap_df.loc[(inst,0),origcol]

    # Running averages over all instances, one from each table
    avg1 += val1 / num_inst
    avg2 += val2 / num_inst

    abs_diff = abs(val1-val2)
    if abs_diff > EPS:
        print("{} has best_gap_df = {:f} and selected_gap_df = {:f} for col {} (diff = {:e})".format(inst,val1,val2,col,abs_diff))
        num_errors += 1

print("Average from best_gap_df = {}".format(avg1))
print("Average from selected_gap_df = {}".format(avg2))
print("Total # of errors =", num_errors, flush=True)

#### DEBUG: Print relevant info from `selected_gap_df` and `best_gap_df` to further debug

In [None]:
## DEBUG
# inst = 'f2gap801600_presolved'
# inst = 'neos22_presolved'
# inst = 'neos-1112787_presolved'
# display(best_gap_df.loc[inst])
# display(selected_gap_df.loc[inst,[col_num_vpcs]+gap_cols])

In [None]:
# ## DEBUG
# gap_cols = [
#     'GMIC % GAP CLOSED',
#     'BEST DISJ % GAP CLOSED',
#     'VPC % GAP CLOSED',
#     'VPC+GMIC % GAP CLOSED',
#     'REF FIRST_CUT_PASS % GAP CLOSED',
#     'REF+V FIRST_CUT_PASS % GAP CLOSED',
#     'REF LAST_CUT_PASS % GAP CLOSED',
#     'REF+V LAST_CUT_PASS % GAP CLOSED',
# ]
# col_num_vpcs = 'NUM VPC'

inst = 'f2gap801600_presolved'
tmp_selected_gap_df = gap_df.loc[selected_gap_instances_dict.keys()]

# Only proceed when `inst` appears in the first index level of the selection
instances_present = tmp_selected_gap_df.index.get_level_values(0).unique()
if inst in instances_present:
    # copy needed to not throw SettingWithCopyWarning
    curr_df = tmp_selected_gap_df.loc[inst].copy()

    # Column-wise maxima across all rows for this instance
    # (candidate values for the 0-row; the write-back stays disabled)
    max_vals = curr_df[gap_cols].max()
    # selected_gap_df.loc[(inst,0),gap_cols] = max_vals

    display(tmp_selected_gap_df.loc[inst])
    display(max_vals)
else:
    print(ValueError("Instance {} is not in tmp_selected_gap_df".format(inst)))

### Table 1: `avg_gap_df`: average percent gap closed across different combinations of cuts

In [None]:
## TABLE 1: average percent gap closed across different combinations of cuts
## Create avg_gap_df = average gap closed across instances

# Labels: instance sets (outer index level) and the two rows reported per set
all_set_name = 'All'
good_vpc_set_name = tex_escape('≥10%')
binary_set_name = 'Binary'
avg_row_name = tex_escape('Avg (%)')
wins_row_name = 'Wins'

idx = pd.MultiIndex.from_product(
    [
        [all_set_name, good_vpc_set_name, binary_set_name],
        [avg_row_name, wins_row_name],
    ],
    names = ['Set', ''],
)

nrows = len(idx)
ncols = len(best_gap_df.columns)

# Subset with 'V' >= 10
col = best_gap_df['V'].astype(float)
good_vpc_df = best_gap_df[col >= 10.]

# Subset restricted to `binary_x_gap_instances`
binary_instances_df = best_gap_df.loc[binary_x_gap_instances]

# Even rows (0, 2, 4) receive the per-column means of the three sets;
# odd rows stay zero for now and are blanked out further down.
data = np.zeros((nrows, ncols), dtype=float)
for _row, _subset_df in zip((0, 2, 4), (best_gap_df, good_vpc_df, binary_instances_df)):
    data[_row,:] = [_subset_df[c].mean() for c in best_gap_df.columns]

# display(best_gap_df.head())
avg_gap_df = pd.DataFrame(
    data,
    columns = best_gap_df.columns,
    index = idx,
    dtype = object,
)

# Extra column with the number of instances in each set
inst_col_name = '# inst'
avg_gap_df[inst_col_name] = [len(best_gap_df), 0, len(good_vpc_df), 0, len(binary_instances_df), 0]

# The 'Wins' rows start out as empty strings (ncols data columns + the '# inst' column)
for _row in (1, 3, 5):
    avg_gap_df.iloc[_row] = [""] * (ncols+1)

display(avg_gap_df)

### `wins_df`: num wins

In [None]:
## Create num wins df
# Entry (x, y) of `wins_df` counts the instances on which x > y + WINS_EPS
WINS_EPS = GAP_DIFF_EPS

wins_df = pd.DataFrame(
    np.zeros((len(gap_cols_short), len(gap_cols_short)), dtype=int),
    columns = gap_cols_short,
    index = gap_cols_short,
    dtype = int,
)

# permutations() yields every ordered pair once, so filling (ind1, ind2) per
# iteration covers the whole off-diagonal of the table.
from itertools import permutations
for (ind1, ind2) in permutations(range(len(gap_cols_short)), 2):
    winner = gap_cols_short[ind1]
    loser = gap_cols_short[ind2]
    wins_df.at[winner,loser] = int(sum(best_gap_df[winner] > best_gap_df[loser] + WINS_EPS))

# Rows of `avg_gap_df` that receive the win counts, one per instance set
all_set = (all_set_name,wins_row_name)
good_vpc_set = (good_vpc_set_name,wins_row_name)
binary_set = (binary_set_name,wins_row_name)

# (destination column, reference column): wins reported under the destination
# column are counted relative to the reference column.
win_pairs = [
    ('G', 'V'),            # "G" are wins relative to "V"
    ('DB', 'G'),           # "DB", "V", "V+G": wins are relative to "G"
    ('V', 'G'),
    ('V+G', 'G'),
    ('V+GurF', 'GurF'),    # "V+GurF" are wins relative to "GurF"
    ('V+GurL', 'GurL'),    # "V+GurL" are wins relative to "GurL"
]
for shortdestcol, shortrefcol in win_pairs:
    refcol = shortrefcol
    destcol = shortdestcol

    if shortdestcol == 'V+GurL':
        # For this pair only, the two `wins_df` entries are (re)computed explicitly
        wins_df.at[shortdestcol,shortrefcol] = int(sum(best_gap_df[destcol] > best_gap_df[refcol] + WINS_EPS))
        wins_df.at[shortrefcol,shortdestcol] = int(sum(best_gap_df[refcol] > best_gap_df[destcol] + WINS_EPS))

    avg_gap_df.at[all_set,shortdestcol] = wins_df.at[shortdestcol,shortrefcol]
    avg_gap_df.at[good_vpc_set,shortdestcol] = sum(good_vpc_df[destcol] > good_vpc_df[refcol] + WINS_EPS)
    avg_gap_df.at[binary_set,shortdestcol] = sum(binary_instances_df[destcol] > binary_instances_df[refcol] + WINS_EPS)

# Count number of instances that have V+G > 0 (stored in the '# inst' column of the wins rows)
shortdestcol = inst_col_name
destcol = 'V+G'
avg_gap_df.at[all_set,shortdestcol] = sum(best_gap_df[destcol] > WINS_EPS)
avg_gap_df.at[good_vpc_set,shortdestcol] = sum(good_vpc_df[destcol] > WINS_EPS)
avg_gap_df.at[binary_set,shortdestcol] = sum(binary_instances_df[destcol] > WINS_EPS)

display(avg_gap_df)
display(wins_df)

In [None]:
# Export the binary-instance subset of `best_gap_df` to 'binary_instances.csv'
# (relative path, so the file lands in the current working directory).
binary_instances_df.to_csv('binary_instances.csv')

### Analyze instances in which DB > G but V <= G

In [None]:
# Step 1: keep instances where 'DB' beats 'G' by more than GAP_DIFF_EPS
col1 = 'DB'
col2 = 'G'
db_beats_g = best_gap_df[col1] > best_gap_df[col2] + GAP_DIFF_EPS
tmp_df = best_gap_df.loc[db_beats_g]

# Step 2: of those, keep instances where 'V' does not beat 'G' by more than GAP_DIFF_EPS
col1 = 'V'
v_not_above_g = tmp_df[col1] <= tmp_df[col2] + GAP_DIFF_EPS
tmp_df = tmp_df[v_not_above_g]
display(tmp_df.head())

# (instance, best VPC disjunction) pairs used to look up rows of `df`
# inst_set = tmp_df.index
inst_depth_set = [(inst,tmp_df.at[inst,col_best_disj_vpc]) for inst in tmp_df.index]

print("Total num inst with DB > G >= V is {:d}".format(len(tmp_df)))
exit_reasons = df.loc[inst_depth_set,col_exit_reason]
print("Num times hit cut limit = {:d}".format(sum(exit_reasons == 'CUT_LIMIT')))

# display(df.loc[inst_depth_set])


### Analyze instances in which V+G <= G

In [None]:
# Instances where 'V+G' does not beat 'G' by more than GAP_DIFF_EPS
col1 = 'V+G'
col2 = 'G'
vg_not_above_g = best_gap_df[col1] <= best_gap_df[col2] + GAP_DIFF_EPS
tmp_df = best_gap_df.loc[vg_not_above_g]

display(tmp_df)

# (instance, best VPC disjunction) pairs used to look up rows of `df`
inst_depth_set = [(inst,tmp_df.at[inst,col_best_disj_vpc]) for inst in tmp_df.index]

print("Total num inst with V+G <= G is {:d}".format(len(tmp_df)))

# How many of these already have (nearly) all of the gap closed by 'G' alone
refval = 100. - GAP_DIFF_EPS
num_g_above_refval = sum(tmp_df['G'] > refval)
print("Num times with G > {}% gap closed = {:d}".format(refval, num_g_above_refval))

num_vg_zero = sum(tmp_df['V+G'] == 0.)
print("Num times with V+G = 0% gap closed = {:d}".format(num_vg_zero))

exit_reasons = df.loc[inst_depth_set,col_exit_reason]
print("Num times hit cut limit = {:d}".format(sum(exit_reasons == 'CUT_LIMIT')))

# display(df.loc[inst_depth_set])

### Analyze when G > V

In [None]:
# Instances where 'G' beats 'V' by more than GAP_DIFF_EPS
col1 = 'G'
col2 = 'V'
g_beats_v = best_gap_df[col1] > best_gap_df[col2] + GAP_DIFF_EPS
tmp_df = best_gap_df.loc[g_beats_v]

display(tmp_df)

# (instance, best VPC disjunction) pairs used to look up rows of `df`
inst_depth_set = [(inst,tmp_df.at[inst,col_best_disj_vpc]) for inst in tmp_df.index]

print("Total num inst with G > V is {:d}".format(len(tmp_df)))

# Row masks over `tmp_df`: fewer than 10 VPCs, and additionally more than 10 GMICs
few_vpcs = tmp_df[col_num_vpc] < 10
few_vpcs_many_gmics = few_vpcs & (tmp_df[col_num_gmic] > 10)
print("Num times with #V < 10 is {:d}".format(len(tmp_df[few_vpcs])))
print("Num times with #V < 10 while #G > 10 is {:d}".format(len(tmp_df[few_vpcs_many_gmics])))
# print("Num times with #V < 10 is {:d}".format(sum(tmp_df['NUM VPC'] < 10)))

# print("Num times with V+G = 0% gap closed = {:d}".format(sum(tmp_df['V+G'] == 0.)))
exit_reasons = df.loc[inst_depth_set,col_exit_reason]
print("Num times hit cut limit = {:d}".format(sum(exit_reasons == 'CUT_LIMIT')))

# Same cut-limit check, restricted to the "#V < 10 while #G > 10" instances
tmp_inst_set = tmp_df[few_vpcs_many_gmics].index
tmp_inst_depth_set = [(inst,tmp_df.at[inst,col_best_disj_vpc]) for inst in tmp_inst_set]
tmp_exit_reasons = df.loc[tmp_inst_depth_set,col_exit_reason]
print("Num times hit cut limit when #G > #V = {:d} (should be 0)".format(sum(tmp_exit_reasons == 'CUT_LIMIT')))

# display(df.loc[inst_depth_set])

### Analyze when DB % gap closed nontrivial

In [None]:
# For each threshold MIN_DISJ_GAP in (0, 1, 10):
#   1. select the rows of `selected_gap_df` whose value in `col_best_disj_gap` is at least MIN_DISJ_GAP,
#   2. report how many distinct instances remain (out of all instances in `selected_gap_df`),
#   3. display the mean of each gap column, grouped by the 'disj_terms' index level.

# Total number of instances, taken from the first index level of `selected_gap_df`.
# (Earlier this count was read from an unrelated global, `inst_set`, which other
# cells rebind; `inst_set_orig` is the value that was meant.)
inst_set_orig = selected_gap_df.index.levels[0]
num_inst_orig = len(inst_set_orig)

for MIN_DISJ_GAP in (0., 1., 10.):
    tmp_df = selected_gap_df.loc[selected_gap_df[col_best_disj_gap] >= MIN_DISJ_GAP]

    # Collect instance names with best disj gap >= MIN_DISJ_GAP
    tmp_df.index = tmp_df.index.remove_unused_levels()
    inst_set_db = tmp_df.index.get_level_values(0).unique()
    num_inst_db = len(inst_set_db)
    print("Total num inst with best disj gap >= {:f} is {:d} (out of {:d} total instances).".format(MIN_DISJ_GAP, num_inst_db, num_inst_orig))

    # Report average in each column broken down by depth
    tmp_df_grouped = tmp_df.groupby(level='disj_terms').mean(numeric_only=True)
    display(tmp_df_grouped[gap_cols])


### Table 2: `gap_by_size_df`: gap closed by num leaves

In [None]:
## TABLE 2: gap closed by num leaves
## Note that ``best'' can be worse than for a single row
## because when no VPCs are generated, we assume the "no VPCs" results hold for Gurobi,
## but we do not count that for the ``best'' calculation,
## since otherwise there is potential bias, as sometimes Gurobi does better without VPCs
shortcols = [
        'DB',
        'V',
        'max(G,V)',
        'V+G'
      ] + solver_cut_type_short_list[1]

# One row per entry of `sizes`, plus a leading 0 row and a trailing 'Best' row
gap_by_size_df = pd.DataFrame(
    columns = shortcols,
    index = [0] + sizes + ['Best'],
    # index = [str(size) + " leaves" for size in sizes]+['Best'],
    dtype = float,
)
zero_row_name = 0

# `grouped_df` will collect gap closed across instances, grouped by num terms
grouped_df = selected_gap_df.groupby(level='disj_terms').mean(numeric_only=True)
ungrouped_df = best_gap_df.mean(numeric_only=True)

# For each of the columns (in shortcols),
# save the average value for each size
# (this will put in the right place as the index is based on sizes for both)
for col in shortcols:
    orig_col = map_short_to_cols_gap[col]
    #gap_by_size_df.loc[2]['DB'] = best_gap_df[orig_col].mean()
    gap_by_size_df[col] = grouped_df[orig_col]

# Fill in the 'Best' row, since that is currently stored in `gap_by_size_df` in the "0" row
gap_by_size_df.loc['Best'] = gap_by_size_df.loc[zero_row_name]

# Now update the zero row with correct values.
# All writes go through a single `.loc[row, col]` call: chained assignment
# (`df[col][row] = value`) may write to a temporary and is a no-op under
# pandas Copy-on-Write.
for col in ('DB', 'V'):
    gap_by_size_df.loc[zero_row_name, col] = 0.

# The zero row of each 'V+<stub>' column takes the overall mean of '<stub>' from `best_gap_df`
stubs = ['G', 'GurF', 'GurL']
for stub in stubs:
    col = 'V+'+stub
    # orig_col = map_short_to_cols[stub]
    gap_by_size_df.loc[zero_row_name, col] = ungrouped_df[stub]

# Also replace the 0-row of the "max(G,V)" column with the value of G, since that corresponds to no VPCs
# (the 0-row of 'V+G' was set to the mean of 'G' just above)
gap_by_size_df.loc[zero_row_name, 'max(G,V)'] = gap_by_size_df.loc[zero_row_name, 'V+G']

# Reindex to add "leaves" to index
idx = ['VPCs disabled']+[str(size) + " leaves" for size in sizes]+['Best']
reidx = {old_id : new_id for old_id, new_id in zip(gap_by_size_df.index,idx)}
gap_by_size_df.rename(reidx, inplace=True)

# display(grouped_df[gap_cols])
display(ungrouped_df)
display(gap_by_size_df)

# Create new df with additional columns:
# (1) the ratio 'V'/'DB'
# (2) the ratio 'max(G,V)'/'V+G'
gap_by_size_df_new = gap_by_size_df.copy()
gap_by_size_df_new['V/DB'] = gap_by_size_df_new['V'] / gap_by_size_df_new['DB']
gap_by_size_df_new['max(G,V)/V+G'] = gap_by_size_df_new['max(G,V)'] / gap_by_size_df_new['V+G']
display(gap_by_size_df_new)

### Table 5: `all_gap_results_df`: complete gap closed results

In [None]:
# Build `all_gap_results_df`: one row per instance with problem size, number of
# cuts and % gap closed, followed by an "Average" row and a "Wins" row.

# Instances = first index level of `selected_gap_df`; the level is renamed in place
inst_set = selected_gap_df.index.levels[0]
inst_set.set_names("Instance",inplace=True)

# Two-level column header: group label on top, individual column name below
col_idx = pd.MultiIndex.from_arrays(
    [
        ['', '', '# cuts', '# cuts'] + ['% gap closed']*len(gap_cols_short),
        ['Rows', 'Cols', 'G', 'V'] + gap_cols_short
    ],
)

# Object dtype so that numbers and strings can share a column later on
all_gap_results_df = pd.DataFrame(
    columns = col_idx,
    index = inst_set,
    dtype = object,
)

# Enter number of rows and cols
# (taken from the rows of `df` whose 'disj_terms' level equals 0)
tmp_df = df.xs(0, level='disj_terms').loc[inst_set,[col_num_rows, col_num_cols]]
tmp_df.columns = pd.MultiIndex.from_product([[''],['Rows','Cols']])
all_gap_results_df.loc[:,tmp_df.columns] = tmp_df

# Enter number of cuts
# tmp_df = best_gap_df.xs(0, level='disj_terms').loc[inst_set,['NUM GMIC', 'NUM VPC']]
tmp_df = best_gap_df.loc[inst_set, [col_num_gmic, col_num_vpc]]
tmp_df.columns = pd.MultiIndex.from_product([['# cuts'],['G','V']])
all_gap_results_df.loc[:,tmp_df.columns] = tmp_df

# Enter gap closed
tmp_df = best_gap_df.loc[inst_set, gap_cols_short]
tmp_df.columns = pd.MultiIndex.from_product([['% gap closed'],gap_cols_short])
all_gap_results_df.loc[:,tmp_df.columns] = tmp_df

# Add average row
# (only the '% gap closed' columns are averaged; the other columns of this row stay missing)
all_gap_results_df.loc["Average"] = all_gap_results_df.loc[:,('% gap closed',gap_cols_short)].mean()

# Now convert the % gap closed columns to objects so we can add an int row
all_gap_results_df.loc[:,('% gap closed',gap_cols_short)] = all_gap_results_df.loc[:,('% gap closed',gap_cols_short)].astype(object)

# Add wins row
# (copied from the `all_set` row of `avg_gap_df`, for the columns that have a win count)
win_gap_cols_short = ['DB', 'V', 'V+G'] + solver_cut_type_short_list[1]
all_gap_results_df.loc['Wins',('% gap closed',win_gap_cols_short)] = avg_gap_df.loc[all_set,win_gap_cols_short].values.tolist()
# all_gap_results_df.loc['Wins',('% gap closed',win_gap_cols_short)] = avg_gap_df.loc[all_set,gap_cols_short].astype(np.int64).values.tolist()
# all_gap_results_df.loc["Wins"] = avg_gap_df.loc[all_set,gap_cols_short]
# wins_df.at[cols[ind1],cols[ind2]] = int(sum(best_gap_df[cols[ind1]] > best_gap_df[cols[ind2]] + EPS))

# Replace missing entries with empty string
# NOTE(review): the `downcast` keyword of `fillna` is deprecated in recent pandas releases — confirm against the pinned version
all_gap_results_df = all_gap_results_df.fillna('',downcast=False)

# Convert rows, cols, # cuts to int values
# (instance rows only; the "Average" and "Wins" rows hold empty strings in these columns)
tmp_cols = pd.MultiIndex.from_product([[''],['Rows','Cols']])
all_gap_results_df.loc[inst_set,tmp_cols] = all_gap_results_df.loc[inst_set,tmp_cols].astype(np.int64)
tmp_cols = pd.MultiIndex.from_product([['# cuts'],['G','V']])
all_gap_results_df.loc[inst_set,tmp_cols] = all_gap_results_df.loc[inst_set,tmp_cols].astype(np.int64)

display(all_gap_results_df.tail())

# Subtract the two summary rows ("Average" and "Wins") to get the instance count
print("Num instances =",len(all_gap_results_df)-2)

# Section 3: Time tables

In [None]:
from statistics import geometric_mean

# Shifts used for the shifted geometric means: the shift is added to every value
# before taking the geometric mean and subtracted again afterwards.
SHIFT_TIME = 1          # solve-time columns
SHIFT_GEN_TIME = 1      # time columns after the first two
SHIFT_NODES = 1000      # node-count columns

# Factors for counting wins.
WIN_BY_TIME_FACTOR = 1.1    # a time win requires the other time to be more than 1.1x larger
WIN_BY_NODES_FACTOR = 1.0

## Prepare variables for row/col names

In [None]:
## Prepare variables for row/col names

# Row names: instance classes
bb_classes = ['All', '6 trees', 'Binary']
num_bb_classes = len(bb_classes)

# Row names: time buckets, given as [min, max) bounds
bucket_min = [0, 10, 100, 1000]
bucket_max = [3600, 3600, 3600, 3600]
num_buckets = len(bucket_min)
assert(len(bucket_max) == num_buckets)
bb_buckets = ['[{},{})'.format(lo, hi) for lo, hi in zip(bucket_min, bucket_max)]
# bucket_names = [classes[i] + ' [' + str(bucket_min[j]) + ',' + str(bucket_max[j]) + ')' for i in range(num_classes) for j in range(num_buckets)]
# display(bucket_names)

# Row names: the two metrics reported for each (class, bucket)
bb_metrics = ['Gmean', 'Wins']

# Column names (top level of the column header)
time_col_header = 'Time (s)'
node_col_header = 'Nodes (#)'


## Function `create_avg_bb_by_depth_df` that can be reused

In [None]:
import warnings
warnings.filterwarnings('ignore')

def create_avg_bb_by_depth_df(inst_set, DEBUG = False):
  """Build, display and return the branch-and-bound summary table broken down by disjunction size.

  The result has one row per (class, bucket, metric) combination, where the
  classes are "<size> leaves" for every entry of the global `sizes`, the buckets
  are the global `bb_buckets`, and the metrics are the global `bb_metrics`
  ('Gmean' and 'Wins'). Its columns are a two-level (criterion, type) header
  covering the time and node columns, plus one extra column (`inst_col_name`)
  with the number of rows that went into each table row.

  Data is read from the global `df`. For every size, rows are kept only if
  `col_num_vpc` > 0 and at least one of the two timeout columns is below
  `MAX_TIME`; each bucket then additionally requires both the reference time
  and the reference+VPC time to be at least the bucket minimum.

  Args:
    inst_set: row selector passed to `df.loc[inst_set, cols]`
      (presumably a collection of instance names — confirm against callers).
    DEBUG: when true, display the rows where both runs hit the time limit and
      print how many rows were selected / solved within the limit per size.

  Returns:
    The populated DataFrame described above.
  """
  # Row labels, first level: one class per disjunction size
  bb_classes_by_depth = [str(t) + ' leaves' for t in sizes]
  num_bb_classes_by_depth = len(bb_classes_by_depth)

  bb_buckets_by_depth = bb_buckets
  bb_metrics_by_depth = bb_metrics

  # Long column names as they appear in `df`, and the short names used in the table header
  cols_time_by_depth       = time_cols_long #[ref_time_col, refv_time_col, refv_w_cut_time_col]
  shortcols_time_by_depth  = [map_cols_to_short_time[col] for col in cols_time_by_depth]
  cols_nodes_by_depth      = [ref_nodes_col, refv_nodes_col]
  shortcols_nodes_by_depth = [map_cols_to_short_nodes[col] for col in cols_nodes_by_depth]

  # Column header: (criterion, type), time columns first and node columns after
  avg_bb_cols_by_depth = pd.MultiIndex.from_arrays(
      [
        [time_col_header]*len(shortcols_time_by_depth) + 
        [node_col_header]*len(shortcols_nodes_by_depth), 
        shortcols_time_by_depth + shortcols_nodes_by_depth
      ],
      names = ['criterion', 'type'])

  # Row index: every (class, bucket, metric) combination
  bb_row_names_by_depth = pd.MultiIndex.from_product(
      [bb_classes_by_depth, bb_buckets_by_depth, bb_metrics_by_depth],
      names=['class', 'bucket', 'metric'])

  tmp_avg_bb_by_depth_df = pd.DataFrame(
      columns = avg_bb_cols_by_depth,
      index = bb_row_names_by_depth,
      dtype = float
  )

  # Make all columns "object" type to allow for integer values
  tmp_avg_bb_by_depth_df.loc[:,(time_col_header,shortcols_time_by_depth)] = tmp_avg_bb_by_depth_df.loc[:,(time_col_header,shortcols_time_by_depth)].astype(object)
  tmp_avg_bb_by_depth_df.loc[:,(node_col_header,shortcols_nodes_by_depth)] = tmp_avg_bb_by_depth_df.loc[:,(node_col_header,shortcols_nodes_by_depth)].astype(object)

  # Number of rows of `df` behind each table row; filled in the same order as the
  # row index (class, then bucket, then metric) via the running `row_ind`
  num_inst_by_depth = np.zeros(len(tmp_avg_bb_by_depth_df),dtype = np.int64)
  row_ind = 0

  # Calculate stats for instances by depth
  cols = cols_time_by_depth + cols_nodes_by_depth + [col_num_vpc, ref_timeout_col, refv_timeout_col]
  cols = list(set(cols)) # remove duplicates
  # NOTE(review): `col_vpc_gen_time` is read further down but is not added to `cols`
  # explicitly; this relies on it being part of `time_cols_long` — confirm.
  curr_df = df.loc[inst_set,cols]
  # Drop the rows whose second index level is 0
  curr_df = curr_df[curr_df.index.get_level_values(1) > 0]

  for curr_size_ind in range(0,len(bb_classes_by_depth)):
      # print("{}".format(bb_classes_by_depth[curr_size_ind]))
      curr_by_depth_df = curr_df[curr_df.index.get_level_values(1) == sizes[curr_size_ind]] # take only chosen depth
      
      # Take only instances in which num vpcs > 0
      curr_by_depth_df = curr_by_depth_df[curr_by_depth_df[col_num_vpc] > 0]

      if DEBUG:
          # Show the rows that are about to be removed (both runs reached MAX_TIME)
          curr_by_depth_timeouts_df = curr_by_depth_df[
              (curr_by_depth_df[ref_timeout_col] >= MAX_TIME)
              & (curr_by_depth_df[refv_timeout_col] >= MAX_TIME)
          ]
          if (len(curr_by_depth_timeouts_df) > 0):
            display(curr_by_depth_timeouts_df)

      # Remove instances in which both Gurobi and VPCs timed out
      # (If inst_set = selected_time_instances, then this is not necessary, as we already filtered out instances that timed out for both solvers)
      curr_by_depth_df = curr_by_depth_df[
          (curr_by_depth_df[ref_timeout_col] < MAX_TIME) 
          | (curr_by_depth_df[refv_timeout_col] < MAX_TIME)
      ]

      if DEBUG:
          print("Num instances selected for depth {:d} = {:d}".format(sizes[curr_size_ind],len(curr_by_depth_df)))
          print("Num instances solved w/i timelimit for Gur: {:d}".format(len(curr_by_depth_df[(curr_by_depth_df[ref_timeout_col] < MAX_TIME)])))
          print("Num instances solved w/i timelimit for Gur+V: {:d}".format(len(curr_by_depth_df[(curr_by_depth_df[refv_timeout_col] < MAX_TIME)])))

      for bucket_ind in range(num_buckets):
          # Decide which instances to include in this bucket
          # This will be the instances for which the average solver time is at least the bucket minimum, for all solvers
          # TODO: is it fair to use "&" or "|" here?
          # The filter is cumulative: `curr_by_depth_df` is overwritten, so each bucket
          # starts from the rows that survived the previous bucket. Only `bucket_min`
          # is applied here; `bucket_max` is not used for filtering.
          curr_by_depth_df = curr_by_depth_df[ 
              (curr_by_depth_df[ref_time_col] >= bucket_min[bucket_ind])
              & (curr_by_depth_df[refv_time_col] >= bucket_min[bucket_ind])
            ]
          
          # ---- 'Gmean' row (first metric) ----
          bb_metric_ind = 0
          # For each column, compute the geometric mean of the values in the column
          # Use SHIFT_TIME for the first two time columns, and maybe use a different shift for generation time
          # NOTE(review): statistics.geometric_mean raises StatisticsError on an empty
          # dataset, so a bucket with no remaining rows would abort here — confirm
          # that buckets are never empty for the instance sets in use.
          for col_ind in range(len(cols_time_by_depth)):
              col = cols_time_by_depth[col_ind]
              SHIFT = SHIFT_TIME if col_ind < 2 else SHIFT_GEN_TIME
              tmp_avg_bb_by_depth_df.loc[
                  (bb_classes_by_depth[curr_size_ind], bb_buckets_by_depth[bucket_ind], bb_metrics_by_depth[bb_metric_ind]),
                  (time_col_header,shortcols_time_by_depth[col_ind])] = \
              geometric_mean(curr_by_depth_df[col] + SHIFT) - SHIFT

          # display(avg_bb_by_depth_df.loc[
          #         (bb_classes_by_depth[curr_size_ind], bb_buckets_by_depth[bucket_ind], bb_metrics_by_depth[0]),
          #         (time_col_header,shortcols_time_by_depth)].head())
          # Shifted geometric mean of the node counts, using SHIFT_NODES
          tmp_avg_bb_by_depth_df.loc[
                  (bb_classes_by_depth[curr_size_ind], bb_buckets_by_depth[bucket_ind], bb_metrics_by_depth[bb_metric_ind]),
                  (node_col_header,shortcols_nodes_by_depth)] = \
              [geometric_mean(curr_by_depth_df[col] + SHIFT_NODES) - SHIFT_NODES for col in cols_nodes_by_depth]
          
          # print("row {:d}: {:d}".format(row_ind,len(curr_by_depth_df)))

          # Record the row count for every metric row of this (class, bucket)
          num_inst_by_depth[row_ind:row_ind+len(bb_metrics_by_depth)] = len(bb_metrics_by_depth)*[len(curr_by_depth_df)]
          row_ind += len(bb_metrics_by_depth)

          ## Update wins rows
          # A win in terms of time is counted when the ``Gur'' baseline seconds taken 
          # is at least 10\% slower, to account for some variability in runtimes.
          # The three counts below are, in order:
          #   (1) rows where the reference+VPC time exceeds WIN_BY_TIME_FACTOR x the reference time,
          #   (2) rows where the reference time exceeds WIN_BY_TIME_FACTOR x the reference+VPC time,
          #   (3) as (2), but with `col_vpc_gen_time` added to the reference+VPC time.
          # The list has exactly three entries, so it assumes three time columns.
          bb_metric_ind = 1
          refcol = ref_time_col
          tmp_avg_bb_by_depth_df.loc[
                  (bb_classes_by_depth[curr_size_ind], bb_buckets_by_depth[bucket_ind], bb_metrics_by_depth[bb_metric_ind]),
                  (time_col_header,shortcols_time_by_depth)] = \
              [ int(sum( curr_by_depth_df[refv_time_col] > WIN_BY_TIME_FACTOR * curr_by_depth_df[ref_time_col] )),
                int(sum( curr_by_depth_df[ref_time_col] > WIN_BY_TIME_FACTOR * curr_by_depth_df[refv_time_col] )),
                int(sum( curr_by_depth_df[ref_time_col] > WIN_BY_TIME_FACTOR * (curr_by_depth_df[refv_time_col] + curr_by_depth_df[col_vpc_gen_time]) )),
              ]

          # A win in terms of nodes is when the ``Gur'' baseline number of nodes is higher.
          # First entry: rows where the reference+VPC run used strictly more nodes than the
          # reference run; remaining entries: rows where the reference run used strictly more.
          # (Plain strict comparison; WIN_BY_NODES_FACTOR is not applied here.)
          refcol = ref_nodes_col
          tmp_avg_bb_by_depth_df.loc[
                  (bb_classes_by_depth[curr_size_ind], bb_buckets_by_depth[bucket_ind], bb_metrics_by_depth[bb_metric_ind]),
                  (node_col_header,shortcols_nodes_by_depth)] = \
              [ int(sum(curr_by_depth_df[refv_nodes_col] > curr_by_depth_df[ref_nodes_col])) ] + \
              [ int(sum(curr_by_depth_df[refcol] > curr_by_depth_df[col])) for col in cols_nodes_by_depth[1:] ]

  # Extra column with the number of rows behind each table row
  tmp_avg_bb_by_depth_df[inst_col_name] = num_inst_by_depth

  # for i in range(num_buckets):
  #     curr_df = curr_df[curr_df[gur1time_col] > bucket_min[i]]
      
  display(tmp_avg_bb_by_depth_df.loc[(bb_classes_by_depth, bb_buckets_by_depth, bb_metrics_by_depth),:])

  return tmp_avg_bb_by_depth_df


## Table 3: "all": `avg_bb_by_depth_df`

In [None]:
# Table 3, "all" class: summary over `selected_time_instances`.
# Note that this rebinds the notebook-wide name `inst_set`.
inst_set = selected_time_instances # will be filtered down

avg_bb_by_depth_df = create_avg_bb_by_depth_df(inst_set)

## Table 3: "all 6": `all6_avg_bb_by_depth_df` for `all6_instances`

In [None]:
# Table 3, "all 6" class: summary over `all6_instances`, with the debug
# output of `create_avg_bb_by_depth_df` switched on.
inst_set = all6_instances

all6_avg_bb_by_depth_df = create_avg_bb_by_depth_df(inst_set, DEBUG = True)

## Table 3: "binary": `binary_avg_bb_by_depth_df`

In [None]:
# Table 3, "binary" class: summary over `binary_x_time_instances`.
# Note that this rebinds the notebook-wide name `inst_set`.
inst_set = binary_x_time_instances # will be filtered down

binary_avg_bb_by_depth_df = create_avg_bb_by_depth_df(inst_set)

# bb_classes_by_depth = [str(t) + ' leaves' for t in sizes]
# num_bb_classes_by_depth = len(bb_classes_by_depth)

# bb_buckets_by_depth = bb_buckets
# bb_metrics_by_depth = bb_metrics

# cols_time_by_depth       = time_cols_long #[ref_time_col, refv_time_col, refv_w_cut_time_col]
# shortcols_time_by_depth  = [map_cols_to_short_time[col] for col in cols_time_by_depth]
# cols_nodes_by_depth      = [ref_nodes_col, refv_nodes_col]
# shortcols_nodes_by_depth = [map_cols_to_short_nodes[col] for col in cols_nodes_by_depth]

# avg_bb_cols_by_depth = pd.MultiIndex.from_arrays(
#     [
#       [time_col_header]*len(shortcols_time_by_depth) + 
#       [node_col_header]*len(shortcols_nodes_by_depth), 
#       shortcols_time_by_depth + shortcols_nodes_by_depth
#     ],
#     names = ['criterion', 'type'])

# bb_row_names_by_depth = pd.MultiIndex.from_product(
#     [bb_classes_by_depth, bb_buckets_by_depth, bb_metrics_by_depth],
#     names=['class', 'bucket', 'metric'])

# binary_avg_bb_by_depth_df = pd.DataFrame(
#     columns = avg_bb_cols_by_depth,
#     index = bb_row_names_by_depth,
#     dtype = float
# )

# # Make all columns "object" type to allow for integer values
# binary_avg_bb_by_depth_df.loc[:,(time_col_header,shortcols_time_by_depth)] = avg_bb_by_depth_df.loc[:,(time_col_header,shortcols_time_by_depth)].astype(object)
# binary_avg_bb_by_depth_df.loc[:,(node_col_header,shortcols_nodes_by_depth)] = avg_bb_by_depth_df.loc[:,(node_col_header,shortcols_nodes_by_depth)].astype(object)

# num_inst_by_depth = np.zeros(len(avg_bb_by_depth_df),dtype = np.int64)
# row_ind = 0

# # Calculate stats for instances by depth
# cols = cols_time_by_depth + cols_nodes_by_depth + [col_num_vpc, ref_timeout_col, refv_timeout_col]
# cols = list(set(cols)) # remove duplicates
# curr_df = df.loc[inst_set,cols]
# curr_df = curr_df[curr_df.index.get_level_values(1) > 0]

# for curr_size_ind in range(0,len(bb_classes_by_depth)):
#     # print("{}".format(bb_classes_by_depth[curr_size_ind]))
#     curr_by_depth_df = curr_df[curr_df.index.get_level_values(1) == sizes[curr_size_ind]] # take only chosen depth
    
#     # Take only instances in which num vpcs > 0
#     curr_by_depth_df = curr_by_depth_df[curr_by_depth_df[col_num_vpc] > 0]

#     # Remove instances in which both Gurobi and VPCs timed out
#     curr_by_depth_df = curr_by_depth_df[
#         (curr_by_depth_df[ref_timeout_col] < MAX_TIME)
#         | (curr_by_depth_df[refv_timeout_col] < MAX_TIME)
#     ]

#     for bucket_ind in range(num_buckets):
#         # Decide which instances to include in this bucket
#         # This will be the instances for which the average solver time is at least the bucket minimum, for all solvers
#         # TODO: is it fair to use "&" or "|" here?
#         curr_by_depth_df = curr_by_depth_df[ 
#             (curr_by_depth_df[ref_time_col] >= bucket_min[bucket_ind])
#             & (curr_by_depth_df[refv_time_col] >= bucket_min[bucket_ind])
#           ]
        
#         bb_metric_ind = 0
#         # For each column, compute the geometric mean of the values in the column
#         # Use SHIFT_TIME for the first two time columns, and maybe use a different shift for generation time
#         for col_ind in range(len(cols_time_by_depth)):
#             col = cols_time_by_depth[col_ind]
#             SHIFT = SHIFT_TIME if col_ind < 2 else SHIFT_GEN_TIME
#             binary_avg_bb_by_depth_df.loc[
#                 (bb_classes_by_depth[curr_size_ind], bb_buckets_by_depth[bucket_ind], bb_metrics_by_depth[bb_metric_ind]),
#                 (time_col_header,shortcols_time_by_depth[col_ind])] = \
#             geometric_mean(curr_by_depth_df[col] + SHIFT) - SHIFT

#         # display(avg_bb_by_depth_df.loc[
#         #         (bb_classes_by_depth[curr_size_ind], bb_buckets_by_depth[bucket_ind], bb_metrics_by_depth[0]),
#         #         (time_col_header,shortcols_time_by_depth)].head())
#         binary_avg_bb_by_depth_df.loc[
#                 (bb_classes_by_depth[curr_size_ind], bb_buckets_by_depth[bucket_ind], bb_metrics_by_depth[bb_metric_ind]),
#                 (node_col_header,shortcols_nodes_by_depth)] = \
#             [geometric_mean(curr_by_depth_df[col] + SHIFT_NODES) - SHIFT_NODES for col in cols_nodes_by_depth]
        
#         # print("row {:d}: {:d}".format(row_ind,len(curr_by_depth_df)))

#         num_inst_by_depth[row_ind:row_ind+len(bb_metrics_by_depth)] = len(bb_metrics_by_depth)*[len(curr_by_depth_df)]
#         row_ind += len(bb_metrics_by_depth)

#         ## Update wins rows
#         # A win in terms of time is counted when the ``Gur'' baseline seconds taken 
#         # is at least 10\% slower, to account for some variability in runtimes.
#         bb_metric_ind = 1
#         refcol = ref_time_col
#         binary_avg_bb_by_depth_df.loc[
#                 (bb_classes_by_depth[curr_size_ind], bb_buckets_by_depth[bucket_ind], bb_metrics_by_depth[bb_metric_ind]),
#                 (time_col_header,shortcols_time_by_depth)] = \
#             [ int(sum( curr_by_depth_df[refv_time_col] > WIN_BY_TIME_FACTOR * curr_by_depth_df[ref_time_col] )),
#               int(sum( curr_by_depth_df[ref_time_col] > WIN_BY_TIME_FACTOR * curr_by_depth_df[refv_time_col] )),
#               int(sum( curr_by_depth_df[ref_time_col] > WIN_BY_TIME_FACTOR * (curr_by_depth_df[refv_time_col] + curr_by_depth_df[col_vpc_gen_time]) )),
#             ]

#         # A win in terms of nodes is when the ``Gur'' baseline number of nodes is higher.
#         refcol = ref_nodes_col
#         binary_avg_bb_by_depth_df.loc[
#                 (bb_classes_by_depth[curr_size_ind], bb_buckets_by_depth[bucket_ind], bb_metrics_by_depth[bb_metric_ind]),
#                 (node_col_header,shortcols_nodes_by_depth)] = \
#             [ int(sum(curr_by_depth_df[refv_nodes_col] > curr_by_depth_df[ref_nodes_col])) ] + \
#             [ int(sum(curr_by_depth_df[refcol] > curr_by_depth_df[col])) for col in cols_nodes_by_depth[1:] ]

# binary_avg_bb_by_depth_df[inst_col_name] = num_inst_by_depth

# # for i in range(num_buckets):
# #     curr_df = curr_df[curr_df[gur1time_col] > bucket_min[i]]
    
# display(binary_avg_bb_by_depth_df.loc[(bb_classes_by_depth, bb_buckets_by_depth, bb_metrics_by_depth),:])

## Compute `best_df`

In [None]:
# Build `best_df`: for every selected timing instance, keep the single row
# (second index level > 0) at which `refv_time_col` attains its minimum.
cols_time_by_depth = time_cols_long #[ref_time_col, refv_time_col, refv_w_cut_time_col]
cols_nodes_by_depth = [ref_nodes_col, refv_nodes_col]

# Restrict to the selected instances and to the cut-count / time / node columns
best_df_source_cols = [col_num_vpc] + cols_time_by_depth + cols_nodes_by_depth
curr_df = df.loc[selected_time_instances, best_df_source_cols]

# Drop the rows whose second index level is not strictly positive
has_positive_second_level = curr_df.index.get_level_values(1) > 0
curr_df = curr_df[has_positive_second_level]

# For each instance (index level 0), find the row label at which each column is minimal
curr_argmin_df = curr_df.groupby(level=0).idxmin()

# Keep the rows minimizing refv_time_col, ordered by increasing value of cols_time_by_depth[1]
best_df_sort_col = cols_time_by_depth[1]
print("Sorting by increasing value of",best_df_sort_col,"...")
best_df = curr_df.loc[curr_argmin_df[refv_time_col]].sort_values(by=best_df_sort_col)

best_df.head()

In [None]:
# Turn the second index level ('disj_terms') into an ordinary column,
# so that `best_df` is indexed by the first level only
best_df = best_df.reset_index(level=1)
best_df.head()

In [None]:
# Summary metrics for `best_df`: a 'Gmean' row (shifted geometric mean) and a 'Wins' row
# for the time and node columns.

# Start from a copy of best_df; the two summary rows are appended to this copy
best_df_summary_metrics_df = best_df.copy(deep=True)

# Columns are classified by name: 'TIME' columns use SHIFT_TIME, 'NODES' columns use SHIFT_NODES
cols_for_shifted_time_gmean = [col for col in best_df.columns if 'TIME' in col]
cols_for_shifted_nodes_gmean = [col for col in best_df.columns if 'NODES' in col]

# Shifted geometric mean = geometric_mean(x + shift) - shift.
# It is computed directly from best_df (instance rows only), so the shift is removed exactly once
# and the instance rows of the summary frame are never modified.
gmean_by_col = {}
for col in cols_for_shifted_time_gmean:
    gmean_by_col[col] = geometric_mean(best_df[col] + SHIFT_TIME) - SHIFT_TIME
for col in cols_for_shifted_nodes_gmean:
    gmean_by_col[col] = geometric_mean(best_df[col] + SHIFT_NODES) - SHIFT_NODES

# Columns without a geometric mean are left as NaN in the 'Gmean' row
best_df_summary_metrics_df.loc['Gmean'] = pd.Series(gmean_by_col)

# Wins are counted over the instance rows only (best_df), so that the 'Gmean' row
# cannot contribute a spurious win.
# A time win requires the other solver to be slower by a factor of WIN_BY_TIME_FACTOR.
# NOTE(review): the three entries are assigned positionally, so this assumes exactly three
# 'TIME' columns, presumably in the order (ref, ref+VPC, VPC generation time) -- confirm.
best_df_summary_metrics_df.loc['Wins', cols_for_shifted_time_gmean] = \
    [
      int(sum( best_df[refv_time_col] > WIN_BY_TIME_FACTOR * best_df[ref_time_col] )),
      int(sum( best_df[ref_time_col] > WIN_BY_TIME_FACTOR * best_df[refv_time_col] )),
      int(sum( best_df[ref_time_col] > WIN_BY_TIME_FACTOR * (best_df[refv_time_col] + best_df[col_vpc_gen_time]) )),
    ]

# A node win is a strictly smaller node count than the other solver.
# NOTE(review): assumes exactly two 'NODES' columns, presumably in the order (ref, ref+VPC) -- confirm.
best_df_summary_metrics_df.loc['Wins', cols_for_shifted_nodes_gmean] = \
    [
      int(sum(best_df[refv_nodes_col] > best_df[ref_nodes_col])),
      int(sum(best_df[ref_nodes_col] > best_df[refv_nodes_col]))
    ]

best_df_summary_metrics_df.loc[['Gmean','Wins']]

In [None]:
# Replace the long column names of `best_df_summary_metrics_df` with short display names,
# grouped under a two-level column header ('' / 'Time (s)' / 'Nodes (#)').
orig_cols = best_df_summary_metrics_df.columns

# Short names for the columns that are neither time nor node columns.
# NOTE(review): hard-coded, so this assumes exactly two such columns and that the frame's
# columns are ordered as (other, time, nodes); otherwise the assignment below mislabels
# columns or raises on a length mismatch -- confirm.
new_orig_cols = ['# terms', '# cuts']
new_time_cols = [map_cols_to_short_time[col] for col in orig_cols if 'TIME' in col]
new_nodes_cols = [map_cols_to_short_nodes[col] for col in orig_cols if 'NODES' in col]

display(new_orig_cols + new_time_cols + new_nodes_cols)

# Add the group header as the first column level
new_orig_cols = pd.MultiIndex.from_product(
  [[''],new_orig_cols],
)
new_time_cols = pd.MultiIndex.from_product(
  [['Time (s)'],new_time_cols],
)
new_nodes_cols = pd.MultiIndex.from_product(
  [['Nodes (#)'],new_nodes_cols],
)

best_df_summary_metrics_df.columns = new_orig_cols.append(new_time_cols).append(new_nodes_cols)
best_df_summary_metrics_df.head()

# Section 4: "Combined" results

## Select "Combined" instances

In [None]:
# Select the instances used for the "Combined" results, based on the (inst, -1) row of df_disjset.
# An instance is kept for the gap-closed results unless one of the checks below flags it;
# it is additionally kept for the time results when at least one of the two solver times
# (ref_time_col, refv_time_col) is below MAX_TIME - EPS.
inst_set = list(instances)
# inst_set = ['lotsize_presolved']
PRINT_SKIP_REASON = False

disjset_num_gap_errors = 0

selected_disjset_gap_instances_dict = {} # maps instance name -> position in inst_set
selected_disjset_time_instances_dict = {} # maps instance name -> position in inst_set
for i, inst in enumerate(inst_set):
  # Progress indicator, overwritten in place
  print("{}/{}".format(i+1,len(inst_set)), end='\r', flush=True)
  skip_instance = False

  # # Check that -1 depth exists
  # if not (-1 in df.loc[inst].index):
  #   # if PRINT_SKIP_REASON:
  #   print("Skipping instance {:d} -- {}: no -1 depth.".format(
  #       i, inst
  #   ))
  #   skip_instance = True
  #   continue
  # Without a df_disjset entry none of the remaining checks can be evaluated
  if inst not in df_disjset.index:
    if PRINT_SKIP_REASON:
      print("Skipping instance {:d} -- {}: no disjset results.".format(
          i, inst
      ))
    skip_instance = True
    continue
  
  curr_df = df_disjset.loc[(inst,-1)]

  # Ensure nrows and ncols is not too many
  nrows = curr_df[col_num_rows]
  ncols = curr_df[col_num_cols]
  if (nrows > MAX_ROWS) or (ncols > MAX_COLS):
    if PRINT_SKIP_REASON:
        # Arguments follow the template order: nrows, its limit, ncols, its limit
        print("Skipping instance {:d} -- {}: nrows = {:d} > {:d} or ncols = {:d} > {:d}.".format(
                i, inst, nrows, MAX_ROWS, ncols, MAX_COLS))
    skip_instance = True

  # Ensure IP objective value is known
  # NOTE(review): np.float64 is a subclass of Python float, so this isinstance check looks like it
  # can never fail; a non-numeric entry would instead raise in the conversion -- confirm intent.
  ip_obj = np.float64(df_ipopt.loc[inst,col_ip_obj])
  if not isinstance(ip_obj,float):
    if PRINT_SKIP_REASON:
        print(
            "Skipping instance {:d} -- {}: IP objective value ({}) is not detected to be a float value.".format(
            i, inst, ip_obj))
    skip_instance = True

  # The known IP value (df_ipopt) takes precedence over the value stored with the results;
  # a mismatch is reported and overwritten in both dataframes, but does not skip the instance
  check_ip_obj = curr_df[col_ip_obj]
  if not is_val(ip_obj,check_ip_obj):
    print("*** ERROR: Instance {:d} -- {}: IP objective value ({}) does not match value in dataframe ({}). REPLACING WITH KNOWN VALUE!".format(
        i, inst, ip_obj, check_ip_obj))
    df.loc[(inst,-1),col_ip_obj] = ip_obj
    df_disjset.loc[(inst,-1),col_ip_obj] = ip_obj
    # skip_instance = True

  # Check that LP opt < IP opt
  lp_obj = np.float64(df_preprocess.loc[remove_presolved_from_name(inst),col_cleaned_lp_obj])
  YES_GAP = (ip_obj - lp_obj) >= EPS
  if not YES_GAP:
    print("*** ERROR: Instance {:d} -- {}: not YES GAP (lp = {:.10f}; ip = {:.10f}, diff = {:.2f})".format(i, inst, lp_obj, ip_obj, ip_obj-lp_obj))
    skip_instance = True
    disjset_num_gap_errors += 1

  # Check that ExitReason != OPTIMAL_SOLUTION_FOUND
  exitreason = curr_df['ExitReason']
  if exitreason == 'OPTIMAL_SOLUTION_FOUND':
    if PRINT_SKIP_REASON:
      print("Skipping instance {:d} -- {}: optimal IP solution found.".format(
          i, inst
      ))
    skip_instance = True

  # Check that VPCs were generated
  num_vpc = curr_df[col_num_vpc]
  if num_vpc == 0:
    if PRINT_SKIP_REASON:
      print("Skipping instance {:d} -- {}: no VPCs generated.".format(
          i, inst
      ))
    skip_instance = True

  if not skip_instance:
    selected_disjset_gap_instances_dict[inst] = i

    # If either ref or refv times are < MAX_TIME - EPS, include in nodes experiments
    ref_time = curr_df[ref_time_col]
    refv_time = curr_df[refv_time_col]
    if (ref_time < MAX_TIME - EPS) or (refv_time < MAX_TIME - EPS):
      selected_disjset_time_instances_dict[inst] = i

selected_disjset_gap_instances = list(selected_disjset_gap_instances_dict.keys())
print("Total number of errors: {}".format(disjset_num_gap_errors))
print("Num instances selected for disjset gap closed results = {:d}".format(len(selected_disjset_gap_instances)))

selected_disjset_time_instances = list(selected_disjset_time_instances_dict.keys())
print("Num instances selected for disjset time results = {:d}".format(len(selected_disjset_time_instances)))

### List instances that are in exactly one of the two sets `selected_gap_instances` and `selected_disjset_gap_instances`

In [None]:
# Report the differences between the instance sets selected earlier and the "Combined" (disjset) ones.
# Instances that appear only in the disjset gap selection are dropped from it.
print("Instances in selected_disjset_gap_instances that are not in selected_gap_instances:")
newly_selected_instances = [
    name for name in selected_disjset_gap_instances
    if name not in selected_gap_instances
]
print(newly_selected_instances)

# Remove newly selected instances (in place, so the list object itself is updated)
selected_disjset_gap_instances[:] = [
    name for name in selected_disjset_gap_instances
    if name not in newly_selected_instances
]

# Remaining one-sided differences: (heading, list to scan, list to compare against)
remaining_comparisons = (
    ("Instances in selected_gap_instances that are not in selected_disjset_gap_instances:",
     selected_gap_instances, selected_disjset_gap_instances),
    ("Instances in selected_time_instances that are not in selected_disjset_time_instances:",
     selected_time_instances, selected_disjset_time_instances),
    ("Instances in selected_disjset_time_instances that are not in selected_time_instances:",
     selected_disjset_time_instances, selected_time_instances),
)
for heading, scanned, reference in remaining_comparisons:
    print("")
    print(heading)
    print([name for name in scanned if name not in reference])

## "Combined" gap summary

### `disjset_gap_df`: Analyze disjset gap closed

In [None]:
# Build `disjset_gap_df`: one row per selected instance (disj_terms == -1) with the
# percent-gap-closed columns needed for the "Combined" gap tables.

# Columns relevant to gap closed: instance dimensions, objective values, and cut counts
disjset_gap_source_cols = (
    [col_num_disj_terms, col_num_rows, col_num_cols]
    + obj_val_col_list
    + [col_num_gmic, col_num_vpc, col_num_obj, col_exit_reason]
)

# Restrict to the selected instances, then take only -1 depth
disjset_gap_df = df.loc[selected_disjset_gap_instances, disjset_gap_source_cols].xs(-1, level='disj_terms')

# Calculate some missing % gap closed columns
# gap closed = 100 * (post_cut_opt_val - lp_opt_val) / (ip_opt_val - lp_opt_val)
for cut_type in reg_cut_type_long_list:
    col = ' '.join((cut_type, obj_stub))
    gap_closed_col = ' '.join((cut_type, pct_gap_closed_stub))
    if col in df.columns:
        disjset_gap_df[gap_closed_col] = calc_gap_closed(disjset_gap_df, col)
    elif cut_type == 'MAX(GMIC,VPC)':
        # No objective column for max(G,V): take the better of the two gap-closed columns
        disjset_gap_df[gap_closed_col] = np.maximum(
            disjset_gap_df[' '.join(('GMIC', pct_gap_closed_stub))],
            disjset_gap_df[' '.join(('VPC', pct_gap_closed_stub))],
        )

# Compare against reference solver (first three solver statistics only)
for stat_type in solver_stat_list[:3]:
    for solver_type in solver_stubs:
        for cut_type in solver_cut_type_stubs:
            col = ' '.join((stat_type, solver_type, cut_type))
            disjset_gap_df[' '.join((col, pct_gap_closed_stub))] = calc_gap_closed(disjset_gap_df, col)

# Rename each column with its shortname
disjset_gap_df = disjset_gap_df.rename(columns=map_cols_to_short_gap)

# Spot-check a couple of instances
display(disjset_gap_df.loc[['bm23_presolved','maxgasflow_presolved']][gap_cols_short])
display(disjset_gap_df.loc[("bm23_presolved")])

In [None]:
# Export the per-instance "Combined" gap-closed table
disjset_gap_df.to_csv(path_or_buf='disjset_gap.csv')

### `disjset_avg_gap_df`: summary of results with disjset instances

In [None]:
## TABLE 1.2: average percent gap closed across different combinations of cuts
## Create disjset_avg_gap_df = average gap closed across instances
# Rows: one (average, wins) pair for each of the three instance sets
idx = pd.MultiIndex.from_product(
    [[all_set_name, good_vpc_set_name, binary_set_name], [avg_row_name, wins_row_name]],
    names=['Set', ''],
)

ncols = len(gap_cols_short)
nrows = len(idx)

# Instance subsets: 'V' gap closed of at least 10, and pure binary instances
col = disjset_gap_df['V'].astype(float)
disjset_good_vpc_df = disjset_gap_df[col >= 10.]
disjset_binary_instances_df = disjset_gap_df.loc[
    [name for name in pure_binary_instances if name in selected_disjset_gap_instances]
]
disjset_subset_dfs = (disjset_gap_df, disjset_good_vpc_df, disjset_binary_instances_df)

# Averages go into the even rows (0, 2, 4); the odd rows are reserved for the wins
data = np.zeros((nrows, ncols), dtype=float)
for subset_pos, subset_df in enumerate(disjset_subset_dfs):
    data[2 * subset_pos, :] = [subset_df[gap_col].mean() for gap_col in gap_cols_short]

# display(best_gap_df.head())
disjset_avg_gap_df = pd.DataFrame(
    data,
    columns=gap_cols_short,
    index=idx,
    dtype=object,
)

# Number of instances per set, stored on the average rows
inst_col_name = '# inst'
disjset_inst_counts = []
for subset_df in disjset_subset_dfs:
    disjset_inst_counts.extend([len(subset_df), 0])
disjset_avg_gap_df[inst_col_name] = disjset_inst_counts

# Blank out the wins rows until they are filled in
for wins_row_pos in (1, 3, 5):
    disjset_avg_gap_df.iloc[wins_row_pos] = [""] * (ncols + 1)

display(disjset_avg_gap_df)

### `disjset_wins_df`: number of wins across methods

In [None]:
## Create num wins df
# Entry (x, y) of disjset_wins_df counts the instances on which x strictly beats y,
# i.e., x > y + WINS_EPS. The wins rows of disjset_avg_gap_df are then filled from
# a fixed list of (method, baseline) comparisons.
from itertools import permutations

disjset_wins_df = pd.DataFrame(
    np.zeros((len(gap_cols_short), len(gap_cols_short)), dtype=int),
    columns = gap_cols_short,
    index = gap_cols_short,
    dtype = int,
)

WINS_EPS = GAP_DIFF_EPS


def _disjset_count_wins(frame, dest, ref):
    """Count the rows of `frame` in which column `dest` exceeds column `ref` by more than WINS_EPS."""
    return sum(frame[dest] > frame[ref] + WINS_EPS)


# permutations() yields both (x, y) and (y, x), so one assignment per ordered pair is enough
for dest_name, ref_name in permutations(gap_cols_short, 2):
    disjset_wins_df.at[dest_name, ref_name] = int(_disjset_count_wins(disjset_gap_df, dest_name, ref_name))

# Sets we are considering: row keys of the wins rows in disjset_avg_gap_df
disjset_all_set = (all_set_name,wins_row_name)
disjset_good_vpc_set = (good_vpc_set_name,wins_row_name)
disjset_binary_set = (binary_set_name,wins_row_name)

# Subsets (other than "all") whose wins are counted directly from their own dataframe
disjset_subset_wins_sources = (
    (disjset_good_vpc_set, disjset_good_vpc_df),
    (disjset_binary_set, disjset_binary_instances_df),
)

# Refresh the V+GurL / GurL entries explicitly; when both labels are in gap_cols_short
# this reproduces the values already written by the permutation loop
disjset_wins_df.at['V+GurL','GurL'] = int(_disjset_count_wins(disjset_gap_df, 'V+GurL', 'GurL'))
disjset_wins_df.at['GurL','V+GurL'] = int(_disjset_count_wins(disjset_gap_df, 'GurL', 'V+GurL'))

# (method, baseline) pairs reported in the wins rows:
#   "G" wins are relative to "V"
#   "DB", "V", "V+G" wins are relative to "G"
#   "V+GurF" wins are relative to "GurF"
#   "V+GurL" wins are relative to "GurL"
disjset_win_comparisons = (
    ('G', 'V'),
    ('DB', 'G'),
    ('V', 'G'),
    ('V+G', 'G'),
    ('V+GurF', 'GurF'),
    ('V+GurL', 'GurL'),
)
for shortdestcol, shortrefcol in disjset_win_comparisons:
    destcol = shortdestcol
    refcol = shortrefcol
    # For the set of all instances, the count is read from disjset_wins_df
    disjset_avg_gap_df.at[disjset_all_set,shortdestcol] = disjset_wins_df.at[shortdestcol,shortrefcol]
    for set_key, set_df in disjset_subset_wins_sources:
        disjset_avg_gap_df.at[set_key,shortdestcol] = _disjset_count_wins(set_df, destcol, refcol)

# Count number of instances that have V+G > 0 (stored in the '# inst' column of the wins rows)
shortdestcol = inst_col_name
destcol = 'V+G'
disjset_avg_gap_df.at[disjset_all_set,shortdestcol] = sum(disjset_gap_df[destcol] > WINS_EPS)
for set_key, set_df in disjset_subset_wins_sources:
    disjset_avg_gap_df.at[set_key,shortdestcol] = sum(set_df[destcol] > WINS_EPS)

display(disjset_avg_gap_df)
display(disjset_wins_df)

### Update Table 2 `gap_by_size_df` with "Combined" results

In [None]:
# Add a "Combined" row to `gap_by_size_df`, taken from the 'All' set of `disjset_avg_gap_df`.

# Select the rows of the 'All' set, restricted to the columns of gap_by_size_df, and keep the
# first of them as a 2-D array with a single row.
# NOTE(review): the first row is presumably the average row, and 'All' is presumably the value
# of all_set_name -- confirm.
all_row = disjset_avg_gap_df.loc['All',gap_by_size_df.columns][0:1].to_numpy()

# Append the new row to the `gap_by_size` dataframe
gap_by_size_df.loc['Combined'] = all_row[0]

# Display the updated `gap_by_size` dataframe
display(gap_by_size_df)


## "Combined" timing summary

### `disjset_timing`

In [None]:
# Build `disjset_timing_df`: per-instance solve times for each solver and random seed,
# plus the number of seeds on which each solver wins.
inst_set = selected_disjset_time_instances
NUM_SEEDS = 7

# Source columns: one 'ALL <solver> <time_stub>' column per solver (semicolon-separated values),
# followed by the summary time columns and the number of VPCs
disjset_timing_cols = (
    [' '.join(('ALL', solver_type, time_stub)) for solver_type in solver_stubs]
    + time_cols_long
    + [col_num_vpc]
)

# New per-seed columns, indexed as [solver][seed]
new_disjset_timing_cols = [
    ["{} ({:d}) {}".format(solver_type, seed, time_stub) for seed in range(1, NUM_SEEDS + 1)]
    for solver_type in solver_stubs
]

# One wins column per solver: index 0 is "first over second", index 1 is "second over first"
new_disjset_wins_cols = [
    "Wins ({} over {})".format(solver_stubs[winner_ind], solver_stubs[1 - winner_ind])
    for winner_ind in (0, 1)
]

disjset_timing_df = df_disjset.loc[inst_set, disjset_timing_cols]
# display(disjet_timing_df.head())

# Parse the semicolon-separated per-solver columns into one float column per (seed, solver);
# columns are added seed by seed, with one column per solver within each seed
for seed_ind in range(NUM_SEEDS):
    for solver_ind, solver_type in enumerate(solver_stubs):
        orig_col = disjset_timing_cols[solver_ind]
        new_col = new_disjset_timing_cols[solver_ind][seed_ind]
        seed_values = disjset_timing_df[orig_col].str.split(';').str[seed_ind]
        disjset_timing_df[new_col] = seed_values.astype(float)

# Add wins columns: number of seeds for which the other solver is slower
# by a factor of more than WIN_BY_TIME_FACTOR
for solver_ind in range(len(solver_stubs)):
    other_solver_ind = 1 - solver_ind
    own_seed_cols = new_disjset_timing_cols[solver_ind]
    other_seed_cols = new_disjset_timing_cols[other_solver_ind]

    num_seed_wins = 0
    for seed_ind in range(NUM_SEEDS):
        num_seed_wins = num_seed_wins + (
            disjset_timing_df[other_seed_cols[seed_ind]]
            > WIN_BY_TIME_FACTOR * disjset_timing_df[own_seed_cols[seed_ind]]
        )
    disjset_timing_df[new_disjset_wins_cols[solver_ind]] = num_seed_wins

display(disjset_timing_df.head())

In [None]:
# Export the per-seed "Combined" timing table.
# NOTE(review): the file name is spelled "disjet" (the sibling exports use "disjset");
# it is kept as is in case other tooling reads this path -- confirm before renaming.
disjset_timing_df.to_csv(path_or_buf="disjet_timing.csv")

### `disjset_time_geomean_df`

In [None]:
# Build `disjset_time_geomean_df`: for every (class, bucket, metric) row, report the shifted
# geometric mean of the solve times and the number of wins, both per random seed and for the
# summary time columns, together with the number of instances behind each row.

# Row names
disjset_bb_classes = ['Combined', 'Combined-Binary']
# Instance lists per class: all selected timing instances, and the pure binary ones among them
disjset_bb_classes_lists = [ selected_disjset_time_instances, [inst for inst in pure_binary_instances if inst in selected_disjset_time_instances] ]
disjset_num_bb_classes = len(disjset_bb_classes)

# Rows are geomean and wins for each bucket
disjset_bb_row_names = pd.MultiIndex.from_product(
    [disjset_bb_classes, bb_buckets, bb_metrics],
    names=['class', 'bucket', 'metric'])
# Columns: the per-seed time columns (seed by seed, one per solver), then the summary time
# columns, then the seed-wins columns.
# Note that this is a list wrapping a single list; the flat list of names is element [0].
disjset_time_geomean_cols = [
    [
        new_disjset_timing_cols[solver_ind][seed_ind] 
        for seed_ind in range(NUM_SEEDS) 
        for solver_ind in range(len(solver_stubs))
    ]
    + time_cols_long
    + new_disjset_wins_cols
]
disjset_time_geomean_df = pd.DataFrame(
    columns = disjset_time_geomean_cols,
    index = disjset_bb_row_names,
    dtype = float
)

# Make all columns "object" type to allow for integer values
disjset_time_geomean_df.loc[:,(disjset_time_geomean_cols[0])] = disjset_time_geomean_df.loc[:,(disjset_time_geomean_cols[0])].astype(object)

# Prepare num_inst columns: one array per seed, with one entry per row of the table
disjset_num_inst_by_seed_bucket_class = [
    np.zeros(len(disjset_time_geomean_df),dtype = np.int64)
    for _ in range(NUM_SEEDS) 
    # for _ in range(num_buckets) 
    # for _ in range(disjset_num_bb_classes)
]
disjset_num_inst_avg = np.zeros(len(disjset_time_geomean_df),dtype = np.int64)
# The next two arrays are allocated here but not used in the rest of this cell
disjset_num_ref_seed_wins = np.zeros(len(disjset_time_geomean_df),dtype = np.int64)
disjset_num_refv_seed_wins = np.zeros(len(disjset_time_geomean_df),dtype = np.int64)

# Part 1: per-seed statistics.
# For each class, calculate geomean
for seed_ind in range(NUM_SEEDS):
    # row_ind restarts for every seed and advances by len(bb_metrics) per (class, bucket),
    # matching the (class, bucket, metric) order of the row index
    # (assumes num_buckets == len(bb_buckets) -- TODO confirm)
    row_ind = 0
    # The time columns of this seed, one per solver
    seed_cols = [ new_disjset_timing_cols[solver_ind][seed_ind] for solver_ind in range(len(solver_stubs)) ]
    
    for class_ind in range(disjset_num_bb_classes):
        print("\nClass: {}".format(disjset_bb_classes[class_ind]))
        curr_df = disjset_timing_df.loc[disjset_bb_classes_lists[class_ind]]
        seed_curr_df = curr_df[seed_cols]

        # Every time that is >= MAX_TIME - EPS (a timeout) is replaced by TIMEOUT_TIME_FACTOR * MAX_TIME
        for col_ind in range(len(seed_cols)):
            time_col = seed_cols[col_ind]
            seed_curr_df.loc[seed_curr_df[time_col] >= MAX_TIME - EPS, time_col] = TIMEOUT_TIME_FACTOR * MAX_TIME

        for bucket_ind in range(num_buckets):
            print("Bucket: [{:d},{:d})".format(bucket_min[bucket_ind],bucket_max[bucket_ind]))

            # Keep the instances whose faster solver time (with this seed) lies in
            # [bucket_min, bucket_max - EPS).
            # NOTE(review): seed_curr_df is narrowed in place, so each bucket is filtered from the
            # result of the previous bucket; this relies on the buckets being nested -- confirm.
            seed_curr_df = seed_curr_df[
                (seed_curr_df[seed_cols].min(axis=1) >= bucket_min[bucket_ind])
                & (seed_curr_df[seed_cols].min(axis=1) < bucket_max[bucket_ind] - EPS)
            ]

            # Calculate geomean: shifted geometric mean, geometric_mean(x + SHIFT_TIME) - SHIFT_TIME
            metric_ind = 0
            disjset_time_geomean_df.loc[
                (disjset_bb_classes[class_ind], bb_buckets[bucket_ind], bb_metrics[metric_ind]),
                seed_cols
            ] = [
                    geometric_mean(seed_curr_df[col] + SHIFT_TIME) - SHIFT_TIME
                    for col in seed_cols
                ]
            
            # Calculate wins: a solver wins on an instance when the other solver's time exceeds
            # WIN_BY_TIME_FACTOR times its own (the 1 - solver_ind indexing assumes two solvers)
            metric_ind = 1
            for solver_ind in range(len(solver_stubs)):
                other_solver_ind = 1 - solver_ind
                disjset_time_geomean_df.loc[
                    (disjset_bb_classes[class_ind], bb_buckets[bucket_ind], bb_metrics[metric_ind]),
                    seed_cols[solver_ind]
                ] = sum(
                    seed_curr_df[seed_cols[other_solver_ind]]
                    > 
                    WIN_BY_TIME_FACTOR * seed_curr_df[seed_cols[solver_ind]]
                )

            # Record the number of instances in this bucket on every metric row of the bucket
            print("row {:d}: {:d}".format(row_ind,len(seed_curr_df)))
            disjset_num_inst_by_seed_bucket_class[seed_ind][row_ind:row_ind+len(bb_metrics)] = [len(seed_curr_df)] * len(bb_metrics)

            row_ind += len(bb_metrics)

            # for solver_ind in range(len(solver_stubs)):
            #     other_solver_ind = 1 - solver_ind
            #     disjset_time_geomean_df.loc[
            #         (disjset_bb_classes[class_ind], bb_buckets[bucket_ind], bb_metrics[metric_ind]),
            #         new_disjset_wins_cols[solver_ind]
            #     ] = sum(
            #         seed_curr_df[new_disjset_timing_cols[other_solver_ind][seed_ind]]
            #         > 
            #         WIN_BY_TIME_FACTOR * seed_curr_df[new_disjset_timing_cols[solver_ind][seed_ind]]
            #     )


# Part 2: the same statistics for the summary time columns (time_cols_long),
# with buckets determined by ref_time_col and refv_time_col
row_ind = 0
only_time_cols = [ ref_time_col, refv_time_col ]
cols = time_cols_long
wins_cols = new_disjset_wins_cols
for class_ind in range(disjset_num_bb_classes):
    print("\nClass: {}".format(disjset_bb_classes[class_ind]))
    curr_df = disjset_timing_df.loc[disjset_bb_classes_lists[class_ind]]

    # Every time that is >= MAX_TIME - EPS (a timeout) is replaced by TIMEOUT_TIME_FACTOR * MAX_TIME
    for col_ind in range(len(only_time_cols)):
        time_col = only_time_cols[col_ind]
        curr_df.loc[curr_df[time_col] >= MAX_TIME - EPS, time_col] = TIMEOUT_TIME_FACTOR * MAX_TIME

    for bucket_ind in range(num_buckets):
        print("Bucket: [{:d},{:d})".format(bucket_min[bucket_ind],bucket_max[bucket_ind]))

        # Keep the instances whose faster of the two times lies in [bucket_min, bucket_max - EPS).
        # NOTE(review): as above, curr_df is narrowed in place across buckets -- confirm the
        # buckets are nested.
        curr_df = curr_df[
            (curr_df[only_time_cols].min(axis=1) >= bucket_min[bucket_ind])
            & (curr_df[only_time_cols].min(axis=1) < bucket_max[bucket_ind] - EPS)
        ]

        # Calculate geomean: shifted geometric mean of every summary time column
        metric_ind = 0
        disjset_time_geomean_df.loc[
            (disjset_bb_classes[class_ind], bb_buckets[bucket_ind], bb_metrics[metric_ind]),
            cols
        ] = [
                geometric_mean(curr_df[col] + SHIFT_TIME) - SHIFT_TIME
                for col in cols
            ]
        
        # Calculate wins: the other solver's time exceeds WIN_BY_TIME_FACTOR times this solver's
        metric_ind = 1
        for solver_ind in range(len(solver_stubs)):
            other_solver_ind = 1 - solver_ind
            disjset_time_geomean_df.loc[
                (disjset_bb_classes[class_ind], bb_buckets[bucket_ind], bb_metrics[metric_ind]),
                only_time_cols[solver_ind]
            ] = sum(
                curr_df[only_time_cols[other_solver_ind]]
                > 
                WIN_BY_TIME_FACTOR * curr_df[only_time_cols[solver_ind]]
            )

            # Also report average number of seeds for which REF and REF+V win
            # (written to every metric row of this bucket).
            # NOTE(review): divides by len(curr_df), so an empty bucket would divide by zero --
            # confirm that buckets are never empty.
            disjset_time_geomean_df.loc[
                (disjset_bb_classes[class_ind], bb_buckets[bucket_ind], bb_metrics[0:len(bb_metrics)]),
                wins_cols[solver_ind]
            ] = sum(curr_df[wins_cols[solver_ind]]) / len(curr_df)

        # Record the number of instances in this bucket on every metric row of the bucket
        print("row {:d}: {:d}".format(row_ind,len(curr_df)))
        disjset_num_inst_avg[row_ind:row_ind+len(bb_metrics)] = [len(curr_df)] * len(bb_metrics)

        row_ind += len(bb_metrics)

# Insert columns with # inst (seed).
# With two solvers, each seed occupies three columns once its count is inserted, so position
# 2 + 3 * seed_ind places the count right after that seed's two time columns.
for seed_ind in range(NUM_SEEDS):
    disjset_time_geomean_df.insert(
        2 + seed_ind * (len(solver_stubs) + 1),
        "# inst ({:d})".format(seed_ind+1),
        disjset_num_inst_by_seed_bucket_class[seed_ind]
    )

# Insert columns with # inst (avg), two positions after the end of the per-seed column groups
disjset_time_geomean_df.insert(
    2 + NUM_SEEDS * (len(solver_stubs) + 1),
    "# inst (avg)",
    disjset_num_inst_avg
)

disjset_time_geomean_df
    

## "Combined" nodes summary

### disjset_nodes

In [None]:
# Build `disjset_nodes_df`: per-instance node counts for each solver and random seed,
# plus the number of seeds on which each solver wins.
inst_set = selected_disjset_time_instances

# Source columns: one 'ALL <solver> <nodes_stub>' column per solver (semicolon-separated values),
# followed by the two summary node columns and the number of VPCs
disjset_nodes_cols = (
    [' '.join(('ALL', solver_type, nodes_stub)) for solver_type in solver_stubs]
    + [ref_nodes_col, refv_nodes_col]
    + [col_num_vpc]
)

# New per-seed columns, indexed as [solver][seed]
new_disjset_nodes_cols = [
    ["{} ({:d}) {}".format(solver_type, seed, nodes_stub) for seed in range(1, NUM_SEEDS + 1)]
    for solver_type in solver_stubs
]

# One wins column per solver: index 0 is "first over second", index 1 is "second over first"
new_disjset_wins_cols = [
    "Wins ({} over {})".format(solver_stubs[winner_ind], solver_stubs[1 - winner_ind])
    for winner_ind in (0, 1)
]

disjset_nodes_df = df_disjset.loc[inst_set, disjset_nodes_cols]
# display(disjet_timing_df.head())

# Parse the semicolon-separated per-solver node columns into one float column per (seed, solver);
# columns are added seed by seed, with one column per solver within each seed
for seed_ind in range(NUM_SEEDS):
    for solver_ind, solver_type in enumerate(solver_stubs):
        orig_col = disjset_nodes_cols[solver_ind]
        new_col = new_disjset_nodes_cols[solver_ind][seed_ind]
        seed_values = disjset_nodes_df[orig_col].str.split(';').str[seed_ind]
        disjset_nodes_df[new_col] = seed_values.astype(float)

# Add wins columns: number of seeds for which the other solver's node count exceeds
# WIN_BY_TIME_FACTOR times this solver's.
# NOTE(review): this applies the *time* win factor to node counts -- confirm that is intended.
for solver_ind in range(len(solver_stubs)):
    other_solver_ind = 1 - solver_ind
    own_seed_cols = new_disjset_nodes_cols[solver_ind]
    other_seed_cols = new_disjset_nodes_cols[other_solver_ind]

    num_seed_wins = 0
    for seed_ind in range(NUM_SEEDS):
        num_seed_wins = num_seed_wins + (
            disjset_nodes_df[other_seed_cols[seed_ind]]
            > WIN_BY_TIME_FACTOR * disjset_nodes_df[own_seed_cols[seed_ind]]
        )
    disjset_nodes_df[new_disjset_wins_cols[solver_ind]] = num_seed_wins

display(disjset_nodes_df.head())

In [None]:
# Export the per-seed "Combined" node-count table
disjset_nodes_df.to_csv(path_or_buf="disjset_nodes.csv")

### disjset_nodes_geomean_df

In [None]:
# Row names
disjset_nodes_geomean_cols = [
    [
        new_disjset_nodes_cols[solver_ind][seed_ind] 
        for seed_ind in range(NUM_SEEDS) 
        for solver_ind in range(len(solver_stubs))
    ]
    + [ref_nodes_col, refv_nodes_col]
    + new_disjset_wins_cols
]
disjset_nodes_geomean_df = pd.DataFrame(
    columns = disjset_nodes_geomean_cols,
    index = disjset_bb_row_names,
    dtype = float
)

# Make all columns "object" type to allow for integer values
disjset_nodes_geomean_df.loc[:,(disjset_nodes_geomean_cols[0])] = disjset_nodes_geomean_df.loc[:,(disjset_nodes_geomean_cols[0])].astype(object)

# Prepare num_inst columns
disjset_num_inst_by_seed_bucket_class = [
    np.zeros(len(disjset_nodes_geomean_df),dtype = np.int64)
    for _ in range(NUM_SEEDS) 
]
disjset_num_inst_avg = np.zeros(len(disjset_nodes_geomean_df),dtype = np.int64)
disjset_num_ref_seed_wins = np.zeros(len(disjset_nodes_geomean_df),dtype = np.int64)
disjset_num_refv_seed_wins = np.zeros

# For each seed and each class, calculate the shifted geomean of nodes and the
# number of wins in every time bucket
for seed_ind in range(NUM_SEEDS):
    row_ind = 0
    for class_ind in range(disjset_num_bb_classes):
        print("\nClass: {}".format(disjset_bb_classes[class_ind]))
        curr_df = disjset_nodes_df.loc[disjset_bb_classes_lists[class_ind]]
        seed_cols = [ new_disjset_nodes_cols[solver_ind][seed_ind] for solver_ind in range(len(solver_stubs)) ]
        # Explicit copy: the timeout adjustment below writes into this frame
        seed_curr_df = curr_df[seed_cols].copy()

        time_curr_df = disjset_timing_df.loc[disjset_bb_classes_lists[class_ind]]
        time_seed_cols = [ new_disjset_timing_cols[solver_ind][seed_ind] for solver_ind in range(len(solver_stubs)) ]
        seed_time_curr_df = time_curr_df[time_seed_cols]

        # For every instance whose time in time_col is >= MAX_TIME - EPS, multiply nodes processed by TIMEOUT_NODE_FACTOR
        for time_col, node_col in zip(time_seed_cols, seed_cols):
            curr_selected_indices = seed_time_curr_df[time_col] >= MAX_TIME - EPS
            seed_curr_df.loc[curr_selected_indices, node_col] = TIMEOUT_NODE_FACTOR * seed_curr_df[node_col]

        # Smallest time over the solvers (with this seed); it decides which buckets an instance falls in
        min_seed_time = seed_time_curr_df[time_seed_cols].min(axis=1)

        for bucket_ind in range(num_buckets):
            print("Bucket: [{:d},{:d})".format(bucket_min[bucket_ind],bucket_max[bucket_ind]))

            # Take the instances whose smallest time lies in [bucket_min, bucket_max - EPS).
            # Each bucket is filtered from the full class frame; the class frames are no longer
            # overwritten, so a bucket does not inherit the previous bucket's filter.
            # Results are unchanged when the buckets are nested in increasing order.
            curr_selected_indices = \
                (min_seed_time >= bucket_min[bucket_ind]) \
                & (min_seed_time < bucket_max[bucket_ind] - EPS)
            bucket_df = seed_curr_df[curr_selected_indices]

            # Calculate geomean (shifted by SHIFT_NODES)
            metric_ind = 0
            disjset_nodes_geomean_df.loc[
                (disjset_bb_classes[class_ind], bb_buckets[bucket_ind], bb_metrics[metric_ind]),
                seed_cols
            ] = [
                    geometric_mean(bucket_df[col] + SHIFT_NODES) - SHIFT_NODES
                    for col in seed_cols
                ]

            # Calculate wins: the other solver needs more than WIN_BY_NODES_FACTOR times as many nodes
            metric_ind = 1
            for solver_ind in range(len(solver_stubs)):
                other_solver_ind = 1 - solver_ind
                disjset_nodes_geomean_df.loc[
                    (disjset_bb_classes[class_ind], bb_buckets[bucket_ind], bb_metrics[metric_ind]),
                    seed_cols[solver_ind]
                ] = sum(
                    bucket_df[seed_cols[other_solver_ind]]
                    >
                    WIN_BY_NODES_FACTOR * bucket_df[seed_cols[solver_ind]]
                )

            # Record the bucket size once per metric row of this (class, bucket)
            print("row {:d}: {:d}".format(row_ind,len(bucket_df)))
            disjset_num_inst_by_seed_bucket_class[seed_ind][row_ind:row_ind+len(bb_metrics)] = [len(bucket_df)] * len(bb_metrics)

            row_ind += len(bb_metrics)

# Repeat the per-bucket summary using the ref / refv time and node columns
# (time_cols / nodes_cols) instead of the per-seed columns
row_ind = 0
time_cols = [ ref_time_col, refv_time_col ]
nodes_cols = [ ref_nodes_col, refv_nodes_col ]
wins_cols = new_disjset_wins_cols
for class_ind in range(disjset_num_bb_classes):
    print("\nClass: {}".format(disjset_bb_classes[class_ind]))
    # Explicit copy: the timeout adjustment below writes into this frame
    curr_df = disjset_nodes_df.loc[disjset_bb_classes_lists[class_ind]].copy()
    time_curr_df = disjset_timing_df.loc[disjset_bb_classes_lists[class_ind]]

    # For every instance whose time in time_col is >= MAX_TIME - EPS, multiply nodes processed by TIMEOUT_NODE_FACTOR
    for time_col, node_col in zip(time_cols, nodes_cols):
        curr_selected_indices = time_curr_df[time_col] >= MAX_TIME - EPS
        curr_df.loc[curr_selected_indices, node_col] = TIMEOUT_NODE_FACTOR * curr_df[node_col]

    # Smallest time over the two solvers; it decides which buckets an instance falls in
    min_time = time_curr_df[time_cols].min(axis=1)

    for bucket_ind in range(num_buckets):
        print("Bucket: [{:d},{:d})".format(bucket_min[bucket_ind],bucket_max[bucket_ind]))

        # Take the instances whose smallest time lies in [bucket_min, bucket_max - EPS).
        # Each bucket is filtered from the full class frame; the class frames are no longer
        # overwritten, so a bucket does not inherit the previous bucket's filter.
        # Results are unchanged when the buckets are nested in increasing order.
        curr_selected_indices = \
            (min_time >= bucket_min[bucket_ind]) \
            & (min_time < bucket_max[bucket_ind] - EPS)
        bucket_df = curr_df[curr_selected_indices]

        # Calculate geomean (shifted by SHIFT_NODES)
        metric_ind = 0
        disjset_nodes_geomean_df.loc[
            (disjset_bb_classes[class_ind], bb_buckets[bucket_ind], bb_metrics[metric_ind]),
            nodes_cols
        ] = [
                geometric_mean(bucket_df[col] + SHIFT_NODES) - SHIFT_NODES
                for col in nodes_cols
            ]

        # Calculate wins: the other solver needs more than WIN_BY_NODES_FACTOR times as many nodes
        metric_ind = 1
        for solver_ind in range(len(solver_stubs)):
            other_solver_ind = 1 - solver_ind
            disjset_nodes_geomean_df.loc[
                (disjset_bb_classes[class_ind], bb_buckets[bucket_ind], bb_metrics[metric_ind]),
                nodes_cols[solver_ind]
            ] = sum(
                bucket_df[nodes_cols[other_solver_ind]]
                >
                WIN_BY_NODES_FACTOR * bucket_df[nodes_cols[solver_ind]]
            )

            # Also report average number of seeds for which REF and REF+V win
            # (written to every metric row of this class/bucket); an empty bucket
            # yields NaN instead of raising ZeroDivisionError
            if len(bucket_df) > 0:
                avg_seed_wins = sum(bucket_df[wins_cols[solver_ind]]) / len(bucket_df)
            else:
                avg_seed_wins = float('nan')
            disjset_nodes_geomean_df.loc[
                (disjset_bb_classes[class_ind], bb_buckets[bucket_ind], bb_metrics[0:len(bb_metrics)]),
                wins_cols[solver_ind]
            ] = avg_seed_wins

        # Record the bucket size once per metric row of this (class, bucket)
        print("row {:d}: {:d}".format(row_ind,len(bucket_df)))
        disjset_num_inst_avg[row_ind:row_ind+len(bb_metrics)] = [len(bucket_df)] * len(bb_metrics)

        row_ind += len(bb_metrics)

# Insert one "# inst (k)" column per seed; insertion positions are spaced by the
# number of solver columns plus the count column inserted for the previous seeds
seed_group_width = len(solver_stubs) + 1
for seed_ind in range(NUM_SEEDS):
    insert_at = 2 + seed_ind * seed_group_width
    seed_label = "# inst ({:d})".format(seed_ind+1)
    disjset_nodes_geomean_df.insert(insert_at, seed_label, disjset_num_inst_by_seed_bucket_class[seed_ind])

# Insert the "# inst (avg)" column at the position following the last seed group
avg_insert_at = 2 + NUM_SEEDS * seed_group_width
disjset_nodes_geomean_df.insert(avg_insert_at, "# inst (avg)", disjset_num_inst_avg)

disjset_nodes_geomean_df


## Table 3: `disjset_avg_bb_df` for "Combined" instances

In [None]:
## Set up empty `disjset_avg_bb_df`

disjset_time_cols_short = time_cols_short
disjset_node_cols_short = node_cols_short

# Two-level columns: criterion header on top, short column name underneath
criterion_level = [time_col_header]*len(disjset_time_cols_short) + [node_col_header]*len(disjset_node_cols_short)
type_level = disjset_time_cols_short + disjset_node_cols_short
disjset_avg_bb_cols = pd.MultiIndex.from_arrays(
    [criterion_level, type_level],
    names = ['criterion', 'type'])

#bb_row_names = pd.MultiIndex.from_product([bb_buckets, bb_row_names], names=['bucket', 'metric'])
# One row per (class, bucket, metric) combination
disjset_bb_row_names = pd.MultiIndex.from_product(
    [disjset_bb_classes, bb_buckets, bb_metrics],
    names=['class', 'bucket', 'metric'])

disjset_avg_bb_df = pd.DataFrame(
    dtype = float,
    index = disjset_bb_row_names,
    columns = disjset_avg_bb_cols
)

# Make all columns "object" type to allow for integer values
for header, short_cols in (
    (time_col_header, disjset_time_cols_short),
    (node_col_header, disjset_node_cols_short),
):
    disjset_avg_bb_df.loc[:,(header,short_cols)] = disjset_avg_bb_df.loc[:,(header,short_cols)].astype(object)

num_inst = np.zeros(len(disjset_avg_bb_df),dtype = np.int64)
# cols = disjset_time_cols_short + disjset_node_cols_short

## Fill in values from disjset_time_geomean_df and disjset_nodes_geomean_df (from the ref and refv cols)
# Time values are copied first, then node values; each short column name is
# translated to its source column through the matching map
for source_df, header, short_cols, short_to_source in (
    (disjset_time_geomean_df, time_col_header, disjset_time_cols_short, map_short_to_cols_time),
    (disjset_nodes_geomean_df, node_col_header, disjset_node_cols_short, map_short_to_cols_nodes),
):
    for index, row in source_df.iterrows():
        class_val, bucket_val, metric_val = index
        for col in short_cols:
            disjset_avg_bb_df.loc[(class_val, bucket_val, metric_val), (header, col)] = \
                row[short_to_source[col]]

# Add in number of instances in each bucket as '# inst' col
disjset_avg_bb_df[inst_col_name] = disjset_num_inst_avg

# Show the node columns only, then the rows of the first metric, then the full table
is_node_col = disjset_avg_bb_df.columns.get_level_values(0)==node_col_header
display(disjset_avg_bb_df.loc[:,is_node_col].head(8))
#display(avg_bb_df.loc[(bb_classes[0], bb_buckets[1], bb_metrics[0]),:])
display(disjset_avg_bb_df.loc[(disjset_bb_classes, bb_buckets, bb_metrics[0]),:])
display(disjset_avg_bb_df)


# Section 5: rounds results

## Select rounds instances

In [None]:
## Read in rounds data
#df_rounds = pd.read_csv(RESULTS_DIR + '/' + 'vpc-rounds.csv', index_col=0)
df_rounds = initialize_df(RESULTS_DIR + '/' + 'vpc-rounds.csv')

## Change disj_terms (second level of index) to -1
df_rounds.index = df_rounds.index.set_levels([-1], level=1)

## Update column types
col_list = [col_best_disj_obj, col_worst_disj_obj]
for col in col_list:
    df_rounds[col] = pd.to_numeric(df_rounds[col])

## Create new column for number of disjunctive terms since original one is now index
df_rounds[col_num_disj_terms] = df_rounds.index.get_level_values(1)

## Identify pure binary instances, which are those where 'CLEANED BINARY' column equals 'CLEANED COLS'
df_rounds[col_pure_binary] = (df_rounds[col_binary] == df_rounds[col_num_cols])

## Identify mixed binary instances, which are those where 'CLEANED GEN INT' column = 0
df_rounds[col_mixed_binary] = (df_rounds[col_gen_int] == 0)

## Recompute average running times
# curr_df is the same object as df_rounds, so the column assignments below modify df_rounds
curr_df = df_rounds
solver_stub = solver_stubs[1]
# Bind `solver` explicitly: the code below used `solver` without setting it in this cell,
# so it silently depended on whatever value an earlier cell had left behind
solver = solver_stub

# Split values in 'ALL <solver> TIME' into new columns '<solver> TIME (k)', one per ';'-separated entry
df_timing = curr_df['ALL '+solver+' TIME'].str.split(';', expand=True)
df_timing.columns = [solver+' TIME (%d)' % (i+1) for i in range(df_timing.shape[1])]
df_timing = df_timing.astype(float)
# Parentheses are escaped so they match the literal "(k)" suffix instead of forming a regex group
df_timing_cols = df_timing.columns[df_timing.columns.str.contains(re.escape(solver)+r' TIME \(.+\)')]

# Do the same for nodes
df_nodes = curr_df['ALL '+solver+' NODES'].str.split(';', expand=True)
df_nodes.columns = [solver+' NODES (%d)' % (i+1) for i in range(df_nodes.shape[1])]
df_nodes = df_nodes.astype(float)
df_nodes_cols = df_nodes.columns[df_nodes.columns.str.contains(re.escape(solver)+r' NODES \(.+\)')]

# Select entries in which the time is greater than MAX_TIME - EPS (timeouts)
selected_entries = df_timing > (MAX_TIME - EPS)

# Add min and max of '<solver> TIME (k)' columns (taken before the timeout adjustment)
curr_df[solver+' TIME MIN'] = df_timing[df_timing_cols].min(axis=1)
curr_df[solver+' TIME MAX'] = df_timing[df_timing_cols].max(axis=1)

# Find average of '<solver> TIME (k)' columns after adjusting for timeout
df_timing[selected_entries] = TIMEOUT_TIME_FACTOR * MAX_TIME
# df_timing[solver+' TIME AVG'] = df_timing.mean(axis=1)
curr_df['AVG '+solver+' TIME'] = df_timing[df_timing_cols].mean(axis=1)

## Repeat for nodes
# Add min and max of '<solver> NODES (k)' columns (taken before the timeout adjustment)
curr_df[solver+' NODES MIN'] = df_nodes[df_nodes_cols].min(axis=1)
curr_df[solver+' NODES MAX'] = df_nodes[df_nodes_cols].max(axis=1)

# Find average of '<solver> NODES (k)' columns after adjusting for timeout.
# The timeout mask is relabeled with the node column names so it can index df_nodes;
# this relies on df_timing and df_nodes having the same number of columns
selected_entries.columns = df_nodes_cols
df_nodes[selected_entries] *= TIMEOUT_NODE_FACTOR
# df_nodes[solver+' NODES AVG'] = df_nodes[df_nodes_cols].mean(axis=1)
curr_df['AVG '+solver+' NODES'] = df_nodes[df_nodes_cols].mean(axis=1)

# NOTE(review): the two concatenations below rebind curr_df to a new frame, so the
# per-seed columns are NOT added to df_rounds -- confirm whether that is intended
# Append df_timing to df
curr_df = pd.concat([curr_df, df_timing], axis=1)

# Append df_nodes to df
curr_df = pd.concat([curr_df, df_nodes], axis=1)

## Add col for avg time with cuts
df_rounds[refv_w_cut_time_col] = df_rounds[refv_time_col] + df_rounds[col_vpc_gen_time]

display(df_rounds.head(2))
display(df_rounds.loc['mas284_presolved'][obj_val_col_list])

In [None]:
# Select the instances used for the rounds experiments.
# An instance is kept for the gap-closed results unless one of the checks below flags it;
# it is also kept for the nodes/time results if both ref and refv times are below MAX_TIME - EPS.
inst_set = list(instances)
# inst_set = ['bm23_presolved', 'cvs16r106-72_presolved', 'cvs16r128-89_presolved', 'mine-90-10_presolved', 'misc03_presolved', 'neos-1058477_presolved', 'neos-3083819-nubu_presolved', 'neos-593853_presolved', 'neos-880324_presolved', 'ns1688347_presolved']
PRINT_SKIP_REASON = False
PRINT_NEW_SKIP_REASON = True

rounds_num_gap_errors = 0

selected_rounds_gap_instances_dict = {} # dictionary of (instance, original index)
selected_rounds_time_instances_dict = {} # dictionary of (instance, original index)
for i, inst in enumerate(inst_set):
  print("{}/{}".format(i+1,len(inst_set)), end='\r', flush=True)
  skip_instance = False
  # Skip reasons are printed for every instance when PRINT_SKIP_REASON is set, and otherwise
  # only for instances that are in selected_gap_instances (when PRINT_NEW_SKIP_REASON is set)
  prev_selected_instance = inst in selected_gap_instances
  CURR_PRINT_SKIP_REASON = PRINT_SKIP_REASON or (PRINT_NEW_SKIP_REASON and prev_selected_instance)

  # # Check that -1 depth exists
  # if not (-1 in df.loc[inst].index):
  #   # if PRINT_SKIP_REASON:
  #   print("Skipping instance {:d} -- {}: no -1 depth.".format(
  #       i, inst
  #   ))
  #   skip_instance = True
  #   continue
  if inst not in df_rounds.index:
    if CURR_PRINT_SKIP_REASON:
      print("Skipping instance {:d} -- {}: no rounds results.".format(
          i, inst
      ))
    skip_instance = True
    continue

  curr_df = df_rounds.loc[(inst,-1)]

  # Ensure nrows and ncols is not too many
  nrows = curr_df[col_num_rows]
  ncols = curr_df[col_num_cols]
  if (nrows > MAX_ROWS) or (ncols > MAX_COLS):
    if CURR_PRINT_SKIP_REASON:
      # Arguments follow the placeholders: nrows vs. MAX_ROWS, then ncols vs. MAX_COLS
      # (they used to be passed as nrows, ncols, MAX_ROWS, MAX_COLS)
      print("Skipping instance {:d} -- {}: nrows = {:d} > {:d} or ncols = {:d} > {:d}.".format(
          i, inst, nrows, MAX_ROWS, ncols, MAX_COLS))
    skip_instance = True

  # Ensure IP objective value is known
  # NOTE(review): np.float64 is a subclass of float, so this isinstance check passes for
  # every value np.float64() accepts (including NaN) -- confirm whether a NaN check is wanted
  ip_obj = np.float64(df_ipopt.loc[inst,col_ip_obj])
  if not isinstance(ip_obj,float):
    if CURR_PRINT_SKIP_REASON:
      print(
          "Skipping instance {:d} -- {}: IP objective value ({}) is not detected to be a float value.".format(
          i, inst, ip_obj))
    skip_instance = True

  # Replace the IP objective value stored in df_rounds when it disagrees with the known one
  check_ip_obj = curr_df[col_ip_obj]
  if not is_val(ip_obj,check_ip_obj):
    print("*** ERROR: Instance {:d} -- {}: IP objective value ({}) does not match value in dataframe ({}). REPLACING WITH KNOWN VALUE!".format(
        i, inst, ip_obj, check_ip_obj))
    df_rounds.loc[(inst,-1),col_ip_obj] = ip_obj
    # skip_instance = True

  # Check that LP opt < IP opt
  lp_obj = np.float64(df_preprocess.loc[remove_presolved_from_name(inst),col_cleaned_lp_obj])
  YES_GAP = (ip_obj - lp_obj) >= EPS
  if not YES_GAP:
    print("*** ERROR: Instance {:d} -- {}: not YES GAP (lp = {:.10f}; ip = {:.10f}, diff = {:.2f})".format(i, inst, lp_obj, ip_obj, ip_obj-lp_obj))
    skip_instance = True
    rounds_num_gap_errors += 1

  # Check that ExitReason != OPTIMAL_SOLUTION_FOUND
  exitreason = curr_df['ExitReason']
  if exitreason == 'OPTIMAL_SOLUTION_FOUND':
    if CURR_PRINT_SKIP_REASON:
      print("Skipping instance {:d} -- {}: optimal IP solution found.".format(
          i, inst
      ))
    skip_instance = True

  # Check that VPCs were generated
  num_vpc = curr_df[col_num_vpc]
  if num_vpc == 0:
    if CURR_PRINT_SKIP_REASON:
      print("Skipping instance {:d} -- {}: no VPCs generated.".format(
          i, inst
      ))
    skip_instance = True

  if not skip_instance:
    selected_rounds_gap_instances_dict[inst] = i

    # If both ref and refv times are < MAX_TIME - EPS, include in nodes experiments
    ref_time = curr_df[ref_time_col]
    refv_time = curr_df[refv_time_col]
    if (ref_time < MAX_TIME - EPS) and (refv_time < MAX_TIME - EPS):
      selected_rounds_time_instances_dict[inst] = i

selected_rounds_gap_instances = list(selected_rounds_gap_instances_dict.keys())
print("Total number of errors: {}".format(rounds_num_gap_errors))
print("Num instances selected for rounds gap closed results = {:d}".format(len(selected_rounds_gap_instances)))

selected_rounds_time_instances = list(selected_rounds_time_instances_dict.keys())
print("Num instances selected for rounds nodes results = {:d}".format(len(selected_rounds_time_instances)))

### List instances that are in exactly one of the two sets `selected_gap_instances` and `selected_rounds_gap_instances`

In [None]:
# Report the differences between the rounds selections and the other selections
print("Instances in selected_rounds_gap_instances that are not in selected_gap_instances:")
newly_selected_instances = [inst for inst in selected_rounds_gap_instances if inst not in selected_gap_instances]
print(newly_selected_instances)

# Remove newly selected instances (the list is updated in place)
selected_rounds_gap_instances[:] = [
    kept_inst for kept_inst in selected_rounds_gap_instances
    if kept_inst not in newly_selected_instances
]

# Each entry: (message, instances to list, instances to exclude from the listing)
for message, candidates, excluded in (
    (
        "Instances in selected_gap_instances that are not in selected_rounds_gap_instances:",
        selected_gap_instances,
        selected_rounds_gap_instances,
    ),
    (
        "Instances in selected_time_instances that are not in selected_rounds_time_instances:",
        selected_time_instances,
        selected_rounds_time_instances,
    ),
    (
        "Instances in selected_rounds_time_instances that are not in selected_time_instances:",
        selected_rounds_time_instances,
        selected_time_instances,
    ),
    (
        "Instances in selected_rounds_time_instances that are not in selected_disjset_time_instances:",
        selected_rounds_time_instances,
        selected_disjset_time_instances,
    ),
):
    print("")
    print(message)
    print([candidate for candidate in candidates if candidate not in excluded])

#### DEBUG: why instance is missing from rounds results

In [None]:
# # Check if instance is in df_rounds and identify reason it was not selected if yes
# inst = 'lotsize_presolved'
# if inst in df_rounds.index:
#   print("Instance {} is in df_rounds.".format(inst))
#   curr_df = df_rounds.loc[(inst,-1)]
#   ip_obj = np.float64(df_ipopt.loc[inst,col_ip_obj])
#   check_ip_obj = curr_df[col_ip_obj]
#   if not is_val(ip_obj,check_ip_obj):
#     print("*** ERROR: Instance {}: IP objective value ({}) does not match value in dataframe ({}). NEED TO REPLACE WITH KNOWN VALUE! (NOT DONE HERE.)".format(inst, ip_obj, check_ip_obj))
#     # df_rounds.loc[(inst,-1),col_ip_obj] = ip_obj
#   exitreason = curr_df['ExitReason']
#   if exitreason == 'OPTIMAL_SOLUTION_FOUND':
#     print("Instance {} excluded because optimal IP solution found.".format(inst))
#   num_vpc = curr_df[col_num_vpc]
#   if num_vpc == 0:
#     print("Instance {} excluded because no VPCs generated.".format(inst))
# else:
#     print("Instance {} is not in df_rounds.".format(inst))

# # Check timing
# solver = 'REF+V'
# df_rounds.loc[inst, 'ALL '+solver+' TIME'].str.split(';', expand=True)


## rounds gap summary

### `rounds_gap_df`: Analyze rounds gap closed

In [None]:
# Create subset of dataframe relevant to gap closed
rounds_gap_df = df_rounds.loc[selected_rounds_gap_instances, 
                [
                    col_num_disj_terms,
                    col_num_rows,
                    col_num_cols
                ]
                +
                obj_val_col_list
                +
                [
                    col_num_gmic,
                    col_num_vpc,
                    col_num_obj,
                    col_exit_reason
                ]
               ]

# Take only -1 depth (this drops the 'disj_terms' level, leaving one row per instance)
rounds_gap_df = rounds_gap_df.xs(-1, level='disj_terms')

# Calculate some missing % gap closed columns
# gap closed = 100 * (post_cut_opt_val - lp_opt_val) / (ip_opt_val - lp_opt_val)
for cut_type in reg_cut_type_long_list:
    col = cut_type + ' ' + obj_stub
    # Check the frame that is actually passed to calc_gap_closed below
    # (the check used to look at the columns of the unrelated `df`)
    if col not in rounds_gap_df.columns:
        if cut_type == 'MAX(GMIC,VPC)':
            # Add max(G,V) column
            rounds_gap_df[cut_type + ' ' + pct_gap_closed_stub] = \
                np.maximum(
                    rounds_gap_df['GMIC' + ' ' + pct_gap_closed_stub],
                    rounds_gap_df[ 'VPC' + ' ' + pct_gap_closed_stub]
                )
        continue
    rounds_gap_df[cut_type + ' ' + pct_gap_closed_stub] = calc_gap_closed(rounds_gap_df, col)

# Compare against reference solver: one % gap closed column per
# (first three solver stats) x solver x solver cut type
for stat_type in solver_stat_list[:3]:
    for solver_type in solver_stubs:
        for cut_type in solver_cut_type_stubs:
            col = stat_type + ' ' + solver_type + ' ' + cut_type
            rounds_gap_df[col + ' ' + pct_gap_closed_stub] = calc_gap_closed(rounds_gap_df, col)

# Rename each column with its shortname
rounds_gap_df.rename(columns=map_cols_to_short_gap,inplace=True)

# Fill in values of col_first_cut_pass_gap_ref and col_last_cut_pass_gap_ref from disjset_gap_df
for inst in selected_rounds_gap_instances:
    for col in [col_first_cut_pass_gap_ref, col_last_cut_pass_gap_ref]:
        col = map_cols_to_short_gap[col]
        rounds_gap_df.loc[inst,col] = disjset_gap_df.loc[inst,col]

# Spot-check a couple of instances in both frames
tmp_inst_set = ['mas284_presolved','maxgasflow_presolved']
display(disjset_gap_df.loc[tmp_inst_set][gap_cols_short])
display(rounds_gap_df.loc[tmp_inst_set][gap_cols_short])
display(rounds_gap_df.loc[("mas284_presolved")])

In [None]:
# Write the rounds gap-closed table to the current working directory
rounds_gap_df.to_csv(path_or_buf='rounds_gap.csv')

### `rounds_avg_gap_df`: summary of results with rounds instances

In [None]:
## TABLE 1.2: average percent gap closed across different combinations of cuts
## Create rounds_avg_gap_df = average gap closed across instances
# Rows: an average row and a wins row for each of the three instance sets
idx = pd.MultiIndex.from_product(
    [ [all_set_name, good_vpc_set_name, binary_set_name], [avg_row_name, wins_row_name] ],
    names = ['Set', '']
)

ncols = len(gap_cols_short)
nrows = len(idx)

# Instance subsets: 'V' value of at least 10, and pure binary instances among the selected ones
col = rounds_gap_df['V'].astype(float)
rounds_good_vpc_df = rounds_gap_df[col >= 10.]
rounds_binary_instances_df = rounds_gap_df.loc[ [inst for inst in pure_binary_instances if inst in selected_rounds_gap_instances ] ]

# Frames in the same order as the sets in idx
rounds_set_dfs = [rounds_gap_df, rounds_good_vpc_df, rounds_binary_instances_df]

# Even rows hold the column means of each set; odd (wins) rows start at zero
data = np.zeros((nrows, ncols), dtype=float)
for set_ind, set_df in enumerate(rounds_set_dfs):
    data[2 * set_ind,:] = [set_df[gap_col].mean() for gap_col in gap_cols_short]

# display(best_gap_df.head())
rounds_avg_gap_df = pd.DataFrame(
    data,
    columns = gap_cols_short,
    index = idx,
    dtype = object
)

# Instance count next to each average row, 0 as a placeholder on each wins row
inst_col_name = '# inst'
inst_counts = []
for set_df in rounds_set_dfs:
    inst_counts.extend([len(set_df), 0])
rounds_avg_gap_df[inst_col_name] = inst_counts

# Blank out the wins rows
for wins_row in (1, 3, 5):
    rounds_avg_gap_df.iloc[wins_row] = [""] * (ncols+1)

display(rounds_avg_gap_df)

### `rounds_wins_df`: number of wins across methods

In [None]:
## Create num wins df
# x wins over y for an instance if x > y + WINS_EPS
#shortcols = avg_gap_df.columns[0:-1]
rounds_wins_df = pd.DataFrame(
    np.zeros((len(gap_cols_short), len(gap_cols_short)), dtype=int),
    columns = gap_cols_short,
    index = gap_cols_short,
    dtype = int,
)

WINS_EPS = GAP_DIFF_EPS

def _count_rounds_wins(gap_df, destcol, refcol):
    """Count the rows of gap_df in which destcol exceeds refcol by more than WINS_EPS."""
    return sum(gap_df[destcol] > gap_df[refcol] + WINS_EPS)

# Entry [x, y] = number of instances on which x wins over y.
# permutations() yields every ordered pair, so one assignment per pair fills the
# whole off-diagonal (each cell used to be written twice)
from itertools import permutations
for (winner_col, loser_col) in permutations(gap_cols_short, 2):
    rounds_wins_df.at[winner_col,loser_col] = int(_count_rounds_wins(rounds_gap_df, winner_col, loser_col))

# Sets we are considering
# all_set = 'Wins (All)'
# good_vpc_set = 'Wins (V ≥ 10%)'
rounds_all_set = (all_set_name,wins_row_name)
rounds_good_vpc_set = (good_vpc_set_name,wins_row_name)
rounds_binary_set = (binary_set_name,wins_row_name)

# (destination column, reference column): the wins row of the destination column
# reports how often it beats the reference column
#   "G" are wins relative to "V"
#   "DB", "V", "V+G" are wins relative to "G"
#   "V+GurF" are wins relative to "GurF"
#   "V+GurL" are wins relative to "GurL"
rounds_win_comparisons = [
    ('G', 'V'),
    ('DB', 'G'),
    ('V', 'G'),
    ('V+G', 'G'),
    ('V+GurF', 'GurF'),
    ('V+GurL', 'GurL'),
]
# Pairs whose rounds_wins_df entries are recomputed explicitly before being read
# (the original cell did this for the GurL pair only)
rounds_recompute_pairs = {('V+GurL', 'GurL')}

for shortdestcol, shortrefcol in rounds_win_comparisons:
    destcol = shortdestcol
    refcol = shortrefcol
    if (shortdestcol, shortrefcol) in rounds_recompute_pairs:
        rounds_wins_df.at[shortdestcol,shortrefcol] = int(_count_rounds_wins(rounds_gap_df, destcol, refcol))
        rounds_wins_df.at[shortrefcol,shortdestcol] = int(_count_rounds_wins(rounds_gap_df, refcol, destcol))
    rounds_avg_gap_df.at[rounds_all_set,shortdestcol] = rounds_wins_df.at[shortdestcol,shortrefcol]
    rounds_avg_gap_df.at[rounds_good_vpc_set,shortdestcol] = _count_rounds_wins(rounds_good_vpc_df, destcol, refcol)
    rounds_avg_gap_df.at[rounds_binary_set,shortdestcol] = _count_rounds_wins(rounds_binary_instances_df, destcol, refcol)

# Count number of instances that have V+G > 0 (stored in the '# inst' column of the wins rows)
shortdestcol = inst_col_name
#col = 'V+GurL'
destcol = 'V+G'
rounds_avg_gap_df.at[rounds_all_set,shortdestcol] = sum(rounds_gap_df[destcol] > WINS_EPS)
rounds_avg_gap_df.at[rounds_good_vpc_set,shortdestcol] = sum(rounds_good_vpc_df[destcol] > WINS_EPS)
rounds_avg_gap_df.at[rounds_binary_set,shortdestcol] = sum(rounds_binary_instances_df[destcol] > WINS_EPS)

display(rounds_avg_gap_df)
display(rounds_wins_df)

### Update Table 2 `gap_by_size_df` with rounds results

In [None]:
# Append a "Rounds" row to Table 2 (`gap_by_size_df`).
# The values are read from `rounds_avg_gap_df` under the label 'All', restricted
# to the columns Table 2 already has; `[0:1]` keeps only the first matching row.
# NOTE(review): presumably `.loc['All', ...]` returns several rows here (one per
# statistic) and the first one is the average -- confirm against the cell that
# builds `rounds_avg_gap_df`.
rounds_row = rounds_avg_gap_df.loc['All',gap_by_size_df.columns][0:1].to_numpy()

# Assigning to a new label enlarges the dataframe by one row
gap_by_size_df.loc['Rounds'] = rounds_row[0]

# Display the updated `gap_by_size` dataframe
display(gap_by_size_df)


### How long did it take to generate these cuts, on average, compared to just one round with half the cut limit?

In [None]:
# Only instances selected in BOTH the disjunction-set and the rounds experiments are compared
intersection_disjset_rounds_gap_instances = list(
    set(selected_disjset_gap_instances) & set(selected_rounds_gap_instances)
)

# Mean VPC generation time over the common instances, for each experiment
disjset_gen_times = df_disjset.loc[intersection_disjset_rounds_gap_instances,[col_vpc_gen_time]]
rounds_gen_times = df_rounds.loc[intersection_disjset_rounds_gap_instances,[col_vpc_gen_time]]
disjset_avg_gen_time = disjset_gen_times.mean().iloc[0]
rounds_avg_gen_time = rounds_gen_times.mean().iloc[0]
print(f"Average VPC generation time for disjset instances: {disjset_avg_gen_time:.2f}")
print(f"Average VPC generation time for rounds instances: {rounds_avg_gen_time:.2f}")

# Same average, broken down by number of disjunctive terms
curr_df = df.loc[intersection_disjset_rounds_gap_instances,[col_vpc_gen_time]]
for size in sizes:
    # Cross-section of the rows whose 'disj_terms' index level equals this size
    curr_size_gen_times = curr_df.xs(size, level='disj_terms')[col_vpc_gen_time]
    curr_size_avg_gen_time = curr_size_gen_times.mean()
    print(f"Average VPC generation time for instances of size {size:d}: {curr_size_avg_gen_time:.2f}")

## rounds timing summary

### `rounds_timing`

In [None]:
inst_set = selected_rounds_time_instances
NUM_SEEDS = 7

# Columns taken from `df_rounds`: one "ALL <solver> <time>" column per solver
# (semicolon-separated per-seed values), followed by the aggregate time columns,
# the VPC generation time and the number of VPCs
rounds_timing_cols = (
    [' '.join(['ALL', solver_type, time_stub]) for solver_type in solver_stubs]
    + [ref_time_col, refv_time_col]
    + [col_vpc_gen_time]
    + [col_num_vpc]
)

# new_rounds_timing_cols[solver][seed] is the name of the per-seed time column
new_rounds_timing_cols = [
    [' '.join([solver_type, '(%d)' % seed, time_stub]) for seed in range(1, NUM_SEEDS + 1)]
    for solver_type in solver_stubs
]

# One wins column per ordered pair of the two solvers
new_rounds_wins_cols = [
    f"Wins ({winner} over {loser})"
    for winner, loser in (
        (solver_stubs[0], solver_stubs[1]),
        (solver_stubs[1], solver_stubs[0]),
    )
]
rounds_timing_df = df_rounds.loc[inst_set, rounds_timing_cols]

# The values for the first solver are taken from `disjset_timing_df` instead of `df_rounds`
cols = [' '.join(['ALL', solver_stubs[0], time_stub]), ref_time_col]
rounds_timing_df.loc[:,cols] = disjset_timing_df.loc[:,cols]

# Split every semicolon-separated "ALL ..." column into one float column per seed.
# Seeds form the outer loop so that the new columns are appended seed by seed.
for seed_ind in range(NUM_SEEDS):
    for solver_ind, solver_type in enumerate(solver_stubs):
        orig_col = rounds_timing_cols[solver_ind]
        new_col = new_rounds_timing_cols[solver_ind][seed_ind]
        seed_values = rounds_timing_df[orig_col].str.split(';').str[seed_ind]
        rounds_timing_df[new_col] = seed_values
        rounds_timing_df[new_col] = rounds_timing_df[new_col].astype(float)

# Per instance, count the seeds on which a solver wins, i.e. the other solver's
# time exceeds WIN_BY_TIME_FACTOR times its own
for solver_ind in range(len(solver_stubs)):
    other_solver_ind = 1 - solver_ind
    own_seed_cols = new_rounds_timing_cols[solver_ind]
    other_seed_cols = new_rounds_timing_cols[other_solver_ind]

    rounds_timing_df[new_rounds_wins_cols[solver_ind]] = sum(
        rounds_timing_df[other_seed_col] > WIN_BY_TIME_FACTOR * rounds_timing_df[own_seed_col]
        for own_seed_col, other_seed_col in zip(own_seed_cols, other_seed_cols)
    )

display(rounds_timing_df.head())

In [None]:
# Export `rounds_timing_df` to CSV; the path is relative, so the file is written to the current working directory
rounds_timing_df.to_csv("rounds_timing.csv")

### `rounds_time_geomean_df`

In [None]:
# Set up the (empty) table of per-seed and aggregate time statistics for the rounds experiments.

# Instance classes: all selected rounds instances, and the pure binary ones among them
rounds_bb_classes = ['Rounds','RoundsBinary']
rounds_bb_classes_lists = [ selected_rounds_time_instances, [inst for inst in pure_binary_instances if inst in selected_rounds_time_instances] ]
rounds_num_bb_classes = len(rounds_bb_classes)

# Rows are geomean and wins for each bucket
rounds_bb_row_names = pd.MultiIndex.from_product(
    [rounds_bb_classes, bb_buckets, bb_metrics],
    names=['class', 'bucket', 'metric'])
# Columns: per-seed time columns ordered seed by seed (all solvers of seed 1, then seed 2, ...),
# followed by the aggregate time columns, the VPC generation time and the wins columns.
# NOTE(review): this is a list wrapping a single list, so pandas receives a list of arrays
# for `columns`; later code indexes it with `[0]` -- keep the nesting unless all uses are updated.
rounds_time_geomean_cols = [
    [
        new_rounds_timing_cols[solver_ind][seed_ind] 
        for seed_ind in range(NUM_SEEDS) 
        for solver_ind in range(len(solver_stubs))
    ]
    + [ref_time_col, refv_time_col]
    + [col_vpc_gen_time]
    + new_rounds_wins_cols
]
rounds_time_geomean_df = pd.DataFrame(
    columns = rounds_time_geomean_cols,
    index = rounds_bb_row_names,
    dtype = float
)

# Make all columns "object" type to allow for integer values
rounds_time_geomean_df.loc[:,(rounds_time_geomean_cols[0])] = rounds_time_geomean_df.loc[:,(rounds_time_geomean_cols[0])].astype(object)

# Prepare num_inst columns
# One integer array per seed, with one entry per row of the table
rounds_num_inst_by_seed_bucket_class = [
    np.zeros(len(rounds_time_geomean_df),dtype = np.int64)
    for _ in range(NUM_SEEDS) 
    # for _ in range(num_buckets) 
    # for _ in range(rounds_num_bb_classes)
]
# Instance counts for the aggregate (non per-seed) columns, one entry per row
rounds_num_inst_avg = np.zeros(len(rounds_time_geomean_df),dtype = np.int64)
# NOTE(review): the two arrays below are only initialized here; no later write to them
# is visible in this part of the notebook
rounds_num_ref_seed_wins = np.zeros(len(rounds_time_geomean_df),dtype = np.int64)
rounds_num_refv_seed_wins = np.zeros(len(rounds_time_geomean_df),dtype = np.int64)

# Per-seed statistics: for every random seed, instance class and time bucket, fill
# `rounds_time_geomean_df` with the shifted geometric mean of the solve times
# (first metric) and the number of wins of each solver over the other (second metric).
for seed_ind in range(NUM_SEEDS):
    # Rows are visited in the same (class, bucket, metric) order for every seed
    row_ind = 0
    for class_ind in range(rounds_num_bb_classes):
        print("\nClass: {}".format(rounds_bb_classes[class_ind]))
        curr_df = rounds_timing_df.loc[rounds_bb_classes_lists[class_ind]]
        seed_cols = [ new_rounds_timing_cols[solver_ind][seed_ind] for solver_ind in range(len(solver_stubs)) ]

        # Explicit copy of this seed's columns: the timeout penalty below is written
        # into a private frame, which avoids pandas' SettingWithCopyWarning
        seed_class_df = curr_df[seed_cols].copy()

        # Penalize timeouts: any time >= MAX_TIME - EPS is replaced by TIMEOUT_TIME_FACTOR * MAX_TIME
        for col_ind in range(len(seed_cols)):
            time_col = seed_cols[col_ind]
            seed_class_df.loc[seed_class_df[time_col] >= MAX_TIME - EPS, time_col] = TIMEOUT_TIME_FACTOR * MAX_TIME

        # Time of the faster solver (with this seed); it decides bucket membership
        seed_best_time = seed_class_df[seed_cols].min(axis=1)

        for bucket_ind in range(num_buckets):
            print("Bucket: [{:d},{:d})".format(bucket_min[bucket_ind],bucket_max[bucket_ind]))

            # Keep the instances whose fastest time (with this seed) lies in
            # [bucket_min, bucket_max - EPS). Each bucket is filtered from the full
            # class frame rather than from the previous bucket's subset, so the
            # result does not depend on the buckets being nested or ordered.
            seed_curr_df = seed_class_df[
                (seed_best_time >= bucket_min[bucket_ind])
                & (seed_best_time < bucket_max[bucket_ind] - EPS)
            ]

            # Calculate geomean (shifted by SHIFT_TIME)
            metric_ind = 0
            rounds_time_geomean_df.loc[
                (rounds_bb_classes[class_ind], bb_buckets[bucket_ind], bb_metrics[metric_ind]),
                seed_cols
            ] = [
                    geometric_mean(seed_curr_df[col] + SHIFT_TIME) - SHIFT_TIME
                    for col in seed_cols
                ]

            # Calculate wins: a solver wins an instance when the other solver's
            # time exceeds WIN_BY_TIME_FACTOR times its own
            metric_ind = 1
            for solver_ind in range(len(solver_stubs)):
                other_solver_ind = 1 - solver_ind
                rounds_time_geomean_df.loc[
                    (rounds_bb_classes[class_ind], bb_buckets[bucket_ind], bb_metrics[metric_ind]),
                    seed_cols[solver_ind]
                ] = sum(
                    seed_curr_df[seed_cols[other_solver_ind]]
                    >
                    WIN_BY_TIME_FACTOR * seed_curr_df[seed_cols[solver_ind]]
                )

            # Record the bucket size for this seed on every metric row of the bucket
            print("row {:d}: {:d}".format(row_ind,len(seed_curr_df)))
            rounds_num_inst_by_seed_bucket_class[seed_ind][row_ind:row_ind+len(bb_metrics)] = [len(seed_curr_df)] * len(bb_metrics)

            row_ind += len(bb_metrics)


# Statistics on the aggregate (non per-seed) time columns: for every instance class
# and time bucket, fill `rounds_time_geomean_df` with the shifted geometric mean of
# the columns in `cols`, the number of wins on `ref_time_col` / `refv_time_col`, and
# the mean of the per-instance seed-win counts.
row_ind = 0
only_time_cols = [ ref_time_col, refv_time_col ]
cols = time_cols_long
wins_cols = new_rounds_wins_cols
for class_ind in range(rounds_num_bb_classes):
    print("\nClass: {}".format(rounds_bb_classes[class_ind]))

    # Private copy of this class's rows, so the timeout penalty stays local to this loop
    class_time_df = rounds_timing_df.loc[rounds_bb_classes_lists[class_ind]].copy()

    # Penalize timeouts: any time >= MAX_TIME - EPS is replaced by TIMEOUT_TIME_FACTOR * MAX_TIME
    for col_ind in range(len(only_time_cols)):
        time_col = only_time_cols[col_ind]
        class_time_df.loc[class_time_df[time_col] >= MAX_TIME - EPS, time_col] = TIMEOUT_TIME_FACTOR * MAX_TIME

    # Time of the faster solver; it decides bucket membership
    class_best_time = class_time_df[only_time_cols].min(axis=1)

    for bucket_ind in range(num_buckets):
        print("Bucket: [{:d},{:d})".format(bucket_min[bucket_ind],bucket_max[bucket_ind]))

        # Keep the instances whose fastest time lies in [bucket_min, bucket_max - EPS).
        # Each bucket is filtered from the full class frame rather than from the
        # previous bucket's subset, so the result does not depend on the buckets
        # being nested or ordered.
        curr_df = class_time_df[
            (class_best_time >= bucket_min[bucket_ind])
            & (class_best_time < bucket_max[bucket_ind] - EPS)
        ]

        # Calculate geomean (shifted by SHIFT_TIME)
        metric_ind = 0
        rounds_time_geomean_df.loc[
            (rounds_bb_classes[class_ind], bb_buckets[bucket_ind], bb_metrics[metric_ind]),
            cols
        ] = [
                geometric_mean(curr_df[col] + SHIFT_TIME) - SHIFT_TIME
                for col in cols
            ]

        # Calculate wins
        metric_ind = 1
        for solver_ind in range(len(solver_stubs)):
            other_solver_ind = 1 - solver_ind
            rounds_time_geomean_df.loc[
                (rounds_bb_classes[class_ind], bb_buckets[bucket_ind], bb_metrics[metric_ind]),
                only_time_cols[solver_ind]
            ] = sum(
                curr_df[only_time_cols[other_solver_ind]]
                >
                WIN_BY_TIME_FACTOR * curr_df[only_time_cols[solver_ind]]
            )

            # Also report the average number of seeds for which REF and REF+V win
            # (written to every metric row of the bucket). `.mean()` yields NaN for
            # an empty bucket instead of raising ZeroDivisionError.
            rounds_time_geomean_df.loc[
                (rounds_bb_classes[class_ind], bb_buckets[bucket_ind], bb_metrics[0:len(bb_metrics)]),
                wins_cols[solver_ind]
            ] = curr_df[wins_cols[solver_ind]].mean()

        # Record the bucket size on every metric row of the bucket
        print("row {:d}: {:d}".format(row_ind,len(curr_df)))
        rounds_num_inst_avg[row_ind:row_ind+len(bb_metrics)] = [len(curr_df)] * len(bb_metrics)

        row_ind += len(bb_metrics)

# Add one "# inst (k)" column per seed holding that seed's bucket sizes.
# The insert positions are spaced len(solver_stubs) + 1 apart, starting at 2,
# to account for the count columns already inserted for earlier seeds.
for seed_ind in range(NUM_SEEDS):
    rounds_time_geomean_df.insert(
        loc = 2 + seed_ind * (len(solver_stubs) + 1),
        column = f"# inst ({seed_ind + 1:d})",
        value = rounds_num_inst_by_seed_bucket_class[seed_ind],
    )

# Add the bucket sizes for the aggregate columns at the next position in the same pattern
rounds_time_geomean_df.insert(
    loc = 2 + NUM_SEEDS * (len(solver_stubs) + 1),
    column = "# inst (avg)",
    value = rounds_num_inst_avg,
)

rounds_time_geomean_df
    

## rounds nodes summary

### rounds_nodes

In [None]:
inst_set = selected_rounds_time_instances

# Columns taken from `df_rounds`: one "ALL <solver> <nodes>" column per solver
# (semicolon-separated per-seed values), followed by the aggregate node columns
# and the number of VPCs
rounds_nodes_cols = \
    ['ALL' + ' ' + solver_type + ' ' + nodes_stub for solver_type in solver_stubs] \
    + \
    [ref_nodes_col, refv_nodes_col] \
    + \
    [col_num_vpc]
# new_rounds_nodes_cols[solver][seed] is the name of the per-seed node column
new_rounds_nodes_cols = [
        [
            solver_type + ' ' + '(%d)' % seed + ' ' + nodes_stub
            for seed in range(1,NUM_SEEDS+1)
        ]
        for solver_type in solver_stubs
    ]
new_rounds_wins_cols = [
    "Wins ({} over {})".format(solver_stubs[0], solver_stubs[1]),
    "Wins ({} over {})".format(solver_stubs[1], solver_stubs[0])
]
rounds_nodes_df = df_rounds.loc[inst_set, rounds_nodes_cols]

# Replace values of rounds_nodes_df associated to ref with the values from disjset_nodes_df for the same columns
cols = ['ALL' + ' ' + solver_stubs[0] + ' ' + nodes_stub, ref_nodes_col]
rounds_nodes_df.loc[:,cols] = disjset_nodes_df.loc[:,cols]

# Add columns solver_type (%d) + ' ' + nodes_stub for the random seeds.
# These are parsed from the "ALL <solver> <nodes>" columns,
# which contain semicolon-separated values
for seed_ind in range(NUM_SEEDS):
    for solver_ind in range(len(solver_stubs)):
        solver_type = solver_stubs[solver_ind]
        orig_col = rounds_nodes_cols[solver_ind]
        new_col = new_rounds_nodes_cols[solver_ind][seed_ind]
        rounds_nodes_df[new_col] = rounds_nodes_df[orig_col].str.split(';').str[seed_ind]
        rounds_nodes_df[new_col] = rounds_nodes_df[new_col].astype(float)

# Add wins columns: per instance, the number of seeds on which the other solver's
# node count exceeds WIN_BY_NODES_FACTOR times this solver's. The node factor is
# used here (not the time factor) so that these counts agree with the node
# comparisons made when building `rounds_nodes_geomean_df`.
for solver_ind in range(len(solver_stubs)):
    other_solver_ind = 1 - solver_ind

    rounds_nodes_df[new_rounds_wins_cols[solver_ind]] = sum(
        rounds_nodes_df[new_rounds_nodes_cols[other_solver_ind][seed_ind]]
        >
        WIN_BY_NODES_FACTOR * rounds_nodes_df[new_rounds_nodes_cols[solver_ind][seed_ind]]
        for seed_ind in range(NUM_SEEDS)
    )

display(rounds_nodes_df.head())

In [None]:
# Export `rounds_nodes_df` to CSV; the path is relative, so the file is written to the current working directory
rounds_nodes_df.to_csv("rounds_nodes.csv")

### rounds_nodes_geomean_df

In [None]:
# Set up the (empty) table of per-seed and aggregate node statistics for the rounds experiments.
# Columns: per-seed node columns ordered seed by seed (all solvers of seed 1, then seed 2, ...),
# followed by the aggregate node columns and the wins columns.
# NOTE(review): this is a list wrapping a single list, so pandas receives a list of arrays
# for `columns`; later code indexes it with `[0]` -- keep the nesting unless all uses are updated.
rounds_nodes_geomean_cols = [
    [
        new_rounds_nodes_cols[solver_ind][seed_ind] 
        for seed_ind in range(NUM_SEEDS) 
        for solver_ind in range(len(solver_stubs))
    ]
    + [ref_nodes_col, refv_nodes_col]
    + new_rounds_wins_cols
]
# Rows reuse the (class, bucket, metric) index `rounds_bb_row_names`
rounds_nodes_geomean_df = pd.DataFrame(
    columns = rounds_nodes_geomean_cols,
    index = rounds_bb_row_names,
    dtype = float
)

# Make all columns "object" type to allow for integer values
rounds_nodes_geomean_df.loc[:,(rounds_nodes_geomean_cols[0])] = rounds_nodes_geomean_df.loc[:,(rounds_nodes_geomean_cols[0])].astype(object)

# Prepare num_inst columns
# One integer array per seed, with one entry per row of the table
rounds_num_inst_by_seed_bucket_class = [
    np.zeros(len(rounds_nodes_geomean_df),dtype = np.int64)
    for _ in range(NUM_SEEDS) 
]
# Instance counts for the aggregate (non per-seed) columns, one entry per row
rounds_num_inst_avg = np.zeros(len(rounds_nodes_geomean_df),dtype = np.int64)
# NOTE(review): the two arrays below are only initialized here; no later write to them
# is visible in this part of the notebook
rounds_num_ref_seed_wins = np.zeros(len(rounds_nodes_geomean_df),dtype = np.int64)
rounds_num_refv_seed_wins = np.zeros(len(rounds_nodes_geomean_df),dtype = np.int64)

# Per-seed statistics: for every random seed, instance class and time bucket, fill
# `rounds_nodes_geomean_df` with the shifted geometric mean of the node counts
# (first metric) and the number of wins of each solver over the other (second metric).
# Bucket membership is decided by solve TIME, taken from `rounds_timing_df`.
for seed_ind in range(NUM_SEEDS):
    # Rows are visited in the same (class, bucket, metric) order for every seed
    row_ind = 0
    for class_ind in range(rounds_num_bb_classes):
        print("\nClass: {}".format(rounds_bb_classes[class_ind]))
        curr_df = rounds_nodes_df.loc[rounds_bb_classes_lists[class_ind]]
        seed_cols = [ new_rounds_nodes_cols[solver_ind][seed_ind] for solver_ind in range(len(solver_stubs)) ]

        # Explicit copy of this seed's node columns: the timeout penalty below is
        # written into a private frame, which avoids pandas' SettingWithCopyWarning
        seed_class_nodes_df = curr_df[seed_cols].copy()

        # Matching per-seed time columns for the same instances (read only)
        time_curr_df = rounds_timing_df.loc[rounds_bb_classes_lists[class_ind]]
        time_seed_cols = [ new_rounds_timing_cols[solver_ind][seed_ind] for solver_ind in range(len(solver_stubs)) ]
        seed_class_time_df = time_curr_df[time_seed_cols]

        # For every instance whose time is >= MAX_TIME - EPS, multiply nodes processed by TIMEOUT_NODE_FACTOR
        for col_ind in range(len(time_seed_cols)):
            time_col = time_seed_cols[col_ind]
            node_col = seed_cols[col_ind]
            curr_selected_indices = seed_class_time_df[time_col] >= MAX_TIME - EPS
            seed_class_nodes_df.loc[curr_selected_indices, node_col] = TIMEOUT_NODE_FACTOR * seed_class_nodes_df[node_col]

        # Time of the faster solver (with this seed); it decides bucket membership
        seed_best_time = seed_class_time_df[time_seed_cols].min(axis=1)

        for bucket_ind in range(num_buckets):
            print("Bucket: [{:d},{:d})".format(bucket_min[bucket_ind],bucket_max[bucket_ind]))

            # Keep the instances whose fastest time (with this seed) lies in
            # [bucket_min, bucket_max - EPS). Each bucket is filtered from the full
            # class frames rather than from the previous bucket's subset, so the
            # result does not depend on the buckets being nested or ordered.
            curr_selected_indices = \
                (seed_best_time >= bucket_min[bucket_ind]) \
                & (seed_best_time < bucket_max[bucket_ind] - EPS)
            seed_curr_df = seed_class_nodes_df[curr_selected_indices]
            seed_time_curr_df = seed_class_time_df[curr_selected_indices]

            # Calculate geomean (shifted by SHIFT_NODES)
            metric_ind = 0
            rounds_nodes_geomean_df.loc[
                (rounds_bb_classes[class_ind], bb_buckets[bucket_ind], bb_metrics[metric_ind]),
                seed_cols
            ] = [
                    geometric_mean(seed_curr_df[col] + SHIFT_NODES) - SHIFT_NODES
                    for col in seed_cols
                ]

            # Calculate wins: a solver wins an instance when the other solver's
            # node count exceeds WIN_BY_NODES_FACTOR times its own
            metric_ind = 1
            for solver_ind in range(len(solver_stubs)):
                other_solver_ind = 1 - solver_ind
                rounds_nodes_geomean_df.loc[
                    (rounds_bb_classes[class_ind], bb_buckets[bucket_ind], bb_metrics[metric_ind]),
                    seed_cols[solver_ind]
                ] = sum(
                    seed_curr_df[seed_cols[other_solver_ind]]
                    >
                    WIN_BY_NODES_FACTOR * seed_curr_df[seed_cols[solver_ind]]
                )

            # Record the bucket size for this seed on every metric row of the bucket
            print("row {:d}: {:d}".format(row_ind,len(seed_curr_df)))
            rounds_num_inst_by_seed_bucket_class[seed_ind][row_ind:row_ind+len(bb_metrics)] = [len(seed_curr_df)] * len(bb_metrics)

            row_ind += len(bb_metrics)

# Statistics on the aggregate (non per-seed) node columns: for every instance class
# and time bucket, fill `rounds_nodes_geomean_df` with the shifted geometric mean of
# the node counts, the number of wins of each solver over the other, and the mean of
# the per-instance seed-win counts. Bucket membership is decided by solve TIME.
row_ind = 0
nodes_cols = [ ref_nodes_col, refv_nodes_col ]
time_cols = [ ref_time_col, refv_time_col ]
wins_cols = new_rounds_wins_cols
for class_ind in range(rounds_num_bb_classes):
    print("\nClass: {}".format(rounds_bb_classes[class_ind]))

    # Private copy of this class's node rows, so the timeout penalty stays local to this loop
    class_nodes_df = rounds_nodes_df.loc[rounds_bb_classes_lists[class_ind]].copy()
    # Matching time rows for the same instances (read only)
    class_time_df = rounds_timing_df.loc[rounds_bb_classes_lists[class_ind]]

    # For every instance whose time is >= MAX_TIME - EPS, multiply nodes processed by TIMEOUT_NODE_FACTOR
    for col_ind in range(len(time_cols)):
        time_col = time_cols[col_ind]
        node_col = nodes_cols[col_ind]
        curr_selected_indices = class_time_df[time_col] >= MAX_TIME - EPS
        class_nodes_df.loc[curr_selected_indices, node_col] = TIMEOUT_NODE_FACTOR * class_nodes_df[node_col]

    # Time of the faster solver; it decides bucket membership
    class_best_time = class_time_df[time_cols].min(axis=1)

    for bucket_ind in range(num_buckets):
        print("Bucket: [{:d},{:d})".format(bucket_min[bucket_ind],bucket_max[bucket_ind]))

        # Keep the instances whose fastest time lies in [bucket_min, bucket_max - EPS).
        # Each bucket is filtered from the full class frames rather than from the
        # previous bucket's subset, so the result does not depend on the buckets
        # being nested or ordered.
        curr_selected_indices = \
            (class_best_time >= bucket_min[bucket_ind]) \
            & (class_best_time < bucket_max[bucket_ind] - EPS)
        curr_df = class_nodes_df[curr_selected_indices]
        time_curr_df = class_time_df[curr_selected_indices]

        # Calculate geomean (shifted by SHIFT_NODES)
        metric_ind = 0
        rounds_nodes_geomean_df.loc[
            (rounds_bb_classes[class_ind], bb_buckets[bucket_ind], bb_metrics[metric_ind]),
            nodes_cols
        ] = [
                geometric_mean(curr_df[col] + SHIFT_NODES) - SHIFT_NODES
                for col in nodes_cols
            ]

        # Calculate wins
        metric_ind = 1
        for solver_ind in range(len(solver_stubs)):
            other_solver_ind = 1 - solver_ind
            rounds_nodes_geomean_df.loc[
                (rounds_bb_classes[class_ind], bb_buckets[bucket_ind], bb_metrics[metric_ind]),
                nodes_cols[solver_ind]
            ] = sum(
                curr_df[nodes_cols[other_solver_ind]]
                >
                WIN_BY_NODES_FACTOR * curr_df[nodes_cols[solver_ind]]
            )

            # Also report the average number of seeds for which REF and REF+V win
            # (written to every metric row of the bucket). `.mean()` yields NaN for
            # an empty bucket instead of raising ZeroDivisionError.
            rounds_nodes_geomean_df.loc[
                (rounds_bb_classes[class_ind], bb_buckets[bucket_ind], bb_metrics[0:len(bb_metrics)]),
                wins_cols[solver_ind]
            ] = curr_df[wins_cols[solver_ind]].mean()

        # Record the bucket size on every metric row of the bucket
        print("row {:d}: {:d}".format(row_ind,len(curr_df)))
        rounds_num_inst_avg[row_ind:row_ind+len(bb_metrics)] = [len(curr_df)] * len(bb_metrics)

        row_ind += len(bb_metrics)

# Add one "# inst (k)" column per seed holding that seed's bucket sizes.
# The insert positions are spaced len(solver_stubs) + 1 apart, starting at 2,
# to account for the count columns already inserted for earlier seeds.
for seed_ind in range(NUM_SEEDS):
    rounds_nodes_geomean_df.insert(
        loc = 2 + seed_ind * (len(solver_stubs) + 1),
        column = f"# inst ({seed_ind + 1:d})",
        value = rounds_num_inst_by_seed_bucket_class[seed_ind],
    )

# Add the bucket sizes for the aggregate columns at the next position in the same pattern
rounds_nodes_geomean_df.insert(
    loc = 2 + NUM_SEEDS * (len(solver_stubs) + 1),
    column = "# inst (avg)",
    value = rounds_num_inst_avg,
)

rounds_nodes_geomean_df


## Table 3: `rounds_avg_bb_df` for rounds instances

In [None]:
## Set up empty `rounds_avg_bb_df`
# Table 3 for the rounds instances: time and node statistics per (class, bucket, metric),
# copied from `rounds_time_geomean_df` and `rounds_nodes_geomean_df`.

# The rounds table uses the same short time / node column names as defined elsewhere in the notebook
rounds_time_cols_short = time_cols_short
rounds_node_cols_short = node_cols_short

# Two-level column index: the criterion header on top, the short column name below
rounds_avg_bb_cols = pd.MultiIndex.from_arrays(
    [
      [time_col_header]*len(rounds_time_cols_short) + [node_col_header]*len(rounds_node_cols_short),
      rounds_time_cols_short + rounds_node_cols_short
    ],
    names = ['criterion', 'type'])

#bb_row_names = pd.MultiIndex.from_product([bb_buckets, bb_row_names], names=['bucket', 'metric'])
# Row index: every combination of class, bucket and metric
rounds_bb_row_names = pd.MultiIndex.from_product(
    [rounds_bb_classes, bb_buckets, bb_metrics],
    names=['class', 'bucket', 'metric'])

rounds_avg_bb_df = pd.DataFrame(
    columns = rounds_avg_bb_cols,
    index = rounds_bb_row_names,
    dtype = float
)

# Make all columns "object" type to allow for integer values
rounds_avg_bb_df.loc[:,(time_col_header,rounds_time_cols_short)] = rounds_avg_bb_df.loc[:,(time_col_header,rounds_time_cols_short)].astype(object)
rounds_avg_bb_df.loc[:,(node_col_header,rounds_node_cols_short)] = rounds_avg_bb_df.loc[:,(node_col_header,rounds_node_cols_short)].astype(object)

# NOTE(review): `num_inst`, `row_ind` and `cols` are assigned but not read again in this cell
num_inst = np.zeros(len(rounds_avg_bb_df),dtype = np.int64)
row_ind = 0
cols = rounds_time_cols_short + rounds_node_cols_short

# Fill in values from rounds_time_geomean_df and rounds_nodes_geomean_df (from the ref and refv cols)
# Fill in values from rounds_time_geomean_df
for index, row in rounds_time_geomean_df.iterrows():
    # Each row label is a (class, bucket, metric) tuple
    class_val, bucket_val, metric_val = index
    for col in rounds_time_cols_short:
        # `map_short_to_cols_time` translates the short name into the source column name
        rounds_avg_bb_df.loc[(class_val, bucket_val, metric_val), (time_col_header, col)] = \
            row[map_short_to_cols_time[col]]

# Fill in values from rounds_nodes_geomean_df
for index, row in rounds_nodes_geomean_df.iterrows():
    class_val, bucket_val, metric_val = index
    for col in rounds_node_cols_short:
        # `map_short_to_cols_nodes` translates the short name into the source column name
        rounds_avg_bb_df.loc[(class_val, bucket_val, metric_val), (node_col_header, col)] = \
            row[map_short_to_cols_nodes[col]]
        
# Fill in num inst
# NOTE(review): `rounds_num_inst_avg` holds whatever the most recently executed geomean
# cell stored in it -- confirm the intended cell was run last
rounds_avg_bb_df[inst_col_name] = rounds_num_inst_avg

# Show the node columns for the first six rows
display(rounds_avg_bb_df.loc[:,rounds_avg_bb_df.columns.get_level_values(0)==node_col_header].head(6))
#display(avg_bb_df.loc[(bb_classes[0], bb_buckets[1], bb_metrics[0]),:])
# Show only the rows of the first metric, for every class and bucket
display(rounds_avg_bb_df.loc[(rounds_bb_classes, bb_buckets, bb_metrics[0]),:])
display(rounds_avg_bb_df)


## Create `avg_bb_df` which will have values from disjset, rounds, and avg_bb_df_by_depth

In [None]:
# Build `avg_bb_df` by stacking, in this order: the rows of the first disjset class,
# the rows of the first rounds class, and the by-depth table without its
# ('Time (s)', 'Total') column
avg_bb_parts = [
    disjset_avg_bb_df.loc[(disjset_bb_classes[0], bb_buckets, bb_metrics),:],
    rounds_avg_bb_df.loc[(rounds_bb_classes[0], bb_buckets, bb_metrics),:],
    avg_bb_by_depth_df.loc[:,avg_bb_by_depth_df.columns != ('Time (s)', 'Total')],
]

# Entries missing after the concatenation are filled with 0
avg_bb_df = pd.concat(avg_bb_parts, axis=0).fillna(0)

# Show the first 20 rows of the result
avg_bb_df.head(20)


# Section 6: Objective and time analysis

## `obj_and_time_df`: objectives, successes, fails, and time per obj or cut

In [None]:
# Build `obj_and_time_df`: per instance, at the disjunction size stored in
# best_gap_df['BEST VPC DISJ'], the number of objectives, VPCs and failures,
# plus the derived fail rate, generation time, and time per objective / per cut.
inst_set = best_gap_df.index
# inst_set = ['10teams_presolved', 'bm23_presolved', 'vpm1_presolved']

# Define rows to add
# One (instance, disjunction size) pair per instance, used as row labels into `df`
inst_depth_set = [(inst, best_gap_df.loc[inst, 'BEST VPC DISJ']) for inst in inst_set]

# Define columns to add
fail_rate_col_name = 'Fail rate (%)'
time_col_name = 'Time (s)'
sec_per_obj_col_name = '(s) / obj'
sec_per_cut_col_name = '(s) / cut'
obj_and_time_new_cols = [
    fail_rate_col_name,
    time_col_name,
    sec_per_obj_col_name,
    sec_per_cut_col_name,
]

# Deep copy so that the columns added below do not touch `df`
obj_and_time_df = df.loc[inst_depth_set,[col_num_obj, col_num_vpc, col_num_fails]].copy(deep=True)
obj_and_time_df[fail_rate_col_name] = 100. * obj_and_time_df[col_num_fails] / obj_and_time_df[col_num_obj]
# Assigning the full `df` column relies on pandas aligning it to the selected rows by index
obj_and_time_df[time_col_name] = df[col_vpc_gen_time]
obj_and_time_df[sec_per_obj_col_name] = obj_and_time_df[time_col_name] / obj_and_time_df[col_num_obj]
obj_and_time_df[sec_per_cut_col_name] = obj_and_time_df[time_col_name] / obj_and_time_df[col_num_vpc]

# Replace Fail rate = NaN when all cuts are one-sided cuts
# Both NaN and +inf results of the divisions above are replaced by SKIP_CHAR
SKIP_CHAR = '-'
obj_and_time_df.fillna(SKIP_CHAR, inplace = True)
obj_and_time_df.replace(np.inf, SKIP_CHAR, inplace = True)

# Add average row
# Each average is taken over the entries that were not replaced by SKIP_CHAR
# obj_and_time_df.loc['Average'] = 0
obj_and_time_df.loc['Average', obj_and_time_new_cols] =\
    [obj_and_time_df[obj_and_time_df[col] != SKIP_CHAR][col].mean() for col in obj_and_time_new_cols]
# for col in obj_and_time_new_cols:
#     obj_and_time_df.at[('Average',0),col] =\
#         obj_and_time_df[obj_and_time_df[col] != SKIP_CHAR][col].mean()

# The count columns are left blank in the average row
obj_and_time_df.loc['Average',[col_num_obj, col_num_vpc, col_num_fails]] = ""

display(obj_and_time_df)
# obj_and_time_df[obj_and_time_df['NUM CUTS'] == 0]
# obj_and_time_df[obj_and_time_df['(s) / obj'] > 100000]
# obj_and_time_df.loc['neos18_presolved']
# obj_and_time_df[obj_and_time_df['(s) / obj'] != SKIP_CHAR]['(s) / obj'].max()

In [None]:
# Sanity check on one column of `obj_and_time_df`: report every entry that cannot
# be converted to a float, then print the mean and the maximum of the entries that can.
selected_col = obj_and_time_new_cols[0]
print(selected_col)

curr_col = obj_and_time_df[selected_col]

meanvals = 0.0  # running sum of the convertible entries (divided by `numvals` when printed)
maxval = 0.0    # largest convertible entry seen so far; starts at 0.0
numvals = 0     # number of convertible entries
for val in curr_col:
    # Only the conversion is guarded, and only against the errors a failed
    # conversion raises, so unrelated problems are not silently swallowed
    try:
        floatval = np.float64(val)
    except (TypeError, ValueError):
        print("val {} not convertible".format(val))
        continue
    meanvals += floatval
    numvals += 1
    if floatval > maxval:
        maxval = floatval

# Print NaN rather than raising ZeroDivisionError when no entry was convertible
print(meanvals / numvals if numvals > 0 else float('nan'))
print(maxval)
  

In [None]:
### DEBUG finding max difference in time between TOTAL_TIME and sum of individual times
# cuts_cols = [col for col in df.columns if col.startswith('NUM CUTS')]
# time_cols = [
#     'INIT_SOLVE_TIME',
#     'VPC_GEN_TIME',
#     'VPC_APPLY_TIME',
#     'BB_TIME',
#     'TOTAL_TIME'
# ]
# display(df.loc['bell3b_presolved',['NUM OBJ', 'NUM FAILS'] + cuts_cols])

# obj_and_time_df = df.loc[inst_depth_set].copy(deep = True)['NUM OBJ', 'NUM CUTS', 'NUM FAILS', 'VPC_GEN_TIME']
# display(obj_and_time_df)

# max_diff_time = 0.
# max_diff_inst = ''
# for inst in best_gap_df.index:
#     depth = best_gap_df.loc[inst, 'BEST VPC DISJ']
#     curr_num_obj   = df.loc[(inst,depth)]['NUM OBJ']
#     curr_num_vpc   = df.loc[(inst,depth)]['NUM VPC']
#     curr_num_1side = df.loc[(inst,depth)]['NUM CUTS ONE_SIDED']
#     curr_num_fails = df.loc[(inst,depth)]['NUM FAILS']
#     if curr_num_vpc + curr_num_fails != curr_num_obj + curr_num_1side:
#         raise ValueError("{}: curr_num_vpc ({:d}) + curr_num_fails ({:d}) != curr_num_obj ({:d}) + curr_num_1side ({:d})".format(inst, curr_num_vpc, curr_num_fails, curr_num_obj, curr_num_1side))
    
#     curr_fail_pct = 100. * curr_num_fails / curr_num_obj
#     curr_init_solve = df.loc[(inst,depth)]['INIT_SOLVE_TIME']
#     curr_vpc_gen = df.loc[(inst,depth)]['VPC_GEN_TIME']
#     curr_vpc_apply = df.loc[(inst,depth)]['VPC_APPLY_TIME']
#     curr_bb_time = df.loc[(inst,depth)]['BB_TIME']
#     curr_total_time = df.loc[(inst,depth)]['TOTAL_TIME']

#     curr_diff_time = curr_total_time - (curr_init_solve + curr_vpc_gen + curr_vpc_apply + curr_bb_time)
#     if curr_diff_time < -EPS:
#         display(df.loc[inst,time_cols])
#         raise ValueError("{} (depth {:d}): curr_diff_time {} < 0.".format(inst,depth,curr_diff_time))
    
#     if max_diff_time < curr_diff_time:
#         max_diff_inst = inst
#         max_diff_time = curr_diff_time

# print("Max diff time = {} for inst {}".format(max_diff_time,max_diff_inst))
# display(df.loc[(max_diff_inst,best_gap_df.loc[max_diff_inst, 'BEST VPC DISJ']),time_cols])


## `best_disj_gap_df`: Number of times a particular depth achieves best result and beats baseline by at least EPS

In [None]:
# Each key is compared against the baseline column it maps to
long_cols_to_compare = {
    col_vpc:                      col_gmic,
    col_vpc_gmic:                 col_gmic,
    col_first_cut_pass_gap_ref_v: col_first_cut_pass_gap_ref,
    col_last_cut_pass_gap_ref_v:  col_last_cut_pass_gap_ref,
}
short_cols_to_compare = [map_cols_to_short_gap[col] for col in long_cols_to_compare]
row_no_improvement = 'No improvement'

# One row for "no improvement", one for depth 0, and one per disjunction size
best_disj_gap_df = pd.DataFrame(
    index = [row_no_improvement] + [0] + sizes,
    columns = short_cols_to_compare,
    dtype = int,
)

for curr_depth in [0] + sizes:
    curr_depth_df = selected_gap_df.xs(curr_depth,level='disj_terms')
    for col, refcol in long_cols_to_compare.items():
        shortcol = map_cols_to_short_gap[col]
        short_baseline_col = map_cols_to_short_gap[refcol]

        # Count the instances for which this depth attains the best value and
        # that best value beats the baseline by more than GAP_DIFF_EPS
        attains_best_mask = curr_depth_df[col] == best_gap_df[shortcol]
        beats_baseline_mask = best_gap_df[shortcol] > best_gap_df[short_baseline_col] + GAP_DIFF_EPS
        curr_num_wins = sum(attains_best_mask & beats_baseline_mask)
        best_disj_gap_df.at[curr_depth,shortcol] = curr_num_wins

# "No improvement" row: instances whose best value does not beat the baseline by more than GAP_DIFF_EPS
curr_depth = row_no_improvement
for col, refcol in long_cols_to_compare.items():
    shortcol = map_cols_to_short_gap[col]
    short_baseline_col = map_cols_to_short_gap[refcol]
    no_improvement_mask = best_gap_df[shortcol] <= best_gap_df[short_baseline_col] + GAP_DIFF_EPS
    curr_num_wins = sum(no_improvement_mask)
    best_disj_gap_df.at[curr_depth,shortcol] = curr_num_wins

# Relabel the rows: the depth-0 row becomes 'Best' and each size becomes "<size> leaves"
idx = [row_no_improvement] + ['Best'] + [str(size) + " leaves" for size in sizes]
reidx = dict(zip(best_disj_gap_df.index, idx))
best_disj_gap_df.rename(index=reidx, inplace=True)

# Drop the 'Best' row (it is good to verify this is the same as the relevant entries in win_df or Table 1)
best_disj_gap_df.drop(index='Best', inplace=True)

# Make sure all cols are int
best_disj_gap_df = best_disj_gap_df.astype(int)

best_disj_gap_df

## `best_disj_time_df`: Number of times depth is best and improvement is at least 10%

In [None]:
row_no_improvement = 'No improvement'

# Only the actual disjunction sizes are tallied (no depth-0 row)
sizes_to_check = sizes

# One row for "no improvement" and one per disjunction size; one column per instance class
best_disj_time_df = pd.DataFrame(
    index = [row_no_improvement] + sizes_to_check,
    columns = bb_classes,
    dtype = int,
)

# Keep the first two time columns plus the number of VPCs, and only the rows
# whose second index level is positive
cols_time_by_depth = time_cols_long[:2]
curr_df = df.loc[selected_time_instances, cols_time_by_depth + [col_num_vpc]]
curr_df = curr_df[curr_df.index.get_level_values(1) > 0]

# Per instance (index level 0), the minimum of every column over the remaining rows
best_time_df = curr_df.groupby(level=0).min()
display(best_time_df)

# Shifted geometric mean of the per-instance best times
for col in cols_time_by_depth:
    best_time_geomean = geometric_mean(best_time_df[col] + SHIFT_TIME) - SHIFT_TIME
    print(f"{col}: {best_time_geomean}")

inst_sets = [selected_time_instances, all6_instances, binary_x_time_instances]

for curr_size in sizes_to_check:
    curr_by_depth_df = curr_df.xs(curr_size,level='disj_terms')

    # inst_sets[k] supplies the instances counted in column bb_classes[k]
    for bb_class_ind, inst_set in enumerate(inst_sets):
        curr_class_df = curr_by_depth_df.loc[inst_set]
        curr_best_df = best_time_df.loc[inst_set]

        # A size scores when its time is within 5% of the instance's best time and
        # the best time beats the reference time by the WIN_BY_TIME_FACTOR margin
        near_best_mask = curr_class_df[refv_time_col] < 1.05 * curr_best_df[refv_time_col]
        best_beats_ref_mask = curr_best_df[ref_time_col] > WIN_BY_TIME_FACTOR * curr_best_df[refv_time_col]
        curr_num_wins = sum(near_best_mask & best_beats_ref_mask)
        best_disj_time_df.at[curr_size,bb_classes[bb_class_ind]] = curr_num_wins

# "No improvement" row: instances whose best time does not beat the reference by the required margin
curr_size = row_no_improvement
for bb_class_ind, inst_set in enumerate(inst_sets):
    curr_best_df = best_time_df.loc[inst_set]
    no_improvement_mask = curr_best_df[ref_time_col] <= WIN_BY_TIME_FACTOR * curr_best_df[refv_time_col]
    curr_num_wins = sum(no_improvement_mask)
    best_disj_time_df.at[curr_size,bb_classes[bb_class_ind]] = curr_num_wins

# Convert to int
best_disj_time_df = best_disj_time_df.astype(int)

# Relabel the size rows as "<size> leaves"
idx = [row_no_improvement] + [str(size) + " leaves" for size in sizes]
reidx = dict(zip(best_disj_time_df.index, idx))
best_disj_time_df.rename(index=reidx, inplace=True)

best_disj_time_df

## `best_disj_nodes_df`: Number of times depth is best for nodes

In [None]:
# cols_nodes_by_depth = nodes_cols[:2] #[ref_time_col, refv_time_col, refv_w_cut_time_col]
# curr_df = df.loc[selected_time_instances, cols_time_by_depth + cols_nodes_by_depth + [col_num_vpc]]
# curr_df = curr_df[curr_df.index.get_level_values(1) > 0]

# # Calculate minimum value for each column, per instance (index level 0)
# curr_argmin_df = curr_df.groupby(level=0).idxmin()
# best_df = curr_df.loc[curr_argmin_df[refv_time_col]]
#cols_nodes_by_depth

In [None]:
# Build `best_disj_nodes_df`: for every partial tree size and every instance
# set, count the instances for which that size attains the smallest number of
# branch-and-bound nodes with VPCs *and* the baseline needs more than
# WIN_BY_NODES_FACTOR times that many nodes.  A final "No improvement" row
# counts the instances where no size beats the baseline by that factor.
row_no_improvement = 'No improvement'

# sizes_to_check = [0] + sizes
sizes_to_check = sizes

best_disj_nodes_df = pd.DataFrame(
    columns = bb_classes,
    index = [row_no_improvement] + sizes_to_check,
    dtype = int,
)

# Node-count columns (first two entries of nodes_cols) plus the number of VPCs,
# restricted to rows whose second index level (disj_terms) is positive.
cols_nodes_by_depth = nodes_cols[:2]
curr_df = df.loc[selected_time_instances, cols_nodes_by_depth + [col_num_vpc]]
curr_df = curr_df[curr_df.index.get_level_values(1) > 0]

# Column-wise minimum per instance (index level 0) across all tree sizes
best_nodes_df = curr_df.groupby(level=0).min()
display(best_nodes_df)

# Report shifted geometric mean of both node columns
for col in cols_nodes_by_depth:
    print("{}: {}".format(col, geometric_mean(best_nodes_df[col] + SHIFT_NODES) - SHIFT_NODES))

# One instance set per column of best_disj_nodes_df (paired with bb_classes in order)
inst_sets = [selected_time_instances, all6_instances, binary_x_time_instances]

for curr_size in sizes_to_check:
    curr_by_depth_df = curr_df.xs(curr_size,level='disj_terms')

    for bb_class, inst_set in zip(bb_classes, inst_sets):
        curr_class_df = curr_by_depth_df.loc[inst_set]
        curr_best_df = best_nodes_df.loc[inst_set]

        # This size "wins" when it matches the per-instance minimum node count
        # (no tolerance is applied for nodes) and that minimum improves on the
        # baseline by the required factor.
        is_best_size = curr_class_df[refv_nodes_col] <= curr_best_df[refv_nodes_col]
        beats_baseline = curr_best_df[ref_nodes_col] > WIN_BY_NODES_FACTOR * curr_best_df[refv_nodes_col]
        best_disj_nodes_df.at[curr_size, bb_class] = int((is_best_size & beats_baseline).sum())

# Add no improvement row: the baseline is within WIN_BY_NODES_FACTOR of the
# best node count with VPCs (complement of `beats_baseline` above).
for bb_class, inst_set in zip(bb_classes, inst_sets):
    curr_best_df = best_nodes_df.loc[inst_set]
    no_improvement = curr_best_df[ref_nodes_col] <= WIN_BY_NODES_FACTOR * curr_best_df[refv_nodes_col]
    best_disj_nodes_df.at[row_no_improvement, bb_class] = int(no_improvement.sum())

# Convert to int
best_disj_nodes_df = best_disj_nodes_df.astype(int)

# Relabel the size rows as "<size> leaves".  The labels are derived from
# sizes_to_check (the list the index was built from) so that the positional
# pairing below stays correct if sizes_to_check is ever changed.
idx = [row_no_improvement] + [str(size) + " leaves" for size in sizes_to_check]
reidx = dict(zip(best_disj_nodes_df.index, idx))
best_disj_nodes_df.rename(reidx, inplace=True)

best_disj_nodes_df

## `density_df`

In [None]:
# Build `density_df`: per partial tree size, statistics on the density of the
# generated VPCs (cut support divided by the number of columns), overall and
# split by whether the run with VPCs wins or loses on time.
# Row labels are raw strings so that "\#" reaches LaTeX unchanged without
# triggering Python's invalid-escape-sequence warning.
rows = [
    r'\# inst w/VPCs and time < 3600s',
    r'\# wins by time',
    'Avg min cut density',
    'Avg max cut density',
    'Avg avg cut density',
    'Avg avg cut density (win by time)',
    'Avg avg cut density (lose by time)',
]

columns = ['V ({:d})'.format(size) for size in sizes]

density_df = pd.DataFrame(
    columns = columns,
    index = rows,
    dtype = float
)

# inst_set = all6_instances_dict.keys()
inst_set = selected_time_instances
support_cols = [col for col in df.columns if "SUPPORT VPC" in col]
selected_cols = [ref_time_col, refv_time_col]+support_cols+[col_num_cols,col_num_vpc,ref_timeout_col,refv_timeout_col]
selected_cols = list(set(selected_cols)) # remove duplicates
curr_df = df.loc[inst_set,selected_cols]

# Take only indices that are > 0
curr_df = curr_df[curr_df.index.get_level_values(1) > 0]

for curr_size_ind, curr_size in enumerate(sizes):
    # Select only this tree size
    curr_by_depth_df = curr_df.xs(curr_size, level='disj_terms')

    # Keep an instance only if the smaller of its two timeout columns is below MAX_TIME
    INSTANCES_TO_KEEP = curr_by_depth_df[[ref_timeout_col, refv_timeout_col]].min(axis=1) < MAX_TIME
    curr_by_depth_df = curr_by_depth_df[INSTANCES_TO_KEEP]

    # Row 0: number of (kept) instances having cuts
    density_df.iloc[0,curr_size_ind] = sum(curr_by_depth_df[col_num_vpc] > 0)

    # Rows 2, 3, ...: mean density for each support column, in df column order.
    # NOTE(review): the row labels assume support_cols is ordered min, max, avg -- confirm.
    for curr_row_ind, support_col in enumerate(support_cols, start=2):
        curr_series = curr_by_depth_df[support_col] / curr_by_depth_df[col_num_cols]
        density_df.iloc[curr_row_ind,curr_size_ind] = curr_series.mean()

    ## Count wins (should be same as in avg_bb_by_depth_df)
    # A win by time: the baseline time exceeds WIN_BY_TIME_FACTOR times the
    # time with VPCs (the factor absorbs some variability in runtimes).
    curr_wins_df = curr_by_depth_df[curr_by_depth_df[ref_time_col] > WIN_BY_TIME_FACTOR * curr_by_depth_df[refv_time_col]]
    density_df.iloc[1,curr_size_ind] = len(curr_wins_df)
    density_df.iloc[len(rows)-2,curr_size_ind] = (curr_wins_df[support_cols[2]] / curr_wins_df[col_num_cols]).mean()

    # A loss by time is the mirrored condition: the time with VPCs exceeds
    # WIN_BY_TIME_FACTOR times the baseline time (ties within the factor are neither).
    curr_lose_df = curr_by_depth_df[WIN_BY_TIME_FACTOR * curr_by_depth_df[ref_time_col] < curr_by_depth_df[refv_time_col]]
    density_df.iloc[len(rows)-1,curr_size_ind] = (curr_lose_df[support_cols[2]] / curr_lose_df[col_num_cols]).mean()

density_df

## `obj_fails_df`

In [None]:
# Build `obj_fails_df`: per partial tree size, how many instances had cut
# generating objectives tried / succeeding / failing, the average share of
# failures attributed to each cause, and objective/time cost per cut.
inst_set = best_gap_df.index
# inst_set = ['10teams_presolved', 'bm23_presolved', 'vpm1_presolved']

# (instance, best disjunction) pairs; not used below in this cell
inst_depth_set = [(inst, best_gap_df.loc[inst, 'BEST VPC DISJ']) for inst in inst_set]

# Row labels are raw strings so that "\#" and "\%" reach LaTeX unchanged
# without triggering Python's invalid-escape-sequence warning.
rows = [
    r'\# inst w/obj',
    r'\# inst w/succ obj',
    r'\# inst no obj',
    r'\# inst all obj fail',
    r'\# inst all obj succ',
    r'\% obj fails',
    r'\% fails dup',
    r'\% fails unbdd',
    r'\% fails tilim',
    r'\% fails dyn',
    r'\% fails all ones',
    r'\% fails post-GMIC obj',
    r'\% fails DB',
    r'\# obj / cut',
    '(s) / obj',
    '(s) / cut',
]

columns = ['V ({:d})'.format(size) for size in sizes]

obj_fails_df = pd.DataFrame(
    columns = columns,
    index = rows,
    dtype = float
)

# Choose columns to pull
selected_cols = [
    col_num_obj,
    col_num_vpc,
    'NUM FAILS DUMMY_OBJ',
    'NUM FAILS ALL_ONES',
    'NUM FAILS CUT_VERTICES',
    'NUM FAILS ITER_BILINEAR',
    'NUM FAILS UNIT_VECTORS',
    'NUM FAILS DISJ_LB',
    'NUM FAILS TIGHT_POINTS',
    'NUM FAILS TIGHT_RAYS',
    'NUM FAILS TIGHT_POINTS2',
    'NUM FAILS TIGHT_RAYS2',
    'NUM FAILS USER',
    'NUM FAILS OBJ_CUT',
    'NUM FAILS ONE_SIDED',
    'NUM FAILS',
    'ABANDONED',
    'BAD_DYNAMISM',
    'BAD_SUPPORT',
    'BAD_VIOLATION',
    'CUT_LIMIT',
    'DUAL_INFEASIBLE',
    'DUPLICATE_SIC',
    'DUPLICATE_VPC',
    'ITERATION_LIMIT',
    'ORTHOGONALITY_SIC',
    'ORTHOGONALITY_VPC',
    'PRIMAL_INFEASIBLE',
    'TIME_LIMIT',
    'NUMERICAL_ISSUES_WARNING',
    'DLB_EQUALS_DUB_NO_OBJ',
    'DLB_EQUALS_LPOPT_NO_OBJ',
    'PRIMAL_INFEASIBLE_NO_OBJ',
    'NUMERICAL_ISSUES_NO_OBJ',
    'UNKNOWN',
    col_vpc_gen_time,
]

curr_df = df.loc[inst_set,selected_cols]
curr_df0 = curr_df.xs(0,level='disj_terms')  # not used below in this cell

for curr_size_ind, curr_size in enumerate(sizes):
    # Select only this tree size
    curr_by_depth_df = curr_df.xs(curr_size, level='disj_terms')

    num_obj = curr_by_depth_df[col_num_obj]
    num_vpc = curr_by_depth_df[col_num_vpc]
    has_obj = num_obj > 0

    # Rows 0-4: instance counts, in the order of the first five row labels
    count_masks = [
        has_obj,                         # objectives tried
        has_obj & (num_vpc > 0),         # objectives tried and at least one cut
        num_obj == 0,                    # no objectives tried
        has_obj & (num_vpc == 0),        # objectives tried but no cut produced
        has_obj & (num_obj == num_vpc),  # as many cuts as objectives
    ]
    for curr_row_ind, mask in enumerate(count_masks):
        obj_fails_df.iloc[curr_row_ind,curr_size_ind] = mask.sum()

    # Row 5: mean percent of objectives that failed, over instances with objectives
    inst_w_obj_df = curr_by_depth_df[has_obj]
    obj_fails_df.iloc[5,curr_size_ind] =\
        (100. * inst_w_obj_df[col_num_fails] / inst_w_obj_df[col_num_obj]).mean()

    # Rows 6-12: mean percent of failures attributed to each cause, over
    # instances with at least one failure (order matches the row labels)
    inst_w_fails_df = curr_by_depth_df[curr_by_depth_df[col_num_fails] > 0]
    fail_causes = [
        inst_w_fails_df['DUPLICATE_VPC']+inst_w_fails_df['DUPLICATE_SIC'],  # duplicates
        inst_w_fails_df['DUAL_INFEASIBLE'],                                 # unbdd
        inst_w_fails_df['TIME_LIMIT'],                                      # tilim
        inst_w_fails_df['BAD_DYNAMISM'],                                    # dynamism
        inst_w_fails_df['NUM FAILS ALL_ONES'],                              # all-ones
        inst_w_fails_df['NUM FAILS ITER_BILINEAR'],                         # post-GMIC
        inst_w_fails_df['NUM FAILS DISJ_LB'],                               # disj_lb
    ]
    for curr_row_ind, cause in enumerate(fail_causes, start=6):
        obj_fails_df.iloc[curr_row_ind,curr_size_ind] =\
            (100. * cause / inst_w_fails_df[col_num_fails]).mean()

    # Rows 13-15: objectives per cut, seconds per objective, seconds per cut
    inst_w_cuts_df = curr_by_depth_df[num_vpc > 0]
    obj_fails_df.iloc[13,curr_size_ind] = (inst_w_cuts_df[col_num_obj] / inst_w_cuts_df[col_num_vpc]).mean()
    obj_fails_df.iloc[14,curr_size_ind] = (inst_w_obj_df[col_vpc_gen_time] / inst_w_obj_df[col_num_obj]).mean()
    obj_fails_df.iloc[15,curr_size_ind] = (inst_w_cuts_df[col_vpc_gen_time] / inst_w_cuts_df[col_num_vpc]).mean()

obj_fails_df

## `active_cuts_df`: when generated cuts are active, by objective

In [None]:
# Build `active_cuts_df`: per partial tree size, the average percent of GMICs
# and VPCs that are active, and for each objective type the share of VPCs it
# produced together with the share of those cuts that are active.
inst_set = best_gap_df.index
# inst_set = ['10teams_presolved', 'bm23_presolved', 'vpm1_presolved']

# (instance, best disjunction) pairs; not used below in this cell
inst_depth_set = [(inst, best_gap_df.loc[inst, 'BEST VPC DISJ']) for inst in inst_set]

# Row labels are raw strings so that "\%" reaches LaTeX unchanged without
# triggering Python's invalid-escape-sequence warning.  Rows 2 onward come in
# (% cuts, % active) pairs, one pair per entry of obj_types below.
rows = [
    r'\% active GMIC',
    r'\% active VPC',
    r'\% cuts one-sided',
    r'\% active one-sided',
    r'\% cuts all ones',
    r'\% active all ones',
    r'\% cuts post-GMIC opt',
    r'\% active post-GMIC opt',
    r'\% cuts DB',
    r'\% active DB',
]

columns = ['V+G ({:d})'.format(size) for size in sizes]

active_cuts_df = pd.DataFrame(
    columns = columns,
    index = rows,
    dtype = float
)

# Choose columns to pull
active_gmic_col = 'ACTIVE GMIC (all cuts)'
active_vpc_col = 'ACTIVE VPC (all cuts)'
selected_cols = [
    col_num_gmic,
    col_num_vpc,
    active_gmic_col,
    active_vpc_col,
] + [col for col in df.columns if "NUM CUTS " in col] + [col for col in df.columns if "NUM ACTIVE" in col]

curr_df = df.loc[inst_set,selected_cols]

# Objective types, in the same order as the row pairs above
obj_types = ['ONE_SIDED', 'ALL_ONES', 'ITER_BILINEAR', 'DISJ_LB']

num_inst_with_one_sided_cuts = [0 for size in sizes]
for curr_size_ind, curr_size in enumerate(sizes):
    # Select only this tree size
    curr_by_depth_df = curr_df.xs(curr_size, level='disj_terms')

    # active gmic
    active_cuts_df.iloc[0,curr_size_ind] =\
        (100. * curr_by_depth_df[active_gmic_col] / curr_by_depth_df[col_num_gmic]).mean()

    # active vpc
    active_cuts_df.iloc[1,curr_size_ind] =\
        (100. * curr_by_depth_df[active_vpc_col] / curr_by_depth_df[col_num_vpc]).mean()

    # Per objective type: share of all VPCs it produced (over instances with
    # VPCs), then share of its cuts that are active (over instances where it
    # produced at least one cut).  col_num_vpc is used throughout instead of
    # a hard-coded column name.
    inst_w_vpc = curr_by_depth_df[curr_by_depth_df[col_num_vpc] > 0]
    for obj_ind, obj in enumerate(obj_types):
        curr_row_index = 2 + 2 * obj_ind
        num_cuts_col = 'NUM CUTS '+obj

        active_cuts_df.iloc[curr_row_index,curr_size_ind] =\
            (100. * inst_w_vpc[num_cuts_col] / inst_w_vpc[col_num_vpc]).mean()

        inst_w_cuts = inst_w_vpc[inst_w_vpc[num_cuts_col] > 0]
        active_cuts_df.iloc[curr_row_index + 1,curr_size_ind] =\
            (100. * inst_w_cuts['NUM ACTIVE '+obj] / inst_w_cuts[num_cuts_col]).mean()

    # num one-sided cuts
    num_inst_with_one_sided_cuts[curr_size_ind] = sum(curr_by_depth_df['NUM CUTS ONE_SIDED'] > 0)

display(active_cuts_df)

print("Num inst with one-sided cuts (should be same across partial trees) =",num_inst_with_one_sided_cuts)
# curr_by_depth_df still refers to the LAST tree size processed by the loop
# above, so this total is for that size only.
print("Total num one-sided cuts =", sum(curr_by_depth_df['NUM CUTS ONE_SIDED']))

# Section 7: Export tables to LaTeX

## Format Table 1: gap closed and num wins

In [None]:
# Format Table 1: gap closed and num wins
#
# Takes avg_gap_df (rows indexed by (set, statistic)), blanks entries that
# should not be reported, turns the "# inst" entries into multirow cells, and
# renders the result as a LaTeX table via the pandas Styler.

# Create copy of table restricted to the reported columns
TABLE1 = avg_gap_df.copy(deep=True)[[inst_col_name]+gap_cols_short]

# The two columns edited below receive strings, so make them object columns
# up front instead of relying on implicit upcasting during assignment.
TABLE1 = TABLE1.astype({'G': object, inst_col_name: object})

# Every "wins" row, across all sets.  Assign through TABLE1.loc directly:
# chained assignment (TABLE1[col].loc[...] = ...) writes to a temporary object
# under pandas Copy-on-Write and would leave TABLE1 unchanged.
wins_rows = (slice(None), wins_row_name)

# Remove values we do not want (wins for 'G')
TABLE1.loc[wins_rows, 'G'] = ""

# Process the column with # inst to only report number of instances for each set
TABLE1.loc[wins_rows, inst_col_name] = ""
for set_name in (all_set_name, good_vpc_set_name, binary_set_name):
    avg_row = (set_name, avg_row_name)
    val = TABLE1.loc[avg_row, inst_col_name]
    TABLE1.loc[avg_row, inst_col_name] = \
        create_multirow_string(str(val), extra_format=r"\tablenum[table-format=3]")

# Reset index to appear as cols
TABLE1.reset_index(inplace=True)

# Place column with # inst as second column
inst_col = TABLE1[inst_col_name]
TABLE1.drop(columns=[inst_col_name], inplace=True)
TABLE1.insert(loc=1, column=inst_col_name, value=inst_col)

# Set column should have multirow
setseries = TABLE1['Set']
format_col_as_multirow(setseries)

# If we are not using the automatic tex-escaper, we need to do it ourselves
for col in TABLE1.columns:
    curr_col = tex_escape(str(col))
    TABLE1.rename({col: curr_col}, inplace=True, axis=1)

# Finally, apply the desired style.  The caption is a raw string so that
# "\epsilon" is not parsed as a Python escape sequence.
table1_str = TABLE1.style.\
    hide(axis=0).\
    format(formatter = int_format).\
    to_latex(
        column_format="""
        @{}l@{}
        S[table-format=2.0,table-auto-round,table-number-alignment=center]
        l
        *{1}{S[table-auto-round]}
        H
        *{8}{S[table-auto-round]}
        @{}""",
        hrules = True,
        sparse_index = True,
        multirow_align = "c",
        siunitx=True,
        convert_css = True,
        environment = "table",
        position_float = "centering",
        label = "tab:gap-closed-summary",
        caption = r"""
            Summary statistics for percent gap closed by VPCs.
            The wins row reports how many instances close at least $\epsilon$ more gap when comparing DB, V, V+G to G on its own, V+GurF to GurF, and V+GurL to GurL.
        """,
        )

# Add a midrule between the sets; the "3" is hand-coded but can be automated
table1_str = add_midrule(table1_str, -3)
table1_str = add_midrule(table1_str, -6)

# Adjustbox environment sets width to pagewidth
table1_str = add_adjustbox_environment(table1_str)

# Set default siunitx options for this table
table1_str = add_sisetup(table1_str)

print(table1_str)

## Format Table 2: depth x gap

In [None]:
# Format Table 2: percent gap closed by depth
#
# Renders gap_by_size_df as a LaTeX table and inserts two midrules.
TABLE2 = gap_by_size_df.copy(deep=True)

# If we are not using the automatic tex-escaper, we need to do it ourselves
for col in TABLE2.columns:
    curr_col = tex_escape(str(col))
    TABLE2.rename({col: curr_col}, inplace=True, axis=1)

# Finally, apply the desired style.  The caption is a raw string so that
# "\Gurobi" is not parsed as a Python escape sequence.
table2_str = TABLE2.style.\
    format(formatter = int_format).\
    to_latex(
        column_format="""
        @{}l
        *{6}{S[table-auto-round]}
        @{}""",
        hrules = True,
        sparse_index = True,
        multirow_align = "c",
        siunitx=True,
        convert_css = True,
        environment = "table",
        position_float = "centering",
        label = "tab:depth",
        caption = r"""
            Average percent gap closed broken down by the number of leaf nodes used to construct the partial branch-and-bound tree,
            for VPCs with and without GMICs, as well as at the root by \Gurobi{} after the first and last round of cuts. 
            ``Best'' refers to the maximum gap closed across all partial tree sizes.
        """,
        )

# Adjustbox environment sets width to pagewidth
# table2_str = add_adjustbox_environment(table2_str)

# Set default siunitx options for this table
table2_str = add_sisetup(table2_str)

# Add midrules relative to the \toprule and \bottomrule lines.  Both line
# numbers (1-based) are taken before the first midrule is inserted.
toprule_row_ind = table2_str.count("\n", 0, table2_str.index("\\toprule")) + 1
bottomrule_row_ind = table2_str.count("\n", 0, table2_str.index("\\bottomrule")) + 1
table2_str = add_midrule(table2_str, toprule_row_ind + 3)
table2_str = add_midrule(table2_str, bottomrule_row_ind + (-2))

print(table2_str)

## Format Table 3: summary of b&b results

In [None]:
# Show the complete avg_bb_df for reference.

# Lift the row limit so that no rows are elided in the output
pd.set_option('display.max_rows', None)

# Render floats with two decimal places.
# NOTE: unlike the row limit, this option is not reset below, so it stays in
# effect for subsequent cells.
pd.set_option('display.float_format', '{:.2f}'.format)

display(avg_bb_df)

# Put the row limit back to the pandas default
pd.reset_option('display.max_rows')


In [None]:
# Format Table 3: summary of b&b results
#
# Takes avg_bb_df (rows indexed by (class, bucket, metric)), formats the
# "wins" rows as integers, merges class and bucket into one multirow "Set"
# column, and renders the result as a LaTeX table via the pandas Styler.
TABLE3 = avg_bb_df.copy(deep=True)

# Retrieve classes, buckets, and metrics
final_bb_classes = avg_bb_df.index.get_level_values(0).unique()
final_bb_buckets = avg_bb_df.index.get_level_values(1).unique()
final_bb_metrics = avg_bb_df.index.get_level_values(2).unique()

# Indices of wins metrics, and the row selector for all of them.  The same
# selector is used for every read and write of the wins rows below, so the
# source and target of each assignment always refer to the same rows.
wins_metric_ind = [ ind for ind, metric in enumerate(final_bb_metrics) if metric.startswith('Wins') ]
wins_rows = (slice(None), slice(None), final_bb_metrics[wins_metric_ind])

# Change dtype of inst_col_name to string
TABLE3.loc[:, inst_col_name] = TABLE3.loc[:, inst_col_name].astype(str)

# Process the column with # inst to only report number of instances for each set
TABLE3.loc[wins_rows, inst_col_name] = ""

for curr_class in final_bb_classes:
    for curr_bucket in final_bb_buckets:
        # The count goes on the first metric row and spans all metric rows
        curr_name = (curr_class, curr_bucket, final_bb_metrics[0])
        val = TABLE3.loc[curr_name, inst_col_name]
        TABLE3.loc[curr_name, inst_col_name] = \
            create_multirow_string(str(val), num_rows = len(final_bb_metrics), extra_format=r"\tablenum[table-format=3]")

# Set num wins in int format (time columns)
tmp_df = TABLE3.loc[wins_rows,time_col_header].map(int_format, num_digits=4, add_phantom=True)
tmp_df.columns = pd.MultiIndex.from_product([[time_col_header],tmp_df.columns])
TABLE3.loc[wins_rows,time_col_header] = TABLE3.loc[wins_rows,time_col_header].astype(str)
TABLE3.loc[wins_rows,time_col_header] = tmp_df

# Set num wins in int format (node columns); rows are selected with wins_rows,
# exactly as for the time columns and as for the assignment target.
tmp_df = TABLE3.loc[wins_rows,node_col_header].map(int_format, num_digits=6, add_phantom=False)
tmp_df.columns = pd.MultiIndex.from_product([[node_col_header],tmp_df.columns])
TABLE3.loc[wins_rows,node_col_header] = tmp_df

# Remove unnecessary entries, which are the wins entries for the last time col
TABLE3.loc[wins_rows,([time_col_header,node_col_header],map_cols_to_short_time[col_vpc_gen_time])] = ""

# Reset index to appear as cols
TABLE3.reset_index(inplace=True)

# Add new col combining class and bucket in one multirow cell per group.
# Raw strings keep the LaTeX backslashes literal.
class_bucket_col = r"\multirow{" + str(len(final_bb_metrics)) + r"}{*}{\shortstack[l]{" + TABLE3['class'] + r"\\\relax " + TABLE3['bucket'] + "}}"
for i in range(len(class_bucket_col)):
    # Only the first row of each group of metrics carries the multirow cell
    if i%len(final_bb_metrics)!=0:
        class_bucket_col[i] = ""
TABLE3.drop(columns = ['class', 'bucket'], inplace = True, level = 0)
TABLE3.insert(loc=0, column="Set", value=class_bucket_col)

# Place column with # inst as second column
inst_col = TABLE3[inst_col_name]
TABLE3.drop(columns=[inst_col_name], inplace=True, level=0)
TABLE3.insert(loc=1, column=inst_col_name, value=inst_col)

# If we are not using the automatic tex-escaper, we need to do it ourselves
for col in TABLE3.columns:
    if isinstance(col, tuple):
        # MultiIndex columns: escape each level's label at its own level
        for lvl_ind, lvl_col in enumerate(col):
            curr_col = tex_escape(str(lvl_col))
            TABLE3.rename({lvl_col: curr_col}, inplace=True, axis=1, level=lvl_ind)
    else:
        curr_col = tex_escape(str(col))
        TABLE3.rename({col: curr_col}, inplace=True, axis=1)

# Finally, apply the desired style.  to_latex wraps column_format in braces
# itself, so the format must not end with an extra closing brace.
table3_str = TABLE3.style.\
    hide(axis=0).\
    to_latex(
        column_format="""
        @{}l    % set
        c       % inst
        l       % stat
        *{2}{S[table-auto-round,table-format=4.2]}
        *{2}{H}
        *{2}{S[table-auto-round,table-format=4.2]}        
        *{2}{S[table-auto-round,table-format=6.0]}
        *{1}{H}
        *{1}{S[table-auto-round,table-format=6.0]}
        @{}""",
        hrules = True,
        sparse_index = True,
        multirow_align = "c",
        siunitx=True,
        convert_css = True,
        environment = "table",
        position_float = "centering",
        label = "tab:bb-summary",
        caption = """
            Summary statistics for time to solve instances with branch-and-bound.
        """,
        )

# Add a midrule between consecutive sets; the offsets are hand-coded but can be automated
for ind in [-57,-49,-41,-33,-25,-17,-9]:
    table3_str = add_midrule(table3_str, ind)

# Adjustbox environment sets width to pagewidth
table3_str = add_adjustbox_environment(table3_str)

# Set default siunitx options for this table
table3_str = add_sisetup(table3_str, table_format="4.2")

print(table3_str)

## Format Table 4: number of leaf nodes yielding the best result for each experiment per instance

In [None]:
# Preview the table that supplies the 'Gap' columns of Table 4.
best_disj_gap_df

In [None]:
# Preview the table that supplies the 'Time' columns of Table 4.
best_disj_time_df

In [None]:
# Preview the table that supplies the 'Nodes' columns of Table 4.
best_disj_nodes_df

In [None]:
# Format Table 4: frequency of when each size is best
#
# Places the gap, time, and node tables side by side under a two-level column
# header, keeps only the 'All' column for time and nodes, and renders the
# result as a LaTeX table via the pandas Styler.
TABLE4 = pd.concat([best_disj_gap_df,best_disj_time_df,best_disj_nodes_df],axis=1)

# Create new column index: (experiment, instance class)
TABLE4.columns = pd.MultiIndex.from_tuples(
    [('Gap',col) for col in best_disj_gap_df.columns]
    + [('Time',col) for col in best_disj_time_df.columns]
    + [('Nodes',col) for col in best_disj_nodes_df.columns]
)

# From Time and Nodes, drop all but 'All' column
TABLE4.drop(
    columns=[col for col in TABLE4.columns if col[0] in ('Time', 'Nodes') and col[1] != 'All'],
    inplace=True,
)

# If we are not using the automatic tex-escaper, we need to do it ourselves.
# The columns are a MultiIndex, and rename() matches mapping keys against the
# individual labels of each level, so the labels are escaped level by level
# (a mapping keyed by the whole column tuple would match nothing).
for lvl_ind in range(TABLE4.columns.nlevels):
    lvl_map = {
        lvl_col: tex_escape(str(lvl_col))
        for lvl_col in TABLE4.columns.get_level_values(lvl_ind).unique()
    }
    TABLE4.rename(lvl_map, inplace=True, axis=1, level=lvl_ind)

# Finally, apply the desired style
table4_str = TABLE4.style.\
    to_latex(
        column_format="""@{}l*4{S}*1{S}@{}""",
        hrules = True,
        sparse_index = True,
        multirow_align = "c",
        siunitx=True,
        convert_css = True,
        environment = "table",
        position_float = "centering",
        label = "app:tab:size",
        caption = """
            Number of leaf nodes yielding the best result for each experiment per instance.
        """,
        )

# Adjustbox environment sets width to pagewidth
# table4_str = add_adjustbox_environment(table4_str)

# table4_str = add_midrule(table4_str, -2)

# Set default siunitx options for this table
table4_str = add_sisetup(table4_str, table_format="3.0")

print(table4_str)

In [None]:
# # Compute summary metrics: shifted geometric mean and wins for ref_time_col, refv_time_col, ref_nodes_col, and refv_nodes_col
# curr_df = df.loc[selected_time_instances, [col_num_vpc] + cols_time_by_depth + cols_nodes_by_depth]
# curr_df = curr_df[curr_df.index.get_level_values(1) > 0]

# # Calculate minimum value for each column, per instance (index level 0)
# curr_argmin_df = curr_df.groupby(level=0).idxmin()
# best_df = curr_df.loc[curr_argmin_df[refv_time_col]]

# # Sort by increasing value of cols_times_by_depth[1
# print("Sorting by increasing value of",cols_time_by_depth[1],"...")
# best_df = best_df.sort_values(by=cols_time_by_depth[1])

# best_df.head()

# # Move 'disj_terms' (second level of index) to column
# best_df.reset_index(level=1, inplace=True)
# best_df.head()

# # Set up best_df_summary_metrics_df to store the summary metrics
# best_df_summary_metrics_df = best_df.copy(deep=True)

# # Compute shifted geometric mean
# cols_for_shifted_time_gmean = [col for col in best_df.columns if 'TIME' in col]
# cols_for_shifted_nodes_gmean = [col for col in best_df.columns if 'NODES' in col]

# # Apply shift to each column
# for col in cols_for_shifted_time_gmean:
#     best_df_summary_metrics_df[col] = best_df_summary_metrics_df[col] + SHIFT_TIME
# for col in cols_for_shifted_nodes_gmean:
#     best_df_summary_metrics_df[col] = best_df_summary_metrics_df[col] + SHIFT_NODES

# # Compute shifted geometric mean for time
# best_df_summary_metrics_df.loc['Gmean'] = best_df_summary_metrics_df[cols_for_shifted_time_gmean+cols_for_shifted_nodes_gmean].apply(geometric_mean, axis=0)

# # Change shift back
# for col in cols_for_shifted_time_gmean:
#     best_df_summary_metrics_df[col] = best_df_summary_metrics_df[col] - SHIFT_TIME
# for col in cols_for_shifted_nodes_gmean:
#     best_df_summary_metrics_df[col] = best_df_summary_metrics_df[col] - SHIFT_NODES
# best_df_summary_metrics_df.loc['Gmean', cols_for_shifted_time_gmean] = best_df_summary_metrics_df.loc['Gmean', cols_for_shifted_time_gmean] - SHIFT_TIME
# best_df_summary_metrics_df.loc['Gmean', cols_for_shifted_nodes_gmean] = best_df_summary_metrics_df.loc['Gmean', cols_for_shifted_nodes_gmean] - SHIFT_NODES

# # Compute wins for ref_time_col, refv_time_col, ref_nodes_col, and refv_nodes_col
# best_df_summary_metrics_df.loc['Wins', cols_for_shifted_time_gmean] = \
#     [ 
#       int(sum( best_df_summary_metrics_df[refv_time_col] > WIN_BY_TIME_FACTOR * best_df_summary_metrics_df[ref_time_col] )),
#       int(sum( best_df_summary_metrics_df[ref_time_col] > WIN_BY_TIME_FACTOR * best_df_summary_metrics_df[refv_time_col] )),
#       int(sum( best_df_summary_metrics_df[ref_time_col] > WIN_BY_TIME_FACTOR * (best_df_summary_metrics_df[refv_time_col] + best_df_summary_metrics_df[col_vpc_gen_time]) )),
#     ]

# best_df_summary_metrics_df.loc['Wins', cols_for_shifted_nodes_gmean] = \
#     [ 
#       int(sum(best_df_summary_metrics_df[refv_nodes_col] > best_df_summary_metrics_df[ref_nodes_col])),
#       int(sum(best_df_summary_metrics_df[refcol] > best_df_summary_metrics_df[refv_nodes_col]))
#     ]

# best_df_summary_metrics_df.loc[['Gmean','Wins']]

## Format Table 5: density statistics

In [None]:
# Format Table 5: density statistics
#
# Renders density_df as a LaTeX table, with the first two rows (# inst and
# # wins) formatted as integers.
TABLE5 = density_df.copy(deep=True)

# Make # inst and wins rows int (formatted from the original float values)
tmp_df = TABLE5.iloc[0:2].map(int_format, num_digits=3, add_phantom=False)

# The formatted cells are strings, so switch the frame to object dtype before
# writing them back rather than relying on implicit upcasting of float columns.
TABLE5 = TABLE5.astype(object)
TABLE5.iloc[0:2] = tmp_df

# Finally, apply the desired style
table5_str = TABLE5.style.\
    to_latex(
        column_format="""@{}l*{6}{S[table-format=0.3,table-auto-round,table-number-alignment=center]}@{}""",
        hrules = True,
        sparse_index = True,
        multirow_align = "c",
        siunitx=True,
        convert_css = True,
        environment = "table",
        position_float = "centering",
        label = "app:tab:density",
        caption = """
            Statistics about the density of generated cuts broken down by partial tree size.
        """,
        )

# Adjustbox environment sets width to pagewidth
# table5_str = add_adjustbox_environment(table5_str)

# table5_str = add_midrule(table5_str, -2)

# Set default siunitx options for this table
table5_str = add_sisetup(table5_str, table_format="0.3")

print(table5_str)

## Format Table 6: failures

In [None]:
# Format Table 6: failures
TABLE6 = obj_fails_df.copy(deep=True)

# Render the first five rows (instance counts) as integers
tmp_df = TABLE6.iloc[:5].map(int_format, num_digits=3, add_phantom=False)
TABLE6.iloc[:5] = tmp_df

# Export to LaTeX; the index is left visible (hide(axis=0) is not applied here)
table6_str = (
    TABLE6.style
    .to_latex(
        column_format="""@{}l*{6}{S[table-format=2.2,table-auto-round,table-number-alignment=center]}@{}""",
        hrules=True,
        sparse_index=True,
        multirow_align="c",
        siunitx=True,
        convert_css=True,
        environment="table",
        position_float="centering",
        label="app:tab:objectives",
        caption="""
            Statistics about the objectives leading to failures, broken down by partial tree size used for cut generation.
        """,
    )
)

# Insert midrules at hand-coded offsets, in the same order as before
# (each call operates on the string returned by the previous one)
for midrule_offset in (-4, -8, -13, -15):
    table6_str = add_midrule(table6_str, midrule_offset)

# Set default siunitx options for this table
table6_str = add_sisetup(table6_str, table_format="2.2")

print(table6_str)

## Format Table 7: active cuts

In [None]:
# Format Table 7: active cuts
TABLE7 = active_cuts_df.copy(deep=True)

# Export to LaTeX; the index is left visible (hide(axis=0) is not applied here)
# NOTE(review): column_format uses table-format=3.2 while add_sisetup below is
# given "2.2" -- confirm which one is intended.
table7_str = (
    TABLE7.style
    .to_latex(
        column_format="""@{}l*{6}{S[table-format=3.2,table-auto-round,table-number-alignment=center]}@{}""",
        hrules=True,
        sparse_index=True,
        multirow_align="c",
        siunitx=True,
        convert_css=True,
        environment="table",
        position_float="centering",
        label="app:tab:activity",
        caption="""
            Statistics about when generated cuts are active, broken down by partial tree size.
        """,
    )
)

# Set default siunitx options for this table
table7_str = add_sisetup(table7_str, table_format="2.2")

print(table7_str)

## Format Table 8: objective + time analysis per instance

In [None]:
# Format Table 8: obj and time analysis
# Builds a LaTeX table from obj_and_time_df (one row per instance) and prints it.
TABLE8 = obj_and_time_df.copy(deep=True)

# Move instance names into a column
TABLE8.reset_index(inplace=True)
TABLE8.drop('disj_terms',axis=1,inplace=True)

# Create new column index (two header levels: group, column name).
# The backslash in '\\% fails' is written explicitly: '\%' is not a valid
# Python escape sequence and triggers a warning on recent interpreters.
TABLE8.columns = pd.MultiIndex.from_tuples(
    [('','Instance'),
    ('Objectives','Obj'),
    ('Objectives','Succ'),
    ('Objectives','Fails'),
    ('Objectives','\\% fails'),
    ('Time (s)','Total'),
    ('Time (s)','(s) / obj'),
    ('Time (s)','(s) / cut')]
)

# Format instance column correctly
TABLE8[('',"Instance")] = TABLE8[('',"Instance")].apply(remove_presolved_from_name)
TABLE8[('',"Instance")] = TABLE8[('',"Instance")].apply(tex_escape)

# Format SKIP_CHAR correctly
for col in TABLE8.columns:
    TABLE8[col] = TABLE8[col].apply(enclose_in_braces, val_to_match=SKIP_CHAR)

# If we are not using the automatic tex-escaper, we need to do it ourselves
# NOTE(review): the columns are a MultiIndex here, so `col` is a tuple and
# str(col) is the tuple's repr -- confirm this rename has the intended effect.
for col in TABLE8.columns:
    # curr_col = '{' + tex_escape(col) + '}'
    curr_col = tex_escape(str(col))
    TABLE8.rename({col: curr_col}, inplace=True, axis=1)

# Finally, apply the desired style.
# The caption is a raw string: in a normal string literal the "\r" of "\ref"
# is a carriage-return escape, which would corrupt the LaTeX cross-reference.
table8_str = TABLE8.style.\
    hide(axis=0).\
    to_latex(
        column_format="""
        @{}
        l
        *{3}{S[table-format=3.0,table-auto-round,table-number-alignment=center]}
        *{1}{S[table-format=2.1,table-auto-round,table-number-alignment=center]}
        *{1}{S[table-format=4.1,table-auto-round,table-number-alignment=center]}
        *{2}{S[table-format=4.1,table-auto-round,table-number-alignment=center]}
        @{}""",
        hrules = True,
        sparse_index = True,
        multirow_align = "c",
        siunitx=True,
        convert_css = True,
        environment = "table",
        position_float = "centering",
        label = "app:tab:obj-and-time-best",
        caption = r"""
            Information about objectives and time to generate cuts corresponding to the results in Table~\ref{app:tab:gap-closed}.
        """,
        )

# Adjustbox environment sets width to pagewidth
# table8_str = add_adjustbox_environment(table8_str)

# Hand-coded midrule position
table8_str = add_midrule(table8_str, -2)

# Set default siunitx options for this table
table8_str = add_sisetup(table8_str)

print(table8_str)

## Prepare Table 9: rejected instances

In [None]:
# Write df_rejection_reason, including its index, to 'rejection_reason.csv'
# (relative path, so the file lands in the current working directory)
df_rejection_reason.to_csv('rejection_reason.csv', index=True)

#### Verbose version

In [None]:
## *Verbose version*: For each instance that was not selected, record the reason
df_rejection_reason_rejected = df_rejection_reason[df_rejection_reason['SELECTED_GAP'] == False]
rejected_instance_list = df_rejection_reason_rejected.index
rejected_instance_list.name = 'Instance'
cols = ['Set', 'Reason']
df_rejected_instances = pd.DataFrame(columns=cols, index=rejected_instance_list)
df_rejected_instances['Set'] = df_ipopt.loc[rejected_instance_list,'SET']

# Counters read from the rejected rows; the value 6 they are compared against
# is presumably the number of partial tree sizes -- TODO confirm
rej_opt_found = df_rejection_reason_rejected['OPTIMAL_SOLUTION_FOUND']
rej_lp_eq = df_rejection_reason_rejected['LP=DLB=DUB']
rej_lp_or_infeas = rej_lp_eq + df_rejection_reason_rejected['PRLP_INFEASIBLE']
rej_lp_infeas_or_tl = rej_lp_or_infeas + df_rejection_reason_rejected['PRLP_TIME_LIMIT']
rej_no_opt = rej_opt_found == 0

# (mask, reason) pairs, applied in order: a later rule overwrites an earlier
# one on any row matched by both, so "Numerical issues" takes precedence.
rej_reason_rules = [
    (rej_opt_found > 0,
        "Integer-optimal solution found constructing partial tree"),
    (rej_no_opt & (rej_lp_eq == 6),
        "Max leaf value = LP value"),
    (rej_no_opt & (rej_lp_eq < 6) & (rej_lp_or_infeas == 6),
        "Max leaf value = LP value or PRLP primal infeasible"),
    (rej_no_opt & (rej_lp_eq < 6) & (rej_lp_or_infeas < 6) & (rej_lp_infeas_or_tl == 6),
        "Max leaf value = LP value or PRLP primal infeasible / hits time limit"),
    (df_rejection_reason_rejected['<7_ATTEMPTS'] > 0,
        "Numerical issues"),
]
for rej_mask, rej_reason in rej_reason_rules:
    df_rejected_instances.loc[rej_mask, 'Reason'] = rej_reason

display(df_rejected_instances.head())
col_format = """@{}*{2}{l}X@{}"""

# Show any rejected instance that none of the rules above explained
rej_unexplained_idx = df_rejected_instances[df_rejected_instances['Reason'].isna()].index
tmp_df_remaining_rejected_instances = df_rejection_reason.loc[rej_unexplained_idx]
if len(tmp_df_remaining_rejected_instances) > 0:
    display(tmp_df_remaining_rejected_instances)

#### Succinct version

In [None]:
## *Succinct version*: For each instance that was not selected, show its per-depth status
# Relies on `rejected_instance_list` having been set by an earlier cell.
rej_not_selected_idx = df_rejection_reason[df_rejection_reason['SELECTED_GAP'] == False].index
df_rejected_instances = df_status_by_depth.loc[rej_not_selected_idx]
df_rejected_instances.insert(
    loc=0,
    column='Set',
    value=df_ipopt.loc[rejected_instance_list, 'SET'],
)
col_format = """@{}*{2}{l}*{6}{c}@{}"""

### Print Table 9

In [None]:
# Format Table 9: rejected instances and the reason each was discarded
TABLE9 = df_rejected_instances.copy(deep=True)
TABLE9.reset_index(inplace=True)

# Strip the presolved suffix from instance names, then TeX-escape them
TABLE9["Instance"] = TABLE9["Instance"].apply(remove_presolved_from_name).apply(tex_escape)

# The automatic tex-escaper is not used, so escape each column header by hand
for col in TABLE9.columns:
    TABLE9.rename({col: tex_escape(str(col))}, inplace=True, axis=1)

# Export to LaTeX with the row index hidden; `col_format` comes from whichever
# of the verbose / succinct cells ran last
table9_str = (
    TABLE9.style
    .hide(axis=0)
    .to_latex(
        column_format=col_format,
        hrules=True,
        sparse_index=True,
        multirow_align="c",
        siunitx=False,
        convert_css=True,
        environment="table",
        position_float="centering",
        label="app:tab:discarded-instances",
        caption="""
            Instances that were not considered with reason for being discarded.
        """,
    )
)

print(table9_str)

In [None]:
# Number of rows in TABLE9 (shown as the cell output)
len(TABLE9)

### DEBUG: Test Table 9 code and make sure "set" is properly identified

In [None]:
#### DEBUG
# df_rejection_reason[df_rejection_reason['NUM_WITH_OBJS'] != df_rejection_reason['NUM_WITH_CUTS']]
# df_rejection_reason[(df_rejection_reason['NUM_WITH_CUTS'] > 0) & (df_rejection_reason['DLB=DUB'] > 0) & (df_rejection_reason['OPTIMAL_SOLUTION_FOUND'] == 0)]
# df_rejection_reason[(df_rejection_reason['LP=DLB=DUB'] == 6)]

# inst = 'chromaticindex32-8_presolved'
# # df_rejection_reason.loc[inst]
# tmp = df_bb.loc[(inst,64)]
# tmp[25:50]

# len(df_rejection_reason[df_rejection_reason['SELECTED'] == True])
# inst = 'berlin_5_8_0_presolved'
# gap_df.loc[inst]
#df_rejection_reason.loc['bnatt400_presolved']

In [None]:
##### DEBUG: Verify "Set" col is correct
# For every rejected instance, compare the 'SET' value in df_ipopt against the
# 'Set' value stored in df_rejected_instances; stop at the first discrepancy.
for inst in rejected_instance_list:
    curr_set = df_ipopt.loc[inst,'SET']
    has_error = False
    if isinstance(curr_set, pd.Series):
        # The lookup returned several rows for this instance:
        # check that all sets are same, then just take first.
        # Use .iloc for positional access; integer keys passed to Series[...]
        # on a label-indexed Series are deprecated as positional lookups.
        first_set = curr_set.iloc[0]
        for tmp_set in curr_set:
            if tmp_set != first_set:
                print("*** ERROR: not all sets are equal ({} != {})".format(first_set, tmp_set))
                has_error = True
                break
        curr_set = first_set
    ref_set = df_rejected_instances.loc[inst, 'Set']
    if ref_set != curr_set:
        print("*** ERROR: for inst {}, df_rej_inst set {} != df_ipopt set {}".format(inst, ref_set, curr_set))
        has_error = True

    if has_error:
        break

## Format Table 10: app:tab:gap-closed: full gap closed results

In [None]:
# Copy of all_gap_results_df without the ('% gap closed', 'R') column
subset_all_gap_results_df = all_gap_results_df.drop(columns=('% gap closed','R'))
# Preview the first five rows as the cell output
subset_all_gap_results_df.head()

In [None]:
# Format Table 10: full gap closed results
# Builds a LaTeX table from subset_all_gap_results_df and prints it.
TABLE10 = subset_all_gap_results_df.copy(deep=True)

# Set wins row to be integer valued
TABLE10.loc['Wins'] = TABLE10.loc['Wins'].apply(int_format)

# Move instance names into a column
TABLE10.reset_index(inplace=True, col_level=1)

TABLE10[('',"Instance")] = TABLE10[('',"Instance")].apply(remove_presolved_from_name)
TABLE10[('',"Instance")] = TABLE10[('',"Instance")].apply(tex_escape)

# If we are not using the automatic tex-escaper, we need to do it ourselves
for col in TABLE10.columns:
    # curr_col = '{' + tex_escape(col) + '}'
    curr_col = tex_escape(str(col))
    TABLE10.rename({col: curr_col}, inplace=True, axis=1)

# Finally, apply the desired style.
# The caption is a raw string so that the LaTeX macro "\Gurobi" is not parsed
# as an (invalid) Python escape sequence; the resulting text is unchanged.
    # format(formatter = int_format).\
table10_str = TABLE10.style.\
    hide(axis=0).\
    to_latex(
        column_format="""
	@{}l*{2}{S[table-format=4.0,table-auto-round,table-number-alignment=center]}
	*{2}{S[table-format=4.0,table-auto-round,table-number-alignment=center]}
	*{8}{S[table-auto-round]}
	@{}
        """,
        hrules = True,
        sparse_index = True,
        multirow_align = "c",
        siunitx=True,
        convert_css = True,
        environment = "table",
        position_float = "centering",
        label = "app:tab:gap-closed",
        caption = r"""
            Percent gap closed by instance for GMICs (G), VPCs (V), both VPCs and GMICs used together, 
            and the bound implied by the partial branch-and-bound tree with 64 leaf nodes (DB).
            Also shown are the sizes of the instances, the number of cuts added, and the percent gap closed by 
            \Gurobi{} at the root (after one round (GurF) and after the last round (GurL)). 
            Entries in which DB appears to be 0.00 are actually small strictly positive numbers.
        """,
        )

# Adjustbox environment sets width to pagewidth
# table10_str = add_adjustbox_environment(table10_str)

# Set default siunitx options for this table
table10_str = add_sisetup(table10_str)

# Add a midrule between the instances and 3 summary rows; the "5" is hand-coded but can be automated
table10_str = add_midrule(table10_str, -5)

print(table10_str)

## Format Table 11: "all" time/nodes results

In [None]:
# Format Table 11: "all" time/nodes results
TABLE11 = best_df_summary_metrics_df.copy(deep=True)

# Blank out nan values in the 'Gmean' and 'Wins' summary rows
rows_to_remove_nan = ['Gmean','Wins']
TABLE11.loc[rows_to_remove_nan] = TABLE11.loc[rows_to_remove_nan].fillna("")

# Render the wins row as integers
TABLE11.loc['Wins'] = TABLE11.loc['Wins'].apply(int_format)

# Turn the instance-name index into a regular column and normalise its header
TABLE11.reset_index(inplace=True, col_level=1)
TABLE11.rename(columns={'INSTANCE': 'Instance'}, inplace=True)

# Strip the presolved suffix from instance names, then TeX-escape them
TABLE11[('',"Instance")] = TABLE11[('',"Instance")].apply(remove_presolved_from_name).apply(tex_escape)

# The automatic tex-escaper is not used, so escape each column header by hand
for col in TABLE11.columns:
    TABLE11.rename({col: tex_escape(str(col))}, inplace=True, axis=1)

# Export to LaTeX with the row index hidden
table11_str = (
    TABLE11.style
    .hide(axis=0)
    .to_latex(
        column_format="""
          @{}l % instance
          *{1}{S[table-format=4.0,table-auto-round,table-number-alignment=center]} % # terms
          *{1}{S[table-format=4.0,table-auto-round,table-number-alignment=center]} % # cuts
          *{1}{S[table-format=4.2,table-auto-round]} % Gur
          *{2}{S[table-format=4.2,table-auto-round]} % V, Gen
          *{2}{S[table-format=8.0,table-auto-round,table-number-alignment=center]} % Nodes
          @{}
        """,
        hrules=True,
        sparse_index=True,
        multirow_align="c",
        siunitx=True,
        convert_css=True,
        environment="table",
        position_float="centering",
        label="app:tab:bb",
        caption="""
            Time (in seconds) and number nodes taken to solve each instance,
            for the disjunction size with best solving time with VPCs per instance.
            The table is sorted by column 4 (``V'' under ``Time (s)'').
        """,
    )
)

# Set default siunitx options for this table
table11_str = add_sisetup(table11_str)

# Add a midrule between the instances and the summary rows; the offset -5 is hand-coded
table11_str = add_midrule(table11_str, -5)

print(table11_str)

In [None]:
%%script false --no-raise-error
# NOTE: this cell is disabled. The `%%script false` magic above hands the cell
# body to the `false` program instead of running it as Python, so nothing
# below executes; `--no-raise-error` keeps the notebook from reporting a failure.

# Format Table 6: "all" time/nodes results
TABLE6 = all_bb_results_df.copy(deep=True)

# Rename summary rows to reflect the set
rename_metrics_all = {metric : metric + ' (All)' for metric in bb_metrics}
TABLE6.rename(rename_metrics_all, inplace=True)

# Add summary rows from 6 trees set
summary_metrics_6trees = all6_bb_results_df.tail(3).copy(deep=True)
rename_metrics_6trees = {metric : metric + ' (6 trees)' for metric in bb_metrics}
summary_metrics_6trees.rename(rename_metrics_6trees, inplace=True)

TABLE6 = pd.concat([TABLE6, summary_metrics_6trees])

# Drop rows, cols, (time,V7)
TABLE6.drop([('','Rows'),('','Cols'),(node_col_header,map_cols_to_short_time[mintime_col])], axis=1, inplace=True)

# Set wins row to be integer valued
TABLE6.loc['Wins1 (All)'] = TABLE6.loc['Wins1 (All)'].apply(int_format)
TABLE6.loc['Wins7 (All)'] = TABLE6.loc['Wins7 (All)'].apply(int_format)
TABLE6.loc['Wins1 (6 trees)'] = TABLE6.loc['Wins1 (6 trees)'].apply(int_format)
TABLE6.loc['Wins7 (6 trees)'] = TABLE6.loc['Wins7 (6 trees)'].apply(int_format)
# TABLE6.iloc[len(TABLE6)-1] = TABLE6.iloc[len(TABLE6)-1].apply(int_format)

# Move instance names into a column
TABLE6.reset_index(inplace=True, col_level=1)

# Store indices of rows of 6-tree instances
six_trees_instances = list(all6_instances_dict.keys())
# mask = TABLE6[('','Instance')].isin(six_trees_instances)
# six_trees_indices = TABLE6.loc[mask, :].index.tolist()

# Remove presolved from name and escape
TABLE6[('',"Instance")] = TABLE6[('',"Instance")].apply(remove_presolved_from_name)
TABLE6[('',"Instance")] = TABLE6[('',"Instance")].apply(tex_escape)

# If we are not using the automatic tex-escaper, we need to do it ourselves
for col in TABLE6.columns:
    # curr_col = '{' + tex_escape(col) + '}'
    curr_col = tex_escape(str(col))
    TABLE6.rename({col: curr_col}, inplace=True, axis=1)

# Finally, apply the desired style
    # format(formatter = int_format).\
table6_str = TABLE6.style.\
    hide(axis=0).\
    to_latex(
        column_format="""
	@{}l % instance
	*{1}{S[table-format=4.0,table-auto-round,table-number-alignment=center]} % # cuts
	*{2}{S[table-format=4.2,table-auto-round]} % Gur1, Gur7
	*{2}{H} % V, Total
	*{2}{S[table-format=4.2,table-auto-round]} % V7, Total7
	*{3}{S[table-format=8.0,table-auto-round,table-number-alignment=center]} % Nodes
	@{}
        """,
        hrules = True,
        sparse_index = True,
        multirow_align = "c",
        siunitx=True,
        convert_css = True,
        environment = "table",
        position_float = "centering",
        label = "app:tab:bb",
        caption = """
            Time (in seconds) and number nodes taken to solve each instance.
            The table is sorted by column 4 (``V'' under ``Time (s)'').
            ``Gur1'' indicates \Gurobi{} run with one random seed.
            ``Gur7'' indicates the minimum from seven runs of \Gurobi{} with different random seeds.
        """,
        )

# Adjustbox environment sets width to pagewidth
# table6_str = add_adjustbox_environment(table6_str)

# Set default siunitx options for this table
table6_str = add_sisetup(table6_str)

# Add a midrule between the instances and 3 summary rows; the "6" is hand-coded but can be automated
table6_str = add_midrule(table6_str, -6)
table6_str = add_midrule(table6_str, -10)

# Add color to six tree instances: a LaTeX line is prefixed with \rowcolor when
# its first '&'-separated cell, plus '_presolved', is in six_trees_instances
splitlines = table6_str.splitlines()
for i in range(len(splitlines)):
    line = splitlines[i]
    curr_line = line.split('&')
    if len(curr_line) > 0 and curr_line[0].strip()+'_presolved' in six_trees_instances:
        splitlines[i] = '\\rowcolor{lightgray!30} ' + line
# Re-join the lines and blank out any literal 'NaN' text
table6_str = '\n'.join(splitlines).replace('NaN', '')

print(table6_str)

## XXX Format Table X: "6 trees" time/nodes results

In [None]:
%%script false --no-raise-error
# NOTE: this cell is disabled. The `%%script false` magic above hands the cell
# body to the `false` program instead of running it as Python, so nothing
# below executes; `--no-raise-error` keeps the notebook from reporting a failure.

# Format Table 7: "6 trees" time/nodes results
TABLEX = all6_bb_results_df.copy(deep=True)

# Set wins row to be integer valued
TABLEX.loc['Wins1'] = TABLEX.loc['Wins1'].apply(int_format)
TABLEX.loc['Wins7'] = TABLEX.loc['Wins7'].apply(int_format)
# TABLEX.iloc[len(TABLEX)-1] = TABLEX.iloc[len(TABLEX)-1].apply(int_format)

# Move instance names into a column
TABLEX.reset_index(inplace=True, col_level=1)

TABLEX[('',"Instance")] = TABLEX[('',"Instance")].apply(remove_presolved_from_name)
TABLEX[('',"Instance")] = TABLEX[('',"Instance")].apply(tex_escape)

# If we are not using the automatic tex-escaper, we need to do it ourselves
for col in TABLEX.columns:
    # curr_col = '{' + tex_escape(col) + '}'
    curr_col = tex_escape(str(col))
    TABLEX.rename({col: curr_col}, inplace=True, axis=1)

# Finally, apply the desired style
    # format(formatter = int_format).\
tableX_str = TABLEX.style.\
    hide(axis=0).\
    to_latex(
        column_format="""@{}l*{2}{c}*{2}{c}H*{8}{c}@{}""",
        hrules = True,
        sparse_index = True,
        multirow_align = "c",
        siunitx=True,
        convert_css = True,
        environment = "table",
        position_float = "centering",
        label = "app:tab:bb-7trees",
        caption = """
  Time (in seconds) and number nodes taken to solve each of the instances for which all six branch-and-bound trees successfully yielded VPCs.
  %The columns with V1x are those in which we do not terminate the VPC computation as soon as the time exceeds \Gurobi{}'s time.  
  The table is sorted by column 4 (``V7'' under ``Time (s)'').
  ``Gur1'' indicates Gurobi run with one random seed.
  ``Gur7'' indicates the minimum from seven runs of Gurobi with different random seeds.
        """,
        )

# Adjustbox environment sets width to pagewidth
# tableX_str = add_adjustbox_environment(tableX_str)

# Set default siunitx options for this table
tableX_str = add_sisetup(tableX_str)

# Add a midrule between the instances and 3 summary rows; the "6" is hand-coded but can be automated
tableX_str = add_midrule(tableX_str, -6)

print(tableX_str)

## XXX Format Table X: (now Table 3) b&b summary by depth

In [None]:
# # Format Table 7: summary of b&b results
# TABLE7 = avg_bb_by_depth_df.copy(deep=True)

# # Remove unnecessary entries
# # TABLE7.loc[(slice(None), slice(None), bb_metrics_by_depth[1:]),([time_col_header,node_col_header],map_cols_to_short_time[gur1time_col])] = ""
# # TABLE7.loc[(slice(None), slice(None), bb_metrics_by_depth[2]),([time_col_header,node_col_header],map_cols_to_short_time[gur7time_col])] = ""

# # Process the column with # inst to only report number of instances for each set
# TABLE7.loc[(slice(None), slice(None), bb_metrics_by_depth[1:]), inst_col_name] = ""

# for curr_class in bb_classes_by_depth:
#     for curr_bucket in bb_buckets_by_depth:
#         curr_name = (curr_class, curr_bucket, bb_metrics_by_depth[0])
#         val = TABLE7.loc[curr_name, inst_col_name]
#         TABLE7.loc[curr_name, inst_col_name] = \
#             create_multirow_string(str(val), num_rows = 2, extra_format=r"\tablenum[table-format=3]")

# # Set num wins in int format or enclose in braces (center)
# # tmp_df = TABLE7.loc[(slice(None), slice(None), bb_metrics_by_depth[1:3]),time_col_header].applymap(int_format, num_digits=6)
# tmp_df = TABLE7.loc[(slice(None), slice(None), bb_metrics_by_depth[1:3]),time_col_header].applymap(int_format, num_digits=4, add_phantom=True)
# # tmp_df = TABLE7.loc[(slice(None), slice(None), bb_metrics_by_depth[1:3]),time_col_header].applymap(enclose_in_braces)
# tmp_df.columns = pd.MultiIndex.from_product([[time_col_header],tmp_df.columns])
# TABLE7.loc[(slice(None), slice(None), bb_metrics_by_depth[1:3]),time_col_header] = tmp_df

# # tmp_df = TABLE7.loc[(slice(None), slice(None), bb_metrics_by_depth[1:3]),node_col_header].applymap(int_format, num_digits=6)
# tmp_df = TABLE7.loc[(slice(None), slice(None), bb_metrics_by_depth[1:3]),node_col_header].applymap(int_format, num_digits=6, add_phantom=False)
# # tmp_df = TABLE7.loc[(slice(None), slice(None), bb_metrics_by_depth[1:3]),node_col_header ].applymap(enclose_in_braces)
# tmp_df.columns = pd.MultiIndex.from_product([[node_col_header],tmp_df.columns])
# TABLE7.loc[(slice(None), slice(None), bb_metrics_by_depth[1:3]),node_col_header] = tmp_df

# # Reset index to appear as cols
# TABLE7.reset_index(inplace=True)

# # Add new col combining class and bucket in one
# class_bucket_col = "\multirow{2}{*}{\shortstack[l]{" + TABLE7['class'] + "\\\\\\relax " + TABLE7['bucket'] + "}}"
# for i in range(len(class_bucket_col)):
#     if i%len(bb_metrics_by_depth)!=0:
#         class_bucket_col[i] = ""
# TABLE7.drop(columns = ['class', 'bucket'], inplace = True, level = 0)
# TABLE7.insert(loc=0, column="Set", value=class_bucket_col)

# # Place column with # inst as second column
# inst_col = TABLE7[inst_col_name]
# TABLE7.drop(columns=[inst_col_name], inplace=True, level=0)
# TABLE7.insert(loc=1, column=inst_col_name, value=inst_col)

# # If we are not using the automatic tex-escaper, we need to do it ourselves
# for col in TABLE7.columns:
#     if isinstance(col, tuple):
#         for lvl_ind, lvl_col in enumerate(col):
#             curr_col = tex_escape(str(lvl_col))
#             TABLE7.rename({lvl_col: curr_col}, inplace=True, axis=1, level=lvl_ind)
#     else:
#         # curr_col = '{' + tex_escape(col) + '}'
#         # curr_col = col
#         curr_col = tex_escape(str(col))
#         TABLE7.rename({col: curr_col}, inplace=True, axis=1)

# # Finally, apply the desired style
#     # format(formatter = int_format).\
# table7_str = TABLE7.style.\
#     hide(axis=0).\
#     to_latex(
#         column_format="""
#         @{}l    % set
#         c       % inst
#         l       % stat
#         *{3}{S[table-auto-round,table-format=4.2]}
#         *{2}{S[table-auto-round,table-format=6.0]}
#         @{}""",
#         hrules = True,
#         sparse_index = True,
#         multirow_align = "c",
#         siunitx=True,
#         convert_css = True,
#         environment = "table",
#         position_float = "centering",
#         label = "tab:bb-summary",
#         caption = """
#             Summary statistics for time to solve instances with branch-and-bound.
#         """,
#         )

# # Add a midrule between the two sets; the "9" is hand-coded but can be automated
# table7_str = add_midrule(table7_str, -41)
# table7_str = add_midrule(table7_str, -33)
# table7_str = add_midrule(table7_str, -25)
# table7_str = add_midrule(table7_str, -17)
# table7_str = add_midrule(table7_str, -9)

# # Adjustbox environment sets width to pagewidth
# table7_str = add_adjustbox_environment(table7_str)

# # Set default siunitx options for this table
# table7_str = add_sisetup(table7_str, table_format="4.2")

# print(table7_str)

## Format Table 13: instances with best bb improvement from VPCs

In [None]:
%%script false --no-raise-error
# NOTE: this cell is disabled. The `%%script false` magic above hands the cell
# body to the `false` program instead of running it as Python, so nothing
# below executes.
# Format Table 13: "best" time/nodes results
TABLE13 = all_bb_results_df.copy(deep=True)

# Drop all rows in which (time_col_header, 'Gur7') is ''
TABLE13 = TABLE13[TABLE13[(time_col_header, 'Gur7')] != '']

# TABLE13[('','V7-Gur7')]
# Change column (time_col_header, V7) to be float valued
#TABLE13[(time_col_header, 'V7')] = TABLE13[(time_col_header, 'V7')].apply(float_format, num_digits=4)
TABLE13 = TABLE13.astype({(time_col_header, 'V7'): float})
TABLE13 = TABLE13.astype({(time_col_header, 'Gur7'): float})

# Add new column for difference between V7 and Gur7
# NOTE(review): this column is created under the literal 'Time (s)' header but
# sorted below via time_col_header -- presumably the two are equal; confirm.
TABLE13[('Time (s)','V7 - Gur7')] = TABLE13[('Time (s)','V7')] - TABLE13[('Time (s)','Gur7')]

# Sort by V7 - Gur7
TABLE13.sort_values(by=[(time_col_header,'V7 - Gur7')], inplace=True)

# Preview the first 15 rows
TABLE13.head(15)

In [None]:
%%script false --no-raise-error
# NOTE: this cell is disabled by the `%%script false` magic above; nothing below executes.
### DEBUG DEBUG DEBUG
# inst = 'cost266-UUE_presolved'
# hawea instance
inst = 'neos-3592146-hawea_presolved'
# Show this instance's row of all_bb_results_df, if it is present
if inst in all_bb_results_df.index:
    display(all_bb_results_df.loc[inst])

In [None]:
%%script false --no-raise-error
# NOTE: this cell is disabled. The `%%script false` magic above hands the cell
# body to the `false` program instead of running it as Python, so nothing
# below executes.
# Format Table 13: "best" time/nodes results
TABLE13 = all_bb_results_df.copy(deep=True)

# Drop all rows in which (time_col_header, 'Gur7') is ''
TABLE13 = TABLE13[TABLE13[(time_col_header, 'Gur7')] != '']

# TABLE13[('','V7-Gur7')]
# Change column (time_col_header, V7) to be float valued
#TABLE13[(time_col_header, 'V7')] = TABLE13[(time_col_header, 'V7')].apply(float_format, num_digits=4)
TABLE13 = TABLE13.astype({(time_col_header, 'V7'): float})
TABLE13 = TABLE13.astype({(time_col_header, 'Gur7'): float})

# Add new column for difference between V7 and Gur7
# NOTE(review): this column is created under the literal 'Time (s)' header but
# sorted below via time_col_header -- presumably the two are equal; confirm.
TABLE13[('Time (s)','V7 - Gur7')] = TABLE13[('Time (s)','V7')] - TABLE13[('Time (s)','Gur7')]

# Sort by V7 - Gur7
TABLE13.sort_values(by=[(time_col_header,'V7 - Gur7')], inplace=True)

# Rename summary rows to reflect the set
rename_metrics_all = {metric : metric + ' (All)' for metric in bb_metrics}
TABLE13.rename(rename_metrics_all, inplace=True)

# Add summary rows from 6 trees set
summary_metrics_6trees = all6_bb_results_df.tail(3).copy(deep=True)
rename_metrics_6trees = {metric : metric + ' (6 trees)' for metric in bb_metrics}
summary_metrics_6trees.rename(rename_metrics_6trees, inplace=True)

TABLE13 = pd.concat([TABLE13, summary_metrics_6trees])

# Drop rows, cols, (time,V7)
TABLE13.drop([('','Rows'),('','Cols'),(node_col_header,map_cols_to_short_time[mintime_col])], axis=1, inplace=True)

# Set wins row to be integer valued
TABLE13.loc['Wins1 (All)'] = TABLE13.loc['Wins1 (All)'].apply(int_format)
# TABLE13.loc['Wins7 (All)'] = TABLE13.loc['Wins7 (All)'].apply(int_format)
# TABLE13.loc['Wins1 (6 trees)'] = TABLE13.loc['Wins1 (6 trees)'].apply(int_format)
# TABLE13.loc['Wins7 (6 trees)'] = TABLE13.loc['Wins7 (6 trees)'].apply(int_format)
# TABLE13.iloc[len(TABLE13)-1] = TABLE13.iloc[len(TABLE13)-1].apply(int_format)

# Move instance names into a column
TABLE13.reset_index(inplace=True, col_level=1)

# Store indices of rows of 6-tree instances
six_trees_instances = list(all6_instances_dict.keys())
# mask = TABLE13[('','Instance')].isin(six_trees_instances)
# six_trees_indices = TABLE13.loc[mask, :].index.tolist()

# Remove presolved from name and escape
TABLE13[('',"Instance")] = TABLE13[('',"Instance")].apply(remove_presolved_from_name)
TABLE13[('',"Instance")] = TABLE13[('',"Instance")].apply(tex_escape)

# If we are not using the automatic tex-escaper, we need to do it ourselves
for col in TABLE13.columns:
    # curr_col = '{' + tex_escape(col) + '}'
    curr_col = tex_escape(str(col))
    TABLE13.rename({col: curr_col}, inplace=True, axis=1)

# Finally, apply the desired style
    # format(formatter = int_format).\
table13_str = TABLE13.style.\
    hide(axis=0).\
    to_latex(
        column_format="""
	@{}l % instance
	*{1}{S[table-format=4.0,table-auto-round,table-number-alignment=center]} % # cuts
	*{2}{S[table-format=4.2,table-auto-round]} % Gur1, Gur7
	*{2}{H} % V, Total
	*{2}{S[table-format=4.2,table-auto-round]} % V7, Total7
	*{3}{S[table-format=8.0,table-auto-round,table-number-alignment=center]} % Nodes
	@{}
        """,
        hrules = True,
        sparse_index = True,
        multirow_align = "c",
        siunitx=True,
        convert_css = True,
        environment = "table",
        position_float = "centering",
        label = "app:tab:bb",
        caption = """
            Time (in seconds) and number nodes taken to solve each instance.
            The table is sorted by column 4 (``V'' under ``Time (s)'').
            ``Gur1'' indicates \Gurobi{} run with one random seed.
            ``Gur7'' indicates the minimum from seven runs of \Gurobi{} with different random seeds.
        """,
        )

# Adjustbox environment sets width to pagewidth
# table13_str = add_adjustbox_environment(table13_str)

# Set default siunitx options for this table
table13_str = add_sisetup(table13_str)

# Add a midrule between the instances and 3 summary rows; the "6" is hand-coded but can be automated
table13_str = add_midrule(table13_str, -6)
table13_str = add_midrule(table13_str, -10)

# Add color to six tree instances: a LaTeX line is prefixed with \rowcolor when
# its first '&'-separated cell, plus '_presolved', is in six_trees_instances
splitlines = table13_str.splitlines()
for i in range(len(splitlines)):
    line = splitlines[i]
    curr_line = line.split('&')
    if len(curr_line) > 0 and curr_line[0].strip()+'_presolved' in six_trees_instances:
        splitlines[i] = '\\rowcolor{lightgray!30} ' + line
# Re-join the lines and blank out any literal 'NaN' text
table13_str = '\n'.join(splitlines).replace('NaN', '')

print(table13_str)

# OLD Section 3: Time tables

## `time_df`: Create subset of dataframe relevant to time

In [None]:
%%script false --no-raise-error
# NOTE: this cell is disabled. The `%%script false` magic above hands the cell
# body to the `false` program instead of running it as Python, so nothing
# below executes.

## Create subset of dataframe relevant to time
# Keeps all rows of `df` and only the columns listed below.
time_df = df.loc[:,
                [
                    'NUM DISJ TERMS',
                    'ROWS',
                    'COLS',
                    'LP OBJ',
                    'IP OBJ',
                    'FIRST REF OBJ',
                    'AVG REF OBJ',
                    'BEST REF OBJ',
                    'FIRST REF+V OBJ',
                    'AVG REF+V OBJ',
                    'FIRST REF BOUND',
                    'AVG REF BOUND',
                    'BEST REF BOUND',
                    'FIRST REF+V BOUND',
                    'AVG REF+V BOUND',
                    'FIRST REF ITERS',
                    'AVG REF ITERS',
                    'BEST REF ITERS',
                    'FIRST REF+V ITERS',
                    'AVG REF+V ITERS',
                    'FIRST REF NODES',
                    'AVG REF NODES',
                    'BEST REF NODES',
                    'FIRST REF+V NODES',
                    'AVG REF+V NODES',
                    'FIRST REF TIME',
                    'AVG REF TIME',
                    'BEST REF TIME',
                    'FIRST REF+V TIME',
                    'AVG REF+V TIME',
                    'VPC_GEN_TIME',
                    'NUM GMIC',
                    'NUM VPC',
                    'NUM OBJ',
                    'ALL REF TIME',
                    'ALL REF+V TIME',
                    'ExitReason']
               ]
#display(time_df.loc[("bm23_presolved",2)])

## Prepare short/long column names for time dfs
1. First run of Gurobi without VPCs
2. Best among 7 runs of Gurobi without VPCs
3. First run of Gurobi with VPCs for each disjunction size
4. First run of Gurobi with VPCs for each disjunction size, adding cut generation time
5. Best run across first Gurobi without VPCs and first Gurobi with VPCs (across all terms)

In [None]:
%%script false --no-raise-error

# Column-name stubs for the solver runs being compared.
# NOTE(review): the baseline stub is 'AVG REF' although the surrounding text
# describes it as the "first run" of Gurobi -- confirm which statistic is intended.
gur1_col_stub = 'AVG REF'
gur7_col_stub = 'BEST REF'
gur1v_col_stub = gur1_col_stub + '+V'                  # baseline stub with VPCs
gur1v_w_cut_col_stub = gur1v_col_stub + ' W/CUTGEN'    # ... plus cut generation

# Time / node columns for each kind of run
gur1time_col, gur1nodes_col = gur1_col_stub + ' TIME', gur1_col_stub + ' NODES'      # without VPCs
gur7time_col, gur7nodes_col = gur7_col_stub + ' TIME', gur7_col_stub + ' NODES'      # "best" reference run
gur1vtime_col, gur1vnodes_col = gur1v_col_stub + ' TIME', gur1v_col_stub + ' NODES'  # with VPCs, per disj size

# Solve time with VPCs, with the cut generation time included
gur1v_w_cut_time_col = gur1v_w_cut_col_stub + ' TIME'

# Columns recording which disjunction produced the value stored in the 0-row
gurv_disj_col = gur1v_col_stub + ' DISJ'
gurv_w_cut_disj_col = gur1v_w_cut_col_stub + ' DISJ'

# Best value across the run without VPCs and the runs with VPCs
mintime_col = 'MIN BB TIME'
mintime_w_cut_col = 'MIN BB W/CUTGEN TIME'
mintime_disj_col = 'MIN BB TIME DISJ'
minnodes_col = 'MIN BB NODES'

# Long (dataframe) column names and their short table headers, position by position.
# The 'V7' / 'Total7' entries are already included here, so no separate step
# is needed to add the min-time columns to the maps.
time_cols_long = [
    gur1time_col,
    gur7time_col,
    gur1vtime_col,
    gur1v_w_cut_time_col,
    mintime_col,
    mintime_w_cut_col,
]
time_cols_short = ['Gur1', 'Gur7', 'V', 'Total', 'V7', 'Total7']

node_cols_long = [
    gur1nodes_col,
    gur7nodes_col,
    gur1vnodes_col,
    minnodes_col,
]
node_cols_short = ['Gur1', 'Gur7', 'V', 'V7']

# Lookup tables in both directions (long -> short and short -> long)
map_cols_to_short_time = dict(zip(time_cols_long, time_cols_short))
map_cols_to_short_nodes = dict(zip(node_cols_long, node_cols_short))

map_short_to_cols_time = dict(zip(time_cols_short, time_cols_long))
map_short_to_cols_nodes = dict(zip(node_cols_short, node_cols_long))
# display(time_cols, node_cols)

## Add total time for running solver + generating cuts

In [None]:
%%script false --no-raise-error

# Total time = solver time with VPCs + time spent generating the cuts
solve_time_with_vpcs = time_df[gur1vtime_col]
cut_generation_time = time_df[col_vpc_gen_time]
time_df[gur1v_w_cut_time_col] = solve_time_with_vpcs.add(cut_generation_time)

# Spot-check a single instance
display(time_df.loc['bm23_presolved'])

## `selected_time_df`: Solving and cut-generation time for instances selected for time reporting; 0-row with min values across all rows

In [None]:
%%script false --no-raise-error

## Solving and cut-generation time for instances selected for time reporting
# Keep only the rows of time_df whose instance is a key of selected_time_instances_dict.
selected_time_df = time_df.loc[selected_time_instances_dict.keys()]
# Drop index-level entries that no longer occur, so that index.levels[0] below
# lists only the selected instances.
selected_time_df.index = selected_time_df.index.remove_unused_levels()
# Placeholder value; the 0-row entry of each instance is overwritten in the loop below.
selected_time_df[minnodes_col] = 0

## Fill in 0-row with min values across all rows
## Also fill in gur1 values (present only in 0 row currently) for all disj terms
# "0-row" = the row indexed (inst, 0), i.e. second index level equal to 0.
comparison_time_cols = [gur1vtime_col, gur1v_w_cut_time_col]
comparison_node_cols = [gur1vnodes_col]
# NOTE: cols_to_display is only referenced by the commented-out display call at the end of this cell.
cols_to_display = [col_num_vpc]+[gur1time_col,gur1vtime_col]+[gur1nodes_col,gur1vnodes_col]+[mintime_col,mintime_w_cut_col,minnodes_col,gurv_disj_col,gurv_w_cut_disj_col,mintime_disj_col]
inst_set = selected_time_df.index.levels[0]
# tmp_inst = '23588_presolved'
# inst_set = ['10teams_presolved',tmp_inst]
for i, inst in enumerate(inst_set):
    # Progress counter, rewritten in place on a single output line
    print("{}/{}".format(i+1,len(inst_set)), end='\r', flush=True)
    curr_df = selected_time_df.loc[inst].copy() # copy needed to not throw SettingWithCopyWarning
    
    # Select only the rows in which VPCs were generated
    curr_df_with_vpcs = curr_df[curr_df[col_num_vpc] > 0]
    
    # display(inst)
    # display(curr_df_with_vpcs[[col_num_vpc]+[gur1time_col,gur1vtime_col]])

    # Set 0-row to have min time values across all (non-0-vpc) rows for this instance.
    # idxmin gives, for each comparison column, the index label of the row holding its minimum.
    # NOTE(review): if an instance has no row with VPCs, curr_df_with_vpcs is empty and
    # idxmin presumably fails -- confirm every selected instance has at least one such row.
    # best_vals = curr_df_with_vpcs[comparison_time_cols].min()
    # selected_time_df.loc[(inst,0),comparison_time_cols] = best_vals
    best_vals_idx = curr_df_with_vpcs[comparison_time_cols].idxmin()
    for curr_col, curr_disj_id in zip(comparison_time_cols, best_vals_idx):
        selected_time_df.at[(inst,0),curr_col] = curr_df_with_vpcs.at[curr_disj_id, curr_col]
    
    # display(best_vals_idx)
    # print("selected_time_df.at[('{}',0),gur1vtime_col] = {}".format(inst,selected_time_df.at[(inst,0),gur1vtime_col]))
    # display(selected_time_df[[col_num_vpc]+[gur1time_col,gur1vtime_col]].head(14))
    # print("selected_time_df.at[('{}',0),gur1vtime_col] = {}".format(inst,selected_time_df.at[(inst,0),gur1vtime_col]))


    # Also add id of the best disj to the 0-row
    # (iloc[0] belongs to gur1vtime_col and iloc[1] to gur1v_w_cut_time_col,
    #  following the order of comparison_time_cols)
    selected_time_df.at[(inst,0),gurv_disj_col]       = int(best_vals_idx.iloc[0])
    selected_time_df.at[(inst,0),gurv_w_cut_disj_col] = int(best_vals_idx.iloc[1])

    # Update 0-row of mintime (V7) entries.
    # The two "v" values read here are the minima written into the 0-row just above.
    curr_gur1time       = selected_time_df.at[(inst,0),gur1time_col]
    curr_gur1vtime      = selected_time_df.at[(inst,0),gur1vtime_col]
    curr_gur1vcuts_time = selected_time_df.at[(inst,0),gur1v_w_cut_time_col]
    
    curr_vals = [curr_gur1time, curr_gur1vtime]
    # np.argmin returns the first minimum, so a tie is resolved in favour of index 0 (no cuts)
    min_id = np.argmin(curr_vals)

    # If min_id is 0, then no cuts are used and we report the gur1 time
    # If min_id is 1, then gur1v < gur1 and we can report the number of cuts used
    selected_time_df.at[(inst,0),mintime_col] = curr_vals[min_id]

    # Add num cuts from mintime disj into num vpc col.
    # When best_disj_size is 0 this reads the 0-row's own value and writes it back unchanged.
    best_disj_size = 0 if min_id == 0 else best_vals_idx.iloc[0]
    selected_time_df.at[(inst,0),mintime_disj_col] = best_disj_size
    best_num_cuts = selected_time_df.at[(inst,best_disj_size),col_num_vpc]
    selected_time_df.at[(inst,0),col_num_vpc] = best_num_cuts

    # Update with cuts into Total7 column: the smaller of the time without VPCs
    # and the best time with VPCs including cut generation
    curr_vals = [curr_gur1time, curr_gur1vcuts_time]
    selected_time_df.at[(inst,0),mintime_w_cut_col] = min(curr_vals)

    # Repeat for nodes: the minimum node count over the rows with VPCs goes into the 0-row.
    # This minimum is taken independently of the time minimum above, so it may come
    # from a different row.
    best_vals = curr_df_with_vpcs[comparison_node_cols].min()
    selected_time_df.loc[(inst,0),comparison_node_cols] = best_vals
    # selected_time_df.at[(inst,0),minnodes_col] = int(selected_time_df.loc[(inst,0),[gur1nodes_col,gur1vnodes_col]].min())

    curr_gur1nodes       = selected_time_df.at[(inst,0),gur1nodes_col]
    curr_gur1vnodes      = selected_time_df.at[(inst,0),gur1vnodes_col]
    curr_vals = [curr_gur1nodes, curr_gur1vnodes]
    min_id = np.argmin(curr_vals)
    selected_time_df.at[(inst,0),minnodes_col] = int(curr_vals[min_id])

    # Propagate the 0-row values of the gur1 columns down to every row of this instance
    selected_time_df.loc[inst, gur1time_col] = curr_gur1time
    selected_time_df.loc[inst, gur1nodes_col] = curr_gur1nodes

    # NOTE(review): unresolved observation from the author -- the display call below was seen to
    # zero out selected_time_df.loc[[(inst,0)]][gur1vtime_col]; the cause was not identified.
    # display(selected_time_df.loc[(inst,0),[gur1nodes_col,gur1vnodes_col]])

    ## Earlier idxmin-based variant for nodes, kept for reference
    # best_vals_idx = curr_df_with_vpcs[comparison_node_cols].idxmin()
    # for curr_col, curr_disj_id in zip(comparison_node_cols, best_vals_idx):
    #     selected_time_df.at[(inst,0),curr_col] = curr_df_with_vpcs.loc[curr_disj_id, curr_col]

    # # Also add id of the best disj to the 0-row
    # selected_time_df.at[(inst,0),gurv_disj_col + ' (NODES)'] = int(best_vals_idx[0])
    # selected_time_df.at[(inst,0),gurv_w_cut_disj_col+ ' (NODES)'] = int(best_vals_idx[1])

    # (A further commented-out draft that propagated gap columns of selected_gap_df
    #  used to follow here; it did not touch the time/node columns handled in this cell.)

# Alternative: add minimum time when using cuts and when not using cuts, row by row
# selected_time_df[mintime_col] = selected_time_df[[gur1time_col, gur1vtime_col]].min(axis=1)
# selected_time_df[mintime_w_cut_col] = selected_time_df[[gur1time_col, gur1v_w_cut_time_col]].min(axis=1)
# selected_time_df[minnodes_col] = selected_time_df[[gur1nodes_col,gur1vnodes_col]].min(axis=1)

# Show the first 35 rows with the columns touched above
display(selected_time_df.head(35).loc[:,[col_num_vpc]+[gur1time_col,gur1vtime_col]+[gur1nodes_col,gur1vnodes_col]+[mintime_col,mintime_w_cut_col,minnodes_col,gurv_disj_col,gurv_w_cut_disj_col,mintime_disj_col]])
# display(selected_time_df.loc['10teams_presolved',[col_num_vpc]+[gur1time_col,gur1vtime_col]+[gur1nodes_col,gur1vnodes_col]+[mintime_col,mintime_w_cut_col,minnodes_col,gurv_disj_col,gurv_w_cut_disj_col,mintime_disj_col]])
# display(selected_time_df.loc[inst_set,cols_to_display])

In [None]:
# ### DEBUGGING that first ref+v time gets zeroed out for some reason?
# tmp_df = selected_time_df[[col_num_vpcs]+[gur1time_col,gur1vtime_col]].head(14).copy(deep=True)
# display(tmp_df)

# print(tmp_df.loc[('23588_presolved',0),gur1vtime_col])
# display(tmp_df.loc[[('23588_presolved',0)]][gur1vtime_col])

# tmp_df = selected_time_df
# print(tmp_df.loc[('23588_presolved',0),gur1vtime_col])
# display(tmp_df.loc[[('23588_presolved',0)]][gur1vtime_col])
# display(tmp_df.loc['23588_presolved'])

## Table 3: `avg_bb_df`: average time/nodes taken

### Prepare variables for row/col names

In [None]:
%%script false --no-raise-error

## Prepare variables for row/col names

# Instance classes reported in the table
bb_classes = ['All', '6 trees', 'Binary']
num_bb_classes = len(bb_classes)

# Time buckets: one lower bound per bucket, all sharing the same upper bound
bucket_min = [0, 10, 100, 1000]
bucket_max = [3600] * len(bucket_min)
num_buckets = len(bucket_min)
assert(len(bucket_max) == num_buckets)

# Bucket labels of the form '[min,max)'
bb_buckets = ['[' + str(lo) + ',' + str(hi) + ')' for lo, hi in zip(bucket_min, bucket_max)]

# Statistics reported per (class, bucket)
bb_metrics = ['Gmean', 'Wins1', 'Wins7']

# Top-level column headers
time_col_header = 'Time (s)'
node_col_header = 'Nodes (#)'

### Set up empty `avg_bb_df`

In [None]:
%%script false --no-raise-error

## Prepare avg_bb_df

# Two-level columns: criterion (time / nodes) on top, short column name below
criterion_level = [time_col_header]*len(time_cols_short) + [node_col_header]*len(node_cols_short)
type_level = time_cols_short + node_cols_short
avg_bb_cols = pd.MultiIndex.from_arrays(
    [criterion_level, type_level],
    names = ['criterion', 'type'])

# Three-level rows: every (class, bucket, metric) combination
bb_row_names = pd.MultiIndex.from_product(
    [bb_classes, bb_buckets, bb_metrics],
    names=['class', 'bucket', 'metric'])

# Empty float table; the cells are filled in by later cells of the notebook
avg_bb_df = pd.DataFrame(
    index = bb_row_names,
    columns = avg_bb_cols,
    dtype = float
)

# Sanity checks: the node columns of the first rows, and the first metric of the first class
is_node_column = avg_bb_cols.get_level_values(0)==node_col_header
display(avg_bb_df.loc[:,is_node_column].head(6))
display(avg_bb_df.loc[(bb_classes[0], bb_buckets, bb_metrics[0]),:])

### `avg_bb_df`: shifted geometric mean of time taken across instances, in various buckets, and geomean of nodes too

In [None]:
%%script false --no-raise-error

## Create gmean_df
#   = shifted geometric mean of time taken across instances, in various buckets
#     and geomean of nodes too
#
# Shifted geometric mean of x with shift s: geometric_mean(x + s) - s.
# statistics.geometric_mean needs Python 3.8+; before that a hand-written
# version such as np.exp(np.log(x).mean()) was used.
# NOTE(review): statistics.geometric_mean raises StatisticsError on empty data,
# so a bucket without any instance aborts this cell.
from statistics import geometric_mean
SHIFT_TIME  = 60
SHIFT_NODES = 1000

# Number of instances behind every row of avg_bb_df, in row order
num_inst = np.zeros(len(avg_bb_df),dtype = np.int64)
row_ind = 0

# Short (table) and long (dataframe) names of the columns being averaged
shortcols_time = time_cols_short
cols_time = [map_short_to_cols_time[shortcol] for shortcol in shortcols_time]
shortcols_nodes = node_cols_short
cols_nodes = [map_short_to_cols_nodes[shortcol] for shortcol in shortcols_nodes]

cols = cols_time + cols_nodes
shortcols = shortcols_time + shortcols_nodes

# Pure binary instances that are also part of the time-reporting set
binary_x_time_instances = [inst for inst in pure_binary_instances if inst in selected_time_instances_dict]

# One row selector of selected_time_df per class, in the order of bb_classes.
# Rows of avg_bb_df are ordered class -> bucket -> metric, which is the order
# in which row_ind advances below.
class_row_selectors = [
    (bb_classes[0], slice(None)),                  # "All": every selected instance
    (bb_classes[1], all6_instances_dict.keys()),   # "6 trees"
    (bb_classes[2], binary_x_time_instances),      # "Binary"
]

for class_name, row_selector in class_row_selectors:
    curr_df = selected_time_df.loc[row_selector, cols]
    curr_df = curr_df[curr_df.index.get_level_values(1) == 0] # take only "best" values

    for i in range(num_buckets):
        # The filter is cumulative: each bucket keeps the rows of the previous
        # bucket whose reference time exceeds the next (larger) lower bound.
        curr_df = curr_df[curr_df[gur1time_col] > bucket_min[i]]

        gmean_row = (class_name, bb_buckets[i], bb_metrics[0])
        avg_bb_df.loc[gmean_row,(time_col_header,shortcols_time)] = \
            [geometric_mean(curr_df[col] + SHIFT_TIME) - SHIFT_TIME for col in cols_time]
        avg_bb_df.loc[gmean_row,(node_col_header,shortcols_nodes)] = \
            [geometric_mean(curr_df[col] + SHIFT_NODES) - SHIFT_NODES for col in cols_nodes]

        print("row {:d}: {:d}".format(row_ind,len(curr_df)))

        # The same instance count applies to all metric rows of this (class, bucket)
        num_inst[row_ind:row_ind+len(bb_metrics)] = len(curr_df)
        row_ind += len(bb_metrics)

# Store the counts once, after all classes have been processed
avg_bb_df[inst_col_name] = num_inst

display(avg_bb_df.loc[(bb_classes, bb_buckets, bb_metrics[0]),:])

### Update wins1 rows

In [None]:
%%script false --no-raise-error

## Update wins1 rows
# A time win is counted when the reference ("Gur1") time is more than 1.1 times
# the compared column's time, to account for some variability in runtimes.
# A node win is counted when the reference ("Gur1") node count is strictly higher.

# Make all columns "object" type to allow for integer values
for header, shortcol_list in ((time_col_header, shortcols_time), (node_col_header, shortcols_nodes)):
    avg_bb_df.loc[:,(header,shortcol_list)] = avg_bb_df.loc[:,(header,shortcol_list)].astype(object)

# Row selector of selected_time_df for each class: all, "6 trees", binary
class_row_selectors = (
    (bb_classes[0], slice(None)),
    (bb_classes[1], all6_instances_dict.keys()),
    (bb_classes[2], binary_x_time_instances),
)

for class_name, row_selector in class_row_selectors:
    curr_df = selected_time_df.loc[row_selector, cols_time + cols_nodes]
    curr_df = curr_df[curr_df.index.get_level_values(1) == 0] # take only best values

    for i in range(num_buckets):
        # Cumulative filter: keep rows whose reference time exceeds this bucket's lower bound
        curr_df = curr_df[curr_df[gur1time_col] > bucket_min[i]]
        wins_row = (class_name, bb_buckets[i], bb_metrics[1])

        # Time wins against the Gur1 reference
        refcol = gur1time_col
        avg_bb_df.loc[wins_row,(time_col_header,shortcols_time)] = \
            [ int((curr_df[refcol] > 1.1*curr_df[col]).sum()) for col in cols_time ]

        # Node wins against the Gur1 reference
        refcol = gur1nodes_col
        avg_bb_df.loc[wins_row,(node_col_header,shortcols_nodes)] = \
            [ int((curr_df[refcol] > curr_df[col]).sum()) for col in cols_nodes ]

### Update wins7 rows

In [None]:
%%script false --no-raise-error

## Update wins7 rows
# Same counting as for the wins1 rows, but measured against the "Gur7" reference
# columns: a time win needs the reference time to be more than 1.1 times the
# compared column's time; a node win needs a strictly higher reference node count.
# Buckets are still defined by the Gur1 time column.

# Row selector of selected_time_df for each class: all, "6 trees", binary
class_row_selectors = (
    (bb_classes[0], slice(None)),
    (bb_classes[1], all6_instances_dict.keys()),
    (bb_classes[2], binary_x_time_instances),
)

for class_name, row_selector in class_row_selectors:
    curr_df = selected_time_df.loc[row_selector, cols_time + cols_nodes]
    curr_df = curr_df[curr_df.index.get_level_values(1) == 0] # take only best values

    for i in range(num_buckets):
        # Cumulative filter: keep rows whose Gur1 time exceeds this bucket's lower bound
        curr_df = curr_df[curr_df[gur1time_col] > bucket_min[i]]
        wins_row = (class_name, bb_buckets[i], bb_metrics[2])

        # Time wins against the Gur7 reference
        refcol = gur7time_col
        avg_bb_df.loc[wins_row,(time_col_header,shortcols_time)] = \
            [ int((curr_df[refcol] > 1.1*curr_df[col]).sum()) for col in cols_time ]

        # Node wins against the Gur7 reference
        refcol = gur7nodes_col
        avg_bb_df.loc[wins_row,(node_col_header,shortcols_nodes)] = \
            [ int((curr_df[refcol] > curr_df[col]).sum()) for col in cols_nodes ]

In [None]:
%%script false --no-raise-error

# display(avg_bb_df.loc[:,cols.get_level_values(0)=='Nodes'].head(6))
# Show all buckets and all three metrics for the first two classes only
# ('All' and '6 trees'); the third class ('Binary') is not displayed here.
display(avg_bb_df.loc[(bb_classes[0:2], bb_buckets, bb_metrics[0:3]),:])

## Table 6: `all_bb_results_df`: all time/nodes results

In [None]:
%%script false --no-raise-error

# Build the per-instance results table: rows/cols, number of cuts, times and nodes,
# followed by the summary rows taken from avg_bb_df.

# Instances = first index level of selected_time_df; the level is named "Instance" in place.
# NOTE(review): set_names(inplace=True) mutates this Index object -- confirm whether
# that is meant to affect selected_time_df's own index as well.
inst_set = selected_time_df.index.levels[0]
inst_set.set_names("Instance",inplace=True)
numcuts_col_header = '# cuts'

# Two-level columns: ('', Rows), ('', Cols), ('# cuts', <short name of the VPC time column>),
# then one column per short time name and one per short node name.
col_idx = pd.MultiIndex.from_arrays(
    [
        ['', '', numcuts_col_header] + [time_col_header]*len(time_cols_short) + [node_col_header]*len(node_cols_short),
        ['Rows', 'Cols', map_cols_to_short_time[gur1vtime_col]] + time_cols_short + node_cols_short
    ],
)

# Empty object-typed table, one row per instance
all_bb_results_df = pd.DataFrame(
    columns = col_idx,
    index = inst_set,
    dtype = object,
)

# Enter number of rows and cols (taken from the rows of df with disj_terms == 0)
tmp_df = df.xs(0, level='disj_terms').loc[inst_set,['ROWS','COLS']]
tmp_df.columns = pd.MultiIndex.from_product([[''],['Rows','Cols']])
all_bb_results_df.loc[:,tmp_df.columns] = tmp_df

# Enter number of cuts (0-row value of 'NUM VPC')
# NOTE(review): the single-bracket ['NUM VPC'] selection yields a Series, so `.columns`
# below is only an attribute set on that Series; the time/node blocks further down use a
# list of columns (a DataFrame) instead. Confirm the assignment aligns as intended.
# tmp_df = selected_time_df.loc[(inst_set,0), ['NUM VPC']]
tmp_df = selected_time_df.xs(0, level='disj_terms')['NUM VPC']
tmp_df.columns = pd.MultiIndex.from_product([[numcuts_col_header],[map_cols_to_short_time[gur1vtime_col]]])
all_bb_results_df.loc[:,tmp_df.columns] = tmp_df

# Enter time (0-row values, relabelled with the short time names)
tmp_df = selected_time_df.xs(0, level='disj_terms')[time_cols_long]
tmp_df.columns = pd.MultiIndex.from_product([[time_col_header],time_cols_short])
all_bb_results_df.loc[:,tmp_df.columns] = tmp_df

# Enter nodes (0-row values, relabelled with the short node names)
tmp_df = selected_time_df.xs(0, level='disj_terms')[node_cols_long]
tmp_df.columns = pd.MultiIndex.from_product([[node_col_header],node_cols_short])
all_bb_results_df.loc[:,tmp_df.columns] = tmp_df

# Sort instances by the time column whose short name is mapped from mintime_col
all_bb_results_df = all_bb_results_df.sort_values(by=[(time_col_header, map_cols_to_short_time[mintime_col])])

# Add average + wins rows: the metric rows of avg_bb_df for the first class and first
# bucket, without the instance-count column.
# Replace missing entries with empty string
# NOTE(review): the `downcast` keyword of fillna is deprecated in recent pandas
# releases -- confirm against the installed version.
tmp_df = avg_bb_df.xs((bb_classes[0],bb_buckets[0])).copy(deep=True)
tmp_df.drop(inst_col_name, axis=1, level=0, inplace=True)
all_bb_results_df = pd.concat([all_bb_results_df, tmp_df]).fillna('',downcast=False)

# Remove unnecessary entries: blank the Gur1 cells of the 'Wins1' row ...
all_bb_results_df.loc['Wins1',[
        (time_col_header,map_cols_to_short_time[gur1time_col]),
        (node_col_header,map_cols_to_short_nodes[gur1nodes_col])
    ]] = ""
# all_bb_results_df.loc['Wins1',([time_col_header,node_col_header],'Gur1')] = ""
# ... and the Gur1 and Gur7 cells of the 'Wins7' row
all_bb_results_df.loc['Wins7',[
        (time_col_header,map_cols_to_short_time[gur1time_col]),
        (time_col_header,map_cols_to_short_time[gur7time_col]),
        (node_col_header,map_cols_to_short_nodes[gur1nodes_col]),
        (node_col_header,map_cols_to_short_nodes[gur7nodes_col]),
    ]] = ""
# all_bb_results_df.loc['Wins7',([time_col_header,node_col_header],['Gur1','Gur7'])] = ""
# all_bb_results_df = all_bb_results_df.fillna('',downcast=False)

# Convert rows, cols, # cuts to int values (instance rows only; the summary rows hold '')
tmp_cols = pd.MultiIndex.from_product([[''],['Rows','Cols']])
all_bb_results_df.loc[inst_set,tmp_cols] = all_bb_results_df.loc[inst_set,tmp_cols].astype(np.int64)
tmp_cols = pd.MultiIndex.from_product([[numcuts_col_header],[map_cols_to_short_time[gur1vtime_col]]])
all_bb_results_df.loc[inst_set,tmp_cols] = all_bb_results_df.loc[inst_set,tmp_cols].astype(np.int64)

# Name the row index "Instance" again on the concatenated table
all_bb_results_df.index.set_names("Instance",inplace=True)

display(all_bb_results_df.head(15))
display(all_bb_results_df.tail(10))

## Table 7: ``all6_bb_results_df``: 6-trees time/nodes results

In [None]:
%%script false --no-raise-error

# Restrict the full results table to the instances in all6_instances_dict,
# sorted by the time column whose short name is mapped from mintime_col.
inst_set = all6_instances_dict.keys()
mintime_sort_key = (time_col_header, map_cols_to_short_time[mintime_col])
all6_bb_results_df = all_bb_results_df.loc[inst_set].sort_values(by=[mintime_sort_key])

# Add average + wins rows: the metric rows of avg_bb_df for the second class and
# first bucket, without the instance-count column.
# Replace missing entries with empty string
tmp_df = avg_bb_df.xs((bb_classes[1],bb_buckets[0])).copy(deep=True)
tmp_df = tmp_df.drop(inst_col_name, axis=1, level=0)
all6_bb_results_df = pd.concat([all6_bb_results_df, tmp_df]).fillna('',downcast=False)

# Remove unnecessary entries: the reference columns of the wins rows
gur1_time_cell = (time_col_header, map_cols_to_short_time[gur1time_col])
gur7_time_cell = (time_col_header, map_cols_to_short_time[gur7time_col])
gur1_node_cell = (node_col_header, map_cols_to_short_nodes[gur1nodes_col])
gur7_node_cell = (node_col_header, map_cols_to_short_nodes[gur7nodes_col])
all6_bb_results_df.loc['Wins1', [gur1_time_cell, gur1_node_cell]] = ""
all6_bb_results_df.loc['Wins7', [gur1_time_cell, gur7_time_cell, gur1_node_cell, gur7_node_cell]] = ""

# Convert rows, cols, # cuts to int values (instance rows only)
int_col_groups = (
    pd.MultiIndex.from_product([[''],['Rows','Cols']]),
    pd.MultiIndex.from_product([[numcuts_col_header],[map_cols_to_short_time[gur1vtime_col]]]),
)
for tmp_cols in int_col_groups:
    all6_bb_results_df.loc[inst_set,tmp_cols] = all6_bb_results_df.loc[inst_set,tmp_cols].astype(np.int64)

# Name the row index "Instance" again on the concatenated table
all6_bb_results_df.index.set_names("Instance",inplace=True)

display(all6_bb_results_df.head(15))
display(all6_bb_results_df.tail(10))

## Table 8: `avg_bb_by_depth_df`: average time/nodes by depth for all-six set

In [None]:
%%script false --no-raise-error

## Prepare avg_bb_by_depth_df
## Prepare variables for row/col names
inst_set = all6_instances_dict.keys()

bb_classes_by_depth = [str(t) + ' leaves' for t in sizes]
num_bb_classes_by_depth = len(bb_classes_by_depth)

bb_buckets_by_depth = bb_buckets
bb_metrics_by_depth = bb_metrics[0:2]

cols_time_by_depth       = [gur1time_col, gur1vtime_col, gur1v_w_cut_time_col]
shortcols_time_by_depth  = [map_cols_to_short_time[col] for col in cols_time_by_depth]
cols_nodes_by_depth      = [gur1nodes_col, gur1vnodes_col]
shortcols_nodes_by_depth = [map_cols_to_short_nodes[col] for col in cols_nodes_by_depth]

avg_bb_cols_by_depth = pd.MultiIndex.from_arrays(
    [[time_col_header]*len(shortcols_time_by_depth) + 
     [node_col_header]*len(shortcols_nodes_by_depth), 
     shortcols_time_by_depth + shortcols_nodes_by_depth],
    names = ['criterion', 'type'])

# bucket_min = [0, 10, 100, 1000]
# bucket_max = [3600, 3600, 3600, 3600]
# num_buckets = len(bucket_min)
# assert(len(bucket_max) == num_buckets)
# bb_buckets = ['[' + str(bucket_min[j]) + ',' + str(bucket_max[j]) + ')' for j in range(num_buckets)]
# # bucket_names = [classes[i] + ' [' + str(bucket_min[j]) + ',' + str(bucket_max[j]) + ')' for i in range(num_classes) for j in range(num_buckets)]
# # display(bucket_names)

# bb_metrics = ['Gmean', 'Wins1', 'Wins7']

# time_col_header = 'Time (s)'
# node_col_header = 'Nodes (\\#)'

#bb_row_names = pd.MultiIndex.from_product([bb_buckets, bb_row_names], names=['bucket', 'metric'])
# Row index: one row for every (class, bucket, metric) combination
row_levels_by_depth = [bb_classes_by_depth, bb_buckets_by_depth, bb_metrics_by_depth]
bb_row_names_by_depth = pd.MultiIndex.from_product(
    row_levels_by_depth, names=['class', 'bucket', 'metric'])

# Float table with those rows and the prepared columns; no data yet, so every cell is NaN
avg_bb_by_depth_df = pd.DataFrame(
    index=bb_row_names_by_depth, columns=avg_bb_cols_by_depth, dtype=float)

# Fill in values for Gur1 from avg_bb_df
# display(
#     avg_bb_df.loc[
#         (bb_classes[1], bb_buckets, bb_metrics[0:2]),
#         [(time_col_header,map_cols_to_short_time[gur1time_col]),
#         (node_col_header,map_cols_to_short_nodes[gur1nodes_col])]
#     ]
# )

# Make all columns "object" type so that integer win counts can sit next to float means.
# Cast the frame itself instead of assigning object-typed copies back through
# `.loc[:, cols] = ...`: that route changes the dtype only if pandas upcasts the float
# columns during the assignment, which newer pandas releases deprecate (PDEP-6).
# Every column present at this point is a time or node column, so one cast covers both groups.
avg_bb_by_depth_df = avg_bb_by_depth_df.astype(object)

## Fill avg_bb_by_depth_df, one (class, bucket) pair at a time
#   first metric row:  shifted geometric mean of each time column and each node column
#   second metric row: number of instances on which the "Gur1" baseline is beaten

cols = cols_time_by_depth + cols_nodes_by_depth
shortcols = shortcols_time_by_depth + shortcols_nodes_by_depth

# Instance count for every row of avg_bb_by_depth_df, written in the order the loops visit the rows
rows_per_bucket = len(bb_metrics_by_depth)
num_inst_by_depth = np.zeros(len(avg_bb_by_depth_df), dtype=np.int64)
row_ind = 0

# Restrict the timing data to the chosen instances and to the columns being summarized
curr_df = selected_time_df.loc[inst_set, cols]

# Column selectors reused for every row that gets filled in
time_targets_by_depth = (time_col_header, shortcols_time_by_depth)
node_targets_by_depth = (node_col_header, shortcols_nodes_by_depth)

for curr_size_ind, depth_class in enumerate(bb_classes_by_depth):
    # Rows whose second index level equals the tree size of this class
    size_mask = curr_df.index.get_level_values(1) == sizes[curr_size_ind]
    curr_by_depth_df = curr_df[size_mask]

    for i in range(num_buckets):
        # The filter is cumulative: each pass keeps, from the survivors of the previous
        # pass, the rows whose baseline time is strictly above this bucket's minimum
        above_min_mask = curr_by_depth_df[gur1time_col] > bucket_min[i]
        curr_by_depth_df = curr_by_depth_df[above_min_mask]

        bucket_label = bb_buckets_by_depth[i]
        gmean_row = (depth_class, bucket_label, bb_metrics_by_depth[0])
        wins_row = (depth_class, bucket_label, bb_metrics_by_depth[1])

        # Shifted geometric mean: add the shift, take the geometric mean, remove the shift
        time_gmeans = [
            geometric_mean(curr_by_depth_df[col] + SHIFT_TIME) - SHIFT_TIME
            for col in cols_time_by_depth
        ]
        avg_bb_by_depth_df.loc[gmean_row, time_targets_by_depth] = time_gmeans

        node_gmeans = [
            geometric_mean(curr_by_depth_df[col] + SHIFT_NODES) - SHIFT_NODES
            for col in cols_nodes_by_depth
        ]
        avg_bb_by_depth_df.loc[gmean_row, node_targets_by_depth] = node_gmeans

        # Every metric row of this (class, bucket) pair gets the same instance count
        next_row_ind = row_ind + rows_per_bucket
        num_inst_by_depth[row_ind:next_row_ind] = rows_per_bucket * [len(curr_by_depth_df)]
        row_ind = next_row_ind

        # Time wins: the baseline time is more than 10% larger than the column's time,
        # which leaves some slack for run-to-run variability
        refcol = gur1time_col
        time_wins = [
            int(sum(curr_by_depth_df[refcol] > 1.1 * curr_by_depth_df[col]))
            for col in cols_time_by_depth
        ]
        avg_bb_by_depth_df.loc[wins_row, time_targets_by_depth] = time_wins

        # Node wins: the baseline node count is strictly larger than the column's node count
        refcol = gur1nodes_col
        node_wins = [
            int(sum(curr_by_depth_df[refcol] > curr_by_depth_df[col]))
            for col in cols_nodes_by_depth
        ]
        avg_bb_by_depth_df.loc[wins_row, node_targets_by_depth] = node_wins

# Attach the per-row instance counts as an extra column
avg_bb_by_depth_df[inst_col_name] = num_inst_by_depth

# Show the table with rows ordered by class, bucket and metric
display(avg_bb_by_depth_df.loc[(bb_classes_by_depth, bb_buckets_by_depth, bb_metrics_by_depth), :])

## Results for selected instances

In [None]:
# Show the gap_df entries stored under the label 'bell5_presolved'
gap_df.loc['bell5_presolved']

In [None]:
# Show the df_preprocess rows for every index label containing 'fast'
fast_instance_labels = [inst for inst in df_preprocess.index if 'fast' in inst]
df_preprocess.loc[fast_instance_labels]