# Section 0: Set variables, import whatever is needed, and read in data

### Global variables; import data processing, plotting; export packages and functions

In [1]:
## Global variables
EPS = 1e-7  # numerical tolerance for floating-point comparisons

## Set up variables containing relevant directories
import os

repos_key = 'REPOS_DIR'
home_key = 'HOME'

# Both environment variables are required to build the paths below, so stop
# right away (naming exactly what is missing) instead of printing a message
# and then failing later with an unrelated NameError.
missing_keys = [key for key in (repos_key, home_key) if key not in os.environ]
if missing_keys:
    for key in missing_keys:
        print("*** ERROR: %s not found!" % key)
    raise KeyError(
        "Required environment variable(s) not set: %s" % ", ".join(missing_keys)
    )

REPOS_DIR = os.environ[repos_key]
print("REPOS_DIR set to \"%s\"." % REPOS_DIR)
HOME_DIR = os.environ[home_key]
print("HOME_DIR set to \"%s\"." % HOME_DIR)

# Directory layout: the vpc checkout lives under REPOS_DIR, saved results under HOME_DIR
VPC_DIR = REPOS_DIR + "/vpc/"
#RESULTS_DIR = VPC_DIR + "results/saved/"
RESULTS_DIR = HOME_DIR + '/' + "results/saved/"
DATA_DIR = VPC_DIR + "data/"

# Optional filters used when building the instance list
# (pure binary takes precedence over mixed binary if both are set)
ONLY_PURE_BINARY = False
ONLY_MIXED_BINARY = False

# Values of `disj_terms` (number of disjunctive terms) considered in the experiments
sizes = [2, 4, 8, 16, 32, 64]

REPOS_DIR set to "/Users/akazachkov/repos".
HOME_DIR set to "/Users/akazachkov".


In [2]:
## Import data processing, plotting, and export packages and functions
from IPython.display import display

from plots_helper import * # this includes matplotlib (+ params), pandas, and custom LaTeX helper functions

### `initialize_df`: common way to process each data frame that we need

In [3]:
## Common way to process each data frame that we need
def initialize_df(filename):
    """
    Load the CSV `filename` into a data frame indexed by (INSTANCE, disj_terms).

    The first line of the file is skipped and the second is used as the header.
    Rows are sorted by INSTANCE and then disj_terms before those two columns
    become the (multilevel) index. Cells holding the quoted markers '-inf' and
    'inf' are replaced by the corresponding floating-point infinities.
    """
    index_cols = ['INSTANCE', 'disj_terms']
    quoted_infinities = {"\'-inf\'": -np.inf, "\'inf\'": np.inf}
    raw = pd.read_csv(filename, sep=',', index_col=False, skiprows=1)
    return (
        raw
        .sort_values(by=index_cols)
        .set_index(index_cols)
        .replace(quoted_infinities)
    )

### `df_ipopt`: Retrieve best known IP objective values

In [4]:
## Best known IP objective values
# Read the table, index it by its first column (the column itself is kept),
# and keep only the first row for any instance that is listed more than once.
ip_obj_path = DATA_DIR + "ip_obj.csv"
df_ipopt = pd.read_csv(ip_obj_path)
ip_instance_col = df_ipopt.columns[0]
df_ipopt = (
    df_ipopt
    .set_index(df_ipopt[ip_instance_col])
    .rename(columns={'IP Objective': 'IP OBJ'})  # for consistency with other dfs
)
is_repeated_instance = df_ipopt.index.duplicated()
df_ipopt = df_ipopt[~is_repeated_instance]

# Sanity checks: first rows, plus one known instance
display(df_ipopt.head())
display(df_ipopt.loc['bm23_presolved', 'IP OBJ'])

Unnamed: 0_level_0,Instance,IP OBJ,Set
Instance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
22433,22433,21477,miplib2017
23588,23588,8090,miplib2017
10teams,10teams,924,miplib2017
2club200v15p5scn,2club200v15p5scn,-70,miplib2017
30_70_45_05_100,30_70_45_05_100,9,miplib2017


'34'

### `df_preprocess`: Results from preprocessing instances

In [5]:
## Results from preprocessing instances
# The first line of the file is skipped; the frame is indexed by its first
# column, which is also kept as a regular column.
preprocess_path = RESULTS_DIR + "vpc-preprocess.csv"
df_preprocess = pd.read_csv(preprocess_path, sep=',', index_col=False, skiprows=1)
preprocess_instance_col = df_preprocess.columns[0]
df_preprocess = df_preprocess.set_index(df_preprocess[preprocess_instance_col])

# Sanity checks: first rows, plus one known instance
display(df_preprocess.head())
display(df_preprocess.loc['bm23', 'CLEANED LP OBJ'])

Unnamed: 0_level_0,INSTANCE,STRATEGY,ORIG LP OBJ,CLEANED LP OBJ,ORIG FIRST GUR NODES,CLEANED FIRST GUR NODES,ORIG BEST GUR NODES,CLEANED BEST GUR NODES,ORIG FIRST GUR TIME,CLEANED FIRST GUR TIME,...,vpc_version,cbc_version,clp_version,gurobi_version,cplex_version,ExitReason,end_time_string,time elapsed,instname,Unnamed: 137
INSTANCE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
22433,22433,536,21240.526171,21240.526170798898,9,12,9,12,0.236,0.137,...,#e5b66ee,#d4272be,#8294096,10.02,22.1.1,SUCCESS,Thu Nov 9 16:36:45 2023,1,22433,DONE
23588,23588,536,7649.866134,7649.866133822502,1612,654,1612,654,1.463,0.807,...,#e5b66ee,#d4272be,#8294096,10.02,22.1.1,SUCCESS,Wed Nov 8 23:51:27 2023,3,23588,DONE
10teams,10teams,536,917.0,917.0000000000003,1,1,1,1,0.723,0.526,...,#e5b66ee,#d4272be,#8294096,10.02,22.1.1,SUCCESS,Wed Nov 8 22:25:16 2023,1,10teams,DONE
2club200v15p5scn,2club200v15p5scn,536,-121.222222,-120.07692307692302,231910,137774,231910,137774,7200.002,7200.001,...,#e5b66ee,#d4272be,#8294096,10.02,22.1.1,SUCCESS,Thu Nov 9 12:06:26 2023,14401,2club200v15p5scn,DONE
30_70_45_05_100,30_70_45_05_100,536,8.1,8.09999999998854,1,1,1,1,4.036,4.394,...,#e5b66ee,#d4272be,#8294096,10.02,22.1.1,SUCCESS,Wed Nov 8 23:26:24 2023,18,30_70_45_05_100,DONE


'20.57092176323557097817'

### `df`: Results from generating VPCs for various number of disjunctive terms

In [6]:
## Results from generating VPCs for various number of disjunctive terms
# Loaded through `initialize_df`, so `df` is indexed by (INSTANCE, disj_terms)
vpc_results_path = RESULTS_DIR + "vpc-bb0bb.csv"
df = initialize_df(vpc_results_path)
display(df.head())

Unnamed: 0_level_0,Unnamed: 1_level_0,cutlimit,gomory,mode,partial_bb_strategy,partial_bb_keep_pruned_nodes,partial_bb_num_strong,preprocess,prlp_flip_beta,rounds,bb_mode,...,vpc_version,cbc_version,clp_version,gurobi_version,cplex_version,ExitReason,end_time_string,time elapsed,instname,Unnamed: 291
INSTANCE,disj_terms,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
10teams_presolved,2,-1,-1,0,4,0,5,0,0,1,11,...,#6d36588,#3253e94,#5a54e9c,10.02,,SUCCESS,Sun Nov 12 19:03:42 2023,48,10teams_presolved,DONE
10teams_presolved,4,-1,-1,0,4,0,5,0,0,1,11,...,#6d36588,#3253e94,#5a54e9c,10.03,22.1.1,FAIL_LIMIT,Sat Nov 11 14:14:54 2023,41,10teams_presolved,DONE
10teams_presolved,8,-1,-1,0,4,0,5,0,0,1,11,...,#6d36588,#3253e94,#5a54e9c,10.02,,SUCCESS,Sun Nov 12 09:01:46 2023,694,10teams_presolved,DONE
10teams_presolved,16,-1,-1,0,4,0,5,0,0,1,11,...,#6d36588,#3253e94,#5a54e9c,10.02,,SUCCESS,Sat Nov 11 01:15:48 2023,2462,10teams_presolved,DONE
10teams_presolved,32,-1,-1,0,4,0,5,0,0,1,11,...,#6d36588,#3253e94,#5a54e9c,10.03,22.1.1,FAIL_LIMIT,Sat Nov 11 15:57:21 2023,3523,10teams_presolved,DONE


In [7]:
## Make sure the disjunctive bound columns are numeric
col_list = ["BEST DISJ OBJ", "WORST DISJ OBJ"]
df[col_list] = df[col_list].apply(pd.to_numeric)

## Expose the second index level (disj_terms) as a regular column
df['NUM DISJ TERMS'] = df.index.get_level_values('disj_terms')

## Pure binary instances: every column is binary ('BINARY' equals 'COLS')
df['IS PURE BINARY'] = df['BINARY'].eq(df['COLS'])

## Mixed binary instances: no general integer variables ('GEN INT' equals 0)
df['IS MIXED BINARY'] = df['GEN INT'].eq(0)

# Debugging aid (disabled): inspect names and dtypes of a window of columns
# start = 220
# end = start + 15
# print(df.columns[start:end])
# print(df.dtypes[start:end])

display(df.head())

Unnamed: 0_level_0,Unnamed: 1_level_0,cutlimit,gomory,mode,partial_bb_strategy,partial_bb_keep_pruned_nodes,partial_bb_num_strong,preprocess,prlp_flip_beta,rounds,bb_mode,...,clp_version,gurobi_version,cplex_version,ExitReason,end_time_string,time elapsed,instname,Unnamed: 291,IS PURE BINARY,IS MIXED BINARY
INSTANCE,disj_terms,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
10teams_presolved,2,-1,-1,0,4,0,5,0,0,1,11,...,#5a54e9c,10.02,,SUCCESS,Sun Nov 12 19:03:42 2023,48,10teams_presolved,DONE,True,True
10teams_presolved,4,-1,-1,0,4,0,5,0,0,1,11,...,#5a54e9c,10.03,22.1.1,FAIL_LIMIT,Sat Nov 11 14:14:54 2023,41,10teams_presolved,DONE,True,True
10teams_presolved,8,-1,-1,0,4,0,5,0,0,1,11,...,#5a54e9c,10.02,,SUCCESS,Sun Nov 12 09:01:46 2023,694,10teams_presolved,DONE,True,True
10teams_presolved,16,-1,-1,0,4,0,5,0,0,1,11,...,#5a54e9c,10.02,,SUCCESS,Sat Nov 11 01:15:48 2023,2462,10teams_presolved,DONE,True,True
10teams_presolved,32,-1,-1,0,4,0,5,0,0,1,11,...,#5a54e9c,10.03,22.1.1,FAIL_LIMIT,Sat Nov 11 15:57:21 2023,3523,10teams_presolved,DONE,True,True


### Rename `mas74`/`mas76` in `df_preprocess` and remove `stein*` instances (keep modified `stein*_nocard` instances)

In [8]:
# Remove unmodified stein instances from consideration
# (all rows of each listed instance are dropped, then the index levels are pruned)
unmodified_stein_instances = [
    'stein09_presolved',
    'stein15_presolved',
    'stein27_presolved',
    'stein45_presolved',
]
df = df.drop(index=unmodified_stein_instances)
df.index = df.index.remove_unused_levels()

# In df_preprocess, rename index `mas74` to `mas074` and `mas76` to `mas076`
mas_renames = {'mas74': 'mas074', 'mas76': 'mas076'}
df_preprocess = df_preprocess.rename(index=mas_renames)
# df.rename(index={'mas': 'mas074', 'mas76': 'mas076'}, inplace=True)

In [9]:
# Spot check for `misc02_presolved`, across all numbers of disjunctive terms.
# There used to be a mistake with the code for instances in which one root pass was performed
# I think this is now fixed after commit 4ed946c (2022-06-24)
#prefix_list = ["FIRST", "AVG", "BEST"]
misc02_check_cols = [
    "LP OBJ",
    "IP OBJ",
    "FIRST REF+V ROOT_PASSES",
    "FIRST REF+V BOUND",
    "AVG REF+V BOUND",
    "BEST REF+V BOUND",
    "FIRST REF+V FIRST_CUT_PASS",
    "FIRST REF+V LAST_CUT_PASS",
]
df.loc["misc02_presolved", misc02_check_cols]

Unnamed: 0_level_0,LP OBJ,IP OBJ,FIRST REF+V ROOT_PASSES,FIRST REF+V BOUND,AVG REF+V BOUND,BEST REF+V BOUND,FIRST REF+V FIRST_CUT_PASS,FIRST REF+V LAST_CUT_PASS
disj_terms,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2,1010.0,1690.0,17,1690.0,1690.0,1690.0,1045.0,1252.692308
4,1010.0,1690.0,21,1690.0,1690.0,1690.0,1052.692308,1250.235294
8,1010.0,1690.0,1,1690.0,1690.0,1690.0,1690.0,1690.0
16,1010.0,1690.0,1,1690.0,1690.0,1690.0,1690.0,1690.0
32,1010.0,1690.0,1,1690.0,1690.0,1690.0,1690.0,1690.0
64,1010.0,1690.0,1,1690.0,1690.0,1690.0,1690.0,1690.0


### `instances`: get unique instance list

In [10]:
# Get unique instance list, optionally restricted by one of the boolean
# classification columns (pure binary takes precedence over mixed binary)
if ONLY_PURE_BINARY:
    instance_filter_col = 'IS PURE BINARY'
elif ONLY_MIXED_BINARY:
    instance_filter_col = 'IS MIXED BINARY'
else:
    instance_filter_col = None

if instance_filter_col is None:
    instances = df.index.levels[0]
else:
    # Keep only rows flagged True, then prune index levels so that
    # instances without any remaining row are not listed
    tmp_df = df[df[instance_filter_col] == True]
    tmp_df.index = tmp_df.index.remove_unused_levels()
    instances = tmp_df.index.levels[0]

instances.set_names(names = 'Instance', inplace=True)

print("Number of selected instances: ", len(instances))

Number of selected instances:  437


### TODO Create "row 0" taking values across target columns

In [None]:
# Work in progress (see the TODO heading of this section).
# Goal: for each instance in the set "instances",
# add a row to "df" with index (instance,0),
# find the average of chosen columns from all other rows for that instance,
# with index (instance,x) with x \in {2,4,8,16,32,64},
# and put this calculated average value into the same column in row (instance,0).
#
# Current state: only the column "AVG REF TIME" is averaged, and the new rows
# are appended to the debugging copy `df_debug`; `df` itself is not modified.

# Create a new (initially empty) dataframe with one row per instance, indexed by (instance,0)
df_new = pd.DataFrame(index=pd.MultiIndex.from_product([instances, [0]]), columns=df.columns)

# Temporary copy of df while we are debugging
df_debug = df.copy()

# Iterate over instances
for inst in instances:
  # Get all rows of this instance, i.e., those with index (inst,x)
  rows = df.loc[inst, :]
  
  # Calculate average of column "AVG REF TIME" over those rows
  avg_gur_time = rows["AVG REF TIME"].mean()
  
  # Fill row (inst,0) of df_new with the values from the first row of this instance,
  # then overwrite "AVG REF TIME" with the calculated average.
  # NOTE(review): every other column (including 'NUM DISJ TERMS') keeps the value
  # copied from that first row -- confirm this is intended.
  df_new.loc[(inst, 0), :] = rows.iloc[0, :]
  df_new.loc[(inst, 0), "AVG REF TIME"] = avg_gur_time

# Append df_new to the debugging copy (df is left untouched)
df_debug = pd.concat([df_debug, df_new])

# Show the rows of df (not df_debug) for bm23, so the new (instance,0) row is not part of this output
df.loc["bm23_presolved"]


### `df_rejection_reason`: Track why instances were not selected for our statistics

In [None]:
# Columns of `df_rejection_reason`, one row per instance.
# The first three columns are "selected" flags (initially True: nothing is
# rejected yet); all remaining columns are rejection flags/counters (initially
# False / 0).
rejection_reasons = [
    'SELECTED_GAP', # *not* rejected for gap experiments; _must_ be first column
    'SELECTED_TIME', # *not* rejected for time experiments; _must_ be second column
    'SELECTED_6TREES', # *not* rejected for 6trees set for time experiments; _must_ be third column
    'NUM_WITH_OBJS', # number of attempts that successfully tried solving the PRLP
    'NUM_WITH_CUTS', # number of attempts that successfully yielded cuts
    'IP_OPT_UNKNOWN', # ip opt val must be known
    'TOO_MANY_ROWS_OR_COLS', # require max(nrows, ncols) ≤ 5K
    'OPTIMAL_SOLUTION_FOUND', # optimal solution should not be found by any of the partial trees
    'LP_OPT_IS_NOT_CUT', # check if lp opt < ip opt
    'DLB=DUB', # check if disj lb < disj ub
    'LP=DLB=DUB', # require either lp opt < disj lb or disj lb < disj ub
    'PRLP_INFEASIBLE', # require PRLP is feasible and solves within timelimit for at least one of the attempts
    'PRLP_TIME_LIMIT', # require PRLP solves within timelimit for at least one of the attempts
    'NO_CUTS', # there must be cuts from at least one of the partial b&b trees
    'NO_GAP', # require that ip opt != lp opt
    'GUR_TIMEOUT', # require Gur7 < 3600 (Gurobi is able to solve the instance to optimality within an hour either with or without using VPCs)
    '<7_ATTEMPTS', # indicates not all partial trees were successfully run
]
NUM_SELECTED_FLAGS = 3 # number of leading 'SELECTED_*' columns

# Initialize explicitly rather than relying on how an empty `dtype=bool` frame
# happens to be filled: every rejection criterion starts at False ...
df_rejection_reason = pd.DataFrame(False, index = instances, columns = rejection_reasons)
# ... and every instance starts out selected for all three experiment sets
df_rejection_reason[rejection_reasons[:NUM_SELECTED_FLAGS]] = True

# Columns that store a depth or a count (rather than a flag) are integer-valued
for col in ['OPTIMAL_SOLUTION_FOUND']:
    df_rejection_reason[col] = df_rejection_reason[col].astype(np.int64)
for col in ['NUM_WITH_OBJS', 'NUM_WITH_CUTS', 'LP_OPT_IS_NOT_CUT', 'DLB=DUB', 'LP=DLB=DUB', 'PRLP_INFEASIBLE', 'PRLP_TIME_LIMIT']:
    df_rejection_reason[col] = df_rejection_reason[col].astype(np.int8)
display(df_rejection_reason.head())

### `map_rejection_reason_to_number`: Reference paper's rejection criteria

In [None]:
# Label shown for each rejection reason: a LaTeX \ref to the matching selection
# criterion, or a short code where there is no such criterion.
# (An earlier version used hard-coded numbering instead:
#  OPTIMAL_SOLUTION_FOUND -> '(3)', LP=DLB=DUB -> '(4a)', PRLP_INFEASIBLE -> '(4b)',
#  PRLP_TIME_LIMIT -> '(4c)', <7_ATTEMPTS -> '(?)'.)
map_rejection_reason_to_number = dict([
    ('IP_OPT_UNKNOWN',         r'\ref{selection-criterion:ip-opt-known}'),
    ('NO_GAP',                 r'\ref{selection-criterion:ip-opt-known}'),
    ('TOO_MANY_ROWS_OR_COLS',  r'\ref{selection-criterion:max-instance-size}'),
    ('OPTIMAL_SOLUTION_FOUND', r'\ref{selection-criterion:partial-tree-does-not-find-opt}'),
    ('LP=DLB=DUB',             r'\ref{selection-criterion:cuts-are-generated:not_lp=dlb=dub}'),
    ('PRLP_INFEASIBLE',        r'\ref{selection-criterion:cuts-are-generated:PRLP-primal-feasible}'),
    ('PRLP_TIME_LIMIT',        r'\ref{selection-criterion:cuts-are-generated:PRLP-time-limit}'),
    ('NO_CUTS',                r'\ref{selection-criterion:cuts-are-generated:cuts-are-generated}'),
    ('GUR_TIMEOUT',            'G'),
    ('<7_ATTEMPTS',            '?'),
])

### `df_status_by_depth`: Track success or failure reason by depth

In [None]:
# One row per instance and one column per entry of `sizes`; each cell holds a
# status label for that (instance, number of disjunctive terms) pair.
df_status_by_depth = pd.DataFrame(index = instances, columns = sizes, dtype=str)

# Every cell starts with the label of the '<7_ATTEMPTS' reason; later cells
# overwrite it for the (instance, size) pairs they process.
DEFAULT_STATUS = map_rejection_reason_to_number['<7_ATTEMPTS']

df_status_by_depth[:] = DEFAULT_STATUS

display(df_status_by_depth.head())

# Section 1: Select instances

### `selected_gap_instances_dict` (instance → original index): Select instances for gap closed calculations

Criteria to filter gap closed instances:
* ip opt val is known
* lp opt < ip opt
* max(nrows, ncols) ≤ 5K
* optimal solution should not be found by any of the partial trees
* either lp opt < disj lb or disj lb < disj ub
* PRLP is feasible and solves within timelimit for at least one of the attempts

In [None]:
## Select instances for gap closed calculations
#
# Criteria to filter gap closed instances:
# * ip opt val is known
# * lp opt < ip opt
# * max(nrows, ncols) ≤ 5K
# * optimal solution should not be found by any of the partial trees
# * either lp opt < disj lb or disj lb < disj ub
# * PRLP is feasible and solves within timelimit for at least one of the attempts
#
# Side effects: fills `df_rejection_reason` (why an instance is rejected) and
# `df_status_by_depth` (status label per instance and number of terms).
# All floating-point comparisons use the global tolerance EPS.

# Constants
MAX_ROWS = 5000
MAX_COLS = MAX_ROWS
PRINT_SKIP_REASON = False
NUM_EXPECTED_ATTEMPTS = 6

# Information to save
selected_gap_instances_dict = {} # maps instance name -> position of the instance in `instances`
#selected_indices = []
num_gap_errors = 0 # number of error conditions encountered (an instance can contribute more than one)

inst_set = instances
num_attempts = np.zeros(len(inst_set), dtype=int)

for i, inst in enumerate(inst_set):
    print("{}/{}".format(i+1,len(inst_set)), end='\r', flush=True)
    skip_instance = False
    curr_df = df.loc[inst] # all rows of this instance, indexed by number of disjunctive terms
    
    # Count number of times instance appears
    num_attempts[i] = len(curr_df)

    if num_attempts[i] < NUM_EXPECTED_ATTEMPTS:
        print("*** ERROR: Instance {:d} -- {}: {:d} < {:d} attempts.".format(i, inst, num_attempts[i], NUM_EXPECTED_ATTEMPTS))
        skip_instance = True
        num_gap_errors += 1
        df_rejection_reason.loc[inst, '<7_ATTEMPTS'] = True

    # Check that LP opt < IP opt
    # (LP value from the preprocessing results, IP value from the best known objective table)
    lp_obj = np.float64(df_preprocess.loc[remove_presolved_from_name(inst),'CLEANED LP OBJ'])
    ip_obj = np.float64(df_ipopt.loc[inst,'IP OBJ'])
    YES_GAP = abs(ip_obj - lp_obj) >= EPS
    if not YES_GAP:
        print("*** ERROR: Instance {:d} -- {}: not YES GAP (lp = {:.10f}; ip = {:.10f})".format(i, inst, lp_obj, ip_obj))
        skip_instance = True
        num_gap_errors += 1
        df_rejection_reason.loc[inst, 'NO_GAP'] = True
        
    # Check that ExitReason != OPTIMAL_SOLUTION_FOUND
    # Rows are visited in index order; once an optimal solution is found at some
    # depth, that depth and every later one is labeled accordingly.
    OPT_SOL_FOUND = False
    for curr_index, row in curr_df.iterrows():
        #print(i,j, curr_df['ExitReason'])
        curr_depth = int(curr_index)
        if curr_depth == 0:
            continue
        exitreason = row['ExitReason']
        if exitreason == 'OPTIMAL_SOLUTION_FOUND' and not OPT_SOL_FOUND:
            if PRINT_SKIP_REASON:
                print("Skipping instance {:d} -- {}: optimal IP solution found at depth {:d}.".format(
                    i, inst, curr_depth
                ))
            skip_instance = True
            OPT_SOL_FOUND = True
            # Record the first depth at which the optimal solution was found
            df_rejection_reason.loc[inst, 'OPTIMAL_SOLUTION_FOUND'] = curr_depth
        if OPT_SOL_FOUND:
            df_status_by_depth.loc[inst, curr_depth] = map_rejection_reason_to_number['OPTIMAL_SOLUTION_FOUND']
        else:
            # Clear the default status; it may be replaced by a specific reason below
            df_status_by_depth.loc[inst, curr_depth] = ''

    # Check that best and worst bound on leaf nodes is not same (likely cause of primal infeasible PRLP)
    num_successful_attempts = 0 # number of rows for which objectives were tried and cuts were produced
    has_zero = False
    terms = curr_df.index
    for curr_index in terms:
        if curr_df['NUM DISJ TERMS'][curr_index] == 0:
            has_zero = True
            continue
            
        lp_obj = curr_df['LP OBJ'][curr_index]
        ip_obj = curr_df['IP OBJ'][curr_index]
        best_disj_obj = curr_df['BEST DISJ OBJ'][curr_index]
        worst_disj_obj = curr_df['WORST DISJ OBJ'][curr_index]
        num_frac = curr_df['NUM FRAC'][curr_index]
        num_obj_tried = curr_df['NUM OBJ'][curr_index]
        num_cuts = curr_df['NUM VPC'][curr_index] # can be > 0 even if num_obj_tried = 0, b/c of OPTIMAL_SOLUTION_FOUND exit reason
        exitreason = curr_df['ExitReason'][curr_index]

        YES_GAP = abs(ip_obj - lp_obj) >= EPS
        LP_OPT_IS_CUT = (num_frac > 0) and YES_GAP and abs(lp_obj - worst_disj_obj) >= EPS
        DLB_NE_DUB = (num_frac > 0) and abs(best_disj_obj - worst_disj_obj) >= EPS
        # Accumulate, over the rows of this instance, how often each condition fails
        df_rejection_reason.loc[inst, 'NO_GAP'] += (not YES_GAP)
        df_rejection_reason.loc[inst, 'LP_OPT_IS_NOT_CUT'] += (not LP_OPT_IS_CUT)
        df_rejection_reason.loc[inst, 'DLB=DUB'] += (not DLB_NE_DUB)
        df_rejection_reason.loc[inst, 'PRLP_INFEASIBLE'] += (exitreason == 'PRLP_INFEASIBLE')
        df_rejection_reason.loc[inst, 'PRLP_TIME_LIMIT'] += (exitreason == 'PRLP_TIME_LIMIT')
        # if not DLB_NE_DUB and num_obj_tried > 0:
        #     raise ValueError(
        #         "*** ERROR: Instance {:d} -- {}: at depth {:d}, num obj tried = {:d} (num cuts = {:d}) but lp opj {:.10f}, best_disj_obj {:.10f} = worst_disj_obj {:.10f} with exit reason {}".format(
        #             i, inst, curr_index, num_obj_tried, num_cuts, lp_obj, best_disj_obj, worst_disj_obj, curr_df['ExitReason'][curr_index]
        #         )
        #     )
        if LP_OPT_IS_CUT or DLB_NE_DUB:
            if (num_obj_tried == 0) and (exitreason not in ['PRLP_TIME_LIMIT','PRLP_INFEASIBLE','OPTIMAL_SOLUTION_FOUND','TIME_LIMIT']):
                # We should be trying objectives at this point, unless the initial PRLP timed out or was infeasible or an optimal solution was found
                raise ValueError(
                    "*** ERROR: Instance {:d} -- {}: at depth {:d}, num obj tried = 0 but lp opj {:.10f} < best_disj_obj {:.10f} < worst_disj_obj {:.10f} with exit reason {}".format(
                        i, inst, curr_index, lp_obj, best_disj_obj, worst_disj_obj, curr_df['ExitReason'][curr_index]
                    )
                )
            if num_obj_tried > 0:
                df_rejection_reason.loc[inst, 'NUM_WITH_OBJS'] += 1
                if num_cuts > 0:
                    num_successful_attempts += 1
                    df_rejection_reason.loc[inst, 'NUM_WITH_CUTS'] += 1
                else:
                    df_status_by_depth.loc[inst, curr_index] = map_rejection_reason_to_number['NO_CUTS']
            elif exitreason == 'PRLP_INFEASIBLE':
                df_status_by_depth.loc[inst, curr_index] = map_rejection_reason_to_number[exitreason]
            elif exitreason == 'PRLP_TIME_LIMIT':
                df_status_by_depth.loc[inst, curr_index] = map_rejection_reason_to_number[exitreason]
        else:
            # check that num obj tried is 0
            if (num_obj_tried > 0):
                raise ValueError(
                    "*** ERROR: Instance {:d} -- {}: at depth {:d}, num obj tried = {:d} > 0 but best_disj_obj {:f} = worst_disj_obj {:f}".format(
                        i, inst, curr_index, num_obj_tried, best_disj_obj, worst_disj_obj
                    )
                )
            df_rejection_reason.loc[inst, 'LP=DLB=DUB'] += 1
            df_status_by_depth.loc[inst, int(curr_index)] = map_rejection_reason_to_number['LP=DLB=DUB']

    # if not has_zero:
    #     raise ValueError(
    #         "*** ERROR: Instance {:d} -- {}: has no bb0 entry.".format(
    #             i, inst, curr_index
    #         )
    #     )        
    
    if num_successful_attempts == 0 and not skip_instance:
        if PRINT_SKIP_REASON:
            print("Skipping instance {:d} -- {}: best and worst bound on leaf nodes coincide for all trees, no objectives ever tried, or no objectives successfully produced cuts.".format(
                i, inst, num_attempts[i]))
        skip_instance = True
        exitreason = 'NO_CUTS'
        df_rejection_reason.loc[inst, exitreason] = True
    else:        
        # Ensure IP objective value is known
        ip_obj = curr_df['IP OBJ'][curr_df.index[0]]
        if not isinstance(ip_obj,float):
            if PRINT_SKIP_REASON:
                print(
                    "Skipping instance {:d} -- {}: IP objective value ({}) is not detected to be a float value.".format(
                    i, inst, ip_obj))
            skip_instance = True
            df_rejection_reason.loc[inst, 'IP_OPT_UNKNOWN'] = True
            
        # Ensure nrows and ncols is not too many
        nrows = curr_df.iloc[0]["ROWS"]
        ncols = curr_df.iloc[0]["COLS"]
        if (nrows > MAX_ROWS) or (ncols > MAX_COLS):
            if PRINT_SKIP_REASON:
                # Each size is printed next to the limit it is compared against
                print("Skipping instance {:d} -- {}: nrows = {:d} > {:d} or ncols = {:d} > {:d}.".format(
                        i, inst, nrows, MAX_ROWS, ncols, MAX_COLS))
            skip_instance = True
            df_rejection_reason.loc[inst, 'TOO_MANY_ROWS_OR_COLS'] = True
    
    if not skip_instance:
        #selected_gap_instances_dict[len(selected_gap_instances_dict)] = inst
        selected_gap_instances_dict[inst] = i
    else:
        df_rejection_reason.loc[inst, 'SELECTED_GAP'] = False

num_selected_gap_instances = len(selected_gap_instances_dict)
print("Total number of errors: {}".format(num_gap_errors))
print("Total number of selected instances for gap closed reporting: {}/{:d}".format(num_selected_gap_instances,len(instances)))

### `selected_time_instances_dict` and `all6_instances_dict` (instance → index): Select instances for time tables

Criteria to filter instances for reporting time:
* ip opt val is known
* lp opt < ip opt
* max(nrows, ncols) ≤ 5K
* optimal solution should not be found by any of the partial trees
* either lp opt < disj lb or disj lb < disj ub
* PRLP is feasible and solves within timelimit for at least one of the attempts
* Gur7 < 3600 (Gurobi is able to solve the instance to optimality within an hour either with or without using VPCs)

6 trees set
* all six partial tree sizes produced VPCs

In [None]:
## Select instances for time tables
#
# Criteria to filter instances for reporting time:
# * ip opt val is known
# * lp opt < ip opt 
# * max(nrows, ncols) ≤ 5K
# * optimal solution should not be found by any of the partial trees
# * either lp opt < disj lb or disj lb < disj ub
# * PRLP is feasible and solves within timelimit for at least one of the attempts
# * min{Gur7,V7} < 3600 (Gurobi is able to solve the instance to optimality within an hour either with or without using VPCs)
#
# 6 trees set
# * all six partial tree sizes produced VPCs
#
# Only instances already selected for gap reporting are considered here.

# Constants
MAX_TIME = 3600
PRINT_SKIP_REASON = False
NUM_TREE_SIZES = len(sizes) # number of partial tree sizes that must all yield VPCs for the "6 trees" set

# Information to save
# (each dictionary maps instance name -> position of the instance in `inst_set` below)
selected_time_instances_dict = {}   # instances selected for time reporting
all6_instances_dict = {}            # selected instances for which every tree size produced VPCs
skipped_instances_dict = {}         # instances skipped because of the time limit
error_instances_dict = {}           # currently never filled in this cell

num_timeouts = 0
num_time_errors = 0 # currently never incremented in this cell

inst_set = list(selected_gap_instances_dict.keys())
# inst_set = ['lotsize_presolved']
for i, inst in enumerate(inst_set):
    print("{}/{}".format(i+1,len(inst_set)), end='\r', flush=True)
    skip_instance = False
    curr_df = df.loc[inst] # all rows of this instance, indexed by number of disjunctive terms

    # The reference time without VPCs is read from the row with 0 disjunctive
    # terms (bb0), so check for that row first and fail with a descriptive
    # message instead of an opaque KeyError from the lookup below.
    if 0 not in curr_df.index:
        raise ValueError(
            "*** ERROR: Instance {:d} -- {}: has no bb0 entry.".format(
                i, inst
            )
        )

    # Check Gur < 3600 (Gurobi is able to solve the instance to optimality within an hour without using VPCs)
    col = 'BEST REF TIME'
    mintime_gur = float(curr_df.loc[0,col].min())
    
    # Check Gur7 < 3600 (Gurobi is able to solve the instance to optimality within an hour either with or without using VPCs)
    # The label slice covers every row from the smallest to the largest tree size (inclusive)
    col = 'BEST REF+V TIME'
    mintime_gur7 = float(curr_df.loc[sizes[0]:sizes[-1],col].min())

    mintime = min(mintime_gur, mintime_gur7)
    if mintime > MAX_TIME - EPS:
        if PRINT_SKIP_REASON:
            print("{:d}: Skipping instance {:d} -- {}: Gurobi's best time (with or without VPCs) is {:.7f} >= {:.7f}.".format(
                    len(skipped_instances_dict), i, inst, mintime, MAX_TIME-EPS
                ))
        skip_instance = True
        skipped_instances_dict[inst] = i
        num_timeouts += 1
        df_rejection_reason.loc[inst, 'GUR_TIMEOUT'] += 1
        # df_status_by_depth.loc[inst, int(curr_index)] = map_rejection_reason_to_number['GUR_TIMEOUT']

    # Check how many times VPCs were successfully generated
    num_successful_attempts = 0
    has_zero = False
    for curr_index, row in curr_df.iterrows():
        if row['NUM DISJ TERMS'] == 0:
            has_zero = True
            continue

        num_vpc = float(row['NUM VPC'])
        num_successful_attempts += (num_vpc > 0)

        # Only label cells that still carry the default status; at this point
        # `col` is 'BEST REF+V TIME', i.e., the time with VPCs for this row
        if df_status_by_depth.loc[inst, int(curr_index)] == DEFAULT_STATUS:
            curr_time = float(curr_df.loc[curr_index,col])
            if curr_time > MAX_TIME - EPS:
                df_status_by_depth.loc[inst, int(curr_index)] = map_rejection_reason_to_number['GUR_TIMEOUT']

    # The bb0 row must also be recognizable through its 'NUM DISJ TERMS' value
    if not has_zero:
        raise ValueError(
            "*** ERROR: Instance {:d} -- {}: has no bb0 entry.".format(
                i, inst, curr_index
            )
        )        
    
    # if num_successful_attempts == 0 and not skip_instance:
    #     if PRINT_SKIP_REASON:
    #         print("Skipping instance {:d} -- {}: no VPCs generated successfully for any number of terms.".format(i, inst, num_attempts[i]))
    #     skip_instance = True
    #     skipped_instances_dict[inst] = i

    if not skip_instance:
        if num_successful_attempts == NUM_TREE_SIZES:
            all6_instances_dict[inst] = i
        #selected_time_instances_dict[len(selected_time_instances_dict)] = inst
        selected_time_instances_dict[inst] = i

num_selected_time_instances = len(selected_time_instances_dict)
num_all6_instances = len(all6_instances_dict)
print("Total number of errors: {}".format(num_time_errors))
print("Total number of timeouts: {}".format(num_timeouts))
print("Total number of instances for time reporting: {}".format(num_selected_time_instances))
print("Total number of \"6 trees\" instances: {}".format(num_all6_instances))