In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
def read_results_df(csv_path, drop_duplicates=True):
    """Load a results CSV into a DataFrame.

    Args:
        csv_path: Location of the CSV; passed straight to ``pd.read_csv``.
        drop_duplicates: When truthy, rows repeating an already-seen
            ``DataFile`` value are discarded (pandas keeps the first one).

    Returns:
        The loaded DataFrame, de-duplicated on ``DataFile`` if requested.
    """
    results = pd.read_csv(csv_path)
    if not drop_duplicates:
        return results
    return results.drop_duplicates(subset='DataFile')

def select_random_subsets(
    df,
    prefix1_count,
    prefix2_count,
    prefix1='./BPPLIB/Scholl_CSP_dat/Scholl_1',
    prefix2='./BPPLIB/Scholl_CSP_dat/Scholl_2',
    random_state=1,
):
    """Draw a reproducible random sample of rows from two ``DataFile`` groups.

    Rows are grouped by the prefix of their ``DataFile`` string; a fixed
    number of rows is sampled (without replacement) from each group and the
    two samples are concatenated, first-prefix rows first.

    Args:
        df: DataFrame with a string ``DataFile`` column.
        prefix1_count: Number of rows to sample among those matching ``prefix1``.
        prefix2_count: Number of rows to sample among those matching ``prefix2``.
        prefix1: ``DataFile`` prefix selecting the first group.
        prefix2: ``DataFile`` prefix selecting the second group.
        random_state: Seed passed to ``DataFrame.sample`` for both draws.

    Returns:
        A DataFrame holding the sampled rows, original index labels preserved.

    Raises:
        ValueError: Raised by ``DataFrame.sample`` when a requested count
            exceeds the number of rows matching the corresponding prefix.
    """
    def sample_by_prefix(prefix, count):
        # na=False: a missing DataFile simply does not match, instead of
        # putting NaN into the boolean mask and breaking the row selection.
        matching = df[df['DataFile'].str.startswith(prefix, na=False)]
        return matching.sample(n=count, random_state=random_state)

    return pd.concat([
        sample_by_prefix(prefix1, prefix1_count),
        sample_by_prefix(prefix2, prefix2_count),
    ])

def format_true_solutions_df(true_solutions_path):
    """Load the true-solutions CSV and align its names with result ``DataFile`` paths.

    The ``Best LB``, ``Best UB`` and ``Status`` columns are dropped, ``Name``
    is renamed to ``DataFile``, rows whose name starts with ``HARD`` are
    removed, and each remaining name is rewritten to the relative ``.dat``
    path under ``./BPPLIB/Scholl_CSP_dat``: names whose third character is
    ``C`` go under ``Scholl_1``, those whose third character is ``W`` go
    under ``Scholl_2``.

    Args:
        true_solutions_path: Location of the CSV; passed to ``pd.read_csv``.

    Returns:
        A new DataFrame with the rewritten ``DataFile`` column and the
        remaining columns of the CSV.

    Raises:
        ValueError: If a non-``HARD`` name has neither ``C`` nor ``W`` as its
            third character (including names shorter than three characters).
    """
    scholl_dirs = {
        'C': './BPPLIB/Scholl_CSP_dat/Scholl_1',
        'W': './BPPLIB/Scholl_CSP_dat/Scholl_2',
    }

    def transform_datafile_string(value):
        # Slicing (rather than indexing) yields '' for short names, so they
        # end up in the same descriptive error as any other unknown name.
        target_dir = scholl_dirs.get(value[2:3])
        if target_dir is None:
            raise ValueError(
                f"Cannot map true-solution name {value!r} to a Scholl data file: "
                "third character must be 'C' or 'W'"
            )
        return f"{target_dir}/{value.replace('.txt', '.dat')}"

    df = (
        pd.read_csv(true_solutions_path)
        .drop(columns=['Best LB', 'Best UB', 'Status'])
        .rename(columns={'Name': 'DataFile'})
    )
    # Explicit copy: the column assignment below then writes to an independent
    # frame rather than to a slice of the loaded one (avoids pandas'
    # SettingWithCopyWarning).
    df = df[~df['DataFile'].str.startswith('HARD')].copy()
    df['DataFile'] = df['DataFile'].apply(transform_datafile_string)

    return df

def drop_unused_problems(results_df, true_solutions_df):
    """Restrict the true solutions to the problems that were attempted.

    Args:
        results_df: DataFrame whose ``DataFile`` column lists the attempted
            problems.
        true_solutions_df: DataFrame of true solutions with a ``DataFile``
            column.

    Returns:
        The rows of ``true_solutions_df`` whose ``DataFile`` value also occurs
        in ``results_df['DataFile']``.
    """
    attempted_files = results_df['DataFile']
    was_attempted = true_solutions_df['DataFile'].isin(attempted_files)
    return true_solutions_df[was_attempted]

In [3]:
# Load the Scholl results CSV, keeping a single row per DataFile.
scholl_csv = 'scholl-results.csv'
results_df = read_results_df(csv_path=scholl_csv, drop_duplicates=True)

In [4]:
# DataFile prefixes identifying the two Scholl instance sets.
scholl_1_prefix = './BPPLIB/Scholl_CSP_dat/Scholl_1'
scholl_2_prefix = './BPPLIB/Scholl_CSP_dat/Scholl_2'

# Report how many loaded results belong to each set.
for _prefix in (scholl_1_prefix, scholl_2_prefix):
    print(results_df['DataFile'].str.startswith(_prefix).sum())

173
55


In [5]:
# Down-sample to 170 Scholl_1 rows and 50 Scholl_2 rows, then re-count to
# confirm the sizes. NOTE: this overwrites results_df with the sampled subset.
results_df = select_random_subsets(results_df, prefix1_count=170, prefix2_count=50)
for _prefix in (scholl_1_prefix, scholl_2_prefix):
    print(results_df['DataFile'].str.startswith(_prefix).sum())
results_df

170
50


Unnamed: 0,DataFile,ObjectiveValue
107,./BPPLIB/Scholl_CSP_dat/Scholl_1/N2C2W4_R.dat,57.0
142,./BPPLIB/Scholl_CSP_dat/Scholl_1/N1C3W2_B.dat,20.0
19,./BPPLIB/Scholl_CSP_dat/Scholl_1/N1C1W4_A.dat,35.0
76,./BPPLIB/Scholl_CSP_dat/Scholl_1/N4C1W2_P.dat,317.0
53,./BPPLIB/Scholl_CSP_dat/Scholl_1/N1C2W4_L.dat,31.0
...,...,...
224,./BPPLIB/Scholl_CSP_dat/Scholl_2/N4W1B2R6.dat,217.0
208,./BPPLIB/Scholl_CSP_dat/Scholl_2/N1W2B2R4.dat,17.0
223,./BPPLIB/Scholl_CSP_dat/Scholl_2/N1W1B1R1.dat,19.0
213,./BPPLIB/Scholl_CSP_dat/Scholl_2/N4W2B2R6.dat,118.0


In [6]:
# Load the reference solutions and keep only the problems present in results_df.
# NOTE(review): the output recorded for this cell is a NameError for
# `scholl_df`, a name this cell does not use; it looks stale. Re-run the
# notebook from a fresh kernel to refresh it.
true_solutions_path = './BPPLIB/true-solutions/SCHOLL-SOLUTIONS.csv'
true_solutions_df = drop_unused_problems(
    results_df,
    format_true_solutions_df(true_solutions_path),
)

NameError: name 'scholl_df' is not defined

In [None]:
# Display the true solutions retained for the attempted problems.
true_solutions_df

Unnamed: 0,DataFile,Selected,Solution
2,./BPPLIB/Scholl_CSP_dat/Scholl_1/N1C1W1_C.dat,1,20.0
8,./BPPLIB/Scholl_CSP_dat/Scholl_1/N1C1W1_I.dat,1,25.0
13,./BPPLIB/Scholl_CSP_dat/Scholl_1/N1C1W1_N.dat,1,25.0
20,./BPPLIB/Scholl_CSP_dat/Scholl_1/N1C1W2_A.dat,1,29.0
29,./BPPLIB/Scholl_CSP_dat/Scholl_1/N1C1W2_J.dat,1,34.0
...,...,...,...
1126,./BPPLIB/Scholl_CSP_dat/Scholl_2/N4W2B2R6.dat,1,103.0
1133,./BPPLIB/Scholl_CSP_dat/Scholl_2/N4W2B3R3.dat,1,100.0
1142,./BPPLIB/Scholl_CSP_dat/Scholl_2/N4W3B1R2.dat,1,71.0
1151,./BPPLIB/Scholl_CSP_dat/Scholl_2/N4W3B2R1.dat,1,71.0
