In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Task names selected by the Table 5 cell.
tasks1 = [
    'AQUA', 'GSM8K', 'SVAMP', 'ASDiv',
    'StrategyQA', 'CSQA', 'ARC', 'LastLetter',
]

# Identifiers compared against the 'Model' column of the results.
models = ['gpt3.5', 'gpt4', 'gemini', 'llama7', 'llama70']

# Identifiers compared against the 'Method' column; cells that slice
# methods[:-1] leave out the trailing 'optimal' entry.
methods = ['io', 'cot', 'refine', 'decomp', 'ours', 'optimal']

# Problem sizes compared against the 'Size' column.
sizes = [5, 8, 12, 15, 20, 25, 30]

# (key in the 'Task' column, display name) pairs, kept side by side so the
# two derived lists below always stay index-aligned.
_task_names = (
    ('assign', 'Assignment'),
    ('knapsack', 'Knapsack'),
    ('bin-pack', 'Bin Packing'),
    ('tsp', 'Travelling Salesman'),
    ('vrp', 'Vehicle Routing'),
    ('jsp', 'Job Scheduling'),
)
tasks = [key for key, _label in _task_names]
tasks_full = [label for _key, label in _task_names]

# Single results table shared by every table/figure cell below.
results = pd.read_csv('results/results.csv')

In [None]:
# Table 1
# One LaTeX row per task. For each model (gpt4, gemini) and each non-baseline
# method the row holds the mean, in percent, of |method cost - io cost| divided
# by the method's own cost; the 'ours' entries are wrapped in \textbf{}.
for task, task_label in zip(tasks, tasks_full):
    row = task_label
    for model in ('gpt4', 'gemini'):
        base_results = results.loc[(results["Task"] == task) & (results["Model"] == model)]

        # Both cost vectors are ordered by size so they pair up element-wise.
        io_results = base_results.loc[base_results["Method"] == 'io'].sort_values('Size')['Cost'].values

        for method in ('cot', 'refine', 'decomp', 'ours'):
            subresults = base_results.loc[base_results["Method"] == method].sort_values('Size')['Cost'].values

            improvement = abs(subresults - io_results) / subresults
            improvement = round(100 * improvement.mean(), 2)
            # Three-digit percentages keep one decimal so the cell stays five characters wide.
            if improvement >= 100:
                improvement = round(improvement, 1)

            # Right-pad with zeros up to five characters (e.g. '12.5' -> '12.50').
            str_improvement = str(improvement).ljust(5, '0')
            if method == 'ours':
                row += ' & \\textbf{' + str_improvement + '}'
            else:
                row += ' & ' + str_improvement
    # LaTeX row terminator: two literal backslashes, with every backslash
    # escaped explicitly so the literal contains no invalid escape sequence.
    row += " \\\\ "
    print(row)


In [None]:
# Table 2
# One LaTeX row per problem size. Each column is a task and holds the mean, in
# percent, of |ours cost - io cost| / ours cost for the gpt4 model.
for size in sizes:
    row = str(size)
    for task in tasks:
        base_results = results.loc[
            (results["Size"] == size)
            & (results["Task"] == task)
            & (results["Model"] == 'gpt4')
        ]

        io_costs = base_results.loc[base_results["Method"] == 'io', 'Cost'].values
        ours_costs = base_results.loc[base_results["Method"] == 'ours', 'Cost'].values

        improvement = abs(ours_costs - io_costs) / ours_costs
        improvement = round(100 * improvement.mean(), 2)
        # Three-digit percentages keep one decimal so the cell stays five characters wide.
        if improvement >= 100:
            improvement = round(improvement, 1)

        # Right-pad with zeros up to five characters (e.g. '12.5' -> '12.50').
        row += ' & ' + str(improvement).ljust(5, '0')
    # LaTeX row terminator: two literal backslashes, with every backslash
    # escaped explicitly so the literal contains no invalid escape sequence.
    row += " \\\\ "
    print(row)


In [None]:
# Table 3
# For sizes 5, 8 and 12, print one LaTeX row per method. Each column is a task
# and holds the mean, in percent, of |gpt4 method cost - optimal cost| divided
# by the optimal cost; the 'ours' entries are wrapped in \textbf{}.
method_full = ['IO', 'CoT', 'Refine', 'Decomp', 'Ours']
for size in (5, 8, 12):
    for method, method_label in zip(('io', 'cot', 'refine', 'decomp', 'ours'), method_full):
        row = "& " + method_label
        for task in tasks:
            base_results = results.loc[(results["Size"] == size) & (results["Task"] == task)]

            # The 'optimal' rows are selected without any filter on the model.
            optimal_costs = base_results.loc[base_results["Method"] == 'optimal', 'Cost'].values

            method_costs = base_results.loc[
                (base_results["Model"] == 'gpt4') & (base_results["Method"] == method),
                'Cost',
            ].values

            gap = abs(method_costs - optimal_costs) / optimal_costs
            gap = round(100 * gap.mean(), 2)
            # Three-digit percentages keep one decimal so the cell stays five characters wide.
            if gap >= 100:
                gap = round(gap, 1)

            # Right-pad with zeros up to five characters (e.g. '12.5' -> '12.50').
            str_gap = str(gap).ljust(5, '0')
            if method == 'ours':
                row += ' & \\textbf{' + str_gap + '}'
            else:
                row += ' & ' + str_gap
        # LaTeX row terminator: two literal backslashes, with every backslash
        # escaped explicitly so the literal contains no invalid escape sequence.
        row += " \\\\ "
        print(row)


In [None]:
# Table 4
# One LaTeX row per task. Each column is a model and holds the mean, in
# percent, of |ours cost - io cost| / ours cost over that model's rows.
for task, task_label in zip(tasks, tasks_full):
    row = task_label
    for model in models:
        base_results = results.loc[(results["Task"] == task) & (results["Model"] == model)]

        # Order both cost vectors by size (as the Table 1 cell does) so the
        # element-wise pairing does not depend on the row order of the CSV.
        # The stable sort keeps the file order among rows of equal size.
        io_costs = (
            base_results.loc[base_results["Method"] == 'io']
            .sort_values('Size', kind='stable')['Cost'].values
        )
        ours_costs = (
            base_results.loc[base_results["Method"] == 'ours']
            .sort_values('Size', kind='stable')['Cost'].values
        )

        improvement = 100 * abs(ours_costs - io_costs) / ours_costs
        improvement = round(improvement.mean(), 2)
        # Three-digit percentages keep one decimal so the cell stays five characters wide.
        if improvement >= 100:
            improvement = round(improvement, 1)

        # Right-pad with zeros up to five characters (e.g. '12.5' -> '12.50').
        row += ' & ' + str(improvement).ljust(5, '0')
    # LaTeX row terminator: two literal backslashes, with every backslash
    # escaped explicitly so the literal contains no invalid escape sequence.
    row += " \\\\ "
    print(row)
# Big Tables for each model from plots_n_tables

In [None]:
# Table 6
# For gpt4, print for every non-baseline method the mean, in percent, of
# |method cost - io cost| / method cost over all of the model's rows.
for model in ['gpt4']:
    # Order the rows by (Task, Size) once, so the baseline and method cost
    # vectors pair up element-wise regardless of the row order of the CSV.
    base_results = results.loc[results["Model"] == model].sort_values(['Task', 'Size'])
    io_results = base_results.loc[base_results["Method"] == 'io', 'Cost'].values

    for method in ('cot', 'refine', 'decomp', 'ours'):
        subresults = base_results.loc[base_results["Method"] == method, 'Cost'].values

        improvement = abs(subresults - io_results) / subresults
        improvement = round(100 * improvement.mean(), 2)
        # Three-digit percentages are reported with a single decimal.
        if improvement >= 100:
            improvement = round(improvement, 1)
        print(method, improvement)

In [None]:
# Big Tables
# For every model, print one LaTeX block per problem size. Each block has one
# row per method plus an 'Avg.' row (mean over the methods), and one column
# per task holding the mean cost.
str_sizes = [str(size) for size in sizes]

# Display label per method key. The dict order also fixes the row order inside
# a block: 'Avg.' first, 'Ours' last.
new_method_rows = {
    'Avg.' : 'Avg.',
    'io' : "IO",
    'cot' : "CoT",
    'refine' : "Refine",
    'decomp' : "Decomp",
    'ours' : "Ours"
}

# Full (size, method label) grid used to put the rows of a block in a fixed order.
indices = pd.MultiIndex.from_product(
    [str_sizes, list(new_method_rows.values())], names=['Size', 'Method'])

for model in models:
    print()
    print(model)

    sub_table = results.loc[results['Model'] == model]
    sub_table = sub_table.loc[sub_table['Task'].isin(tasks)]
    # methods[:-1] leaves out the trailing 'optimal' entry.
    sub_table = sub_table.loc[sub_table['Method'].isin(methods[:-1])]
    sub_table = sub_table[['Method', 'Size', 'Task', 'Cost']]
    sub_table = sub_table.pivot_table(index=['Method', 'Size'], columns='Task', values='Cost', aggfunc='mean')
    sub_table = sub_table.reset_index()
    # Per-size average over the methods, appended as extra rows labelled 'Avg.'.
    avg = sub_table.drop('Method', axis=1).groupby(['Size']).mean().reset_index()
    avg['Method'] = 'Avg.'
    sub_table = pd.concat([sub_table, avg], ignore_index=True)
    sub_table = sub_table.round(2)
    sub_table['Size'] = sub_table['Size'].astype(int).astype(str)

    sub_table['Method'] = sub_table['Method'].map(new_method_rows)
    sub_table_copy = sub_table.copy()

    for size in sizes:
        size_table = sub_table_copy.loc[sub_table_copy['Size'] == str(size)]
        size_table = size_table.groupby(['Size', 'Method']).mean()
        # Reindexing on the full grid orders the rows; the rows of the other
        # sizes come out as NaN and are removed by dropna() below.
        size_table = size_table.reindex(indices)

        # Only the labels of this Series are used further down, to tell the
        # numeric task columns apart from the 'Method' column.
        bold_values = size_table.min(axis=0)
        size_table = size_table[tasks]
        size_table = size_table.reset_index()
        size_table = size_table.dropna().reset_index(drop=True)
        size_table = size_table.drop(columns='Size')

        # get string for latex
        latex_row = "\\multirow{6}{*}{\\begin{sideways}\\method{\\normalsize " + str(size) + " nodes}\\end{sideways}} & "
        print(latex_row)
        for i in range(len(size_table)):
            latex_row = ""
            for column in size_table.columns:
                # Every cell of the first row is shaded. The backslash is
                # escaped explicitly so the literal has no invalid escape.
                if i == 0:
                    latex_row += "\\cellcolor{gray!25} "
                value = size_table.at[i, column]
                if column != 'Method':
                    # The larger the value, the fewer decimals are kept.
                    value = float(value)
                    if value < 10:
                        value = round(value, 5)
                    elif value < 100:
                        value = round(value, 4)
                    elif value < 1000:
                        value = round(value, 3)
                    elif value < 10000:
                        value = round(value, 2)
                    elif value < 100000:
                        value = round(value, 1)
                    else:
                        value = round(value, 0)
                str_value = str(value)
                if column != 'Method':
                    # Right-pad numbers with zeros up to six characters.
                    str_value = str_value.ljust(6, '0')
                elif i != 0:
                    # Rows after the first start with an empty cell; the first
                    # row follows the \multirow line printed above.
                    latex_row += "& "
                # The numeric cells of the last remaining row are set in bold.
                if column in bold_values and i == len(size_table)-1:
                    latex_row += "\\textbf{" + str_value + "} & "
                else:
                    latex_row += str_value + " & "

            # Replace the trailing ' & ' separator with the row terminator.
            latex_row = latex_row[:-3] + " \\\\ "
            print(latex_row)
        print(" \\hline ")

In [None]:
# Table 5
# Print one LaTeX row per prompting method with the mean 'Value' of every task
# in tasks1 plus their average; the best value of each column is set in bold.
for model in models:
    # The selector has to use the spelling found in `models` ('gpt4'); the
    # previous literal 'gpt-4' matched no entry, so the cell printed nothing.
    # NOTE(review): assumes the 'Model' column uses 'gpt4' for these rows, as
    # the other cells of this notebook do - confirm against the CSV.
    if model != 'gpt4':
        continue
    sub_table = results.loc[results['Model'] == model]
    sub_table = sub_table.loc[sub_table['Task'].isin(tasks1)]
    # Keep rows whose 'Shot' entry is missing, '5' or 'None'.
    sub_table = sub_table.loc[sub_table['Shot'].isin([np.nan, '5', 'None'])]
    sub_table = sub_table[['Task', 'Method', 'Value']]
    sub_table = sub_table.pivot_table(index='Method', columns='Task', values='Value', aggfunc='mean')
    # methods[:-1] leaves out the trailing 'optimal' entry and fixes the row order.
    sub_table = sub_table.reindex(methods[:-1])
    # get average | max values
    sub_table['Avg'] = sub_table.mean(axis=1)
    sub_table = sub_table.round(2)
    bold_values = sub_table.max(axis=0)
    # reorder rows | cols
    sub_table = sub_table.reset_index()
    new_task_columns = ['Method'] + tasks1 + ['Avg']
    sub_table = sub_table[new_task_columns]
    # Labels are assigned by position, in the same order as methods[:-1].
    new_method_rows = ['IO Prompting', 'CoT Prompting', 'Refine Prompting', 'Decomp Prompting', 'Ours']
    sub_table['Method'] = new_method_rows
    sub_table = sub_table.dropna().reset_index(drop=True)
    # get string for latex
    for i in range(len(sub_table)):
        latex_row = ""
        for column in sub_table.columns:
            value = sub_table.at[i, column]
            str_value = str(value)
            # Zero-pad numbers only; padding the method label would turn
            # 'Ours' into 'Ours0'.
            if column != 'Method':
                str_value = str_value.ljust(5, '0')
            if column in bold_values and value == bold_values[column]:
                latex_row += "\\textbf{" + str_value + "} & "
            else:
                latex_row += str_value + " & "
        # Replace the trailing ' & ' separator with the row terminator.
        latex_row = latex_row[:-3] + " \\\\"
        print(latex_row)


In [None]:
# Figure 2
# Grouped bar chart: for every task, one bar per problem size showing the
# mean, in percent, of |ours cost - io cost| / ours cost for the gpt4 model.
data = {}
for size in sizes:
    # Only the sizes listed before 25 are plotted.
    if size == 25:
        break
    data[size] = []
    for task in tasks:
        base_results = results.loc[
            (results["Size"] == size)
            & (results["Task"] == task)
            & (results["Model"] == 'gpt4')
        ]

        io_costs = base_results.loc[base_results["Method"] == 'io', 'Cost'].values
        ours_costs = base_results.loc[base_results["Method"] == 'ours', 'Cost'].values

        improvement = abs(ours_costs - io_costs) / ours_costs
        improvement = round(100 * improvement.mean(), 2)
        if improvement >= 100:
            improvement = round(improvement, 1)
        data[size].append(improvement)

x = np.arange(len(tasks))
width = 0.15  # the width of the bars

# Short x-axis labels. They get their own name so that the full task names in
# `tasks_full`, which the table cells print as row labels, are not overwritten
# when this cell runs.
task_tick_labels = ["Assign", "Knapsack", "Bin Pack", "TSP", "VRP", "JSP"]

fig, ax = plt.subplots(layout='constrained')
fig.set_size_inches(18.5, 8)
for multiplier, (size, bar_heights) in enumerate(data.items()):
    # Each size is shifted one bar width further to the right within a group.
    offset = width * multiplier
    rects = ax.bar(x + offset, bar_heights, width, label=size)
    ax.bar_label(rects, fmt='%.0f', fontsize=18)

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Performance Improvement',fontsize = 35)
ax.set_title('Problem Size Effect',fontsize = 35)
# Ticks sit two bar widths right of the first bar of each group.
ax.set_xticks(x + 2*width, task_tick_labels,fontsize = 30)
ax.legend(loc='upper right', ncol=7, fontsize = 28)
ax.set_ylim(35, 105)

plt.show()

In [None]:
# Figure 3
# Grouped bar chart: for every task, one bar per model showing the mean, in
# percent, of |ours cost - io cost| / ours cost over that model's rows.
data = {}
for model in ['gpt4', 'gemini', 'gpt3.5', 'llama70', 'llama7']:
    data[model] = []
    for task in tasks:
        base_results = results.loc[(results["Task"] == task) & (results["Model"] == model)]

        # Order both cost vectors by size (as the Table 1 cell does) so the
        # element-wise pairing does not depend on the row order of the CSV.
        # The stable sort keeps the file order among rows of equal size.
        io_costs = (
            base_results.loc[base_results["Method"] == 'io']
            .sort_values('Size', kind='stable')['Cost'].values
        )
        ours_costs = (
            base_results.loc[base_results["Method"] == 'ours']
            .sort_values('Size', kind='stable')['Cost'].values
        )

        improvement = 100 * abs(ours_costs - io_costs) / ours_costs
        improvement = round(improvement.mean(), 2)
        if improvement >= 100:
            improvement = round(improvement, 1)
        data[model].append(improvement)

x = np.arange(len(tasks))
width = 0.15  # the width of the bars

# Legend label for every model identifier.
model_name = {
    'gpt4' : 'GPT-4',
    'gemini' : 'Gemini-1.5',
    'gpt3.5' : 'GPT-3.5',
    'llama7' : 'Llama-2-7b',
    'llama70' : 'Llama-2-70b'
}

# Short x-axis labels. They get their own name so that the full task names in
# `tasks_full`, which the table cells print as row labels, are not overwritten
# when this cell runs.
task_tick_labels = ["Assign", "Knapsack", "Bin Pack", "TSP", "VRP", "JSP"]

fig, ax = plt.subplots(layout='constrained')
fig.set_size_inches(18.5, 8)
for multiplier, (model, bar_heights) in enumerate(data.items()):
    # Each model is shifted one bar width further to the right within a group.
    offset = width * multiplier
    rects = ax.bar(x + offset, bar_heights, width, label=model_name[model])
    ax.bar_label(rects, fmt='%.0f', fontsize=18)

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Performance Improvement',fontsize = 35)
ax.set_title('Model Effect',fontsize = 35)
# Ticks sit two bar widths right of the first bar of each group.
ax.set_xticks(x + 2*width, task_tick_labels,fontsize = 30)
ax.legend(loc='upper right', ncol=7, fontsize = 25)
ax.set_ylim(0, 95)

plt.show()