In [1]:
import pandas as pd
import os
import json
import numpy as np
import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
STOPPING_CRITERIA=["end_token", "entropy_threshold", "max_rounds"]
ANSWER_HOW=["answer_first", "think_first"]
SAMPLING_STRATEGY=["nucleus", "top_k"]
AGGREGATION_STRATEGY=   ["weighted_sum", "element_wise_max", "dirichlet", "second_moment"]
Ks=[1, 2, 4, 8]

In [None]:
main_dir = "../soft_thinking/generation_comparison"
models = [
 #   "microsoft_phi_4_mini_instruct",
#    "microsoft_phi_4_mini_reasoning",
    "Qwen_Qwen2_5_3B_Instruct",
    "Qwen_Qwen2_5_0_5B_Instruct",
    "Qwen_Qwen3_0_6B",
    "Qwen_Qwen3_1_7B",
    # "Qwen_Qwen3_4B"
]
config ="3000_200_0.8"
def compute_entropy(phase_data):
    all_entropies = []
    for data_point in phase_data:
        if "phase1_token_ids" in data_point and data_point["phase1_token_ids"]:
            for tokens in data_point["phase1_token_ids"]:
                weights = tokens[1]          
                if np.sum(weights) > 0:
                    probabilities = weights / np.sum(weights)
                    probabilities = probabilities[probabilities > 0]
                    entropy = -np.sum(probabilities * np.log2(probabilities))
                    all_entropies.append(entropy)
                else:
                    all_entropies.append(0)
        else:
            all_entropies.append(0)
    if all_entropies:
        return np.mean(all_entropies)
    else:
        return 0
def compute_length(phase_data):
    all_lengths = []
    for data_point in phase_data:
        if "phase1_token_ids" in data_point and data_point["phase1_token_ids"]:
            all_lengths.append(len(data_point["phase1_token_ids"]))
    return np.mean(all_lengths)
exp_results = []
print("Total number of experiments: ", len(STOPPING_CRITERIA)*len(ANSWER_HOW)*len(SAMPLING_STRATEGY)*len(AGGREGATION_STRATEGY)*len(Ks)*len(models))
for stopping_criteria in tqdm.tqdm(STOPPING_CRITERIA):
    for answer_how in ANSWER_HOW:
        for sampling_strategy in SAMPLING_STRATEGY:
            for aggregation_strategy in AGGREGATION_STRATEGY:
                for k in Ks:
                    for model in tqdm.tqdm(models):
                        current_dir = os.path.join(main_dir, stopping_criteria+"_"+answer_how+"_"+sampling_strategy+"_"+aggregation_strategy+"_k"+str(k), model, config)
                        if os.path.exists(current_dir):
                            file = "generation_comparison_T_e_200_k_"+str(k)+".json"
                            file_path = os.path.join(current_dir, file)
                            if os.path.exists(file_path):
                                with open(file_path, "r") as f:
                                    data = json.load(f)
                                    exp_results.append({
                                        "stopping_criteria": stopping_criteria,
                                        "answer_how": answer_how,
                                        "sampling_strategy": sampling_strategy,
                                        "aggregation_strategy": aggregation_strategy,
                                        "k": k,
                                        "model": model,
                                        "accuracy": data["embedding_mixture"]["accuracy"],
                                        # "entropy": compute_entropy(data["embedding_mixture"]['phase_data']),
                                        # "length": compute_length(data["embedding_mixture"]['phase_data'])
                            })
          
                        else:
                            print(f"Directory {current_dir} does not exist")
pd.DataFrame(exp_results).to_csv("compiled_results.csv")

Total number of experiments:  768


  0%|          | 0/3 [00:00<?, ?it/s]

Directory ../soft_thinking/generation_comparison/end_token_answer_first_nucleus_weighted_sum_k1/Qwen_Qwen2_5_3B_Instruct/3000_200_0.8 does not exist


100%|██████████| 4/4 [00:05<00:00,  1.34s/it]


Directory ../soft_thinking/generation_comparison/end_token_answer_first_nucleus_weighted_sum_k2/Qwen_Qwen2_5_3B_Instruct/3000_200_0.8 does not exist


100%|██████████| 4/4 [00:05<00:00,  1.32s/it]


Directory ../soft_thinking/generation_comparison/end_token_answer_first_nucleus_weighted_sum_k4/Qwen_Qwen2_5_3B_Instruct/3000_200_0.8 does not exist


100%|██████████| 4/4 [00:05<00:00,  1.25s/it]


Directory ../soft_thinking/generation_comparison/end_token_answer_first_nucleus_weighted_sum_k8/Qwen_Qwen2_5_3B_Instruct/3000_200_0.8 does not exist


100%|██████████| 4/4 [00:05<00:00,  1.28s/it]


Directory ../soft_thinking/generation_comparison/end_token_answer_first_nucleus_element_wise_max_k1/Qwen_Qwen2_5_3B_Instruct/3000_200_0.8 does not exist


100%|██████████| 4/4 [09:48<00:00, 147.10s/it]


Directory ../soft_thinking/generation_comparison/end_token_answer_first_nucleus_element_wise_max_k2/Qwen_Qwen2_5_3B_Instruct/3000_200_0.8 does not exist




In [13]:
df = pd.read_csv("compiled_results.csv")

In [14]:
df[df['k']==2]

Unnamed: 0.1,Unnamed: 0,stopping_criteria,answer_how,sampling_strategy,aggregation_strategy,k,model,accuracy,entropy,length
7,7,end_token,answer_first,nucleus,weighted_sum,2,microsoft_phi_4_mini_instruct,0.68,0.524501,196.64
8,8,end_token,answer_first,nucleus,weighted_sum,2,microsoft_phi_4_mini_reasoning,0.60,0.114361,200.00
9,9,end_token,answer_first,nucleus,weighted_sum,2,Qwen_Qwen2_5_3B_Instruct,0.88,0.278717,200.00
10,10,end_token,answer_first,nucleus,weighted_sum,2,Qwen_Qwen2_5_0_5B_Instruct,0.28,0.287361,200.00
11,11,end_token,answer_first,nucleus,weighted_sum,2,Qwen_Qwen3_0_6B,0.48,0.347381,199.92
...,...,...,...,...,...,...,...,...,...,...
1325,1325,max_rounds,think_first,top_k,second_moment,2,Qwen_Qwen2_5_3B_Instruct,0.74,0.757274,200.00
1326,1326,max_rounds,think_first,top_k,second_moment,2,Qwen_Qwen2_5_0_5B_Instruct,0.12,0.896722,200.00
1327,1327,max_rounds,think_first,top_k,second_moment,2,Qwen_Qwen3_0_6B,0.02,0.649727,200.00
1328,1328,max_rounds,think_first,top_k,second_moment,2,Qwen_Qwen3_1_7B,0.76,0.234187,200.00


In [8]:
models_05b = ['Qwen_Qwen2_5_0_5B_Instruct','Qwen_Qwen3_0_6B']
models_15b = ['Qwen_Qwen3_1_7B']
models_3b = ['Qwen_Qwen2_5_3B_Instruct',  'Qwen_Qwen3_4B','microsoft_phi_4_mini_instruct','microsoft_phi_4_mini_reasoning']


In [9]:
df[df['model'].isin(models_05b)].sort_values(by='accuracy', ascending=False).to_excel("05b.xlsx")
df[df['model'].isin(models_15b)].sort_values(by='accuracy', ascending=False).to_excel("15b.xlsx")
df[df['model'].isin(models_3b)].sort_values(by='accuracy', ascending=False).to_excel("3b.xlsx")