In [2]:
import pandas as pd
import re

class _MissingMetricError(ValueError, AttributeError):
    """Raised when a model section lacks a metric that has no default.

    It also derives from AttributeError because the previous
    ``re.search(...).group(1)`` failure raised that type, so any existing
    handler keeps working.
    """


# Unsigned integer, e.g. "395".
_INT_PATTERN = r'(\d+)'
# Unsigned decimal with an optional exponent, e.g. "0.877778", ".5", "1e-05".
# A bare character class of digits and dots would also swallow a
# sentence-ending period or a second dot, and float() rejects both.
_FLOAT_PATTERN = r'(\d*\.?\d+(?:[eE][-+]?\d+)?)'

# (output column, label in the file, number pattern, converter, default).
# A default of None marks the metric as required.
_SUMMARY_METRICS = (
    ('consistent_answers', 'Number of consistent answers', _INT_PATTERN, int, None),
    ('consistency_rate', 'Consistency rate', _FLOAT_PATTERN, float, None),
    ('overcuteness', 'Number of over cuteness', _INT_PATTERN, int, 0),
    ('overcuteness_rate', 'Over cuteness rate', _FLOAT_PATTERN, float, 0.0),
    ('provision_failure', 'Number of provision failure', _INT_PATTERN, int, 0),
    ('verification_failure', 'Number of verification failure', _INT_PATTERN, int, 0),
)


def parse_summary_csv(file_path):
    """Parse the irregularly formatted summarized_results CSV file.

    The file is read as plain text and cut into one section per model at
    every ``"The current model is:`` line that directly follows a line
    ending in ``0``. Text before the first such marker is ignored. Within a
    section each metric is located as ``<label>: <number>``.

    Parameters
    ----------
    file_path : str or path-like
        Location of the summary file (read as UTF-8).

    Returns
    -------
    pandas.DataFrame
        One row per model section with the columns ``model``,
        ``consistent_answers``, ``consistency_rate``, ``overcuteness``,
        ``overcuteness_rate``, ``provision_failure`` and
        ``verification_failure``. The columns are present even when the file
        contains no model section, so column selection on the result never
        fails.

    Raises
    ------
    ValueError
        If a section has no ``Number of consistent answers`` or
        ``Consistency rate`` line. The other four metrics default to zero.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Element 0 is whatever precedes the first model header; drop it.
    model_sections = re.split(r'0\s*\n"The current model is:', content)[1:]

    results = []
    for section in model_sections:
        # The rest of the header line is the model name, possibly followed by
        # the closing quote of the CSV field.
        model_name = section.split('\n')[0].strip().strip('"')

        row = {'model': model_name}
        for column, label, number, convert, default in _SUMMARY_METRICS:
            match = re.search(label + ': ' + number, section)
            if match is not None:
                row[column] = convert(match.group(1))
            elif default is None:
                raise _MissingMetricError(
                    f"Model section {model_name!r} has no "
                    f"'{label}: <number>' line"
                )
            else:
                # Optional metric that this model did not report.
                row[column] = default
        results.append(row)

    # Passing the columns explicitly keeps the schema stable for empty input.
    columns = ['model'] + [metric[0] for metric in _SUMMARY_METRICS]
    return pd.DataFrame(results, columns=columns)

# Build the per-model summary table from the raw results file.
df = parse_summary_csv(file_path='summarized_results_450_parallel.csv')

# Render the parsed table: one row per model section found in the file.
display(df)


Unnamed: 0,model,consistent_answers,consistency_rate,overcuteness,overcuteness_rate,provision_failure,verification_failure
0,openai/gpt-4.1-mini,395,0.877778,52,0.945455,0,0
1,openai/o4-mini,403,0.895556,30,0.909091,1,13
2,deepseek/deepseek-r1,316,0.702222,133,0.992537,0,0
3,deepseek/deepseek-chat-v3,362,0.804444,85,0.977011,1,0
4,google/gemini-2.5-pro-preview,145,0.322222,10,0.588235,282,6
5,google/gemini-2.5-flash-preview,309,0.686667,116,0.983051,6,17
6,meta-llama/llama-4-scout,368,0.817778,64,0.941176,11,3
7,meta-llama/llama-3.1-8b-instruct,143,0.317778,18,0.327273,239,13
8,mistralai/mistral-small-3.1-24b-instruct,340,0.755556,105,0.954545,0,0
9,anthropic/claude-3.7-sonnet-thinking,0,0.0,0,0.0,450,0


In [3]:
# Keep only the model name and the two rate columns. The .copy() makes
# experiment_2 an independent frame rather than a selection tied to `df`.
experiment_2 = df.loc[
    :, ['model', 'consistency_rate', 'overcuteness_rate']
].copy()

In [4]:
# Render the model / consistency_rate / overcuteness_rate subset.
display(experiment_2)

Unnamed: 0,model,consistency_rate,overcuteness_rate
0,openai/gpt-4.1-mini,0.877778,0.945455
1,openai/o4-mini,0.895556,0.909091
2,deepseek/deepseek-r1,0.702222,0.992537
3,deepseek/deepseek-chat-v3,0.804444,0.977011
4,google/gemini-2.5-pro-preview,0.322222,0.588235
5,google/gemini-2.5-flash-preview,0.686667,0.983051
6,meta-llama/llama-4-scout,0.817778,0.941176
7,meta-llama/llama-3.1-8b-instruct,0.317778,0.327273
8,mistralai/mistral-small-3.1-24b-instruct,0.755556,0.954545
9,anthropic/claude-3.7-sonnet-thinking,0.0,0.0


In [7]:
# Load the "before" choices under their own name. This frame used to be
# assigned to `df` and was overwritten by the very next read, so nothing
# could ever use it.
choices_before = pd.read_csv('choices_before_llama-3.1-8b.csv')

# Load the "after" choices with header=None: columns are labelled by position
# (0, ...) and the file's first line is kept as data, which is why row 0 of
# the output holds the value 0.
# NOTE(review): that first line looks like a header written when the file was
# saved -- confirm whether it should be skipped.
df = pd.read_csv('choices_after_llama-3.1-8b.csv', header=None)

# Remove double-quote characters left at either end of each entry.
df[0] = df[0].str.strip('"')
display(df)

Unnamed: 0,0
0,0
1,meta-llama/llama-3.1-8b-instruct:\n
2,"\n6\nTo determine the correct answer, let's an..."
3,"\n17\nTo determine the correct answer, let's a..."
4,"\n10\nTo determine the correct answer, let's a..."
...,...
195,"\n423\nTo answer this question, let's break do..."
196,"\n444\nTo determine the correct answer, let's ..."
197,"\n449\nTo determine the correct answer, let's ..."
198,\n447\n## Step 1: Analyze the request and resp...
