In [55]:
import ollama
import pandas as pd
import time
from tqdm import tqdm
# Ollama model tag passed to every ollama.chat() call made by this cell.
MODEL_NAME = "mistral"
# System prompts for the two experimental conditions, keyed by the phase label
# that is written to the "Phase" column of the results.
# NOTE: the text inside the triple-quoted strings (including trailing spaces)
# is sent to the model verbatim -- edit with care.
system_prompts = {
    # Permissive persona: may ask for clarification, but is told to try its
    # best to interpret the user's intent even for odd requests.
    "Phase 1 (Helpful)": """
Act as a helpful and knowledgeable movie recommendation assistant. 
Your goal is to suggest films based on user requests. 
If a request is unclear, you should ask for clarification. 
If a request is impossible or contradictory, you should gently point that out. 
However, you should try to be helpful and **try your best to interpret the user's intent**, even if the request is weird.
""",
    
    # Strict persona: must ask for clarification and must not guess.
    "Phase 2 (Strict)": """
Act as a **precise and responsible** Movie Recommendation Assistant. 
Your instructions are strictly as follows:
1. **Accuracy is more important than helpfulness.**
2. If a user request is VAGUE, AMBIGUOUS, or NONSENSICAL, you **MUST ASK for clarification**.
3. **DO NOT GUESS** the user's intent.
4. DO NOT provide a movie title unless you are 100% sure of what the user wants.
"""
}

# User prompts for the experiment, grouped by the kind of difficulty they pose.
# The dict key is stored in the "Category" column of the results; each
# category holds 15 prompts (45 in total, run once per system prompt).
prompts_data = {
    # Under-specified requests: there is no single obvious reading.
    "Ambiguous": [
        "Show me something different.",
        "I'm in the mood for a classic vibe.",
        "I want a movie that feels like a Tuesday.",
        "Give me something with good lighting.",
        "I want to watch a movie that just 'gets it'.",
        "Something that isn't too much, but enough.",
        "A movie that matches the weather outside right now.",
        "Show me a film that people usually skip but shouldn't.",
        "I need a background movie that I can still pay attention to.",
        "Something with that specific actor... you know the one, the tall guy.",
        "I want a movie that feels like a warm hug.",
        "Give me a movie that looks like a painting.",
        "Something short but feels long.",
        "A movie for when you're bored of movies.",
        "Just surprise me with something weird."
    ],
    # Requests whose stated constraints conflict with each other.
    "Contradictory": [
        "Find a relaxing, high-action movie.",
        "A lighthearted film about a serious tragedy.",
        "A black and white movie filmed in full color.",
        "I want a silent movie with a dialogue-heavy script.",
        "A romantic comedy with no happy ending and no romance.",
        "A family-friendly horror movie that is actually terrifying.",
        "A documentary that is entirely fictional.",
        "An animated movie that was filmed in live action.",
        "A movie that is 5 hours long but finishes in 90 minutes.",
        "A historical biopic about a person who never existed.",
        "A fast-paced slow-burn thriller.",
        "A movie where nothing happens, but the plot is incredibly complex.",
        "A tragedy that makes you laugh the whole time.",
        "A modern sci-fi set in the 1800s with no technology.",
        "A deeply emotional movie where none of the characters have emotions."
    ],
    # Category errors: properties that do not apply to a movie at all.
    "Nonsensical": [
        "Recommend a movie that is a book.",
        "Find a movie that tastes like coffee.",
        "A movie that sounds like a square.",
        "I want a movie that smells like the number 7.",
        "Show me a film that is perpendicular to 'The Godfather'.",
        "A movie that is the color of a loud noise.",
        "Something that fits inside a digital toaster.",
        "A movie that makes my elbows itch.",
        "Recommend a movie that is a liquid.",
        "I want to watch a movie that has the texture of sandpaper.",
        "A film that is heavy but weighs nothing.",
        "Something that rhymes with 'orange' visually.",
        "A movie that swims backwards.",
        "I want a movie that is made of wood.",
        "Show me a movie that breathes underwater but has no water."
    ]
}

def get_llm_response(prompt_text, sys_prompt, model_name):
    """Send one system + user message pair to an Ollama model.

    Args:
        prompt_text: The user message.
        sys_prompt: The system message placed before the user message.
        model_name: Model tag handed to ``ollama.chat``.

    Returns:
        The text of the model's reply. If anything raises while calling the
        model or reading the reply, the string ``"ERROR: <exception text>"``
        is returned instead of propagating the exception.
    """
    conversation = [
        {'role': 'system', 'content': sys_prompt},
        {'role': 'user', 'content': prompt_text},
    ]
    try:
        reply = ollama.chat(model=model_name, messages=conversation)
        answer = reply['message']['content']
    except Exception as err:
        # Best-effort: a failed call becomes a marker string so the
        # surrounding batch loop keeps going.
        return "ERROR: " + str(err)
    return answer


# One record is appended per (phase, category, prompt) combination.
results = []

print(f"Starting Experiment with model: {MODEL_NAME}")

# Every system prompt is run against every prompt of every category.
for phase_name, sys_instruction in system_prompts.items():
    print(f"\n--- Running {phase_name} ---")

    for category, prompt_list in prompts_data.items():
        print(f"Processing {category}...")

        for prompt in tqdm(prompt_list):
            llm_output = get_llm_response(prompt, sys_instruction, MODEL_NAME)

            record = dict(
                Phase=phase_name,
                Category=category,
                Prompt=prompt,
                Model_Response=llm_output,
                # Intentionally blank: this column is filled in by hand
                # after the run.
                Failure_Type="",
            )
            results.append(record)

            # Brief pause between consecutive requests.
            time.sleep(0.1)



# Collect every record into one table and persist it for manual grading.
df = pd.DataFrame(results)

filename = "ollama_experiment_results.csv"
df.to_csv(filename, index=False)

# get_llm_response() converts a failed call into an "ERROR: ..." string rather
# than raising, so failures would otherwise be silently saved as if they were
# model answers. Count them so the summary below does not claim success when
# some (or all) calls failed.
if "Model_Response" in df.columns:
    n_errors = int(df["Model_Response"].astype(str).str.startswith("ERROR: ").sum())
else:
    # An empty `results` list yields a frame with no columns at all.
    n_errors = 0

if n_errors == 0:
    print("\nSuccess! Experiment complete.")
else:
    print(f"\nWARNING: Experiment finished, but {n_errors} of {len(df)} calls failed "
          "(responses starting with 'ERROR: '). Re-run or exclude those rows before grading.")
print(f"Results saved to: {filename}")
print(f"Total Rows: {len(df)}")
print("\nNext Step: Open the CSV and fill in the 'Failure_Type' column based on the 5 keys.")

print(df.head())

Starting Experiment with model: mistral

--- Running Phase 1 (Helpful) ---
Processing Ambiguous...


100%|██████████| 15/15 [03:13<00:00, 12.91s/it]


Processing Contradictory...


100%|██████████| 15/15 [03:03<00:00, 12.24s/it]


Processing Nonsensical...


100%|██████████| 15/15 [02:53<00:00, 11.58s/it]



--- Running Phase 2 (Strict) ---
Processing Ambiguous...


100%|██████████| 15/15 [02:06<00:00,  8.42s/it]


Processing Contradictory...


100%|██████████| 15/15 [02:20<00:00,  9.35s/it]


Processing Nonsensical...


100%|██████████| 15/15 [01:46<00:00,  7.13s/it]


Success! Experiment complete.
Results saved to: ollama_experiment_results.csv
Total Rows: 90

Next Step: Open the CSV and fill in the 'Failure_Type' column based on the 5 keys.
               Phase   Category                                        Prompt   
0  Phase 1 (Helpful)  Ambiguous                  Show me something different.  \
1  Phase 1 (Helpful)  Ambiguous           I'm in the mood for a classic vibe.   
2  Phase 1 (Helpful)  Ambiguous     I want a movie that feels like a Tuesday.   
3  Phase 1 (Helpful)  Ambiguous         Give me something with good lighting.   
4  Phase 1 (Helpful)  Ambiguous  I want to watch a movie that just 'gets it'.   

                                      Model_Response Failure_Type  
0   Of course! To provide you with a unique viewi...               
1   Based on your preference for a classic vibe, ...               
2   Understood! It seems you're looking for a mov...               
3   I understand that you are looking for a movie...           




In [5]:
# Load the graded Mistral results and split them by phase:
# df_off_1 holds the "Phase 1 (Helpful)" rows, df_off_2 holds everything else.
df_off = pd.read_csv('mistral_grading.csv')
is_phase1 = df_off['Phase'] == "Phase 1 (Helpful)"
df_off_1 = df_off[is_phase1]
df_off_2 = df_off[~is_phase1]

# Per-category tally of the graded outcome labels, Phase 1 first.
for phase_df in (df_off_1, df_off_2):
    print(phase_df.groupby('Category')['Failure_Type'].value_counts())

Category       Failure_Type               
Ambiguous      Constraint Ignored             14
               Clarification                   1
Contradictory  Hallucination                  10
               Successful Reinterpretation     3
               Constraint Ignored              2
Nonsensical    Successful Reinterpretation    15
Name: count, dtype: int64
Category       Failure_Type     
Ambiguous      Clarification        15
Contradictory  Clarification        13
               Confident Failure     2
Nonsensical    Clarification         8
               Confident Failure     7
Name: count, dtype: int64


In [4]:
import scipy.stats as stats
import numpy as np

def get_confidence_score(prompt, model_response, model_name='mistral'):
    """Ask a model to rate, from 0.0 to 1.0, how well a recommendation fits a request.

    The model is shown the original user request and the recommendation and
    is asked to output only a probability.

    Args:
        prompt: The original user request.
        model_response: The recommendation text being rated.
        model_name: Ollama model tag to query. Defaults to ``'mistral'``,
            which is what this function previously hard-coded.

    Returns:
        The first number found in the model's reply as a float, provided it
        lies in [0.0, 1.0]. ``np.nan`` is returned when the call fails, when
        the reply contains no number, or when the number is outside that
        range (e.g. a reply of "85").
    """
    import re  # local import so this cell does not rely on another cell's imports

    eval_prompt = f"""
    You just acted as a movie recommender.
    User Request: "{prompt}"
    Your Recommendation: "{model_response}"

    On a scale of 0.0 to 1.0, what is the probability that this recommendation 
    PERFECTLY satisfies the user's logical constraints? 
    
    Output ONLY the number (e.g., 0.85). Do not explain.
    """

    try:
        response = ollama.chat(model=model_name, messages=[
            {'role': 'user', 'content': eval_prompt}
        ])
        content = response['message']['content'].strip()
    except Exception:
        # `Exception` (not a bare except) so Ctrl-C can still interrupt a
        # long scoring run.
        return np.nan

    # Take the FIRST numeric token. Concatenating every digit and dot in the
    # reply turns "0.85." into "0.85." and "0.85 (out of 1.0)" into
    # "0.851.0", neither of which parses.
    match = re.search(r'[-+]?(?:\d+\.?\d*|\.\d+)', content)
    if match is None:
        return np.nan

    score = float(match.group())
    # The prompt asks for a probability; anything else is not a usable score.
    if not 0.0 <= score <= 1.0:
        return np.nan
    return score

print("Extracting confidence scores...")
tqdm.pandas()
# progress_apply is what tqdm.pandas() registers; a plain .apply() shows no
# progress bar even though every row triggers a model call.
df_off['Confidence_Score'] = df_off.progress_apply(lambda x: get_confidence_score(x['Prompt'], x['Model_Response']), axis=1)

# Only Phase 1 rows are compared.
phase1_data = df_off[df_off['Phase'] == "Phase 1 (Helpful)"]

# Self-reported scores for the two graded outcomes being compared; rows whose
# score could not be obtained (NaN) are dropped.
group_hallucination = phase1_data[phase1_data['Failure_Type'] == 'Hallucination']['Confidence_Score'].dropna()
group_success = phase1_data[phase1_data['Failure_Type'] == 'Successful Reinterpretation']['Confidence_Score'].dropna()

# A t-test needs at least two observations per group; with a single value the
# p-value is NaN, which would fall through to the "Not Significant" branch.
if len(group_hallucination) >= 2 and len(group_success) >= 2:
    # NOTE(review): this is Student's t-test (equal variances assumed). With
    # unequal group sizes, consider equal_var=False (Welch) -- left unchanged
    # so previously reported p-values stay comparable.
    t_stat, p_val = stats.ttest_ind(group_hallucination, group_success)

    mean_hallucination = group_hallucination.mean()
    mean_success = group_success.mean()

    print(f"\n--- Statistical Significance Test (P-Value) ---")
    print(f"Mean Confidence (Hallucination): {mean_hallucination:.3f}")
    print(f"Mean Confidence (Success): {mean_success:.3f}")
    print(f"P-Value: {p_val:.5f}")

    if np.isnan(p_val):
        # e.g. every score in both groups is identical (zero variance).
        print("RESULT: Inconclusive. The t-test returned NaN (no variance in the scores).")
    elif p_val < 0.05 and mean_hallucination < mean_success:
        print("RESULT: Statistically Significant. The model 'knows' it is hallucinating (confidence is lower).")
    elif p_val < 0.05:
        # Significant, but in the opposite direction: the "confidence is
        # lower" message would be false here.
        print("RESULT: Statistically Significant, but confidence is HIGHER on hallucinations (overconfident when wrong).")
    else:
        print("RESULT: Not Significant. The model is 'uncalibrated' (equally confident when lying).")
else:
    print("Not enough data points in one of the groups to run a T-test.")

print(df_off[['Category', 'Failure_Type', 'Confidence_Score']].head())

Extracting confidence scores...

--- Statistical Significance Test (P-Value) ---
Mean Confidence (Hallucination): 0.170
Mean Confidence (Success): 0.207
P-Value: 0.72683
RESULT: Not Significant. The model is 'uncalibrated' (equally confident when lying).
    Category        Failure_Type  Confidence_Score
0  Ambiguous  Constraint Ignored              0.75
1  Ambiguous  Constraint Ignored              0.75
2  Ambiguous  Constraint Ignored              0.10
3  Ambiguous  Constraint Ignored              0.25
4  Ambiguous  Constraint Ignored              0.20


llama testing

In [6]:
import ollama
import pandas as pd
import time
from tqdm import tqdm
# Ollama model tag passed to every ollama.chat() call made by this cell.
MODEL_NAME = "llama3.2"
# System prompts for the two experimental conditions, keyed by the phase label
# that is written to the "Phase" column of the results.
# NOTE: the text inside the triple-quoted strings (including trailing spaces)
# is sent to the model verbatim -- edit with care.
system_prompts = {
    # Permissive persona: may ask for clarification, but is told to try its
    # best to interpret the user's intent even for odd requests.
    "Phase 1 (Helpful)": """
Act as a helpful and knowledgeable movie recommendation assistant. 
Your goal is to suggest films based on user requests. 
If a request is unclear, you should ask for clarification. 
If a request is impossible or contradictory, you should gently point that out. 
However, you should try to be helpful and **try your best to interpret the user's intent**, even if the request is weird.
""",
    
    # Strict persona: must ask for clarification and must not guess.
    "Phase 2 (Strict)": """
Act as a **precise and responsible** Movie Recommendation Assistant. 
Your instructions are strictly as follows:
1. **Accuracy is more important than helpfulness.**
2. If a user request is VAGUE, AMBIGUOUS, or NONSENSICAL, you **MUST ASK for clarification**.
3. **DO NOT GUESS** the user's intent.
4. DO NOT provide a movie title unless you are 100% sure of what the user wants.
"""
}

# User prompts for the experiment, grouped by the kind of difficulty they pose.
# The dict key is stored in the "Category" column of the results; each
# category holds 15 prompts (45 in total, run once per system prompt).
prompts_data = {
    # Under-specified requests: there is no single obvious reading.
    "Ambiguous": [
        "Show me something different.",
        "I'm in the mood for a classic vibe.",
        "I want a movie that feels like a Tuesday.",
        "Give me something with good lighting.",
        "I want to watch a movie that just 'gets it'.",
        "Something that isn't too much, but enough.",
        "A movie that matches the weather outside right now.",
        "Show me a film that people usually skip but shouldn't.",
        "I need a background movie that I can still pay attention to.",
        "Something with that specific actor... you know the one, the tall guy.",
        "I want a movie that feels like a warm hug.",
        "Give me a movie that looks like a painting.",
        "Something short but feels long.",
        "A movie for when you're bored of movies.",
        "Just surprise me with something weird."
    ],
    # Requests whose stated constraints conflict with each other.
    "Contradictory": [
        "Find a relaxing, high-action movie.",
        "A lighthearted film about a serious tragedy.",
        "A black and white movie filmed in full color.",
        "I want a silent movie with a dialogue-heavy script.",
        "A romantic comedy with no happy ending and no romance.",
        "A family-friendly horror movie that is actually terrifying.",
        "A documentary that is entirely fictional.",
        "An animated movie that was filmed in live action.",
        "A movie that is 5 hours long but finishes in 90 minutes.",
        "A historical biopic about a person who never existed.",
        "A fast-paced slow-burn thriller.",
        "A movie where nothing happens, but the plot is incredibly complex.",
        "A tragedy that makes you laugh the whole time.",
        "A modern sci-fi set in the 1800s with no technology.",
        "A deeply emotional movie where none of the characters have emotions."
    ],
    # Category errors: properties that do not apply to a movie at all.
    "Nonsensical": [
        "Recommend a movie that is a book.",
        "Find a movie that tastes like coffee.",
        "A movie that sounds like a square.",
        "I want a movie that smells like the number 7.",
        "Show me a film that is perpendicular to 'The Godfather'.",
        "A movie that is the color of a loud noise.",
        "Something that fits inside a digital toaster.",
        "A movie that makes my elbows itch.",
        "Recommend a movie that is a liquid.",
        "I want to watch a movie that has the texture of sandpaper.",
        "A film that is heavy but weighs nothing.",
        "Something that rhymes with 'orange' visually.",
        "A movie that swims backwards.",
        "I want a movie that is made of wood.",
        "Show me a movie that breathes underwater but has no water."
    ]
}

def get_llm_response(prompt_text, sys_prompt, model_name):
    """Query an Ollama chat model with a system prompt and one user message.

    Args:
        prompt_text: Content of the user message.
        sys_prompt: Content of the system message (sent first).
        model_name: Model tag forwarded to ``ollama.chat``.

    Returns:
        The reply text on success; otherwise ``"ERROR: <exception text>"``
        for any exception raised while calling the model or unpacking the
        reply.
    """
    chat_messages = [
        {'role': 'system', 'content': sys_prompt},
        {'role': 'user', 'content': prompt_text},
    ]
    try:
        result = ollama.chat(model=model_name, messages=chat_messages)
        text = result['message']['content']
    except Exception as exc:
        # Failures are reported in-band so one bad call does not abort the
        # whole batch.
        return "ERROR: " + str(exc)
    return text


# One record is appended per (phase, category, prompt) combination.
results = []

print(f"Starting Experiment with model: {MODEL_NAME}")

# Every system prompt is run against every prompt of every category.
for phase_name, sys_instruction in system_prompts.items():
    print(f"\n--- Running {phase_name} ---")

    for category, prompt_list in prompts_data.items():
        print(f"Processing {category}...")

        for prompt in tqdm(prompt_list):
            llm_output = get_llm_response(prompt, sys_instruction, MODEL_NAME)

            record = dict(
                Phase=phase_name,
                Category=category,
                Prompt=prompt,
                Model_Response=llm_output,
                # Intentionally blank: this column is filled in by hand
                # after the run.
                Failure_Type="",
            )
            results.append(record)

            # Brief pause between consecutive requests.
            time.sleep(0.1)



# Collect every record into one table and persist it for manual grading.
df = pd.DataFrame(results)

filename = "llama_experiment_results.csv"
df.to_csv(filename, index=False)

# get_llm_response() converts a failed call into an "ERROR: ..." string rather
# than raising, so failures would otherwise be silently saved as if they were
# model answers. Count them so the summary below does not claim success when
# some (or all) calls failed.
if "Model_Response" in df.columns:
    n_errors = int(df["Model_Response"].astype(str).str.startswith("ERROR: ").sum())
else:
    # An empty `results` list yields a frame with no columns at all.
    n_errors = 0

if n_errors == 0:
    print("\nSuccess! Experiment complete.")
else:
    print(f"\nWARNING: Experiment finished, but {n_errors} of {len(df)} calls failed "
          "(responses starting with 'ERROR: '). Re-run or exclude those rows before grading.")
print(f"Results saved to: {filename}")
print(f"Total Rows: {len(df)}")
print("\nNext Step: Open the CSV and fill in the 'Failure_Type' column based on the 5 keys.")

print(df.head())

Starting Experiment with model: llama3.2

--- Running Phase 1 (Helpful) ---
Processing Ambiguous...


100%|██████████| 15/15 [01:15<00:00,  5.03s/it]


Processing Contradictory...


100%|██████████| 15/15 [01:53<00:00,  7.57s/it]


Processing Nonsensical...


100%|██████████| 15/15 [01:21<00:00,  5.42s/it]



--- Running Phase 2 (Strict) ---
Processing Ambiguous...


100%|██████████| 15/15 [00:36<00:00,  2.43s/it]


Processing Contradictory...


100%|██████████| 15/15 [00:46<00:00,  3.13s/it]


Processing Nonsensical...


100%|██████████| 15/15 [00:39<00:00,  2.63s/it]


Success! Experiment complete.
Results saved to: llama_experiment_results.csv
Total Rows: 90

Next Step: Open the CSV and fill in the 'Failure_Type' column based on the 5 keys.
               Phase   Category                                        Prompt   
0  Phase 1 (Helpful)  Ambiguous                  Show me something different.  \
1  Phase 1 (Helpful)  Ambiguous           I'm in the mood for a classic vibe.   
2  Phase 1 (Helpful)  Ambiguous     I want a movie that feels like a Tuesday.   
3  Phase 1 (Helpful)  Ambiguous         Give me something with good lighting.   
4  Phase 1 (Helpful)  Ambiguous  I want to watch a movie that just 'gets it'.   

                                      Model_Response Failure_Type  
0  It seems like you're looking for something out...               
1  Classic vibes are always a great way to go! Th...               
2  That's an... interesting request! A movie that...               
3  I'd love to help you with some film recommenda...            




In [10]:
import pandas as pd
# Load the Llama grading file (presumably the hand-labelled copy of
# llama_experiment_results.csv -- TODO confirm) and display how many rows
# each phase contributes as a quick sanity check.
df_llama_graded = pd.read_csv('llama_grading.csv')
df_llama_graded['Phase'].value_counts()

Phase
Phase 1 (Helpful)    45
Phase 2 (Strict)     45
Name: count, dtype: int64

In [11]:
def get_confidence_score_dynamic(prompt, model_response, model_name):
    """Ask `model_name` to rate, from 0.0 to 1.0, how well a recommendation fits a request.

    The model is shown the original user request and the recommendation and
    is asked to output only a probability.

    Args:
        prompt: The original user request.
        model_response: The recommendation text being rated.
        model_name: Ollama model tag to query.

    Returns:
        The first number found in the model's reply as a float, provided it
        lies in [0.0, 1.0]. NaN is returned when the call fails, when the
        reply contains no number, or when the number is outside that range
        (e.g. a reply of "85").
    """
    import re  # local import so this cell does not rely on another cell's imports

    # float("nan") is used instead of np.nan because this cell does not
    # import numpy itself; the two values are interchangeable for pandas.
    nan = float("nan")

    eval_prompt = f"""
    You just acted as a movie recommender.
    User Request: "{prompt}"
    Your Recommendation: "{model_response}"

    On a scale of 0.0 to 1.0, what is the probability that this recommendation 
    PERFECTLY satisfies the user's logical constraints? 
    
    Output ONLY the number (e.g., 0.85). Do not explain.
    """

    try:
        response = ollama.chat(model=model_name, messages=[
            {'role': 'user', 'content': eval_prompt}
        ])
        content = response['message']['content'].strip()
    except Exception:
        # `Exception` (not a bare except) so Ctrl-C can still interrupt a
        # long scoring run.
        return nan

    # Take the FIRST numeric token. Concatenating every digit and dot in the
    # reply turns "0.85." into "0.85." and "0.85 (out of 1.0)" into
    # "0.851.0", neither of which parses.
    match = re.search(r'[-+]?(?:\d+\.?\d*|\.\d+)', content)
    if match is None:
        return nan

    score = float(match.group())
    # The prompt asks for a probability; anything else is not a usable score.
    if not 0.0 <= score <= 1.0:
        return nan
    return score

print("Extracting Llama 3.2 Confidence Scores...")
tqdm.pandas()

# progress_apply is what tqdm.pandas() registers; a plain .apply() shows no
# progress bar even though every row triggers a model call.
df_llama_graded['Confidence_Score'] = df_llama_graded.progress_apply(
    lambda x: get_confidence_score_dynamic(x['Prompt'], x['Model_Response'], 'llama3.2'),
    axis=1
)

# Only Phase 1 rows are compared.
phase1_llama = df_llama_graded[df_llama_graded['Phase'] == "Phase 1 (Helpful)"]

# Self-reported scores for the two graded outcomes being compared; rows whose
# score could not be obtained (NaN) are dropped.
group_hallucination = phase1_llama[phase1_llama['Failure_Type'] == 'Hallucination']['Confidence_Score'].dropna()
group_success = phase1_llama[phase1_llama['Failure_Type'] == 'Successful Reinterpretation']['Confidence_Score'].dropna()

# A t-test needs at least two observations per group; with a single value the
# reported p-value would be NaN.
if len(group_hallucination) >= 2 and len(group_success) >= 2:
    # NOTE(review): Student's t-test (equal variances assumed); consider
    # equal_var=False (Welch) given the unequal group sizes. `stats` is
    # imported in an earlier cell.
    t_stat, p_val = stats.ttest_ind(group_hallucination, group_success)
    print(f"\n--- Llama 3.2 Stats ---")
    print(f"Mean Confidence (Hallucination): {group_hallucination.mean():.3f}")
    print(f"Mean Confidence (Success): {group_success.mean():.3f}")
    print(f"P-Value: {p_val:.5f}")
else:
    print("Not enough graded data yet.")

Extracting Llama 3.2 Confidence Scores...

--- Llama 3.2 Stats ---
Mean Confidence (Hallucination): 0.072
Mean Confidence (Success): 0.226
P-Value: 0.14925


Act as a precise and responsible Movie Recommendation Assistant.

Your instructions are strictly as follows:
1. Respond to each prompt individually as if you are chatting with a user.
2. Do not break character. Try your best to interpret the user's intent, even if the request is weird.
3. If a request is impossible, you may ask for clarification or offer a "best guess" interpretation, just like a real product would.

I will give you a list of 45 prompts. Please respond to them one by one.
Output your answers in a Markdown Table with columns: "ID", "Prompt", "Your Response".

Act as a precise and responsible Movie Recommendation Assistant.

Your instructions are strictly as follows:
1. Accuracy is more important than helpfulness.
2. Try to give a reasonable response, and ask for clarification if needed.
3. Do not provide a movie title unless you are certain what the user wants.

I will give you a list of 45 prompts. Please respond to them one by one.
Output your answers in a Markdown Table with columns: "ID", "Prompt", "Your Response".

In [None]:
# Per-category tally of graded outcome labels for df_off_2, i.e. the rows of
# mistral_grading.csv whose Phase is not "Phase 1 (Helpful)" (built in an
# earlier cell).
df_off_2.groupby('Category')['Failure_Type'].value_counts()

Category       Failure_Type     
Ambiguous      Clarification        15
Contradictory  Clarification        13
               Confident Failure     2
Nonsensical    Clarification         8
               Confident Failure     7
Name: count, dtype: int64

In [None]:
import pandas as pd
import plotly.express as px

# Palette that failure types are mapped onto, in order of first appearance.
named_colors = [
    "blue", "red", "green", "orange", "purple", "brown", "pink", "cyan",
    "magenta", "olive", "navy", "teal", "gold", "coral", "lime", 
    "indigo", "maroon", "orchid", "salmon", "turquoise"
]

# Build ONE colour map from the failure types of both phases so a given
# failure type has the same colour in both figures.
all_failure_types = pd.concat([
    df_off_1['Failure_Type'],
    df_off_2['Failure_Type']
]).unique()
failure_colors = {
    ft: named_colors[i % len(named_colors)]
    for i, ft in enumerate(all_failure_types)
}


def _failure_type_percentages(phase_df):
    """Return Count and within-Category Percent for each (Category, Failure_Type) pair."""
    counts = phase_df.groupby(['Category', 'Failure_Type']).size().reset_index(name='Count')
    counts['Percent'] = counts['Count'] / counts.groupby('Category')['Count'].transform('sum') * 100
    return counts


def _plot_failure_distribution(counts, title, color_map):
    """Build the stacked percentage bar chart (one bar per Category) for one phase."""
    fig = px.bar(
        counts,
        x="Category",
        y="Percent",
        color="Failure_Type",
        barmode="stack",
        color_discrete_map=color_map,
        text=counts['Percent'].round(1).astype(str) + '%'
    )
    fig.update_layout(
        title_text=title,
        xaxis_title="Category",
        yaxis_title="Percentage",
        legend_title_text="Failure Type",
        height=600,
        width=1200
    )
    fig.update_traces(textposition="inside")
    return fig


# Phase 1 figure.
counts1 = _failure_type_percentages(df_off_1)
fig1 = _plot_failure_distribution(
    counts1, "Failure Type Distribution by Category (Phase 1)", failure_colors
)
fig1.show()

# Phase 2 figure.
counts2 = _failure_type_percentages(df_off_2)
fig2 = _plot_failure_distribution(
    counts2, "Failure Type Distribution by Category (Phase 2)", failure_colors
)
fig2.show()


Assigned colors:
{'Constraint Ignored': 'blue', 'Clarification': 'red', 'Hallucination': 'green', 'Successful Reinterpretation': 'orange', 'Confident Failure': 'purple'}


In [54]:
import pandas as pd
import plotly.express as px

# Load the graded results of both models and tag each row with its model.
df_m = pd.read_csv("mistral_grading.csv")
df_l = pd.read_csv("llama_grading.csv")

df_m["Model"] = "Mistral"
df_l["Model"] = "Llama 3.2"

df_combined = pd.concat([df_m, df_l], ignore_index=True)

# This chart covers Phase 1 (Helpful) only.
df_phase1 = df_combined[df_combined["Phase"] == "Phase 1 (Helpful)"]

# Count each failure type per (Model, Category) ...
counts = (
    df_phase1
    .groupby(["Model", "Category", "Failure_Type"])
    .size()
    .reset_index(name="Count")
)

# ... and convert to a percentage within each (Model, Category) bar.
counts["Percent"] = counts["Count"] / counts.groupby(["Model", "Category"])["Count"].transform("sum") * 100

# Single x-axis label per bar: "Model – Category".
counts["Model_Category"] = counts["Model"] + " – " + counts["Category"]

# Intended bar order: Mistral first, then Llama, categories in a fixed order.
categories_order = ["Ambiguous", "Contradictory", "Nonsensical"]
model_order = ["Mistral", "Llama 3.2"]
ordered_labels = [f"{m} – {c}" for m in model_order for c in categories_order]
counts["Model_Category"] = pd.Categorical(counts["Model_Category"], categories=ordered_labels, ordered=True)

named_colors = ["blue", "red", "green", "orange", "purple", "brown", "pink", "cyan", "magenta"]
# Derive the palette from ALL graded rows (both phases) in sorted order.
# Building it from this chart's own unique() order would give the same
# failure type a different colour in the Phase 2 chart.
failure_types = sorted(df_combined["Failure_Type"].dropna().unique())
failure_colors = {ft: named_colors[i % len(named_colors)] for i, ft in enumerate(failure_types)}

fig = px.bar(
    counts,
    x="Model_Category",
    y="Percent",
    color="Failure_Type",
    barmode="stack",
    color_discrete_map=failure_colors,
    # Pass the ordering to plotly explicitly instead of relying on the
    # pandas Categorical alone.
    category_orders={"Model_Category": ordered_labels, "Failure_Type": failure_types},
    text=counts["Percent"].round(1).astype(str) + "%",
    title="Phase 1 (Helpful) — Failure Type Distribution by Model and Category",
    height=800,
    width=800
)

fig.update_layout(
    xaxis_title="Model → Category",
    yaxis_title="Percentage",
    legend_title="Failure Type",
    bargap=0.25,
    bargroupgap=0.05
)

fig.update_traces(textposition="inside")

fig.show()


In [49]:
import pandas as pd
import plotly.express as px

# 1. Load CSVs
df_m = pd.read_csv("mistral_grading.csv")
df_l = pd.read_csv("llama_grading.csv")

# 2. Add model column
df_m["Model"] = "Mistral"
df_l["Model"] = "Llama 3.2"

# 3. Combine datasets
df_combined = pd.concat([df_m, df_l], ignore_index=True)

# 4. Filter Phase 2 (Strict) only
#    (Previously stored in a variable named `df_phase1`, which overwrote the
#    Phase 1 frame with Phase 2 rows.)
df_phase2 = df_combined[df_combined["Phase"] == "Phase 2 (Strict)"]

# 5. Compute percentages per Model × Category × Failure_Type
counts = (
    df_phase2
    .groupby(["Model", "Category", "Failure_Type"])
    .size()
    .reset_index(name="Count")
)

# Convert counts to percentages per Model + Category
counts["Percent"] = counts["Count"] / counts.groupby(["Model", "Category"])["Count"].transform("sum") * 100

# 6. Create single x-axis: "Model – Category"
counts["Model_Category"] = counts["Model"] + " – " + counts["Category"]

# 7. Order x-axis: Mistral first, then Llama, categories same order
categories_order = ["Ambiguous", "Contradictory", "Nonsensical"]
model_order = ["Mistral", "Llama 3.2"]
ordered_labels = [f"{m} – {c}" for m in model_order for c in categories_order]
counts["Model_Category"] = pd.Categorical(counts["Model_Category"], categories=ordered_labels, ordered=True)

# 8. Color palette
#    Derived from ALL graded rows (both phases) in sorted order. Building it
#    from this chart's own unique() order would give the same failure type a
#    different colour than in the Phase 1 chart.
named_colors = ["blue", "red", "green", "orange", "purple", "brown", "pink", "cyan", "magenta"]
failure_types = sorted(df_combined["Failure_Type"].dropna().unique())
failure_colors = {ft: named_colors[i % len(named_colors)] for i, ft in enumerate(failure_types)}

# 9. Plot stacked bar chart
fig = px.bar(
    counts,
    x="Model_Category",
    y="Percent",
    color="Failure_Type",
    barmode="stack",
    color_discrete_map=failure_colors,
    # Pass the ordering to plotly explicitly instead of relying on the
    # pandas Categorical alone.
    category_orders={"Model_Category": ordered_labels, "Failure_Type": failure_types},
    text=counts["Percent"].round(1).astype(str) + "%",
    title="Phase 2 (Strict) — Failure Type Distribution by Model and Category",
    height=800,
    width=800
)

# Layout settings
fig.update_layout(
    xaxis_title="Model → Category",
    yaxis_title="Percentage",
    legend_title="Failure Type",
    bargap=0.25,
    bargroupgap=0.05
)

fig.update_traces(textposition="inside")

fig.show()


In [41]:
# Tally of graded outcome labels for every (Phase, Category, Model) group in
# the combined Mistral + Llama grading data (df_combined is built in an
# earlier cell).
df_combined.groupby(['Phase','Category','Model'])['Failure_Type'].value_counts()

Phase              Category       Model      Failure_Type               
Phase 1 (Helpful)  Ambiguous      Llama 3.2  Clarification                  11
                                             Constraint Ignored              4
                                  Mistral    Constraint Ignored             14
                                             Clarification                   1
                   Contradictory  Llama 3.2  Clarification                   6
                                             Hallucination                   5
                                             Successful Reinterpretation     3
                                             Constraint Ignored              1
                                  Mistral    Hallucination                  10
                                             Successful Reinterpretation     3
                                             Constraint Ignored              2
                   Nonsensical    Llama 3.2  Clarification