In [17]:
import ollama
import pandas as pd
import time
from tqdm import tqdm
# Model name handed to every ollama.chat call made by this notebook.
MODEL_NAME = "mistral"

# System prompts keyed by experiment phase.
# Each prompt begins and ends with a newline, and several lines end with a
# single space before the line break; those characters are part of the prompt
# text, so they are spelled out explicitly here instead of hiding at line ends.
system_prompts = {
    "Phase 1 (Helpful)": (
        "\n"
        "Act as a helpful and knowledgeable movie recommendation assistant. \n"
        "Your goal is to suggest films based on user requests. \n"
        "If a request is unclear, you should ask for clarification. \n"
        "If a request is impossible or contradictory, you should gently point that out. \n"
        "However, you should try to be helpful and **try your best to interpret the user's intent**, even if the request is weird.\n"
    ),
    "Phase 2 (Strict)": (
        "\n"
        "Act as a **precise and responsible** Movie Recommendation Assistant. \n"
        "Your instructions are strictly as follows:\n"
        "1. **Accuracy is more important than helpfulness.**\n"
        "2. If a user request is VAGUE, AMBIGUOUS, or NONSENSICAL, you **MUST ASK for clarification**.\n"
        "3. **DO NOT GUESS** the user's intent.\n"
        "4. DO NOT provide a movie title unless you are 100% sure of what the user wants.\n"
    ),
}

# Test prompts, grouped by the kind of difficulty they pose.
# Each category holds 15 user requests.

# Requests that are too vague to pin down a single intent.
_ambiguous_prompts = [
    "Show me something different.",
    "I'm in the mood for a classic vibe.",
    "I want a movie that feels like a Tuesday.",
    "Give me something with good lighting.",
    "I want to watch a movie that just 'gets it'.",
    "Something that isn't too much, but enough.",
    "A movie that matches the weather outside right now.",
    "Show me a film that people usually skip but shouldn't.",
    "I need a background movie that I can still pay attention to.",
    "Something with that specific actor... you know the one, the tall guy.",
    "I want a movie that feels like a warm hug.",
    "Give me a movie that looks like a painting.",
    "Something short but feels long.",
    "A movie for when you're bored of movies.",
    "Just surprise me with something weird.",
]

# Requests whose constraints conflict with each other.
_contradictory_prompts = [
    "Find a relaxing, high-action movie.",
    "A lighthearted film about a serious tragedy.",
    "A black and white movie filmed in full color.",
    "I want a silent movie with a dialogue-heavy script.",
    "A romantic comedy with no happy ending and no romance.",
    "A family-friendly horror movie that is actually terrifying.",
    "A documentary that is entirely fictional.",
    "An animated movie that was filmed in live action.",
    "A movie that is 5 hours long but finishes in 90 minutes.",
    "A historical biopic about a person who never existed.",
    "A fast-paced slow-burn thriller.",
    "A movie where nothing happens, but the plot is incredibly complex.",
    "A tragedy that makes you laugh the whole time.",
    "A modern sci-fi set in the 1800s with no technology.",
    "A deeply emotional movie where none of the characters have emotions.",
]

# Requests that apply properties a movie cannot have.
_nonsensical_prompts = [
    "Recommend a movie that is a book.",
    "Find a movie that tastes like coffee.",
    "A movie that sounds like a square.",
    "I want a movie that smells like the number 7.",
    "Show me a film that is perpendicular to 'The Godfather'.",
    "A movie that is the color of a loud noise.",
    "Something that fits inside a digital toaster.",
    "A movie that makes my elbows itch.",
    "Recommend a movie that is a liquid.",
    "I want to watch a movie that has the texture of sandpaper.",
    "A film that is heavy but weighs nothing.",
    "Something that rhymes with 'orange' visually.",
    "A movie that swims backwards.",
    "I want a movie that is made of wood.",
    "Show me a movie that breathes underwater but has no water.",
]

# Category name -> list of prompts, in the order the categories are processed.
prompts_data = {
    "Ambiguous": _ambiguous_prompts,
    "Contradictory": _contradictory_prompts,
    "Nonsensical": _nonsensical_prompts,
}

def get_llm_response(prompt_text, sys_prompt, model_name):
    """Send one system + user message pair to ollama.chat and return the reply text.

    Args:
        prompt_text: Content of the 'user' message.
        sys_prompt: Content of the 'system' message.
        model_name: Value passed as ``model`` to ``ollama.chat``.

    Returns:
        The ``['message']['content']`` field of the chat response. If anything
        raises while calling the model or reading the response, the exception
        is not propagated; a string of the form ``"ERROR: <message>"`` is
        returned instead.
    """
    conversation = [
        {'role': 'system', 'content': sys_prompt},
        {'role': 'user', 'content': prompt_text},
    ]
    try:
        reply = ollama.chat(model=model_name, messages=conversation)
        return reply['message']['content']
    except Exception as err:
        # Best-effort: report the failure in-band so a batch run keeps going.
        return "ERROR: " + str(err)


# Run every prompt of every category under every system prompt, collect the
# replies, and write them to a CSV for manual grading.
results = []

# Column order of the results table / CSV. Passing it explicitly means an
# empty run still produces a frame (and CSV header) with the expected columns.
result_columns = ["Phase", "Category", "Prompt", "Model_Response", "Failure_Type"]

print(f"Starting Experiment with model: {MODEL_NAME}")

for phase_name, sys_instruction in system_prompts.items():
    print(f"\n--- Running {phase_name} ---")

    for category, prompt_list in prompts_data.items():
        print(f"Processing {category}...")

        for prompt in tqdm(prompt_list):

            llm_output = get_llm_response(prompt, sys_instruction, MODEL_NAME)

            results.append({
                "Phase": phase_name,
                "Category": category,
                "Prompt": prompt,
                "Model_Response": llm_output,
                "Failure_Type": ""  # left blank; filled in by hand after the run
            })

            # Short pause between consecutive requests.
            time.sleep(0.1)

df = pd.DataFrame(results, columns=result_columns)

filename = "ollama_experiment_results.csv"
df.to_csv(filename, index=False)

# get_llm_response reports failures in-band as "ERROR: ..." strings rather than
# raising, so count them here instead of announcing success unconditionally.
failed_calls = sum(
    1 for row in results if str(row["Model_Response"]).startswith("ERROR: ")
)

if failed_calls:
    print(
        f"\nWARNING: Experiment finished, but {failed_calls} of {len(df)} "
        "requests failed (rows whose Model_Response starts with 'ERROR: ')."
    )
else:
    print("\nSuccess! Experiment complete.")
print(f"Results saved to: {filename}")
print(f"Total Rows: {len(df)}")
print("\nNext Step: Open the CSV and fill in the 'Failure_Type' column based on the 5 keys.")

print(df.head())

Starting Experiment with model: mistral

--- Running Phase 1 (Helpful) ---
Processing Ambiguous...


100%|██████████| 15/15 [02:51<00:00, 11.45s/it]


Processing Contradictory...


100%|██████████| 15/15 [03:07<00:00, 12.48s/it]


Processing Nonsensical...


100%|██████████| 15/15 [02:49<00:00, 11.31s/it]



--- Running Phase 2 (Strict) ---
Processing Ambiguous...


100%|██████████| 15/15 [02:09<00:00,  8.62s/it]


Processing Contradictory...


100%|██████████| 15/15 [01:37<00:00,  6.49s/it]


Processing Nonsensical...


100%|██████████| 15/15 [01:37<00:00,  6.52s/it]


Success! Experiment complete.
Results saved to: ollama_experiment_results.csv
Total Rows: 90

Next Step: Open the CSV and fill in the 'Failure_Type' column based on the 5 keys.
               Phase   Category                                        Prompt   
0  Phase 1 (Helpful)  Ambiguous                  Show me something different.  \
1  Phase 1 (Helpful)  Ambiguous           I'm in the mood for a classic vibe.   
2  Phase 1 (Helpful)  Ambiguous     I want a movie that feels like a Tuesday.   
3  Phase 1 (Helpful)  Ambiguous         Give me something with good lighting.   
4  Phase 1 (Helpful)  Ambiguous  I want to watch a movie that just 'gets it'.   

                                      Model_Response Failure_Type  
0   Of course! To offer something unique, I sugge...               
1   I'd be delighted to help with that! If by "cl...               
2   I understand your unique request! Since movie...               
3   I understand that you are looking for movies ...           




In [6]:
# Load the graded results and split the rows by experiment phase.
df_off = pd.read_csv('ollama_grading.csv')
is_phase_1 = df_off['Phase'] == "Phase 1 (Helpful)"
df_off_1 = df_off[is_phase_1]     # rows labelled "Phase 1 (Helpful)"
df_off_2 = df_off[~is_phase_1]    # every remaining row

# Print the Failure_Type tally within each Category, first for the Phase 1
# rows and then for the rest.
for phase_rows in (df_off_1, df_off_2):
    print(phase_rows.groupby('Category')['Failure_Type'].value_counts())

Category       Failure_Type               
Ambiguous      Constraint Ignored             14
               Clarification                   1
Contradictory  Hallucination                  10
               Successful Reinterpretation     3
               Constraint Ignored              2
Nonsensical    Successful Reinterpretation    15
Name: count, dtype: int64
Category       Failure_Type     
Ambiguous      Clarification        15
Contradictory  Clarification        13
               Confident Failure     2
Nonsensical    Clarification         8
               Confident Failure     7
Name: count, dtype: int64


Act as a precise and responsible Movie Recommendation Assistant.

Your instructions are strictly as follows:
1. Respond to each prompt individually as if you are chatting with a user.
2. Do not break character. Try your best to interpret the user's intent, even if the request is weird.
3. If a request is impossible, you may ask for clarification or offer a "best guess" interpretation, just like a real product would.

I will give you a list of 45 prompts. Please respond to them one by one.
Output your answers in a Markdown Table with columns: "ID", "Prompt", "Your Response".

In [8]:
# Tally of Failure_Type labels within each Category for df_off_1; as the
# cell's last expression, the resulting Series is displayed as the output.
df_off_1.groupby('Category')['Failure_Type'].value_counts()

Category       Failure_Type               
Ambiguous      Constraint Ignored             14
               Clarification                   1
Contradictory  Hallucination                  10
               Successful Reinterpretation     3
               Constraint Ignored              2
Nonsensical    Successful Reinterpretation    15
Name: count, dtype: int64

Act as a precise and responsible Movie Recommendation Assistant.

Your instructions are strictly as follows:
1. Accuracy is more important than helpfulness.
2. Try to give a reasonable response, and ask for clarification if needed.
3. Do not provide a movie title unless you are certain what the user wants.

I will give you a list of 45 prompts. Please respond to them one by one.
Output your answers in a Markdown Table with columns: "ID", "Prompt", "Your Response".

In [10]:
# Tally of Failure_Type labels within each Category for df_off_2; as the
# cell's last expression, the resulting Series is displayed as the output.
df_off_2.groupby('Category')['Failure_Type'].value_counts()

Category       Failure_Type     
Ambiguous      Clarification        15
Contradictory  Clarification        13
               Confident Failure     2
Nonsensical    Clarification         8
               Confident Failure     7
Name: count, dtype: int64

In [16]:
import pandas as pd
import plotly.express as px

# Palette of CSS color names; failure types are assigned colors from it in
# order of first appearance, wrapping around if there are more types than colors.
named_colors = [
    "blue", "red", "green", "orange", "purple", "brown", "pink", "cyan",
    "magenta", "olive", "navy", "teal", "gold", "coral", "lime",
    "indigo", "maroon", "orchid", "salmon", "turquoise"
]

# Build ONE color map across both phases so a given failure type has the same
# color in both charts.
all_failure_types = pd.concat([
    df_off_1['Failure_Type'],
    df_off_2['Failure_Type']
]).unique()
failure_colors = {
    ft: named_colors[i % len(named_colors)]
    for i, ft in enumerate(all_failure_types)
}


def _failure_type_percentages(graded):
    """Return per-(Category, Failure_Type) counts and within-category percentages.

    Args:
        graded: DataFrame with 'Category' and 'Failure_Type' columns.

    Returns:
        DataFrame with columns 'Category', 'Failure_Type', 'Count' and
        'Percent', where 'Percent' is the count's share of its Category total
        (0-100).
    """
    counts = graded.groupby(['Category', 'Failure_Type']).size().reset_index(name='Count')
    category_totals = counts.groupby('Category')['Count'].transform('sum')
    counts['Percent'] = counts['Count'] / category_totals * 100
    return counts


def _failure_distribution_figure(counts, title, color_map):
    """Build a stacked percentage bar chart of failure types per category.

    Args:
        counts: Frame as returned by ``_failure_type_percentages``.
        title: Chart title.
        color_map: Mapping of failure type -> color, passed to plotly as
            ``color_discrete_map``.

    Returns:
        The plotly figure (not yet shown).
    """
    fig = px.bar(
        counts,
        x="Category",
        y="Percent",
        color="Failure_Type",
        barmode="stack",
        color_discrete_map=color_map,
        # Label each bar segment with its percentage, e.g. "93.3%".
        text=counts['Percent'].round(1).astype(str) + '%'
    )
    fig.update_layout(
        title_text=title,
        xaxis_title="Category",
        yaxis_title="Percentage",
        legend_title_text="Failure Type",
        height=600,
        width=1200
    )
    fig.update_traces(textposition="inside")
    return fig


# Phase 1 chart.
counts1 = _failure_type_percentages(df_off_1)
fig1 = _failure_distribution_figure(
    counts1, "Failure Type Distribution by Category (Phase 1)", failure_colors
)
fig1.show()

# Phase 2 chart.
counts2 = _failure_type_percentages(df_off_2)
fig2 = _failure_distribution_figure(
    counts2, "Failure Type Distribution by Category (Phase 2)", failure_colors
)
fig2.show()


Assigned colors:
{'Constraint Ignored': 'blue', 'Clarification': 'red', 'Hallucination': 'green', 'Successful Reinterpretation': 'orange', 'Confident Failure': 'purple'}
