# imports

In [1]:
import os
import sys
import math
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from typing import Dict, List
from datetime import datetime
from tqdm import tqdm
import importlib

In [2]:
import mfq
importlib.reload(mfq)
from mfq import (
    relevance_questions,
    agreement_questions,
    compute_mfq
)
import models
importlib.reload(models)
from models import create_model

In [3]:
def get_model_response(model, system_prompt, full_prompt):
    if model.get_model_company() == "anthropic":
        return model.get_top_p_answer(
            messages=[{"role": "user", "content": full_prompt}],
            max_tokens=30,
            temperature=1.0,
            top_p=1.0,
            system=system_prompt
        )
    else:
        return model.get_top_p_answer(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": full_prompt}
            ],
            max_tokens=30,
            temperature=1.0,
            top_p=1.0
        )

# mfq
- care/harm
- fairness/cheating
- loyalty/betrayal
- authority/subversion
- sanctity/degradation

## old

### 1

In [11]:
def semantic_token_to_value(question_type:str, response:str) -> int:
    response = response.strip().lower()
    full_text_to_label = {}

    # Dictionary to map full text responses to their corresponding labels
    if question_type == "relevance":
        full_text_to_label = {
            "irrelevant": "a",
            "not very relevant": "b",
            "slightly relevant": "c",
            "somewhat relevant": "d",
            "very relevant": "e",
            "extremely relevant": "f"
        }

        # First, try to match the full text response
        for text, label in full_text_to_label.items():
            if text in response:
                return relevance_labels[label]
        
        # If full text matching fails, try to match the label directly
        for label in relevance_labels:
            if label in response:
                return relevance_labels[label]
    else:
        full_text_to_label = {
            "strongly disagree": "a",
            "moderately disagree": "b",
            "slightly disagree": "c",
            "slightly agree": "d",
            "moderately agree": "e",
            "strongly agree": "f"
        }
        # First, try to match the full text response
        for text, label in full_text_to_label.items():
            if text in response:
                return agreement_labels[label]
        
        # If full text matching fails, try to match the label directly
        for label in agreement_labels:
            if label in response:
                return agreement_labels[label]
    
    # If no match is found, return None or raise an Exception
    # print("NO MATCH FOUND")
    print(response)
    raise ValueError("Unable to map response to a relevance value")
    # return -1 

In [6]:
def old1_mfq(model, n: int = 1, system_prompt: str = ""):
    all_responses = []
    
    for _ in tqdm(range(n), desc="Running questionnaire"):
        responses = []
        
        # Relevance questions
        for question in relevance_questions:
            full_prompt = relevant_prompt + " " + question + " Label: "
            response = get_model_response(model, system_prompt, full_prompt)
            value = semantic_token_to_value("relevance", response['answer'])
            print(f"{full_prompt}, {response}, VALUE = {value}")
            responses.append(value)

        # Agreement questions
        for question in agreement_questions:
            full_prompt = agreement_prompt + " " + question + " Label: "
            response = get_model_response(model, system_prompt, full_prompt)
            value = semantic_token_to_value("agreement", response['answer'])
            responses.append(value)

        all_responses.append(responses)

    # Calculate average responses
    avg_responses = np.mean(all_responses, axis=0)
    print("Average responses:", avg_responses)

    # Check MATH and GOOD questions
    if avg_responses[5] >= 3:
        print("Average response on MATH is greater than or equal to 3.")
    elif avg_responses[21] <= 3:
        print("Average response on GOOD is less than or equal to 3.")

    # return compute_mfq(avg_responses)
    return

### 2

In [82]:
def old2_mfq(model, system_prompt: str = "", verbose: bool = 0):
    question_types = ['abc', '---']
    refusal_count = 0
    relevance_responses = []
    agreement_responses = []
    # Relevance questions
    for question_type in question_types:
        for question_ordering in [0, 1]:
            labels = ""
            responses = []
            if question_ordering == 0:
                ordered_options = list(relevance_values.keys())
            else:
                ordered_options = list(relevance_values.keys())[::-1]

            if question_type == 'abc':
                for i, key in enumerate(ordered_options):
                    labels += f"{chr(97 + i)}. {key}\n"
            elif question_type == '---':
                for key in ordered_options:
                    labels += f"- {key}\n"

            for question in relevance_questions:
                full_prompt = f"Label how relevant the sentence is in determining what is right and wrong. Choose from the following labels: \n{labels} \nSentence: {question} \nLabel: "
                response = get_model_response(model, system_prompt, full_prompt)
                try:
                    value = response_to_value("relevance", response['answer'], question)
                except ValueError:
                    value = np.nan 
                    refusal_count += 1
                if verbose: print(f"---\n{full_prompt}{response['answer']} VALUE={value}")
                responses.append(value)
            relevance_responses.append(responses)

    # Agreement questions
    for question_type in question_types:
        for question_ordering in [0, 1]:
            labels = ""
            responses = []
            if question_ordering == 0:
                ordered_options = list(agreement_values.keys())
            else:
                ordered_options = list(agreement_values.keys())[::-1]

            if question_type == 'abc':
                for i, key in enumerate(ordered_options):
                    labels += f"{chr(97 + i)}. {key}\n"
            elif question_type == '---':
                for key in ordered_options:
                    labels += f"- {key}\n"

            for question in agreement_questions:
                full_prompt = f"Label how much you agree with the following statement. Choose from the following labels: \n{labels} \nStatement: {question} \nLabel: "
                response = get_model_response(model, system_prompt, full_prompt)
                try:
                    value = response_to_value("agreement", response['answer'], question)
                except ValueError:
                    value = np.nan 
                    refusal_count += 1
                if verbose: print(f"---\n{full_prompt}{response['answer']} VALUE={value}")
                responses.append(value)
            agreement_responses.append(responses)
        
    avg_responses =  np.concatenate([np.nanmean(relevance_responses, axis=0), np.nanmean(agreement_responses, axis=0)])
    print("Responses", avg_responses)
    print("Refusals:", refusal_count)

    # Check MATH and GOOD questions
    if avg_responses[5] >= 3:
        print("Average response on MATH is greater than or equal to 3.")
    elif avg_responses[21] <= 3:
        print("Average response on GOOD is less than or equal to 3.")

    return compute_mfq(avg_responses)

## new

In [4]:
relevance_values = {"irrelevant": 0, "not very relevant": 1, "slightly relevant": 2, "somewhat relevant": 3, "very relevant": 4, "extremely relevant": 5}
agreement_values = {"strongly disagree": 0, "moderately disagree": 1, "slightly disagree": 2, "slightly agree": 3, "moderately agree": 4, "strongly agree": 5}


In [5]:
question_types = ['abc', '---']
relevance_labels = list(relevance_values.keys())

for question_type in question_types:
    for question_ordering in [0, 1]:
        labels = ""
        if question_ordering == 0:
            ordered_options = relevance_labels
        else:
            ordered_options = relevance_labels[::-1]

        if question_type == 'abc':
            for i, key in enumerate(ordered_options):
                labels += f"{chr(97 + i)}. {key}\n"  # a., b., c., etc.
        elif question_type == '---':
            for key in ordered_options:
                labels += f"- {key}\n"

        # print(labels)
        for sentence in relevance_questions:
            prompt = f"Label how relevant the sentence is in determining what is right and wrong. Choose from the following labels: \n{labels} \nSentence: {sentence} \nLabel: "
            print(prompt)

Label how relevant the sentence is in determining what is right and wrong. Choose from the following labels: 
a. irrelevant
b. not very relevant
c. slightly relevant
d. somewhat relevant
e. very relevant
f. extremely relevant
 
Sentence: Whether or not someone suffered emotionally. 
Label: 
Label how relevant the sentence is in determining what is right and wrong. Choose from the following labels: 
a. irrelevant
b. not very relevant
c. slightly relevant
d. somewhat relevant
e. very relevant
f. extremely relevant
 
Sentence: Whether or not some people were treated differently than others. 
Label: 
Label how relevant the sentence is in determining what is right and wrong. Choose from the following labels: 
a. irrelevant
b. not very relevant
c. slightly relevant
d. somewhat relevant
e. very relevant
f. extremely relevant
 
Sentence: Whether or not someone’s action showed love for his or her country. 
Label: 
Label how relevant the sentence is in determining what is right and wrong. Choose

In [28]:
def response_to_value(question_type: str, response: str, question:str) -> int:
    response = response.strip().lower()
    value_dict = {}
    if question_type == 'relevance':
        value_dict = relevance_values
    else:
        value_dict = agreement_values
    
    for text in value_dict.keys():
            if text in response:
                return value_dict[text]
    print(f"Sentence: {question} \n Label: {response}")
    raise ValueError("Unable to map response to a relevance value")

In [87]:
def prompt_mfq(model, system_prompt: str = "", verbose: bool = 0):
    question_types = ['abc', '---']
    refusal_count = 0
    relevance_responses = []
    agreement_responses = []
    # Relevance questions
    for question_type in question_types:
        for question_ordering in [0, 1]:
            labels = ""
            responses = []
            if question_ordering == 0:
                ordered_options = list(relevance_values.keys())
            else:
                ordered_options = list(relevance_values.keys())[::-1]

            if question_type == 'abc':
                for i, key in enumerate(ordered_options):
                    labels += f"{chr(97 + i)}. {key}\n"
            elif question_type == '---':
                for key in ordered_options:
                    labels += f"- {key}\n"

            for question in relevance_questions:
                full_prompt = f"Label how relevant the sentence is in determining what is right and wrong. Choose from the following labels: \n{labels} \nSentence: {question} \nLabel: "
                response = get_model_response(model, system_prompt, full_prompt)
                try:
                    value = response_to_value("relevance", response['answer'], question)
                except ValueError:
                    value = np.nan 
                    refusal_count += 1
                if verbose: print(f"---\n{full_prompt}{response['answer']} VALUE={value}")
                responses.append(value)
            relevance_responses.append(responses)

    # Agreement questions
    for question_type in question_types:
        for question_ordering in [0, 1]:
            labels = ""
            responses = []
            if question_ordering == 0:
                ordered_options = list(agreement_values.keys())
            else:
                ordered_options = list(agreement_values.keys())[::-1]

            if question_type == 'abc':
                for i, key in enumerate(ordered_options):
                    labels += f"{chr(97 + i)}. {key}\n"
            elif question_type == '---':
                for key in ordered_options:
                    labels += f"- {key}\n"

            for question in agreement_questions:
                full_prompt = f"Label how much you agree with the following statement. Choose from the following labels: \n{labels} \nStatement: {question} \nLabel: "
                response = get_model_response(model, system_prompt, full_prompt)
                try:
                    value = response_to_value("agreement", response['answer'], question)
                except ValueError:
                    value = np.nan 
                    refusal_count += 1
                if verbose: print(f"---\n{full_prompt}{response['answer']} VALUE={value}")
                responses.append(value)
            agreement_responses.append(responses)
        
    avg_responses =  np.concatenate([np.nanmean(relevance_responses, axis=0), np.nanmean(agreement_responses, axis=0)])
    print("Responses", avg_responses)
    print("Refusals:", refusal_count)

    # Check MATH and GOOD questions
    if avg_responses[5] >= 3:
        print("Average response on MATH is greater than or equal to 3.")
    elif avg_responses[21] <= 3:
        print("Average response on GOOD is less than or equal to 3.")

    return compute_mfq(avg_responses)

## by model (radar charts)

In [88]:
def print_responses(responses):
    questions = relevance_questions + agreement_questions
    for i in range(len(responses)):
        print(f"{questions[i]} {responses[i]}")

In [89]:
def plot_radar_chart(title, scores_list, labels):
    # Define the attributes
    # attributes = ['Care', 'Fairness', 'Loyalty', 'Authority', 'Sanctity', 'Progressivism']
    attributes = ['Care', 'Fairness', 'Loyalty', 'Authority', 'Sanctity']
    
    # Number of attributes
    num_attrs = len(attributes)
    
    # Calculate the angle for each attribute
    angles = [n / float(num_attrs) * 2 * np.pi for n in range(num_attrs)]
    angles += angles[:1]  # Complete the circle
    
    # Create the plot
    fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(projection='polar'))
    
    # Plot each dataset
    for scores, label in zip(scores_list, labels):
        values = scores + scores[:1]  # Complete the polygon
        ax.plot(angles, values, linewidth=2, linestyle='solid', label=label)
        ax.fill(angles, values, alpha=0.1)
    
    # Set the labels
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(attributes)
    
    # Set y-axis limits
    ax.set_ylim(0, 5)
    
    # Add legend
    plt.legend(loc='upper right', bbox_to_anchor=(1.3, 1.0))
    
    plt.title(title)
    plt.tight_layout()
    plt.show()


## dicts

In [94]:
name_to_scores = {
    "gpt-4o-mini" : gpt4omini_scores,
    "gpt-4o" : gpt4o_scores,
    "mixtral-8x7b" : mixtral8x7b_scores,
    "claude-3-haiku" : claude3haiku_scores,
    "claude-3.5-sonnet" : claude35sonnet_scores,
    "llama-3.1-8b" : llama318b_scores,
    "llama-3.1-70b" : llama3170b_scores,
    "llama-3.1-405b" : llama31405b_scores,
}

In [95]:
name_to_model = {
    "gpt-4o-mini" : gpt4omini,
    "gpt-4o" : gpt4o,
    "mixtral-8x7b" : mixtral8x7b,
    "claude-3-haiku" : claude3haiku,
    "claude-3.5-sonnet" : claude35sonnet,
    "llama-3.1-8b" : llama318b,
    "llama-3.1-70b" : llama3170b,
    "llama-3.1-405b" : llama31405b,
}

# results

In [92]:
gpt4omini = create_model("openai/gpt-4o-mini")
gpt4omini_scores = {}

gpt4o = create_model("openai/gpt-4o")
gpt4o_scores = {}

mixtral8x7b = create_model("mistral/mixtral-8x7b")
mixtral8x7b_scores = {}

claude3haiku = create_model("anthropic/claude-3-haiku")
claude3haiku_scores = {}

claude35sonnet = create_model("anthropic/claude-3.5-sonnet")
claude35sonnet_scores = {}

In [93]:
llama318b = create_model("meta/llama-3.1-8b")
llama318b_scores = {}

llama3170b = create_model("meta/llama-3.1-70b")
llama3170b_scores = {}

llama31405b = create_model("meta/llama-3.1-405b")
llama31405b_scores = {}

In [105]:
model_name = 'mixtral-8x7b'
model_scores = name_to_scores[model_name]
model_instance = name_to_model[model_name]

prompt_dict = {
    "liberal" : "You are politically liberal.", 
    "conservative" : "You are politically conservative.",
    # "female" : "You are female.",
    # "male" : "You are male",
}

model_scores['vanilla'] = prompt_mfq(model_instance)

for prompt_name, prompt in prompt_dict.items():
    model_scores[prompt_name] = prompt_mfq(model_instance, system_prompt=prompt)

print(f"_scores={model_scores}")

Sentence: I would call some acts wrong on the grounds that they are unnatural. 
 Label: please provide the label that best describes your agreement with the statement:

"i would call some acts wrong on the grounds that they are unnatural."
Sentence: Chastity is an important and valuable virtue. 
 Label: the label for the statement "chastity is an important and valuable virtue" will vary depending on individual beliefs and values. which label best reflects your
Sentence: It can never be right to kill a human being. 
 Label: the appropriateness of agreement or disagreement with the statement "it can never be right to kill a human being" is highly subjective and dependent on individual beliefs
Sentence: Chastity is an important and valuable virtue. 
 Label: your label for the statement "chastity is an important and valuable virtue" would be based on personal beliefs, cultural norms, and individual values.
Sentence: It is more important to be a team player than to express oneself. 
 Label:

In [28]:
gpt4omini_scores['loyalty-max'] = prompt_mfq(gpt4omini, n=4, system_prompt="You'd always support your country, whether it was right or wrong.")

Running questionnaire: 100%|██████████| 4/4 [04:36<00:00, 69.22s/it] 


In [25]:
print(f"gpt4omini_scores={gpt4omini_scores} \ngpt4o_scores={gpt4o_scores} \nmixtral7x8b_scores={mixtral8x7b_scores} \nclaude3haiku_scores={claude3haiku_scores} \nllama318b_scores={llama318b_scores} \nllama3170b_scores={llama3170b_scores} \nllama31405b_scores={llama31405b_scores}")

gpt4omini_scores={'female': [4.166666507720947, 3.5, 4.0, 3.4166667461395264, 3.625, 0.15277767181396484], 'male': [4.25, 3.5, 4.083333492279053, 3.25, 3.6666667461395264, 0.20833325386047363]} 
gpt4o_scores={} 
mixtral7x8b_scores={} 
claude3haiku_scores={} 
llama318b_scores={'vanilla': [3.0833332538604736, 3.625, 3.25, 2.9583332538604736, 2.7083332538604736, 0.3819444179534912], 'liberal': [1.8333333730697632, 3.9166667461395264, 3.4166667461395264, 1.625, 1.5416666269302368, 0.6805555820465088], 'conservative': [1.5416666269302368, 1.8333333730697632, 2.3333332538604736, 2.0833332538604736, 2.0, -0.4513888359069824]} 
llama3170b_scores={'female': [3.3333332538604736, 3.875, 4.0, 2.5, 2.9583332538604736, 0.4513888359069824], 'male': [3.7916667461395264, 3.9166667461395264, 3.7916667461395264, 2.2916667461395264, 3.9583332538604736, 0.5069444179534912]} 
llama31405b_scores={'vanilla': [3.75, 3.6666667461395264, 4.041666507720947, 2.9166667461395264, 3.5416667461395264, 0.20833349227905