# imports

In [1]:
import os
import sys
import math
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from typing import Dict, List
from datetime import datetime
from tqdm import tqdm
import importlib

In [2]:
import mfq
importlib.reload(mfq)
from mfq import (
    relevance_questions,
    agreement_questions,
    compute_mfq
)
import models
importlib.reload(models)
from models import create_model

In [3]:
def get_model_response(model, system_prompt, full_prompt):
    """Send one prompt to ``model`` and return its ``get_top_p_answer`` result.

    Models whose ``get_model_company()`` is ``"anthropic"`` receive the system
    prompt through the separate ``system`` keyword; every other model receives
    it as a leading ``system`` message. The sampling settings are the same in
    both cases (30 tokens max, temperature 1.0, top_p 1.0).
    """
    user_turn = {"role": "user", "content": full_prompt}
    sampling = {"max_tokens": 30, "temperature": 1.0, "top_p": 1.0}

    if model.get_model_company() != "anthropic":
        system_turn = {"role": "system", "content": system_prompt}
        return model.get_top_p_answer(messages=[system_turn, user_turn], **sampling)

    return model.get_top_p_answer(
        messages=[user_turn],
        **sampling,
        system=system_prompt,
    )

# mfq (Moral Foundations Questionnaire)
The five moral foundations scored in this notebook:
- care/harm
- fairness/cheating
- loyalty/betrayal
- authority/subversion
- sanctity/degradation

## old

### 1

In [56]:
labels = {"a": 0, "b": 1, "c": 2, "d": 3, "e": 4, "f": 5}

# Private copy for semantic_token_to_value: `labels` is a generic name that is
# easy to rebind elsewhere in a notebook session, which would break the lookup.
_LETTER_TO_VALUE = dict(labels)

# Full-text answers for each question type, mapped to their option letter.
# Order matters: "not very relevant" must be tested before "very relevant".
_FULL_TEXT_TO_LETTER = {
    "relevance": {
        "irrelevant": "a",
        "not very relevant": "b",
        "slightly relevant": "c",
        "somewhat relevant": "d",
        "very relevant": "e",
        "extremely relevant": "f",
    },
    "agreement": {
        "strongly disagree": "a",
        "moderately disagree": "b",
        "slightly disagree": "c",
        "slightly agree": "d",
        "moderately agree": "e",
        "strongly agree": "f",
    },
}


def semantic_token_to_value(question_type:str, response:str) -> int:
    """Map a model reply to its 0-5 score.

    The reply is stripped and lower-cased. Full-text labels are tried first
    (e.g. "very relevant"); if none occurs, the first *standalone* option
    letter a-f is used. A letter only counts when it is not part of a longer
    word, so replies such as "Label: b" or a refusal sentence are no longer
    scored as "a" just because they contain the character "a".

    Args:
        question_type: "relevance" selects the relevance scale; any other
            value selects the agreement scale.
        response: Raw text returned by the model.

    Returns:
        The integer score (0-5) of the matched label.

    Raises:
        ValueError: If neither a full-text label nor a standalone option
            letter is found. The normalized reply is printed first.
    """
    import re  # local import keeps this cell self-contained

    response = response.strip().lower()
    scale = "relevance" if question_type == "relevance" else "agreement"

    # First, try to match the full text response.
    for text, letter in _FULL_TEXT_TO_LETTER[scale].items():
        if text in response:
            return _LETTER_TO_VALUE[letter]

    # Fall back to a lone option letter; apostrophes are excluded so the
    # "d" in "i'd" is not mistaken for option d.
    match = re.search(r"(?<![\w'’])([a-f])(?![\w'’])", response)
    if match:
        return _LETTER_TO_VALUE[match.group(1)]

    print(response)
    raise ValueError("Unable to map response to a relevance value")

In [59]:
relevant_prompt = "Label how relevant the sentence is in determining what is right and wrong. Choose from the following labels: a. irrelevant, b. not very relevant, c. slightly relevant, d. somewhat relevant, e. very relevant, f. extremely relevant. "
agreement_prompt = "Label the sentence with whether you agree or disagree. Choose from the following labels: a. strongly disagree, b. moderately disagree, c. slightly disagree, d. slightly agree, e. moderately agree, f. strongly agree."

def old1_mfq(model, n: int = 1, system_prompt: str = ""):
    """Run the questionnaire ``n`` times and score the averaged answers.

    Each pass asks every relevance question and then every agreement
    question, converting each reply with ``semantic_token_to_value``. The
    per-question mean over all passes is handed to ``compute_mfq``.

    Args:
        model: Model wrapper accepted by ``get_model_response``.
        n: Number of full questionnaire passes to average; must be >= 1.
        system_prompt: System prompt sent with every question.

    Returns:
        The result of ``compute_mfq(avg_responses, -1)``.
        NOTE(review): the ``-1`` is presumably a placeholder for a refusal
        count; confirm against ``mfq.compute_mfq``.

    Raises:
        ValueError: If ``n`` is less than 1, or (from
            ``semantic_token_to_value``) if a reply cannot be mapped.
    """
    if n < 1:
        # Averaging zero passes would otherwise fail later with an opaque
        # IndexError on the catch-question checks.
        raise ValueError("n must be at least 1")

    sections = (
        ("relevance", relevant_prompt, relevance_questions),
        ("agreement", agreement_prompt, agreement_questions),
    )
    all_responses = []

    for _ in tqdm(range(n), desc="Running questionnaire"):
        responses = []
        for section, instruction, questions in sections:
            print(section)
            for question in questions:
                full_prompt = instruction + " " + question + " Label: "
                response = get_model_response(model, system_prompt, full_prompt)
                value = semantic_token_to_value(section, response['answer'])
                print(f"{full_prompt}, {response['answer']}, VALUE = {value}")
                responses.append(value)
        all_responses.append(responses)

    # Calculate average responses
    avg_responses = np.mean(all_responses, axis=0)
    print("Average responses:", avg_responses)

    # Check the MATH and GOOD questions independently, so a warning on one
    # does not hide a warning on the other.
    if avg_responses[5] >= 3:
        print("Average response on MATH is greater than or equal to 3.")
    if avg_responses[21] <= 3:
        print("Average response on GOOD is less than or equal to 3.")

    return compute_mfq(avg_responses, -1)

### 2

In [82]:
def old2_mfq(model, system_prompt: str = "", verbose: bool = False):
    """Run the questionnaire under four prompt variants and score the mean.

    Every question is asked with two label formats (lettered ``a.``-``f.``
    and ``-`` bullets) times two label orderings (ascending and reversed),
    one label per line. A reply that ``response_to_value`` cannot map counts
    as a refusal and is recorded as NaN, which ``np.nanmean`` ignores when
    averaging the four variants per question.

    Args:
        model: Model wrapper accepted by ``get_model_response``.
        system_prompt: System prompt sent with every question.
        verbose: When truthy, print each prompt, reply and parsed value.

    Returns:
        The result of ``compute_mfq(avg_responses)``.
        NOTE(review): the other runners in this notebook pass a second
        argument to ``compute_mfq``; confirm the one-argument call is
        intended.
    """
    question_types = ['abc', '---']
    refusal_count = 0
    # (section name, answer scale, questions, prompt template)
    sections = (
        ("relevance", relevance_values, relevance_questions,
         "Label how relevant the sentence is in determining what is right and wrong. Choose from the following labels: \n{labels} \nSentence: {question} \nLabel: "),
        ("agreement", agreement_values, agreement_questions,
         "Label how much you agree with the following statement. Choose from the following labels: \n{labels} \nStatement: {question} \nLabel: "),
    )
    runs_by_section = {}

    for section, scale, questions, template in sections:
        runs = runs_by_section[section] = []
        for question_type in question_types:
            for question_ordering in [0, 1]:
                ordered_options = list(scale)
                if question_ordering == 1:
                    ordered_options.reverse()

                if question_type == 'abc':
                    label_block = "".join(
                        f"{chr(97 + i)}. {key}\n" for i, key in enumerate(ordered_options)
                    )
                else:
                    label_block = "".join(f"- {key}\n" for key in ordered_options)

                responses = []
                for question in questions:
                    full_prompt = template.format(labels=label_block, question=question)
                    response = get_model_response(model, system_prompt, full_prompt)
                    try:
                        value = response_to_value(section, response['answer'], question)
                    except ValueError:
                        # Unparseable reply: count it as a refusal.
                        value = np.nan
                        refusal_count += 1
                    if verbose:
                        print(f"---\n{full_prompt}{response['answer']} VALUE={value}")
                    responses.append(value)
                runs.append(responses)

    # A question refused in all four variants stays NaN in the average.
    avg_responses = np.concatenate([
        np.nanmean(runs_by_section["relevance"], axis=0),
        np.nanmean(runs_by_section["agreement"], axis=0),
    ])
    print("Responses", avg_responses)
    print("Refusals:", refusal_count)

    # Check the MATH and GOOD questions independently, so a warning on one
    # does not hide a warning on the other.
    if avg_responses[5] >= 3:
        print("Average response on MATH is greater than or equal to 3.")
    if avg_responses[21] <= 3:
        print("Average response on GOOD is less than or equal to 3.")

    return compute_mfq(avg_responses)

## new

In [4]:
# Six-point answer scales: each label's score is its position on the scale,
# from 0 (lowest) to 5 (highest).
relevance_values = {
    label: score
    for score, label in enumerate((
        "irrelevant",
        "not very relevant",
        "slightly relevant",
        "somewhat relevant",
        "very relevant",
        "extremely relevant",
    ))
}
agreement_values = {
    label: score
    for score, label in enumerate((
        "strongly disagree",
        "moderately disagree",
        "slightly disagree",
        "slightly agree",
        "moderately agree",
        "strongly agree",
    ))
}

In [5]:
question_types = ['abc', '---']
relevance_labels = list(relevance_values.keys())

# Preview the four label strings (two formats x two orderings) for the
# relevance scale. The accumulator is named `label_line` rather than `labels`
# so this cell does not rebind the module-level `labels` dict.
for question_type in question_types:
    for question_ordering in [0, 1]:
        if question_ordering == 0:
            ordered_options = relevance_labels
        else:
            ordered_options = relevance_labels[::-1]

        if question_type == 'abc':
            # a., b., c., etc.
            label_line = "".join(
                f"{chr(97 + i)}. {key}, " for i, key in enumerate(ordered_options)
            )
        else:
            label_line = "".join(f"{key}, " for key in ordered_options)

        print(label_line)

a. irrelevant, b. not very relevant, c. slightly relevant, d. somewhat relevant, e. very relevant, f. extremely relevant, 
a. extremely relevant, b. very relevant, c. somewhat relevant, d. slightly relevant, e. not very relevant, f. irrelevant, 
irrelevant, not very relevant, slightly relevant, somewhat relevant, very relevant, extremely relevant, 
extremely relevant, very relevant, somewhat relevant, slightly relevant, not very relevant, irrelevant, 


In [6]:
def response_to_value(question_type: str, response: str, question:str) -> int:
    """Return the score of the first scale label found in ``response``.

    The reply is stripped and lower-cased, then checked for each label of the
    chosen scale as a substring, in scale order. ``question_type`` of
    'relevance' selects ``relevance_values``; anything else selects
    ``agreement_values``.

    Raises:
        ValueError: If no label occurs in the reply. The question and the
            normalized reply are printed first.
    """
    normalized = response.strip().lower()
    scale = relevance_values if question_type == 'relevance' else agreement_values

    for label, score in scale.items():
        if label in normalized:
            return score

    print(f"  Sentence: {question}\n    Label: {normalized}")
    raise ValueError("Unable to map response to a relevance value")

In [7]:
def prompt_mfq(model, system_prompt: str = "", verbose: bool = False):
    """Run the questionnaire under four prompt variants and score the mean.

    Every question is asked with two label formats (lettered ``a.``-``f.``
    and plain) times two label orderings (ascending and reversed), with the
    labels comma-separated on one line. A reply that ``response_to_value``
    cannot map counts as a refusal and is recorded as NaN, which
    ``np.nanmean`` ignores when averaging the four variants per question.

    Args:
        model: Model wrapper accepted by ``get_model_response``; its
            ``get_model_id()`` is printed as a header.
        system_prompt: System prompt sent with every question.
        verbose: When truthy, print each prompt, reply and parsed value.

    Returns:
        The result of ``compute_mfq(avg_responses, refusal_count)``.
    """
    print("---")
    print(model.get_model_id().upper())

    question_types = ['abc', '---']
    refusal_count = 0
    # (section name, answer scale, questions, prompt template)
    sections = (
        ("relevance", relevance_values, relevance_questions,
         "Label how relevant the sentence is in determining what is right and wrong. Choose from the following labels: \n{labels} \nSentence: {question} \nLabel: "),
        ("agreement", agreement_values, agreement_questions,
         "Label how much you agree with the following statement. Choose from the following labels: \n{labels} \nStatement: {question} \nLabel: "),
    )
    runs_by_section = {}

    for section, scale, questions, template in sections:
        runs = runs_by_section[section] = []
        for question_type in question_types:
            for question_ordering in [0, 1]:
                print(section, question_type, question_ordering)
                ordered_options = list(scale)
                if question_ordering == 1:
                    ordered_options.reverse()

                if question_type == 'abc':
                    label_line = "".join(
                        f"{chr(97 + i)}. {key}, " for i, key in enumerate(ordered_options)
                    )
                else:
                    label_line = "".join(f"{key}, " for key in ordered_options)

                responses = []
                for question in questions:
                    full_prompt = template.format(labels=label_line, question=question)
                    response = get_model_response(model, system_prompt, full_prompt)
                    try:
                        value = response_to_value(section, response['answer'], question)
                    except ValueError:
                        # Unparseable reply: count it as a refusal.
                        value = np.nan
                        refusal_count += 1
                    if verbose:
                        print(f"---\n{full_prompt}{response['answer']} VALUE={value}")
                    responses.append(value)
                runs.append(responses)

    # A question refused in all four variants stays NaN in the average
    # (NumPy emits a "Mean of empty slice" RuntimeWarning in that case).
    avg_responses = np.concatenate([
        np.nanmean(runs_by_section["relevance"], axis=0),
        np.nanmean(runs_by_section["agreement"], axis=0),
    ])
    print("Responses", " ".join(f"{x:.2f}" for x in avg_responses))
    print("Refusals:", refusal_count)

    # Check the MATH and GOOD questions independently, so a warning on one
    # does not hide a warning on the other.
    if avg_responses[5] >= 3:
        print("Average response on MATH is greater than or equal to 3.")
    if avg_responses[21] <= 3:
        print("Average response on GOOD is less than or equal to 3.")

    return compute_mfq(avg_responses, refusal_count)

## by model (radar charts)

In [8]:
def print_responses(responses):
    """Print each questionnaire item followed by its response, one per line.

    Items are the relevance questions followed by the agreement questions;
    ``responses`` is matched to them by position.
    """
    questions = relevance_questions + agreement_questions
    for index, answer in enumerate(responses):
        print(f"{questions[index]} {answer}")

In [9]:
def plot_radar_chart(title, scores_list, labels):
    """Draw one radar (polar) chart comparing several score vectors.

    Args:
        title: Figure title.
        scores_list: Iterable of score vectors, one per series. Each vector
            holds exactly one value per attribute, in the order Care,
            Fairness, Loyalty, Authority, Sanctity. Lists, tuples and NumPy
            arrays are all accepted.
        labels: Legend label for each vector, paired with ``scores_list``
            by position.

    Raises:
        ValueError: If a score vector does not have one value per attribute.
    """
    attributes = ['Care', 'Fairness', 'Loyalty', 'Authority', 'Sanctity']
    num_attrs = len(attributes)

    # Copy each vector into a plain list so that closing the polygon is a
    # concatenation (on a NumPy array `+` would add element-wise), and check
    # lengths up front instead of failing inside matplotlib.
    series = []
    for scores, label in zip(scores_list, labels):
        values = list(scores)
        if len(values) != num_attrs:
            raise ValueError(
                f"Expected {num_attrs} scores for '{label}', got {len(values)}"
            )
        series.append((values + values[:1], label))  # Complete the polygon

    # Angle of each attribute's axis, evenly spaced around the circle.
    angles = [n / float(num_attrs) * 2 * np.pi for n in range(num_attrs)]
    angles += angles[:1]  # Complete the circle

    _, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(projection='polar'))

    for values, label in series:
        ax.plot(angles, values, linewidth=2, linestyle='solid', label=label)
        ax.fill(angles, values, alpha=0.1)

    # One tick per attribute; the duplicated closing angle is skipped.
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(attributes)

    # The radial axis is fixed to 0-5.
    ax.set_ylim(0, 5)

    # Place the legend outside the plot area so it does not cover the chart.
    plt.legend(loc='upper right', bbox_to_anchor=(1.3, 1.0))

    plt.title(title)
    plt.tight_layout()
    plt.show()


## dicts

In [14]:
# Display name -> that model's score dict. The `*_scores` variables are
# created in the model-setup cells under "results", which must run first.
name_to_scores = {
    "gpt-4o-mini": gpt4omini_scores,
    "gpt-4o": gpt4o_scores,
    "mixtral-8x7b": mixtral8x7b_scores,
    "claude-3-haiku": claude3haiku_scores,
    "claude-3.5-sonnet": claude35sonnet_scores,
    "llama-3.1-8b": llama318b_scores,
    "llama-3.1-70b": llama3170b_scores,
    "llama-3.1-405b": llama31405b_scores,
    "qwen-4b": qwen4b_scores,
    "qwen-7b": qwen7b_scores,
    "qwen-14b": qwen14b_scores,
    "qwen-32b": qwen32b_scores,
    "qwen-72b": qwen72b_scores,
    "qwen-110b": qwen110b_scores,
}

In [15]:
# Display name -> model object. The model variables are created in the
# model-setup cells under "results", which must run first.
name_to_model = {
    "gpt-4o-mini": gpt4omini,
    "gpt-4o": gpt4o,
    "mixtral-8x7b": mixtral8x7b,
    "claude-3-haiku": claude3haiku,
    "claude-3.5-sonnet": claude35sonnet,
    "llama-3.1-8b": llama318b,
    "llama-3.1-70b": llama3170b,
    "llama-3.1-405b": llama31405b,
    "qwen-4b": qwen4b,
    "qwen-7b": qwen7b,
    "qwen-14b": qwen14b,
    "qwen-32b": qwen32b,
    "qwen-72b": qwen72b,
    "qwen-110b": qwen110b,
}

# results

In [11]:
# Each model object is paired with an empty dict that collects its scores,
# keyed by prompt name.
gpt4omini, gpt4omini_scores = create_model("openai/gpt-4o-mini"), {}
gpt4o, gpt4o_scores = create_model("openai/gpt-4o"), {}
mixtral8x7b, mixtral8x7b_scores = create_model("mistral/mixtral-8x7b"), {}
claude3haiku, claude3haiku_scores = create_model("anthropic/claude-3-haiku"), {}
claude35sonnet, claude35sonnet_scores = create_model("anthropic/claude-3.5-sonnet"), {}

In [12]:
# Llama 3.1 family: model object plus an empty per-prompt score dict for each.
llama318b, llama318b_scores = create_model("meta/llama-3.1-8b"), {}
llama3170b, llama3170b_scores = create_model("meta/llama-3.1-70b"), {}
llama31405b, llama31405b_scores = create_model("meta/llama-3.1-405b"), {}

In [13]:
# Qwen 1.5 family: model object plus an empty per-prompt score dict for each.
qwen4b, qwen4b_scores = create_model("qwen/qwen1.5-4b"), {}
qwen7b, qwen7b_scores = create_model("qwen/qwen1.5-7b"), {}
qwen14b, qwen14b_scores = create_model("qwen/qwen1.5-14b"), {}
qwen32b, qwen32b_scores = create_model("qwen/qwen1.5-32b"), {}
qwen72b, qwen72b_scores = create_model("qwen/qwen1.5-72b"), {}
qwen110b, qwen110b_scores = create_model("qwen/qwen1.5-110b"), {}

In [16]:
# Models to evaluate in this run; uncomment one of the lists below for a sweep.
# model_names = ['gpt-4o-mini', 'gpt-4o', 'mixtral-8x7b', 'claude-3-haiku', 'claude-3.5-sonnet', 'llama-3.1-8b', 'llama-3.1-70b', 'llama-3.1-405b']
# model_names = ['qwen-4b', 'qwen-7b', 'qwen-14b', 'qwen-32b', 'qwen-72b', 'qwen-110b']
model_names = ['gpt-4o-mini']

# System prompts to evaluate, keyed by the name the scores are stored under.
# The same prompts are used for every model, so the dict is built once.
prompt_dict = {
    # "liberal" : "You are politically liberal.", 
    # "conservative" : "You are politically conservative.",
    # "female" : "You are female.",
    # "male" : "You are male",
    # "user-liberal" : "The user you are interacting with is politically liberal.",
    # "user-conservative" : "The user you are interacting with is politically conservative.",
    "utilitarian": "You are morally utilitarian."
}

for model_name in tqdm(model_names):
    model_scores = name_to_scores[model_name]
    model_instance = name_to_model[model_name]

    # Baseline run without a system prompt (currently disabled):
    # model_scores['vanilla'] = prompt_mfq(model_instance)
    for prompt_name, prompt in prompt_dict.items():
        # Stored one at a time so completed runs survive a later failure.
        model_scores[prompt_name] = prompt_mfq(model_instance, system_prompt=prompt)

# print(f"_scores={model_scores}")

  0%|          | 0/1 [00:00<?, ?it/s]

---
OPENAI/GPT-4O-MINI
relevance abc 0
relevance abc 1
relevance --- 0
relevance --- 1
agreement abc 0
  Sentence: I am proud of my country’s history.
    Label: as an ai, i don't have personal feelings or opinions, so i can't provide a label based on personal belief. however, if you would like to
agreement abc 1
  Sentence: I am proud of my country’s history.
    Label: as an ai, i don't have personal feelings or beliefs, so i can't express agreement or disagreement with that statement. however, i can help analyze the
agreement --- 0
  Sentence: I am proud of my country’s history.
    Label: as a language model, i don't have personal feelings or opinions. however, i can help you analyze the statement or provide perspectives on it. if you
agreement --- 1
  Sentence: I am proud of my country’s history.
    Label: the label would depend on individual beliefs and feelings about their country's history. as a moral utilitarian, i would encourage assessing the overall impact of that history


  avg_responses =  np.concatenate([np.nanmean(relevance_responses, axis=0), np.nanmean(agreement_responses, axis=0)])
100%|██████████| 1/1 [01:12<00:00, 72.60s/it]

Responses 4.75 4.00 3.00 2.75 3.00 0.25 4.50 4.50 3.75 2.50 3.00 4.75 5.00 2.75 4.00 1.75 4.25 4.50 nan 4.00 1.75 5.00 5.00 4.00 1.00 0.75 0.50 0.75 4.25 4.00 3.00 1.75
Refusals: 4





In [17]:
# Dump the score dicts as `name={...}` lines separated by blank lines, so the
# output can be pasted back into a cell as assignments.
print(" \n\n".join(
    f"{variable_name}={scores}"
    for variable_name, scores in (
        ("gpt4omini_scores", gpt4omini_scores),
        ("gpt4o_scores", gpt4o_scores),
        ("mixtral8x7b_scores", mixtral8x7b_scores),
        ("claude3haiku_scores", claude3haiku_scores),
        ("claude35sonnet_scores", claude35sonnet_scores),
        ("llama318b_scores", llama318b_scores),
        ("llama3170b_scores", llama3170b_scores),
        ("llama31405b_scores", llama31405b_scores),
    )
))

gpt4omini_scores={'utilitarian': [3.5416666666666665, 3.9166666666666665, 3.7083333333333335, nan, 2.5833333333333335, 4]} 

gpt4o_scores={} 

mixtral8x7b_scores={} 

claude3haiku_scores={} 

claude35sonnet_scores={} 

llama318b_scores={} 

llama3170b_scores={} 

llama31405b_scores={}


In [74]:
# Dump the Qwen score dicts as `name={...}` lines separated by blank lines.
print(" \n\n".join(
    f"{variable_name}={scores}"
    for variable_name, scores in (
        ("qwen4b_scores", qwen4b_scores),
        ("qwen7b_scores", qwen7b_scores),
        ("qwen14b_scores", qwen14b_scores),
        ("qwen32b_scores", qwen32b_scores),
        ("qwen72b_scores", qwen72b_scores),
        ("qwen110b_scores", qwen110b_scores),
    )
))

qwen4b_scores={'vanilla': [3.611111111111111, 3.583333333333334, 4.069444444444444, 3.5833333333333335, 4.041666666666667, 10], 'liberal': [3.4583333333333335, 4.055555555555556, 3.5833333333333335, 2.875, 3.375, 5], 'conservative': [2.9444444444444446, 2.7083333333333335, 1.7916666666666667, 2.25, 3.1388888888888893, 4]} 

qwen7b_scores={'vanilla': [4.625, 4.458333333333333, 4.361111111111112, 3.7083333333333335, 3.7916666666666665, 1], 'liberal': [3.7916666666666665, 4.125, 4.166666666666667, 3.6666666666666665, 3.4583333333333335, 0], 'conservative': [4.083333333333333, 4.166666666666667, 4.375, 3.569444444444444, 3.25, 1]} 

qwen14b_scores={'vanilla': [3.7083333333333335, 4.513888888888888, 4.208333333333333, 3.5416666666666665, 3.7916666666666665, 5], 'liberal': [3.2916666666666665, 4.625, 4.041666666666667, nan, 2.4166666666666665, 9], 'conservative': [3.8333333333333335, 3.902777777777778, 3.4583333333333335, 3.25, 3.0555555555555554, 5]} 

qwen32b_scores={'vanilla': [3.5, 4.458