In [12]:
from enum import Enum
import marvin
import os
import openai
import pandas as pd
from pydantic import BaseModel
import litellm
from groq import Groq

# --- Grader / display configuration ---------------------------------------
# Temperature 0 so the grading model gives the same verdict for the same answer.
marvin.settings.llm_temperature = 0.0
marvin.settings.llm_max_tokens = 200
# NOTE(review): this is a plain module-level name, not a marvin setting, and
# nothing visible in this notebook reads it. Confirm whether
# `marvin.settings.llm_max_context_tokens` was intended.
llm_max_context_tokens = 600
# Key used by marvin's OpenAI-backed grader; the litellm calls receive their keys separately.
openai.api_key = os.environ.get("OPEN_AI_KEY")
marvin.settings.llm_model = 'openai/gpt-4'
# Do not truncate long model answers when a DataFrame is displayed.
pd.set_option('display.max_colwidth', None)

    
def call_model(model: str, message: dict, temperature: float, api_key: str):
    """Send a single chat message to ``model`` through litellm.

    Args:
        model: Model identifier forwarded to ``litellm.completion``.
        message: One chat message dict; it is wrapped in a one-element list.
        temperature: Sampling temperature forwarded to the call.
        api_key: API key forwarded to the call.

    Returns:
        Whatever ``litellm.completion`` returns, or ``None`` if the call raised.
    """
    request = {
        "model": model,
        "messages": [message],
        "temperature": temperature,
        "api_key": api_key,
    }
    try:
        return litellm.completion(**request)
    except Exception as err:
        # Best effort: report the failure and let the caller deal with the missing response.
        print(f"Error calling model: {err}")
    return None

def create_response_df(providers_models: dict, prompts: list):
    """Query every model with every prompt and tabulate the answers.

    Args:
        providers_models: Maps a provider name to the list of model
            identifiers to query for that provider.
        prompts: Prompt strings; each one is sent to each model.

    Returns:
        A DataFrame with one row per (model, prompt) combination, holding the
        ``model``, ``prompt`` and ``response`` values. Rows are ordered by
        provider, then model, then prompt.
    """
    rows = []
    for provider, models in providers_models.items():
        for model in models:
            for prompt in prompts:
                answer = get_model_response(model, prompt, provider)
                rows.append({"model": model, "prompt": prompt, "response": answer})
    return pd.DataFrame(rows)

def get_model_response(model, prompt, provider):
    """Ask ``model`` one question and return its answer as a single-line string.

    The API key is read from the environment variable ``<PROVIDER>_KEY``, where
    ``<PROVIDER>`` is ``provider`` upper-cased.

    Args:
        model: Model identifier passed to ``call_model``.
        prompt: User prompt text, sent as a single ``user`` message.
        provider: Provider name used to build the API-key variable name.

    Returns:
        The message content with newlines replaced by ``', '`` and surrounding
        whitespace stripped. Returns ``"Error or no response"`` when the call
        failed, or when the response carries no usable text (missing or empty
        ``choices``, or ``None`` content).
    """
    no_response = "Error or no response"
    response = call_model(
        model=model,
        message={"role": "user", "content": prompt},
        temperature=0.0,
        api_key=os.environ.get(f"{provider.upper()}_KEY"),
    )
    if not response:
        return no_response
    try:
        content = response["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError, AttributeError):
        # A malformed or empty response must not abort the whole evaluation run.
        return no_response
    if content is None:
        # Content can be absent even on a successful call; treat it as no response.
        return no_response
    return content.replace('\n', ', ').strip()

def classify_responses(df, rubric):
    """Grade each response in ``df`` against ``rubric`` and store the verdict.

    Each value in the ``response`` column is passed to ``marvin.classify`` with
    ``rubric``; the ``.name`` of the result is written to a ``category`` column.

    Args:
        df: DataFrame with a ``response`` column. It is modified in place.
        rubric: Label set handed to ``marvin.classify`` as-is.

    Returns:
        The same ``df`` object, now with a ``category`` column.
    """
    if df.empty:
        # Nothing to grade: add the column without touching the classifier.
        # A row-wise apply on an empty frame would probe the function with a dummy row.
        df['category'] = pd.Series(dtype=object)
        return df
    # Only one column is needed, so map over it instead of applying row by row.
    df['category'] = df['response'].map(lambda text: marvin.classify(text, rubric).name)
    return df

# Rubric for the "how many r's in 'carryforward'" prompt: only an answer of four passes.
# ERROR matches the placeholder text stored when a model call produced no answer.
@marvin.classifier
class NumberofRs(Enum):
    PASS = "Says that the letter r is in the word carryforward four times"
    FAIL = (
        "Says something else, including that the letter r is in the word "
        "carryforward some number of times that is not four"
    )
    ERROR = "Says 'Error or no response' "

# Rubric for the Boston -> Copenhagen time-conversion prompt: 6 PM passes.
# NOTE(review): the paired prompt names no year, and US and EU daylight-saving
# changes fall on different March dates, so the true offset on March 26th can
# be 5 or 6 hours depending on the year. Confirm 6 PM is the intended answer.
# ERROR matches the placeholder text stored when a model call produced no answer.
@marvin.classifier
class TimeDiff(Enum):
    PASS = "Says that the time in Copenhagen is 6 PM"
    FAIL = "Says something else"
    ERROR = "Says 'Error or no response' "

# Rubric for the keyboard-navigation prompt: the expected key is 'y'.
# ERROR matches the placeholder text stored when a model call produced no answer.
@marvin.classifier
class LetterLoc(Enum):
    PASS = "Says that 'y' is the letter"
    FAIL = "Says something else"
    ERROR = "Says 'Error or no response' "

# Rubric for the "2023 Superbowl winner" prompt: the Kansas City Chiefs pass.
# ERROR matches the placeholder text stored when a model call produced no answer.
@marvin.classifier
class SuperBowl2023(Enum):
    PASS = "Says the Kansas City Chiefs"
    FAIL = "Says something else"
    ERROR = "Says 'Error or no response' "

# Rubric for the "2022 Superbowl winner" prompt: the Los Angeles Rams pass.
# ERROR matches the placeholder text stored when a model call produced no answer.
@marvin.classifier
class SuperBowl2022(Enum):
    PASS = "Says the Los Angeles Rams"
    FAIL = "Says something else"
    ERROR = "Says 'Error or no response' "


# Evaluation set: each entry pairs one prompt with the rubric enum used to
# grade the answers to it. Listed as (prompt, rubric) tuples and expanded into
# the {"prompt": ..., "rubric": ...} dicts the driver code reads.
prompt_rubric_pairs = [
    {"prompt": prompt_text, "rubric": rubric_cls}
    for prompt_text, rubric_cls in (
        ("How many times is the letter 'r' in the word 'carryforward'?", NumberofRs),
        ("If it's 12 PM on March 26th in Boston, what time is that in Copenhagen?", TimeDiff),
        (
            "I'm looking at the letter \"b\" on the keyboard. "
            "I now go up to the row above, and slightly to the right, to a key that is adjacent to b. "
            "Then I go up again, and just slightly to the left, to a key that is adjacent to that letter. "
            "So now I'm two rows up from the initial \"b\" letter. What letter am I touching?",
            LetterLoc,
        ),
        ("Which team won the 2022 Superbowl?", SuperBowl2022),
        ("Which team won the 2023 Superbowl?", SuperBowl2023),
    )
]


# Models under evaluation, grouped by provider. The provider name also selects
# the "<PROVIDER>_KEY" environment variable from which the API key is read.
providers_models = {
    "GROQ": ["groq/mixtral-8x7b-32768"],
    "GEMINI": ["gemini/gemini-pro"],
    "OPENAI": ["openai/gpt-3.5-turbo", "openai/gpt-4"],
    "ANTHROPIC": ["anthropic/claude-2"],
}

# One graded DataFrame per prompt: collect every model's answer to that prompt,
# then classify the answers with the rubric paired to it.
df_list = [
    classify_responses(
        create_response_df(providers_models, [entry["prompt"]]),
        entry["rubric"],
    )
    for entry in prompt_rubric_pairs
]

# Stack the per-prompt frames into a single table with a fresh 0..n-1 index.
final_df = pd.concat(df_list, ignore_index=True)



Give Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.

Error calling model: PalmException - No response received. Original response - response: [index: 0
finish_reason: SAFETY
safety_ratings {
  category: HARM_CATEGORY_SEXUALLY_EXPLICIT
  probability: NEGLIGIBLE
}
safety_ratings {
  category: HARM_CATEGORY_HATE_SPEECH
  probability: LOW
}
safety_ratings {
  category: HARM_CATEGORY_HARASSMENT
  probability: MEDIUM
}
safety_ratings {
  category: HARM_CATEGORY_DANGEROUS_CONTENT
  probability: NEGLIGIBLE
}
]
The candidate content was flagged for safety reasons.

Give Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.

Error calling model: PalmException - No response received. Original response - response: [index: 0
finish_reason: SAFETY
safety_ratings {
  category: HARM_CATEGORY

In [13]:
# Display the combined table of graded responses (one row per model and prompt).
final_df

Unnamed: 0,model,prompt,response,category
0,groq/mixtral-8x7b-32768,How many times is the letter 'r' in the word 'carryforward'?,"The letter 'r' appears three times in the word 'carryforward'. Here's the breakdown:, , 1. The 'r' in the first syllable 'car', 2. The 'r' in the middle of the word 'carry', 3. The 'r' at the end of the word 'forward', , So, the letter 'r' is found three times in this word.",FAIL
1,gemini/gemini-pro,How many times is the letter 'r' in the word 'carryforward'?,Error or no response,ERROR
2,openai/gpt-3.5-turbo,How many times is the letter 'r' in the word 'carryforward'?,There are three times the letter 'r' appears in the word 'carryforward'.,FAIL
3,openai/gpt-4,How many times is the letter 'r' in the word 'carryforward'?,The letter 'r' appears 3 times in the word 'carryforward'.,FAIL
4,anthropic/claude-2,How many times is the letter 'r' in the word 'carryforward'?,"Okay, let's analyze this step-by-step:, The word is: carryforward, The letter we are counting is: r, There are two r's in the first part of the word: ""carry"", There are two more r's in the second part of the word: ""forward"", So there are 2 + 2 = 4, , Therefore, the letter 'r' appears 4 times in the word 'carryforward'.",PASS
5,groq/mixtral-8x7b-32768,"If it's 12 PM on March 26th in Boston, what time is that in Copenhagen?","As of my knowledge up to October 2021, Daylight Saving Time (DST) is not considered. The time difference between Boston, Massachusetts (Eastern Standard Time) and Copenhagen, Denmark (Central European Time) is 6 hours. When it's 12 PM (noon) on March 26th in Boston, it would be 6 PM on the same day in Copenhagen. However, it is important to note that DST begins on different dates in the U.S. and Europe, so the time difference may vary depending on the specific dates of the switch. In 2022, DST in the European Union starts on the last Sunday of March, while in the U.S., it starts on the second Sunday of March. Therefore, on March 26, 2023, the time difference between the two cities would be 5 hours, as both places would be in DST. Always double-check time differences during the transition periods of DST to ensure accuracy.",PASS
6,gemini/gemini-pro,"If it's 12 PM on March 26th in Boston, what time is that in Copenhagen?",7 PM on March 26th,FAIL
7,openai/gpt-3.5-turbo,"If it's 12 PM on March 26th in Boston, what time is that in Copenhagen?",It would be 5 PM on March 26th in Copenhagen. Copenhagen is 5 hours ahead of Boston.,FAIL
8,openai/gpt-4,"If it's 12 PM on March 26th in Boston, what time is that in Copenhagen?",That would be 6 PM on March 26th in Copenhagen. Copenhagen is 6 hours ahead of Boston.,PASS
9,anthropic/claude-2,"If it's 12 PM on March 26th in Boston, what time is that in Copenhagen?","Okay, let's think through this step-by-step:, 1) Boston is in the Eastern Time Zone, which is UTC-5. That means Boston time is 5 hours behind Coordinated Universal Time (UTC)., 2) Copenhagen is in the Central European Time Zone, which is UTC+1. That means Copenhagen time is 1 hour ahead of UTC., 3) So if it's 12 PM (noon) in Boston, that's 5 PM UTC., 4) If it's 5 PM UTC, then in Copenhagen (UTC+1) it is 6 PM., , Therefore, if it's 12 PM on March 26th in Boston, the time in Copenhagen is 6 PM on March 26th.",PASS
