In [2]:
import pandas as pd
import dspy
import pandas as pd
from pydantic import BaseModel, Field
from typing import Literal, Tuple
from dspy.teleprompt import BootstrapFewShotWithRandomSearch
import httpx
import os

In [3]:
class Input(BaseModel):
    question: str = Field(description="The question to be answered")
    answer: str = Field(description="The answer to the question")

class Output(BaseModel):
    rating: Literal["Excellent", "Bad", "Could be improved", "Acceptable"] = Field(description="The LLM's rating of the answer")
    feedback: str = Field(description="Feedback explaining the rating")

class LLMJudgeSignature(dspy.Signature):
    """ Rate the quality of the answer based on the question and provide feedback """

    input: Input = dspy.InputField()
    output: Output = dspy.OutputField()

In [4]:
import requests
from dsp import LM
from ratelimit import limits

RL_CALLS=10_000
RL_PERIOD_SECONDS=60

class OpenRouterClient(LM):
    def __init__(self, api_key=None, base_url="https://openrouter.ai/api/v1", model="gpt-4o", extra_headers=None, **kwargs):
        self.api_key = api_key or os.getenv("OPENROUTER_API_KEY")
        self.base_url = base_url
        self.model = model
        self.extra_headers = extra_headers or {}
        self.history = []
        self.provider = "openai"
        self.kwargs = {'temperature': 0.0,
            'max_tokens': 1024,
            'top_p': 1,
            'frequency_penalty': 0,
            'presence_penalty': 0,
            'n': 1}
        self.kwargs.update(kwargs)

    def _get_choice_text(choice):
        return choice["message"]["content"]

    def _get_headers(self):
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }
        headers.update(self.extra_headers)
        return headers

    @limits(calls=RL_CALLS, period=RL_PERIOD_SECONDS)
    def basic_request(self, prompt: str, **kwargs):
        headers = self._get_headers()
        data = {
            "model": self.model,
            "messages": [
                {"role": "user", "content": prompt}
            ],
            **kwargs
        }

        response = requests.post(f"{self.base_url}/chat/completions", headers=headers, json=data)
        response_data = response.json()
        print(response_data)

        self.history.append({
            "prompt": prompt,
            "response": response_data,
            "kwargs": kwargs,
        })

        return response_data

    def __call__(self, prompt, **kwargs):
        req_kwargs = self.kwargs

        if not kwargs:
            req_kwargs.update(kwargs)

        response_data = self.basic_request(prompt, **req_kwargs)
        completions = [choice["message"]["content"] for choice in response_data.get("choices", [])]
        return completions

In [6]:
class LLMJudge:
    def __init__(self, model='gpt-4o', trainset=None):
        self.lm = OpenRouterClient(model=model)
        dspy.settings.configure(lm=self.lm)
        self.predictor = dspy.TypedPredictor(LLMJudgeSignature)
        
        if trainset:
            print("reached here")
            self.optimize(trainset)

    def rate_answer(self, question: str, answer: str) -> Tuple[str, str]:
        input_data = Input(question=question, answer=answer)
        prediction = self.predictor(input=input_data)
        return prediction.output.rating, prediction.output.feedback

    def optimize(self, trainset):
        def metric(example, pred, trace=None):
            return float(example.human_rating == pred.output.rating)

        config = dict(
            max_bootstrapped_demos=4,
            max_labeled_demos=4,
            num_candidate_programs=10,
            num_threads=4
        )

        teleprompter = BootstrapFewShotWithRandomSearch(metric=metric, **config)
        self.predictor = teleprompter.compile(self.predictor, trainset=trainset)

In [7]:
import json
comp_chem_data = json.load(open('questions_with_feedback-v1.json'))

In [8]:
questions = [q['question'] for q in comp_chem_data]
answers = [q['llm_feedback'] for q in comp_chem_data]
feedback = [q['rating'] for q in comp_chem_data]

trainset = [
    dspy.Example(
        input=Input(question=question, answer=answer),
        human_rating=feedback
    ).with_inputs('input')
    for question, answer, feedback in zip(questions, answers, feedback)
]

llm_judge = LLMJudge(trainset=trainset)

reached here
Going to sample between 1 and 4 traces per predictor.
Will attempt to bootstrap 10 candidate sets.


Average Metric: 1.0 / 3  (33.3):  23%|██▎       | 3/13 [00:02<00:05,  1.84it/s]

{'id': 'gen-IRkOURTwxcpoqoDIoxgM3WCvyjzm', 'model': 'openai/gpt-4o', 'object': 'chat.completion', 'created': 1727026919, 'choices': [{'logprobs': None, 'finish_reason': 'stop', 'index': 0, 'message': {'role': 'assistant', 'content': '{\n  "rating": "Excellent",\n  "feedback": "The answer provides a comprehensive and detailed explanation of when to apply implicit solvation models in DFT calculations. It covers various scenarios and reasons for using implicit solvation models, such as studying molecules in solution, simplifying computations, focusing on general solvent effects, and avoiding complexity. The answer also mentions the limitations of implicit solvation models and when explicit solvent molecules might be necessary. Overall, the response is well-structured, informative, and addresses the question thoroughly."\n}', 'refusal': ''}}], 'system_fingerprint': 'fp_e375328146', 'usage': {'prompt_tokens': 552, 'completion_tokens': 105, 'total_tokens': 657}}
{'id': 'gen-EUQooUcCzhmDg1Hfh

Average Metric: 1.0 / 4  (25.0):  31%|███       | 4/13 [00:02<00:03,  2.37it/s]

{'id': 'gen-TfB5KkYGPWTlZaKkoQkd8z9wJgWj', 'model': 'openai/gpt-4o', 'object': 'chat.completion', 'created': 1727026920, 'choices': [{'logprobs': None, 'finish_reason': 'stop', 'index': 0, 'message': {'role': 'assistant', 'content': '{\n  "rating": "Excellent",\n  "feedback": "The answer provides a comprehensive and detailed explanation of the factors to consider when determining the applicability of Density Functional Theory (DFT) to a chemical system. It covers various aspects such as system size, type of elements, nature of chemical bonds, electronic structure, spin states, and desired properties. Additionally, it offers actionable steps and a summary, making it a well-rounded and informative response. The structure is clear, and the information is relevant and accurate, making it an excellent answer to the question."\n}', 'refusal': ''}}], 'system_fingerprint': 'fp_e375328146', 'usage': {'prompt_tokens': 995, 'completion_tokens': 113, 'total_tokens': 1108}}


Average Metric: 4.0 / 7  (57.1):  54%|█████▍    | 7/13 [00:03<00:02,  2.38it/s]

{'id': 'gen-iYOL2P3AKSBmgBxMLVByhl3NWGph', 'model': 'openai/gpt-4o', 'object': 'chat.completion', 'created': 1727026921, 'choices': [{'logprobs': None, 'finish_reason': 'stop', 'index': 0, 'message': {'role': 'assistant', 'content': '{\n  "rating": "Excellent",\n  "feedback": "The answer provided is comprehensive and well-structured, covering a wide range of scenarios where explicit solvation is necessary. It clearly explains the importance of specific solvent-solute interactions, reaction mechanisms, local solvent structure, non-uniform environments, high precision requirements, and solvent-sensitive properties. The use of bullet points and subcategories makes the information easy to follow and understand. Overall, the answer is detailed and informative, making it an excellent response to the question."\n}', 'refusal': ''}}], 'system_fingerprint': 'fp_e375328146', 'usage': {'prompt_tokens': 742, 'completion_tokens': 105, 'total_tokens': 847}}
{'id': 'gen-5PUMVYE2Ct0AWhbafMISCzpYN7FL',

Average Metric: 5.0 / 8  (62.5):  62%|██████▏   | 8/13 [00:03<00:01,  2.81it/s]

{'id': 'gen-gfC97dSbO4HTuzWo1kdF32mqE6FR', 'model': 'openai/gpt-4o', 'object': 'chat.completion', 'created': 1727026922, 'choices': [{'logprobs': None, 'finish_reason': 'stop', 'index': 0, 'message': {'role': 'assistant', 'content': '{\n  "rating": "Excellent",\n  "feedback": "The answer is comprehensive and accurately addresses the question. It explains the concept of conformers in computational chemistry, provides the specific energy window (3 to 5 kcal/mol), and elaborates on the significance of this range in practical terms. The explanation is clear and informative, making it easy to understand why conformers within this energy range are considered significant."\n}', 'refusal': ''}}], 'system_fingerprint': 'fp_e375328146', 'usage': {'prompt_tokens': 314, 'completion_tokens': 84, 'total_tokens': 398}}




{'id': 'gen-C7C6uj8mMJ2zzMqcRMlNFNabPFA1', 'model': 'openai/gpt-4o', 'object': 'chat.completion', 'created': 1727026923, 'choices': [{'logprobs': None, 'finish_reason': 'stop', 'index': 0, 'message': {'role': 'assistant', 'content': '{\n  "rating": "Excellent",\n  "feedback": "The answer is comprehensive and well-structured, covering various scenarios where a conformer search is important. It explains the concept clearly and provides specific examples, making it easy to understand why and when a conformer search should be conducted. The use of bullet points and bold text for key terms enhances readability and emphasis. Overall, the answer is detailed and informative, addressing the question effectively."\n}', 'refusal': ''}}], 'system_fingerprint': 'fp_e375328146', 'usage': {'prompt_tokens': 578, 'completion_tokens': 90, 'total_tokens': 668}}
{'id': 'gen-Q7kmx0W7sDWQStiLk6QsJQm6eTDe', 'model': 'openai/gpt-4o', 'object': 'chat.completion', 'created': 1727026923, 'choices': [{'logprobs':

KeyboardInterrupt: 