In [1]:
import dspy
import os

# USD price of a single prompt / completion token (10 and 30 USD per million tokens).
PRICE_PER_PROMPT_TOKEN = 10 / 1_000_000
PRICE_PER_COMPLETION_TOKEN = 30 / 1_000_000

# Language model client. The API key is taken from the AZURE_OPENAI_API_KEY
# environment variable so that no credential has to be written into (and
# committed with) the notebook; the "API_KEY" placeholder is kept as the
# fallback so the cell still runs unchanged when the variable is not set.
gpt_4 = dspy.AzureOpenAI(
    api_version="2023-12-01-preview",
    api_base="https://agent-eval-east-us-2.openai.azure.com/",
    model="gpt-4-turbo-2024-04-09",
    api_key=os.environ.get("AZURE_OPENAI_API_KEY", "API_KEY"),
)

# Retrieval model: ColBERTv2 server at the wiki17_abstracts endpoint.
colbertv2_wiki17_abstracts = dspy.ColBERTv2(url='http://20.102.90.50:2017/wiki17_abstracts')

# Register the LM and the retriever as the DSPy defaults.
dspy.settings.configure(lm=gpt_4, rm=colbertv2_wiki17_abstracts)

dspy.settings.show_guidelines = False

cache_turn_on: False


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from dspy.datasets import HotPotQA

# Sample HotPotQA with fixed seeds: 50 training, 200 dev and no test examples.
dataset = HotPotQA(train_seed=1, train_size=50, eval_seed=2023, dev_size=200, test_size=0)

# Declare 'question' as the only input field of every example; all remaining
# fields are then treated as labels and/or metadata.
trainset, devset = (
    [example.with_inputs('question') for example in split]
    for split in (dataset.train, dataset.dev)
)

len(trainset), len(devset)

  table = cls._concat_blocks(blocks, axis=0)


(50, 200)

In [3]:
# DSPy signature for the final answering step: `context` and `question` are
# inputs, `answer` is the output.
# NOTE: the class docstring is not just documentation — it is the instruction
# text that opens the prompt sent to the LM (it appears verbatim in the
# `inspect_history` output of this notebook), so editing it changes the prompt.
# The `desc` strings are presumably prompt-facing as well — confirm before editing.
class GenerateAnswer(dspy.Signature):
    """Answer questions with short factoid answers."""

    context = dspy.InputField(desc="may contain relevant facts")
    question = dspy.InputField()
    answer = dspy.OutputField(desc="often between 1 and 5 words")

In [4]:
# DSPy signature for the per-hop query-writing step: `context` and `question`
# are inputs, `query` is the output.
# NOTE: the class docstring is the instruction text that opens the prompt sent
# to the LM (it appears verbatim in the `gpt_4.history` output of this
# notebook), so editing it changes the prompt.
class GenerateSearchQuery(dspy.Signature):
    """Write a simple search query that will help answer a complex question."""

    context = dspy.InputField(desc="may contain relevant facts")
    question = dspy.InputField()
    query = dspy.OutputField()

In [5]:
from dsp.utils import deduplicate

class SimplifiedBaleen(dspy.Module):
    """Multi-hop retrieve-then-answer program.

    For each hop a search query is generated from the question and the context
    collected so far, passages are retrieved for that query and merged
    (deduplicated) into the context. The answer is generated from the context
    accumulated over all hops.
    """

    def __init__(self, passages_per_hop=2, max_hops=2):
        """Create one query generator per hop, a retriever and an answer generator.

        Args:
            passages_per_hop: number of passages requested from the retriever per query.
            max_hops: number of query/retrieve rounds performed by `forward`.
        """
        super().__init__()

        self.max_hops = max_hops

        # A separate chain-of-thought query generator for every hop.
        query_generators = []
        for _ in range(max_hops):
            query_generators.append(dspy.ChainOfThought(GenerateSearchQuery))
        self.generate_query = query_generators

        self.retrieve = dspy.Retrieve(k=passages_per_hop)
        self.generate_answer = dspy.ChainOfThought(GenerateAnswer)

    def forward(self, question):
        """Answer `question`; returns a `dspy.Prediction` with `context` and `answer`."""
        gathered = []

        for hop_idx in range(self.max_hops):
            query_writer = self.generate_query[hop_idx]
            hop_query = query_writer(context=gathered, question=question).query
            retrieved = self.retrieve(hop_query)
            # Append the new passages to what earlier hops found, dropping duplicates.
            gathered = deduplicate(gathered + retrieved.passages)

        final = self.generate_answer(context=gathered, question=question)
        return dspy.Prediction(context=gathered, answer=final.answer)

In [6]:
# Instantiate the program with 3 passages per hop and 3 hops. No prediction is made here;
# calling the program on a question returns a prediction exposing `pred.context` and `pred.answer`.
uncompiled_baleen = SimplifiedBaleen(passages_per_hop=3, max_hops=3)  # uncompiled (i.e., zero-shot) program

In [7]:
# # Ask any question you like to this simple RAG program.
# my_question = "How many storeys are in the castle that David Gregory inherited?"

# pred = uncompiled_baleen(my_question)

# # Print the contexts and the answer.
# print(f"Question: {my_question}")
# print(f"Predicted Answer: {pred.answer}")
# print(f"Retrieved Contexts (truncated): {[c[:200] + '...' for c in pred.context]}")

In [8]:
def validate_context_and_answer_and_hops(example, pred, trace=None):
    """Metric: accept a prediction only if the answer, the context and the hops all pass.

    Args:
        example: gold example; its `question` is used as the first hop.
        pred: program prediction, checked with `answer_exact_match` and
            `answer_passage_match`.
        trace: optional iterable of tuples whose last element holds a step's
            outputs; every output containing a `query` contributes one hop.
            With the default `None` only the question itself is checked.

    Returns:
        bool: False when the answer does not match, when the answer/passage
        check fails, when any hop is longer than 100 characters, or when a
        query (from the second one onwards) matches the earlier hops according
        to `answer_exact_match_str` with `frac=0.8`; True otherwise.
    """
    if not dspy.evaluate.answer_exact_match(example, pred): return False
    if not dspy.evaluate.answer_passage_match(example, pred): return False

    # `trace` defaults to None; iterating None would raise TypeError, so treat
    # a missing trace as "no generated queries".
    steps = trace if trace is not None else []
    hops = [example.question] + [outputs.query for *_, outputs in steps if 'query' in outputs]

    if max(len(h) for h in hops) > 100: return False
    if any(dspy.evaluate.answer_exact_match_str(hops[idx], hops[:idx], frac=0.8) for idx in range(2, len(hops))): return False

    return True

In [9]:
def gold_passages_retrieved(example, pred, trace=None):
    """Return True when every gold title of `example` is among the retrieved passage titles.

    A passage's title is the text before the first ' | ' separator in each
    entry of `pred.context`. Titles on both sides are normalized with
    `dspy.evaluate.normalize_text` before comparison. `trace` is unused.
    """
    normalize = dspy.evaluate.normalize_text

    expected = {normalize(title) for title in example['gold_titles']}
    retrieved = {normalize(passage.split(' | ')[0]) for passage in pred.context}

    return expected <= retrieved

In [10]:
from dspy.evaluate.evaluate import Evaluate

# Set up the `evaluate_on_hotpotqa` evaluator over the dev set (num_threads=1, display_progress=True, display_table=5).
# NOTE(review): nothing in the visible part of this notebook calls it — the evaluation loop below
# iterates over `devset` manually; confirm whether this is still needed.
evaluate_on_hotpotqa = Evaluate(devset=devset, num_threads=1, display_progress=True, display_table=5)

In [11]:
# Sanity check: the LM client has not consumed any tokens yet, so the token
# counters read during the evaluation start from zero.
assert gpt_4.total_completion_tokens == 0 and gpt_4.total_prompt_tokens == 0

## Evaluation

In [12]:
import tiktoken

# Get the tokenizer that tiktoken associates with the "gpt-4" model name; it is
# used by `get_total_demo_tokens` to count demo tokens.
# NOTE(review): the LM configured in this notebook is "gpt-4-turbo-2024-04-09" —
# presumably it uses the same encoding; confirm.
tokenizer = tiktoken.encoding_for_model("gpt-4")


In [13]:
def get_total_demo_tokens(compiled_program):
    """Count the tokens of all input-field values stored in the program's demos.

    For every predictor of `compiled_program`, each demo value whose key is one
    of the predictor signature's input fields is stringified and encoded with
    the module-level `tokenizer`. The total is printed and returned.
    """
    token_count = 0
    for module in compiled_program.predictors():
        input_names = module.signature.input_fields
        for demonstration in module.demos:
            # Only input fields are counted; other demo entries are ignored.
            token_count += sum(
                len(tokenizer.encode(str(field_value)))
                for field_name, field_value in demonstration.items()
                if field_name in input_names
            )
    print("Nr of prompt tokens of demos in compiled_baleen: ", token_count)
    return token_count

In [14]:
import json
import random
import time

from tqdm import tqdm

NR_OF_RUNS = 5
RESULTS_PATH = 'gpt_4_0409_uncompiled_3hops3passagesperhop_eval_results.json'


def _token_cost(prompt_tokens, completion_tokens):
    """Return the USD cost of the given token counts at the configured per-token prices."""
    return prompt_tokens * PRICE_PER_PROMPT_TOKEN + completion_tokens * PRICE_PER_COMPLETION_TOKEN


# One entry per outer run; each entry holds compile-phase token usage, per-run
# accuracy/cost summaries and the per-example records.
program_evals = []
for i in tqdm(range(NR_OF_RUNS)):
    compile_prompt_tokens_pre, compile_completion_tokens_pre = gpt_4.total_prompt_tokens, gpt_4.total_completion_tokens

    # shuffle trainset list (in place; the uncompiled program itself does not read it)
    random.shuffle(trainset)

    uncompiled_baleen = SimplifiedBaleen(passages_per_hop=3, max_hops=3)

    # Tokens consumed between the two counter reads, i.e. while building the program.
    compile_prompt_tokens_post = gpt_4.total_prompt_tokens - compile_prompt_tokens_pre
    compile_completion_tokens_post = gpt_4.total_completion_tokens - compile_completion_tokens_pre

    program_results = {
        'compile_completion_tokens': compile_completion_tokens_post,
        'compile_prompt_tokens': compile_prompt_tokens_post,
        'compile_cost': _token_cost(compile_prompt_tokens_post, compile_completion_tokens_post),
        'demos_prompt_tokens': 0,
        'eval_accuracies': {},
        'eval_costs': {},
        'per_task_results': {}
    }

    # Evaluate each task in the dev set (currently a single pass, stored under 'run1')
    for run in range(1, 2):
        run_key = f'run{run}'
        task_results = []
        program_results['per_task_results'][run_key] = task_results
        program_results['eval_accuracies'][run_key] = 0
        program_results['eval_costs'][run_key] = 0

        for example in devset:
            start_time = time.time()

            # Counter values before the call, used to compute this example's token usage
            completion_tokens_pre = gpt_4.total_completion_tokens
            prompt_tokens_pre = gpt_4.total_prompt_tokens

            # Reset so that a failed call can never record the previous example's
            # prediction (or hit a NameError on the very first example).
            pred = None
            try:
                # Evaluate the task
                pred = uncompiled_baleen(example.question)
                # Evaluate overall performance using answer exact match metric
                exact_match = dspy.evaluate.answer_exact_match(example, pred)
                print(f"Answer Exact Match: {exact_match}")
            except Exception as exc:
                # A failed example counts as a miss. `Exception` (not a bare
                # except) keeps Ctrl-C / kernel interrupts working.
                print(f"Error evaluating task: {exc!r}")
                exact_match = False

            # Tokens consumed by this example
            completion_tokens_post = gpt_4.total_completion_tokens - completion_tokens_pre
            prompt_tokens_post = gpt_4.total_prompt_tokens - prompt_tokens_pre
            print(f"Completion Tokens: {completion_tokens_post}, Prompt Tokens: {prompt_tokens_post}")

            # Log evaluation time
            eval_time = time.time() - start_time
            print(f"Evaluation Time: {eval_time} seconds")

            task_results.append({
                'question': example.question,
                'pred': pred.answer if pred is not None else None,
                # Key spelling ('groud_truth') is kept as-is so existing result files stay comparable.
                'groud_truth': example.answer,
                'exact_match': exact_match,
                'completion_tokens': completion_tokens_post,
                'prompt_tokens': prompt_tokens_post,
                'cost': _token_cost(prompt_tokens_post, completion_tokens_post),
                'eval_time': eval_time
            })

        # Calculate the overall evaluation accuracy (0.0 for an empty dev set instead of dividing by zero)
        program_results['eval_accuracies'][run_key] = (
            sum(task['exact_match'] for task in task_results) / len(task_results) if task_results else 0.0
        )

        # Calculate the overall cost
        program_results['eval_costs'][run_key] = sum(task['cost'] for task in task_results)

    program_evals.append(program_results)

    # Write the results collected so far after every outer run: a full pass takes
    # over an hour, so a late failure must not discard the completed runs. The
    # file is overwritten each time; after the last run it holds all results.
    with open(RESULTS_PATH, 'w') as f:
        json.dump(program_evals, f)

  0%|          | 0/5 [00:00<?, ?it/s]

Answer Exact Match: False
Completion Tokens: 292, Prompt Tokens: 3004
Evaluation Time: 19.012964010238647 seconds
Answer Exact Match: False
Completion Tokens: 447, Prompt Tokens: 3807
Evaluation Time: 48.209317684173584 seconds
Answer Exact Match: True
Completion Tokens: 362, Prompt Tokens: 3557
Evaluation Time: 29.971524238586426 seconds
Answer Exact Match: True
Completion Tokens: 197, Prompt Tokens: 3078
Evaluation Time: 15.290457248687744 seconds
Answer Exact Match: False
Completion Tokens: 502, Prompt Tokens: 3517
Evaluation Time: 36.88913369178772 seconds
Answer Exact Match: True
Completion Tokens: 261, Prompt Tokens: 3377
Evaluation Time: 14.24359941482544 seconds
Answer Exact Match: False
Completion Tokens: 629, Prompt Tokens: 4667
Evaluation Time: 46.198753356933594 seconds
Answer Exact Match: False
Completion Tokens: 176, Prompt Tokens: 2772
Evaluation Time: 19.73962688446045 seconds
Answer Exact Match: False
Completion Tokens: 276, Prompt Tokens: 3805
Evaluation Time: 22.0713

 20%|██        | 1/5 [1:26:57<5:47:50, 5217.73s/it]

Answer Exact Match: False
Completion Tokens: 225, Prompt Tokens: 3172
Evaluation Time: 22.46017551422119 seconds
Answer Exact Match: False
Completion Tokens: 335, Prompt Tokens: 3262
Evaluation Time: 31.250972270965576 seconds
Answer Exact Match: False
Completion Tokens: 444, Prompt Tokens: 4128
Evaluation Time: 40.7805712223053 seconds
Answer Exact Match: True
Completion Tokens: 381, Prompt Tokens: 4428
Evaluation Time: 29.523544311523438 seconds
Answer Exact Match: True
Completion Tokens: 194, Prompt Tokens: 3076
Evaluation Time: 15.586866855621338 seconds
Answer Exact Match: True
Completion Tokens: 359, Prompt Tokens: 4836
Evaluation Time: 32.21825623512268 seconds
Answer Exact Match: True
Completion Tokens: 307, Prompt Tokens: 3623
Evaluation Time: 20.531700611114502 seconds
Answer Exact Match: False
Completion Tokens: 419, Prompt Tokens: 5298
Evaluation Time: 44.875377893447876 seconds
Answer Exact Match: False
Completion Tokens: 159, Prompt Tokens: 2758
Evaluation Time: 17.260162

 40%|████      | 2/5 [2:56:20<4:25:08, 5302.94s/it]

Answer Exact Match: False
Completion Tokens: 134, Prompt Tokens: 2916
Evaluation Time: 18.973766565322876 seconds
Answer Exact Match: False
Completion Tokens: 180, Prompt Tokens: 3117
Evaluation Time: 16.063294172286987 seconds
Answer Exact Match: False
Completion Tokens: 398, Prompt Tokens: 3853
Evaluation Time: 36.481083154678345 seconds
Answer Exact Match: True
Completion Tokens: 366, Prompt Tokens: 4155
Evaluation Time: 38.92368197441101 seconds
Answer Exact Match: True
Completion Tokens: 196, Prompt Tokens: 3078
Evaluation Time: 24.229981184005737 seconds
Answer Exact Match: False
Completion Tokens: 439, Prompt Tokens: 4227
Evaluation Time: 33.963685035705566 seconds
Answer Exact Match: True
Completion Tokens: 261, Prompt Tokens: 3377
Evaluation Time: 23.77752137184143 seconds
Answer Exact Match: False
Completion Tokens: 548, Prompt Tokens: 6041
Evaluation Time: 40.264129400253296 seconds
Answer Exact Match: False
Completion Tokens: 154, Prompt Tokens: 2440
Evaluation Time: 14.293

 60%|██████    | 3/5 [4:21:44<2:54:02, 5221.34s/it]

Answer Exact Match: False
Completion Tokens: 158, Prompt Tokens: 2946
Evaluation Time: 12.949060916900635 seconds
Answer Exact Match: False
Completion Tokens: 270, Prompt Tokens: 3210
Evaluation Time: 18.466207265853882 seconds
Answer Exact Match: False
Completion Tokens: 326, Prompt Tokens: 3297
Evaluation Time: 20.252460718154907 seconds
Answer Exact Match: True
Completion Tokens: 299, Prompt Tokens: 3306
Evaluation Time: 22.676756620407104 seconds
Answer Exact Match: True
Completion Tokens: 209, Prompt Tokens: 3092
Evaluation Time: 17.969764947891235 seconds
Answer Exact Match: False
Completion Tokens: 468, Prompt Tokens: 4020
Evaluation Time: 42.739999532699585 seconds
Answer Exact Match: True
Completion Tokens: 262, Prompt Tokens: 2808
Evaluation Time: 18.923916816711426 seconds
Answer Exact Match: False
Completion Tokens: 427, Prompt Tokens: 3316
Evaluation Time: 37.10895037651062 seconds
Answer Exact Match: False
Completion Tokens: 161, Prompt Tokens: 2676
Evaluation Time: 14.00

 80%|████████  | 4/5 [5:38:40<1:23:02, 4982.52s/it]

Answer Exact Match: False
Completion Tokens: 250, Prompt Tokens: 3025
Evaluation Time: 16.53172469139099 seconds
Answer Exact Match: False
Completion Tokens: 302, Prompt Tokens: 3438
Evaluation Time: 17.36777925491333 seconds
Answer Exact Match: False
Completion Tokens: 403, Prompt Tokens: 3134
Evaluation Time: 20.75030207633972 seconds
Answer Exact Match: True
Completion Tokens: 327, Prompt Tokens: 5124
Evaluation Time: 20.088366508483887 seconds
Answer Exact Match: True
Completion Tokens: 222, Prompt Tokens: 3104
Evaluation Time: 19.63569951057434 seconds
Answer Exact Match: True
Completion Tokens: 410, Prompt Tokens: 3876
Evaluation Time: 25.554605960845947 seconds
Answer Exact Match: True
Completion Tokens: 228, Prompt Tokens: 2775
Evaluation Time: 20.67227292060852 seconds
Answer Exact Match: False
Completion Tokens: 432, Prompt Tokens: 3569
Evaluation Time: 29.055702924728394 seconds
Answer Exact Match: False
Completion Tokens: 198, Prompt Tokens: 2793
Evaluation Time: 23.5021998

100%|██████████| 5/5 [6:44:47<00:00, 4857.52s/it]  

Answer Exact Match: False
Completion Tokens: 168, Prompt Tokens: 3550
Evaluation Time: 13.289856672286987 seconds





In [16]:
# Display the raw records accumulated on the LM client (each holds a 'prompt' and the 'response').
# NOTE(review): this dumps every recorded call; the output can get very large.
gpt_4.history

[{'prompt': "Write a simple search query that will help answer a complex question.\n\n---\n\nContext: N/A\n\nQuestion: Are both Cangzhou and Qionghai in the Hebei province of China?\n\nReasoning: Let's think step by step in order to",
  'response': {'id': 'chatcmpl-9QHwRKANTmlzEM4AfHcBtUtg2qoXX',
   'choices': [{'finish_reason': 'stop',
     'index': 0,
     'logprobs': None,
     'message': {'content': 'construct a search query that will help us find the answer to the question. First, we need to identify the key elements of the question: "Cangzhou," "Qionghai," and "Hebei province." We want to find out if both Cangzhou and Qionghai are located in the Hebei province of China. A simple and effective search query would be:\n\n"Cangzhou Qionghai Hebei province China locations"\n\nThis query includes all the key elements and is likely to yield results that specify the locations of Cangzhou and Qionghai in relation to the Hebei province.',
      'role': 'assistant',
      'function_call': N

In [15]:
# Print recent prompts together with their completions for manual inspection
# (presumably the last 50 interactions — confirm the meaning of the argument).
gpt_4.inspect_history(50)




Answer questions with short factoid answers.

---

Context:
[1] «Eldridge Cleaver | Leroy Eldridge Cleaver (August 31, 1935 – May 1, 1998) was an American writer and political activist who became an early leader of the Black Panther Party. His 1968 book, "Soul On Ice", is a collection of essays that, at the time of its publication, was praised by "The New York Times Book Review" as "brilliant and revealing". In the most controversial part of the book, Cleaver acknowledges committing many acts of rape.»
[2] «Soul On Ice (book) | Soul On Ice is a memoir and collection of essays by Eldridge Cleaver. Originally written in Folsom State Prison in 1965, and published three years later in 1968, it is Cleaver's best known writing and remains a seminal work in African-American literature. The treatises were first printed in the nationally-circulated monthly "Ramparts" and became widely read (even praised by Norman Mailer) for their illustration and commentary on "Black America". Throughout hi

'\n\n\nAnswer questions with short factoid answers.\n\n---\n\nContext:\n[1] «Eldridge Cleaver | Leroy Eldridge Cleaver (August 31, 1935 – May 1, 1998) was an American writer and political activist who became an early leader of the Black Panther Party. His 1968 book, "Soul On Ice", is a collection of essays that, at the time of its publication, was praised by "The New York Times Book Review" as "brilliant and revealing". In the most controversial part of the book, Cleaver acknowledges committing many acts of rape.»\n[2] «Soul On Ice (book) | Soul On Ice is a memoir and collection of essays by Eldridge Cleaver. Originally written in Folsom State Prison in 1965, and published three years later in 1968, it is Cleaver\'s best known writing and remains a seminal work in African-American literature. The treatises were first printed in the nationally-circulated monthly "Ramparts" and became widely read (even praised by Norman Mailer) for their illustration and commentary on "Black America". Th

### Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Extracting values for plotting.
# Each entry of `program_evals` stores its summaries as per-run dictionaries
# under 'eval_costs' and 'eval_accuracies' (e.g. {'run1': ...}); there are no
# 'eval_cost' / 'eval_acc' keys. Use the mean over the recorded runs, which
# equals the single value when only one run was recorded.
x = [sum(program['eval_costs'].values()) / len(program['eval_costs']) for program in program_evals]
y = [sum(program['eval_accuracies'].values()) / len(program['eval_accuracies']) for program in program_evals]
joint_optimized = ["Joint optimization"] * len(program_evals)

# add one manual point to the plot
# NOTE(review): hard-coded (cost, accuracy) reference labelled "DSPy optimization";
# its origin is not shown in this notebook — confirm the numbers.
x.append(0.607312)
y.append(0.325)
joint_optimized.append("DSPy optimization")

# Function to determine the Pareto frontier
def identify_pareto(scores):
    """Flag which points lie on the Pareto front of (cost, accuracy) pairs.

    Args:
        scores: sequence of points; index 0 of each point is the cost (lower
            is better) and index 1 is the accuracy (higher is better).

    Returns:
        list[bool]: one flag per point, True when no other point dominates it.
        A point is dominated when another point is at least as good on both
        criteria and strictly better on at least one, so a point with the same
        cost but lower accuracy is correctly excluded. Identical points do not
        dominate each other.
    """
    pareto_front = []
    for candidate in scores:
        cost, accuracy = candidate[0], candidate[1]
        dominated = any(
            other[0] <= cost and other[1] >= accuracy
            and (other[0] < cost or other[1] > accuracy)
            for other in scores
        )
        pareto_front.append(not dominated)
    return pareto_front

# Determine which programs are on the Pareto frontier
pareto_status = identify_pareto(list(zip(x, y)))
pareto_points = [(x[i], y[i]) for i in range(len(x)) if pareto_status[i]]

# Sort Pareto points by x (cost) so the frontier can be drawn left to right
pareto_points_sorted = sorted(pareto_points, key=lambda point: point[0])
pareto_x = [point[0] for point in pareto_points_sorted]
pareto_y = [point[1] for point in pareto_points_sorted]

# Set to True to overlay the Pareto frontier as a dashed red line (off by default).
SHOW_PARETO_FRONTIER = False

# Scatter plot of cost vs. accuracy, coloured by the label in `joint_optimized`.
# The labels are always strings, so the former "all entries are False" branch
# could never run and has been removed.
plt.figure(figsize=(8, 5))
scatter_plot = sns.scatterplot(
    x=x,
    y=y,
    hue=joint_optimized,
    palette=sns.color_palette("Dark2")[:2],
    marker='o',
    s=100
)
if SHOW_PARETO_FRONTIER:
    plt.plot(pareto_x, pareto_y, color='red', linestyle='--', marker='o', label='Pareto Frontier')

plt.ylim(0, .6)
plt.xlim(0, .7)

plt.ylabel('accuracy')
plt.xlabel('cost (USD) (Variable cost incurred during evaluation)')
plt.title('Pareto Frontier of Programs (HotpotQA)')
plt.grid(False)

plt.legend(loc="lower right")


plt.show()
