In [1]:
import dspy
import os

# Token prices in USD per single token (10 and 30 per million tokens).
# They are multiplied with the LM's token counters to estimate cost.
PRICE_PER_PROMPT_TOKEN = (10/1000000)
PRICE_PER_COMPLETION_TOKEN = (30/1000000)

# Language model client. The key is read from the AZURE_OPENAI_API_KEY
# environment variable so that no secret has to be stored in the notebook;
# the previous placeholder literal is kept as the fallback value.
gpt_4 = dspy.AzureOpenAI(
    api_version = "2023-12-01-preview",
    api_base = "https://agent-eval-east-us-2.openai.azure.com/",
    model = "gpt-4-turbo-2024-04-09",
    api_key = os.environ.get("AZURE_OPENAI_API_KEY", "API_KEY")
)

# Retrieval model: a hosted ColBERTv2 endpoint (the URL names a
# `wiki17_abstracts` index).
colbertv2_wiki17_abstracts = dspy.ColBERTv2(url='http://20.102.90.50:2017/wiki17_abstracts')

# Register the LM and the retriever as the defaults for all DSPy modules.
dspy.settings.configure(lm=gpt_4, rm=colbertv2_wiki17_abstracts)

# NOTE(review): `show_guidelines` looks like a setting of a customised DSPy
# build -- confirm it exists in the installed version.
dspy.settings.show_guidelines = True

cache_turn_on: False


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from dspy.datasets import HotPotQA

# Build the HotPotQA splits: 50 training and 200 dev examples, no test split.
dataset = HotPotQA(
    train_seed=1,
    train_size=50,
    eval_seed=2023,
    dev_size=200,
    test_size=0,
)

# Mark 'question' as the only input field; every other field of an example
# is then treated as a label and/or metadata.
trainset = [example.with_inputs('question') for example in dataset.train]
devset = [example.with_inputs('question') for example in dataset.dev]

len(trainset), len(devset)

  table = cls._concat_blocks(blocks, axis=0)


(50, 200)

In [3]:
# Signature for the final answering step: given the gathered context and the
# question, produce a short answer.
# NOTE: the class docstring and the `desc` strings below are part of the
# prompt sent to the LM, so editing them changes model behaviour.
class GenerateAnswer(dspy.Signature):
    """Answer questions with short factoid answers."""

    # Inputs: retrieved passages and the question being answered.
    context = dspy.InputField(desc="may contain relevant facts")
    question = dspy.InputField()
    # Output: the answer text read by the pipeline as `pred.answer`.
    answer = dspy.OutputField(desc="often between 1 and 5 words")

In [4]:
# Signature for one retrieval hop: given the context gathered so far and the
# question, produce a search query for the retriever.
# NOTE: the class docstring and the `desc` strings below are part of the
# prompt sent to the LM, so editing them changes model behaviour.
class GenerateSearchQuery(dspy.Signature):
    """Write a simple search query that will help answer a complex question."""

    # Inputs: passages collected by earlier hops (empty on the first hop) and
    # the original question.
    context = dspy.InputField(desc="may contain relevant facts")
    question = dspy.InputField()
    # Output: the query string that is passed to the retriever.
    query = dspy.OutputField()

In [5]:
from dsp.utils import deduplicate

class SimplifiedBaleen(dspy.Module):
    """Multi-hop question answering: alternate query generation and retrieval
    for a fixed number of hops, then answer from the collected passages.

    Args:
        passages_per_hop: number of passages requested from the retriever on
            every hop.
        max_hops: number of query/retrieve rounds; one dedicated query
            generator is created per hop.
    """

    def __init__(self, passages_per_hop=2, max_hops=2):
        super().__init__()

        # One chain-of-thought query writer per hop.
        query_writers = []
        for _ in range(max_hops):
            query_writers.append(dspy.ChainOfThought(GenerateSearchQuery))
        self.generate_query = query_writers
        self.retrieve = dspy.Retrieve(k=passages_per_hop)
        self.generate_answer = dspy.ChainOfThought(GenerateAnswer)
        self.max_hops = max_hops

    def forward(self, question):
        """Run all hops for `question` and return a `dspy.Prediction` holding
        the collected `context` passages and the final `answer`."""
        gathered = []

        for hop_index in range(self.max_hops):
            query_writer = self.generate_query[hop_index]
            search_query = query_writer(context=gathered, question=question).query
            retrieved = self.retrieve(search_query)
            # Merge the new passages into the running context without repeats.
            gathered = deduplicate(gathered + retrieved.passages)

        final = self.generate_answer(context=gathered, question=question)
        return dspy.Prediction(context=gathered, answer=final.answer)

In [6]:
# Instantiate the pipeline with 3 hops and 3 passages per hop. No prediction
# is made here; calling the program on a question returns an object exposing
# `pred.context` and `pred.answer`.
uncompiled_baleen = SimplifiedBaleen(passages_per_hop=3, max_hops=3)  # uncompiled (i.e., zero-shot) program

In [7]:
# # Ask any question you like to this simple RAG program.
# my_question = "How many storeys are in the castle that David Gregory inherited?"

# pred = uncompiled_baleen(my_question)

# # Print the contexts and the answer.
# print(f"Question: {my_question}")
# print(f"Predicted Answer: {pred.answer}")
# print(f"Retrieved Contexts (truncated): {[c[:200] + '...' for c in pred.context]}")

In [8]:
def validate_context_and_answer_and_hops(example, pred, trace=None):
    """Metric that checks the answer and the quality of the search queries.

    Args:
        example: gold example; `example.question` is read here and the object
            is forwarded to the DSPy answer-matching helpers.
        pred: program prediction, forwarded to the DSPy matching helpers.
        trace: optional iterable of execution steps whose last element is the
            step's outputs. When omitted (`None`) it is treated as an empty
            trace instead of raising a TypeError.

    Returns:
        True only if both DSPy answer checks pass, no hop (the question or any
        generated query) is longer than 100 characters, and no query from the
        second one onwards matches an earlier hop at `frac=0.8`.
    """
    if not dspy.evaluate.answer_exact_match(example, pred):
        return False
    if not dspy.evaluate.answer_passage_match(example, pred):
        return False

    # A missing trace means that no intermediate queries are available.
    steps = trace if trace is not None else []
    hops = [example.question] + [outputs.query for *_, outputs in steps if 'query' in outputs]

    # Reject overly long hops (the question itself is included in the check).
    if max(len(h) for h in hops) > 100:
        return False
    # Reject a query that largely repeats the question or an earlier query.
    if any(dspy.evaluate.answer_exact_match_str(hops[idx], hops[:idx], frac=0.8) for idx in range(2, len(hops))):
        return False

    return True

In [9]:
def gold_passages_retrieved(example, pred, trace=None):
    """Return True when every gold title appears among the retrieved passages.

    The title of a passage is the text before its first ' | ' separator; both
    sides are normalised with `dspy.evaluate.normalize_text` before comparing.
    `trace` is accepted for metric-signature compatibility and is not used.
    """
    normalize = dspy.evaluate.normalize_text

    wanted = {normalize(title) for title in example['gold_titles']}
    retrieved = {normalize(passage.split(' | ')[0]) for passage in pred.context}

    return wanted <= retrieved

In [10]:
from dspy.evaluate.evaluate import Evaluate

# Reusable evaluator bound to the dev set: single-threaded, with a progress
# bar, and `display_table=5` for the results table.
evaluate_on_hotpotqa = Evaluate(
    devset=devset,
    num_threads=1,
    display_progress=True,
    display_table=5,
)

In [11]:
# Sanity check before measuring: the LM's running token counters must still be
# zero, otherwise the cost figures computed below would include earlier calls.
assert gpt_4.total_completion_tokens == 0
assert gpt_4.total_prompt_tokens == 0

## Evaluation

In [12]:
import tiktoken

# Tokenizer that tiktoken associates with the "gpt-4" model name; it is used
# by `get_total_demo_tokens` to count the tokens of demo field values.
tokenizer = tiktoken.encoding_for_model("gpt-4")


In [13]:
def get_total_demo_tokens(compiled_program):
    """Count the tokens that the demos of a program contribute to its prompts.

    For every predictor of `compiled_program`, each demo value whose key is
    one of the predictor signature's input fields is converted to `str` and
    encoded with the module-level `tokenizer`. The grand total is printed and
    returned.
    """
    token_total = 0
    for module in compiled_program.predictors():
        allowed_fields = module.signature.input_fields
        for demonstration in module.demos:
            token_total += sum(
                len(tokenizer.encode(str(field_value)))
                for field_name, field_value in demonstration.items()
                if field_name in allowed_fields
            )
    print("Nr of prompt tokens of demos in compiled_baleen: ", token_total)
    return token_total

In [14]:
import json
import random
import time

from tqdm import tqdm

NR_OF_RUNS = 5

# One entry per outer run; each entry holds the token/cost bookkeeping and the
# per-example results of the uncompiled program on the dev set.
program_evals = []
for i in tqdm(range(NR_OF_RUNS)):
    # Snapshot the LM counters so that the "compile" phase can be costed as a
    # delta. No optimiser is run in this notebook, so the deltas stay at zero
    # unless instantiating the program itself triggers LM calls.
    compile_prompt_tokens_pre, compile_completion_tokens_pre = gpt_4.total_prompt_tokens, gpt_4.total_completion_tokens

    # shuffle trainset list (in place; kept from the original flow although
    # the uncompiled program does not consume the training set)
    random.shuffle(trainset)

    uncompiled_baleen = SimplifiedBaleen(passages_per_hop=3, max_hops=3)

    compile_prompt_tokens_post = gpt_4.total_prompt_tokens - compile_prompt_tokens_pre
    compile_completion_tokens_post = gpt_4.total_completion_tokens - compile_completion_tokens_pre

    program_results = {
        'compile_completion_tokens': compile_completion_tokens_post,
        'compile_prompt_tokens': compile_prompt_tokens_post,
        'compile_cost': compile_prompt_tokens_post * PRICE_PER_PROMPT_TOKEN + compile_completion_tokens_post * PRICE_PER_COMPLETION_TOKEN,
        'demos_prompt_tokens': 0,
        'eval_accuracies': {},
        'eval_costs': {},
        'per_task_results': {}
    }

    # Evaluate each task in the dev set
    for run in range(1, 2):
        run_key = f'run{run}'
        task_results = []
        program_results['per_task_results'][run_key] = task_results
        program_results['eval_accuracies'][run_key] = 0
        program_results['eval_costs'][run_key] = 0
        for example in devset:
            start_time = time.time()

            # Log completion and prompt tokens
            completion_tokens_pre = gpt_4.total_completion_tokens
            prompt_tokens_pre = gpt_4.total_prompt_tokens

            # Reset per example so that a failure can never record the
            # prediction of the previous example (or hit an undefined name).
            pred = None
            try:
                # Evaluate the task
                pred = uncompiled_baleen(example.question)
                # Evaluate overall performance using answer exact match metric
                exact_match = dspy.evaluate.answer_exact_match(example, pred)
                print(f"Answer Exact Match: {exact_match}")
            except Exception as exc:
                # `Exception` (not a bare except) keeps Ctrl-C able to stop
                # this multi-hour loop; the failure counts as a miss.
                print(f"Error evaluating task: {exc!r}")
                exact_match = False

            # Log completion and prompt tokens
            completion_tokens_post = gpt_4.total_completion_tokens - completion_tokens_pre
            prompt_tokens_post = gpt_4.total_prompt_tokens - prompt_tokens_pre
            print(f"Completion Tokens: {completion_tokens_post}, Prompt Tokens: {prompt_tokens_post}")

            # Log evaluation time
            eval_time = time.time() - start_time
            print(f"Evaluation Time: {eval_time} seconds")

            task_results.append({
                'question': example.question,
                # None when the program failed before producing an answer.
                'pred': getattr(pred, 'answer', None),
                # Key spelling kept as-is so existing result files and readers
                # stay compatible.
                'groud_truth': example.answer,
                'exact_match': exact_match,
                'completion_tokens': completion_tokens_post,
                'prompt_tokens': prompt_tokens_post,
                'cost': prompt_tokens_post * PRICE_PER_PROMPT_TOKEN + completion_tokens_post * PRICE_PER_COMPLETION_TOKEN,
                'eval_time': eval_time
            })

        # Calculate the overall evaluation accuracy (0 for an empty dev set)
        if task_results:
            program_results['eval_accuracies'][run_key] = sum(task['exact_match'] for task in task_results) / len(task_results)

        # Calculate the overall cost
        program_results['eval_costs'][run_key] = sum(task['cost'] for task in task_results)

    program_evals.append(program_results)

# write the results to a file
with open('gpt_4_0409_format_instr_3hops3passagesperhop_eval_results.json', 'w') as f:
    json.dump(program_evals, f)

  0%|          | 0/5 [00:00<?, ?it/s]

Answer Exact Match: True
Completion Tokens: 318, Prompt Tokens: 1621
Evaluation Time: 24.012601375579834 seconds
Answer Exact Match: True
Completion Tokens: 420, Prompt Tokens: 1557
Evaluation Time: 53.07578897476196 seconds
Answer Exact Match: True
Completion Tokens: 375, Prompt Tokens: 2173
Evaluation Time: 29.02118945121765 seconds
Answer Exact Match: True
Completion Tokens: 411, Prompt Tokens: 1630
Evaluation Time: 32.55620837211609 seconds
Answer Exact Match: False
Completion Tokens: 493, Prompt Tokens: 2883
Evaluation Time: 41.63037657737732 seconds
Answer Exact Match: True
Completion Tokens: 291, Prompt Tokens: 1908
Evaluation Time: 37.79394459724426 seconds
Answer Exact Match: False
Completion Tokens: 455, Prompt Tokens: 1739
Evaluation Time: 33.16438817977905 seconds
Answer Exact Match: True
Completion Tokens: 305, Prompt Tokens: 1605
Evaluation Time: 15.704911470413208 seconds
Answer Exact Match: False
Completion Tokens: 260, Prompt Tokens: 1968
Evaluation Time: 21.4700763225

 20%|██        | 1/5 [1:30:24<6:01:39, 5424.97s/it]

Answer Exact Match: False
Completion Tokens: 287, Prompt Tokens: 1664
Evaluation Time: 29.156190872192383 seconds
Answer Exact Match: True
Completion Tokens: 347, Prompt Tokens: 2018
Evaluation Time: 22.592812299728394 seconds
Answer Exact Match: True
Completion Tokens: 320, Prompt Tokens: 2051
Evaluation Time: 21.449559450149536 seconds
Answer Exact Match: True
Completion Tokens: 376, Prompt Tokens: 2370
Evaluation Time: 20.140151262283325 seconds
Answer Exact Match: True
Completion Tokens: 431, Prompt Tokens: 1630
Evaluation Time: 38.04063630104065 seconds
Answer Exact Match: False
Completion Tokens: 474, Prompt Tokens: 1993
Evaluation Time: 33.737486362457275 seconds
Answer Exact Match: True
Completion Tokens: 302, Prompt Tokens: 2104
Evaluation Time: 29.180726528167725 seconds
Answer Exact Match: False
Completion Tokens: 526, Prompt Tokens: 2141
Evaluation Time: 48.399463176727295 seconds
Answer Exact Match: True
Completion Tokens: 274, Prompt Tokens: 1531
Evaluation Time: 14.85719

 40%|████      | 2/5 [2:49:26<4:11:09, 5023.21s/it]

Answer Exact Match: False
Completion Tokens: 288, Prompt Tokens: 1742
Evaluation Time: 22.100959539413452 seconds
Answer Exact Match: True
Completion Tokens: 318, Prompt Tokens: 1903
Evaluation Time: 14.065212726593018 seconds
Answer Exact Match: True
Completion Tokens: 378, Prompt Tokens: 1316
Evaluation Time: 16.138530492782593 seconds
Answer Exact Match: True
Completion Tokens: 321, Prompt Tokens: 2270
Evaluation Time: 13.162086248397827 seconds
Answer Exact Match: True
Completion Tokens: 430, Prompt Tokens: 1630
Evaluation Time: 17.806697368621826 seconds
Answer Exact Match: False
Completion Tokens: 357, Prompt Tokens: 2778
Evaluation Time: 16.56551480293274 seconds
Answer Exact Match: True
Completion Tokens: 313, Prompt Tokens: 1585
Evaluation Time: 18.342586994171143 seconds
Answer Exact Match: False
Completion Tokens: 426, Prompt Tokens: 2256
Evaluation Time: 19.113617420196533 seconds
Answer Exact Match: True
Completion Tokens: 296, Prompt Tokens: 1781
Evaluation Time: 23.27654

 60%|██████    | 3/5 [3:50:56<2:27:08, 4414.06s/it]

Answer Exact Match: False
Completion Tokens: 283, Prompt Tokens: 1728
Evaluation Time: 11.776280164718628 seconds
Answer Exact Match: True
Completion Tokens: 298, Prompt Tokens: 1845
Evaluation Time: 23.16066336631775 seconds
Answer Exact Match: True
Completion Tokens: 338, Prompt Tokens: 1316
Evaluation Time: 13.946072578430176 seconds
Answer Exact Match: True
Completion Tokens: 359, Prompt Tokens: 2592
Evaluation Time: 18.117003679275513 seconds
Answer Exact Match: True
Completion Tokens: 453, Prompt Tokens: 2548
Evaluation Time: 22.238331079483032 seconds
Answer Exact Match: False
Completion Tokens: 323, Prompt Tokens: 2784
Evaluation Time: 12.098395347595215 seconds
Answer Exact Match: True
Completion Tokens: 309, Prompt Tokens: 1794
Evaluation Time: 14.098065376281738 seconds
Answer Exact Match: False
Completion Tokens: 556, Prompt Tokens: 4652
Evaluation Time: 26.992276430130005 seconds
Answer Exact Match: True
Completion Tokens: 285, Prompt Tokens: 1439
Evaluation Time: 10.50859

 80%|████████  | 4/5 [4:49:10<1:07:31, 4051.05s/it]

Answer Exact Match: False
Completion Tokens: 295, Prompt Tokens: 1663
Evaluation Time: 12.497862815856934 seconds
Answer Exact Match: True
Completion Tokens: 289, Prompt Tokens: 1630
Evaluation Time: 11.162587404251099 seconds
Answer Exact Match: True
Completion Tokens: 369, Prompt Tokens: 1334
Evaluation Time: 25.88197135925293 seconds
Answer Exact Match: True
Completion Tokens: 347, Prompt Tokens: 2013
Evaluation Time: 18.44684624671936 seconds
Answer Exact Match: True
Completion Tokens: 425, Prompt Tokens: 1630
Evaluation Time: 16.774128675460815 seconds
Answer Exact Match: False
Completion Tokens: 333, Prompt Tokens: 2333
Evaluation Time: 17.529049396514893 seconds
Answer Exact Match: True
Completion Tokens: 275, Prompt Tokens: 1908
Evaluation Time: 13.742520093917847 seconds
Answer Exact Match: False
Completion Tokens: 455, Prompt Tokens: 1557
Evaluation Time: 20.35036015510559 seconds
Answer Exact Match: True
Completion Tokens: 291, Prompt Tokens: 1439
Evaluation Time: 13.0192708

100%|██████████| 5/5 [5:58:49<00:00, 4305.93s/it]  

Answer Exact Match: False
Completion Tokens: 344, Prompt Tokens: 1756
Evaluation Time: 37.94809603691101 seconds





In [17]:
# dump the history to a file
# (`gpt_4.history` is written as JSON; the former bare `gpt_4.history`
# expression was dropped because a non-final expression is never displayed.)
import json

with open('gpt_4_0409_format_instr_3hops3passagesperhop_history.json', 'w') as f:
    json.dump(gpt_4.history, f)

In [15]:
# Print recent LM prompts and completions for manual inspection (50 is the
# number of history entries requested).
gpt_4.inspect_history(50)




Write a simple search query that will help answer a complex question.

---

Follow the following format.

Context: may contain relevant facts

Question: ${question}

Reasoning: Let's think step by step in order to ${produce the query}. We ...

Query: ${query}

---

Context: N/A

Question: Andrew Form produced which 2013 dystopian horror film?

Reasoning: Let's think step by step in order to[32m produce the query. We need to find the name of a dystopian horror film produced by Andrew Form in 2013. To do this, we can use keywords such as "Andrew Form," "2013," and "dystopian horror film" to narrow down the search results.

Query: "Andrew Form 2013 dystopian horror film"[0m





Write a simple search query that will help answer a complex question.

---

Follow the following format.

Context: may contain relevant facts

Question: ${question}

Reasoning: Let's think step by step in order to ${produce the query}. We ...

Query: ${query}

---

Context:
[1] «The Purge | The Purge is a 201

'\n\n\nWrite a simple search query that will help answer a complex question.\n\n---\n\nFollow the following format.\n\nContext: may contain relevant facts\n\nQuestion: ${question}\n\nReasoning: Let\'s think step by step in order to ${produce the query}. We ...\n\nQuery: ${query}\n\n---\n\nContext: N/A\n\nQuestion: Andrew Form produced which 2013 dystopian horror film?\n\nReasoning: Let\'s think step by step in order to\x1b[32m produce the query. We need to find the name of a dystopian horror film produced by Andrew Form in 2013. To do this, we can use keywords such as "Andrew Form," "2013," and "dystopian horror film" to narrow down the search results.\n\nQuery: "Andrew Form 2013 dystopian horror film"\x1b[0m\n\n\n\n\n\nWrite a simple search query that will help answer a complex question.\n\n---\n\nFollow the following format.\n\nContext: may contain relevant facts\n\nQuestion: ${question}\n\nReasoning: Let\'s think step by step in order to ${produce the query}. We ...\n\nQuery: ${quer

### Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Extracting values for plotting
# Each entry of `program_evals` stores its results per run in the dicts
# 'eval_costs' and 'eval_accuracies' (keyed 'run1', ...). The values are
# averaged over the runs; with a single run this is just that run's value.
def _mean_over_runs(per_run_values):
    """Return the mean of a {run_name: value} dict, or NaN if it is empty."""
    values = list(per_run_values.values())
    return sum(values) / len(values) if values else float('nan')

x = [_mean_over_runs(program['eval_costs']) for program in program_evals]
y = [_mean_over_runs(program['eval_accuracies']) for program in program_evals]
# num_demos = [trial.params['num_demos_for_predictor_prog'] for trial in programs]  # Number of demos
joint_optimized = ["Joint optimization"] * len(program_evals)

# add one manual point to the plot
x.append(0.607312)
y.append(0.325)
joint_optimized.append("DSPy optimization")

# Function to determine the Pareto frontier
def identify_pareto(scores):
    """Flag which (cost, accuracy) points lie on the Pareto frontier.

    Args:
        scores: sequence of `(cost, accuracy)` pairs; lower cost and higher
            accuracy are better.

    Returns:
        A list of booleans parallel to `scores`: True when no other point
        dominates the point. A point is dominated when another point is no
        more expensive and no less accurate, and strictly better in at least
        one of the two. Exact duplicates do not dominate each other.
    """
    pareto_front = [True] * len(scores)
    for i, (cost_i, acc_i) in enumerate(scores):
        for j, (cost_j, acc_j) in enumerate(scores):
            if i == j:
                continue
            no_worse = cost_j <= cost_i and acc_j >= acc_i
            strictly_better = cost_j < cost_i or acc_j > acc_i
            if no_worse and strictly_better:
                pareto_front[i] = False
                break  # one dominating point is enough
    return pareto_front

# Determine which programs are on the Pareto frontier
pareto_status = identify_pareto(list(zip(x, y)))
pareto_points = [(x[i], y[i]) for i in range(len(x)) if pareto_status[i]]

# Sort Pareto points by x (cost) so that a frontier line would be drawn from
# left to right
pareto_points_sorted = sorted(pareto_points, key=lambda point: point[0])
pareto_x = [point[0] for point in pareto_points_sorted]
pareto_y = [point[1] for point in pareto_points_sorted]

# Creating the scatter plot with a color map based on 'joint_optimized'
plt.figure(figsize=(8, 5))
# NOTE(review): `joint_optimized` appears to hold label strings, in which case
# `count(False)` is 0 and the seaborn branch below is always taken -- confirm.
if joint_optimized.count(False) == len(joint_optimized):
    scatter = plt.scatter(x, y, c='blue', alpha=.7)
else:
    # One colour per distinct label; the labels also populate the legend.
    scatter_plot = sns.scatterplot(
    x=x, 
    y=y, 
    hue=joint_optimized, 
    palette=sns.color_palette("Dark2")[:2], 
    marker='o', 
    s=100
    )
    # add legend explaining colors
# The frontier line is currently disabled, so pareto_x / pareto_y are computed
# but not drawn:
# plt.plot(pareto_x, pareto_y, color='red', linestyle='--', marker='o', label='Pareto Frontier')  # Plot Pareto frontier

# NOTE(review): fixed axis limits hide any point with cost > 0.7 or
# accuracy > 0.6 -- confirm that the measured values fall inside this window.
plt.ylim(0,.6)
plt.xlim(0,.7)

plt.ylabel('accuracy')
plt.xlabel('cost (USD) (Variable cost incurred during evaluation)')
plt.title('Pareto Frontier of Programs (HotpotQA)')
plt.grid(False)

plt.legend(loc="lower right")


plt.show()
