In [None]:
import dspy
import os

# Per-token prices used to convert token counts into a cost estimate.
# NOTE(review): presumably USD list prices for gpt-3.5-turbo-0125 — confirm against current pricing.
PRICE_PER_PROMPT_TOKEN = (0.5/1000000)
PRICE_PER_COMPLETION_TOKEN = (1.5/1000000)

# Read the OpenAI key from the environment rather than hardcoding it in the notebook.
# The original 'API_KEY' placeholder is kept as the fallback when OPENAI_API_KEY is unset.
gpt_35 = dspy.OpenAI(
    model='gpt-3.5-turbo-0125',
    api_key=os.environ.get('OPENAI_API_KEY', 'API_KEY'),
    max_tokens=300,
)

# Retrieval model backed by a hosted ColBERTv2 index (wiki17_abstracts endpoint).
colbertv2_wiki17_abstracts = dspy.ColBERTv2(url='http://20.102.90.50:2017/wiki17_abstracts')

# Register the language model and the retrieval model as the DSPy defaults.
dspy.settings.configure(lm=gpt_35, rm=colbertv2_wiki17_abstracts)

In [None]:
from dspy.datasets import HotPotQA

# Load HotPotQA with fixed seeds: 50 training examples, 250 dev examples, no test split.
dataset = HotPotQA(
    train_seed=10,
    train_size=50,
    eval_seed=2000,
    dev_size=250,
    test_size=0,
)

# Mark 'question' as the only input field; all remaining fields act as labels and/or metadata.
trainset = [example.with_inputs('question') for example in dataset.train]
# The first 50 dev examples are used as the validation split ...
devset = [example.with_inputs('question') for example in dataset.dev[:50]]
# ... and the rest of the dev examples are used as the held-out evaluation split.
testset = [example.with_inputs('question') for example in dataset.dev[50:]]

# Display the split sizes as the cell output.
len(trainset), len(devset), len(testset)

In [None]:
# DSPy signature for the final answering step: (context, question) -> answer.
# NOTE(review): the class docstring and the `desc` strings are runtime text
# (presumably rendered into the LM prompt by DSPy — confirm), so keep them
# byte-identical unless a prompt change is intended.
class GenerateAnswer(dspy.Signature):
    """Answer questions with short factoid answers."""

    # Inputs: the passages gathered so far and the question being answered.
    context = dspy.InputField(desc="may contain relevant facts")
    question = dspy.InputField()
    # Output: the answer text produced by the LM.
    answer = dspy.OutputField(desc="often between 1 and 5 words")

In [None]:
# DSPy signature for one retrieval hop: (context, question) -> query.
# NOTE(review): the class docstring and the `desc` strings are runtime text
# (presumably rendered into the LM prompt by DSPy — confirm), so keep them
# byte-identical unless a prompt change is intended.
class GenerateSearchQuery(dspy.Signature):
    """Write a simple search query that will help answer a complex question."""

    # Inputs: the passages gathered so far and the original question.
    context = dspy.InputField(desc="may contain relevant facts")
    question = dspy.InputField()
    # Output: the search query to send to the retriever on this hop.
    query = dspy.OutputField()

In [None]:
from dsp.utils import deduplicate

class SimplifiedBaleen(dspy.Module):
    """Multi-hop retrieve-then-answer program.

    On every hop a search query is generated from the question and the
    context gathered so far, passages are retrieved for that query and
    merged (deduplicated) into the context. After the last hop the
    accumulated context is used to answer the question.
    """

    def __init__(self, passages_per_hop=3, max_hops=2):
        """Create one query generator per hop, a retriever and an answer generator.

        Args:
            passages_per_hop: number of passages requested from the retriever per hop.
            max_hops: number of query/retrieve rounds run by `forward`.
        """
        super().__init__()

        # A separate ChainOfThought predictor for each hop.
        hop_query_generators = []
        for _ in range(max_hops):
            hop_query_generators.append(dspy.ChainOfThought(GenerateSearchQuery))
        self.generate_query = hop_query_generators
        self.retrieve = dspy.Retrieve(k=passages_per_hop)
        self.generate_answer = dspy.ChainOfThought(GenerateAnswer)
        self.max_hops = max_hops

    def forward(self, question):
        """Answer `question`; returns a dspy.Prediction with `context` and `answer`."""
        gathered = []

        for hop in range(self.max_hops):
            # Ask this hop's predictor for a query given what has been collected so far.
            query_generator = self.generate_query[hop]
            search = query_generator(context=gathered, question=question)
            retrieved = self.retrieve(search.query)
            # Merge the new passages into the context, dropping repeats.
            gathered = deduplicate(gathered + retrieved.passages)

        final = self.generate_answer(context=gathered, question=question)
        return dspy.Prediction(context=gathered, answer=final.answer)

In [None]:
def validate_context_and_answer_and_hops(example, pred, trace=None):
    """Metric that checks the answer, the retrieved context and the hop queries.

    Returns False when any of the following holds, True otherwise:
      * the predicted answer is not an exact match for the example's answer;
      * `dspy.evaluate.answer_passage_match` fails for the prediction;
      * the question or any traced query is longer than 100 characters;
      * a hop at index >= 2 matches an earlier hop according to
        `dspy.evaluate.answer_exact_match_str(..., frac=0.8)`.

    The hop list is the question followed by every `query` output found in
    `trace`; without a trace it contains the question only.
    """
    answer_ok = dspy.evaluate.answer_exact_match(example, pred)
    if not answer_ok:
        return False

    passage_ok = dspy.evaluate.answer_passage_match(example, pred)
    if not passage_ok:
        return False

    hops = [example.question]
    if trace is not None:
        # Each trace entry ends with the step's outputs; keep only those that produced a query.
        hops = hops + [outputs.query for *_, outputs in trace if 'query' in outputs]

    longest_hop = max(len(hop) for hop in hops)
    if longest_hop > 100:
        return False

    # Compare each later hop against everything that came before it.
    for idx in range(2, len(hops)):
        if dspy.evaluate.answer_exact_match_str(hops[idx], hops[:idx], frac=0.8):
            return False

    return True

In [None]:
def gold_passages_retrieved(example, pred, trace=None):
    """Return True when every gold title of `example` appears among the titles in `pred.context`.

    A passage's title is the text before its first ' | ' separator; titles on
    both sides are passed through `dspy.evaluate.normalize_text` before the
    comparison. `trace` is accepted but unused.
    """
    normalize = dspy.evaluate.normalize_text

    passage_titles = [passage.split(' | ')[0] for passage in pred.context]
    wanted = {normalize(title) for title in example['gold_titles']}
    retrieved = {normalize(title) for title in passage_titles}

    return wanted <= retrieved

In [None]:
from dspy.evaluate.evaluate import Evaluate

# Build the `evaluate_on_hotpotqa` evaluator over the held-out `testset`
# (single-threaded, with a progress bar and display_table=5).
evaluate_on_hotpotqa = Evaluate(
    devset=testset,
    num_threads=1,
    display_progress=True,
    display_table=5,
)

In [None]:
# Sanity check: the LM's token counters must both be zero before compilation starts,
# so that everything counted afterwards can be attributed to the steps below.
assert gpt_35.total_completion_tokens == 0
assert gpt_35.total_prompt_tokens == 0

## Compilation

In [None]:
from dspy.teleprompt import OptunaJointCostOptimizer, BootstrapFewShot

# The same metric is used both for bootstrapping demos and for the optimisation objective.
teleprompter = OptunaJointCostOptimizer(
    bootstrap_metric=validate_context_and_answer_and_hops,
    optim_metric=validate_context_and_answer_and_hops,
    max_bootstrapped_demos=8,
)

# Student and teacher are separate, identically configured programs (2 hops, 2 passages per hop).
# Later cells index the result as compiled_baleen[0] and compiled_baleen[1].
compiled_baleen = teleprompter.compile(
    SimplifiedBaleen(passages_per_hop=2, max_hops=2),
    teacher=SimplifiedBaleen(passages_per_hop=2, max_hops=2),
    trainset=trainset,
    valset=devset,
)

In [None]:
# Snapshot the LM's token counters right after compilation and print the resulting cost.
compile_prompt_tokens = gpt_35.total_prompt_tokens
compile_completion_tokens = gpt_35.total_completion_tokens
print(
    compile_prompt_tokens * PRICE_PER_PROMPT_TOKEN
    + compile_completion_tokens * PRICE_PER_COMPLETION_TOKEN
)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# compiled_baleen[0] is iterated below as a collection of trial-like objects exposing `.values`.
programs = compiled_baleen[0]

# Extracting values for plotting
x = [trial.values[1] for trial in programs]  # Cost: drawn on the x-axis ('Cost of Demos'); the evaluation cell reads values[1] as demos_prompt_tokens
y = [trial.values[0] for trial in programs]  # Accuracy: drawn on the y-axis; the evaluation cell reads values[0] as trainset_acc
# num_demos = [trial.params['num_demos_for_predictor_prog'] for trial in programs]  # Number of demos
num_demos = [1 for trial in programs]  # Placeholder: every program is assigned 1, so the plot below uses a single colour

# Function to determine the Pareto frontier
def identify_pareto(scores):
    """Flag which points lie on the Pareto front.

    Each entry of `scores` is indexed as ``(first, second)``: the first
    component is minimised and the second is maximised. A point is dominated
    when some other point is at least as good in both components and strictly
    better in at least one, which includes a point with the same first
    component and a strictly higher second component.

    Args:
        scores: sequence of indexable items with at least two components.

    Returns:
        A list of booleans, one per entry of `scores`; True where the point is
        not dominated. Exact duplicates do not dominate each other.
    """
    pareto_front = [True] * len(scores)
    for i in range(len(scores)):
        cost_i, quality_i = scores[i][0], scores[i][1]
        for j in range(len(scores)):
            if i == j:
                continue
            cost_j, quality_j = scores[j][0], scores[j][1]
            no_worse = cost_j <= cost_i and quality_j >= quality_i
            strictly_better = cost_j < cost_i or quality_j > quality_i
            if no_worse and strictly_better:
                pareto_front[i] = False
                # One dominating point is enough; no need to scan the rest.
                break
    return pareto_front

# Determine which programs are on the Pareto frontier
pareto_status = identify_pareto(list(zip(x, y)))
pareto_points = [point for point, on_front in zip(zip(x, y), pareto_status) if on_front]

# Sort Pareto points by x (the cost axis) so the frontier line is drawn left to right
pareto_points_sorted = sorted(pareto_points, key=lambda point: point[0])
pareto_x = [point[0] for point in pareto_points_sorted]
pareto_y = [point[1] for point in pareto_points_sorted]

# Scatter all programs; colour by 'num_demos' only when the values actually differ from 1
plt.figure(figsize=(8, 5))
all_single_demo = num_demos.count(1) == len(num_demos)
if all_single_demo:
    scatter = plt.scatter(x, y, c='blue', alpha=.7)  # All programs have 1 demo
else:
    scatter = plt.scatter(x, y, c=num_demos, cmap='Dark2', alpha=.7)  # Use the colormap
plt.plot(pareto_x, pareto_y, color='red', linestyle='--', marker='o', label='Pareto Frontier')  # Plot Pareto frontier

plt.ylabel('Accuracy')
plt.xlabel('Cost of Demos')
plt.title('Pareto Frontier of Programs on Train Set')
plt.grid(True)

# A colour bar is only meaningful when the points are colour-mapped by 'num_demos';
# for the single-colour scatter it would not correspond to any plotted data.
if not all_single_demo:
    demo_ticks = list(range(1, max(num_demos) + 1))  # one tick per demo count
    cbar = plt.colorbar(scatter, ticks=demo_ticks)
    cbar.set_label('Number of Demos')
    cbar.ax.set_yticklabels(demo_ticks)
plt.legend()

plt.show()


In [None]:
# Gather the parameter dict of every program in compiled_baleen[0]
import json

programs = compiled_baleen[0]
programs_params = []
for trial in programs:
    programs_params.append(trial.params)

# Write the collected parameters out as JSON
with open('gpt_3.5_0125_compiled_joint_optim_2hops2passagesperhop_retrieval_programs_params.json', 'w') as f:
    json.dump(programs_params, f)

## Evaluation

In [None]:
import time
from tqdm import tqdm

program_evals = []
# Evaluate every program in compiled_baleen[1] on `testset` (the dev[50:] slice) across 5 runs.
for program in tqdm(compiled_baleen[1]):
    # Per-program record: compile-time cost, optimisation-time values, and per-run results.
    program_results = {
        'compile_completion_tokens': compile_completion_tokens,
        'compile_prompt_tokens': compile_prompt_tokens,
        'compile_cost': compile_prompt_tokens * PRICE_PER_PROMPT_TOKEN + compile_completion_tokens * PRICE_PER_COMPLETION_TOKEN,
        'demos_prompt_tokens': program.values[1],
        'trainset_acc': program.values[0]/100,
        'eval_accuracies': {},
        'eval_gold_passages_retrieved_accuracies': {},
        'eval_costs': {},
        'per_task_results': {}
    }

    print(f"demos_prompt_tokens: {program.values[1]}, trainset_acc: {program.values[0]/100} show_guidelines {program.params['formatting_instructions_added']}")

    # Mirror the program's 'formatting_instructions_added' parameter into the global setting.
    dspy.settings.show_guidelines = bool(program.params['formatting_instructions_added'] == 1)
    print(f"show_guidelines: {dspy.settings.show_guidelines}")

    for run in tqdm(range(5)):
        run_key = f'run{run}'
        run_results = []
        program_results['per_task_results'][run_key] = run_results

        # Evaluate each task in the test set
        for example in testset:
            start_time = time.time()

            # Snapshot the token counters so this example's usage can be computed as a delta
            completion_tokens_pre = gpt_35.total_completion_tokens
            prompt_tokens_pre = gpt_35.total_prompt_tokens

            # Reset per example: if the program fails, the record must not reuse the
            # previous example's prediction (or hit a NameError on the very first example).
            pred_answer = None
            try:
                pred = program.user_attrs['program'](example.question)
                pred_answer = pred.answer
                # Evaluate overall performance using answer exact match metric
                exact_match = dspy.evaluate.answer_exact_match(example, pred)
                gold_passages = gold_passages_retrieved(example, pred)
                print(f"Answer Exact Match: {exact_match}")
                print(f"Gold Passages Retrieved: {gold_passages}")
            except Exception as e:
                # Best effort: a failing example is reported and counted as incorrect.
                print(f"Error: {e}")
                exact_match = False
                gold_passages = False

            # Tokens consumed by this example (also counted when the example failed)
            completion_tokens_post = gpt_35.total_completion_tokens - completion_tokens_pre
            prompt_tokens_post = gpt_35.total_prompt_tokens - prompt_tokens_pre
            print(f"Completion Tokens: {completion_tokens_post}, Prompt Tokens: {prompt_tokens_post}")

            # Log evaluation time
            eval_time = time.time() - start_time
            print(f"Evaluation Time: {eval_time} seconds")

            # NOTE(review): the 'groud_truth' key is misspelled but kept as-is, since
            # consumers of the JSON output may already depend on it.
            run_results.append({
                'question': example.question,
                'pred': pred_answer,
                'groud_truth': example.answer,
                'exact_match': exact_match,
                'gold_passages_retrieved': gold_passages,
                'completion_tokens': completion_tokens_post,
                'prompt_tokens': prompt_tokens_post,
                'cost': prompt_tokens_post * PRICE_PER_PROMPT_TOKEN + completion_tokens_post * PRICE_PER_COMPLETION_TOKEN,
                'eval_time': eval_time
            })

        # Calculate the overall evaluation accuracy
        program_results['eval_accuracies'][run_key] = sum(task['exact_match'] for task in run_results)/len(run_results)

        # Calculate the overall gold passages retrieved accuracy
        program_results['eval_gold_passages_retrieved_accuracies'][run_key] = sum(task['gold_passages_retrieved'] for task in run_results)/len(run_results)

        # Calculate the overall cost
        program_results['eval_costs'][run_key] = sum(task['cost'] for task in run_results)

    print(program_results)
    program_evals.append(program_results)

# NOTE(review): the reason for this pause is not visible here; kept from the original.
time.sleep(3)
# write the results to a file
import json
with open('gpt_3.5_0125_compiled_joint_optim_2hops2passagesperhop_retrieval_eval_results.json', 'w') as f:
    json.dump(program_evals, f)

In [None]:
# Save the LM call history to a file.
# The programs in this notebook are built with max_hops=2 and passages_per_hop=2, so the
# file name carries the same '2hops2passagesperhop' tag as the params and eval-results files
# (it previously said '3hops3passagesperhop').
import json
with open('gpt_3.5_0125_compiled_joint_optim_2hops2passagesperhop_retrieval_history.json', 'w') as f:
    json.dump(gpt_35.history, f)