In [1]:
import mlflow

# Point MLflow at the tracking server on localhost:5000 and select the "DSPy" experiment.
mlflow.set_tracking_uri(uri="http://localhost:5000")
mlflow.set_experiment(experiment_name="DSPy")

<Experiment: artifact_location='mlflow-artifacts:/374362034103955121', creation_time=1741686562632, experiment_id='374362034103955121', last_update_time=1741686562632, lifecycle_stage='active', name='DSPy', tags={}>

In [2]:
# Turn on MLflow's autologging integration for DSPy for the rest of the session.
mlflow.dspy.autolog()

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
from dotenv import load_dotenv

# Load variables from a .env file into the process environment.
# NOTE(review): presumably this supplies the provider API keys for the LMs configured later — confirm.
load_dotenv()


True

In [23]:
import dspy

# Two language-model handles, both sampled at temperature 0.7.
llama3b = dspy.LM(
    model="fireworks_ai/accounts/fireworks/models/llama-v3p2-3b-instruct",
    temperature=0.7,
)
gpt4o = dspy.LM(model="openai/gpt-4o", temperature=0.7)

# The Llama 3B handle becomes the default LM for DSPy.
dspy.configure(lm=llama3b)

In [5]:
import random
from dspy.datasets import DataLoader

# Columns to load from the dataset; only `claim` is marked as an input key.
kwargs = {
    "fields": ("claim", "supporting_facts", "hpqa_id", "num_hops"),
    "input_keys": ("claim",),
}
hover = DataLoader().from_huggingface(
    dataset_name="hover-nlp/hover",
    split="train",
    trust_remote_code=True,
    **kwargs,
)

hpqa_ids = set()


def _three_hop_examples(rows, seen_ids):
    """Keep the first 3-hop row per `hpqa_id` and convert it to a claim -> titles example.

    `seen_ids` is updated in place with every `hpqa_id` that is kept.
    """
    examples = []
    for row in rows:
        if row["num_hops"] != 3:
            continue
        if row["hpqa_id"] in seen_ids:
            continue
        seen_ids.add(row["hpqa_id"])

        # Gold titles are the distinct "key" values of the supporting facts.
        titles = list(set(fact["key"] for fact in row.supporting_facts))
        examples.append(dspy.Example(claim=row.claim, titles=titles).with_inputs("claim"))
    return examples


hover = _three_hop_examples(hover, hpqa_ids)

# Fixed seed so the shuffle (and therefore the splits) is repeatable.
random.Random(0).shuffle(hover)
trainset = hover[:100]
devset = hover[100:200]
testset = hover[650:]  # indices 200-649 are not part of any of these splits

Downloading builder script: 100%|██████████| 4.48k/4.48k [00:00<00:00, 15.9kB/s]
Downloading readme: 100%|██████████| 4.13k/4.13k [00:00<00:00, 17.7kB/s]
Downloading data: 9.21MB [00:00, 77.4MB/s]                   
Downloading data: 2.15MB [00:00, 44.9MB/s]                  
Downloading data: 899kB [00:00, 35.5MB/s]                   
Generating train split: 100%|██████████| 18171/18171 [00:00<00:00, 36946.44 examples/s]
Generating validation split: 100%|██████████| 4000/4000 [00:00<00:00, 28205.96 examples/s]
Generating test split: 100%|██████████| 4000/4000 [00:00<00:00, 68392.45 examples/s]


In [6]:
# Peek at the first training example: its claim and the gold titles stored with it.
example = trainset[0]

print("Claim:", example.claim)
print("Pages that must be retrieved:", example.titles)

Claim: This director is known for his work on Miss Potter. The Academy of Motion Picture Arts and Sciences presents the award in which he was nominated for his work in "Babe".
Pages that must be retrieved: ['Miss Potter', 'Chris Noonan', 'Academy Award for Best Director']


In [7]:
# Cache of passage text keyed by page title; filled in as a side effect of `search`.
DOCS = {}

def search(query: str, k: int) -> list[str]:
    """Retrieve the top-`k` passages for `query` from the ColBERTv2 endpoint.

    Each passage of the form "title | text" is also recorded in `DOCS`
    (title -> text) so later lookups by title can skip the network call.
    Passages without the " | " separator are returned but not cached.

    Args:
        query: Free-text search query.
        k: Number of passages to request.

    Returns:
        The retrieved passage strings, in the order the retriever returned them.
    """
    results = dspy.ColBERTv2(url='http://20.102.90.50:2017/wiki17_abstracts')(query, k=k)
    results = [x['text'] for x in results]

    for result in results:
        # partition() never raises; a missing separator simply yields an empty `separator`,
        # whereas unpacking split(" | ", 1) would fail with ValueError on such a passage.
        title, separator, text = result.partition(" | ")
        if separator:
            DOCS[title] = text

    return results

In [8]:
def search_wikipedia(query: str) -> list[str]:
    """Returns top-5 results and then the titles of the top-5 to top-30 results."""
    # Docstring kept verbatim: this function is passed to dspy.ReAct as a tool
    # (presumably the docstring is surfaced to the LM as the tool description — confirm).

    retrieved = search(query, 30)
    head = retrieved[:5]
    tail = retrieved[5:30]

    # Only the title part (before the first " | ") of the remaining passages is reported.
    tail_titles = []
    for passage in tail:
        tail_titles.append("`" + passage.split(' | ')[0] + "`")

    summary = f"Other retrieved pages have titles: {', '.join(tail_titles)}."
    return head + [summary]

def lookup_wikipedia(title: str) -> str:
    """Returns the text of the Wikipedia page, if it exists."""
    # Docstring kept verbatim: this function is passed to dspy.ReAct as a tool
    # (presumably the docstring is surfaced to the LM as the tool description — confirm).

    # Fast path: text already cached in DOCS.
    if title in DOCS:
        return DOCS[title]

    # Otherwise search by title and accept only a passage whose title part matches exactly.
    prefix = title + " | "
    for passage in search(title, 10):
        if passage.startswith(prefix):
            # Strip the "title | " prefix so this path returns the same thing as the
            # cached path (text only); previously the first lookup of a title returned
            # "title | text" while a repeated lookup returned just the text.
            return passage[len(prefix):]

    return f"No Wikipedia page found for title: {title}"

In [9]:
# Task description attached to the signature.
instructions = "Find all Wikipedia titles relevant to verifying (or refuting) the claim."
# One input field (`claim`) and one output field (`titles`, typed as a list of strings).
signature = dspy.Signature("claim -> titles: list[str]", instructions)
# ReAct agent given the two Wikipedia tools; `max_iters=20` presumably caps the number of agent steps.
react = dspy.ReAct(signature, tools=[search_wikipedia, lookup_wikipedia], max_iters=20)

In [19]:
# Smoke-test the agent on a single claim and show at most three of the returned titles.
react(claim="David Gregory was born in 1625.").titles[:3]

[]

In [20]:
# Print the most recent LM interaction (n=1) to see the prompt the agent actually sent.
dspy.inspect_history(n=1)





[34m[2025-03-12T18:38:23.559367][0m

[31mSystem message:[0m

Your input fields are:
1. `claim` (str)
2. `trajectory` (str)

Your output fields are:
1. `reasoning` (str)
2. `titles` (list[str])

All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## claim ## ]]
{claim}

[[ ## trajectory ## ]]
{trajectory}

[[ ## reasoning ## ]]
{reasoning}

[[ ## titles ## ]]
{titles}        # note: the value you produce must adhere to the JSON schema: {"type": "array", "items": {"type": "string"}}

[[ ## completed ## ]]

In adhering to this structure, your objective is: 
        Find all Wikipedia titles relevant to verifying (or refuting) the claim.


[31mUser message:[0m

[[ ## claim ## ]]
David Gregory was born in 1625.

[[ ## trajectory ## ]]
[[ ## thought_0 ## ]]
Searching for information about David Gregory's birth.

[[ ## tool_name_0 ## ]]
search_wikipedia

[[ ## tool_args_0 ## ]]
{"query": {"type": "string"}}

[[ ## observation_0 ## ]]


In [21]:
def top5_recall(example, pred, trace=None):
    """Fraction of the gold titles that appear among the first five predicted titles.

    Args:
        example: Object with a `titles` attribute holding the gold titles.
        pred: Object with a `titles` attribute holding the predicted titles.
        trace: When not None (bootstrapping for optimization), the metric is
            collapsed to a boolean: True if and only if recall is perfect.

    Returns:
        A float recall when `trace` is None, otherwise a bool. An example with no
        gold titles scores 0.0 (False when tracing) instead of dividing by zero.
    """
    gold_titles = example.titles
    if not gold_titles:
        # Nothing to retrieve: recall is undefined, so score it as a miss rather than crash.
        return False if trace is not None else 0.0

    # Slice once instead of once per gold title; a missing (None) prediction counts as empty.
    top5 = (pred.titles or [])[:5]
    recall = sum(title in top5 for title in gold_titles) / len(gold_titles)

    # If we're "bootstrapping" for optimization, return True if and only if the recall is perfect.
    if trace is not None:
        return recall >= 1.0

    # If we're just doing inference, just measure the recall.
    return recall

# Evaluator over the dev set, scored with top5_recall.
evaluate = dspy.Evaluate(
    devset=devset,
    metric=top5_recall,
    num_threads=16,
    display_progress=True,
    display_table=5,
)

In [22]:
def safe_react(claim: str):
    """Run the `react` agent on `claim`, falling back to an empty prediction on failure.

    Any exception raised by the agent is logged as a warning (so failures are
    visible instead of silently turning into zero scores) and replaced by a
    prediction with no titles, which keeps a batch evaluation running.

    Args:
        claim: The claim to pass to the agent.

    Returns:
        The agent's prediction, or `dspy.Prediction(titles=[])` if the agent raised.
    """
    import logging  # function-scope import: the notebook has no single import cell to extend

    try:
        return react(claim=claim)
    except Exception as exc:
        # Deliberately best-effort: report the failure, then score it as "nothing retrieved".
        logging.getLogger(__name__).warning(
            "react failed for claim %r: %s: %s", claim, type(exc).__name__, exc
        )
        return dspy.Prediction(titles=[])

# Run the evaluator on the exception-tolerant wrapper rather than on `react` directly.
evaluate(safe_react)

Average Metric: 0.00 / 1 (0.0%):   1%|          | 1/100 [00:00<00:51,  1.91it/s]