In [1]:
!pip install -q langchain dspy-ai crewai google-search-results numexpr


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
%reload_ext autoreload
%autoreload 2

In [3]:
from langchain.agents import AgentExecutor, load_tools
from langchain.chat_models import ChatOpenAI
from langchain_core.tools import Tool
from scripts.modified_calculator import LLMMathChain

MODEL_ID = "gpt-3.5-turbo"

llm = ChatOpenAI(model=MODEL_ID, temperature=0)

langchain_serpapi_tool = load_tools(["serpapi"], llm=llm)[0]
# Rename tools in the same format used by other tools

  warn_deprecated(


# DSPy

In [4]:
import dspy

gpt4_llm = dspy.OpenAI(model=MODEL_ID)
dspy.settings.configure(lm=gpt4_llm)

### Define tools

In [5]:
langchain_math_tool = Tool(
    name="Calculator",
    description="Useful for when you need to answer questions about math.",
    func=LLMMathChain.from_llm(llm=llm).run,
    coroutine=LLMMathChain.from_llm(llm=llm).arun,
)


class DSPyMathTool:
    name = "calculator"
    input_variable = "math formula"
    desc = "Useful for when you need to answer questions about math."

    def __init__(self, k=3):
        pass

    def __call__(self, *args, **kwargs):
        return langchain_math_tool.run(*args, **kwargs)


class DSPySerpAPITool:
    name = "search"
    input_variable = "search input"
    desc = langchain_serpapi_tool.description

    def __init__(self, k=3):
        pass

    def __call__(self, *args, **kwargs):
        return langchain_serpapi_tool.run(*args, **kwargs)

In [6]:
import datetime
import dspy


class GiveTime:
    name = "GiveTime"
    input_variable = "empty"
    desc = "takes an empty string and returns the current local time"

    def __init__(self, k=3):
        pass

    def __call__(self, *args, **kwargs):
        return datetime.datetime.now().strftime("%H:%M:%S")

In [7]:
math_tool = DSPyMathTool()
serpapi_tool = DSPySerpAPITool()
agent_dspy = dspy.ReAct("question -> answer", tools=[math_tool, serpapi_tool])

In [8]:
async def call_dspy(agent, question: str) -> str:
    response = {"output": agent(question=question).answer, "intermediate_steps": []}
    return response

In [9]:
await call_dspy(agent_dspy, "what is 2+2")

{'output': '4', 'intermediate_steps': []}

In [10]:
gpt4_llm.inspect_history()





You will be given `question` and you will respond with `answer`.

To do this, you will interleave Thought, Action, and Observation steps.

Thought can reason about the current situation, and Action can be the following types:

(1) calculator[math formula], which Useful for when you need to answer questions about math.
(2) search[search input], which A search engine. Useful for when you need to answer questions about current events. Input should be a search query.
(3) Finish[answer], which returns the final `answer` and finishes the task

---

Follow the following format.

Question: ${question}

Thought 1: next steps to take based on last observation

Action 1: always either calculator[math formula] or search[search input] or, when done, Finish[answer]

Observation 1: observations based on action

Thought 2: next steps to take based on last observation

Action 2: always either calculator[math formula] or search[search input] or, when done, Finish[answer]

Observation 2: observations

### Compile DSPy

In [10]:
import datasets

math_dataset = datasets.load_dataset("gsm8k", "main")["train"]


def build_dspy_trainset(dataset):
    dspy_dataset = []
    for i in range(len(dataset)):
        dspy_dataset.append(
            dspy.Example(
                question=dataset["question"][i],
                answer=dataset["answer"][i],
            ).with_inputs("question")
        )
    return dspy_dataset


# from dspy.teleprompt import BootstrapFewShot

# teleprompter = BootstrapFewShot(metric=validate_with_llm_judge, max_rounds=5)
# compiled_rag = teleprompter.compile(
#     agent,
#     trainset=build_dspy_trainset(math_dataset[:20]),
#     valset=build_dspy_trainset(math_dataset[20:50]),
# )

# Benchmark different frameworks

In [11]:
math_dataset = (
    datasets.load_dataset("reasoning-machines/gsm-hard", "main")["train"]
    .shuffle(seed=42)
    .select(range(30))
)
math_dataset = math_dataset.rename_column("input", "question")
math_dataset = math_dataset.rename_column("target", "answer")

task_column = ["GSM8K-hard"] * len(math_dataset)
math_dataset = math_dataset.add_column("task", task_column).select_columns(
    ["question", "answer", "task"]
)
eval_dataset = math_dataset

  return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)


In [14]:
eval_chat_model = ChatOpenAI(model="gpt-4-1106-preview", temperature=0)
eval_model_name = "GPT4"

from scripts.run_agents import answer_questions

### DSPy

In [14]:
# DSPy
await answer_questions(
    eval_dataset,
    agent_dspy,
    "dspy_agent",
    output_folder="output_dspy2",
    agent_call_function=call_dspy,
)

  0%|          | 0/30 [00:00<?, ?it/s]

 13%|█▎        | 4/30 [01:00<06:38, 15.31s/it]

### Vanilla GPT

In [15]:
prompt = "Answer the question below. You can think out loud, but remind to ALWAYS use the exact characters `Final Answer:` when you provide a definitive answer. Question:"


async def call_vanilla_gpt(agent, question: str):
    output = agent.invoke(prompt + question).content
    if "Final Answer:" in output:
        output = output.split("Final Answer:")[1]
    response = {"output": output, "intermediate_steps": []}
    return response

In [16]:
llm_gpt = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)

await answer_questions(
    eval_dataset,
    llm_gpt,
    "gpt3.5_agent_notools",
    output_folder="output_dspy2",
    agent_call_function=call_vanilla_gpt,
)

  0%|          | 0/30 [00:00<?, ?it/s]

100%|██████████| 30/30 [00:43<00:00,  1.44s/it]


[{'agent_name': 'gpt3.5_agent_notools',
  'question': 'Charlie wants to sell beeswax candles.  For every pound of beeswax, he can make 10 tapered candles.  One pound of beeswax and the wicks cost $10.00 in supplies.   If he sells each candle for $2.00 each, what is his net profit if he makes and sells 1080379 candles?',
  'gt_answer': 1080379.0,
  'prediction': '` $1080379.',
  'intermediate_steps': [],
  'parsing_error': False,
  'iteration_limit_exceeded': False,
  'agent_error': None,
  'start_time': '2024-02-07 15:56:55',
  'end_time': '2024-02-07 15:56:57',
  'task': 'GSM8K-hard'},
 {'agent_name': 'gpt3.5_agent_notools',
  'question': 'Melissa is summoned to jury duty. She spends 6 hours a day for 3 days listening to a court case. If Melissa is paid $557654 per day but also has to pay $3 for parking each day, how much jury pay does she make per hour after expenses?',
  'gt_answer': 92941.8333333333,
  'prediction': '` Melissa makes $30980.61 per hour after expenses.',
  'intermedi

### LangChain

In [17]:
from scripts.agents import build_openai_agent

langchain_agent = build_openai_agent(model_id="gpt-3.5-turbo-1106")

await answer_questions(
    eval_dataset,
    langchain_agent,
    "gpt3.5_langchain2",
    output_folder="output_dspy2",
)

  warn_deprecated(
  0%|          | 0/30 [00:00<?, ?it/s]



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `calculator` with `10*1080379*2-(10.00+1080379)`


[0m[33;1m[1;3m20527191.0[0m

  3%|▎         | 1/30 [00:04<02:06,  4.37s/it]

[32;1m[1;3mCharlie's net profit, if he makes and sells 1,080,379 candles, would be $20,527,191.[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `calculator` with `(557654 - 3) / (6 * 3)`


[0m[33;1m[1;3m30980.61111111111[0m

  7%|▋         | 2/30 [00:09<02:14,  4.82s/it]

[32;1m[1;3mAfter deducting the parking expenses, Melissa makes approximately $30,980.61 per hour for her jury duty.[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `calculator` with `13 + (2 * 7843414) - (16 + 7843414)`


[0m[33;1m[1;3m7843411[0m

 10%|█         | 3/30 [00:18<02:59,  6.64s/it]

[32;1m[1;3mThe boots work out to be $7,843,411 more expensive on eBay compared to Amazon.[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `calculator` with `20*2`


[0m[33;1m[1;3m40[0m[32;1m[1;3m
Invoking: `calculator` with `125-40-(0.6*(20*2))`


[0m[33;1m[1;3m61.0[0m

 13%|█▎        | 4/30 [00:36<04:47, 11.07s/it]

[32;1m[1;3mJohn leaves the vet with $61 after paying for the vaccines and the heartworm check.[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `calculator` with `8 + 8 + 16 + 24`


[0m[33;1m[1;3m56[0m

 17%|█▋        | 5/30 [00:44<04:11, 10.05s/it]

[32;1m[1;3mThe total age of the four birds is 56 years.[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `calculator` with `(3*35)-8089991`


[0m[33;1m[1;3m-8089886[0m

 20%|██        | 6/30 [00:50<03:32,  8.87s/it]

[32;1m[1;3mIf Jackie hires the accountant, she will have $8,089,886 more than if she does the taxes herself.[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `calculator` with `(3000 - 900) * 0.10`


[0m[33;1m[1;3m210.0[0m

 23%|██▎       | 7/30 [00:59<03:20,  8.71s/it]

[32;1m[1;3mTom has to pay $210 per month for the bedroom set.[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m


 27%|██▋       | 8/30 [01:03<02:38,  7.19s/it]

[32;1m[1;3mTo calculate the total minutes used by Jason this month, we can use the following formula:

Total minutes used = (15 minutes/day * 30 days) + 300 extra minutes

Total minutes used = (15 * 30) + 300
Total minutes used = 450 + 300
Total minutes used = 750 minutes

To find out how many minutes Jason has left from his 1000-minute plan, we can subtract the total minutes used from the plan:

Minutes left = 1000 - 750
Minutes left = 250 minutes

So, Jason has 250 minutes left on his phone plan for this month.[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `calculator` with `3*15 + 5*22 + 6*18`


[0m[33;1m[1;3m263[0m

 30%|███       | 9/30 [01:08<02:18,  6.62s/it]

[32;1m[1;3mIt will take Andy a total of 263 minutes to string all 12 racquets.[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `calculator` with `7493436 - (3*16) - (3*6)`


[0m[33;1m[1;3m7493370[0m

 33%|███▎      | 10/30 [01:12<01:53,  5.70s/it]

[32;1m[1;3mTed spent a total of $7,493,370.[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `calculator` with `80*0.8*5*5128470`


[0m[33;1m[1;3m1641110400.0[0m

 37%|███▋      | 11/30 [01:16<01:42,  5.38s/it]

[32;1m[1;3mJohn makes $1,641,110,400 from selling the planks.[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `calculator` with `(40+40+16)/2`


[0m[33;1m[1;3m48.0[0m

 40%|████      | 12/30 [01:26<01:58,  6.60s/it]

[32;1m[1;3mThe average of Mia and Emma's ages is 48 years.[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `calculator` with `1534633*2+1534633+5`


[0m[33;1m[1;3m4603904[0m

 43%|████▎     | 13/30 [01:35<02:06,  7.47s/it]

[32;1m[1;3mThe boots cost $4,603,904.[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `calculator` with `12+8`


[0m[33;1m[1;3m20[0m

 47%|████▋     | 14/30 [01:40<01:48,  6.75s/it]

[32;1m[1;3mOut of the past thirty days, Christina had 12 good days and 8 bad days. Since the first eight days were good, the second eight days were bad, and the third eight days were neutral, she has used up 8 good days, 8 bad days, and 8 neutral days. This means that she has 20 days left, and if the next three days are good, neutral, and good, she will have 17 good days left in the month.[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `calculator` with `965589 - (5 * 2)`


[0m[33;1m[1;3m965579[0m

 50%|█████     | 15/30 [01:47<01:39,  6.62s/it]

[32;1m[1;3mVioletta will get $965,579 in change after buying 5 crayons at $2 each.[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `calculator` with `7442649*5+7442649*8`


[0m[33;1m[1;3m96754437[0m

 53%|█████▎    | 16/30 [01:55<01:40,  7.16s/it]

[32;1m[1;3mLloyd earned $96,754,437 for the first two weeks of math tutoring.[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `calculator` with `2298834 - 21 - 23 - 9 - 28`


[0m[33;1m[1;3m2298753[0m

 57%|█████▋    | 17/30 [02:02<01:32,  7.11s/it]

[32;1m[1;3mAfter buying stickers from the store, receiving some for his birthday, giving some to his sister, and using some to decorate a greeting card, Charlie has 2,298,753 stickers left.[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `calculator` with `36/6`


[0m[33;1m[1;3m6.0[0m[32;1m[1;3m
Invoking: `calculator` with `24/4983407`


[0m[33;1m[1;3m4.81598231892358e-06[0m

 60%|██████    | 18/30 [02:15<01:47,  8.97s/it]

[32;1m[1;3mJackson will be able to plant 6 rows of red tulips and approximately 4.82e-06 rows of blue tulips. Since it's not possible to plant a fraction of a row, Jackson will plant 6 rows of red tulips and 0 rows of blue tulips.[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `calculator` with `200 + 300*0.5 + 7013802*30 + 6*2`


[0m[33;1m[1;3m210414422.0[0m

 63%|██████▎   | 19/30 [02:25<01:41,  9.22s/it]

[32;1m[1;3mThe total cost of the doctor's visit for Hannah is $21,041,422.[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `calculator` with `3 * 567745 / 10 * 1.25`


[0m[33;1m[1;3m212904.375[0m

 67%|██████▋   | 20/30 [02:30<01:17,  7.77s/it]

[32;1m[1;3mGreg will spend approximately $212,904.38 on jello for the birthday party.[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `calculator` with `5611617 * 0.25 * 0.4`


[0m[33;1m[1;3m561161.7000000001[0m

 70%|███████   | 21/30 [02:35<01:02,  6.95s/it]

[32;1m[1;3mOut of 5,611,617 riders, approximately 561,162 men can stay upright on the wave.[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `calculator` with `1007458 + (1007458 - 35) + 2*(1007458 - 35)`


[0m[33;1m[1;3m4029727[0m

 73%|███████▎  | 22/30 [12:49<25:13, 189.19s/it]

[32;1m[1;3mThe total number of gems in the chest is 4,029,727.[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `calculator` with `900 / 2775702 * 2.5`


[0m[33;1m[1;3m0.0008106057494644597[0m

 77%|███████▋  | 23/30 [12:59<15:48, 135.45s/it]

[32;1m[1;3mThe farmer will earn approximately $0.0008 if he sells the 900 eggs for $2.5 per tray.[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `calculator` with `1*52+(6363220+2*6363220)*52`


[0m[33;1m[1;3m992662372[0m

 80%|████████  | 24/30 [13:06<09:41, 96.97s/it] 

[32;1m[1;3mBud spends $99,266,237.20 on making macaroni and cheese in one year.[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `calculator` with `5*5+3*1551100+10`


[0m[33;1m[1;3m4653335[0m

 83%|████████▎ | 25/30 [13:11<05:46, 69.22s/it]

[32;1m[1;3mCherrie bought a total of 4,653,335 gifts for her friends, including the additional 10 random gifts.[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `calculator` with `(120-10)/(2+6+2)`


[0m[33;1m[1;3m11.0[0m

 87%|████████▋ | 26/30 [13:21<03:26, 51.54s/it]

[32;1m[1;3mVicki can allow 11 groups to perform in the concert.[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `calculator` with `6109674 / 3`


[0m[33;1m[1;3m2036558.0[0m

 90%|█████████ | 27/30 [13:27<01:53, 37.78s/it]

[32;1m[1;3mJennie put stamps on 2036558 letters, leaving 4073116 letters still needing stamps. If there are now 30 letters in the pile of already-stamped letters, then there were 30 + 4073116 = 4073146 letters in that pile when Jennie began.[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `calculator` with `61 - 993735 + 18`


[0m[33;1m[1;3m-993656[0m[32;1m[1;3m
Invoking: `calculator` with `61 - 993735 + 18`
responded: After performing the calculations, it seems that there is a mistake. Let me recalculate.

[0m[33;1m[1;3m-993656[0m[32;1m[1;3m
Invoking: `calculator` with `61 - 993735 + 18`
responded: I apologize for the confusion. It seems that there was an error in the calculation. Let me correct that.

[0m[33;1m[1;3m-993656[0m[32;1m[1;3m
Invoking: `calculator` with `61 - 993735 + 18`
responded: It seems that there is an error in the calculation. Let me correct that.

[0m[33;1m[1;3m-993656[0m[32;1m[1;3m
In

 93%|█████████▎| 28/30 [13:52<01:08, 34.04s/it]

[33;1m[1;3m-993656[0m[32;1m[1;3m[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `calculator` with `(6671520 / 7 - 4 * 5) / 2 + 4 * 5`


[0m[33;1m[1;3m476547.14285714284[0m

 97%|█████████▋| 29/30 [13:59<00:25, 25.96s/it]

[32;1m[1;3mPeter will have a total of approximately 476,547 action figures by the end of the week.[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `calculator` with `(40*10 + 4896852*12) / 2`


[0m[33;1m[1;3m29381312.0[0m

100%|██████████| 30/30 [14:12<00:00, 28.41s/it]

[32;1m[1;3mThe average total distance spat by Zoey and Sydney is 29,381,312 feet.[0m

[1m> Finished chain.[0m





[{'agent_name': 'gpt3.5_langchain2',
  'question': 'Charlie wants to sell beeswax candles.  For every pound of beeswax, he can make 10 tapered candles.  One pound of beeswax and the wicks cost $10.00 in supplies.   If he sells each candle for $2.00 each, what is his net profit if he makes and sells 1080379 candles?',
  'gt_answer': 1080379.0,
  'prediction': "Charlie's net profit, if he makes and sells 1,080,379 candles, would be $20,527,191.",
  'intermediate_steps': [{'tool': 'calculator',
    'tool_input': '10*1080379*2-(10.00+1080379)',
    'tool_output': '20527191.0'}],
  'parsing_error': False,
  'iteration_limit_exceeded': False,
  'agent_error': None,
  'start_time': '2024-02-07 15:57:38',
  'end_time': '2024-02-07 15:57:43',
  'task': 'GSM8K-hard'},
 {'agent_name': 'gpt3.5_langchain2',
  'question': 'Melissa is summoned to jury duty. She spends 6 hours a day for 3 days listening to a court case. If Melissa is paid $557654 per day but also has to pay $3 for parking each day, ho

### CrewAI

In [None]:
from crewai import Agent, Task, Crew, Process


async def call_crew_sequential(agent, question: str):
    output = agent.invoke(prompt + question).content
    # Creating a senior researcher agent
    researcher = Agent(
        role="Senior Researcher",
        goal=f"Solve this question: {question}",
        verbose=True,
        backstory="""Driven by curiosity, you like solving math problems""",
    )

    # Creating a writer agent
    veryfier = Agent(
        role="Senior Verifyer",
        goal=f"Check results of others for this question: {question}",
        verbose=True,
        backstory="""You have a flair to detect errors in math problems and correct them.""",
    )
    # Research task for identifying AI trends
    solving_task = Task(
        description=f"""Solve the given question: {question}. Think step-by-step.""",
        expected_output="A short answer",
        max_inter=3,
        tools=[langchain_math_tool, langchain_serpapi_tool],
        agent=researcher,
    )

    # Writing task based on research findings
    verification_task = Task(
        description=f"""You have to answer {question}, then provide a final answer.
        Think step-by-step to correct any errors. Remember to preface your final answer with 'Final Answer:'. In your final answer, do not talk about wether the previous answer was correct or not. Just provide the corrected answer.
        """,
        expected_output=f"The corrected answer to the question {question}",
        tools=[langchain_math_tool, langchain_serpapi_tool],
        agent=veryfier,
    )
    # Forming the tech-focused crew
    crew = Crew(
        agents=[researcher, veryfier],
        tasks=[solving_task, verification_task],
        process=Process.sequential,  # Sequential task execution
    )
    output = crew.kickoff()

    if "Final Answer:" in output:
        output = output.split("Final Answer:")[1]
    response = {"output": output, "intermediate_steps": []}
    return response


llm_gpt = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)

await answer_questions(
    eval_dataset,
    llm_gpt,
    "gpt3.5_crewai_sequential",
    output_folder="output_dspy",
    agent_call_function=call_crew_sequential,
)

100%|██████████| 30/30 [00:00<00:00, 8927.22it/s]


[{'agent_name': 'gpt3.5_crewai_sequential',
  'question': 'A craft store makes a third of its sales in the fabric section, a quarter of its sales in the jewelry section, and the rest in the stationery section. They made 36 sales today. How many sales were in the stationery section?',
  'gt_answer': 'The craft store made 36 / 3 = <<36/3=12>>12 sales in the fabric section.\nIt made 36 / 4 = <<36/4=9>>9 sales in the jewelry section.\nThus, there were 36 - 12 - 9 = <<36-12-9=15>>15 sales in the stationery section.\n#### 15',
  'prediction': 'The stationery section made 15 sales.',
  'intermediate_steps': [],
  'parsing_error': False,
  'iteration_limit_exceeded': False,
  'agent_error': None,
  'start_time': '2024-02-07 14:35:10',
  'end_time': '2024-02-07 14:35:50',
  'task': 'GSM8K'},
 {'agent_name': 'gpt3.5_crewai_sequential',
  'question': "Marcy is a makeup artist and has agreed to do some makeup for her friend's wedding. The only makeup she has limited stock of is lip gloss so she co

# Evaluate results

In [18]:
import pandas as pd
import glob
import json

result_df = pd.concat([pd.read_json(f) for f in glob.glob("output_dspy2/*.json")])
result_df = result_df.drop(["start_time", "end_time"], axis=1)
answers = result_df.to_dict(orient="records")
with open("output/results_dspy_experiment.json", "w") as f:
    json.dump(answers, f)

In [19]:
from scripts.evaluation import evaluate_answers
from scripts.prompts import EVALUATION_PROMPT_TEMPLATE

output_file_name = "output/results_dspy_experiment.json"

with open(output_file_name, "r") as f:
    answers = json.load(f)

results = await evaluate_answers(
    answers,
    eval_chat_model,
    "GPT4",
    EVALUATION_PROMPT_TEMPLATE,
    output_file=output_file_name,
)
result_df = pd.DataFrame.from_records(results)

print("Evaluation is complete!")

  0%|          | 0/64 [00:00<?, ?it/s]

100%|██████████| 64/64 [00:20<00:00,  3.20it/s]

Evaluation is complete!





In [20]:
result_df["eval_score_GPT4"] = result_df["eval_score_GPT4"].astype(float)
result_df["eval_score_GPT4"] = (result_df["eval_score_GPT4"] - 1) / 4
display(result_df.groupby("agent_name")["eval_score_GPT4"].mean())

agent_name
dspy_agent              0.000000
gpt3.5_agent_notools    0.566667
gpt3.5_langchain2       0.433333
Name: eval_score_GPT4, dtype: float64

##### Thoughts on DSPy:
- Contrary to the framework's promise (see discussions [here](https://news.ycombinator.com/item?id=37417698)), many prompts are hardcoded: [example](https://github.com/search?q=repo%3Astanfordnlp%2Fdspy%20To%20do%20this%2C%20you%20will%20interleave%20Thought%2C%20Action%2C%20and%20Observation%20steps.&type=code)
- The frameworks makes it really complicated to get visibility on what's happening under the hood.
- Probably mainly for this reason, I did not manage to get it to achieve correct performance in 1 day of work.

# CrewAI