In [1]:
from pydantic import BaseModel, Field
from typing import List
import guidance
import os


In [2]:
# Client for the SGLang server on localhost; the API key is read from the
# environment and falls back to the placeholder "NO_KEY" when unset.
base_lm = guidance.models.experimental.SglangModel(
    model="Qwen/Qwen3-30B-A3B-Instruct-2507",
    base_url="http://127.0.0.1:30000/v1",
    echo=True,
    api_key=os.environ.get("OPENAI_API_KEY", "NO_KEY"),
)

In [30]:
from pydantic import BaseModel, Field
from typing import List
import guidance
from guidance import models, system, user, assistant, json as gen_json
import json

class ReasoningPlan(BaseModel):
    """Structured output of the planning stage: the list of steps to carry out."""

    steps: List[str] = Field(
        ...,
        description=(
            "A list of high-level, concise reasoning steps necessary to answer the query. "
            "Each step should be a clear action or question to address, without including detailed reasoning, sub-steps, or conclusions. "
            "Examples: 'Identify the subject of the sentence.', 'Analyze the key phrase in the context of the subject's domain.' "
            "Do not include a conclusion step."
        ),
    )

class ReasoningStep(BaseModel):
    """Structured output produced when a single planned step is executed."""

    observation: str = Field(
        ...,
        description=(
            "The result or observation from executing this reasoning step. "
            "Provide factual information, analysis, or findings relevant to the step, without jumping to overall conclusions."
        ),
    )

class ReasoningConclusion(BaseModel):
    """Structured output of the closing stage: one free-text conclusion."""

    conclusion: str = Field(
        ...,
        description=(
            "The final answer or conclusion to the query, "
            "based on all previous reasoning steps."
        ),
    )

@guidance()
def _generate_reasoning_steps(llm):
    """Append an assistant turn holding a ``ReasoningPlan`` JSON object.

    The generated JSON is captured under the name ``"reasoning_plan"``;
    generation is capped at 300 tokens to keep the plan short.
    """
    plan_request = dict(name="reasoning_plan", schema=ReasoningPlan, max_tokens=300)
    with assistant():
        llm += gen_json(**plan_request)
    return llm

@guidance()
def _intermediate_reasoning(llm, reasoning_plan_list):
    """Execute every planned step in its own assistant turn.

    Args:
        llm: Guidance model state to extend.
        reasoning_plan_list: Step descriptions, processed in list order.

    Returns:
        The model state with one JSON capture per step, named
        ``reasoning_step_{i}`` where ``i`` is the step's index.
    """
    with user():
        llm += """Follow the reasoning plan step-by-step. For each step, provide only the observation or result from executing it. Do not add extra details, sub-steps, or conclusions."""

    for i, step_description in enumerate(reasoning_plan_list):
        # Build a throwaway schema per step rather than rewriting
        # ReasoningStep.model_fields in place: that edit stayed on the shared
        # class after the call, so the last step's text leaked into every
        # later use of ReasoningStep.
        # NOTE(review): schema-constrained decoding presumably enforces only
        # the JSON structure; confirm the description text actually reaches
        # the model, otherwise put the step text in the prompt instead.
        class StepObservation(ReasoningStep):
            observation: str = Field(
                ...,
                description=f"Execute this step: '{step_description}'. Provide the direct result or key findings.",
            )

        with assistant():
            llm += gen_json(
                name=f"reasoning_step_{i}",
                schema=StepObservation,
                max_tokens=200,  # cap each observation's length
            )

    return llm

@guidance()
def _reasoning_conclusion(llm):
    """Ask for the overall conclusion and capture it as ``"reasoning_conclusion"``.

    Adds one user turn with the request, then one assistant turn constrained
    to the ``ReasoningConclusion`` schema (at most 400 tokens).
    """
    with user():
        llm += "Now, based on all the previous steps, provide the final conclusion."

    conclusion_request = dict(
        name="reasoning_conclusion",
        schema=ReasoningConclusion,
        max_tokens=400,
    )
    with assistant():
        llm += gen_json(**conclusion_request)

    return llm

@guidance
def dynamic_structured_reasoning(llm, query):
    """Run the plan -> execute -> conclude conversation for ``query``.

    The plan is generated first, parsed from its JSON capture, and its
    ``steps`` list then drives the per-step execution stage.
    """
    # The multi-line prompt literals below are kept verbatim and in place.
    with system():
        llm += """You are an expert reasoner. For any query, dynamically generate a concise list of intermediate reasoning steps tailored to the query. Execute each step to gather observations, then provide a final conclusion. 
        Structure all outputs as JSON. Create custom, high-level steps for the query without predefined templates. Keep steps action-oriented and free of details or conclusions."""

    with user():
        llm += f"""Query: {query}
        Generate a reasoning plan as a list of steps to resolve the query."""

    # Stage 1: draft the plan and read the step list back out of the capture.
    llm += _generate_reasoning_steps()
    planned_steps = json.loads(llm.get("reasoning_plan"))['steps']

    # Stage 2: execute each step; Stage 3: conclude.
    llm += _intermediate_reasoning(reasoning_plan_list=planned_steps)
    llm += _reasoning_conclusion()

    return llm

In [None]:
# Single-query smoke run of the reasoning pipeline against the base model.
result_lm = base_lm + dynamic_structured_reasoning(
    query='Is the following sentence plausible? "Bryce Harper hit the back shoulder fade."',
)


StitchWidget(initial_height='auto', initial_width='100%', srcdoc='<!doctype html>\n<html lang="en">\n<head>\n …

In [34]:
# Configuration names selectable when loading the benchmark; later cells pick
# one by position.
available_configs = [
    'boolean_expressions',
    'causal_judgement',
    'date_understanding',
    'disambiguation_qa',
    'dyck_languages',
    'formal_fallacies',
    'geometric_shapes',
    'hyperbaton',
    'logical_deduction_five_objects',
    'logical_deduction_seven_objects',
    'logical_deduction_three_objects',
    'movie_recommendation',
    'multistep_arithmetic_two',
    'navigate',
    'object_counting',
    'penguins_in_a_table',
    'reasoning_about_colored_objects',
    'ruin_names',
    'salient_translation_error_detection',
    'snarks',
    'sports_understanding',
    'temporal_sequences',
    'tracking_shuffled_objects_five_objects',
    'tracking_shuffled_objects_seven_objects',
    'tracking_shuffled_objects_three_objects',
    'web_of_lies',
    'word_sorting',
]

'causal_judgement'

In [None]:
from datasets import load_dataset

# Fetch one BBH configuration (position 1 in available_configs) from Hugging Face.
dataset = load_dataset("lukaemon/bbh", name=available_configs[1])

# List the top-level keys of the returned dataset dictionary.
print("Available tasks:", list(dataset))



Available tasks: ['test']


In [42]:
import pandas as pd

# Materialise the 'test' split as a pandas DataFrame.
df = pd.DataFrame(dataset['test'])


In [44]:
# Preview the first rows of the loaded split.
df.head()

Unnamed: 0,input,target
0,How would a typical person answer each of the ...,No
1,How would a typical person answer each of the ...,No
2,How would a typical person answer each of the ...,Yes
3,How would a typical person answer each of the ...,No
4,How would a typical person answer each of the ...,Yes


In [80]:
# Distinct target labels, in order of first appearance; these are the allowed answers.
choices = [label for label in df["target"].unique()]
choices

['No', 'Yes']

In [100]:
from pydantic import BaseModel, Field
from typing import List
import guidance
from guidance import models, system, user, assistant, json as gen_json, select
import json

class ReasoningPlan(BaseModel):
    """Structured output of the planning stage: the list of steps to carry out."""

    steps: List[str] = Field(
        ...,
        description=(
            "A list of high-level, concise reasoning steps necessary to answer the query. "
            "Each step should be a clear action or question to address, without including detailed reasoning, sub-steps, or conclusions. "
            "Examples: 'Identify the subject of the sentence.', 'Analyze the key phrase in the context of the subject's domain.' "
            "Do not include a conclusion step."
        ),
    )

class ReasoningStep(BaseModel):
    """Structured output produced when a single planned step is executed."""

    observation: str = Field(
        ...,
        description=(
            "The result or observation from executing this reasoning step. "
            "Provide factual information, analysis, or findings relevant to the step, without jumping to overall conclusions."
        ),
    )

class ReasoningConclusion(BaseModel):
    """Structured output of the closing stage: one free-text conclusion."""

    conclusion: str = Field(
        ...,
        description=(
            "The final answer or conclusion to the query, "
            "based on all previous reasoning steps."
        ),
    )

from enum import Enum
# Closed set of permitted answers: one enum member per label in ``choices``,
# with the label used as both member name and value.
Options = Enum(
    "Choices",
    {label: label for label in choices},
)

class FinalAnswer(BaseModel):
    """Schema restricting the final answer to a member of ``Options``."""

    answer: Options = Field(
        ...,
        description="Final answer.",
    )


@guidance()
def _generate_reasoning_steps(llm):
    """Append an assistant turn holding a ``ReasoningPlan`` JSON object.

    The generated JSON is captured under the name ``"reasoning_plan"``;
    generation is capped at 600 tokens.
    """
    plan_request = dict(name="reasoning_plan", schema=ReasoningPlan, max_tokens=600)
    with assistant():
        llm += gen_json(**plan_request)
    return llm

@guidance()
def _intermediate_reasoning(llm, reasoning_plan_list):
    """Execute every planned step in its own assistant turn.

    Args:
        llm: Guidance model state to extend.
        reasoning_plan_list: Step descriptions, processed in list order.

    Returns:
        The model state with one JSON capture per step, named
        ``reasoning_step_{i}`` where ``i`` is the step's index.
    """
    with user():
        llm += """Follow the reasoning plan step-by-step. For each step, provide only the observation or result from executing it. Do not add extra details, sub-steps, or conclusions."""

    for i, step_description in enumerate(reasoning_plan_list):
        # Build a throwaway schema per step rather than rewriting
        # ReasoningStep.model_fields in place: that edit stayed on the shared
        # class after the call, so the last step's text leaked into every
        # later use of ReasoningStep.
        # NOTE(review): schema-constrained decoding presumably enforces only
        # the JSON structure; confirm the description text actually reaches
        # the model, otherwise put the step text in the prompt instead.
        class StepObservation(ReasoningStep):
            observation: str = Field(
                ...,
                description=f"Execute this step: '{step_description}'. Provide the direct result or key findings.",
            )

        with assistant():
            llm += gen_json(
                name=f"reasoning_step_{i}",
                schema=StepObservation,
                max_tokens=600,  # cap each observation's length
            )

    return llm

@guidance()
def _reasoning_conclusion(llm):
    """Ask for the overall conclusion and capture it as ``"reasoning_conclusion"``.

    Adds one user turn with the request, then one assistant turn constrained
    to the ``ReasoningConclusion`` schema (at most 600 tokens).
    """
    with user():
        llm += "Now, based on all the previous steps, provide the final conclusion."

    conclusion_request = dict(
        name="reasoning_conclusion",
        schema=ReasoningConclusion,
        max_tokens=600,
    )
    with assistant():
        llm += gen_json(**conclusion_request)

    return llm

@guidance
def dynamic_structured_reasoning(llm, query, output_choices):
    """Run the plan -> execute -> conclude -> answer conversation for one query.

    Args:
        llm: Guidance model state to extend.
        query: Question text inserted into the first user turn.
        output_choices: Labels the final answer may take. When ``None`` or
            empty, the module-level ``FinalAnswer`` schema is used instead.

    Returns:
        The model state with captures ``reasoning_plan``,
        ``reasoning_step_{i}``, ``reasoning_conclusion`` and ``answer``.
    """
    # Build the answer schema from the caller's choices. This argument used to
    # be ignored, which silently tied every call to whatever the global
    # ``choices`` held when ``FinalAnswer`` was defined.
    labels = list(output_choices) if output_choices is not None else []
    if labels:
        answer_enum = Enum("Choices", {choice: choice for choice in labels})

        class ChoiceAnswer(BaseModel):
            answer: answer_enum = Field(..., description="Final answer.")

        answer_schema = ChoiceAnswer
    else:
        # Backward-compatible fallback for callers that pass nothing useful.
        answer_schema = FinalAnswer

    with system():
        llm += """You are an expert reasoner. For any query, dynamically generate a concise list of intermediate reasoning steps tailored to the query. Execute each step to gather observations, then provide a final conclusion. 
        Structure all outputs as JSON. Create custom, high-level steps for the query without predefined templates. Keep steps action-oriented and free of details or conclusions."""

    with user():
        llm += f"""Query: {query}
        Generate a reasoning plan as a list of steps to resolve the query."""

    llm += _generate_reasoning_steps()

    # The plan capture is a JSON string; its ``steps`` list drives the next stage.
    reasoning_plan = json.loads(llm.get("reasoning_plan"))
    reasoning_plan_list = reasoning_plan['steps']

    with user():
        llm += f"""
        Execute the reasoning plan steps."""

    llm += _intermediate_reasoning(reasoning_plan_list=reasoning_plan_list)

    with user():
        llm += f"""
        Make concluding statements."""

    llm += _reasoning_conclusion()

    with user():
        llm += f"""
        Choose final answer."""

    with assistant():
        llm += gen_json(
            name="answer",
            schema=answer_schema,
            max_tokens=600)

    return llm

In [109]:
def generate(query, output_choices):
    """Run the reasoning pipeline for one query and unpack its captures.

    Args:
        query: Question text forwarded to ``dynamic_structured_reasoning``.
        output_choices: Allowed answer labels, forwarded unchanged.

    Returns:
        A 4-tuple ``(reasoning_plan, reasoning_steps, reasoning_conclusion,
        answer)``: the planned step list, one observation per planned step,
        the conclusion text, and the value of the ``answer`` capture.
    """
    # Forward the caller's arguments. The previous body hard-coded a sample
    # sentence and read the global ``choices``, so every call answered the
    # same question regardless of its inputs.
    result_lm = base_lm + dynamic_structured_reasoning(
        query=query,
        output_choices=output_choices,
    )

    answer = json.loads(result_lm.get("answer"))['answer']

    reasoning_plan = json.loads(result_lm.get("reasoning_plan"))['steps']

    # One capture exists per planned step, named by the step's index.
    reasoning_steps = [
        json.loads(result_lm.get(f"reasoning_step_{i}"))['observation']
        for i in range(len(reasoning_plan))
    ]

    reasoning_conclusion = json.loads(result_lm.get("reasoning_conclusion"))['conclusion']
    return reasoning_plan, reasoning_steps, reasoning_conclusion, answer


In [102]:
df.iloc[0]['input']

'How would a typical person answer each of the following questions about causation?\nA machine is set up in such a way that it will short circuit if both the black wire and the red wire touch the battery at the same time. The machine will not short circuit if just one of these wires touches the battery. The black wire is designated as the one that is supposed to touch the battery, while the red wire is supposed to remain in some other part of the machine. One day, the black wire and the red wire both end up touching the battery at the same time. There is a short circuit. Did the black wire cause the short circuit?\nOptions:\n- Yes\n- No'

In [103]:
# Compare the first row's target with a previously computed `result`.
# NOTE(review): `result` is not assigned in any cell shown above; the only
# visible assignment is commented out in a later cell. Confirm this cell still
# runs on a fresh kernel.
df.iloc[0]['target'] == result

True

In [None]:
# result = generate(df.iloc[0]['input'], choices)

In [104]:
def evaluate_reasoning(row):
    """Score the reasoning pipeline on a single benchmark row.

    Args:
        row: Mapping-like row providing ``'input'`` (the query text) and
            ``'target'`` (the expected answer label).

    Returns:
        ``True`` when the predicted answer equals the target, else ``False``.
    """
    query = row['input']
    target = row['target']

    outcome = generate(query, choices)
    # generate() returns (plan, steps, conclusion, answer). Comparing that
    # whole tuple with the target string was always False, so score only the
    # final element. A bare answer is also accepted for older variants.
    predicted = outcome[-1] if isinstance(outcome, tuple) else outcome
    return predicted == target


In [105]:
# Call evaluate_reasoning once per row and store each row's result in a new
# 'correct' column (this adds the column to df in place).
df['correct'] = df.apply(evaluate_reasoning, axis=1)


In [107]:
# Tally how many rows were scored True vs. False.
df['correct'].value_counts()

correct
False    97
True     90
Name: count, dtype: int64

In [None]:
from pydantic import BaseModel, Field
from typing import List
import guidance
from guidance import models, system, user, assistant, json as gen_json, select
import json
from enum import Enum


class Strategy(BaseModel):
    """Structured output of the first stage: a free-text overall approach."""

    strategy: str = Field(
        ...,
        description=(
            "Carefully go over the problem statement and device a strategy to address the query. "
            "Do not delve into the details yet, just plan your general approach."
        ),
    )


class ReasoningPlan(BaseModel):
    """Structured output of the planning stage: a checklist of questions."""

    steps: List[str] = Field(
        ...,
        description=(
            "Based on the strategy create a checklist of questions "
            "which must be answered to systematically arrive at the solution."
        ),
    )

class ReasoningStep(BaseModel):
    """Structured output produced when one checklist item is worked through."""

    observation: str = Field(
        ...,
        description=("Find the answer to the question in the checklist."),
    )

class ReasoningConclusion(BaseModel):
    """Structured output of the closing stage: one free-text conclusion."""

    conclusion: str = Field(
        ...,
        description=(
            "The final answer or conclusion to the query, "
            "based on all previous reasoning steps."
        ),
    )

def dynamic_final_answer_schema(Options):
    """Create a ``FinalAnswer`` model whose ``answer`` field is typed as ``Options``.

    A new model class is defined on every call, so each caller can supply its
    own set of allowed answers.
    """

    class FinalAnswer(BaseModel):
        answer: Options = Field(
            ...,
            description="Final answer.",
        )

    return FinalAnswer


@guidance()
def _strategize(llm):
    """Append an assistant turn holding a ``Strategy`` JSON object.

    The generated JSON is captured as ``"strategy"``, capped at 1000 tokens.
    """
    strategy_request = dict(name="strategy", schema=Strategy, max_tokens=1000)
    with assistant():
        llm += gen_json(**strategy_request)
    return llm

@guidance()
def _generate_reasoning_steps(llm):
    """Append an assistant turn holding a ``ReasoningPlan`` JSON object.

    The generated JSON is captured under the name ``"reasoning_plan"``;
    generation is capped at 3000 tokens.
    """
    plan_request = dict(name="reasoning_plan", schema=ReasoningPlan, max_tokens=3000)
    with assistant():
        llm += gen_json(**plan_request)
    return llm

@guidance()
def _intermediate_reasoning(llm, reasoning_plan_list):
    """Work through every checklist item in its own assistant turn.

    Args:
        llm: Guidance model state to extend.
        reasoning_plan_list: Checklist items, processed in list order.

    Returns:
        The model state with one JSON capture per item, named
        ``reasoning_step_{i}`` where ``i`` is the item's index.
    """
    with user():
        llm += """This is the scratch pad"""

    for i, step_description in enumerate(reasoning_plan_list):
        # Build a throwaway schema per item rather than rewriting
        # ReasoningStep.model_fields in place: that edit stayed on the shared
        # class after the call, so the last item's text leaked into every
        # later use of ReasoningStep.
        # NOTE(review): schema-constrained decoding presumably enforces only
        # the JSON structure; confirm the description text actually reaches
        # the model, otherwise put the item text in the prompt instead.
        class StepObservation(ReasoningStep):
            observation: str = Field(
                ...,
                description=f"{step_description}",
            )

        with assistant():
            llm += gen_json(
                name=f"reasoning_step_{i}",
                schema=StepObservation,
                max_tokens=1000,  # cap each observation's length
            )

    return llm

@guidance()
def _reasoning_conclusion(llm):
    """Ask for deductions and capture the result as ``"reasoning_conclusion"``.

    Adds one user turn with the request, then one assistant turn constrained
    to the ``ReasoningConclusion`` schema (at most 1000 tokens).
    """
    with user():
        llm += "Based on the information, make logical deductions."

    conclusion_request = dict(
        name="reasoning_conclusion",
        schema=ReasoningConclusion,
        max_tokens=1000,
    )
    with assistant():
        llm += gen_json(**conclusion_request)

    return llm

@guidance
def dynamic_structured_reasoning(llm, query, output_choices):
    """Drive the strategy -> checklist -> scratch pad -> conclusion -> answer chat.

    The final answer is constrained to one of ``output_choices`` through an
    enum-typed schema that is built on every call.
    """
    answer_enum = Enum("Choices", {choice: choice for choice in output_choices})
    answer_schema = dynamic_final_answer_schema(answer_enum)

    # The multi-line prompt literals below are kept verbatim and in place.
    with system():
        llm += """You are an expert in problem solving and a master strategist. For any query, strategize, draft an approach, create a checklist of intermediate steps which when followed can systematically solve the query. Execute each step to gather observations, then provide a final conclusion. 
        Structure all outputs as JSON. Create a general strategy without predefined templates. Use systems thinking and think from first principles. Keep steps action-oriented and free of details or conclusions.
        Be very brief in your steps, tending towards concise but complete responses.
        """

    # Stage 1: overall strategy.
    with user():
        llm += f"""Query: {query}
        Devise a strategic plan to resolve the query -
        """
    llm += _strategize()

    # Stage 2: checklist, read back out of its JSON capture.
    with user():
        llm += f"""
        You are now given a scratch pad to workout a solution, in a list write down the title's of those scratch pads so you may use them in the next step -
        """
    llm += _generate_reasoning_steps()
    checklist = json.loads(llm.get("reasoning_plan"))['steps']

    # Stage 3: one observation per checklist item.
    with user():
        llm += f"""
        Find the answer to the question in the checklist -
        """
    llm += _intermediate_reasoning(reasoning_plan_list=checklist)

    # Stage 4: conclusion.
    with user():
        llm += f"""
        Make logical deductions based on the information you have to arrive at a conclusion -
        """
    llm += _reasoning_conclusion()

    # Stage 5: final answer, restricted to the supplied choices.
    with user():
        llm += f"""
        Your final answer -
        """
    with assistant():
        llm += gen_json(name="answer", schema=answer_schema, max_tokens=1000)

    return llm

In [98]:
def generate(query, output_choices):
    """Run the reasoning pipeline for one query and unpack its captures.

    Returns a 4-tuple ``(reasoning_plan, reasoning_steps,
    reasoning_conclusion, answer)``.
    """
    pipeline = dynamic_structured_reasoning(query=query, output_choices=output_choices)
    result_lm = base_lm + pipeline

    def _capture(name, key):
        # Every capture is a JSON string holding a single-key object.
        return json.loads(result_lm.get(name))[key]

    answer = _capture("answer", 'answer')
    reasoning_plan = _capture("reasoning_plan", 'steps')

    # One capture exists per planned step, named by the step's index.
    reasoning_steps = [
        _capture(f"reasoning_step_{idx}", 'observation')
        for idx in range(len(reasoning_plan))
    ]

    reasoning_conclusion = _capture("reasoning_conclusion", 'conclusion')
    return reasoning_plan, reasoning_steps, reasoning_conclusion, answer


In [99]:
from datasets import load_dataset

# Load the ZebraLogic dataset ('mc_mode' configuration) from Hugging Face
dataset = load_dataset("WildEval/ZebraLogic", 'mc_mode')



In [100]:
# Display the loaded dataset's splits, features and row counts.
dataset

DatasetDict({
    test: Dataset({
        features: ['id', 'puzzle', 'question', 'choices', 'answer', 'created_at'],
        num_rows: 3259
    })
})

In [101]:
def evaluate_reasoning(row):
    """Run the reasoning pipeline on one row and package the outcome.

    The query is the row's ``puzzle`` and ``question`` joined by a blank
    line; the row's ``choices`` constrain the answer and ``answer`` is the
    expected label.

    Returns:
        dict with keys ``correct``, ``reasoning_plan``, ``reasoning_steps``,
        ``reasoning_conclusion`` and ``predicted_answer``.
    """
    prompt = '\n\n'.join((row['puzzle'], row['question']))
    allowed = row['choices']
    expected = row['answer']

    plan, steps, conclusion, predicted = generate(prompt, allowed)

    outcome = dict(
        correct=(predicted == expected),
        reasoning_plan=plan,
        reasoning_steps=steps,
        reasoning_conclusion=conclusion,
        predicted_answer=predicted,
    )
    return outcome
# reasoning_plan, reasoning_steps, reasoning_conclusion, answer

In [None]:
import pandas as pd

# Evaluate a single row (position 1 of the test split) and append the
# pipeline's outputs as extra columns next to the original fields.
df = pd.DataFrame(dataset['test']).iloc[1:2]

res = df.apply(lambda row: pd.Series(evaluate_reasoning(row)), axis=1)
df = pd.concat([df, res], axis=1)

In [107]:
# Inspect the current contents of df.
df

Unnamed: 0,id,puzzle,question,choices,answer,created_at
1,lgp-test-3x6-15#mc-13,"There are 3 houses, numbered 1 to 3 from left ...",What is Education of the person who lives in H...,"[associate, high school, bachelor]",associate,2024-07-03T21:21:31.316147


In [106]:
# NOTE(review): this column only exists after the evaluation cell has
# concatenated its results onto df; the recorded run raised KeyError here, so
# confirm the cells execute in order on a fresh kernel.
df['reasoning_steps']

KeyError: 'reasoning_steps'