In [1]:
from pydantic import BaseModel, Field
from typing import List
import guidance
import os


In [2]:
# Model handle pointed at an SGLang server on localhost. The API key is read
# from OPENAI_API_KEY and falls back to the dummy value "NO_KEY" when unset.
base_lm = guidance.models.experimental.SglangModel(
    model="Qwen/Qwen3-30B-A3B-Instruct-2507",
    base_url="http://127.0.0.1:30000/v1",
    echo=True,
    api_key=os.environ.get("OPENAI_API_KEY", "NO_KEY"),
)

In [3]:
# def run_gen_test(lm):
#     with guidance.user():
#         lm += "What is the capital of France? and its population?"
#         lm += "Format your answer as follows: Capital: <capital>, Population: <population>"

#     with guidance.assistant():
#         lm += guidance.gen(max_tokens=1024, temperature=0.7, name="answer")
#         print(lm["answer"])

# run_gen_test(base_lm)

In [4]:
from pydantic import BaseModel, Field
from typing import List
import guidance
from guidance import models, system, user, assistant, json as gen_json, select
import json
from enum import Enum


# Output schema for the first pipeline stage: a list of statements the model
# takes as given from the problem text.
# NOTE(review): documented with a comment rather than a docstring on the
# assumption that pydantic copies class docstrings into the generated JSON
# schema -- confirm before converting this to a docstring.
class FirstPrinciples(BaseModel):
    absolute_truths: List[str] = Field(
        description="Look at the problem statement and list the things that you know that are absolute truths.",
    )


# Output schema for the second pipeline stage: one free-text reasoning string.
# The docstring below is left untouched because it may be emitted as part of
# the JSON schema (NOTE(review): confirm against the pydantic version in use).
class GroundUpReasoning(BaseModel):
    """Model for the LLM-generated reasoning plan."""
    reason: str = Field(
        description="Based on the known truths reason in a concise sharp manner from ground up to address the problem.",
    )

# Output schema for the third pipeline stage: the model's critique of its own
# reasoning.
# NOTE(review): the docstring below describes a "reasoning step execution",
# which does not match the `evaluation` field; it is kept byte-for-byte because
# it may be emitted as part of the JSON schema -- confirm before rewording.
class Reflection(BaseModel):
    """Model for an individual reasoning step execution."""
    evaluation: str = Field(
        description="Reflect on your reasoning and identify gaps / uncertainties / mistakes.",
    )

# Output schema for the fourth pipeline stage: the conclusion reached after the
# reflection step. Docstring kept verbatim (it may be part of the JSON schema).
class ReasoningConclusion(BaseModel):
    """Model for the final reasoning conclusion."""
    conclusion: str = Field(
        description="Address the identified gaps in the previous step if any and deduce the final conclusion.",
    )

def dynamic_final_answer_schema(Options):
    """Build a pydantic model whose only field, `answer`, is typed as `Options`.

    Args:
        Options: The type to use for the `answer` field. The caller in this
            notebook passes an `Enum` built from the answer choices.

    Returns:
        A new `FinalAnswer` model class; a fresh class is created on every call.
    """
    # No docstring on the inner class: it may end up in the generated schema.
    class FinalAnswer(BaseModel):
        answer: Options = Field(description="Final answer.")

    return FinalAnswer


@guidance()
def _first_principles(llm):
    """Append an assistant turn holding JSON for the `FirstPrinciples` schema.

    The generated text is captured under the name "first_principles" with a
    budget of 5000 tokens. Returns the extended model state.
    """
    with assistant():
        llm += gen_json(name="first_principles", schema=FirstPrinciples, max_tokens=5000)
    return llm

@guidance()
def _generate_groundup_reasoning(llm):
    """Append an assistant turn holding JSON for the `GroundUpReasoning` schema.

    The generated text is captured under the name "groundup_reasoning". The
    token budget (5000) is the same as for the other reasoning stages.
    """
    with assistant():
        llm += gen_json(name="groundup_reasoning", schema=GroundUpReasoning, max_tokens=5000)
    return llm

@guidance()
def _reflection(llm):
    """Append an assistant turn holding JSON for the `Reflection` schema.

    The generated text is captured under the name "reflection" with a budget
    of 5000 tokens. Returns the extended model state.
    """
    with assistant():
        llm += gen_json(name="reflection", schema=Reflection, max_tokens=5000)
    return llm
    

@guidance()
def _reasoning_conclusion(llm):
    """Append a user instruction, then an assistant JSON conclusion.

    Unlike the other stage helpers, this one adds its own user turn before the
    assistant turn. The assistant output follows the `ReasoningConclusion`
    schema and is captured under the name "reasoning_conclusion" (5000-token
    budget). Returns the extended model state.
    """
    with user():
        llm += """Based on the information, make logical deductions."""

    with assistant():
        llm += gen_json(name="reasoning_conclusion", schema=ReasoningConclusion, max_tokens=5000)

    return llm

@guidance
def dynamic_structured_reasoning(llm, query, output_choices):
    """Run the five-stage structured-reasoning conversation for one query.

    Stages, in order: first principles, ground-up reasoning, reflection,
    conclusion, final answer. Each stage is a user prompt followed by an
    assistant turn containing schema-shaped JSON.

    Args:
        llm: Model state to extend.
        query: Problem text, interpolated into the first user prompt.
        output_choices: Iterable of answer strings; each becomes a member of
            the enum used to type the final `answer` field.

    Returns:
        The extended model state, with captures named "first_principles",
        "groundup_reasoning", "reflection", "reasoning_conclusion" and
        "answer".

    NOTE(review): this function is decorated with bare `@guidance` while the
    stage helpers use `@guidance()`. Also, guidance's decorator may rewrite
    (dedent) the multi-line string literals below, so the indentation seen in
    this source is not necessarily what reaches the model -- confirm against
    the installed guidance version before reformatting any prompt.
    """
    # Enum whose member names and values are both the choice strings.
    Options = Enum("Choices", {choice: choice for choice in output_choices})
    final_answer_schema = dynamic_final_answer_schema(Options)

    with system():
        llm += """You are an expert in problem solving. For any query, reason from first principles to solve the query. 
        Structure all outputs as JSON. Be very brief in your steps, tending towards concise but complete responses.
        """

    # Stage 1: state the query and ask for the known truths.
    with user():
        llm += f"""Query: {query}
        Look at the problem statement and list the things that you know that are absolute truths -
        """

    llm += _first_principles()

    # Stage 2: ground-up reasoning. The f-prefix on this and the following
    # prompts has no placeholders to fill.
    with user():
        llm += f"""
        Based on the known truths reason from ground up to address the problem, be extremely brief and to the point in your reasoning -
        """

    llm += _generate_groundup_reasoning()

    # Stage 3: self-critique of the reasoning so far.
    with user():
        llm += f"""
        Reflect to identify gaps / uncertainties / mistakes in reasoning process -
        """

    llm += _reflection()

    # Stage 4: conclusion. `_reasoning_conclusion` adds its own user turn as
    # well, so two user messages precede the assistant turn here.
    with user():
        llm += f"""
        Make logical deductions based on the information you have to arrive at a conclusion -
        """

    llm += _reasoning_conclusion()

    # Stage 5: final answer, typed by the per-call enum schema built above.
    with user():
        llm += f"""
        Your final answer -
        """

    with assistant():
        llm += gen_json(
            name="answer",
            schema=final_answer_schema,
            max_tokens=1000)

    return llm

In [5]:
def generate(query, output_choices):
    """Run the structured-reasoning pipeline and unpack its JSON captures.

    Args:
        query: Problem text passed to `dynamic_structured_reasoning`.
        output_choices: Allowed answers passed to `dynamic_structured_reasoning`.

    Returns:
        Tuple `(first_principles, groundup_reasoning, reflection,
        reasoning_conclusion, answer)` taken from the parsed captures.

    Raises:
        json.JSONDecodeError: If any capture is not valid JSON.
    """
    result_lm = base_lm + dynamic_structured_reasoning(
        query=query,
        output_choices=output_choices,
    )

    def _extract(capture_name, key):
        # Parse one named capture and pull a single key out of it.
        return json.loads(result_lm.get(capture_name))[key]

    # Parse order matches the original, so the first failing capture is the same.
    answer = _extract("answer", "answer")
    groundup_reasoning = _extract("groundup_reasoning", "reason")
    first_principles = _extract("first_principles", "absolute_truths")
    reflection = _extract("reflection", "evaluation")
    reasoning_conclusion = _extract("reasoning_conclusion", "conclusion")

    return first_principles, groundup_reasoning, reflection, reasoning_conclusion, answer


In [6]:
from datasets import load_dataset

# Load the ZebraLogic dataset ("mc_mode" configuration) from Hugging Face.
dataset = load_dataset(path="WildEval/ZebraLogic", name="mc_mode")



In [7]:
from json import JSONDecodeError


def evaluate_reasoning(row):
    """Run `generate` on one dataset row and score the predicted answer.

    Args:
        row: Mapping with the keys 'puzzle', 'question', 'choices', 'answer'
            and 'id'.

    Returns:
        Dict with 'correct' (prediction equals the row's answer), the four
        intermediate reasoning outputs, and 'predicted_answer'. If `generate`
        raises `JSONDecodeError`, all five outputs are the string
        "JSONDecodeError" and the row is scored as incorrect.
    """
    query = row['puzzle'] + '\n\n' + row['question']
    choices = row['choices']
    target = row['answer']
    print(row['id'])  # progress marker, one line per evaluated row

    try:
        outputs = generate(query, choices)
    except JSONDecodeError:
        # Sentinel in every output slot so the row stays in the results table.
        outputs = ["JSONDecodeError"] * 5
    first_principles, groundup_reasoning, reflection, reasoning_conclusion, pred = outputs

    return {
        'correct': pred == target,
        'first_principles': first_principles,
        'groundup_reasoning': groundup_reasoning,
        'reflection': reflection,
        'reasoning_conclusion': reasoning_conclusion,
        'predicted_answer': pred,
    }


In [12]:
import pandas as pd

# The test split is evaluated in manually advanced batches. Rows [:550] were
# finished in earlier runs; this run covers positions 550 to 599.
df = pd.DataFrame(dataset['test']).iloc[550:600]

# One pipeline call per row; each returned dict becomes one row of result
# columns, which are then joined onto the batch by index.
res = df.apply(lambda row: pd.Series(evaluate_reasoning(row)), axis=1)
df = pd.concat([df, res], axis=1)

lgp-test-5x5-5#mc-20


StitchWidget(initial_height='auto', initial_width='100%', srcdoc='<!doctype html>\n<html lang="en">\n<head>\n …

lgp-test-5x6-2#mc-0
lgp-test-6x6-13#mc-2
lgp-test-5x6-37#mc-2
lgp-test-6x6-31#mc-10
lgp-test-4x4-10#mc-15
lgp-test-6x6-24#mc-19
lgp-test-5x4-39#mc-1
lgp-test-2x2-37#mc-1
lgp-test-3x5-11#mc-8
lgp-test-4x3-4#mc-6
lgp-test-2x6-34#mc-3
lgp-test-4x2-1#mc-5
lgp-test-3x6-18#mc-12
lgp-test-6x2-5#mc-1
lgp-test-4x5-30#mc-18
lgp-test-4x5-12#mc-8
lgp-test-2x4-16#mc-7
lgp-test-3x4-29#mc-3
lgp-test-4x3-37#mc-9
lgp-test-6x4-33#mc-20
lgp-test-5x6-29#mc-23
lgp-test-5x5-3#mc-20
lgp-test-5x5-22#mc-22
lgp-test-4x5-24#mc-10
lgp-test-4x5-22#mc-18
lgp-test-6x3-7#mc-15
lgp-test-4x2-34#mc-2
lgp-test-4x6-12#mc-5
lgp-test-5x4-31#mc-7
lgp-test-5x4-9#mc-4
lgp-test-4x6-20#mc-2
lgp-test-2x6-21#mc-1
lgp-test-3x3-34#mc-0
lgp-test-2x2-38#mc-2
lgp-test-2x2-26#mc-1
lgp-test-5x3-37#mc-2
lgp-test-5x4-22#mc-16
lgp-test-2x2-10#mc-2
lgp-test-4x3-6#mc-9
lgp-test-2x2-11#mc-3
lgp-test-6x5-3#mc-0
lgp-test-5x5-0#mc-10
lgp-test-4x5-6#mc-16
lgp-test-3x4-25#mc-8
lgp-test-6x5-20#mc-19
lgp-test-2x6-0#mc-0
lgp-test-6x5-23#mc-27
lgp-test

In [13]:
# Display the evaluated batch: original dataset columns plus the result columns.
df

Unnamed: 0,id,puzzle,question,choices,answer,created_at,correct,first_principles,groundup_reasoning,reflection,reasoning_conclusion,predicted_answer
550,lgp-test-5x5-5#mc-20,"There are 5 houses, numbered 1 to 5 from left ...",What is Name of the person who lives in House 5?,"[Eric, Bob, Peter, Arnold, Alice]",Peter,2024-07-03T21:21:31.348788,False,JSONDecodeError,JSONDecodeError,JSONDecodeError,JSONDecodeError,JSONDecodeError
551,lgp-test-5x6-2#mc-0,"There are 5 houses, numbered 1 to 5 from left ...",What is Name of the person who lives in House 1?,"[Eric, Bob, Peter, Alice, Arnold]",Eric,2024-07-03T21:21:31.354663,False,[House 1 has the person whose birthday is in F...,House 1 has birthday in February (Clue 11). Ho...,The reasoning contains a critical flaw: it res...,The puzzle contains an irreconcilable contradi...,Arnold
552,lgp-test-6x6-13#mc-2,"There are 6 houses, numbered 1 to 6 from left ...",What is HairColor of the person who lives in H...,"[red, brown, gray, black, auburn, blonde]",black,2024-07-03T21:21:31.387783,False,JSONDecodeError,JSONDecodeError,JSONDecodeError,JSONDecodeError,JSONDecodeError
553,lgp-test-5x6-37#mc-2,"There are 5 houses, numbered 1 to 5 from left ...",What is FavoriteSport of the person who lives ...,"[basketball, baseball, soccer, tennis, swimming]",swimming,2024-07-03T21:21:31.361884,False,JSONDecodeError,JSONDecodeError,JSONDecodeError,JSONDecodeError,JSONDecodeError
554,lgp-test-6x6-31#mc-10,"There are 6 houses, numbered 1 to 6 from left ...",What is Animal of the person who lives in Hous...,"[rabbit, dog, cat, horse, bird, fish]",cat,2024-07-03T21:21:31.392797,False,JSONDecodeError,JSONDecodeError,JSONDecodeError,JSONDecodeError,JSONDecodeError
555,lgp-test-4x4-10#mc-15,"There are 4 houses, numbered 1 to 4 from left ...",What is Education of the person who lives in H...,"[bachelor, associate, high school, master]",bachelor,2024-07-03T21:21:31.323370,True,[House 3 has the person with a master's degree...,"Arnold is in House 3 (Clue 5,7). Master's degr...","Initial assignment: House 1: Eric, associate, ...",The person in House 4 has a bachelor's degree.,bachelor
556,lgp-test-6x6-24#mc-19,"There are 6 houses, numbered 1 to 6 from left ...",What is MusicGenre of the person who lives in ...,"[pop, classical, rock, jazz, hip hop, country]",rock,2024-07-03T21:21:31.390934,False,JSONDecodeError,JSONDecodeError,JSONDecodeError,JSONDecodeError,JSONDecodeError
557,lgp-test-5x4-39#mc-1,"There are 5 houses, numbered 1 to 5 from left ...",What is Hobby of the person who lives in House 1?,"[gardening, knitting, cooking, photography, pa...",knitting,2024-07-03T21:21:31.347762,False,[The person who is very short is in the fifth ...,House 3: tall (Clue 13) → grilled cheese (Clue...,The reasoning correctly deduced that Bob is in...,The hobby of the person in House 1 is photogra...,photography
558,lgp-test-2x2-37#mc-1,"There are 2 houses, numbered 1 to 2 from left ...",What is Vacation of the person who lives in Ho...,"[beach, mountain]",mountain,2024-07-03T21:21:31.299192,True,"[There are 2 houses, numbered 1 to 2 from left...",Eric cannot be in House 2 (no house to the rig...,No gaps or mistakes. The logic is sound: Eric ...,The person in House 1 has the mountain vacation.,mountain
559,lgp-test-3x5-11#mc-8,"There are 3 houses, numbered 1 to 3 from left ...",What is Food of the person who lives in House 2?,"[pizza, grilled cheese, spaghetti]",spaghetti,2024-07-03T21:21:31.312410,True,"[House 3 has the person who eats pizza., The D...",From clue 7: Arnold is Swedish. From clue 2: S...,All steps are logically consistent. Key checks...,The food of the person in House 2 is spaghetti.,spaghetti


In [14]:
# Correct / incorrect counts for this batch only. Rows that hit a
# JSONDecodeError are scored as incorrect.
df['correct'].value_counts()

correct
True     32
False    18
Name: count, dtype: int64

In [15]:
# Append this batch to the cumulative results file; the header row is written
# only when the file does not exist yet.
# NOTE: re-running this cell appends the same rows a second time.
path = './zebralogic_eval.csv'
is_new_file = not os.path.exists(path)
df.to_csv(path, mode="a", index=False, header=is_new_file)

In [16]:
# Reload the cumulative results file (every batch appended so far).
dub = pd.read_csv(path)


In [17]:
# Correct / incorrect counts across all rows in the cumulative file. Rows
# recorded with the "JSONDecodeError" sentinel are included in the False count.
dub['correct'].value_counts()

correct
True     449
False    151
Name: count, dtype: int64