In [1]:
from pydantic import BaseModel, Field
from typing import List
import guidance
import os


In [2]:
# # vllm hosted model using LiteLLM

# litellm_desc = {
#     "model_name": "Qwen/Qwen3-30B-A3B-Instruct-2507",
#     "litellm_params": {  # params for litellm completion/embedding call
#         "model": "hosted_vllm/Qwen/Qwen3-30B-A3B-Instruct-2507",
#         "api_key": os.environ.get("VLLM_API_KEY", "NO_KEY"), # set your vLLM API key if needed
#         "api_base": "http://localhost:8000/v1", # change to your vLLM API base URL
#     },
# }
# base_lm = guidance.models.experimental.LiteLLM(model_description=litellm_desc, echo=True)

In [2]:
# Default model handle used by `generate` when no model is passed explicitly.
# Points at a locally served endpoint; the API key falls back to "NO_KEY" when
# OPENAI_API_KEY is not set in the environment.
base_lm = guidance.models.experimental.SglangModel(
    model="Qwen/Qwen3-30B-A3B-Instruct-2507",
    base_url="http://127.0.0.1:40000/v1",
    api_key=os.environ.get("OPENAI_API_KEY", "NO_KEY"),
    echo=True,
)

In [4]:
# def run_gen_test(lm):
#     with guidance.user():
#         lm += "What is the capital of France? and its population?"
#         lm += "Format your answer as follows: Capital: <capital>, Population: <population>"

#     with guidance.assistant():
#         lm += guidance.gen(max_tokens=1024, temperature=0.7, name="answer")
#         print(lm["answer"])

# run_gen_test(base_lm)

In [3]:
from pydantic import BaseModel, Field
from typing import List
import guidance
from guidance import models, system, user, assistant, json as gen_json, select
import json
from enum import Enum


# Output schema for the first pipeline step (used by `_first_principles` as the
# `schema` of the "first_principles" capture).
# NOTE(review): documented with `#` comments on purpose -- pydantic typically
# copies a class docstring into the generated JSON schema, so adding one here
# would change the schema handed to the generator. Confirm before converting.
class FirstPrinciples(BaseModel):
    # Required field (`...` = no default): the list of facts extracted from the
    # problem statement. `generate` reads it back via the 'absolute_truths' key.
    absolute_truths: List[str] = Field(
        ...,
        description="Look at the problem statement and list the things that you know that are absolute truths."
    )


# Output schema for the second pipeline step (used by
# `_generate_groundup_reasoning` as the `schema` of the "groundup_reasoning"
# capture).
# NOTE(review): the docstring below is left untouched because pydantic
# typically exposes class docstrings in the generated JSON schema.
class GroundUpReasoning(BaseModel):
    """Model for the LLM-generated reasoning plan."""
    # Required free-text field; `generate` reads it back via the 'reason' key.
    reason: str = Field(
        ...,
        description="Based on the known truths reason in a concise sharp manner from ground up to address the problem."
    )

# Output schema for the reflection step (used by `_reflection` as the `schema`
# of the "reflection" capture).
# NOTE(review): the docstring wording ("individual reasoning step execution")
# does not match how this model is used in this notebook. It is left unchanged
# because pydantic typically exposes class docstrings in the generated JSON
# schema, so editing it would alter the schema -- confirm before rewording.
class Reflection(BaseModel):
    """Model for an individual reasoning step execution."""
    # Required free-text field; `generate` reads it back via the 'evaluation' key.
    evaluation: str = Field(
        ...,
        description="Reflect on your reasoning and identify gaps / uncertainties / mistakes."
    )

# Output schema for the conclusion step (used by `_reasoning_conclusion` as the
# `schema` of the "reasoning_conclusion" capture).
class ReasoningConclusion(BaseModel):
    """Model for the final reasoning conclusion."""
    # Required free-text field; `generate` reads it back via the 'conclusion' key.
    conclusion: str = Field(
        ...,
        description="Address the identified gaps in the previous step if any and deduce the final conclusion."
    )

def dynamic_final_answer_schema(Options):
    """
    Build a pydantic model whose single `answer` field is typed as `Options`.

    Args:
        Options: Type used to annotate the `answer` field. The caller in this
            notebook passes an Enum built from the answer choices of a query.

    Returns:
        The newly created `FinalAnswer` model class (a new class per call).
    """
    # Defined inside the function so the field annotation can refer to the
    # `Options` argument of this particular call.
    class FinalAnswer(BaseModel):
        answer: Options = Field(..., description="Final answer.")

    return FinalAnswer


@guidance()
def _first_principles(llm):
    """
    Append an assistant turn that generates the "first_principles" capture.

    The generation uses the FirstPrinciples schema and is capped at 5000 tokens.
    Returns the updated model state.
    """
    with assistant():
        truths_node = gen_json(
            schema=FirstPrinciples,
            name="first_principles",
            max_tokens=5000,
        )
        llm += truths_node
    return llm

@guidance()
def _generate_groundup_reasoning(llm):
    """
    Append an assistant turn that generates the "groundup_reasoning" capture.

    The generation uses the GroundUpReasoning schema. Returns the updated
    model state.
    """
    with assistant():
        # Token cap is the same 5000 used by the sibling reasoning steps.
        reasoning_node = gen_json(
            schema=GroundUpReasoning,
            name="groundup_reasoning",
            max_tokens=5000,
        )
        llm += reasoning_node
    return llm

@guidance()
def _reflection(llm):
    """
    Append an assistant turn that generates the "reflection" capture.

    The generation uses the Reflection schema and is capped at 5000 tokens.
    Returns the updated model state.
    """
    with assistant():
        reflection_node = gen_json(
            schema=Reflection,
            name="reflection",
            max_tokens=5000,
        )
        llm += reflection_node
    return llm
    

@guidance()
def _reasoning_conclusion(llm):
    """
    Append a fixed user instruction followed by an assistant turn that
    generates the "reasoning_conclusion" capture (ReasoningConclusion schema,
    capped at 5000 tokens). Returns the updated model state.
    """
    # NOTE(review): unlike the sibling step helpers, this one adds its own user
    # message. `dynamic_structured_reasoning` also adds a user message right
    # before calling it, so the transcript gets two consecutive user turns here.
    with user():
        llm += """Based on the information, make logical deductions."""

    with assistant():
        conclusion_node = gen_json(
            schema=ReasoningConclusion,
            name="reasoning_conclusion",
            max_tokens=5000,
        )
        llm += conclusion_node

    return llm

@guidance
def dynamic_structured_reasoning(llm, query, output_choices):
    """
    Run the full multi-turn reasoning chain for one query.

    The chain is: system prompt -> first principles -> ground-up reasoning ->
    reflection -> conclusion -> final answer. Each step is a user prompt
    followed by a JSON generation stored under a named capture.

    Args:
        llm: Model state the turns are appended to.
        query: Problem text; interpolated into the first user prompt.
        output_choices: Iterable of answer options. Each option is used as both
            the member name and the member value of the Enum that types the
            final answer.

    Returns:
        The updated model state, carrying the captures "first_principles",
        "groundup_reasoning", "reflection", "reasoning_conclusion" and "answer".

    NOTE(review): the prompt literals below are deliberately left exactly as
    written (including their f-prefixes and indentation). The guidance
    decorator is understood to post-process multi-line strings based on source
    layout -- confirm before reformatting them.
    """
    # Build a per-query Enum so the final answer is typed as one of the choices.
    Options = Enum("Choices", {choice: choice for choice in output_choices})
    final_answer_schema = dynamic_final_answer_schema(Options)

    with system():
        llm += """You are an expert in problem solving. For any query, reason from first principles to solve the query. 
        Structure all outputs as JSON. Be very brief in your steps, tending towards concise but complete responses.
        """

    # Step 1: state the query and ask for the known facts.
    with user():
        llm += f"""Query: {query}
        Look at the problem statement and list the things that you know that are absolute truths -
        """

    llm += _first_principles()

    # Step 2: reason from those facts.
    with user():
        llm += f"""
        Based on the known truths reason from ground up to address the problem, be extremely brief and to the point in your reasoning -
        """

    llm += _generate_groundup_reasoning()

    # Step 3: self-critique of the reasoning.
    with user():
        llm += f"""
        Reflect to identify gaps / uncertainties / mistakes in reasoning process -
        """

    llm += _reflection()

    # Step 4: conclusion. Note that `_reasoning_conclusion` adds a user message
    # of its own, so this prompt is followed by a second user turn.
    with user():
        llm += f"""
        Make logical deductions based on the information you have to arrive at a conclusion -
        """

    llm += _reasoning_conclusion()

    # Step 5: final answer, typed by the per-query Enum schema built above.
    with user():
        llm += f"""
        Your final answer -
        """

    with assistant():
        llm += gen_json(
            name="answer",
            schema=final_answer_schema,
            max_tokens=1000)

    return llm

In [4]:
def generate(query, output_choices, lm = None):
    """
    Run the structured-reasoning chain for one query and parse its captures.

    Args:
        query: Problem text passed to `dynamic_structured_reasoning`.
        output_choices: Allowed final answers passed to
            `dynamic_structured_reasoning`.
        lm: Model to run on. When None (the default) the module-level
            `base_lm` is used.

    Returns:
        Tuple `(first_principles, groundup_reasoning, reflection,
        reasoning_conclusion, answer)` holding the decoded value of each
        capture's single field.

    Raises:
        json.JSONDecodeError: If a captured generation is not valid JSON.
    """
    # Compare against None explicitly. `lm or base_lm` relies on truthiness, and
    # an object that defines `__len__` (as guidance models appear to) and is
    # currently empty would be silently replaced by `base_lm`.
    if lm is None:
        lm = base_lm

    result_lm = lm + dynamic_structured_reasoning(
        query=query,
        output_choices=output_choices,
    )

    def _capture_field(capture_name, key):
        # Each capture holds a JSON object; pull out the one field we need.
        return json.loads(result_lm.get(capture_name))[key]

    answer = _capture_field("answer", "answer")
    groundup_reasoning = _capture_field("groundup_reasoning", "reason")
    first_principles = _capture_field("first_principles", "absolute_truths")
    reflection = _capture_field("reflection", "evaluation")
    reasoning_conclusion = _capture_field("reasoning_conclusion", "conclusion")
    return first_principles, groundup_reasoning, reflection, reasoning_conclusion, answer


In [5]:
from datasets import load_dataset

# Load the ZebraLogic dataset (multiple-choice configuration) from Hugging Face
dataset = load_dataset("WildEval/ZebraLogic", name='mc_mode')



In [6]:
from json import JSONDecodeError


def evaluate_reasoning(row):
    """
    Run the reasoning pipeline on a single dataset row and score the result.

    Args:
        row: Mapping-like row providing 'puzzle', 'question', 'choices',
            'answer' and 'id'.

    Returns:
        Dict with keys 'correct', 'first_principles', 'groundup_reasoning',
        'reflection', 'reasoning_conclusion' and 'predicted_answer'. If the
        pipeline output cannot be decoded as JSON, every pipeline field is the
        string "JSONDecodeError" and 'correct' is False unless the target
        itself equals that string.
    """
    # Puzzle text and question are joined with a blank line to form the query.
    query = row['puzzle'] + '\n\n' + row['question']
    choices = row['choices']
    target = row['answer']
    print(row['id'])  # progress trace: one id per evaluated row

    try:
        truths, reasoning, critique, conclusion, pred = generate(query, choices)
    except JSONDecodeError:
        # Best effort: record the failure in every field instead of aborting the run.
        truths = reasoning = critique = conclusion = pred = "JSONDecodeError"

    outcome = {'correct': pred == target}
    outcome['first_principles'] = truths
    outcome['groundup_reasoning'] = reasoning
    outcome['reflection'] = critique
    outcome['reasoning_conclusion'] = conclusion
    outcome['predicted_answer'] = pred
    return outcome


In [7]:
# from concurrent.futures import ThreadPoolExecutor, as_completed
# import pandas as pd

# # df is already df.iloc[600:650]  (keeps original index)
# def run_parallel_preserve_index(df, workers=16):
#     idx_to_row = df.to_dict(orient="index")  # {index: rowdict}
#     results = {}

#     with ThreadPoolExecutor(max_workers=workers) as ex:
#         futs = {ex.submit(evaluate_reasoning, row): idx
#                 for idx, row in idx_to_row.items()}
#         for fut in as_completed(futs):
#             idx = futs[fut]
#             results[idx] = fut.result()

#     res_df = pd.DataFrame.from_dict(results, orient="index")
#     # ensure order matches df (and keep original index)
#     res_df = res_df.reindex(df.index)
#     return df.join(res_df)

# df = pd.DataFrame(dataset['test'])
# # df.head()
# # CURRENTLY FINISHED - [:600]

# df = df.iloc[600:605]
# df = run_parallel_preserve_index(df, workers=16)


In [8]:
# import pandas as pd
# df = pd.DataFrame(dataset['test'])
# len(df)

In [None]:
import pandas as pd

# Progress note: rows [:2000] of the test split have already been evaluated,
# so this run takes the next slice.
df = pd.DataFrame(dataset['test']).iloc[2000:2250]

# One pipeline call per row; each returned dict becomes one row of `res`,
# which is then placed side by side with the input columns.
res = df.apply(lambda row: pd.Series(evaluate_reasoning(row)), axis=1)
df = pd.concat([df, res], axis=1)

lgp-test-2x4-22#mc-7


StitchWidget(initial_height='auto', initial_width='100%', srcdoc='<!doctype html>\n<html lang="en">\n<head>\n …

lgp-test-6x2-33#mc-2
lgp-test-6x5-9#mc-20
lgp-test-6x4-33#mc-14
lgp-test-2x6-14#mc-11
lgp-test-4x4-2#mc-0
lgp-test-5x4-10#mc-0
lgp-test-5x5-16#mc-3
lgp-test-6x5-8#mc-1
lgp-test-4x6-5#mc-9
lgp-test-5x6-11#mc-3
lgp-test-4x2-30#mc-7
lgp-test-5x3-17#mc-7
lgp-test-4x6-35#mc-4
lgp-test-6x4-32#mc-22
lgp-test-6x3-6#mc-8
lgp-test-3x5-29#mc-2
lgp-test-6x5-20#mc-11
lgp-test-6x4-0#mc-20
lgp-test-5x5-13#mc-9
lgp-test-6x5-29#mc-21
lgp-test-6x6-32#mc-25
lgp-test-3x6-32#mc-13
lgp-test-4x5-15#mc-14
lgp-test-4x5-19#mc-7
lgp-test-4x3-20#mc-2
lgp-test-2x6-26#mc-11
lgp-test-6x5-13#mc-28
lgp-test-6x6-5#mc-34
lgp-test-2x2-34#mc-0
lgp-test-6x6-20#mc-7
lgp-test-4x4-7#mc-12
lgp-test-5x6-32#mc-3
lgp-test-6x3-4#mc-8
lgp-test-5x5-29#mc-23
lgp-test-5x5-3#mc-22
lgp-test-6x6-11#mc-16
lgp-test-6x6-11#mc-1
lgp-test-3x4-33#mc-6
lgp-test-5x6-36#mc-8
lgp-test-5x6-0#mc-21
lgp-test-5x5-39#mc-15
lgp-test-3x4-1#mc-7
lgp-test-2x2-23#mc-2
lgp-test-4x5-34#mc-11
lgp-test-5x5-38#mc-7
lgp-test-3x2-14#mc-5
lgp-test-3x6-26#mc-12
lgp-

In [14]:
df

Unnamed: 0,id,puzzle,question,choices,answer,created_at,correct,first_principles,groundup_reasoning,reflection,reasoning_conclusion,predicted_answer
1851,lgp-test-2x4-22#mc-7,"There are 2 houses, numbered 1 to 2 from left ...",What is Children of the person who lives in Ho...,"[Bella, Fred]",Bella,2024-07-03T21:21:31.301321,True,"[There are 2 houses, numbered 1 (left) and 2 (...",Arnold has a cat (clue 3). The cat owner is di...,No gaps or mistakes. Reasoning correctly uses ...,The child in House 2 is Bella.,Bella
1852,lgp-test-6x2-33#mc-2,"There are 6 houses, numbered 1 to 6 from left ...",What is Name of the person who lives in House 2?,"[Alice, Arnold, Eric, Peter, Bob, Carol]",Eric,2024-07-03T21:21:31.364581,True,"[Bob is in the fifth house. (Clue 9), The pers...",Bob is in house 5 (clue 9). Bob loves lilies (...,The reasoning correctly deduced Bob in house 5...,The person in House 2 is Eric.,Eric
1853,lgp-test-6x5-9#mc-20,"There are 6 houses, numbered 1 to 6 from left ...",What is Name of the person who lives in House 5?,"[Alice, Carol, Arnold, Bob, Eric, Peter]",Peter,2024-07-03T21:21:31.377625,True,"[Bob is in the first house., Alice is very tal...",House 5: Desert smoothie (clue 19). Alice is v...,The contradiction arose because Bob (very shor...,The person who lives in House 5 is Peter.,Peter
1854,lgp-test-6x4-33#mc-14,"There are 6 houses, numbered 1 to 6 from left ...",What is Nationality of the person who lives in...,"[chinese, german, swede, brit, dane, norwegian]",swede,2024-07-03T21:21:31.374609,False,JSONDecodeError,JSONDecodeError,JSONDecodeError,JSONDecodeError,JSONDecodeError
1855,lgp-test-2x6-14#mc-11,"There are 2 houses, numbered 1 to 2 from left ...",What is Mother of the person who lives in Hous...,"[Holly, Aniya]",Aniya,2024-07-03T21:21:31.304923,True,[House 2 is occupied by the person who is shor...,House 2 is short → House 1 is very short. Eric...,All deductions follow logically from clues. No...,The mother of the person in House 2 is Aniya.,Aniya
...,...,...,...,...,...,...,...,...,...,...,...,...
1995,lgp-test-2x6-2#mc-6,"There are 2 houses, numbered 1 to 2 from left ...",What is Name of the person who lives in House 2?,"[Arnold, Eric]",Arnold,2024-07-03T21:21:31.304179,True,"[House 1 is to the left of House 2., There are...","Eric is in House 1 (clue 1), so Arnold is in H...",All steps follow logically from clues. No gaps...,The person in House 2 is Arnold.,Arnold
1996,lgp-test-2x6-13#mc-11,"There are 2 houses, numbered 1 to 2 from left ...",What is Pet of the person who lives in House 2?,"[dog, cat]",dog,2024-07-03T21:21:31.304868,True,[House 2 has the person with a high school dip...,"House 2 has high school diploma (clue 2), so r...",No gaps or mistakes. All deductions are logica...,The pet of the person in House 2 is a dog.,dog
1997,lgp-test-6x5-14#mc-4,"There are 6 houses, numbered 1 to 6 from left ...",What is Occupation of the person who lives in ...,"[doctor, nurse, teacher, artist, engineer, law...",nurse,2024-07-03T21:21:31.378575,True,"[Arnold is in the fourth house. (Clue 6), The ...","Arnold is in house 4 (Clue 6), so house 4: Nam...",The reasoning is consistent and logically stru...,The person in House 1 has the occupation of nu...,nurse
1998,lgp-test-3x5-17#mc-13,"There are 3 houses, numbered 1 to 3 from left ...",What is HouseStyle of the person who lives in ...,"[ranch, victorian, colonial]",victorian,2024-07-03T21:21:31.312894,True,"[House 2 is occupied by Arnold (Clue 7)., The ...",From Clue 6 and Clue 2: ranch must be in House...,The reasoning correctly deduces that ranch mus...,House 3 has a Victorian-style house.,victorian


In [15]:
df['correct'].value_counts()

correct
True     109
False     40
Name: count, dtype: int64

In [16]:
# Append this batch to the cumulative results file. The header row is written
# only when the file does not exist yet. Because the file is opened in append
# mode, re-running this cell adds the same rows again.
path = './zebralogic_eval.csv'
df.to_csv(path, index=False, mode="a", header=not os.path.exists(path))

In [17]:
dub = pd.read_csv(path)


In [18]:
dub['correct'].value_counts()

correct
True     1457
False     543
Name: count, dtype: int64

In [24]:
dub

Unnamed: 0,id,puzzle,question,choices,answer,created_at,correct,first_principles,groundup_reasoning,reflection,reasoning_conclusion,predicted_answer
0,lgp-test-6x4-37#mc-16,"There are 6 houses, numbered 1 to 6 from left ...",What is Name of the person who lives in House 5?,"['Eric', 'Bob', 'Alice', 'Peter', 'Carol', 'Ar...",Bob,2024-07-03T21:21:31.375234,False,JSONDecodeError,JSONDecodeError,JSONDecodeError,JSONDecodeError,JSONDecodeError
1,lgp-test-3x6-15#mc-13,"There are 3 houses, numbered 1 to 3 from left ...",What is Education of the person who lives in H...,"['associate', 'high school', 'bachelor']",associate,2024-07-03T21:21:31.316147,True,"['Peter is in House 1.', 'Eric is in House 3.'...",Peter (House 1) has bachelor's degree → House ...,Initial reasoning correctly deduces: House 1: ...,House 3 has education: associate.,associate
2,lgp-test-4x6-24#mc-9,"There are 4 houses, numbered 1 to 4 from left ...",What is BookGenre of the person who lives in H...,"['science fiction', 'mystery', 'romance', 'fan...",mystery,2024-07-03T21:21:31.335059,True,"['House 1 has the bird keeper (Clue 2).', 'The...","House 1: red hair (Clue 6), bird (Clue 2). Hou...",Initial reasoning incorrectly assumed black ha...,The BookGenre of the person in House 2 is myst...,mystery
3,lgp-test-5x5-18#mc-0,"There are 5 houses, numbered 1 to 5 from left ...",What is Name of the person who lives in House 1?,"['Peter', 'Bob', 'Arnold', 'Eric', 'Alice']",Eric,2024-07-03T21:21:31.350746,True,['House 1 is occupied by the teacher (Clue 8)....,House 1 is the teacher (Clue 8). The only name...,"The reasoning correctly eliminates Bob, Alice,...",The person in House 1 is Eric.,Eric
4,lgp-test-4x2-35#mc-0,"There are 4 houses, numbered 1 to 4 from left ...",What is Name of the person who lives in House 1?,"['Eric', 'Arnold', 'Alice', 'Peter']",Peter,2024-07-03T21:21:31.319814,True,"['Eric is in the second house.', 'The Swedish ...","Alice is in house 3 (clue 5), so she is Britis...",The reasoning assumes Arnold is not in house 1...,The person in House 1 is Peter.,Peter
...,...,...,...,...,...,...,...,...,...,...,...,...
1595,lgp-test-6x2-19#mc-1,"There are 6 houses, numbered 1 to 6 from left ...",What is Food of the person who lives in House 1?,"['stir fry', 'spaghetti', 'grilled cheese', 'p...",stew,2024-07-03T21:21:31.363714,True,"['House 1: Bob', 'House 3: Alice', 'House 4: A...",Bob is in House 1 (Clue 5). Carol loves stir f...,Initial reasoning assumed stir fry cannot be i...,"House 1: Bob, Food: stew",stew
1596,lgp-test-3x4-32#mc-2,"There are 3 houses, numbered 1 to 3 from left ...",What is PhoneModel of the person who lives in ...,"['samsung galaxy s21', 'iphone 13', 'google pi...",google pixel 6,2024-07-03T21:21:31.311068,True,"['Peter uses an iPhone 13 (from clue 1).', ""Pe...",Peter uses iPhone 13 and has Sept birthday. Er...,Initial reasoning correctly identified constra...,The person in House 1 uses the Google Pixel 6.,google pixel 6
1597,lgp-test-5x4-39#mc-7,"There are 5 houses, numbered 1 to 5 from left ...",What is Food of the person who lives in House 2?,"['spaghetti', 'grilled cheese', 'stew', 'stir ...",stew,2024-07-03T21:21:31.347796,False,['The person who is very short is in the fifth...,House 3: tall (Clue 13). House 3: grilled chee...,Initial error: Assumed Bob must be in 4 after ...,House 2's food is cooking.,grilled cheese
1598,lgp-test-5x4-16#mc-10,"There are 5 houses, numbered 1 to 5 from left ...",What is Cigar of the person who lives in House 3?,"['pall mall', 'blue master', 'dunhill', 'blend...",blends,2024-07-03T21:21:31.345104,True,['House 4 has the Pall Mall smoker (Clue 10).'...,"Bob ∈ {2,3,4}, Blue Master smoker is left of B...",The reasoning process is logically consistent ...,The cigar in House 3 is blends.,blends
