# reasoning.py

Auto-generated implementation from the Agentic RL PhD codebase.

### Original Implementations & References
The following links point to the official or high-quality reference implementations for the papers covered in this notebook:

- https://github.com/ysymyth/ReAct (ReAct), https://github.com/princeton-nlp/tree-of-thought-llm (ToT), https://github.com/PeterGriffinJin/Search-R1 (Search-R1)

*Note: The code below is a simplified pedagogical implementation.*

In [None]:
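# Papers:
# 1. "ReAct: Synergizing Reasoning and Acting in Language Models" (Yao et al., 2022)
# 2. "Tree of Thoughts: Deliberate Problem Solving with Large Language Models" (Yao et al., 2023)
# 3. "Search-R1: Training LLMs to Reason and Leverage Search Engines with Reinforcement Learning" (Jin et al., 2025)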

class ReActAgent:
    """
    The classic Thought-Action-Observation loop.

    The agent repeatedly prompts the LLM with the question plus the running
    history. Whenever the reply contains an "Action:" marker, the action is
    parsed, executed through a tool, and the observation is appended to the
    history. The first reply without an "Action:" marker is treated as the
    final answer.

    Assumptions (pedagogical; adapt to your stack):
      * `llm` exposes `generate(prompt) -> str`.
      * `tools` is a mapping from tool name to a one-argument callable.
      * Actions are written as `Action: tool_name[argument]`.
    """
    def __init__(self, llm, tools, max_steps=20):
        """
        Args:
            llm: Object with a `generate(prompt)` method returning text.
            tools: Mapping of tool name -> callable taking the action argument.
            max_steps: Upper bound on LLM calls per `step()` invocation, so a
                model that never stops acting cannot loop forever.

        Raises:
            ValueError: If `max_steps` is smaller than 1.
        """
        if max_steps < 1:
            raise ValueError("max_steps must be at least 1")
        self.llm = llm
        self.tools = tools
        self.max_steps = max_steps
        # Accumulated transcript; it persists across step() calls.
        self.history = ""

    def _build_prompt(self, question):
        """Render the prompt for the next LLM call from question + history."""
        return f"""
        Question: {question}
        History: {self.history}
        Format: Thought -> Action -> Observation
        Next step:
        """

    def parse_action(self, response):
        """
        Extract the first action from an LLM reply.

        Returns:
            A `(tool_name, argument)` tuple. For a line such as
            `Action: search[python]` this is `("search", "python")`. When the
            line has no bracketed argument the whole line is the tool name and
            the argument is an empty string.
        """
        _, _, tail = response.partition("Action:")
        tail_lines = tail.strip().splitlines()
        action_line = tail_lines[0].strip() if tail_lines else ""

        name, bracket, rest = action_line.partition("[")
        if not bracket:
            return action_line, ""
        # Cut at the LAST closing bracket so arguments like "a[0]" survive.
        argument = rest.rpartition("]")[0] if "]" in rest else rest
        return name.strip(), argument.strip()

    def execute(self, action):
        """
        Run a parsed action and return its observation.

        An unknown tool name is reported back as an observation string (rather
        than raised) so the model can see the mistake and pick another tool.
        Exceptions raised by the tool itself propagate to the caller.
        """
        name, argument = action
        tool = self.tools.get(name)
        if tool is None:
            available = ", ".join(str(key) for key in self.tools)
            return f"Unknown tool: {name!r}. Available tools: {available}"
        return tool(argument)

    def step(self, question):
        """
        Run the Thought-Action-Observation loop until a final answer appears.

        Returns:
            The first LLM reply that contains no "Action:" marker. If the
            `max_steps` budget is exhausted first, the last reply is returned
            as a best-effort answer.
        """
        response = ""
        # Iterative on purpose: the recursive form grows the call stack with
        # every action and has no termination guarantee.
        for _ in range(self.max_steps):
            response = self.llm.generate(self._build_prompt(question))
            if "Action:" not in response:
                return response  # Final Answer

            obs = self.execute(self.parse_action(response))
            self.history += f"\n{response}\nObservation: {obs}"
        return response

class TreeOfThoughts:
    """
    Paper: Tree of Thoughts (ToT)
    Innovation: Search (BFS/DFS) over the "thought space".

    This implementation is a breadth-first beam search: at every level each
    frontier node is expanded into proposals, each proposal is scored, weak
    ones are pruned, and only the `breadth` best survive to the next level.

    Assumes `llm.generate(prompt)` returns either text with one proposal per
    line or an iterable of proposals, and `llm.evaluate(prompt)` returns a
    number comparable with 0.5.
    """
    def __init__(self, llm):
        self.llm = llm

    @staticmethod
    def _split_proposals(raw):
        """
        Normalize generator output into a list of proposals.

        A string is split into lines; iterating it directly would yield single
        characters. Blank entries are dropped and string entries are stripped.
        Non-string items are passed through untouched.
        """
        items = raw.splitlines() if isinstance(raw, str) else list(raw)
        proposals = []
        for item in items:
            if isinstance(item, str):
                item = item.strip()
                if not item:
                    continue
            proposals.append(item)
        return proposals

    def solve(self, problem, breadth=3, depth=3):
        """
        Search the thought space and return the best surviving thought.

        Args:
            problem: Root of the search; also passed to every evaluation.
            breadth: Number of proposals requested per node, and the beam
                width (candidates kept per level).
            depth: Number of expansion levels. With `depth <= 0` the problem
                itself is returned unchanged.

        Returns:
            The highest-scoring candidate of the final level, or "Failed" if
            every proposal was pruned at some level.
        """
        candidates = [problem] # Initial nodes
        beam_width = max(breadth, 0)

        for _ in range(depth):
            scored = []
            for node in candidates:
                # 1. Generate: Propose `breadth` next steps
                raw = self.llm.generate(f"Propose {breadth} next steps for: {node}")

                # 2. Evaluate: Score each proposal
                for proposal in self._split_proposals(raw):
                    score = self.llm.evaluate(f"Rate validness of {proposal} for {problem}")
                    if score > 0.5: # Pruning
                        scored.append((score, proposal))

            # 3. Select: keep only the best `breadth` states. Without this cap
            # the frontier (and the LLM call count) grows exponentially.
            # The sort is stable, so ties keep generation order.
            scored.sort(key=lambda pair: pair[0], reverse=True)
            candidates = [proposal for _, proposal in scored[:beam_width]]

            if not candidates:
                break # Nothing survived pruning; deeper levels cannot recover

        return candidates[0] if candidates else "Failed"

class SearchR1:
    """
    Paper: Search-R1 (2025)
    Innovation: RL-trained Reasoning that autonomously calls search.

    Inference-time rollout only: the model generates text, and whenever it
    emits `<search>query</search>` the query is sent to the search tool and
    the result is appended to the context inside `<result>` tags before
    generation continues.

    Assumes `llm.generate(prompt)` returns text and `search_tool` is a
    callable taking the query string.
    """
    def __init__(self, llm, search_tool):
        self.llm = llm
        self.search = search_tool

    def reason(self, query, max_turns=8):
        """
        Interleave generation and search until the model stops searching.

        Args:
            query: The initial prompt/question.
            max_turns: Maximum number of search calls. Guards against a model
                that emits `<search>` forever.

        Returns:
            The first response without a `<search>` tag. If the search budget
            runs out first, the last response is returned as-is and may still
            contain a `<search>` tag.
        """
        # The model is trained to output special tokens like <search>
        context = query
        response = self.llm.generate(context)

        for _ in range(max_turns):
            if "<search>" not in response:
                break

            # 1. Extract Query (first <search> block; if the closing tag is
            # missing, everything after the opening tag is used)
            q = response.split("<search>", 1)[1].split("</search>", 1)[0].strip()

            # 2. Execute
            result = self.search(q)

            # 3. Append & Continue Reasoning
            # Carry the WHOLE trajectory forward (query + every response +
            # every result); otherwise the model loses the original question
            # after the first search. This trajectory is what RL training
            # (e.g. PPO) would optimize over.
            context = f"{context}{response}\n<result>{result}</result>"
            response = self.llm.generate(context)

        return response
