diff --git a/optillm/__init__.py b/optillm/__init__.py index c4312267..f16b2ace 100644 --- a/optillm/__init__.py +++ b/optillm/__init__.py @@ -2,7 +2,7 @@ import os # Version information -__version__ = "0.1.12" +__version__ = "0.1.13" # Get the path to the root optillm.py spec = util.spec_from_file_location( diff --git a/optillm/plugins/deepthink/README.md b/optillm/plugins/deepthink/README.md new file mode 100644 index 00000000..7c284c63 --- /dev/null +++ b/optillm/plugins/deepthink/README.md @@ -0,0 +1,136 @@ +# Deep Think Plugin + +## Overview + +The Deep Think plugin combines two powerful approaches for enhanced reasoning in large language models: + +1. **SELF-DISCOVER Framework**: A method where LLMs self-discover task-intrinsic reasoning structures by selecting, adapting, and implementing atomic reasoning modules into a coherent reasoning plan. + +2. **Uncertainty-Routed Chain-of-Thought**: An approach that generates multiple chain-of-thought samples, evaluates confidence through consistency, and routes to either majority voting (high confidence) or greedy decoding (low confidence). + +## Key Features + +- **Adaptive Reasoning Structure**: Automatically discovers the best reasoning approach for each specific task +- **Confidence-Based Routing**: Uses uncertainty estimation to decide between multiple samples or single greedy output +- **Reasoning Model Support**: Designed for models that produce structured thinking in `<think>` tags +- **Multiple Sampling**: Generates multiple reasoning paths and selects the most reliable one + +## How It Works + +### Stage 1: SELF-DISCOVER Reasoning Structure + +1. **SELECT**: From 39 atomic reasoning modules, select those most relevant for the task +2. **ADAPT**: Rephrase selected modules to be task-specific +3. **IMPLEMENT**: Create a structured JSON reasoning plan + +### Stage 2: Uncertainty-Routed Generation + +1. **Multiple Sampling**: Generate n samples (default: 3) using the discovered structure +2. **Confidence Evaluation**: Assess consistency across samples +3. 
**Route Decision**: + - High confidence → Use majority vote + - Low confidence → Use greedy sample (temperature=0) + +## Usage + +```python +# Via optillm model prefix +model = "deepthink-your-model-name" + +# Via optillm_approach in request +{ + "model": "your-model-name", + "optillm_approach": "deepthink", + "messages": [...], + "deepthink_samples": 3, # Number of samples for uncertainty routing + "confidence_threshold": 0.7, # Threshold for majority vs greedy routing + "max_tokens": 16382, # Extended context for reasoning + "temperature": 0.7, # Default temperature for sampling + "top_p": 0.95 # Default top_p for sampling +} +``` + +## Configuration Parameters + +- `deepthink_samples` (int, default=3): Number of reasoning samples to generate +- `confidence_threshold` (float, default=0.7): Confidence threshold for routing decision +- `max_tokens` (int, default=16382): Maximum tokens for generation +- `temperature` (float, default=0.7): Sampling temperature +- `top_p` (float, default=0.95): Top-p sampling parameter +- `enable_self_discover` (bool, default=True): Whether to use SELF-DISCOVER structure +- `reasoning_modules_limit` (int, default=7): Max reasoning modules to select + +## Atomic Reasoning Modules + +The plugin includes 39 reasoning modules covering: +- Critical thinking and analysis +- Creative and innovative approaches +- Systems thinking and holistic analysis +- Risk assessment and evaluation +- Step-by-step decomposition +- Collaborative and perspective-taking approaches +- Reflective and meta-cognitive strategies + +## Examples + +### Mathematical Problem Solving +Input: "Solve: If a train travels 120 miles in 2 hours, how long will it take to travel 300 miles?" + +The plugin will: +1. Discover a reasoning structure focused on rate calculations +2. Generate multiple solution paths +3. Evaluate consistency and select the most reliable answer + +### Complex Reasoning Task +Input: "Analyze the potential long-term economic impacts of remote work adoption" + +The plugin will: +1. Select reasoning modules like systems thinking, risk analysis, and critical thinking +2. Create a structured analysis plan +3. Generate multiple perspectives and synthesize the most coherent analysis + +## Implementation Details + +- **Reasoning Extraction**: Automatically extracts content from `<think>` tags +- **Consistency Scoring**: Uses multiple metrics including answer similarity and reasoning coherence +- **Adaptive Thresholds**: Can be fine-tuned based on model performance +- **Token Efficiency**: Optimized to minimize redundant computation while maximizing reasoning quality + +## Performance + +The Deep Think approach has shown significant improvements on complex reasoning tasks, with particularly strong results on mathematical competition problems. 
+ +### AIME 2025 Results + +| Model | Approach | Accuracy | Improvement | +|-------|----------|----------|-------------| +| qwen-3-32b | Baseline | 43.33% | - | +| qwen-3-32b | Deep Think | **63.33%** | **+20.00pp** | + +*Experimental settings: max_completion_tokens=16382, temperature=0.7, top_p=0.95* + +**Key Findings:** +- **46% relative improvement** over baseline on mathematical reasoning +- **Cerebras inference** was crucial for enabling high inference-time compute without latency penalty +- The combination of SELF-DISCOVER structure discovery and uncertainty-routed sampling proved particularly effective for competition mathematics +- Enhanced accuracy on multi-step problems requiring systematic reasoning + +### Other Improvements + +The Deep Think approach has also demonstrated: +- Enhanced accuracy on multi-step problems +- Better handling of ambiguous or open-ended questions +- Improved consistency across different problem types +- Reduced hallucination through confidence-based routing + +## Limitations + +- Increased computational cost due to multiple sampling +- Longer response times for complex reasoning tasks +- Requires models capable of structured thinking output +- May over-engineer solutions for simple problems + +## References + +- Zhou, P. et al. "SELF-DISCOVER: Large Language Models Self-Compose Reasoning Structures" (2024) +- Uncertainty-routed chain-of-thought approaches in advanced reasoning systems diff --git a/optillm/plugins/deepthink/__init__.py b/optillm/plugins/deepthink/__init__.py new file mode 100644 index 00000000..cf0c221e --- /dev/null +++ b/optillm/plugins/deepthink/__init__.py @@ -0,0 +1,6 @@ +""" +Deep Think Plugin for OptILM + +A plugin that combines SELF-DISCOVER framework with uncertainty-routed +chain-of-thought for enhanced reasoning capabilities. +""" \ No newline at end of file diff --git a/optillm/plugins/deepthink/reasoning_modules.py b/optillm/plugins/deepthink/reasoning_modules.py new file mode 100644 index 00000000..007893b5 --- /dev/null +++ b/optillm/plugins/deepthink/reasoning_modules.py @@ -0,0 +1,234 @@ +""" +Atomic Reasoning Modules for SELF-DISCOVER Framework + +This module contains the 39 reasoning modules as described in the SELF-DISCOVER paper. +These modules represent high-level cognitive heuristics for problem-solving. +""" + +# 39 Atomic Reasoning Modules from SELF-DISCOVER paper +REASONING_MODULES = [ + { + "id": 1, + "name": "experimental_design", + "description": "How could I devise an experiment to help solve that problem?" + }, + { + "id": 2, + "name": "iterative_problem_solving", + "description": "Make a list of ideas for solving this problem, and apply them one by one to the problem to see if any progress can be made." + }, + { + "id": 3, + "name": "progress_measurement", + "description": "How could I measure progress on this problem?" + }, + { + "id": 4, + "name": "problem_simplification", + "description": "How can I simplify the problem so that it is easier to solve?" + }, + { + "id": 5, + "name": "assumption_analysis", + "description": "What are the key assumptions underlying this problem?" + }, + { + "id": 6, + "name": "risk_assessment", + "description": "What are the potential risks and drawbacks of each solution?" + }, + { + "id": 7, + "name": "perspective_analysis", + "description": "What are the alternative perspectives or viewpoints on this problem?" + }, + { + "id": 8, + "name": "long_term_implications", + "description": "What are the long-term implications of this problem and its solutions?" 
+ }, + { + "id": 9, + "name": "problem_decomposition", + "description": "How can I break down this problem into smaller, more manageable parts?" + }, + { + "id": 10, + "name": "critical_thinking", + "description": "Critical Thinking: This style involves analyzing the problem from different perspectives, questioning assumptions, and evaluating the evidence or information available. It focuses on logical reasoning, evidence-based decision-making, and identifying potential biases or flaws in thinking." + }, + { + "id": 11, + "name": "creative_thinking", + "description": "Try creative thinking, generate innovative and out-of-the-box ideas to solve the problem. Explore unconventional solutions, thinking beyond traditional boundaries, and encouraging imagination and originality." + }, + { + "id": 12, + "name": "collaborative_thinking", + "description": "Seek input and collaboration from others to solve the problem. Emphasize teamwork, open communication, and leveraging the diverse perspectives and expertise of a group to come up with effective solutions." + }, + { + "id": 13, + "name": "systems_thinking", + "description": "Use systems thinking: Consider the problem as part of a larger system and understanding the interconnectedness of various elements. Focus on identifying the underlying causes, feedback loops, and interdependencies that influence the problem, and developing holistic solutions that address the system as a whole." + }, + { + "id": 14, + "name": "risk_analysis", + "description": "Use Risk Analysis: Evaluate potential risks, uncertainties, and tradeoffs associated with different solutions or approaches to a problem. Emphasize assessing the potential consequences and likelihood of success or failure, and making informed decisions based on a balanced analysis of risks and benefits." + }, + { + "id": 15, + "name": "reflective_thinking", + "description": "Use Reflective Thinking: Step back from the problem, take the time for introspection and self-reflection. Examine personal biases, assumptions, and mental models that may influence problem-solving, and being open to learning from past experiences to improve future approaches." + }, + { + "id": 16, + "name": "core_issue_identification", + "description": "What is the core issue or problem that needs to be addressed?" + }, + { + "id": 17, + "name": "causal_analysis", + "description": "What are the underlying causes or factors contributing to the problem?" + }, + { + "id": 18, + "name": "historical_analysis", + "description": "Are there any potential solutions or strategies that have been tried before? If yes, what were the outcomes and lessons learned?" + }, + { + "id": 19, + "name": "obstacle_identification", + "description": "What are the potential obstacles or challenges that might arise in solving this problem?" + }, + { + "id": 20, + "name": "data_analysis", + "description": "Are there any relevant data or information that can provide insights into the problem? If yes, what data sources are available, and how can they be analyzed?" + }, + { + "id": 21, + "name": "stakeholder_analysis", + "description": "Are there any stakeholders or individuals who are directly affected by the problem? What are their perspectives and needs?" + }, + { + "id": 22, + "name": "resource_analysis", + "description": "What resources (financial, human, technological, etc.) are needed to tackle the problem effectively?" 
+ }, + { + "id": 23, + "name": "success_metrics", + "description": "How can progress or success in solving the problem be measured or evaluated?" + }, + { + "id": 24, + "name": "metric_identification", + "description": "What indicators or metrics can be used?" + }, + { + "id": 25, + "name": "problem_type_technical", + "description": "Is the problem a technical or practical one that requires a specific expertise or skill set? Or is it more of a conceptual or theoretical problem?" + }, + { + "id": 26, + "name": "physical_constraints", + "description": "Does the problem involve a physical constraint, such as limited resources, infrastructure, or space?" + }, + { + "id": 27, + "name": "behavioral_aspects", + "description": "Is the problem related to human behavior, such as a social, cultural, or psychological issue?" + }, + { + "id": 28, + "name": "decision_making", + "description": "Does the problem involve decision-making or planning, where choices need to be made under uncertainty or with competing objectives?" + }, + { + "id": 29, + "name": "analytical_problem", + "description": "Is the problem an analytical one that requires data analysis, modeling, or optimization techniques?" + }, + { + "id": 30, + "name": "design_challenge", + "description": "Is the problem a design challenge that requires creative solutions and innovation?" + }, + { + "id": 31, + "name": "systemic_issues", + "description": "Does the problem require addressing systemic or structural issues rather than just individual instances?" + }, + { + "id": 32, + "name": "time_sensitivity", + "description": "Is the problem time-sensitive or urgent, requiring immediate attention and action?" + }, + { + "id": 33, + "name": "typical_solutions", + "description": "What kinds of solution typically are produced for this kind of problem specification?" + }, + { + "id": 34, + "name": "alternative_solutions", + "description": "Given the problem specification and the current best solution, have a guess about other possible solutions." + }, + { + "id": 35, + "name": "radical_rethinking", + "description": "Let's imagine the current best solution is totally wrong, what other ways are there to think about the problem specification?" + }, + { + "id": 36, + "name": "solution_modification", + "description": "What is the best way to modify this current best solution, given what you know about these kinds of problem specification?" + }, + { + "id": 37, + "name": "novel_solution", + "description": "Ignoring the current best solution, create an entirely new solution to the problem." + }, + { + "id": 38, + "name": "step_by_step", + "description": "Let's think step by step." + }, + { + "id": 39, + "name": "step_by_step_plan", + "description": "Let's make a step by step plan and implement it with good notion and explanation." 
+ } +] + +def get_all_modules(): + """Return all 39 reasoning modules.""" + return REASONING_MODULES + +def get_modules_by_category(): + """Categorize modules by their primary focus.""" + categories = { + "analytical": [1, 3, 5, 10, 14, 17, 20, 23, 24, 25, 29], + "creative": [2, 4, 11, 30, 34, 35, 37], + "systematic": [9, 13, 16, 18, 22, 31, 33, 36, 38, 39], + "collaborative": [7, 12, 15, 21], + "risk_oriented": [6, 8, 14, 19], + "behavioral": [27, 28], + "constraint_focused": [26, 32] + } + + return { + category: [REASONING_MODULES[i-1] for i in indices] + for category, indices in categories.items() + } + +def get_modules_by_ids(module_ids): + """Get specific modules by their IDs.""" + return [module for module in REASONING_MODULES if module["id"] in module_ids] + +def get_module_descriptions(): + """Get just the descriptions for prompting.""" + return [f"{module['name']}: {module['description']}" for module in REASONING_MODULES] diff --git a/optillm/plugins/deepthink/self_discover.py b/optillm/plugins/deepthink/self_discover.py new file mode 100644 index 00000000..71426241 --- /dev/null +++ b/optillm/plugins/deepthink/self_discover.py @@ -0,0 +1,392 @@ +""" +SELF-DISCOVER Framework Implementation + +This module implements the SELF-DISCOVER framework for automatically discovering +task-intrinsic reasoning structures. +""" + +import json +import logging +import re +from typing import List, Dict, Any, Tuple +from .reasoning_modules import get_all_modules, get_module_descriptions + +logger = logging.getLogger(__name__) + +class SelfDiscover: + """ + Implementation of the SELF-DISCOVER framework. + + The framework operates in two stages: + 1. Stage 1: Discover task-specific reasoning structure (SELECT, ADAPT, IMPLEMENT) + 2. Stage 2: Use discovered structure to solve problem instances + """ + + def __init__(self, client, model: str, max_tokens: int = 16382): + self.client = client + self.model = model + self.max_tokens = max_tokens + self.reasoning_modules = get_all_modules() + self.completion_tokens = 0 + + def discover_reasoning_structure(self, task_description: str, task_examples: List[str] = None) -> Dict[str, Any]: + """ + Stage 1: Discover reasoning structure for the given task. + + Args: + task_description: Description of the task type + task_examples: Optional examples of the task (without labels) + + Returns: + Dict containing the discovered reasoning structure + """ + logger.info("Starting SELF-DISCOVER reasoning structure discovery") + + # Step 1: SELECT relevant reasoning modules + selected_modules = self._select_modules(task_description, task_examples) + logger.info(f"Selected {len(selected_modules)} reasoning modules") + + # Step 2: ADAPT modules to be task-specific + adapted_modules = self._adapt_modules(selected_modules, task_description, task_examples) + logger.info("Adapted modules to be task-specific") + + # Step 3: IMPLEMENT structured reasoning plan + reasoning_structure = self._implement_structure(adapted_modules, task_description, task_examples) + logger.info("Implemented reasoning structure") + + return { + "selected_modules": selected_modules, + "adapted_modules": adapted_modules, + "reasoning_structure": reasoning_structure, + "completion_tokens": self.completion_tokens + } + + def _select_modules(self, task_description: str, task_examples: List[str] = None) -> List[Dict[str, Any]]: + """SELECT: Choose relevant reasoning modules for the task.""" + + module_descriptions = get_module_descriptions() + modules_text = "\n".join([f"{i+1}. 
{desc}" for i, desc in enumerate(module_descriptions)]) + + examples_text = "" + if task_examples: + examples_text = "\n\nTask examples:\n" + "\n".join([f"Example {i+1}: {ex}" for i, ex in enumerate(task_examples)]) + + select_prompt = f"""You are an expert in problem-solving and reasoning. Given a task description and available reasoning modules, select the most relevant modules that would be useful for solving this type of task. + +Task description: {task_description}{examples_text} + +Available reasoning modules: +{modules_text} + +Instructions: +1. Analyze the task and identify what types of reasoning would be most helpful +2. Select 3-7 reasoning modules that are most relevant for this task +3. Consider both the complexity of the task and the complementary nature of different modules +4. Avoid selecting too many similar modules +5. IMPORTANT: Respond ONLY with a valid JSON array of numbers + +Example response format: [1, 5, 9, 15, 23] + +Selected modules (JSON array only):""" + + response = self.client.chat.completions.create( + model=self.model, + messages=[{"role": "user", "content": select_prompt}], + max_tokens=1024, + temperature=0.3 + ) + + self.completion_tokens += response.usage.completion_tokens + + try: + # Extract JSON from response + response_text = response.choices[0].message.content.strip() + # Look for JSON array in the response + json_match = re.search(r'\[[\d,\s]+\]', response_text) + if json_match: + selected_indices = json.loads(json_match.group(0)) + else: + # Fallback: extract numbers from response + numbers = re.findall(r'\b(\d+)\b', response_text) + selected_indices = [int(n) for n in numbers[:7]] # Limit to 7 modules + + # Convert to module objects (1-indexed to 0-indexed) + selected_modules = [] + for idx in selected_indices: + if 1 <= idx <= len(self.reasoning_modules): + selected_modules.append(self.reasoning_modules[idx-1]) + + return selected_modules[:7] # Ensure we don't exceed reasonable limit + + except Exception as e: + logger.warning(f"Error parsing selected modules: {e}") + # Fallback to first few modules + return self.reasoning_modules[:5] + + def _adapt_modules(self, selected_modules: List[Dict[str, Any]], task_description: str, task_examples: List[str] = None) -> List[str]: + """ADAPT: Rephrase modules to be more task-specific.""" + + modules_text = "\n".join([f"- {module['description']}" for module in selected_modules]) + + examples_text = "" + if task_examples: + examples_text = "\n\nTask examples:\n" + "\n".join([f"Example {i+1}: {ex}" for i, ex in enumerate(task_examples)]) + + adapt_prompt = f"""You are an expert in adapting general reasoning strategies to specific tasks. Given the selected reasoning modules and task description, rephrase each module to be more specific and tailored to this particular type of task. + +Task description: {task_description}{examples_text} + +Selected reasoning modules: +{modules_text} + +Instructions: +1. For each module, rephrase the description to be more specific to this task +2. Keep the core reasoning approach but make it more actionable for this specific type of problem +3. Use terminology and concepts relevant to the task domain +4. 
Make the adapted descriptions more concrete and specific + +Provide the adapted modules as a numbered list:""" + + response = self.client.chat.completions.create( + model=self.model, + messages=[{"role": "user", "content": adapt_prompt}], + max_tokens=2048, + temperature=0.3 + ) + + self.completion_tokens += response.usage.completion_tokens + + response_text = response.choices[0].message.content.strip() + + # Extract adapted modules from numbered list + adapted_modules = [] + lines = response_text.split('\n') + for line in lines: + line = line.strip() + if re.match(r'^\d+\.', line): + # Remove the number prefix + adapted_desc = re.sub(r'^\d+\.\s*', '', line) + adapted_modules.append(adapted_desc) + + return adapted_modules + + def _implement_structure(self, adapted_modules: List[str], task_description: str, task_examples: List[str] = None) -> Dict[str, Any]: + """IMPLEMENT: Create a structured reasoning plan in JSON format.""" + + modules_text = "\n".join([f"{i+1}. {module}" for i, module in enumerate(adapted_modules)]) + + examples_text = "" + if task_examples: + examples_text = "\n\nTask examples:\n" + "\n".join([f"Example {i+1}: {ex}" for i, ex in enumerate(task_examples)]) + + # Provide a demonstration of a reasoning structure + demo_structure = """{ + "problem_analysis": "Analyze the core components and requirements", + "approach_selection": "Choose the most appropriate solution method", + "step_by_step_solution": { + "step_1": "First logical step with clear reasoning", + "step_2": "Second step building on previous results", + "step_3": "Continue logical progression" + }, + "verification": "Check the solution for accuracy and completeness", + "final_answer": "Present the final result clearly" +}""" + + implement_prompt = f"""You are an expert in creating structured reasoning plans. Given the adapted reasoning modules for a specific task, create a detailed JSON reasoning structure that can be followed step-by-step to solve instances of this task. + +Task description: {task_description}{examples_text} + +Adapted reasoning modules: +{modules_text} + +Example of a reasoning structure format: +{demo_structure} + +Instructions: +1. Create a JSON structure that operationalizes the adapted reasoning modules +2. The structure should be specific enough to guide step-by-step reasoning +3. Include clear field names that indicate what should be filled in each step +4. Make it actionable - each field should represent a concrete reasoning step +5. Ensure the structure flows logically from problem understanding to final answer +6. The structure should be comprehensive enough to handle the complexity of the task + +7. IMPORTANT: Return ONLY valid JSON with double quotes around all property names and string values +8. 
Do not include any text before or after the JSON structure + +Valid JSON reasoning structure:""" + + response = self.client.chat.completions.create( + model=self.model, + messages=[{"role": "user", "content": implement_prompt}], + max_tokens=2048, + temperature=0.3 + ) + + self.completion_tokens += response.usage.completion_tokens + + response_text = response.choices[0].message.content.strip() + + # Extract and parse JSON from response with improved error handling + return self._parse_json_structure(response_text) + + def _parse_json_structure(self, response_text: str) -> Dict[str, Any]: + """Parse JSON structure with robust error handling and cleanup.""" + + # Define fallback structure + fallback_structure = { + "problem_understanding": "Analyze and understand the problem requirements", + "solution_approach": "Determine the best approach based on problem characteristics", + "step_by_step_reasoning": "Work through the problem systematically", + "verification": "Verify the solution is correct and complete", + "final_answer": "State the final answer clearly" + } + + # Try multiple JSON extraction and parsing strategies + strategies = [ + self._extract_json_strategy_1, + self._extract_json_strategy_2, + self._extract_json_strategy_3, + self._clean_and_parse_strategy + ] + + for i, strategy in enumerate(strategies, 1): + try: + structure = strategy(response_text) + if structure and isinstance(structure, dict) and len(structure) > 0: + logger.debug(f"Successfully parsed JSON using strategy {i}") + return structure + except Exception as e: + logger.debug(f"Strategy {i} failed: {e}") + continue + + logger.warning(f"All JSON parsing strategies failed. Using fallback structure.") + logger.debug(f"Raw response that failed to parse: {response_text[:500]}...") + return fallback_structure + + def _extract_json_strategy_1(self, text: str) -> Dict[str, Any]: + """Strategy 1: Find first complete JSON object with balanced braces.""" + start_idx = text.find('{') + if start_idx == -1: + raise ValueError("No opening brace found") + + brace_count = 0 + end_idx = start_idx + + for i in range(start_idx, len(text)): + if text[i] == '{': + brace_count += 1 + elif text[i] == '}': + brace_count -= 1 + if brace_count == 0: + end_idx = i + 1 + break + + if brace_count != 0: + raise ValueError("Unbalanced braces") + + json_str = text[start_idx:end_idx] + return json.loads(json_str) + + def _extract_json_strategy_2(self, text: str) -> Dict[str, Any]: + """Strategy 2: Use regex with non-greedy matching.""" + # Look for JSON object with non-greedy matching + json_match = re.search(r'\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}', text) + if not json_match: + raise ValueError("No JSON object found with regex") + + json_str = json_match.group(0) + return json.loads(json_str) + + def _extract_json_strategy_3(self, text: str) -> Dict[str, Any]: + """Strategy 3: Extract between ```json``` code blocks.""" + patterns = [ + r'```json\s*([^`]+)```', + r'```\s*([^`]+)```', + r'`([^`]+)`' + ] + + for pattern in patterns: + match = re.search(pattern, text, re.DOTALL) + if match: + json_str = match.group(1).strip() + try: + return json.loads(json_str) + except: + continue + + raise ValueError("No valid JSON found in code blocks") + + def _clean_and_parse_strategy(self, text: str) -> Dict[str, Any]: + """Strategy 4: Clean common formatting issues and parse.""" + # Find JSON-like content + json_match = re.search(r'\{.*\}', text, re.DOTALL) + if not json_match: + raise ValueError("No JSON-like content found") + + json_str = json_match.group(0) + + # 
Common cleanup operations + cleanups = [ + # Fix single quotes to double quotes (but be careful about apostrophes) + (r"(?<!\w)'([^']*)'(?!\w)", r'"\1"'), + # Remove trailing commas before closing braces/brackets + (r",\s*([}\]])", r"\1"), + ] + + for pattern, replacement in cleanups: + json_str = re.sub(pattern, replacement, json_str) + + return json.loads(json_str) + + def solve_with_structure(self, problem: str, reasoning_structure: Dict[str, Any]) -> str: + """ + Stage 2: Use the discovered reasoning structure to solve a specific problem. + """ + + structure_text = json.dumps(reasoning_structure, indent=2) + + solve_prompt = f"""Follow the step-by-step reasoning structure below to solve the given problem. Fill in each field with your reasoning and analysis, then provide your final answer. + +Reasoning Structure: +{structure_text} + +Problem to solve: {problem} + +Instructions: +1. Work through each field in the reasoning structure systematically +2. Provide detailed reasoning for each step +3. Use the structure to guide your thinking process +4. Ensure your reasoning is logical and well-supported +5. Wrap your internal reasoning in <think> tags +6. Provide a clear final answer after your reasoning + +<think> +[Follow the reasoning structure step by step here] +</think> + +Based on my systematic analysis using the reasoning structure, the answer is:""" + + response = self.client.chat.completions.create( + model=self.model, + messages=[{"role": "user", "content": solve_prompt}], + max_tokens=self.max_tokens, + temperature=0.7 + ) + + self.completion_tokens += response.usage.completion_tokens + + return response.choices[0].message.content.strip() diff --git a/optillm/plugins/deepthink/uncertainty_cot.py b/optillm/plugins/deepthink/uncertainty_cot.py new file mode 100644 index 00000000..65bd7a25 --- /dev/null +++ b/optillm/plugins/deepthink/uncertainty_cot.py @@ -0,0 +1,334 @@ +""" +Uncertainty-Routed Chain-of-Thought Implementation + +This module implements uncertainty-routed CoT that generates multiple reasoning samples, +evaluates confidence through consistency, and routes to either majority voting or greedy decoding. +""" + +import re +import logging +import json +from typing import List, Dict, Any, Tuple +from collections import Counter +from difflib import SequenceMatcher + +logger = logging.getLogger(__name__) + +class UncertaintyRoutedCoT: + """ + Implements uncertainty-routed chain-of-thought reasoning. + + The approach: + 1. Generate k chain-of-thought samples + 2. Evaluate confidence through consistency analysis + 3. Route to majority vote (high confidence) or greedy sample (low confidence) + """ + + def __init__(self, client, model: str, max_tokens: int = 16382): + self.client = client + self.model = model + self.max_tokens = max_tokens + self.completion_tokens = 0 + + def generate_with_uncertainty_routing( + self, + prompt: str, + num_samples: int = 3, + confidence_threshold: float = 0.7, + temperature: float = 0.7, + top_p: float = 0.95 + ) -> Dict[str, Any]: + """ + Generate response using uncertainty-routed chain-of-thought. 
+ + Args: + prompt: The prompt to generate responses for + num_samples: Number of samples to generate for uncertainty evaluation + confidence_threshold: Threshold for routing decision + temperature: Sampling temperature for multiple samples + top_p: Top-p parameter for sampling + + Returns: + Dict containing final response, confidence score, and routing decision + """ + logger.info(f"Generating {num_samples} samples for uncertainty routing") + + # Generate multiple samples + samples = self._generate_multiple_samples( + prompt, num_samples, temperature, top_p + ) + + # Generate greedy sample for comparison + greedy_sample = self._generate_greedy_sample(prompt) + + # Extract thinking and answers from samples + sample_data = [] + for sample in samples: + thinking = self._extract_thinking(sample) + answer = self._extract_answer(sample) + sample_data.append({ + "full_response": sample, + "thinking": thinking, + "answer": answer + }) + + greedy_thinking = self._extract_thinking(greedy_sample) + greedy_answer = self._extract_answer(greedy_sample) + + # Evaluate confidence through consistency + confidence_score = self._evaluate_confidence(sample_data) + + # Log confidence evaluation details + logger.debug(f"Confidence evaluation completed: {confidence_score:.3f}") + logger.debug(f"Sample answers: {[sample['answer'][:50] + '...' if len(sample['answer']) > 50 else sample['answer'] for sample in sample_data if sample['answer']]}") + + # Route decision based on confidence + if confidence_score >= confidence_threshold: + # High confidence: use majority vote + final_response = self._majority_vote_response(sample_data) + routing_decision = "majority_vote" + logger.info(f"High confidence ({confidence_score:.3f} >= {confidence_threshold}) - using majority vote") + else: + # Low confidence: use greedy sample + final_response = greedy_sample + routing_decision = "greedy" + logger.info(f"Low confidence ({confidence_score:.3f} < {confidence_threshold}) - using greedy sample") + + return { + "final_response": final_response, + "confidence_score": confidence_score, + "routing_decision": routing_decision, + "samples": sample_data, + "greedy_sample": { + "full_response": greedy_sample, + "thinking": greedy_thinking, + "answer": greedy_answer + }, + "completion_tokens": self.completion_tokens + } + + def _generate_multiple_samples( + self, + prompt: str, + num_samples: int, + temperature: float, + top_p: float + ) -> List[str]: + """Generate multiple samples by calling the API multiple times.""" + samples = [] + + for i in range(num_samples): + logger.debug(f"Generating sample {i+1}/{num_samples}") + + response = self.client.chat.completions.create( + model=self.model, + messages=[{"role": "user", "content": prompt}], + max_tokens=self.max_tokens, + temperature=temperature, + top_p=top_p + ) + + self.completion_tokens += response.usage.completion_tokens + samples.append(response.choices[0].message.content.strip()) + + return samples + + def _generate_greedy_sample(self, prompt: str) -> str: + """Generate a single greedy sample with temperature=0.""" + logger.debug("Generating greedy sample") + + response = self.client.chat.completions.create( + model=self.model, + messages=[{"role": "user", "content": prompt}], + max_tokens=self.max_tokens, + temperature=0.0 # Greedy decoding + ) + + self.completion_tokens += response.usage.completion_tokens + + return response.choices[0].message.content.strip() + + def _extract_thinking(self, response: str) -> str: + """Extract content from <think> tags.""" + match = re.search(r'<think>(.*?)</think>', 
response, re.DOTALL) + if match: + return match.group(1).strip() + return "" + + def _extract_answer(self, response: str) -> str: + """Extract the final answer from the response.""" + # Look for answer after </think> tag + think_end = response.find('</think>') + if think_end != -1: + answer_part = response[think_end + 8:].strip() + else: + answer_part = response.strip() + + # Try to extract final answer with common patterns + patterns = [ + r'(?:the )?(?:final )?answer is:?\s*(.+?)(?:\n|$)', + r'(?:therefore|thus|so),?\s*(?:the )?(?:answer is:?\s*)?(.+?)(?:\n|$)', + r'(?:conclusion|result):?\s*(.+?)(?:\n|$)', + ] + + for pattern in patterns: + match = re.search(pattern, answer_part, re.IGNORECASE) + if match: + return match.group(1).strip() + + # Fallback: return the first significant line after thinking + lines = answer_part.split('\n') + for line in lines: + line = line.strip() + if line and len(line) > 10: # Skip very short lines + return line + + return answer_part[:200] if answer_part else "" # Truncate if too long + + def _evaluate_confidence(self, sample_data: List[Dict[str, Any]]) -> float: + """ + Evaluate confidence based on consistency across samples. + + Returns a confidence score between 0 and 1. + """ + if len(sample_data) < 2: + return 0.5 # Neutral confidence for single sample + + # Extract answers and thinking for analysis + answers = [sample["answer"] for sample in sample_data if sample["answer"]] + thinking_texts = [sample["thinking"] for sample in sample_data if sample["thinking"]] + + if not answers: + return 0.1 # Very low confidence if no answers extracted + + # Evaluate answer consistency + answer_consistency = self._calculate_answer_consistency(answers) + + # Evaluate reasoning consistency + reasoning_consistency = self._calculate_reasoning_consistency(thinking_texts) + + # Combine metrics (weighted average) + confidence = (0.6 * answer_consistency + 0.4 * reasoning_consistency) + + logger.debug(f"Answer consistency: {answer_consistency:.3f} (weight: 0.6)") + logger.debug(f"Reasoning consistency: {reasoning_consistency:.3f} (weight: 0.4)") + logger.debug(f"Combined confidence: {confidence:.3f}") + + # Log additional details for debugging low confidence + if confidence < 0.5: + logger.debug(f"Low confidence detected. 
Sample count: {len(sample_data)}") + logger.debug(f"Answers found: {len(answers)}, Thinking texts: {len(thinking_texts)}") + if answers: + logger.debug(f"Sample answers: {answers}") + if len(answers) >= 2: + logger.debug(f"Most common answer appears {max(Counter(answers).values())} times out of {len(answers)}") + + return confidence + + def _calculate_answer_consistency(self, answers: List[str]) -> float: + """Calculate consistency of final answers.""" + if len(answers) < 2: + return 0.5 + + # Normalize answers for comparison + normalized_answers = [] + for answer in answers: + # Remove common variations and normalize + norm_answer = re.sub(r'[^\w\s]', '', answer.lower().strip()) + norm_answer = re.sub(r'\s+', ' ', norm_answer) + normalized_answers.append(norm_answer) + + # Count occurrences + answer_counts = Counter(normalized_answers) + most_common_count = answer_counts.most_common(1)[0][1] + total_answers = len(answers) + + # Calculate agreement ratio + agreement_ratio = most_common_count / total_answers + + logger.debug(f"Answer distribution: {dict(answer_counts)}") + logger.debug(f"Agreement ratio: {agreement_ratio:.3f} ({most_common_count}/{total_answers})") + + # Also consider semantic similarity for non-identical answers + max_similarity = 0.0 + for i, ans1 in enumerate(normalized_answers): + for j, ans2 in enumerate(normalized_answers[i+1:], i+1): + similarity = SequenceMatcher(None, ans1, ans2).ratio() + max_similarity = max(max_similarity, similarity) + + # Combine exact matches and semantic similarity + consistency = max(agreement_ratio, max_similarity) + + return min(consistency, 1.0) + + def _calculate_reasoning_consistency(self, thinking_texts: List[str]) -> float: + """Calculate consistency of reasoning processes.""" + if len(thinking_texts) < 2: + return 0.5 + + # Calculate pairwise similarity of reasoning + similarities = [] + for i, text1 in enumerate(thinking_texts): + for j, text2 in enumerate(thinking_texts[i+1:], i+1): + # Use sequence matcher for text similarity + similarity = SequenceMatcher(None, text1.lower(), text2.lower()).ratio() + similarities.append(similarity) + + if not similarities: + return 0.5 + + # Return average similarity + avg_similarity = sum(similarities) / len(similarities) + + logger.debug(f"Reasoning similarity pairs: {[f'{s:.3f}' for s in similarities]}") + logger.debug(f"Average reasoning similarity: {avg_similarity:.3f}") + + return min(avg_similarity, 1.0) + + def _majority_vote_response(self, sample_data: List[Dict[str, Any]]) -> str: + """ + Create response based on majority vote of answers and best reasoning. 
+ """ + # Get most common answer + answers = [sample["answer"] for sample in sample_data if sample["answer"]] + if not answers: + return sample_data[0]["full_response"] + + # Normalize and count answers + normalized_answers = [] + for answer in answers: + norm_answer = re.sub(r'[^\w\s]', '', answer.lower().strip()) + norm_answer = re.sub(r'\s+', ' ', norm_answer) + normalized_answers.append(norm_answer) + + answer_counts = Counter(normalized_answers) + most_common_answer = answer_counts.most_common(1)[0][0] + + # Find the sample with the most common answer and best reasoning + best_sample = None + best_reasoning_length = 0 + + for i, sample in enumerate(sample_data): + if sample["answer"]: + norm_answer = re.sub(r'[^\w\s]', '', sample["answer"].lower().strip()) + norm_answer = re.sub(r'\s+', ' ', norm_answer) + + if norm_answer == most_common_answer: + reasoning_length = len(sample["thinking"]) + if reasoning_length > best_reasoning_length: + best_reasoning_length = reasoning_length + best_sample = sample + + if best_sample: + return best_sample["full_response"] + else: + # Fallback to first sample with the most common answer + for sample in sample_data: + if sample["answer"]: + norm_answer = re.sub(r'[^\w\s]', '', sample["answer"].lower().strip()) + norm_answer = re.sub(r'\s+', ' ', norm_answer) + if norm_answer == most_common_answer: + return sample["full_response"] + + # Final fallback + return sample_data[0]["full_response"] diff --git a/optillm/plugins/deepthink_plugin.py b/optillm/plugins/deepthink_plugin.py new file mode 100644 index 00000000..51b4adc8 --- /dev/null +++ b/optillm/plugins/deepthink_plugin.py @@ -0,0 +1,267 @@ +""" +Deep Think Plugin for OptILM + +Combines SELF-DISCOVER framework with uncertainty-routed chain-of-thought +for enhanced reasoning in large language models. +""" + +import logging +from typing import Tuple, Dict, Any +from optillm.plugins.deepthink.self_discover import SelfDiscover +from optillm.plugins.deepthink.uncertainty_cot import UncertaintyRoutedCoT + +# Plugin identifier for optillm +SLUG = "deepthink" + +logger = logging.getLogger(__name__) + +def run( + system_prompt: str, + initial_query: str, + client, + model: str, + request_config: Dict[str, Any] = None +) -> Tuple[str, int]: + """ + Main entry point for the Deep Think plugin. + + Combines SELF-DISCOVER reasoning structure discovery with + uncertainty-routed chain-of-thought generation. 
+ + Args: + system_prompt: System prompt for the model + initial_query: User's initial query/problem + client: OpenAI-compatible client instance + model: Model identifier + request_config: Additional configuration parameters + + Returns: + Tuple of (response_text, completion_tokens_used) + """ + logger.info("Starting Deep Think reasoning process") + + # Extract configuration parameters + config = _parse_config(request_config or {}) + + try: + # Initialize components + self_discover = SelfDiscover( + client=client, + model=model, + max_tokens=config["max_tokens"] + ) + + uncertainty_cot = UncertaintyRoutedCoT( + client=client, + model=model, + max_tokens=config["max_tokens"] + ) + + total_tokens = 0 + + # Stage 1: SELF-DISCOVER reasoning structure (if enabled) + reasoning_structure = None + if config["enable_self_discover"]: + logger.info("Discovering task-specific reasoning structure") + + discovery_result = self_discover.discover_reasoning_structure( + task_description=_extract_task_description(initial_query, system_prompt), + task_examples=None # Could be enhanced to extract examples + ) + + reasoning_structure = discovery_result["reasoning_structure"] + total_tokens += discovery_result["completion_tokens"] + + logger.info(f"Discovered reasoning structure with {len(reasoning_structure)} components") + + # Prepare enhanced prompt + enhanced_prompt = _create_enhanced_prompt( + system_prompt=system_prompt, + initial_query=initial_query, + reasoning_structure=reasoning_structure, + config=config + ) + + # Stage 2: Uncertainty-routed generation + logger.info("Generating response with uncertainty routing") + + generation_result = uncertainty_cot.generate_with_uncertainty_routing( + prompt=enhanced_prompt, + num_samples=config["deepthink_samples"], + confidence_threshold=config["confidence_threshold"], + temperature=config["temperature"], + top_p=config["top_p"] + ) + + total_tokens += generation_result["completion_tokens"] + + # Log routing decision + logger.info(f"Routing decision: {generation_result['routing_decision']} " + f"(confidence: {generation_result['confidence_score']:.3f})") + + final_response = generation_result["final_response"] + + # Clean up the response if needed + final_response = _clean_response(final_response) + + logger.info(f"Deep Think completed successfully. 
Total tokens: {total_tokens}") + + return final_response, total_tokens + + except Exception as e: + logger.error(f"Error in Deep Think plugin: {str(e)}") + logger.debug(f"Exception traceback:", exc_info=True) + + # Fallback to simple generation + try: + logger.info("Attempting fallback to simple generation") + response = client.chat.completions.create( + model=model, + messages=[ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": initial_query} + ], + max_tokens=config["max_tokens"], + temperature=config["temperature"], + top_p=config["top_p"] + ) + + logger.info("Fallback generation successful") + return response.choices[0].message.content.strip(), response.usage.completion_tokens + + except Exception as fallback_error: + logger.error(f"Fallback generation also failed: {str(fallback_error)}") + logger.debug(f"Fallback exception traceback:", exc_info=True) + return f"Error in Deep Think plugin: {str(e)}", 0 + +def _parse_config(request_config: Dict[str, Any]) -> Dict[str, Any]: + """Parse and validate configuration parameters.""" + + default_config = { + "deepthink_samples": 3, + "confidence_threshold": 0.7, + "max_tokens": 16382, + "temperature": 0.7, + "top_p": 0.95, + "enable_self_discover": True, + "reasoning_modules_limit": 7 + } + + # Override with request config values + for key, value in request_config.items(): + if key in default_config: + default_config[key] = value + + # Validate ranges + default_config["deepthink_samples"] = max(1, min(10, default_config["deepthink_samples"])) + default_config["confidence_threshold"] = max(0.0, min(1.0, default_config["confidence_threshold"])) + default_config["temperature"] = max(0.0, min(2.0, default_config["temperature"])) + default_config["top_p"] = max(0.0, min(1.0, default_config["top_p"])) + default_config["reasoning_modules_limit"] = max(3, min(15, default_config["reasoning_modules_limit"])) + + return default_config + +def _extract_task_description(initial_query: str, system_prompt: str) -> str: + """Extract a task description for SELF-DISCOVER from the query and system prompt.""" + + # Combine system prompt and query to understand the task + combined_text = f"{system_prompt}\n\n{initial_query}" + + # Try to identify the type of task based on keywords and patterns + task_keywords = { + "mathematical": ["solve", "calculate", "equation", "math", "number", "formula"], + "analytical": ["analyze", "evaluate", "assess", "examine", "compare"], + "creative": ["create", "design", "generate", "brainstorm", "invent"], + "logical": ["reason", "logic", "prove", "deduce", "conclude"], + "planning": ["plan", "strategy", "approach", "method", "steps"], + "problem_solving": ["problem", "solution", "solve", "fix", "resolve"] + } + + detected_types = [] + combined_lower = combined_text.lower() + + for task_type, keywords in task_keywords.items(): + if any(keyword in combined_lower for keyword in keywords): + detected_types.append(task_type) + + if detected_types: + primary_type = detected_types[0] + task_description = f"This is primarily a {primary_type} task that requires {', '.join(detected_types)} thinking." + else: + task_description = "This is a general reasoning task that requires systematic thinking and analysis." + + # Add context from the query + if len(initial_query) > 50: + task_description += f" The specific task involves: {initial_query[:200]}..." 
+ else: + task_description += f" The specific task is: {initial_query}" + + return task_description + +def _create_enhanced_prompt( + system_prompt: str, + initial_query: str, + reasoning_structure: Dict[str, Any] = None, + config: Dict[str, Any] = None +) -> str: + """Create an enhanced prompt that incorporates the reasoning structure.""" + + base_prompt = f"""System: {system_prompt} + +Task: {initial_query}""" + + if reasoning_structure: + import json + structure_text = json.dumps(reasoning_structure, indent=2) + + enhanced_prompt = f"""{base_prompt} + +REASONING STRUCTURE: +Please follow this discovered reasoning structure to solve the problem systematically: + +{structure_text} + +INSTRUCTIONS: +1. Use the reasoning structure above to guide your thinking process +2. Work through each component of the structure systematically +3. Wrap your detailed reasoning process in <think> tags +4. After your reasoning, provide a clear and concise final answer +5. Be thorough in your analysis but also aim for clarity and accuracy + +<think> +[Follow the reasoning structure step-by-step to analyze and solve the problem] +</think> + +Based on my systematic analysis, the answer is:""" + else: + enhanced_prompt = f"""{base_prompt} + +INSTRUCTIONS: +Please solve this problem using careful step-by-step reasoning. + +1. Wrap your detailed reasoning process in <think> tags +2. Consider the problem from multiple angles +3. Work through the solution systematically +4. Provide a clear and well-supported final answer + +<think> +[Provide your detailed step-by-step reasoning here] +</think> + +Based on my analysis, the answer is:""" + + return enhanced_prompt + +def _clean_response(response: str) -> str: + """Clean up the final response.""" + + # Remove any trailing whitespace + response = response.strip() + + # Ensure the response doesn't end abruptly + if response and not response.endswith(('.', '!', '?', ':', '"', "'")): + # Don't add punctuation if it's a number or simple phrase + if not (response.replace(' ', '').replace(',', '').replace('.', '').isdigit() or len(response.split()) <= 3): + response += "." + + return response diff --git a/setup.py b/setup.py index 5e4c2fd9..c3b9d0c3 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ setup( name="optillm", - version="0.1.12", + version="0.1.13", packages=find_packages(include=['optillm', 'optillm.*']), # This ensures all subpackages are included py_modules=['optillm'], package_data={