In [None]:
import pandas as pd
from openai import OpenAI
import json
import re
from pydantic import BaseModel
import time
from tqdm import tqdm
import os
import pandas as pd
from dotenv import load_dotenv
import os
import requests
from pydantic import BaseModel
import os
import openai
import os
from tqdm import tqdm
import numpy as np
from sklearn.metrics import accuracy_score
import requests
import json
import os
import getpass
import re
import random
import time


# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()
# Fireworks API key as found in the environment; None when the variable is unset.
fireworks_key = os.getenv('FIREWORKS_API_KEY')

In [None]:
# Load the processed test set; later cells refer to it as processed_df.
processed_df = pd.read_parquet('data/processed_test_data.parquet')

In [None]:

# Resolve the Fireworks API key: take it from the environment when it is set
# and non-empty, otherwise ask for it interactively without echoing.
api_key = os.getenv("FIREWORKS_API_KEY") or getpass.getpass("Enter your Fireworks API key: ")

# Endpoint and model used by every completion request
API_URL = "https://api.fireworks.ai/inference/v1/completions"
MODEL = "accounts/fireworks/models/deepseek-r1"

# Reasoning is extended until the estimated size reaches this value
TARGET_TOKEN_COUNT = 2000  # Target token count for extensive reasoning

# HTTP headers sent with every request (JSON in, JSON out, bearer auth)
HEADERS = {
    "Accept": "application/json",
    "Content-Type": "application/json",
    "Authorization": f"Bearer {api_key}",
}

def estimate_tokens(text):
    """
    Return a rough size estimate for a text: its whitespace-separated word count.

    No tokenizer is involved, so the value is only a proxy for the real token
    count of the text.
    """
    words = text.split()
    return len(words)

def _request_completion(payload, timeout=300):
    """
    POST one completion request and return the text of the first choice.

    Args:
        payload: JSON-serialisable request body for the completions endpoint
        timeout: Seconds to wait for the server before giving up

    Returns:
        str or None: The generated text, or None when the request fails
        (network error or timeout, non-200 status, or a reply without choices)
    """
    try:
        response = requests.post(
            API_URL, headers=HEADERS, data=json.dumps(payload), timeout=timeout
        )
    except requests.RequestException as exc:
        print(f"ERROR: API call failed: {exc}")
        return None

    if response.status_code != 200:
        print(f"ERROR: API call failed with status {response.status_code}")
        print(response.text)
        return None

    choices = response.json().get("choices") or []
    if not choices:
        print("Error: No response from the API")
        return None
    return choices[0]["text"]


def extend_reasoning(question, max_extensions=10):
    """
    Extend the model's reasoning process until it reaches TARGET_TOKEN_COUNT,
    regardless of whether it naturally completes or not.

    The model is prompted with the question followed by an open <think> tag.
    Whenever a generation closes the tag, the closing tag and everything after
    it are dropped, a randomly chosen continuation phrase is appended, and the
    model is asked to carry on. Once the estimated size is reached (or the
    extension budget is spent) a final request with the closed reasoning
    block produces the answer.

    Args:
        question: The question to solve
        max_extensions: Maximum number of reasoning extensions to attempt

    Returns:
        dict: The final response data containing the reasoning and answer
        (keys: question, reasoning, full_reasoning_with_tags, answer,
        extensions, estimated_token_count, continuation_phrases_used),
        or None when a reasoning request fails. A failed final-answer
        request yields an empty "answer" rather than None.
    """
    print(f"\n=== EXTENDING REASONING FOR ===\n{question}")

    # Phrases injected after a generation to nudge the model into reasoning further
    continuation_phrases = [
        "Let me think more about this.",
        "Actually, I should reconsider my approach.",
        "Let me analyze this further.",
        "Let me double-check this calculation.",
        "I need to think about this from another angle.",
        "Let me explore an alternative solution method.",
        "I should verify these results with another approach.",
        "Let me ensure the reasoning so far is correct.",
        "Let me consider if there are any edge cases.",
        "I'll try a different way to solve this problem."
    ]

    initial_prompt = f"""{question}\n\n<think>"""

    # Log progress
    print("\n=== STEP 1: INITIATING REASONING ===")
    print(f"INITIAL PROMPT:\n{initial_prompt}")

    full_reasoning = ""
    # The phrases actually injected, in order; its length is the extension count
    used_continuations = []
    # Initialised up front so the result is well-defined even if the loop never runs
    current_token_count = 0
    current_prompt = initial_prompt

    # One initial generation plus at most max_extensions continuations
    for attempt in range(max_extensions + 1):
        if attempt == 0:
            print("\n=== GENERATING INITIAL REASONING ===")
        else:
            print(f"\n=== EXTENSION {attempt}: CONTINUING REASONING ===")

        payload = {
            "model": MODEL,
            "max_tokens": 1000,  # Using a smaller chunk size per call
            "temperature": 0.7,
            "top_p": 1,
            "top_k": 40,
            "presence_penalty": 0,
            "frequency_penalty": 0,
            "prompt": current_prompt
        }

        generation = _request_completion(payload)
        if generation is None:
            return None
        print(f"\n--- GENERATION {attempt} ---\n{generation}")

        # A closing tag means the model considers itself done; cut it (and
        # anything after it) so the reasoning can be continued.
        if "</think>" in generation:
            print("\n=== FOUND </think> TAG, REMOVING IT TO CONTINUE REASONING ===")
            generation = generation.split("</think>")[0]
        full_reasoning += generation

        # Report the phrase that really preceded this generation
        if used_continuations:
            print(f"\nCONTINUATION PHRASE USED: '{used_continuations[-1]}'")

        current_token_count = estimate_tokens(full_reasoning)
        print(f"\nCURRENT TOKEN COUNT ESTIMATE: ~{current_token_count}")

        if current_token_count >= TARGET_TOKEN_COUNT:
            print(f"\n=== REACHED TARGET TOKEN COUNT ({TARGET_TOKEN_COUNT}) ===")
            break

        if attempt == max_extensions:
            print(f"\n=== REACHED MAX EXTENSIONS ({max_extensions}) ===")
            break

        # Still short of the target: inject a continuation phrase and go again
        continuation = random.choice(continuation_phrases)
        print(f"\nADDING CONTINUATION: '{continuation}'")
        used_continuations.append(continuation)
        full_reasoning += f"\n{continuation}\n"
        current_prompt = initial_prompt + full_reasoning

    # After reaching target token count or max extensions, generate a final answer
    print("\n=== GENERATING FINAL ANSWER ===")

    final_prompt = f"""{question}\n\n<think>{full_reasoning}</think>"""

    final_payload = {
        "model": MODEL,
        "max_tokens": 500,
        "temperature": 0.7,
        "prompt": final_prompt
    }

    final_text = _request_completion(final_payload)
    if final_text is None:
        # Keep the reasoning gathered so far; only the answer is missing
        print("ERROR: Final answer generation failed")
        final_answer = ""
    else:
        final_answer = final_text.strip()
        print(f"\nFINAL ANSWER:\n{final_answer}")

    num_extensions = len(used_continuations)

    # Construct and return the final output
    result = {
        "question": question,
        "reasoning": full_reasoning,
        "full_reasoning_with_tags": f"<think>{full_reasoning}</think>",
        "answer": final_answer,
        "extensions": num_extensions,
        "estimated_token_count": current_token_count,
        "continuation_phrases_used": used_continuations
    }

    print("\n=== REASONING EXTENSION COMPLETE ===")
    print(f"Extensions made: {num_extensions}")
    print(f"Estimated token count: ~{current_token_count}")
    print("Full reasoning and trace saved to result dictionary")

    return result

# Sample word problems used to smoke-test extend_reasoning below
questions = [
    "Janet's ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?",
    
    "Carlos is planting a lemon tree. The tree will cost $90 to plant. Each year it will grow 7 lemons, which he can sell for $1.5 each. It costs $3 a year to water and feed the tree. How many years will it take before he starts earning money on the lemon tree?"
]

# Pretty-printer for a result dictionary
def print_full_trace(result):
    """
    Print the reasoning text of a result dict between 80-character rules,
    followed by its extension count and estimated token count.

    Reads the keys "reasoning", "extensions" and "estimated_token_count".
    """
    rule = "=" * 80
    print(f"\n{rule}")
    print("FULL REASONING TRACE")
    print(rule)
    print(result["reasoning"])
    print(f"\n{rule}")
    print(f"TOTAL EXTENSIONS: {result['extensions']!s}")
    print(f"ESTIMATED TOKEN COUNT: {result['estimated_token_count']!s}")
    print(rule)

# Push every sample question through the extension loop; successful results
# are printed in full and written to one JSON file per question.
for i, question in enumerate(questions):
    number = i + 1
    banner = '=' * 80
    print(f"\n{banner}\nQUESTION {number}: {question}\n{banner}")
    result = extend_reasoning(question, max_extensions=10)

    # A falsy result means the run failed; nothing to print or save
    if not result:
        continue

    print_full_trace(result)

    filename = f"extended_reasoning_{number}.json"
    with open(filename, 'w') as f:
        json.dump(result, f, indent=2)
    print(f"Results saved to {filename}")


QUESTION 1: Janet's ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?

=== EXTENDING REASONING FOR ===
Janet's ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?

=== STEP 1: INITIATING REASONING ===
INITIAL PROMPT:
Janet's ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?

<think>

=== GENERATING INITIAL REASONING ===

--- GENER

In [8]:
# Display the current `response` object.
# NOTE(review): no visible cell assigns `response` at top level; presumably it
# comes from a cell that is not part of this export - confirm.
response

ChatCompletion(id='fc1f3944-c4b6-4e35-baf4-3ac0a1d844c2', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="<think>\nOkay, let me try to figure out how much Janet makes every day at the farmers' market from selling her duck eggs. Alright, so the problem says her ducks lay 16 eggs per day. Then she eats three for breakfast every morning and bakes muffins for her friends every day with four. The rest she sells at the farmers' market for $2 per fresh duck egg. So, the question is asking how much money she makes daily from the sales.\n\nLet me break this down step by step. First, she starts with 16 eggs each day. Then, she consumes some of them: three for breakfast and four for muffins. So that's 3 + 4 eggs used for her own purposes. After that, whatever eggs are left over, she sells at the market. Then, we need to calculate how many that is and multiply by $2 to find the total daily earnings.\n\nWait, hold on. Let me make sure I got that 

In [None]:
import pandas as pd
import pandas as pd
import pyarrow as pa


# Enhanced JSON parsing function
def safe_parse_qa_result(json_str, question_id, original_question):
    """
    Safely parse the JSON response, handling missing fields.

    Parsing is attempted in three stages: strict validation of the raw
    string, validation after filling in missing "question"/"answer" fields,
    and finally a hand-built placeholder object. The function therefore
    always returns a QAResult instead of raising on bad JSON.

    Args:
        json_str: JSON string to parse
        question_id: ID of the current question (for error reporting)
        original_question: The original question text

    Returns:
        Validated QAResult object
    """
    # Stage 1: the response is already valid
    try:
        return QAResult.model_validate_json(json_str)
    except Exception as e:
        print(f"Fixing JSON for question {question_id}: {str(e)}")

    # Stage 2: parse loosely and supply defaults for absent fields
    try:
        parsed_json = json.loads(json_str)
        parsed_json.setdefault("question", original_question)
        parsed_json.setdefault("answer", "No answer provided")
        return QAResult.model_validate(parsed_json)
    except Exception as nested_e:
        # Report why the repair failed instead of discarding the error
        print(f"Creating fallback response for question {question_id}: {nested_e}")

    # Stage 3: create a minimal valid object
    return QAResult(
        question=original_question,
        answer="Error: Failed to parse model response"
    )

# Function to fix only the failed questions
def fix_failed_questions(test_df, output_file="fixed_reasoning_traces.txt",
                         error_log="reasoning_traces.txt"):
    """
    Re-run the model on the questions that failed in an earlier pass.

    Failed questions are found by scanning error_log for lines of the form
    "ERROR: Error processing question <n>" (n is 1-based). Each one is sent
    to the model again; the reasoning and parsed answer are written into
    test_df in place and appended to output_file. Finally the frame is
    saved to 'fixed_test_data.parquet'.

    Args:
        test_df: DataFrame with a 'question' column; updated in place
        output_file: Text file receiving the new reasoning traces
        error_log: Log of the earlier pass that is scanned for failures

    Returns:
        The same DataFrame, with 'reasoning_trace' and 'model_answer' filled
        in for every question that could be fixed
    """
    # Initialize the client
    client = OpenAI(
        base_url="https://api.fireworks.ai/inference/v1",
        api_key=os.getenv("FIREWORKS_API_KEY"),
    )

    # Find failed questions by examining the error log
    with open(error_log, 'r', encoding='utf-8') as f:
        content = f.read()
    error_matches = re.findall(r"ERROR: Error processing question (\d+)", content)
    # Convert to 0-based positions; dict.fromkeys drops repeats while keeping
    # order so a question logged twice is only reprocessed once
    failed_indices = list(dict.fromkeys(int(idx) - 1 for idx in error_matches))

    print(f"Found {len(failed_indices)} failed questions to reprocess")

    # Open file for logging fixed responses
    with open(output_file, 'w', encoding='utf-8') as f:
        # Process only the failed questions
        for idx in tqdm(failed_indices):
            try:
                # idx is a position, so translate it to the row's index label
                # once and use that label for the writes below; reading by
                # position but writing by label would hit the wrong row
                # whenever the frame's index is not 0..n-1.
                row_label = test_df.index[idx]
                question = test_df['question'].iloc[idx]

                # Log the question
                f.write(f"\nFixing Question {idx + 1}:\n{question}\n")
                f.write("-" * 80 + "\n")

                # Construct the messages payload
                messages = [{"role": "user", "content": question}]

                # Make the API call to the model
                response = client.chat.completions.create(
                    model="accounts/fireworks/models/deepseek-r1",
                    messages=messages,
                    response_format={"type": "json_object", "schema": QAResult.model_json_schema()},
                    max_tokens=3000,
                )

                # Extract the content of the response
                response_content = response.choices[0].message.content

                # Extract the reasoning part
                reasoning_match = re.search(r"<think>(.*?)</think>", response_content, re.DOTALL)
                reasoning = reasoning_match.group(1).strip() if reasoning_match else "No reasoning provided."

                # Extract the JSON part (the object that follows the closing tag)
                json_match = re.search(r"</think>\s*(\{.*\})", response_content, re.DOTALL)
                json_str = json_match.group(1).strip() if json_match else "{}"

                # Use the enhanced parsing function
                qa_result = safe_parse_qa_result(json_str, idx + 1, question)

                # Store in DataFrame
                test_df.at[row_label, 'reasoning_trace'] = reasoning
                test_df.at[row_label, 'model_answer'] = qa_result.answer

                # Log to file
                f.write("Reasoning:\n")
                f.write(reasoning + "\n")
                f.write("\nQA Result:\n")
                f.write(qa_result.model_dump_json(indent=4) + "\n")
                f.write("=" * 80 + "\n")

                # Add a small delay to avoid rate limiting
                time.sleep(0.5)

            except Exception as e:
                # Best effort: record the failure and move on to the next question
                error_msg = f"Error still occurs when fixing question {idx + 1}: {str(e)}"
                print(error_msg)
                f.write(f"\nERROR: {error_msg}\n")
                f.write("=" * 80 + "\n")

    # Try to save the updated DataFrame
    try:
        test_df.to_parquet('fixed_test_data.parquet')
        print("Successfully saved fixed data to parquet file")
    except Exception as e:
        print(f"Error saving DataFrame: {str(e)}")
        print("Results are still available in the text file")

    return test_df

# Run the fixing process on the processed test set.
# processed_df must already exist in the session when this runs.
fixed_df = fix_failed_questions(processed_df)

NameError: name 'processed_df' is not defined

In [10]:
# Reload the repaired results from disk and preview the first rows.
# NOTE(review): fix_failed_questions writes 'fixed_test_data.parquet' to the
# working directory, while this reads from 'data/' - confirm the file was moved.
fixed_df = pd.read_parquet('data/fixed_test_data.parquet')
fixed_df.head()

Unnamed: 0,question,answer,reasoning_trace,model_answer
0,Janet’s ducks lay 16 eggs per day. She eats th...,Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eg...,"Okay, let's try to figure out how much money J...",\boxed{18}
1,A robe takes 2 bolts of blue fiber and half th...,It takes 2/2=<<2/2=1>>1 bolt of white fiber\nS...,"Okay, let me figure out this robe problem. So,...",The problem states that a robe requires 2 bolt...
2,Josh decides to try flipping a house. He buys...,The cost of the house and repairs came out to ...,"Okay, so Josh is trying to flip a house. Let m...",\boxed{70000}
3,James decides to run 3 sprints 3 times a week....,He sprints 3*3=<<3*3=9>>9 times\nSo he runs 9*...,"Okay, let's see. James is doing 3 sprints, and...","James runs 3 sprints each time, and each sprin..."
4,"Every day, Wendi feeds each of her chickens th...","If each chicken eats 3 cups of feed per day, t...","First, I determine the total amount of feed ea...","First, determine the total daily feed required..."


In [59]:
# Stack the processed rows and the repaired rows into one frame with a fresh
# 0..n-1 index, then persist the combined frame.
combined_df = pd.concat([processed_df, fixed_df], ignore_index=True)
combined_df.to_parquet('data/combined_test_data.parquet')

In [60]:
# Keep the first 100 rows and drop those whose answer is the placeholder
# 'No answer provided'.
combined_df_processed = combined_df[:100]
# .copy() makes the filtered result an independent frame, so columns added to
# it later do not trigger pandas' SettingWithCopyWarning.
combined_df_processed = combined_df_processed[combined_df_processed['model_answer'] != 'No answer provided'].copy()
combined_df_processed

Unnamed: 0,question,answer,reasoning_trace,model_answer
0,Janet’s ducks lay 16 eggs per day. She eats th...,Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eg...,"Okay, let's try to figure out how much money J...",\boxed{18}
1,A robe takes 2 bolts of blue fiber and half th...,It takes 2/2=<<2/2=1>>1 bolt of white fiber\nS...,"Okay, let me figure out this robe problem. So,...",The problem states that a robe requires 2 bolt...
2,Josh decides to try flipping a house. He buys...,The cost of the house and repairs came out to ...,"Okay, so Josh is trying to flip a house. Let m...",\boxed{70000}
3,James decides to run 3 sprints 3 times a week....,He sprints 3*3=<<3*3=9>>9 times\nSo he runs 9*...,"Okay, let's see. James is doing 3 sprints, and...","James runs 3 sprints each time, and each sprin..."
4,"Every day, Wendi feeds each of her chickens th...","If each chicken eats 3 cups of feed per day, t...","First, I determine the total amount of feed ea...","First, determine the total daily feed required..."
...,...,...,...,...
95,"Out of the 200 Grade 5 students, 2/5 are boys ...",There are 200 x 2/5 = <<200*2/5=80>>80 boys in...,"Alright, let's tackle this problem step by ste...",\boxed{40}``\n\n**Step-by-Step Explanation:**\...
96,Harry slept 9 hours last night. His friend Jam...,James slept 9 hours * 2/3 = <<9*2/3=6>>6 hours...,"Okay, let's see. Harry slept for 9 hours last ...",Harry slept 9 hours. To find James's sleep dur...
97,"When Freda cooks canned tomatoes into sauce, t...",Tomatoes lose half their volume when made into...,"Okay, let me try to solve this problem. Hmm, F...",oxed{12}
98,Cars have lined up on the motorway. Some of th...,The cars that exited do not need to be include...,"Okay, let me try to figure this out step by st...",\boxed{5}


In [61]:
import pandas as pd
import re
import openai
import os
from tqdm import tqdm

# Function to normalize decimal numbers to whole numbers if applicable
def normalize_decimal(answer_str):
    """
    Rewrite the first number in a string as an integer when it is whole.

    Only the first numeric token is examined, and only that exact occurrence
    is rewritten (e.g. "75.00" -> "75"); any later number in the string is
    left untouched. Strings without a number, strings whose first number has
    a fractional part, and non-string inputs are returned unchanged.
    """
    if not isinstance(answer_str, str):
        return answer_str

    # Look for the first numeric pattern in the answer
    numeric_match = re.search(r'[-+]?\d*\.?\d+', answer_str)
    if numeric_match is None:
        return answer_str

    try:
        number = float(numeric_match.group(0))
    except (ValueError, OverflowError):
        # Not convertible after all: leave the text as it is
        return answer_str

    if not number.is_integer():
        return answer_str

    # Splice the integer form into the matched span only; str.replace would
    # also rewrite every other occurrence of the same text in the string.
    start, end = numeric_match.span()
    return f"{answer_str[:start]}{int(number)}{answer_str[end:]}"

# Function to extract answers from \boxed{X} format
def extract_boxed_answer(text):
    """
    Return the content of the first \\boxed{...} in text, or None.

    The content is stripped of surrounding whitespace and passed through
    normalize_decimal. Non-string input and text without a boxed expression
    both yield None.
    """
    if isinstance(text, str):
        found = re.search(r'\\boxed\{(.*?)\}', text)
        if found is not None:
            # Convert decimal to whole number if possible
            return normalize_decimal(found.group(1).strip())
    return None

# Function to use GPT-4o to extract the answer when \boxed{} is not present
def extract_with_gpt4o(model_answer):
    """
    Ask GPT-4o to pull the final numerical answer out of a free-text answer.

    The OpenAI key is read from the OPENAI_API_KEY environment variable on
    every call. Returns the model's reply, stripped and passed through
    normalize_decimal, or None when anything goes wrong (the error is printed).
    """
    # Set your OpenAI API key
    openai.api_key = os.environ.get("OPENAI_API_KEY")
    
    prompt = f"""
    Below is a model's answer for a math problem from the GSM8k dataset. 
    Extract just the final numerical answer from this reasoning. 
    Return ONLY the number or calculation result, with no additional text or explanation.
    If there are multiple numbers, identify the one that represents the final answer.
    If the answer is a decimal number that can be expressed as a whole number (like 75.00), convert it to the whole number (75).
    
    Reasoning trace:
    {model_answer}
    
    Final answer (number only):
    """

    chat_messages = [
        {"role": "system", "content": "You are a helpful assistant that extracts final numerical answers from math reasoning traces."},
        {"role": "user", "content": prompt},
    ]

    try:
        # temperature=0 is requested for this extraction call
        completion = openai.chat.completions.create(
            model="gpt-4o",
            messages=chat_messages,
            temperature=0,
        )
        reply = completion.choices[0].message.content.strip()
        # Normalize decimal numbers in GPT-4o response as well
        return normalize_decimal(reply)
    except Exception as e:
        print(f"Error calling GPT-4o: {e}")
        return None

# Main function to process the dataframe
def process_answers(df):
    """
    Fill an 'extracted_answer' column from the 'model_answer' column of df.

    Answers in \\boxed{...} form are extracted directly; every other row is
    sent to GPT-4o. All non-missing results are then passed through
    normalize_decimal once more. The frame is modified in place and returned.
    """
    df['extracted_answer'] = None

    # Pass 1: take boxed answers directly, remember the rows that have none
    pending = []
    for label, raw_answer in df['model_answer'].items():
        boxed = extract_boxed_answer(raw_answer)
        print(boxed)
        if boxed is None:
            pending.append(label)
        else:
            df.at[label, 'extracted_answer'] = boxed

    # Pass 2: fall back to GPT-4o for the remaining rows
    if pending:
        print(f"Using GPT-4o to extract answers for {len(pending)} rows...")
        for label in tqdm(pending):
            df.at[label, 'extracted_answer'] = extract_with_gpt4o(df.at[label, 'model_answer'])

    # Pass 3: normalise whatever was extracted; iterate over a snapshot
    # because the column is rewritten while looping
    for label, current in list(df['extracted_answer'].items()):
        if current is not None:
            df.at[label, 'extracted_answer'] = normalize_decimal(str(current))

    return df

# Extract an answer for every row (adds the 'extracted_answer' column), then
# print the question, the raw model answer and the extracted answer side by side.
combined_df_processed = process_answers(combined_df_processed)
print(combined_df_processed[['question', 'model_answer', 'extracted_answer']])

18
3
70000
None
20
64
None
None
None
None
None
694
None
18
60
125
None
None
7
None
15
None
None
None
26
2
None
16
None
104
109
80
None
70
23
9
None
2
10
18
200
None
None
None
None
None
800
None
None
None
5
15
None
40
None
None
83
57
None
17
1430
25000
1596
None
36
48
595
36
None
7425
None
221
None
88
60
None
None
None
None
None
None
623
None
None
None
9360
8000
24
225
28
4
348
40
3
None
5
58
Using GPT-4o to extract answers for 43 rows...


  0%|          | 0/43 [00:00<?, ?it/s]

100%|██████████| 43/43 [00:17<00:00,  2.51it/s]

                                             question  \
0   Janet’s ducks lay 16 eggs per day. She eats th...   
1   A robe takes 2 bolts of blue fiber and half th...   
2   Josh decides to try flipping a house.  He buys...   
3   James decides to run 3 sprints 3 times a week....   
4   Every day, Wendi feeds each of her chickens th...   
..                                                ...   
95  Out of the 200 Grade 5 students, 2/5 are boys ...   
96  Harry slept 9 hours last night. His friend Jam...   
97  When Freda cooks canned tomatoes into sauce, t...   
98  Cars have lined up on the motorway. Some of th...   
99  Mary is an avid gardener. Yesterday, she recei...   

                                         model_answer extracted_answer  
0                                          \boxed{18}               18  
1   The problem states that a robe requires 2 bolt...                3  
2                                       \boxed{70000}            70000  
3   James runs 3 sprint




In [62]:
# Display the processed frame for inspection (e.g. to spot None values in
# 'extracted_answer'); no count is computed in this cell.
combined_df_processed

Unnamed: 0,question,answer,reasoning_trace,model_answer,extracted_answer
0,Janet’s ducks lay 16 eggs per day. She eats th...,Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eg...,"Okay, let's try to figure out how much money J...",\boxed{18},18
1,A robe takes 2 bolts of blue fiber and half th...,It takes 2/2=<<2/2=1>>1 bolt of white fiber\nS...,"Okay, let me figure out this robe problem. So,...",The problem states that a robe requires 2 bolt...,3
2,Josh decides to try flipping a house. He buys...,The cost of the house and repairs came out to ...,"Okay, so Josh is trying to flip a house. Let m...",\boxed{70000},70000
3,James decides to run 3 sprints 3 times a week....,He sprints 3*3=<<3*3=9>>9 times\nSo he runs 9*...,"Okay, let's see. James is doing 3 sprints, and...","James runs 3 sprints each time, and each sprin...",540
4,"Every day, Wendi feeds each of her chickens th...","If each chicken eats 3 cups of feed per day, t...","First, I determine the total amount of feed ea...","First, determine the total daily feed required...",20
...,...,...,...,...,...
95,"Out of the 200 Grade 5 students, 2/5 are boys ...",There are 200 x 2/5 = <<200*2/5=80>>80 boys in...,"Alright, let's tackle this problem step by ste...",\boxed{40}``\n\n**Step-by-Step Explanation:**\...,40
96,Harry slept 9 hours last night. His friend Jam...,James slept 9 hours * 2/3 = <<9*2/3=6>>6 hours...,"Okay, let's see. Harry slept for 9 hours last ...",Harry slept 9 hours. To find James's sleep dur...,3
97,"When Freda cooks canned tomatoes into sauce, t...",Tomatoes lose half their volume when made into...,"Okay, let me try to solve this problem. Hmm, F...",oxed{12},12
98,Cars have lined up on the motorway. Some of th...,The cars that exited do not need to be include...,"Okay, let me try to figure this out step by st...",\boxed{5},5


In [63]:


def extract_ground_truth(answer_text):
    """
    Return the text that follows '####' at the end of a reference solution.

    Example: '...3 bolts of fabric\n#### 3' gives '3'. The marker has to be
    on the last line; non-string input or text without a matching marker
    gives None.
    """
    if isinstance(answer_text, str):
        # Look for the pattern '#### X'
        found = re.search(r'####\s*(.*?)$', answer_text)
        if found:
            return found.group(1).strip()
    return None

def normalize_answer(answer):
    """
    Reduce an answer to a canonical string for comparison.

    None becomes "", any other non-string is converted with str(). Strings
    are stripped and lower-cased, commas between digits and '$'/'%' signs
    are removed, and if a number is present only the first number is
    returned; otherwise the cleaned text is returned.
    """
    if answer is None:
        return ""
    if not isinstance(answer, str):
        return str(answer)

    cleaned = answer.strip().lower()
    # Remove commas from numbers
    cleaned = re.sub(r'(\d),(\d)', r'\1\2', cleaned)
    # Remove dollar signs, percentage signs, etc.
    cleaned = re.sub(r'[$%]', '', cleaned)

    # Keep just the numerical part when there is one
    number = re.search(r'[-+]?\d*\.?\d+', cleaned)
    return number.group(0) if number else cleaned

def evaluate_performance(df):
    """
    Evaluate the performance of extracted model answers against ground truth.

    Adds the columns 'ground_truth', 'normalized_extracted' and
    'normalized_ground_truth' to df (in place), prints a summary with up to
    three correct and three incorrect examples, and returns the metrics.

    Args:
        df: DataFrame with 'question', 'answer' and 'extracted_answer' columns

    Returns:
        dict with 'accuracy', 'correct_count', 'total_count' and
        'incorrect_examples' (a DataFrame of the rows scored as wrong)
    """
    print("Extracting ground truth answers...")
    df['ground_truth'] = df['answer'].apply(extract_ground_truth)

    # Check if extraction worked
    missing_ground_truth = df['ground_truth'].isna().sum()
    if missing_ground_truth > 0:
        print(f"Warning: {missing_ground_truth} rows have missing ground truth answers")

    # Normalize both extracted answers and ground truth for fair comparison
    print("Normalizing answers for comparison...")
    df['normalized_extracted'] = df['extracted_answer'].apply(normalize_answer)
    df['normalized_ground_truth'] = df['ground_truth'].apply(normalize_answer)

    # A missing value normalizes to ""; without the second condition a row
    # with neither an extracted answer nor a ground truth would compare
    # equal and be scored as correct.
    correct = (
        (df['normalized_extracted'] == df['normalized_ground_truth'])
        & (df['normalized_ground_truth'] != "")
    )
    accuracy = correct.mean()

    print("\nEvaluation Results:")
    print(f"Total examples: {len(df)}")
    print(f"Correct answers: {correct.sum()}")
    print(f"Accuracy: {accuracy:.2%}")

    def _show_samples(title, rows):
        # Print the first three rows of a selection in a fixed layout
        print(title)
        for _, row in rows.head(3).iterrows():
            print(f"Question: {row['question']}...")
            print(f"Ground Truth: {row['ground_truth']}")
            print(f"Model Answer: {row['extracted_answer']}")
            print("-" * 80)

    # Display a few examples of correct and incorrect predictions
    _show_samples("\nSample of correct predictions:", df[correct])
    _show_samples("\nSample of incorrect predictions:", df[~correct])

    # Return evaluation metrics
    return {
        'accuracy': accuracy,
        'correct_count': correct.sum(),
        'total_count': len(df),
        'incorrect_examples': df[~correct][['question', 'ground_truth', 'extracted_answer']]
    }

def evaluate_with_tolerance(df, tolerance=0.01):
    """
    Evaluate with tolerance for numerical answers to account for rounding differences

    Runs evaluate_performance first (which adds the normalized columns to
    df), then re-scores every row: a row counts as correct when the
    normalized strings are identical, or when both parse as numbers that
    differ by at most tolerance.

    Args:
        df: DataFrame accepted by evaluate_performance
        tolerance: Largest absolute difference still counted as a match

    Returns:
        dict with 'exact_match' and 'with_tolerance' accuracies
    """
    results = {}

    # First try exact match
    exact_match = evaluate_performance(df)
    results['exact_match'] = exact_match['accuracy']

    # Try numerical comparison with tolerance for numeric answers
    print("\nEvaluating with numerical tolerance...")

    def to_float(value):
        # Parse once; None signals "not a number"
        try:
            return float(value)
        except (ValueError, TypeError):
            return None

    def is_match(extracted, ground_truth):
        # An empty ground truth means there is nothing to compare against,
        # so the row cannot be correct even if the extraction is empty too.
        if ground_truth == "":
            return False
        if extracted == ground_truth:
            return True
        extracted_num = to_float(extracted)
        truth_num = to_float(ground_truth)
        if extracted_num is None or truth_num is None:
            return False
        return abs(extracted_num - truth_num) <= tolerance

    correct_with_tolerance = [
        is_match(extracted, ground_truth)
        for extracted, ground_truth in zip(
            df['normalized_extracted'], df['normalized_ground_truth']
        )
    ]

    accuracy_with_tolerance = np.mean(correct_with_tolerance)
    print(f"Accuracy with tolerance: {accuracy_with_tolerance:.2%}")

    results['with_tolerance'] = accuracy_with_tolerance
    return results

# Full pipeline: Load data, extract answers, evaluate
def run_full_pipeline(df):
    """
    Evaluate a DataFrame of model answers and return the columns worth keeping.

    Prints a short description of ``df``, bails out (returning None) when the
    ``extracted_answer`` column is absent, otherwise runs
    ``evaluate_with_tolerance`` and returns the selected result columns.
    """
    column_names = df.columns.tolist()
    print(f"Loaded DataFrame with {len(df)} rows and columns: {column_names}")

    # The extraction step must already have produced 'extracted_answer'.
    has_extracted_answers = 'extracted_answer' in df.columns
    if not has_extracted_answers:
        print("Error: 'extracted_answer' column not found. Run extract_answers script first.")
        return None

    # Run the evaluation; the metrics it returns are not used here.
    evaluate_with_tolerance(df)

    # Columns reported back to the caller, in display order.
    output_columns = [
        'question',
        'answer',
        'ground_truth',
        'model_answer',
        'extracted_answer',
        'normalized_ground_truth',
        'normalized_extracted',
        'reasoning_trace',
    ]
    result_df = df[output_columns]
    print("\nEvaluation complete.")
    return result_df

# Evaluate the combined results. The name is rebound to the column subset
# returned by run_full_pipeline, which is None when 'extracted_answer' is
# missing from the input frame.
combined_df_processed = run_full_pipeline(combined_df_processed)

Loaded DataFrame with 97 rows and columns: ['question', 'answer', 'reasoning_trace', 'model_answer', 'extracted_answer']
Extracting ground truth answers...
Normalizing answers for comparison...

Evaluation Results:
Total examples: 97
Correct answers: 96
Accuracy: 98.97%

Sample of correct predictions:
Question: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?...
Ground Truth: 18
Model Answer: 18
--------------------------------------------------------------------------------
Question: A robe takes 2 bolts of blue fiber and half that much white fiber.  How many bolts in total does it take?...
Ground Truth: 3
Model Answer: 3
--------------------------------------------------------------------------------
Question: Josh decides to try flipping a house.  He buys

In [64]:
combined_df_processed

Unnamed: 0,question,answer,ground_truth,model_answer,extracted_answer,normalized_ground_truth,normalized_extracted,reasoning_trace
0,Janet’s ducks lay 16 eggs per day. She eats th...,Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eg...,18,\boxed{18},18,18,18,"Okay, let's try to figure out how much money J..."
1,A robe takes 2 bolts of blue fiber and half th...,It takes 2/2=<<2/2=1>>1 bolt of white fiber\nS...,3,The problem states that a robe requires 2 bolt...,3,3,3,"Okay, let me figure out this robe problem. So,..."
2,Josh decides to try flipping a house. He buys...,The cost of the house and repairs came out to ...,70000,\boxed{70000},70000,70000,70000,"Okay, so Josh is trying to flip a house. Let m..."
3,James decides to run 3 sprints 3 times a week....,He sprints 3*3=<<3*3=9>>9 times\nSo he runs 9*...,540,"James runs 3 sprints each time, and each sprin...",540,540,540,"Okay, let's see. James is doing 3 sprints, and..."
4,"Every day, Wendi feeds each of her chickens th...","If each chicken eats 3 cups of feed per day, t...",20,"First, determine the total daily feed required...",20,20,20,"First, I determine the total amount of feed ea..."
...,...,...,...,...,...,...,...,...
95,"Out of the 200 Grade 5 students, 2/5 are boys ...",There are 200 x 2/5 = <<200*2/5=80>>80 boys in...,40,\boxed{40}``\n\n**Step-by-Step Explanation:**\...,40,40,40,"Alright, let's tackle this problem step by ste..."
96,Harry slept 9 hours last night. His friend Jam...,James slept 9 hours * 2/3 = <<9*2/3=6>>6 hours...,3,Harry slept 9 hours. To find James's sleep dur...,3,3,3,"Okay, let's see. Harry slept for 9 hours last ..."
97,"When Freda cooks canned tomatoes into sauce, t...",Tomatoes lose half their volume when made into...,12,oxed{12},12,12,12,"Okay, let me try to solve this problem. Hmm, F..."
98,Cars have lined up on the motorway. Some of th...,The cars that exited do not need to be include...,5,\boxed{5},5,5,5,"Okay, let me try to figure this out step by st..."


In [65]:
combined_df_processed[combined_df_processed['extracted_answer'] != combined_df_processed['normalized_ground_truth']]

Unnamed: 0,question,answer,ground_truth,model_answer,extracted_answer,normalized_ground_truth,normalized_extracted,reasoning_trace
12,Carlos is planting a lemon tree. The tree will...,He makes $10.5 selling lemons each year becaus...,13,oxed{12}.,12,13,12,"Okay, let's see. Carlos is planting a lemon tr..."


In [68]:
import pandas as pd
import openai
import json
import re
from pydantic import BaseModel
import time
from tqdm import tqdm
import os

# Define the output schema using Pydantic
# Question/answer pair; its JSON schema is passed as the response_format
# schema in generate_new_reasoning. Documented with comments rather than a
# docstring so the generated schema is left exactly as it was.
class QAResult(BaseModel):
    question: str  # the question text
    answer: str  # the answer text

# Function to call GPT-4o with retry mechanism
def call_gpt4o(messages, temperature=0, max_retries=3, retry_delay=5):
    """Call GPT-4o API with retry logic.

    Args:
        messages: Chat messages (list of ``{"role": ..., "content": ...}``
            dicts) forwarded to the chat completions endpoint.
        temperature: Sampling temperature forwarded to the API.
        max_retries: Maximum number of attempts before giving up.
        retry_delay: Seconds to sleep between failed attempts.

    Returns:
        str: The stripped message content, or ``""`` when every attempt
        failed, the model returned no content, or ``max_retries`` is not
        positive.
    """
    openai_client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

    for attempt in range(max_retries):
        try:
            response = openai_client.chat.completions.create(
                model="gpt-4o",
                messages=messages,
                temperature=temperature
            )
            content = response.choices[0].message.content
            # The content may be None; treat that as an empty answer instead
            # of failing on .strip() and burning the remaining retries.
            return (content or "").strip()
        except Exception as e:
            # Best-effort boundary: any failure is logged and retried.
            print(f"API call attempt {attempt+1} failed: {e}")
            if attempt < max_retries - 1:
                print(f"Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
            else:
                print("Max retries reached. Returning empty string.")
                return ""

    # Reached only when max_retries <= 0; keep the return type a string.
    return ""

# Function to normalize decimal numbers to whole numbers if applicable
def normalize_decimal(answer_str):
    """Rewrite the first number in ``answer_str`` as an integer when it is whole.

    Only the first numeric token is considered, and only that exact
    occurrence is rewritten; the rest of the string is left untouched
    (a blanket ``str.replace`` would also rewrite later numbers that merely
    contain the same characters, e.g. the "2.0" inside "12.05").

    Args:
        answer_str: The answer to normalize. Non-string values are returned
            as ``str(value)`` without further normalization; ``None``
            becomes ``""``.

    Returns:
        str: The normalized answer, e.g. ``"18.0"`` -> ``"18"``. Strings with
        no number, or whose first number has a fractional part, are returned
        unchanged.
    """
    if not isinstance(answer_str, str):
        return str(answer_str) if answer_str is not None else ""

    # Look for the first numeric pattern in the answer
    numeric_match = re.search(r'[-+]?\d*\.?\d+', answer_str)
    if numeric_match is None:
        return answer_str

    try:
        number = float(numeric_match.group(0))
    except (ValueError, OverflowError):
        # Not convertible after all: leave the answer as it is.
        return answer_str

    if not number.is_integer():
        return answer_str

    # Splice the integer form over the matched span only.
    start, end = numeric_match.span()
    return f"{answer_str[:start]}{int(number)}{answer_str[end:]}"

# Function to summarize reasoning trace
def summarize_reasoning(reasoning_trace):
    """
    Ask GPT-4o for a condensed version of a reasoning trace.

    The system message tells the summarizer to keep key steps, intermediate
    calculations, possible errors and abandoned paths; the user message
    carries the trace itself. Returns whatever ``call_gpt4o`` returns.
    """
    system_instructions = """You are an expert at summarizing mathematical reasoning traces.
        Your goal is to produce concise but complete summaries of reasoning traces that:
        1. Preserve all key reasoning steps and intermediate calculations
        2. Highlight potential errors or invalid assumptions in the original reasoning
        3. Identify what paths of reasoning were explored and abandoned
        4. Maintain the logical flow and dependencies between steps
        5. Avoid introducing new reasoning or calculations not present in the original trace
        6. Format the summary in a clear, step-by-step manner
        7. Include key equations and numerical results with proper notation
        
        Your summary should be detailed enough that another model could understand the full reasoning process,
        but concise enough to eliminate redundancy and verbose explanations."""

    user_request = f"""Below is a reasoning trace from a math problem. 
        Summarize this reasoning trace while preserving all key steps, intermediate calculations, 
        and potential errors in the original reasoning.
        
        REASONING TRACE:
        {reasoning_trace}
        
        SUMMARY:"""

    conversation = [
        {"role": "system", "content": system_instructions},
        {"role": "user", "content": user_request},
    ]
    return call_gpt4o(conversation)

# Function to evaluate question, reasoning summary, and answer
def evaluate_reasoning(question, summarized_reasoning, answer):
    """
    Evaluate the summarized reasoning trace and answer, determining if the model should try again.

    The evaluator (GPT-4o) is asked to start its reply with CORRECT/INCORRECT
    and to end it with "TRY AGAIN: YES" or "TRY AGAIN: NO". Both markers are
    parsed case-insensitively and tolerate surrounding markdown (for example
    "**CORRECT**" or "**Try Again:** Yes"), since the reply does not always
    follow the requested format to the letter.

    Args:
        question: The original question text.
        summarized_reasoning: Summary of the reasoning trace being judged.
        answer: The final answer being judged.

    Returns:
        dict: ``try_again`` (bool), ``is_correct`` (bool) and ``evaluation``
        (the evaluator's raw reply; ``""`` when the call produced nothing).
    """
    evaluator_prompt = [
        {"role": "system", "content": """You are an expert mathematical reasoning evaluator.
        Your job is to critically evaluate a summarized reasoning trace and determine if:
        1. The reasoning is sound and leads to the correct answer
        2. There are logical errors, calculation mistakes, or misunderstandings of the problem
        3. The reasoning path was incomplete and requires further steps
        4. The final answer is correct

        Be particularly attentive to:
        - Units and conversions
        - Order of operations
        - Algebraic manipulations
        - Conceptual misunderstandings
        - Edge cases or constraints missed in the reasoning
        
        Provide a clear YES or NO verdict on whether the model should try again, 
        followed by specific, actionable feedback explaining why."""},
        {"role": "user", "content": f"""Question:
        {question}
        
        Summarized Reasoning:
        {summarized_reasoning}
        
        Final Answer:
        {answer}
        
        Evaluate the reasoning and the final answer. 
        First, determine if the answer is CORRECT or INCORRECT.
        Then, provide a detailed explanation of any errors or issues in the reasoning.
        Finally, explicitly state whether the model should TRY AGAIN (YES) or if the reasoning is sound (NO).
        
        Your response must begin with either "CORRECT" or "INCORRECT" on the first line.
        Your response must end with either "TRY AGAIN: YES" or "TRY AGAIN: NO" on the last line."""}
    ]

    # Guard against a missing reply so the parsing below never sees None.
    evaluation = call_gpt4o(evaluator_prompt) or ""

    # "TRY AGAIN: YES", allowing case differences and markdown between the
    # colon and the verdict.
    try_again = re.search(
        r'TRY\s+AGAIN\s*:\W*YES', evaluation, re.IGNORECASE
    ) is not None

    # The reply must open with the whole word CORRECT (leading whitespace or
    # markdown is skipped); "INCORRECT" and "Correctness ..." do not qualify.
    is_correct = re.match(
        r'[\W_]*CORRECT\b', evaluation, re.IGNORECASE
    ) is not None

    return {
        "try_again": try_again,
        "is_correct": is_correct,
        "evaluation": evaluation
    }

# Function to generate new reasoning using DeepSeek-R1
def generate_new_reasoning(question, summarized_reasoning):
    """
    Generate new reasoning using the DeepSeek-R1 model with the summarized reasoning as context.

    The prompt is the question followed by a ``<think>`` section seeded with
    the summary and a continuation marker. The reply is split at
    ``</think>``: the text before it is the reasoning, the text after it is
    parsed as a ``QAResult`` JSON object when possible.

    Args:
        question: The question to solve.
        summarized_reasoning: Summary of the previous reasoning attempt.

    Returns:
        dict: ``{"reasoning": str, "answer": str}``. Both values are empty
        strings when the API call or the response handling fails.
    """
    # Initialize the Fireworks client
    client = openai.OpenAI(
        base_url="https://api.fireworks.ai/inference/v1",
        api_key=os.getenv("FIREWORKS_API_KEY"),
    )

    # Construct the message with the <think> tag and summarized reasoning
    continuation_marker = "Wait, I need to continue reasoning."
    prompt = f"{question}\n\n<think>\n{summarized_reasoning}\n\n{continuation_marker}"

    print(f"Prompt for DeepSeek-R1:\n{prompt}")

    messages = [{"role": "user", "content": prompt}]

    # Make the API call to the model
    try:
        response = client.chat.completions.create(
            model="accounts/fireworks/models/deepseek-r1",
            messages=messages,
            response_format={"type": "json_object", "schema": QAResult.model_json_schema()},
            max_tokens=3000,
        )

        # Extract the content of the response
        response_content = response.choices[0].message.content

        print(f"Response from DeepSeek-R1:\n{response_content}")

        # Everything before the first </think> is reasoning, the rest is the
        # answer. Without a closing tag the whole reply counts as reasoning.
        think_content, closing_tag, after_think = response_content.partition("</think>")

        # Drop the echoed prompt only when the marker is actually present.
        # A reply that does not echo the prompt is kept whole: blindly adding
        # len(marker) to find()'s -1 would cut off the first 34 characters.
        marker_position = think_content.find(continuation_marker)
        if marker_position != -1:
            think_content = think_content[marker_position + len(continuation_marker):]

        new_reasoning = think_content.strip()
        # The opening tag is markup, not reasoning.
        if new_reasoning.startswith("<think>"):
            new_reasoning = new_reasoning[len("<think>"):].strip()

        # Extract the answer part (after </think>)
        answer_part = ""
        if closing_tag:
            answer_part = after_think.strip()
            try:
                qa_result = json.loads(answer_part)
            except ValueError:
                # Not JSON: keep the raw text as the answer.
                pass
            else:
                if isinstance(qa_result, dict):
                    parsed_answer = qa_result.get("answer", "")
                    # Downstream code treats the answer as a string.
                    answer_part = "" if parsed_answer is None else str(parsed_answer)

        # Combine the new reasoning with the answer
        return {"reasoning": new_reasoning, "answer": answer_part}

    except Exception as e:
        # Best-effort boundary: report the failure and return an empty result.
        print(f"Error calling DeepSeek-R1: {e}")
        return {"reasoning": "", "answer": ""}

# Function to extract answer from the model's reasoning and JSON output
def extract_answer_from_response(response_dict):
    """
    Extract the answer from the model response dictionary.

    Sources are tried in order:
      1. the ``answer`` field (unwrapped first when it is itself a JSON
         object with an ``answer`` key);
      2. the first ``\\boxed{...}`` expression in the ``reasoning`` field;
      3. a GPT-4o call asked to pull the final number out of the reasoning.

    Args:
        response_dict: Dict with optional ``answer`` and ``reasoning`` keys.

    Returns:
        str: The extracted answer after ``normalize_decimal``.
    """
    # Try to get answer from the answer field
    answer = response_dict.get("answer")
    if answer:
        # Try to extract from JSON if it looks like JSON. Non-string answers
        # (e.g. a bare number) go straight to normalization.
        looks_like_json = (
            isinstance(answer, str)
            and answer.startswith("{")
            and answer.endswith("}")
        )
        if looks_like_json:
            try:
                answer_json = json.loads(answer)
            except ValueError:
                # Braces but not valid JSON: use the text as it is.
                answer_json = None
            if isinstance(answer_json, dict) and "answer" in answer_json:
                answer = answer_json["answer"]

        return normalize_decimal(answer)

    # If no answer field, try to extract from reasoning
    reasoning = response_dict.get("reasoning") or ""

    # Look for \boxed{X} pattern
    # NOTE(review): the first match is used; if a trace boxes intermediate
    # values, the last match may be the intended final answer -- confirm.
    pattern = r'\\boxed\{(.*?)\}'
    matches = re.findall(pattern, reasoning)

    if matches:
        answer = matches[0].strip()
        return normalize_decimal(answer)

    # If still no answer, use GPT-4o to extract it
    extract_prompt = [
        {"role": "system", "content": "Extract the final numerical answer from the reasoning trace."},
        {"role": "user", "content": f"""Extract only the final numerical answer from this reasoning:
        
        {reasoning}
        
        Return ONLY the number with no additional text:"""}
    ]

    extracted = call_gpt4o(extract_prompt)
    return normalize_decimal(extracted)

# Main function to run the recursive reasoning pipeline
def run_recursive_reasoning_pipeline(incorrect_df, max_iterations=2):
    """
    Run the summarizer -> evaluator -> reasoning pipeline on incorrect questions.

    For each row the current reasoning is summarized, evaluated, and used to
    seed a new DeepSeek-R1 attempt, up to ``max_iterations`` times. Stopping
    early on the evaluator's verdict is currently disabled, so a new attempt
    is generated on every iteration. Intermediate results are written to
    'recursive_reasoning_results.csv' after each question.

    Args:
        incorrect_df: DataFrame with incorrect answers. Reads the columns
            ``question``, ``extracted_answer``, ``normalized_ground_truth``
            and ``reasoning_trace`` (falling back to ``model_answer``).
        max_iterations: Maximum number of reasoning iterations to try

    Returns:
        DataFrame with original and new reasoning attempts
    """
    # Create a new DataFrame to store results
    results_df = incorrect_df.copy()

    # Add columns for the pipeline outputs
    results_df['summarized_reasoning'] = None
    results_df['evaluation_result'] = None
    results_df['new_reasoning'] = None
    results_df['new_answer'] = None
    results_df['is_corrected'] = False

    total_questions = len(results_df)
    progress = tqdm(results_df.iterrows(), total=total_questions, desc="Processing incorrect questions")

    # `i` is the row's index label (used for .at writes); `position` is the
    # 1-based running count, which is what the progress message should show.
    for position, (i, row) in enumerate(progress, start=1):
        question = row['question']
        original_reasoning = row['reasoning_trace'] if 'reasoning_trace' in row else row['model_answer']
        ground_truth = row['normalized_ground_truth']

        print(f"\nProcessing question {position}/{total_questions}:")
        print(f"Question: {question[:100]}...")

        # Current reasoning starts with the original
        current_reasoning = original_reasoning
        current_answer = row['extracted_answer']

        # Iterate through the pipeline up to max_iterations
        for iteration in range(max_iterations):
            print(f"\nIteration {iteration+1}:")

            # Step 1: Summarize the reasoning trace
            print("Summarizing reasoning...")
            summarized_reasoning = summarize_reasoning(current_reasoning)
            if iteration == 0:  # Only save the first summary
                results_df.at[i, 'summarized_reasoning'] = summarized_reasoning

            # Step 2: Evaluate the reasoning and answer
            print("Evaluating reasoning and answer...")
            eval_result = evaluate_reasoning(question, summarized_reasoning, current_answer)
            if iteration == 0:  # Only save the first evaluation
                results_df.at[i, 'evaluation_result'] = eval_result['evaluation']

            # Check if we should try again
            if not eval_result['try_again']:
                print("Evaluator says no need to try again.")
                # Early exit intentionally disabled until the evaluator is
                # more reliable; the loop continues regardless.
                # break
                print("Commenting out breakpoint for now, should have a more robust evaluator.")

            # Step 3: Generate new reasoning with DeepSeek-R1
            print("Generating new reasoning with DeepSeek-R1...")
            new_response = generate_new_reasoning(question, summarized_reasoning)

            # Update current reasoning and answer for next iteration
            if new_response["reasoning"]:
                current_reasoning = new_response["reasoning"]
                current_answer = extract_answer_from_response(new_response)

            # If this is the final iteration or we're stopping, save the results
            if iteration == max_iterations - 1 or not eval_result['try_again']:
                results_df.at[i, 'new_reasoning'] = current_reasoning
                results_df.at[i, 'new_answer'] = current_answer

                # Check if the new answer matches ground truth
                normalized_new_answer = normalize_decimal(current_answer)
                is_correct = normalized_new_answer == ground_truth
                results_df.at[i, 'is_corrected'] = is_correct

                print(f"New answer: {current_answer}")
                print(f"Correct: {is_correct}")

        # Save intermediate results after each question
        results_df.to_csv('recursive_reasoning_results.csv', index=False)

    # Calculate overall improvement (0% when there was nothing to process)
    new_correct = sum(results_df['is_corrected'])
    improvement = new_correct / total_questions * 100 if total_questions else 0.0

    print(f"\nResults summary:")
    print(f"Total incorrect questions processed: {total_questions}")
    print(f"Questions corrected: {new_correct}")
    print(f"Improvement rate: {improvement:.2f}%")

    return results_df

# Function to execute the pipeline
def execute_pipeline(combined_df_processed):
    """
    Run the recursive reasoning pipeline on every incorrectly answered row
    and report the resulting accuracy change.

    Rows count as incorrect when ``normalized_extracted`` differs from
    ``normalized_ground_truth``. Writes
    'recursive_reasoning_final_results.csv' and
    'reasoning_pipeline_summary.csv'.

    Args:
        combined_df_processed: Evaluated DataFrame containing the
            ``normalized_extracted`` and ``normalized_ground_truth`` columns
            plus whatever ``run_recursive_reasoning_pipeline`` reads.

    Returns:
        tuple: ``(results, summary)`` -- the per-question pipeline results
        and a two-column Metric/Value summary DataFrame.
    """
    # Identify incorrect questions
    print("Identifying incorrect questions...")
    extracted = combined_df_processed['normalized_extracted']
    expected = combined_df_processed['normalized_ground_truth']
    incorrect_df = combined_df_processed[extracted != expected]
    print(f"Found {len(incorrect_df)} incorrect questions.")

    # Sample a small subset for testing if needed
    # Uncomment for testing with just a few examples
    # incorrect_df = incorrect_df.head(3)

    # Run the recursive reasoning pipeline
    print("Starting recursive reasoning pipeline...")
    results = run_recursive_reasoning_pipeline(incorrect_df, max_iterations=2)

    # Save final results
    results.to_csv('recursive_reasoning_final_results.csv', index=False)

    # Calculate improvement stats. Ratios fall back to 0 when their
    # denominator is empty (no rows at all, or no incorrect rows).
    total_count = len(combined_df_processed)
    incorrect_count = len(incorrect_df)
    originally_correct = len(combined_df_processed[extracted == expected])
    corrected_count = results['is_corrected'].sum()

    original_accuracy = originally_correct / total_count if total_count else 0.0
    new_accuracy = (originally_correct + corrected_count) / total_count if total_count else 0.0
    correction_rate = corrected_count / incorrect_count if incorrect_count else 0.0

    print("\n=== FINAL RESULTS ===")
    print(f"Original accuracy: {original_accuracy:.2%}")
    print(f"Questions corrected by pipeline: {corrected_count}/{incorrect_count} ({correction_rate:.2%})")
    print(f"New overall accuracy: {new_accuracy:.2%}")
    print(f"Absolute accuracy improvement: {new_accuracy - original_accuracy:.2%}")

    # Create a summary DataFrame for easy sharing
    summary = pd.DataFrame({
        'Metric': [
            'Total questions',
            'Originally correct',
            'Originally incorrect',
            'Corrected by pipeline',
            'Final correct',
            'Original accuracy',
            'Pipeline correction rate',
            'Final accuracy',
            'Absolute improvement'
        ],
        'Value': [
            total_count,
            total_count - incorrect_count,
            incorrect_count,
            corrected_count,
            total_count - incorrect_count + corrected_count,
            f"{original_accuracy:.2%}",
            f"{correction_rate:.2%}",
            f"{new_accuracy:.2%}",
            f"{new_accuracy - original_accuracy:.2%}"
        ]
    })

    # Save summary
    summary.to_csv('reasoning_pipeline_summary.csv', index=False)
    print("\nSummary saved to 'reasoning_pipeline_summary.csv'")

    return results, summary

# Example usage: run the whole pipeline on the evaluated frame. This calls
# the summarizer/evaluator/reasoning steps for every incorrect row and writes
# 'recursive_reasoning_final_results.csv' and 'reasoning_pipeline_summary.csv'.
results, summary = execute_pipeline(combined_df_processed)

Identifying incorrect questions...
Found 1 incorrect questions.
Starting recursive reasoning pipeline...


Processing incorrect questions:   0%|          | 0/1 [00:00<?, ?it/s]


Processing question 13/1:
Question: Carlos is planting a lemon tree. The tree will cost $90 to plant. Each year it will grow 7 lemons, w...

Iteration 1:
Summarizing reasoning...
Evaluating reasoning and answer...
Evaluator says no need to try again.
Commenting out breakpoint for now, should have a more robust evaluator.
Generating new reasoning with DeepSeek-R1...
Prompt for DeepSeek-R1:
Carlos is planting a lemon tree. The tree will cost $90 to plant. Each year it will grow 7 lemons, which he can sell for $1.5 each. It costs $3 a year to water and feed the tree. How many years will it take before he starts earning money on the lemon tree?

<think>
1. **Initial Setup**: Carlos invests $90 in a lemon tree. Each year, the tree produces 7 lemons, sold at $1.5 each, generating $10.5 in revenue. The annual maintenance cost is $3, resulting in an annual profit of $7.5.

2. **Objective**: Determine how many years it takes for Carlos to start earning money, meaning when his cumulative profit

Processing incorrect questions: 100%|██████████| 1/1 [02:02<00:00, 122.03s/it]

Response from DeepSeek-R1:
<think>
Okay, so Carlos is planting a lemon tree, right? He spends $90 upfront to plant it. Each year, the tree grows 7 lemons, which he can sell for $1.5 each. So first, I need to figure out how much money he makes each year from selling the lemons. Then, subtract the annual cost of $3 for water and feed to get his net profit per year. Once I have that, I can determine how many years it will take for his total profits to exceed the initial $90 investment.

Let me start by calculating his annual revenue from selling lemons. If he sells 7 lemons each year at $1.5 per lemon, then the revenue per year is 7 multiplied by 1.5. Let me do that multiplication. 7 times 1.5... 7 times 1 is 7, and 7 times 0.5 is 3.5, so total is 10.5. So that's $10.5 per year from selling lemons.

But he also has costs each year: $3 for water and feed. So his net profit each year is revenue minus costs, which would be 10.5 minus 3. Let me subtract that: 10.5 - 3 equals 7.5. So he makes 


