In [1]:
from google import genai
from google.genai import types
import pandas as pd
import json
from itertools import cycle
import ctypes
import time
from random import randint
from loguru import logger 
from tenacity import retry, stop_after_delay, stop_after_attempt
import os

client = genai.Client(api_key='GEMINI_API_KEY')

In [2]:
@retry(stop=(stop_after_delay(1) | stop_after_attempt(3)))
def generate_content(prompt, system_prompt = "", temperature=0.1, max_output_tokens=4096):
    safety_settings = [
                                                {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
                                                {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"},
                                                {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
                                                {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
                                            ]
    if system_prompt:
        config = types.GenerateContentConfig(max_output_tokens=max_output_tokens,
                                            temperature=temperature,
                                            system_instruction=[
                                                system_prompt
                                            ],
                                            safety_settings=safety_settings
                                            )
    else:
        config = types.GenerateContentConfig(max_output_tokens=max_output_tokens,
                                            temperature=temperature,
                                            safety_settings=safety_settings
                                            )
    result = client.models.generate_content(model="models/gemini-2.5-flash-preview-05-20",
                                            contents=[
                                                prompt
                                            ], 
                                            config = config
    )
    result_text = result.text
    if not result_text:
        raise Exception()
    return result_text

In [3]:
def decompose_problem(problem):
    """Prompt the model to decompose the problem into steps."""
    prompt = f"""
    You are an expert problem solver. Given the following problem, break it down into clear, logical steps needed to solve it. Provide only the steps as a numbered list, no explanations:
    
    Problem: {problem}
    
    Steps:
    """
    system_prompt = "You are a precise and logical problem solver."
    response_text = generate_content(prompt, system_prompt = system_prompt, temperature=0.2, max_output_tokens=4096)
    logger.info(response_text)
    steps = response_text.strip().split("\n")[2:]  # Skip "Steps:" header
    return [step.strip() for step in steps if step.strip()]

def execute_step(step, context):
    """Execute a single reasoning step using the model, with context from previous steps."""
    prompt = f"""
    Given the problem and previous context, solve the following step and provide the result. If calculations are needed, show them clearly. If the step involves reasoning, explain briefly. Return only the result and a brief explanation (if needed).

    Problem: {context['problem']}
    Previous context: {context.get('previous_steps', [])}
    Current step: {step}
    
    Result:
    """
    system_prompt = "You are a precise and logical problem solver."
    response_text = generate_content(prompt, system_prompt = system_prompt, temperature=0.2, max_output_tokens=4096)
    logger.info(response_text)
    return response_text.strip().split("\n")[-1]  # Extract result

def validate_step(step, result, context):
    """Validate the result of a step and decide if refinement is needed."""
    prompt = f"""
    You are a validator for a reasoning step. Given the problem, the step, and its result, check if the result is correct and logical. If incorrect or unclear, suggest a correction or clarification. Return 'Valid' if correct, or a correction suggestion if not.

    Problem: {context['problem']}
    Step: {step}
    Result: {result}
    
    Validation:
    """
    system_prompt = "You are a precise validator of logical steps."
    response_text = generate_content(prompt, system_prompt = system_prompt, temperature=0.2, max_output_tokens=4096)
    logger.info(response_text)
    return response_text.strip().split("\n")[-1]

def autodicot(problem):
    """Main AutoDiCoT function to solve a problem with directed reasoning."""
    context = {"problem": problem, "previous_steps": []}
    
    # Step 1: Decompose the problem
    steps = decompose_problem(problem)
    print("Decomposed Steps:", steps)
    
    results = []
    for step in steps:
        # Step 2: Execute the step
        result = execute_step(step, context)
        print(f"Step: {step}\nResult: {result}")
        
        # Step 3: Validate the result
        validation = validate_step(step, result, context)
        if validation != "Valid":
            print(f"Validation failed: {validation}")
            # Refine by re-executing with correction suggestion
            context["previous_steps"].append(f"{step}: {result} (Correction: {validation})")
            result = execute_step(step, context)  # Re-run with updated context
            print(f"Refined Result: {result}")
        
        context["previous_steps"].append(f"{step}: {result}")
        results.append(result)
        time.sleep(1)
        
    # Step 4: Consolidate final answer
    final_prompt = f"""
    Given the problem and all reasoning steps, provide the final answer concisely.
    
    Problem: {problem}
    Reasoning steps: {context['previous_steps']}
    
    Final Answer:
    """
    system_prompt = "You are a precise problem solver."
    response_text = generate_content(final_prompt, system_prompt = system_prompt, temperature=0.2, max_output_tokens=4096)
    logger.info(response_text)
    final_answer = response_text.strip().split("\n")[-1]
    return final_answer


In [4]:
problem = "A store offers a 20% discount on a $50 item. If there's a 5% sales tax applied after the discount, what is the final price?"
final_answer = autodicot(problem)
print(f"\nFinal Answer: {final_answer}")

[32m2025-06-16 10:19:02.094[0m | [1mINFO    [0m | [36m__main__[0m:[36mdecompose_problem[0m:[36m12[0m - [1m1. Calculate the discount amount.
2. Subtract the discount amount from the original price to find the price after discount.
3. Calculate the sales tax amount on the discounted price.
4. Add the sales tax amount to the discounted price to find the final price.[0m


Decomposed Steps: ['3. Calculate the sales tax amount on the discounted price.', '4. Add the sales tax amount to the discounted price to find the final price.']


[32m2025-06-16 10:19:03.691[0m | [1mINFO    [0m | [36m__main__[0m:[36mexecute_step[0m:[36m29[0m - [1mSales tax amount = 5% of $40 = 0.05 * 40 = $2.00[0m


Step: 3. Calculate the sales tax amount on the discounted price.
Result: Sales tax amount = 5% of $40 = 0.05 * 40 = $2.00


[32m2025-06-16 10:19:05.689[0m | [1mINFO    [0m | [36m__main__[0m:[36mvalidate_step[0m:[36m45[0m - [1mValid[0m
[32m2025-06-16 10:19:07.655[0m | [1mINFO    [0m | [36m__main__[0m:[36mexecute_step[0m:[36m29[0m - [1mFinal price = $40 + $2.00 = $42.00[0m


Step: 4. Add the sales tax amount to the discounted price to find the final price.
Result: Final price = $40 + $2.00 = $42.00


[32m2025-06-16 10:19:10.663[0m | [1mINFO    [0m | [36m__main__[0m:[36mvalidate_step[0m:[36m45[0m - [1mValid[0m
[32m2025-06-16 10:19:12.328[0m | [1mINFO    [0m | [36m__main__[0m:[36mautodicot[0m:[36m86[0m - [1mThe final price is $42.00.[0m



Final Answer: The final price is $42.00.


In [5]:
problem = """A researcher is studying the effectiveness of two teaching methods (A and B) on student exam scores.
A sample of 50 students is randomly split into two groups: 25 students are taught using Method A, and 25 using Method B.
The exam scores (out of 100) for Method A have a mean of 78 and a standard deviation of 10, while Method B scores have a mean of 82 and a standard deviation of 8.
Assume the scores are normally distributed.
The researcher wants to determine if there is a statistically significant difference between the two methods at a 5% significance level using a two-sample t-test.
Additionally, calculate the 95% confidence interval for the difference in mean scores.
What is the conclusion, and what is the confidence interval?"""
final_answer = autodicot(problem)
print(f"\nFinal Answer: {final_answer}")

[32m2025-06-16 10:19:19.300[0m | [1mINFO    [0m | [36m__main__[0m:[36mdecompose_problem[0m:[36m12[0m - [1m1.  State the null and alternative hypotheses.
2.  Identify the significance level (alpha).
3.  Determine the appropriate formula for the two-sample t-statistic, considering whether to assume equal or unequal variances.
4.  Calculate the t-statistic.
5.  Calculate the degrees of freedom for the t-test.
6.  Determine the critical t-value(s) for the given significance level and degrees of freedom, or calculate the p-value.
7.  Compare the calculated t-statistic to the critical t-value(s) or compare the p-value to the significance level.
8.  Make a decision regarding the null hypothesis (reject or fail to reject).
9.  State the conclusion of the hypothesis test in the context of the problem.
10. Calculate the standard error of the difference between the means.
11. Determine the t-multiplier for the 95% confidence interval using the calculated degrees of freedom.
12. Calcula

Decomposed Steps: ['3.  Determine the appropriate formula for the two-sample t-statistic, considering whether to assume equal or unequal variances.', '4.  Calculate the t-statistic.', '5.  Calculate the degrees of freedom for the t-test.', '6.  Determine the critical t-value(s) for the given significance level and degrees of freedom, or calculate the p-value.', '7.  Compare the calculated t-statistic to the critical t-value(s) or compare the p-value to the significance level.', '8.  Make a decision regarding the null hypothesis (reject or fail to reject).', '9.  State the conclusion of the hypothesis test in the context of the problem.', '10. Calculate the standard error of the difference between the means.', '11. Determine the t-multiplier for the 95% confidence interval using the calculated degrees of freedom.', '12. Calculate the 95% confidence interval for the difference in mean scores.', '13. State the final conclusion based on both the hypothesis test and the confidence interval.

[32m2025-06-16 10:19:24.495[0m | [1mINFO    [0m | [36m__main__[0m:[36mexecute_step[0m:[36m29[0m - [1mThe appropriate formula for the two-sample t-statistic, assuming unequal variances (Welch's t-test), is:

$t = \frac{(\bar{x}_1 - \bar{x}_2) - (\mu_1 - \mu_2)}{\sqrt{\frac{s_1^2}{n_1} + \frac{s_2^2}{n_2}}}$

This formula is chosen because the standard deviations of the two samples are different (10 and 8), and there is no information or instruction to assume equal variances. The Welch's t-test is more robust when the assumption of equal variances might not hold.[0m


Step: 3.  Determine the appropriate formula for the two-sample t-statistic, considering whether to assume equal or unequal variances.
Result: This formula is chosen because the standard deviations of the two samples are different (10 and 8), and there is no information or instruction to assume equal variances. The Welch's t-test is more robust when the assumption of equal variances might not hold.


[32m2025-06-16 10:19:26.160[0m | [1mINFO    [0m | [36m__main__[0m:[36mvalidate_step[0m:[36m45[0m - [1mValid[0m
[32m2025-06-16 10:19:29.846[0m | [1mINFO    [0m | [36m__main__[0m:[36mexecute_step[0m:[36m29[0m - [1mt = (x̄1 - x̄2) / sqrt((s1^2 / n1) + (s2^2 / n2))
t = (78 - 82) / sqrt((10^2 / 25) + (8^2 / 25))
t = -4 / sqrt((100 / 25) + (64 / 25))
t = -4 / sqrt(4 + 2.56)
t = -4 / sqrt(6.56)
t = -4 / 2.5612496
t ≈ -1.5617[0m


Step: 4.  Calculate the t-statistic.
Result: t ≈ -1.5617


[32m2025-06-16 10:19:37.009[0m | [1mINFO    [0m | [36m__main__[0m:[36mvalidate_step[0m:[36m45[0m - [1mValid[0m
[32m2025-06-16 10:19:43.596[0m | [1mINFO    [0m | [36m__main__[0m:[36mexecute_step[0m:[36m29[0m - [1mdf = ( (s1^2/n1) + (s2^2/n2) )^2 / ( ( (s1^2/n1)^2 / (n1 - 1) ) + ( (s2^2/n2)^2 / (n2 - 1) ) )
Given: n1=25, s1=10, n2=25, s2=8
s1^2/n1 = 10^2/25 = 100/25 = 4
s2^2/n2 = 8^2/25 = 64/25 = 2.56

Numerator: (4 + 2.56)^2 = (6.56)^2 = 43.0336
Denominator: ( (4^2) / (25-1) ) + ( (2.56^2) / (25-1) ) = (16/24) + (6.5536/24) = 0.66666... + 0.27306... = 0.93973...

df = 43.0336 / 0.93973... ≈ 45.793
Rounding down to the nearest whole number for conservative critical values:

df = 45[0m


Step: 5.  Calculate the degrees of freedom for the t-test.
Result: df = 45


[32m2025-06-16 10:19:57.102[0m | [1mINFO    [0m | [36m__main__[0m:[36mvalidate_step[0m:[36m45[0m - [1mThe calculation of degrees of freedom (df) depends on whether equal variances are assumed for the two-sample t-test.

1.  **If equal variances are assumed (Pooled t-test):**
    df = n1 + n2 - 2
    df = 25 + 25 - 2 = 48

2.  **If unequal variances are assumed (Welch's t-test):**
    This is generally preferred when standard deviations are different, as they are here (10 vs 8).
    The formula for Welch's degrees of freedom is:
    df = ( (s1^2/n1) + (s2^2/n2) )^2 / ( (s1^2/n1)^2 / (n1-1) + (s2^2/n2)^2 / (n2-1) )

    Let's plug in the values:
    n1 = 25, s1 = 10
    n2 = 25, s2 = 8

    s1^2/n1 = 10^2/25 = 100/25 = 4
    s2^2/n2 = 8^2/25 = 64/25 = 2.56

    Numerator = (4 + 2.56)^2 = (6.56)^2 = 43.0336

    Denominator = (4^2 / (25-1)) + (2.56^2 / (25-1))
    = (16 / 24) + (6.5536 / 24)
    = 0.6666... + 0.273066...
    = 0.939733...

    df = 43.0336 / 0.939733... = 45.79

Step: 6.  Determine the critical t-value(s) for the given significance level and degrees of freedom, or calculate the p-value.
Result: Critical t-values: For a two-tailed test with df = 45 and α = 0.05, the critical t-values are ±2.014.


[32m2025-06-16 10:20:04.936[0m | [1mINFO    [0m | [36m__main__[0m:[36mvalidate_step[0m:[36m45[0m - [1mValid. The degrees of freedom (df = 45) are correctly calculated using the Satterthwaite approximation for Welch's t-test (appropriate given the different standard deviations), and the critical t-value of ±2.014 for a two-tailed test with df = 45 and α = 0.05 is accurate.[0m


Validation failed: Valid. The degrees of freedom (df = 45) are correctly calculated using the Satterthwaite approximation for Welch's t-test (appropriate given the different standard deviations), and the critical t-value of ±2.014 for a two-tailed test with df = 45 and α = 0.05 is accurate.


[32m2025-06-16 10:20:06.017[0m | [1mINFO    [0m | [36m__main__[0m:[36mexecute_step[0m:[36m29[0m - [1mCritical t-values: For a two-tailed test with df = 45 and α = 0.05, the critical t-values are ±2.014.[0m


Refined Result: Critical t-values: For a two-tailed test with df = 45 and α = 0.05, the critical t-values are ±2.014.


[32m2025-06-16 10:20:09.749[0m | [1mINFO    [0m | [36m__main__[0m:[36mexecute_step[0m:[36m29[0m - [1mSince |-1.5617| (calculated t-statistic) < |2.014| (critical t-value), we fail to reject the null hypothesis.[0m


Step: 7.  Compare the calculated t-statistic to the critical t-value(s) or compare the p-value to the significance level.
Result: Since |-1.5617| (calculated t-statistic) < |2.014| (critical t-value), we fail to reject the null hypothesis.


[32m2025-06-16 10:20:16.292[0m | [1mINFO    [0m | [36m__main__[0m:[36mvalidate_step[0m:[36m45[0m - [1mValid[0m
[32m2025-06-16 10:20:18.395[0m | [1mINFO    [0m | [36m__main__[0m:[36mexecute_step[0m:[36m29[0m - [1mFail to reject the null hypothesis. The absolute value of the calculated t-statistic (1.5617) is less than the absolute value of the critical t-value (2.014).[0m


Step: 8.  Make a decision regarding the null hypothesis (reject or fail to reject).
Result: Fail to reject the null hypothesis. The absolute value of the calculated t-statistic (1.5617) is less than the absolute value of the critical t-value (2.014).


[32m2025-06-16 10:20:20.111[0m | [1mINFO    [0m | [36m__main__[0m:[36mvalidate_step[0m:[36m45[0m - [1mValid[0m
[32m2025-06-16 10:20:22.324[0m | [1mINFO    [0m | [36m__main__[0m:[36mexecute_step[0m:[36m29[0m - [1mFail to reject the null hypothesis. There is no statistically significant difference between the two teaching methods (A and B) at the 5% significance level.[0m


Step: 9.  State the conclusion of the hypothesis test in the context of the problem.
Result: Fail to reject the null hypothesis. There is no statistically significant difference between the two teaching methods (A and B) at the 5% significance level.


[32m2025-06-16 10:20:30.633[0m | [1mINFO    [0m | [36m__main__[0m:[36mvalidate_step[0m:[36m45[0m - [1mValid[0m
[32m2025-06-16 10:20:33.686[0m | [1mINFO    [0m | [36m__main__[0m:[36mexecute_step[0m:[36m29[0m - [1mSE = sqrt((s1^2 / n1) + (s2^2 / n2)) = sqrt((10^2 / 25) + (8^2 / 25)) = sqrt((100 / 25) + (64 / 25)) = sqrt(4 + 2.56) = sqrt(6.56) ≈ 2.561[0m


Step: 10. Calculate the standard error of the difference between the means.
Result: SE = sqrt((s1^2 / n1) + (s2^2 / n2)) = sqrt((10^2 / 25) + (8^2 / 25)) = sqrt((100 / 25) + (64 / 25)) = sqrt(4 + 2.56) = sqrt(6.56) ≈ 2.561


[32m2025-06-16 10:20:36.475[0m | [1mINFO    [0m | [36m__main__[0m:[36mvalidate_step[0m:[36m45[0m - [1mValid[0m
[32m2025-06-16 10:20:39.460[0m | [1mINFO    [0m | [36m__main__[0m:[36mexecute_step[0m:[36m29[0m - [1mt-multiplier = 2.014. This is the same critical t-value used for a two-tailed test at α = 0.05 with df = 45, as a 95% confidence interval corresponds to an α of 0.05.[0m


Step: 11. Determine the t-multiplier for the 95% confidence interval using the calculated degrees of freedom.
Result: t-multiplier = 2.014. This is the same critical t-value used for a two-tailed test at α = 0.05 with df = 45, as a 95% confidence interval corresponds to an α of 0.05.


[32m2025-06-16 10:20:46.078[0m | [1mINFO    [0m | [36m__main__[0m:[36mvalidate_step[0m:[36m45[0m - [1mValid. The calculation of degrees of freedom (df = 45) is consistent with the Satterthwaite approximation for Welch's t-test, which is appropriate given the different standard deviations. For a 95% confidence interval, the t-multiplier is indeed the critical t-value for a two-tailed test with α = 0.05 (i.e., t_0.025) and df = 45. Looking up this value in a t-distribution table confirms that t_0.025,45 ≈ 2.014. The explanation provided is also accurate.[0m


Validation failed: Valid. The calculation of degrees of freedom (df = 45) is consistent with the Satterthwaite approximation for Welch's t-test, which is appropriate given the different standard deviations. For a 95% confidence interval, the t-multiplier is indeed the critical t-value for a two-tailed test with α = 0.05 (i.e., t_0.025) and df = 45. Looking up this value in a t-distribution table confirms that t_0.025,45 ≈ 2.014. The explanation provided is also accurate.


[32m2025-06-16 10:20:47.945[0m | [1mINFO    [0m | [36m__main__[0m:[36mexecute_step[0m:[36m29[0m - [1mt-multiplier = 2.014[0m


Refined Result: t-multiplier = 2.014


[32m2025-06-16 10:20:51.042[0m | [1mINFO    [0m | [36m__main__[0m:[36mexecute_step[0m:[36m29[0m - [1mCI = (x̄1 - x̄2) ± (t-multiplier * SE)
CI = (78 - 82) ± (2.014 * 2.561)
CI = -4 ± 5.157854
CI = [-9.158, 1.158][0m


Step: 12. Calculate the 95% confidence interval for the difference in mean scores.
Result: CI = [-9.158, 1.158]


[32m2025-06-16 10:21:05.145[0m | [1mINFO    [0m | [36m__main__[0m:[36mvalidate_step[0m:[36m45[0m - [1mTo validate the 95% confidence interval for the difference in mean scores, we need to perform the following calculations:

1.  **Identify the given data:**
    *   Method A: n1 = 25, x̄1 = 78, s1 = 10
    *   Method B: n2 = 25, x̄2 = 82, s2 = 8
    *   Confidence Level = 95% (α = 0.05)

2.  **Calculate the difference in sample means:**
    Difference (x̄1 - x̄2) = 78 - 82 = -4

3.  **Calculate the standard error (SE) of the difference between means.** Since the standard deviations are different (10 vs 8), it's appropriate to use the formula for unequal variances (Welch's t-test approach for the standard error). Note that when n1=n2, the standard error formula for pooled and unpooled variances becomes identical.
    SE = sqrt((s1^2 / n1) + (s2^2 / n2))
    SE = sqrt((10^2 / 25) + (8^2 / 25))
    SE = sqrt((100 / 25) + (64 / 25))
    SE = sqrt(4 + 2.56)
    SE = sqrt(6.56)
   

Validation failed: The final answer is $\boxed{Valid}$


[32m2025-06-16 10:21:07.346[0m | [1mINFO    [0m | [36m__main__[0m:[36mexecute_step[0m:[36m29[0m - [1mCI = (x̄1 - x̄2) ± (t-multiplier * SE) = (78 - 82) ± (2.014 * 2.561) = -4 ± 5.157854.
Lower bound = -4 - 5.157854 = -9.157854 ≈ -9.158
Upper bound = -4 + 5.157854 = 1.157854 ≈ 1.158
CI = [-9.158, 1.158][0m


Refined Result: CI = [-9.158, 1.158]


[32m2025-06-16 10:21:10.060[0m | [1mINFO    [0m | [36m__main__[0m:[36mexecute_step[0m:[36m29[0m - [1mBased on the hypothesis test, we fail to reject the null hypothesis, indicating no statistically significant difference between the two teaching methods at the 5% significance level. The 95% confidence interval for the difference in mean scores is [-9.158, 1.158]. Since this interval includes zero, it supports the conclusion that there is no statistically significant difference between the mean scores of Method A and Method B.[0m


Step: 13. State the final conclusion based on both the hypothesis test and the confidence interval.
Result: Based on the hypothesis test, we fail to reject the null hypothesis, indicating no statistically significant difference between the two teaching methods at the 5% significance level. The 95% confidence interval for the difference in mean scores is [-9.158, 1.158]. Since this interval includes zero, it supports the conclusion that there is no statistically significant difference between the mean scores of Method A and Method B.


[32m2025-06-16 10:21:18.712[0m | [1mINFO    [0m | [36m__main__[0m:[36mvalidate_step[0m:[36m45[0m - [1mValid[0m
[32m2025-06-16 10:21:20.656[0m | [1mINFO    [0m | [36m__main__[0m:[36mautodicot[0m:[36m86[0m - [1mThe conclusion is that there is no statistically significant difference between the two teaching methods (A and B) at the 5% significance level. The 95% confidence interval for the difference in mean scores (Method A - Method B) is [-9.158, 1.158].[0m



Final Answer: The conclusion is that there is no statistically significant difference between the two teaching methods (A and B) at the 5% significance level. The 95% confidence interval for the difference in mean scores (Method A - Method B) is [-9.158, 1.158].
