In [1]:
import openai
import ipywidgets as widgets

print(f"openai: {openai.__version__}. Suggested version: 1.75.0")
print(f"ipywidgets: {widgets.__version__}. Suggested version: 8.1.6")

openai: 1.75.0. Suggested version: 1.75.0
ipywidgets: 8.1.6. Suggested version: 8.1.6


# run the following code if the environment does not match

In [None]:
!pip install openai==1.75.0 ipywidgets==8.1.6
!pip install datasets
# you may need to restart the computer for the environment to be installed correctly

In [2]:
from datasets import load_dataset

# Load GSM8K dataset from Hugging Face
gsm8k = load_dataset("gsm8k", "main")

# View one example
sample = gsm8k['train'][0]
print("Question:", sample['question'])
print("Answer:", sample['answer'])

Question: Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?
Answer: Natalia sold 48/2 = <<48/2=24>>24 clips in May.
Natalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.
#### 72


# Change directory to where your data is located

In [3]:
import os
data_dir = "grade-school-math-master/grade_school_math/data"
os.chdir(data_dir)

In [4]:
import json

# Load the GSM8K train set
def load_jsonl(file_path):
    data = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            data.append(json.loads(line))
    return data

train_data = load_jsonl("train.jsonl")

# Show the first example
print(train_data[0])

{'question': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?', 'answer': 'Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#### 72'}


In [5]:
# Setup (run only once)
import json
import random
from IPython.display import display, Markdown, Javascript
import ipywidgets as widgets

def load_data(file_path="train.jsonl"):
    with open(file_path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f]

def on_submit(b):
    output.clear_output()
    user_answer = answer_input.value.strip()
    try:
        final_answer_str = correct_answer.split("####")[-1].strip()
        user_val = float(user_answer.replace(",", ""))
        correct_val = float(final_answer_str.replace(",", ""))
        result = "✅ Correct!" if user_val == correct_val else f"❌ Incorrect. The correct answer is: {final_answer_str}"
    except Exception as e:
        result = f"⚠️ Error in processing the answer: {e}"
    with output:
        print(result)
        print("\nCorrect Answer Explanation:")
        print(correct_answer)

def rerun_current_cell(ev=None):
    display(Javascript("Jupyter.notebook.execute_cell()"))

data = load_data()

answer_input = widgets.Text(
    placeholder='Type your answer here...',
    description='Your Answer:',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='50%')
)

submit_button = widgets.Button(description="Submit Answer", button_style='success')
output = widgets.Output()
next_button = widgets.Button(description="Next Question", button_style='info')

submit_button.on_click(on_submit)
next_button.on_click(rerun_current_cell)

## Random question and Answer check

In [6]:
# Run for each question

# Pick a new random question and update the global variables
current = random.choice(data)
question = current["question"]
correct_answer = current["answer"]

# Clear previous input
answer_input.value = ""

# Display question and interactive widgets
display(Markdown(f"### 🧮 Question:\n{question}"))
display(answer_input, submit_button, output, next_button)

### 🧮 Question:
Lennon is a sales rep and is paid $0.36 in mileage reimbursement when he travels to meet with clients.  On Monday he drove 18 miles.  Tuesday he drove 26 miles.  Wednesday and Thursday he drove 20 miles each day and on Friday he drove 16 miles.  How much money will he be reimbursed?

Text(value='', description='Your Answer:', layout=Layout(width='50%'), placeholder='Type your answer here...',…

Button(button_style='success', description='Submit Answer', style=ButtonStyle())

Output()

Button(button_style='info', description='Next Question', style=ButtonStyle())

# Enter Your Own Openai API Key

In [7]:
import ipywidgets as widgets
from IPython.display import display, clear_output

# Global variable to store API key in memory
openai_api_key_holder = {"key": None}

# Password input widget for API key
api_input = widgets.Password(
    description='API Key:',
    placeholder='sk-...',
    layout=widgets.Layout(width='50%')
)

# Submit button
submit_api_button = widgets.Button(description='Set API Key', button_style='primary')
api_output = widgets.Output()

# Function to capture API key
def set_api_key(b):
    openai_api_key_holder["key"] = api_input.value.strip()
    with api_output:
        clear_output()
        if openai_api_key_holder["key"].startswith("sk-"):
            print("✅ OpenAI API key set successfully.")
        else:
            print("⚠️ Invalid API key format. Please double check.")

submit_api_button.on_click(set_api_key)

# Display the widget
display(api_input, submit_api_button, api_output)

Password(description='API Key:', layout=Layout(width='50%'), placeholder='sk-...')

Button(button_style='primary', description='Set API Key', style=ButtonStyle())

Output()

In [7]:
import random
import openai
from IPython.display import display, Markdown, Javascript
import ipywidgets as widgets

# -- Global Widgets --
answer_input = widgets.Text(placeholder='Type your answer...', description='Your Answer:', layout=widgets.Layout(width='50%'))
submit_button = widgets.Button(description="Submit Answer", button_style='success')
hint_button = widgets.Button(description="Need a Hint?", button_style='info')
full_explanation_btn = widgets.Button(description="Show Full Steps", button_style='warning')
next_button = widgets.Button(description="Next Question", button_style='info')

output = widgets.Output()
hint_output = widgets.Output()
full_explanation_output = widgets.Output()

# -- Functions --

def get_hint_with_openai(question_text):
    try:
        client = openai.OpenAI(api_key=openai_api_key_holder["key"])
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a math tutor giving helpful hints step-by-step."},
                {"role": "user", "content": f"Give me a hint for this math problem without solving it:\n{question_text}"}
            ],
            max_tokens=150,
            temperature=0.5
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"❌ Error: {e}"

def get_full_solution_with_openai(question_text):
    try:
        client = openai.OpenAI(api_key=openai_api_key_holder["key"])
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are a math tutor giving full step-by-step solutions to word problems."},
                {"role": "user", "content": f"Give a full step-by-step solution to this math problem:\n{question_text}"}
            ],
            max_tokens=400,
            temperature=0.3
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"❌ Error: {e}"

def on_submit(b):
    output.clear_output()
    user_answer = answer_input.value.strip()
    try:
        final_answer_str = correct_answer.split("####")[-1].strip()
        user_val = float(user_answer.replace(",", ""))
        correct_val = float(final_answer_str.replace(",", ""))
        result = f"✅ Correct!" if user_val == correct_val else f"❌ Incorrect. The correct answer is: {final_answer_str}"
    except Exception as e:
        result = f"⚠️ Error in processing the answer: {e}"
    
    with output:
        print(result)
        print("\nCorrect Answer Explanation:")
        print(correct_answer)

def on_hint(b):
    hint_output.clear_output()
    with hint_output:
        display(Markdown("⏳ Generating hint..."))
    hint = get_hint_with_openai(question)
    hint_output.clear_output()
    with hint_output:
        display(Markdown(f"💡 **Hint:**\n{hint}"))

def on_show_full_solution(b):
    full_explanation_output.clear_output()
    with full_explanation_output:
        display(Markdown("⏳ Generating full solution..."))
    explanation = get_full_solution_with_openai(question)
    full_explanation_output.clear_output()
    with full_explanation_output:
        display(Markdown(f"📘 **Full Explanation:**\n{explanation}"))

def rerun_current_cell(ev=None):
    display(Javascript("Jupyter.notebook.execute_cell()"))

# -- Bind Buttons --
submit_button.on_click(on_submit)
hint_button.on_click(on_hint)
full_explanation_btn.on_click(on_show_full_solution)
next_button.on_click(rerun_current_cell)

## Random question, Answer check, Gpt hint, Gpt solution

In [9]:
# Sample a new question
sample = random.choice(data)
question = sample["question"]
correct_answer = sample["answer"]

# Clear outputs and reset inputs
output.clear_output()
hint_output.clear_output()
full_explanation_output.clear_output()
answer_input.value = ""

# Display question and interface
display(Markdown(f"### 🧮 Question:\n{question}"))
display(answer_input, submit_button, hint_button, full_explanation_btn, next_button, output, hint_output, full_explanation_output)

<IPython.core.display.Javascript object>

### 🧮 Question:
James spends 30 minutes twice a day on meditation.  How many hours a week does he spend meditating?

Text(value='', description='Your Answer:', layout=Layout(width='50%'), placeholder='Type your answer...')

Button(button_style='success', description='Submit Answer', style=ButtonStyle())

Button(button_style='info', description='Need a Hint?', style=ButtonStyle())



Button(button_style='info', description='Next Question', style=ButtonStyle())

Output(outputs=({'output_type': 'stream', 'text': '✅ Correct!\n\nCorrect Answer Explanation:\nIf the last four…

Output(outputs=({'output_type': 'display_data', 'data': {'text/plain': '<IPython.core.display.Markdown object>…

Output(outputs=({'output_type': 'display_data', 'data': {'text/plain': '<IPython.core.display.Markdown object>…

In [10]:
# === Adaptive State Tracker ===
user_state = {
    "difficulty": "easy",
    "correct_streak": 0
}

# === Difficulty Categorizer ===
def categorize_question(q):
    length = len(q["question"].split())
    if length < 15:
        return "easy"
    elif length < 30:
        return "medium"
    else:
        return "hard"

# === Difficulty Adjuster ===
def adjust_difficulty(state, correct):
    if correct:
        state["correct_streak"] += 1
        if state["correct_streak"] >= 2:
            if state["difficulty"] == "easy":
                state["difficulty"] = "medium"
            elif state["difficulty"] == "medium":
                state["difficulty"] = "hard"
            state["correct_streak"] = 0
    else:
        state["correct_streak"] = 0
        if state["difficulty"] == "hard":
            state["difficulty"] = "medium"
        elif state["difficulty"] == "medium":
            state["difficulty"] = "easy"

# === Pull New Question by Difficulty ===
def get_question_by_difficulty(data, level):
    pool = [q for q in data if categorize_question(q) == level]
    return random.choice(pool) if pool else random.choice(data)

# === Adaptive Submit Callback ===
def adaptive_on_submit(b):
    user_answer = answer_input.value.strip()

    client = openai.OpenAI(api_key=openai_api_key_holder["key"])
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a strict math grader. Return only 'Correct' or 'Incorrect'."},
            {"role": "user", "content": f"""Question: {current_q['question']}
Student's Answer: {user_answer}
Correct Answer: {current_q['answer']}

Is the student's answer correct? Just respond with 'Correct' or 'Incorrect'."""}
        ],
        temperature=0
    )
    result = response.choices[0].message.content.strip()
    correct = result == "Correct"

    adjust_difficulty(user_state, correct)

    result_text = "✅ Correct!" if correct else "❌ Incorrect."
    with output:
        display(Markdown(f"**{result_text}**"))
        display(Markdown(f"**Expected Answer:** {current_q['answer']}"))
        display(Markdown(f"**Streak:** {user_state['correct_streak']} | **Difficulty:** {user_state['difficulty'].upper()}"))

def ask_adaptive_question():
    next()  # call the function below

def next():
    question = get_question_by_difficulty(data, user_state["difficulty"])
    input_box = widgets.Text(placeholder='Type your answer...')
    out = widgets.Output()
    hint_out = widgets.Output()
    full_out = widgets.Output()

    def on_submit(b):
        user_answer = input_box.value.strip()
        try:
            client = openai.OpenAI(api_key=openai_api_key_holder["key"])
            res = client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {"role": "system", "content": "You are a strict math grader. Return only 'Correct' or 'Incorrect'."},
                    {"role": "user", "content": f"""Question: {question['question']}
Student's Answer: {user_answer}
Correct Answer: {question['answer']}
Is the student's answer correct? Just respond with 'Correct' or 'Incorrect'."""}
                ],
                temperature=0
            )
            result = res.choices[0].message.content.strip()
            correct = result == "Correct"
            adjust_difficulty(user_state, correct)

            with out:
                display(Markdown(f"**{'✅ Correct!' if correct else '❌ Incorrect.'}**"))
                display(Markdown(f"**Expected Answer:** {question['answer']}"))
                display(Markdown(f"**Streak:** {user_state['correct_streak']} | **Difficulty:** {user_state['difficulty'].upper()}"))
        except Exception as e:
            with out:
                print("Error:", e)

    def on_hint(b):
        hint_out.clear_output()
        try:
            client = openai.OpenAI(api_key=openai_api_key_holder["key"])
            res = client.chat.completions.create(
                model="gpt-3.5-turbo",# we use gpt-3.5-turbo to save budget: the hint does not require high performance model.
                messages=[
                    {"role": "system", "content": "You are a helpful tutor. Provide a hint only."},
                    {"role": "user", "content": f"Give me a hint for this math problem:\n{question['question']}"}
                ],
                temperature=0.5
            )
            hint = res.choices[0].message.content.strip()
            with hint_out:
                display(Markdown(f"💡 **Hint:** {hint}"))
        except Exception as e:
            with hint_out:
                print("Hint error:", e)

    def on_full(b):
        full_out.clear_output()
        try:
            client = openai.OpenAI(api_key=openai_api_key_holder["key"])
            res = client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {"role": "system", "content": "You are a math tutor. Give a full step-by-step explanation."},
                    {"role": "user", "content": f"Explain this math problem step-by-step:\n{question['question']}"}
                ],
                temperature=0.3
            )
            explanation = res.choices[0].message.content.strip()
            with full_out:
                display(Markdown(f"📘 **Full Explanation:**\n{explanation}"))
        except Exception as e:
            with full_out:
                print("Explanation error:", e)

    # Create buttons and bind
    submit_btn = widgets.Button(description="Submit Answer", button_style='success')
    hint_btn = widgets.Button(description="Need a Hint?", button_style='info')
    expl_btn = widgets.Button(description="Show Full Steps", button_style='warning')
    next_btn = widgets.Button(description="Next Question", button_style='primary')

    submit_btn.on_click(on_submit)
    hint_btn.on_click(on_hint)
    expl_btn.on_click(on_full)
    next_btn.on_click(lambda b: next())  

    # Layout block
    display(widgets.VBox([
        widgets.HTML(f"<h3>🎯 Level: <code>{user_state['difficulty'].upper()}</code></h3>"),
        widgets.HTML(f"<b>Question:</b> {question['question']}"),
        input_box,
        widgets.HBox([submit_btn, hint_btn, expl_btn, next_btn]),
        out,
        hint_out,
        full_out
    ]))


## Question Based on Correctness

In [12]:
ask_adaptive_question()

VBox(children=(HTML(value='<h3>🎯 Level: <code>EASY</code></h3>'), HTML(value='<b>Question:</b> Fifteen more th…

VBox(children=(HTML(value='<h3>🎯 Level: <code>EASY</code></h3>'), HTML(value='<b>Question:</b> If 12 bags of o…

VBox(children=(HTML(value='<h3>🎯 Level: <code>MEDIUM</code></h3>'), HTML(value='<b>Question:</b> A 40 meters r…

VBox(children=(HTML(value='<h3>🎯 Level: <code>MEDIUM</code></h3>'), HTML(value="<b>Question:</b> Steph needs t…

VBox(children=(HTML(value='<h3>🎯 Level: <code>HARD</code></h3>'), HTML(value='<b>Question:</b> Bridget counted…

VBox(children=(HTML(value='<h3>🎯 Level: <code>HARD</code></h3>'), HTML(value='<b>Question:</b> Paul made two b…

VBox(children=(HTML(value='<h3>🎯 Level: <code>HARD</code></h3>'), HTML(value='<b>Question:</b> Buying a toaste…

# GPT Answer Comparison and Accuracy Evaluation 

In [21]:
# Do not run this cell to override the output
import re
import pandas as pd
from tqdm import tqdm
import random

# Helper to extract final number from GPT response
def extract_final_number(answer):
    match = re.search(r"####\s*([\d,\.]+)", answer)
    if match:
        return match.group(1).replace(",", "").strip()
    return None

# Fallback in case #### <number> isn't found
def extract_fallback_number(text):
    numbers = re.findall(r"[-+]?\d*\.\d+|\d+", text)
    return numbers[-1] if numbers else None

# Use N random test questions
test_subset = random.sample(data, 100)  # we are testing 100 samples here.

all_results = []

for sample in tqdm(test_subset):
    question = sample["question"]
    correct_answer = sample["answer"]
    prompt = f"{question}\n\nAnswer the question above. End with '#### <final answer>'."

    retries = 0
    gpt_final = None
    while retries < 2:
        response = openai.OpenAI(api_key=openai_api_key_holder["key"]).chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are a helpful math tutor. Always end with '#### <final answer>'."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.2
        )
        generated_answer = response.choices[0].message.content
        gpt_final = extract_final_number(generated_answer)
        if gpt_final:
            break
        retries += 1

    # Use fallback if no proper final answer extracted
    if not gpt_final:
        print("⚠️ Missing '####' in answer. Using fallback.")
        gpt_final = extract_fallback_number(generated_answer)

    ref_final = extract_final_number(correct_answer)
    correct = gpt_final == ref_final if gpt_final and ref_final else False

    all_results.append({
        "question": question,
        "gpt_answer": generated_answer,
        "correct_answer": correct_answer,
        "gpt_final": gpt_final,
        "ref_final": ref_final,
        "correct": correct
    })

 19%|███████████████▍                                                                 | 19/100 [01:00<04:43,  3.50s/it]

⚠️ Missing '####' in answer. Using fallback.


 28%|██████████████████████▋                                                          | 28/100 [01:25<03:34,  2.98s/it]

⚠️ Missing '####' in answer. Using fallback.


 47%|██████████████████████████████████████                                           | 47/100 [02:26<03:09,  3.58s/it]

⚠️ Missing '####' in answer. Using fallback.


 48%|██████████████████████████████████████▉                                          | 48/100 [02:32<03:38,  4.20s/it]

⚠️ Missing '####' in answer. Using fallback.


 63%|███████████████████████████████████████████████████                              | 63/100 [03:21<01:38,  2.65s/it]

⚠️ Missing '####' in answer. Using fallback.


 64%|███████████████████████████████████████████████████▊                             | 64/100 [03:25<01:51,  3.09s/it]

⚠️ Missing '####' in answer. Using fallback.


 76%|█████████████████████████████████████████████████████████████▌                   | 76/100 [04:07<01:30,  3.75s/it]

⚠️ Missing '####' in answer. Using fallback.


 77%|██████████████████████████████████████████████████████████████▎                  | 77/100 [04:13<01:40,  4.37s/it]

⚠️ Missing '####' in answer. Using fallback.


 84%|████████████████████████████████████████████████████████████████████             | 84/100 [04:41<01:08,  4.31s/it]

⚠️ Missing '####' in answer. Using fallback.


 98%|███████████████████████████████████████████████████████████████████████████████▍ | 98/100 [05:38<00:10,  5.38s/it]

⚠️ Missing '####' in answer. Using fallback.


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [05:45<00:00,  3.45s/it]


In [22]:
# Create results DataFrame and show accuracy
df_eval = pd.DataFrame(all_results)
df_eval = df_eval.rename(columns={"ref_final": "reference_answer"})
display(df_eval[["question", "gpt_final", "reference_answer", "correct"]])
print(f"\n✅ GPT Accuracy: {df_eval['correct'].mean():.2%}")

Unnamed: 0,question,gpt_final,reference_answer,correct
0,A hotel has 10 rooms and is currently full. Ea...,60,60,True
1,"In the school's library, there are 2300 differ...",736,736,True
2,If a rectangle has a width of 42 inches and an...,10,10,True
3,Boris has 100 pieces of Halloween candy. His d...,20,20,True
4,There are 516 cars in a parking lot. One-third...,86,86,True
...,...,...,...,...
95,50% of substitute teachers walk out after 1 ho...,21,21,True
96,"In a 60-item exam, Liza got 90% of the items c...",4,4,True
97,Henry took 9 pills a day for 14 days. Of these...,574.00,41,False
98,"Belinda’s dog is missing, so she made 200 flye...",20,20,True



✅ GPT Accuracy: 93.00%


In [28]:
# Show only incorrect results
incorrect_df = df_eval[df_eval["correct"] == False]
display(incorrect_df[["question", "gpt_final", "reference_answer", "gpt_answer"]])
print(f"\n✅ GPT Accuracy: {df_eval['correct'].mean():.2%}")
print(f"\n❌ Number of incorrect answers: {len(incorrect_df)}")

Unnamed: 0,question,gpt_final,reference_answer,gpt_answer
46,A 750 ml bottle of spirits costs $30.00 and ha...,98.0,98,To determine how much money a restaurant makes...
47,Travis goes through 2 boxes of cereal a week. ...,312.0,312,To find out how much Travis spends on cereal i...
62,John has to get a new blanket. He decides to ...,240.0,2240,"To find the cost of the quilt, we first need t..."
66,Southton buries their time capsule 15 feet und...,72.0,48,To find the depth at which Northton's time cap...
75,Tonya has $150.00 on her credit card. If she ...,100.0,120,Tonya currently has a balance of $150.00 on he...
76,Kurt's old refrigerator cost $0.85 a day in el...,12.0,12,To find out how much money Kurt saves in a 30-...
97,Henry took 9 pills a day for 14 days. Of these...,574.0,41,To find out how much Henry spent in total on t...



✅ GPT Accuracy: 93.00%

❌ Number of incorrect answers: 7


The table shows the incorrect results from GPT-4o’s performance on 100 randomly selected math word problems from the GSM8K dataset. The model was instructed to end each response with the format #### <final answer>, making it easier to extract and compare answers automatically.

Initially, the model often failed to follow this format, which led to many missing answers and low accuracy. To address this, we added a fallback method that extracts the last number from the response when #### is missing. This fix greatly improved answer extraction and raised the model’s performance.

Even with this fix, a few mismatches remained. Some were due to genuine calculation or interpretation errors. However, several were caused by formatting differences — for example, the model returned values like “98.00” instead of “98”, “312.00” instead of “312”, and “12.00” instead of “12”. These are numerically identical but were counted as incorrect because the evaluation uses exact string matching.

As a result, the reported accuracy was 93%. But if we account for these formatting-only mismatches and treat them as correct, the actual GPT accuracy would be 96%. This highlights both the model's strong performance on word problems and the importance of using more flexible comparison methods when evaluating numeric outputs.










# Use of generative AI statement

We have completed the coding and development of this project with the assistance of ChatGPT-4o, which supported implementation, debugging, and refinement of key components throughout the process.