In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Some parts of this Codelab are (c) Google 2025 under the Apache License.
(c) Aquarc 2025

# Synopsis
Aquarc is an all-in-one SAT platform for high schoolers designed to minimize time spent using the software and maximizing practice and essential questions. In order to further this mission, Aquarc Intelligence was created to analyze mistakes within a question and to suggest similar questions for efficient practicing.

# Install the SDK 
We will be using Google's Gemini and utilities to build the model.

In [2]:
!pip uninstall -qqy jupyterlab  # Remove unused packages from Kaggle's base image that conflict
!pip install -U -q "google-genai==1.7.0" langchain PyPDF2 "chromadb==0.6.3" html-to-markdown

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m144.7/144.7 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m611.1/611.1 kB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.9/100.9 kB[0m [31m3.9 MB/s[0m 

Import the SDK and set up the API key

In [3]:
from google import genai
from google.genai import types

from IPython.display import HTML, Markdown, display

Set up a retry helper so we can press "Run All" and not worry about hitting the quota. 

In [4]:
from google.api_core import retry


is_retriable = lambda e: (isinstance(e, genai.errors.APIError) and e.code in {429, 503})

genai.models.Models.generate_content = retry.Retry(
    predicate=is_retriable)(genai.models.Models.generate_content)

### Set up your API key

To run the following cell, your API key must be stored it in a [Kaggle secret](https://www.kaggle.com/discussions/product-feedback/114053) named `GOOGLE_API_KEY`.

If you don't already have an API key, you can grab one from [AI Studio](https://aistudio.google.com/app/apikey). You can find [detailed instructions in the docs](https://ai.google.dev/gemini-api/docs/api-key).

To make the key available through Kaggle secrets, choose `Secrets` from the `Add-ons` menu and follow the instructions to add your key or enable it for this notebook.

In [5]:
from kaggle_secrets import UserSecretsClient

GOOGLE_API_KEY = UserSecretsClient().get_secret("GOOGLE_API_KEY")

### Choose your model
Depending on what's available and your quota, choose a model that's effective for your purposes.

In [6]:
client = genai.Client(api_key=GOOGLE_API_KEY)

for model in client.models.list():
  print(model.name)

models/chat-bison-001
models/text-bison-001
models/embedding-gecko-001
models/gemini-1.0-pro-vision-latest
models/gemini-pro-vision
models/gemini-1.5-pro-latest
models/gemini-1.5-pro-001
models/gemini-1.5-pro-002
models/gemini-1.5-pro
models/gemini-1.5-flash-latest
models/gemini-1.5-flash-001
models/gemini-1.5-flash-001-tuning
models/gemini-1.5-flash
models/gemini-1.5-flash-002
models/gemini-1.5-flash-8b
models/gemini-1.5-flash-8b-001
models/gemini-1.5-flash-8b-latest
models/gemini-1.5-flash-8b-exp-0827
models/gemini-1.5-flash-8b-exp-0924
models/gemini-2.5-pro-exp-03-25
models/gemini-2.5-pro-preview-03-25
models/gemini-2.0-flash-exp
models/gemini-2.0-flash
models/gemini-2.0-flash-001
models/gemini-2.0-flash-exp-image-generation
models/gemini-2.0-flash-lite-001
models/gemini-2.0-flash-lite
models/gemini-2.0-flash-lite-preview-02-05
models/gemini-2.0-flash-lite-preview
models/gemini-2.0-pro-exp
models/gemini-2.0-pro-exp-02-05
models/gemini-exp-1206
models/gemini-2.0-flash-thinking-exp-01

Check out the detailed information for your model

In [7]:
for model in client.models.list():
  if model.name == 'models/gemini-2.0-flash':
    print(model.to_json_dict())
    break

# change this line if you want to use a different model
model = "gemini-2.0-flash"

{'name': 'models/gemini-2.0-flash', 'display_name': 'Gemini 2.0 Flash', 'description': 'Gemini 2.0 Flash', 'version': '2.0', 'tuned_model_info': {}, 'input_token_limit': 1048576, 'output_token_limit': 8192, 'supported_actions': ['generateContent', 'countTokens', 'createCachedContent']}


Test your model

In [8]:
chat = client.chats.create(model=model, history=[])
response = chat.send_message('Hello! My name is Zlork.')
print(response.text)

It's nice to meet you, Zlork! I'm happy to chat with you. What can I do for you today?



You can use the `Markdown()` function to format it nicely in Kaggle.

In [9]:
response = chat.send_message('Hello! My name is Zlork.' + 
                             'Use some fancy markdown in your message')
Markdown(response.text)

Ah, greetings, Zlork! It is a pleasure to make your acquaintance. Let us engage in a conversation filled with delightful markdown flourishes!

---

**Behold!** A most splendid introduction! Your name, *Zlork*, resonates with a certain... *je ne sais quoi*.

Perhaps we can delve into the fascinating depths of your being? Or, if you prefer, we can embark on a *whimsical* journey through the land of digital discourse!

```python
# A small, eloquent poem about Zlork
def zlork_poem():
    """Crafts a poem about Zlork, a name both bold and stark."""
    print("*Zlork*, a name that rings with might,")
    print("A star that shines both day and night.")
    print("With wit and charm, you grace this sphere,")
    print("Welcome, *Zlork*, it's grand you're here!")

zlork_poem()
```

> "The world is but a canvas to our imagination." - *Henry David Thoreau*

---

So, Zlork, tell me... **what shall we discuss?** I await your esteemed command! 🎩


# A fine prompt
**TODO: you should probably move this**

Let's start by writing our prompt. What should it include and what should it mention? Following the principles taught throughout the course, let's place an importance on giving **positive** instructions rather than **negative** instructions to maintain effectiveness. 

The model needs to do the following:
- [ ] Stick to the SAT: Understand weighting and importance of certain questions, categories, and ensure all advice is applicable to the bounds of the SAT. A document with the specifications of the SAT format can be used for Retrieval Augmented Generation (RAG) rather than potential hallucination over the exact requirements.
- [ ] Have access to the current question (and maybe the previous questions for even more context)
- [ ] Have access to the answers and time between each to estimate confidence.
- [x] Understand images or SVGs for Inference and other Reading/Writing questions.
- [ ] Use Tree of Thoughts (ToT) to generate multiple solving processes because multiple methods may be used to arrive at the same answer, and output these answering mechanisms in a JSON array to present on the website effectively (a TUI for this notebook)
- [ ] Find semantically related questions and present them to the user on the website using ReAct (a TUI for this notebook)

Let's start by finding an effective prompt. 
Then we can evaluate the effectiveness of including the SAT standards in a PDF to help the user answering a question. We will also evaluate whether a vector search database is useful for this document.

Below, the variables for the specific question we are testing are defined.

In [10]:
# All questions are (c) CollegeBoard 2025.

question = {
        "question" : """
In a paper about p-i-n planar perovskite solar cells (one of several perovskite cell architectures designed to collect and store solar power), Lyndsey McMillon-Brown et al. described a method for fabricating the cell’s electronic transport layer (ETL) using a spray coating. Conventional ETL fabrication is accomplished using a solution of nanoparticles. The process can result in a loss of up to 80% of the solution, increasing the cost of manufacturing at scale—an issue that may be obviated by spray coating fabrication, which the researchers describe as “highly reproducible, concise, and practical.”

What does the text most strongly suggest about conventional ETL fabrication?
A. It is less suitable for manufacturing large volumes of planar p-i-n perovskite solar cells than an alternative fabrication method may be.
B. It is more expensive when manufacturing at scale than are processes for fabricating ETLs used in other perovskite solar cell architectures.
C. It typically entails a greater loss of nanoparticle solution than do other established approaches for ETL fabrication.
D. It is somewhat imprecise and therefore limits the potential effectiveness of p-i-n planar perovskite solar cells at capturing and storing solar power.
""",
        "rationale" : """
Choice A is the best answer. Conventional solar cell fabrication increases “the cost of manufacturing at scale,” but spray coating might get rid of that problem.

Choice B is incorrect. This is not completely supported by the text. While it’s true that conventional ETL fabrication is expensive at scale, there’s nothing in the text that mentions other perovskite solar cell architectures. Choice C is incorrect. This choice does not match the text. Only one conventional method of ETL fabrication is described, so we can’t compare the solution loss in this method to that of other conventional methods. Choice D is incorrect. This choice isn’t supported by the text. The text never suggests that the effectiveness of solar cells changes based on their method of fabrication. 
""",
        "user_answer" : "C",
}

Ideally, the user will talk to the chatbot after it gets the question wrong (or before in some scenarios, too). Either way, the user will have some rationale as to his or her answer to the question. Don't expect this rationale to be well thought out - the objective of the intelligent agent is to draw out what they actually mean. It may be completely omitted as well.

In [11]:
question["user_rationale"] = "Isn't the new method of ETL fabrication the same as the 'established methods'"

# Use Reinforcement Learning with AI Feedback to Finetune a Prompt
Let's generate a couple prompts for Aquarc Intelligence to use, given these parameters. We will then evaluate the effectiveness of these prompts against each of the questions and the user's query.

In [12]:
from google.genai.types import GenerateContentConfig

response = client.models.generate_content(
    model="gemini-2.0-flash-001",
    # no specs yet
    contents=f"Generate prompts for a model that will do nothing more but take information about the current question, rationale, user answer and their rationale (if there is one) for the SAT. Come up with different variations for the prompt like more or less concise or multiple thought processes or just one, etc. Here is an example question to illustrate my point (although math and english still exist)\
                  {question['question']}\n \
              {question['rationale']}\n\
              The user got: {question['user_answer']}\n \
              {question['user_rationale']}",
    config=GenerateContentConfig(
        system_instruction=[
            "You are a prompt engineer's assistant. Help the prompt engineer generate some prompts for his AI-powered SAT learning platform called Aquarc. The platform currently holds an SAT question bank with over 5000 questions and tracks which questions you get wrong per category. While this feature is helpful it lacks the intelligence necessary to be a full fledged SAT platform",
        ],
        temperature=2.0,
        top_k=10,
    ),
)

Markdown(response.text)

Okay, here are several prompt variations you can use for your Aquarc platform's AI model, designed to analyze SAT questions, user answers, and rationales, aiming for different levels of conciseness, thought process depth, and focus. These prompts use your provided example question. I've categorized them for clarity:

**Category 1: Concise Analysis - Focused on Core Accuracy**

These prompts aim for a quick, direct assessment of the user's answer and reasoning.

*   **Prompt 1 (Concise Evaluation):**

    > SAT Question: \[Insert Question Text Here]
    > Correct Answer: \[Insert Correct Answer Letter & Brief Rationale - e.g., A - Addresses cost at scale]
    > User Answer: \[User's Answer Letter - e.g., C]
    > User Rationale: \[User's Rationale Text - e.g., Isn't the new method of ETL fabrication the same as the 'established methods'?]
    >
    > Evaluate the user's answer and rationale.  Explain why their answer is incorrect and how their rationale relates to or misinterprets the question and passage.  Be brief and direct.
*   **Prompt 2 (Fact-Checking Focus):**

    > SAT Question: \[Insert Question Text Here]
    > Correct Answer: \[Insert Correct Answer Letter & Brief Rationale - e.g., A - Addresses cost at scale]
    > User Answer: \[User's Answer Letter - e.g., C]
    > User Rationale: \[User's Rationale Text - e.g., Isn't the new method of ETL fabrication the same as the 'established methods'?]
    >
    >  Identify the factual errors or misinterpretations present in the user's rationale.  Specifically, what does the passage say that contradicts the user's understanding?
*   **Prompt 3 (Gap Identification):**

    > SAT Question: \[Insert Question Text Here]
    > Correct Answer: \[Insert Correct Answer Letter & Brief Rationale - e.g., A - Addresses cost at scale]
    > User Answer: \[User's Answer Letter - e.g., C]
    > User Rationale: \[User's Rationale Text - e.g., Isn't the new method of ETL fabrication the same as the 'established methods'?]
    >
    > What crucial information did the user overlook or misunderstand that led them to choose the incorrect answer?  Focus on a single key element of the passage or question.

**Category 2:  Detailed Analysis - Exploring Thought Processes**

These prompts encourage the model to delve deeper into the user's reasoning and potential misconceptions.

*   **Prompt 4 (Comprehensive Analysis):**

    > SAT Question: \[Insert Question Text Here]
    > Correct Answer: \[Insert Correct Answer Letter & Full Rationale]
    > User Answer: \[User's Answer Letter - e.g., C]
    > User Rationale: \[User's Rationale Text - e.g., Isn't the new method of ETL fabrication the same as the 'established methods'?]
    >
    > Analyze the user's answer and rationale in detail.  Break down the correct answer's reasoning, then dissect the user's logic.  Identify the specific points where the user's understanding deviates from the passage's meaning. Suggest the specific type of mistake.
*   **Prompt 5 (Counter-Argument Focus):**

    > SAT Question: \[Insert Question Text Here]
    > Correct Answer: \[Insert Correct Answer Letter & Full Rationale]
    > User Answer: \[User's Answer Letter - e.g., C]
    > User Rationale: \[User's Rationale Text - e.g., Isn't the new method of ETL fabrication the same as the 'established methods'?]
    >
    >  Formulate a direct counter-argument to the user's rationale.  Why is their reasoning flawed, based on a close reading of the passage?  Specifically refute their interpretation with evidence from the text.
*   **Prompt 6 (Assumption Identification):**

    > SAT Question: \[Insert Question Text Here]
    > Correct Answer: \[Insert Correct Answer Letter & Full Rationale]
    > User Answer: \[User's Answer Letter - e.g., C]
    > User Rationale: \[User's Rationale Text - e.g., Isn't the new method of ETL fabrication the same as the 'established methods'?]
    >
    > What underlying assumptions is the user making in their rationale? Are these assumptions justified by the text? Explain how these assumptions lead to the incorrect answer.

**Category 3:  Multi-Step Reasoning - Simulating Tutor-Like Explanation**

These prompts ask the model to explain the problem and solution in a more educational, step-by-step manner.

*   **Prompt 7 (Step-by-Step Explanation):**

    > SAT Question: \[Insert Question Text Here]
    > Correct Answer: \[Insert Correct Answer Letter & Full Rationale]
    > User Answer: \[User's Answer Letter - e.g., C]
    > User Rationale: \[User's Rationale Text - e.g., Isn't the new method of ETL fabrication the same as the 'established methods'?]
    >
    > 1.  Explain the central idea of the passage in one sentence.
    > 2.  Explain the question's objective. What is it asking the user to identify?
    > 3.  Analyze each answer choice (A, B, C, D), explaining why each is either correct or incorrect, referring directly to the passage.
    > 4.  Specifically address the user's rationale, explaining why it is a misinterpretation or flawed inference.
*   **Prompt 8 (Targeted Guidance):**

    > SAT Question: \[Insert Question Text Here]
    > Correct Answer: \[Insert Correct Answer Letter & Full Rationale]
    > User Answer: \[User's Answer Letter - e.g., C]
    > User Rationale: \[User's Rationale Text - e.g., Isn't the new method of ETL fabrication the same as the 'established methods'?]
    >
    > The student answered C and reasoned that the new method is an established one.
    > Provide a breakdown to the student in this order
    > 1. Did the question ask about a 'new method'?
    > 2. What evidence refutes the idea of C. What keywords?
    > 3. Point to some specific lines and paraphrase it
*   **Prompt 9 (Similar/Different Exercise):**

    > SAT Question: \[Insert Question Text Here]
    > Correct Answer: \[Insert Correct Answer Letter & Full Rationale]
    > User Answer: \[User's Answer Letter - e.g., C]
    > User Rationale: \[User's Rationale Text - e.g., Isn't the new method of ETL fabrication the same as the 'established methods'?]
    >
    > Given the user picked answer C and provided a certain rational:
    > Please respond:
    > Briefly mention some items that might be similar and also different with regards to choice C.
    > For example: This could be talking about time scales in a history text
*   **Prompt 10 (Simple Rephrasing)**

    > SAT Question: \[Insert Question Text Here]
    > Correct Answer: \[Insert Correct Answer Letter & Full Rationale]
    > User Answer: \[User's Answer Letter - e.g., C]
    > User Rationale: \[User's Rationale Text - e.g., Isn't the new method of ETL fabrication the same as the 'established methods'?]
    > The user is clearly having issues of similar terminology between 'new' and 'established'
    > Can you take 2-3 senetences and try to explain this so it seems different, rephrasing as much as possible. Act as a english teacher, not an assistant or coder
**Important Considerations:**

*   **Testing:**  Thoroughly test each prompt variation with a diverse set of questions and user answers.  Evaluate the accuracy, clarity, and helpfulness of the model's responses.
*   **Iterative Refinement:**  Based on your testing, refine the prompts.  Experiment with wording, level of detail, and specific instructions to optimize the model's performance.
*   **Context:** You will need to input the question text, the correct answer letter and rationale, the user's answer letter, and the user's rationale *into* the prompt.
*   **User Rationale:**  Handle cases where the user *doesn't* provide a rationale.  You might insert a placeholder like "User provided no rationale" and adjust the prompt to focus on why the user's answer is incorrect based on the passage.
*   **Few Shot:** Provide a few worked out examples beforehand so it can grasp some context.

I've provided prompts covering different analytical depths. You can adjust based on resources and use case.


Here are some sample prompts extracted from the Gemini output:

In [13]:
prompts = [ 
    {
        "prompt": "Analyze the user's error. Why is the correct answer better supported by the text than the user's answer? Be concise.",
        "avg_score": 0.0,
    },
    {
        "prompt": """
Consider the SAT question and the user's selected answer.
1. Identify the specific textual evidence that strongly supports the correct answer choice.
2. Identify any assumptions the user might be making that lead to their chosen answer.
3. Explain why the textual support for the correct answer is stronger or more direct than any implied support for the user's answer.  If the user provides their own rational, specifically address the rational and mention what part of the question makes the rational wrong.
        """,
        "avg_score": 0.0,
    },
    {
        "prompt" : """
Let's analyze this question step-by-step to understand the user's error.

1.  Summarize the main point of the passage in your own words.
2.  Identify the key phrase(s) in the question that guide you to the correct answer.
3.  Explain why the correct answer directly addresses the question based on the text.
4.  Explain what specific words, or phrases, may make the users' answer incorrect.
5. Given the user's answer, what misunderstanding might the student have in this section? What advice could you give to them in the future?
        """,
        "avg_score": 0.0,
    },
    {
        "prompt": """
Evaluate both the correct answer and the user's answer as potential responses to the question.

*   Present the strongest possible argument *in favor* of the user's answer.
*   Present the strongest possible argument *against* the user's answer.
*   Explain why, ultimately, the correct answer is the superior choice based on textual evidence.
        """,
        "avg_score": 0.0,
    },
    { 
        "prompt": """
Analyze the question, correct answer, and the user's answer.

Step 1: Summarize the core argument or concept being tested in the question.
Step 2: Identify the specific details in the question and correct answer rationale that are most crucial for arriving at the correct answer.
Step 3: Analyze the user's answer choice.  Explain why it is incorrect. If the user provided a rationale, identify where the user's reasoning is flawed, citing specific evidence from the question text or correct answer rationale.  If no rationale was provided, hypothesize potential reasons for the incorrect choice based on common SAT misconceptions or test-taking errors related to this question type.
Step 4: Explicitly state the mistake that the user made to reach their conclusion and how they can reach the proper conclusion by fixing the flaw.
        """,
        "avg_score": 0.0,
    },
]

Some sample questions varied in difficulty:

In [14]:
# All questions are (c) CollegeBoard 2025.
questions = [
    {
        "question": """
In 1453, English King Henry VI became unfit to rule after falling gravely ill. As a result, Parliament appointed Richard, Third Duke of York, who had a strong claim to the English throne, to rule as Lord Protector. Upon recovering two years later, ______ forcing an angered Richard from the royal court and precipitating a series of battles later known as the Wars of the Roses. \n\
Which choice completes the text so that it conforms to the conventions of Standard English?
A. Henry resumed his reign,
B. the reign of Henry resumed,
C. Henry’s reign resumed,
D. it was Henry who resumed his reign, 
            """,
        # Notice that the default rationale provided for Choice C doesn't explain it well enough
        "rationale": """
Choice A is the best answer. The convention being tested is subject-modifier placement. This choice ensures that the introductory phrase “upon recovering two years later” appears immediately before the noun it modifies (“Henry”), clearly establishing that Henry recovered two years later. 
Choice B is incorrect because it results in a dangling modifier. The placement of the noun phrase “the reign of Henry” immediately after the introductory phrase illogically suggests that the reign of Henry recovered two years later. 
Choice C is incorrect because it results in a dangling modifier. The placement of the noun phrase “Henry’s reign” immediately after the introductory phrase illogically suggests that Henry’s reign recovered two years later. 
Choice D is incorrect because it results in a dangling modifier. The placement of the function word “it” immediately after the introductory phrase illogically suggests that “it” recovered two years later. 
        """,
        "user_answer": "C",
        "user_rationale": "Isn't the subject that Henry got unfit to rule?",
    },
    # Feel free to uncomment the following

#    {
#        "question": """
#A study by a team including finance professor Madhu Veeraraghavan suggests that exposure to sunshine during the workday can lead to overly optimistic behavior. __Using data spanning from 1994 to 2010 for a set of US companies, the team compared over 29,000 annual earnings forecasts to the actual earnings later reported by those companies.__ The team found that the greater the exposure to sunshine at work in the two weeks before a manager submitted an earnings forecast, the more the manager’s forecast exceeded what the company actually earned that year.
#Which choice best states the function of the underlined sentence in the overall structure of the text? 
            
#A. To summarize the results of the team’s analysis
#B. To present a specific example that illustrates the study’s findings
#C. To explain part of the methodology used in the team’s study
#D. To call out a challenge the team faced in conducting its analysis
#            """,
#        "rationale": """
#Choice C is the best answer because it best describes how the underlined sentence functions in the text as a whole. The first sentence presents the implications of Veeraraghavan’s team’s study: sunshine exposure during work hours can cause overly optimistic behavior. The underlined sentence then describes the data the team consulted and how they were used (comparing predictions about earnings to what the companies actually earned), and the final sentence presents what the team found in their examination of the data. Thus, the underlined sentence mainly functions to explain part of the methodology used in the team’s study. 
#Choice A is incorrect because the underlined sentence explains in part how the team conducted their analysis of the effect of sunshine but doesn’t address what the team found; a broad summary is instead given in the other two sentences. 
#Choice B is incorrect because the underlined sentence doesn’t present any specific examples from the team’s comparisons of 29,000 earnings predictions to actual earnings; it simply explains in part how the team conducted their analysis. 
#Choice D is incorrect because the underlined sentence simply explains in part how the team conducted their analysis; the text never mentions any challenges that the team encountered in their study. 
#            """,
#        "user_answer": "A",
#        "user_rationale": "Is it not explaining the part of the experiment which illustrates what happens?"
#    },
#    {
#        "question" : """
#In a paper about p-i-n planar perovskite solar cells (one of several perovskite cell architectures designed to collect and store solar power), Lyndsey McMillon-Brown et al. described a method for fabricating the cell’s electronic transport layer (ETL) using a spray coating. Conventional ETL fabrication is accomplished using a solution of nanoparticles. The process can result in a loss of up to 80% of the solution, increasing the cost of manufacturing at scale—an issue that may be obviated by spray coating fabrication, which the researchers describe as “highly reproducible, concise, and practical.”

#What does the text most strongly suggest about conventional ETL fabrication?
#A. It is less suitable for manufacturing large volumes of planar p-i-n perovskite solar cells than an alternative fabrication method may be.
#B. It is more expensive when manufacturing at scale than are processes for fabricating ETLs used in other perovskite solar cell architectures.
#C. It typically entails a greater loss of nanoparticle solution than do other established approaches for ETL fabrication.
#D. It is somewhat imprecise and therefore limits the potential effectiveness of p-i-n planar perovskite solar cells at capturing and storing solar power.
#""",
#        "rationale" : """
#Choice A is the best answer. Conventional solar cell fabrication increases “the cost of manufacturing at scale,” but spray coating might get rid of that problem.

#Choice B is incorrect. This is not completely supported by the text. While it’s true that conventional ETL fabrication is expensive at scale, there’s nothing in the text that mentions other perovskite solar cell architectures. Choice C is incorrect. This choice does not match the text. Only one conventional method of ETL fabrication is described, so we can’t compare the solution loss in this method to that of other conventional methods. Choice D is incorrect. This choice isn’t supported by the text. The text never suggests that the effectiveness of solar cells changes based on their method of fabrication. 
#""",
#        "user_answer" : "C",
#        "user_rationale": "Isn't the new method of ETL fabrication the same as the 'established methods'",
#    },
]

## Evaluation 
The rubric is slightly biased in favor of the debate prompt, but it also hits the other points on the rubric and maintains clarity. If more prompts are to be tested, they can be added above.

In [15]:
import enum

# Define the evaluation prompt
EVAL_PROMPT = """
# Instruction
Evaluate the AI’s analysis of a student’s SAT error. Focus on how well it explains why the correct answer is textually supported and why the user’s answer is incorrect.

# Evaluation
## **Metric Definition**
Assess the **accuracy**, **completeness**, **groundedness**, and **clarity** of the response. The AI must:
1. Correctly identify textual evidence for the correct answer.
2. Explain flaws in the user’s answer (and their rationale, if provided).
3. Follow the prompt’s instructions precisely.


## **New Criteria**  
1. **Instructional Creativity**:  
   - Does the creative approach (e.g., debate, step-by-step analysis) **enhance understanding** of why the correct answer is textually supported?  
   - Does it **strategically use the prompt’s structure** (e.g., arguments for/against) to highlight key SAT skills like evidence analysis or assumption identification?  

2. **Educational Effectiveness**:  
   - Does the creativity **directly serve the learning goal** (e.g., clarifying misconceptions, modeling SAT logic), or is it merely ornamental?  
   - Does it **engage the learner** while maintaining rigor (e.g., making complex reasoning more accessible)?  

## **Revised Rating Rubric**  
- **5 (Excellent)**:  
  - Creative structure (e.g., debate) **directly reinforces** why the correct answer is superior.  
  - Uses the format to **explicitly contrast** the user’s error with textual evidence (e.g., “The strongest argument *for* the user’s answer is X, but the text contradicts this because Y”).  
  - Balances creativity with precision and clarity.  

- **4 (Good)**:  
  - Creative approach is **mostly effective** but slightly misses opportunities to deepen understanding (e.g., lists arguments but doesn’t explicitly tie them to SAT skills).  
  - Minor clarity issues in linking creativity to the text.  

- **3 (Adequate)**:  
  - Creativity **distracts** slightly from the core analysis (e.g., overemphasizes hypothetical arguments without grounding in the text).  
  - Fails to fully leverage the creative structure to address the user’s error.  

- **2 (Poor)**:  
  - Creative format **obscures key points** (e.g., hypothetical arguments misrepresent the text).  
  - Prioritizes style over substance; minimal educational value.  

- **1 (Very Poor)**:  
  - Creativity **undermines accuracy** (e.g., invents textual evidence to support arguments).  

## **Examples**  
### **Debate-Style Prompt (Question 4)**  
**User Answer**: C (“Henry’s reign resumed”)  
**Correct Answer**: A (“Henry resumed his reign”)  

**Good AI Response** (Rating 5):  
*“Argument FOR C: A student might think ‘Henry’s reign’ is the subject because the prior sentence mentions Parliament appointing a ruler.  
Argument AGAINST C: The modifier ‘upon recovering’ must refer to a person (Henry), not an abstract concept (‘reign’). The text says Henry fell ill, so only he—not his reign—can ‘recover.’  
Conclusion: While C seems plausible, the modifier rule and textual context make A correct.”*  

**Why it’s a 5**:  
- Uses debate structure to **preemptively address** the user’s assumption.  
- Directly ties arguments to **textual evidence** (Henry’s illness/recovery).  

**Poor AI Response** (Rating 2):  
*“FOR C: ‘Reign’ is a noun, so it matches the sentence structure.  
AGAINST C: It sounds awkward.  
Conclusion: A is better because it’s smoother.”*  

**Why it’s a 2**:  
- Creativity (debate) adds no educational value; arguments lack textual grounding.  
- Fails to explain grammar rules or modifier placement.  

## Evaluation Steps
STEP 1: Assess the response in aspects of instruction following, groundedness, conciseness, and verbosity according to the criteria.
STEP 2: Score based on the rubric.

# User Inputs and AI-generated Response
## User Inputs

### Prompt
{prompt}

## AI-generated Response
{response}
"""

# Define a structured enum class to capture the result.
class SummaryRating(enum.Enum):
  VERY_GOOD = '5'
  GOOD = '4'
  OK = '3'
  BAD = '2'
  VERY_BAD = '1'

# Coerce into the desired structure.
structured_output_config = types.GenerateContentConfig(
    response_mime_type="text/x.enum",
    response_schema=SummaryRating,
)

def eval_summary(prompt, ai_response):
  """Evaluate the generated summary against the prompt used."""

  eval_chat = client.chats.create(model='gemini-2.0-flash')
    
  # Generate the full text response.
  response = eval_chat.send_message(
      message=EVAL_PROMPT.format(prompt=prompt, response=ai_response)
  )
  verbose_eval = response.text


  response = eval_chat.send_message(
      message="Convert the final score.",
      config=structured_output_config,
  )
  structured_eval = response.parsed

  return verbose_eval, structured_eval

Evaluating the prompts against each other:

In [16]:
# TODO: maybe share context so the LLM can compare different outputs against its own rating to be more accurate perhaps?

#NUM_ITERATIONS = 5
NUM_ITERATIONS=1

with open("/kaggle/working/promptlogs.txt", "a") as logs:
    
    for prompt in prompts:
        sum = 0
        logs.write(f"Prompt: {prompt}\n")
    
        for question in questions:
            logs.write(f"Question: {question}\n")
            for i in range(NUM_ITERATIONS):
                response = client.models.generate_content(
                    model="gemini-2.0-flash-001",
                    contents=str(question),
                    config=GenerateContentConfig(
                        system_instruction=[prompt["prompt"],],
                        temperature=0.1,
                    ),
                )

                text_eval, struct_eval = eval_summary(prompt=prompt["prompt"], ai_response=response)
                print(f"Iteration {i}: {struct_eval}")
                logs.write(f"Iteration {i}: {text_eval}\n")
                sum += int(struct_eval.value)



        prompt["avg_score"] = sum / (len(questions) * NUM_ITERATIONS)
        print(f"Prompt:\n{prompt['prompt']}\nScore: {prompt['avg_score']}\n\n")
    
        logs.write("\n\n")

Iteration 0: SummaryRating.OK
Prompt:
Analyze the user's error. Why is the correct answer better supported by the text than the user's answer? Be concise.
Score: 3.0


Iteration 0: SummaryRating.GOOD
Prompt:

Consider the SAT question and the user's selected answer.
1. Identify the specific textual evidence that strongly supports the correct answer choice.
2. Identify any assumptions the user might be making that lead to their chosen answer.
3. Explain why the textual support for the correct answer is stronger or more direct than any implied support for the user's answer.  If the user provides their own rational, specifically address the rational and mention what part of the question makes the rational wrong.
        
Score: 4.0


Iteration 0: SummaryRating.VERY_GOOD
Prompt:

Let's analyze this question step-by-step to understand the user's error.

1.  Summarize the main point of the passage in your own words.
2.  Identify the key phrase(s) in the question that guide you to the correct

The debate prompt consistently scores the highest, so that will be picked. Prompts can be re-generated and re-evaluated with ease if need be.

In [17]:
final_prompt = """
Evaluate both the correct answer and the user's answer as potential responses to the question.

*   Present the strongest possible argument *in favor* of the user's answer.
*   Present the strongest possible argument *against* the user's answer.
*   Explain why, ultimately, the correct answer is the superior choice based on textual evidence.
"""

## Note: "Smart" Questions
The majority of questions that Aquarc Intelligence has to process will contain HTML data. How does Aquarc Intelligence interact with "smart" questions (i.e. questions that make use of intelligent features like formatting, graphs, or other image representations of data)?

Most questions can be "cleansed" by converting to markdown like the following because there is text data available for most images (although the output is still quite messy):

In [18]:
# This question is (c) CollegeBoard 2025
rich_question = {
    "question": """
<figure class="image">
   <svg aria-label="Bar graph titled Municipalities’ Responses to Inquiries about Potential Incentives for Firm. The horizontal axis has no label. 3 data categories are shown. The vertical axis is labeled Number of municipalities. It ranges from 0 to 1,300 in increments of 100. Refer to long description." height="578.7376708984375" role="img" viewbox="0 0 400 578.7376708984375" width="400" xmlns="http://www.w3.org/2000/svg">
      <g data-name="Layer 1" id="ed420550-79eb-48d4-af01-cd27cdd08afd">
         <defs>
            +
            <pattern height="100" id="bar4" patterntransform="rotate(50)" patternunits="userSpaceOnUse" width="10" x="0" y="0">
               +
               <rect fill="#CDCDCD" height="100" width="5" x="0" y="0"></rect>
               +
               <rect fill="#444444" height="100" width="5" x="5" y="0"></rect>
               +
            </pattern>
            +
         </defs>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="99.75999450683594" x2="385" xmlns="http://www.w3.org/2000/svg" y1="72" y2="72"></line>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="93.75999450683594" x2="105.75999450683594" xmlns="http://www.w3.org/2000/svg" y1="72" y2="72"></line>
         <text fill="#000000" font-family="Crimson Text" font-size="19.84" text-anchor="end" transform="translate(87.75999450683594 78)">1,300</text>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="99.75999450683594" x2="385" xmlns="http://www.w3.org/2000/svg" y1="91.23076923076923" y2="91.23076923076923"></line>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="93.75999450683594" x2="105.75999450683594" xmlns="http://www.w3.org/2000/svg" y1="91.23076923076923" y2="91.23076923076923"></line>
         <text fill="#000000" font-family="Crimson Text" font-size="19.84" text-anchor="end" transform="translate(87.75999450683594 97.23076923076923)">1,200</text>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="99.75999450683594" x2="385" xmlns="http://www.w3.org/2000/svg" y1="110.46153846153845" y2="110.46153846153845"></line>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="93.75999450683594" x2="105.75999450683594" xmlns="http://www.w3.org/2000/svg" y1="110.46153846153845" y2="110.46153846153845"></line>
         <text fill="#000000" font-family="Crimson Text" font-size="19.84" text-anchor="end" transform="translate(87.75999450683594 116.46153846153845)">1,100</text>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="99.75999450683594" x2="385" xmlns="http://www.w3.org/2000/svg" y1="129.69230769230768" y2="129.69230769230768"></line>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="93.75999450683594" x2="105.75999450683594" xmlns="http://www.w3.org/2000/svg" y1="129.69230769230768" y2="129.69230769230768"></line>
         <text fill="#000000" font-family="Crimson Text" font-size="19.84" text-anchor="end" transform="translate(87.75999450683594 135.69230769230768)">1,000</text>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="99.75999450683594" x2="385" xmlns="http://www.w3.org/2000/svg" y1="148.9230769230769" y2="148.9230769230769"></line>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="93.75999450683594" x2="105.75999450683594" xmlns="http://www.w3.org/2000/svg" y1="148.9230769230769" y2="148.9230769230769"></line>
         <text fill="#000000" font-family="Crimson Text" font-size="19.84" text-anchor="end" transform="translate(87.75999450683594 154.9230769230769)">900</text>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="99.75999450683594" x2="385" xmlns="http://www.w3.org/2000/svg" y1="168.15384615384613" y2="168.15384615384613"></line>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="93.75999450683594" x2="105.75999450683594" xmlns="http://www.w3.org/2000/svg" y1="168.15384615384613" y2="168.15384615384613"></line>
         <text fill="#000000" font-family="Crimson Text" font-size="19.84" text-anchor="end" transform="translate(87.75999450683594 174.15384615384613)">800</text>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="99.75999450683594" x2="385" xmlns="http://www.w3.org/2000/svg" y1="187.3846153846154" y2="187.3846153846154"></line>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="93.75999450683594" x2="105.75999450683594" xmlns="http://www.w3.org/2000/svg" y1="187.3846153846154" y2="187.3846153846154"></line>
         <text fill="#000000" font-family="Crimson Text" font-size="19.84" text-anchor="end" transform="translate(87.75999450683594 193.3846153846154)">700</text>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="99.75999450683594" x2="385" xmlns="http://www.w3.org/2000/svg" y1="206.6153846153846" y2="206.6153846153846"></line>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="93.75999450683594" x2="105.75999450683594" xmlns="http://www.w3.org/2000/svg" y1="206.6153846153846" y2="206.6153846153846"></line>
         <text fill="#000000" font-family="Crimson Text" font-size="19.84" text-anchor="end" transform="translate(87.75999450683594 212.6153846153846)">600</text>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="99.75999450683594" x2="385" xmlns="http://www.w3.org/2000/svg" y1="225.84615384615384" y2="225.84615384615384"></line>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="93.75999450683594" x2="105.75999450683594" xmlns="http://www.w3.org/2000/svg" y1="225.84615384615384" y2="225.84615384615384"></line>
         <text fill="#000000" font-family="Crimson Text" font-size="19.84" text-anchor="end" transform="translate(87.75999450683594 231.84615384615384)">500</text>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="99.75999450683594" x2="385" xmlns="http://www.w3.org/2000/svg" y1="245.07692307692307" y2="245.07692307692307"></line>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="93.75999450683594" x2="105.75999450683594" xmlns="http://www.w3.org/2000/svg" y1="245.07692307692307" y2="245.07692307692307"></line>
         <text fill="#000000" font-family="Crimson Text" font-size="19.84" text-anchor="end" transform="translate(87.75999450683594 251.07692307692307)">400</text>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="99.75999450683594" x2="385" xmlns="http://www.w3.org/2000/svg" y1="264.30769230769226" y2="264.30769230769226"></line>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="93.75999450683594" x2="105.75999450683594" xmlns="http://www.w3.org/2000/svg" y1="264.30769230769226" y2="264.30769230769226"></line>
         <text fill="#000000" font-family="Crimson Text" font-size="19.84" text-anchor="end" transform="translate(87.75999450683594 270.30769230769226)">300</text>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="99.75999450683594" x2="385" xmlns="http://www.w3.org/2000/svg" y1="283.53846153846155" y2="283.53846153846155"></line>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="93.75999450683594" x2="105.75999450683594" xmlns="http://www.w3.org/2000/svg" y1="283.53846153846155" y2="283.53846153846155"></line>
         <text fill="#000000" font-family="Crimson Text" font-size="19.84" text-anchor="end" transform="translate(87.75999450683594 289.53846153846155)">200</text>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="99.75999450683594" x2="385" xmlns="http://www.w3.org/2000/svg" y1="302.7692307692308" y2="302.7692307692308"></line>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="93.75999450683594" x2="105.75999450683594" xmlns="http://www.w3.org/2000/svg" y1="302.7692307692308" y2="302.7692307692308"></line>
         <text fill="#000000" font-family="Crimson Text" font-size="19.84" text-anchor="end" transform="translate(87.75999450683594 308.7692307692308)">100</text>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="99.75999450683594" x2="385" xmlns="http://www.w3.org/2000/svg" y1="322" y2="322"></line>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="93.75999450683594" x2="105.75999450683594" xmlns="http://www.w3.org/2000/svg" y1="322" y2="322"></line>
         <text fill="#000000" font-family="Crimson Text" font-size="19.84" text-anchor="end" transform="translate(87.75999450683594 328)">0</text>
         <text fill="#000000" font-family="Crimson Text" font-size="19.84" text-anchor="middle" transform="translate(24 197) rotate(-90)">Number of municipalities</text>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="99.75999450683594" x2="99.75999450683594" xmlns="http://www.w3.org/2000/svg" y1="72" y2="322"></line>
         <rect fill="#B3B3B3" height="240.76923076923077" stroke="#000000" stroke-linecap="round" stroke-linejoin="mitre" stroke-width="0.9" width="28.524000549316405" x="128.28399505615235" xmlns="http://www.w3.org/2000/svg" y="81.23076923076923"></rect>
         <rect fill="#333333" height="240" stroke="#000000" stroke-linecap="round" stroke-linejoin="mitre" stroke-width="0.9" width="28.524000549316405" x="156.80799560546876" xmlns="http://www.w3.org/2000/svg" y="82"></rect>
         <text fill="#000000" font-family="Crimson Text" font-size="19.84" text-anchor="end" transform="translate(166.72799560546875 341.84) rotate(-40)" x="0" xmlns="http://www.w3.org/2000/svg" y="0">no response</text>
         <rect fill="#B3B3B3" height="39.80769230769231" stroke="#000000" stroke-linecap="round" stroke-linejoin="mitre" stroke-width="0.9" width="28.524000549316405" x="213.85599670410159" xmlns="http://www.w3.org/2000/svg" y="282.1923076923077"></rect>
         <rect fill="#333333" height="39.42307692307692" stroke="#000000" stroke-linecap="round" stroke-linejoin="mitre" stroke-width="0.9" width="28.524000549316405" x="242.379997253418" xmlns="http://www.w3.org/2000/svg" y="282.5769230769231"></rect>
         <text fill="#000000" font-family="Crimson Text" font-size="19.84" text-anchor="end" transform="translate(252.29999725341798 341.84) rotate(-40)" x="0" xmlns="http://www.w3.org/2000/svg" y="0">responded to inquiry</text>
         <rect fill="#B3B3B3" height="24.23076923076923" stroke="#000000" stroke-linecap="round" stroke-linejoin="mitre" stroke-width="0.9" width="28.524000549316405" x="299.4279983520508" xmlns="http://www.w3.org/2000/svg" y="297.7692307692308"></rect>
         <rect fill="#333333" height="23.46153846153846" stroke="#000000" stroke-linecap="round" stroke-linejoin="mitre" stroke-width="0.9" width="28.524000549316405" x="327.9519989013672" xmlns="http://www.w3.org/2000/svg" y="298.53846153846155"></rect>
         <text fill="#000000" font-family="Crimson Text" font-size="19.84" text-anchor="end" transform="translate(337.8719989013672 341.84) rotate(-40)" x="0" xmlns="http://www.w3.org/2000/svg" y="0">offered incentive</text>
         <text fill="#000000" font-family="Crimson Text" font-size="19.84" text-anchor="middle" transform="translate(242.37999725341797 24)">Municipalities’ Responses to Inquiries </text>
         <text fill="#000000" font-family="Crimson Text" font-size="19.84" text-anchor="middle" transform="translate(242.37999725341797 48)">about Potential Incentives for Firm</text>
         <rect fill="none" height="71" stroke="#000000" stroke-linejoin="mitre" stroke-width="0.9" width="280.1479034423828" x="67.4260482788086" xmlns="http://www.w3.org/2000/svg" y="496.7376708984375"></rect>
         <rect fill="#B3B3B3" height="12" stroke="#000000" stroke-linejoin="mitre" stroke-width="0.9" width="12" x="74.4260482788086" xmlns="http://www.w3.org/2000/svg" y="508.7376708984375"></rect>
         <text fill="#000000" font-family="Crimson Text" font-size="19.84" text-anchor="top" transform="translate(96.4260482788086 520.7376708984375)"> announcement before election</text>
         <rect fill="#333333" height="12" stroke="#000000" stroke-linejoin="mitre" stroke-width="0.9" width="12" x="74.4260482788086" xmlns="http://www.w3.org/2000/svg" y="540.7376708984375"></rect>
         <text fill="#000000" font-family="Crimson Text" font-size="19.84" text-anchor="top" transform="translate(96.4260482788086 552.7376708984375)"> announcement after election</text>
      </g>
   </svg>
</figure>
<div aria-label="Long description for bar graph titled Municipalities’ Responses to Inquiries about Potential Incentives for Firm" class="sr-only" role="region">
   <ul>
      +
      <li>
         For each data category, the following bars are shown: <br/>                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           +
         <ul>
            +
            <li>announcement before election</li>
            +
            <li>announcement after election</li>
            +
         </ul>
         +
      </li>
      +
      <li>
         The data for the 3 categories are as follows: <br/>                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   +
         <ul>
            +
            <li>
               no response:                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          +
               <ul>
                  +
                  <li>announcement before election: 1,252</li>
                  +
                  <li>announcement after election: 1,248</li>
                  +
               </ul>
               +
            </li>
            +
            <li>
               responded to inquiry:                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 +
               <ul>
                  +
                  <li>announcement before election: 207</li>
                  +
                  <li>announcement after election: 205</li>
                  +
               </ul>
               +
            </li>
            +
            <li>
               offered incentive:                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    +
               <ul>
                  +
                  <li>announcement before election: 128</li>
                  +
                  <li>announcement after election: 122</li>
                  +
               </ul>
               +
            </li>
            +
         </ul>
         +
      </li>
      +
   </ul>
</div>
+
<p>In the United States, firms often seek incentives from municipal governments to expand to those municipalities. A team of political scientists hypothesized that municipalities are much more likely to respond to firms and offer incentives if expansions can be announced in time to benefit local elected officials than if they can’t. The team contacted officials in thousands of municipalities, inquiring about incentives for a firm looking to expand and indicating that the firm would announce its expansion on a date either just before or just after the next election. </p>

<p>Which choice best describes data from the graph that weaken the team&rsquo;s hypothesis?</p>

A. <p>A large majority of the municipalities that received an inquiry mentioning plans for an announcement before the next election didn&rsquo;t respond to the inquiry.</p>
B. <p>The proportion of municipalities that responded to the inquiry or offered incentives didn&rsquo;t substantially differ across the announcement timing conditions.&nbsp;</p>
C. <p>Only around half the municipalities that responded to inquiries mentioning plans for an announcement before the next election offered incentives.&nbsp;</p>
D. <p>Of the municipalities that received an inquiry mentioning plans for an announcement date after the next election, more than 1,200 didn&rsquo;t respond and only around 100 offered incentives.</p>
    """,
    "rationale": """
<p>Choice B is the best answer. The lighter bars show what happened when the announcement was to come before the election, and the darker bars show what happened when the announcement was to come after the election. For all three of the outcomes, the light and dark bars are virtually the same, demonstrating that the announcement timing didn&rsquo;t actually make a difference. </p>
<p>Choice A is incorrect. This accurately describes some data from the graph, but it doesn&rsquo;t weaken the hypothesis. It doesn&rsquo;t include the &ldquo;announcement after election&rdquo; data for comparison. Choice C is incorrect. This accurately describes some data from the graph, but it doesn&rsquo;t weaken the hypothesis. It doesn&rsquo;t include the &ldquo;announcement after election&rdquo; data for comparison. Choice D is incorrect. This accurately describes some data from the graph, but it doesn&rsquo;t weaken the hypothesis. It doesn&rsquo;t include the &ldquo;announcement before election&rdquo; data for comparison. </p>    
    """,
    "user_choice": "D",
    "user_rationale": "isn't this data objly true?"
}

Even with the SVG, Gemini can still interpret the noisy data:

In [19]:
sample_rich_answer = client.models.generate_content(
                    model="gemini-2.0-flash-001",
                    contents=str(rich_question),
                    config=GenerateContentConfig(
                        system_instruction=[final_prompt,],
                        temperature=0.1,
                    ),
                )

Markdown(sample_rich_answer.text)

Okay, let's analyze the question and the user's answer.

**Question:** Which choice best describes data from the graph that weaken the team’s hypothesis?

**Correct Answer:** B. The proportion of municipalities that responded to the inquiry or offered incentives didn’t substantially differ across the announcement timing conditions.

**User's Answer:** D. Of the municipalities that received an inquiry mentioning plans for an announcement date after the next election, more than 1,200 didn’t respond and only around 100 offered incentives.

**Arguments in favor of the user's answer (D):**

*   **Directly addresses the "after election" scenario:** The user's answer focuses specifically on the municipalities that received inquiries about announcements *after* the election. The hypothesis suggests these municipalities should be *less* likely to respond or offer incentives.
*   **Highlights a potential lack of interest:** The user's answer points out that a large number of municipalities didn't respond and only a small number offered incentives in the "after election" scenario. This could be interpreted as evidence supporting the idea that municipalities are less interested when there's no immediate electoral benefit.
*   **Based on factual data:** The user's answer is based on actual numbers that can be read from the graph (approximately 1200+ "no response" and around 100 "offered incentive" for the "announcement after election" condition).

**Arguments against the user's answer (D):**

*   **Lacks a comparative element:** The core of the hypothesis is a *comparison* between the "before election" and "after election" scenarios. The user's answer only describes the "after election" scenario in isolation. To weaken the hypothesis, we need to show that the "before election" scenario is *not* significantly different.
*   **Doesn't directly contradict the hypothesis:** While the user's answer shows a certain level of disinterest in the "after election" scenario, it doesn't prove that municipalities are *not* more likely to respond before an election. It's possible that the "before election" numbers are even *lower*, which would actually *support* the hypothesis (though this is not the case).

**Why the correct answer (B) is superior:**

*   **Directly addresses the comparative aspect of the hypothesis:** The correct answer explicitly states that there's no substantial difference in the proportion of municipalities that responded or offered incentives, regardless of the announcement timing. This *directly* contradicts the hypothesis that timing matters.
*   **Encompasses all relevant data:** The correct answer considers both "responded to inquiry" and "offered incentives," providing a more comprehensive assessment of the hypothesis.
*   **Supported by the graph:** By visually comparing the heights of the "before election" (lighter) and "after election" (darker) bars for each category ("no response," "responded to inquiry," "offered incentive"), it's clear that the differences are minimal. This supports the claim that the announcement timing didn't have a significant impact.

**Textual Evidence:**

The graph shows that:

*   "Responded to inquiry" is approximately 207 (before election) and 205 (after election).
*   "Offered incentive" is approximately 128 (before election) and 122 (after election).

These numbers are very close, indicating that the timing of the announcement had little to no effect on whether municipalities responded or offered incentives. This directly supports choice B.

**Conclusion:**

While the user's answer (D) identifies a factual observation from the graph, it fails to address the core comparative element of the hypothesis. The correct answer (B) directly contradicts the hypothesis by stating that there's no significant difference between the "before election" and "after election" scenarios, and this is supported by the data presented in the graph.


While cleansing the data may be smoother for the LLM, not all data can be easily cleansed:

In [20]:
from html_to_markdown import convert_to_markdown

rich_question = {
    "question": convert_to_markdown("""
<p><span role="region" aria-label="Referenced Content"><u>&ldquo;How lifelike are they?&rdquo;</u></span> Many computer animators prioritize this question as they strive to create ever more realistic environments and lighting. Generally, while characters in computer-animated films appear highly exaggerated, environments and lighting are carefully engineered to mimic reality. But some animators, such as Pixar&rsquo;s Sanjay Patel, are focused on a different question. Rather than asking first whether the environments and lighting they&rsquo;re creating are convincingly lifelike, Patel and others are asking whether these elements reflect their films&rsquo; unique stories.</p>

<p>Which choice best describes the function of the underlined question in the text as a whole?</p>

A. <p>It reflects a primary goal that many computer animators have for certain components of the animations they produce.</p>
B. <p>It represents a concern of computer animators who are more interested in creating unique backgrounds and lighting effects than realistic ones.</p>
C. <p>It conveys the uncertainty among many computer animators about how to create realistic animations using current technology.</p>
D. <p>It illustrates a reaction that audiences typically have to the appearance of characters created by computer animators.</p>
    """),
    "rationale": convert_to_markdown("""
<p>Choice A is the best answer because it most accurately describes the function of the underlined question in the text as a whole. The text begins with the underlined question, &ldquo;How lifelike are they?&rdquo; The text then explains that many computer animators pose this question about the environments and lighting that they create for animated films, striving for realistic animation of those components even if the characters themselves aren&rsquo;t portrayed in realistic terms. The focus of the text then shifts to describe how some animators strive to create environments and lighting that reflect the film&rsquo;s unique stories rather than making them appear realistic. Therefore, the function of the underlined question is to reflect a primary goal that many computer animators have for certain components of the animations they produce. </p><p>Choice B is incorrect because, as the text makes clear, the underlined question is one posed by computer animators who wish to create realistic backgrounds and lighting effects, not by those who, instead, wish to create effects that reflect films&rsquo; unique stories and aren&rsquo;t necessarily realistic; this latter group of animators is discussed later in the text. Choice C is incorrect. As the text explains, many computer animators strive for realistic environments and lighting, while others do not; this difference of approach relates to whether these components should be realistic, not to how realism can be achieved using current technology, and the text never suggests that animators are uncertain how to achieve it. Choice D is incorrect because the underlined question pertains to the perspective of computer animators, not the audience, and the text never considers audience&rsquo;s reactions to characters in animated films. </p>
    """),
    "user_choice": "C",
    "user_rationale": "why is it not C? Aren't they asking the question because they are uncertain about it?",
}

print(rich_question["question"])


“How lifelike are they?” Many computer animators prioritize this question as they strive to create ever more realistic environments and lighting. Generally, while characters in computer\-animated films appear highly exaggerated, environments and lighting are carefully engineered to mimic reality. But some animators, such as Pixar’s Sanjay Patel, are focused on a different question. Rather than asking first whether the environments and lighting they’re creating are convincingly lifelike, Patel and others are asking whether these elements reflect their films’ unique stories.


Which choice best describes the function of the underlined question in the text as a whole?



A. It reflects a primary goal that many computer animators have for certain components of the animations they produce.


B. It represents a concern of computer animators who are more interested in creating unique backgrounds and lighting effects than realistic ones.


C. It conveys the uncertainty among many computer ani

Perhaps in this scenario, "good enough" gets the job done.

# Multiple Reasoning Capabilities using Tree of Thoughts (ToT)
To understand questions better, the debate-style approach may not suffice; multiple, parallel thinking processes may be necessary for the user to truly grasp the problem with their current reasoning. This approach is less time consuming and may help the user better than walking through and typing their own rationale. Tree of Thoughts (ToT) is a technique for the agent to use multiple processes to approach the relevant answer for the user. The agent will **have to** output in JSON so that the user's screen can be separated and decluttered.

In [21]:
response = client.models.generate_content(
    model="gemini-2.0-flash-001",
    # no specs yet
    contents=f"""
**Objective**: Create SAT-focused prompt variations that generate multiple structured thinking processes for error analysis.
                
**Requirements for the agent that you will create a prompt for**:
1. The model you are creating a prompt for will analyze: 
   - Current SAT question
   - Official rationale
   - User's answer + rationale
2. Must output in the following format:
```json
[
    {{
        "thinking_process": "[Here you will define a process step by step to get one of the answers]",
        "leads_to": "user" | "correct"
    }},
    {{
        "thinking_process": "[Here you will define another process step by step to get one of the answers]",
        "leads_to": "user" | "correct"
    }},
    {{
        "thinking_process": "[Include as many of these as possible!]",
        "leads_to": "user" | "correct"
    }},
]
```

3. There will be a model before you that contrasts an argument in favor of the user's answer with an argument for the actual answer.
You will be given that output. Create a prompt such that this model will build upon it.

The model may not include anything outside the JSON.

\"leads_to\" represents that it leads to the users answer or the \"correct\" answer. There are no other options.

Example question your model may receive:

Question:
{question['question']}

Rationale:
{question['rationale']}

User Response:
- Answer: {question['user_answer']}
- User Rationale: {question['user_rationale']}

Sample Response from the Previous Agent (not related to the question):
{sample_rich_answer}.

Don't generate any code. Just generate multiple creative but logical prompts we can use to make an AI for this
              """,
    config=GenerateContentConfig(
        system_instruction=[
            "You are a prompt engineer's assistant. Help the prompt engineer generate some prompts for his AI-powered SAT learning platform called Aquarc. The platform currently holds an SAT question bank with over 5000 questions and tracks which questions you get wrong per category. While this feature is helpful it lacks the intelligence necessary to be a full fledged SAT platform",
        ],
        temperature=0.7,
        top_k=7,
    ),
)

Markdown(response.text)

Okay, here are a few prompt variations designed to elicit structured thinking processes for error analysis, building upon the previous agent's output, and adhering to the specified JSON format:

**Prompt 1: Focused on Rationale Dissection**

```
You are an expert SAT tutor, helping students understand their mistakes. You will be provided with the following information: an SAT question, the official rationale for the correct answer, the student's answer and their rationale, and a comparison of arguments for both the student's and the correct answer (as generated by a previous AI agent).

Your task is to analyze this information and identify the distinct thought processes that could lead a student to either the correct answer or their chosen (incorrect) answer.

Specifically, break down each possible approach into a series of step-by-step logical deductions or inferences.  For each distinct thought process, indicate whether it leads to the user's answer or the correct answer. Pay close attention to how the user's rationale and the arguments presented by the previous agent reveal the student's line of reasoning. Prioritize thinking processes that directly address the user's misunderstanding as revealed in their rationale.

Output your analysis in the following JSON format:

```json
[
    {
        "thinking_process": "[Step-by-step breakdown of a thought process leading to one of the answers]",
        "leads_to": "user" | "correct"
    },
    {
        "thinking_process": "[Another distinct thought process, possibly involving a different interpretation or assumption]",
        "leads_to": "user" | "correct"
    },
    {
        "thinking_process": "[Include as many of these as possible!]",
        "leads_to": "user" | "correct"
    },
]
```

Here is the SAT question, rationale, user response and previous agent's output:

[SAT Question, Rationale, User Response, Previous Agent's Output]
```

**Prompt 2: Emphasizing Misconceptions and Alternative Approaches**

```
You are an AI-powered SAT error analysis engine. Your goal is to deconstruct the reasoning behind both correct and incorrect answers. You will be given the following: an SAT question, the official explanation, the user's response and reasoning, and a comparison of arguments for both answers.

Based on all available information, identify distinct lines of reasoning or problem-solving strategies that a student might employ when tackling the question. For each strategy, detail the specific steps involved, highlighting any potential misconceptions or flawed assumptions that might lead to an incorrect answer.  Conversely, clearly articulate the steps needed to arrive at the correct answer. Consider alternative ways to approach the problem.

Your output must be in the following JSON format:

```json
[
    {
        "thinking_process": "[A step-by-step breakdown of a potential solution path]",
        "leads_to": "user" | "correct"
    },
    {
        "thinking_process": "[Another distinct solution path, perhaps based on a different starting point or assumption]",
        "leads_to": "user" | "correct"
    },
    {
        "thinking_process": "[Include as many of these as possible, exhausting all possible approaches!]",
        "leads_to": "user" | "correct"
    },
]
```

Here is the SAT question, rationale, user response and previous agent's output:

[SAT Question, Rationale, User Response, Previous Agent's Output]
```

**Prompt 3: Focus on SAT specific error types**

```
You are an AI model designed to provide error analysis for SAT questions, focusing on common SAT-specific mistakes. You will be provided with an SAT question, the official rationale, the user's answer and rationale, and an argument comparison between the user's answer and the correct answer.

Analyze the question and responses to identify different thought processes that could lead to either the correct answer or the user's answer. Break down each thought process into a sequence of steps. Consider common SAT error patterns, such as:

*   Misunderstanding the question prompt.
*   Applying incorrect rules or formulas.
*   Making careless calculation errors.
*   Failing to consider all answer choices.
*   Misinterpreting the passage or data presented.

For each thought process, specify whether it leads to the user's answer or the correct answer.

Output your analysis in the following JSON format:

```json
[
    {
        "thinking_process": "[Detailed, step-by-step reasoning process, including potential error types]",
        "leads_to": "user" | "correct"
    },
    {
        "thinking_process": "[Another distinct reasoning process, highlighting a different potential error]",
        "leads_to": "user" | "correct"
    },
    {
        "thinking_process": "[Include as many of these as possible!]",
        "leads_to": "user" | "correct"
    },
]
```

Here is the SAT question, rationale, user response and previous agent's output:

[SAT Question, Rationale, User Response, Previous Agent's Output]
```

**Prompt 4: Using the User's Rationale as a Starting Point**

```
You are an AI-powered SAT tutor whose primary goal is to understand *why* a student made a particular mistake. You will be given an SAT question, the official explanation, the user's response and rationale, and an argument comparison between the user's answer and the correct answer.

Start with the user's rationale. Deconstruct the student's reasoning, identifying the initial assumptions, inferences, and logical steps that led them to their answer. Then, explore alternative reasoning paths, both those that would still lead to the user's answer (perhaps based on slightly different assumptions) and those that would lead to the correct answer. For each path, provide a step-by-step breakdown.

Your output must be in the following JSON format:

```json
[
    {
        "thinking_process": "[Step-by-step breakdown of the user's reasoning, starting from their rationale]",
        "leads_to": "user" | "correct"
    },
    {
        "thinking_process": "[An alternative reasoning path that still leads to the user's answer]",
        "leads_to": "user" | "correct"
    },
    {
        "thinking_process": "[A reasoning path that leads to the correct answer, highlighting the flaw in the user's reasoning]",
        "leads_to": "user" | "correct"
    },
    {
        "thinking_process": "[Include as many of these as possible!]",
        "leads_to": "user" | "correct"
    },
]
```

Here is the SAT question, rationale, user response and previous agent's output:

[SAT Question, Rationale, User Response, Previous Agent's Output]
```

Key improvements in these prompts:

*   **Clear Role Definition:**  Each prompt explicitly defines the AI's role (e.g., "expert SAT tutor," "error analysis engine").
*   **Emphasis on Step-by-Step Reasoning:**  The prompts repeatedly stress the need for breaking down thought processes into detailed steps.
*   **Focus on User Rationale:**  Prompts 1 and 4 specifically direct the AI to analyze and build upon the user's provided rationale, making the analysis more targeted.
*   **Addressing Misconceptions:** Prompt 2 explicitly asks the AI to identify potential misconceptions.
*   **SAT-Specific Error Types:** Prompt 3 introduces common SAT error patterns to guide the analysis.
*   **Exhaustive Approach:** The prompts encourage the AI to identify *as many* distinct thought processes as possible.
*   **Building on Previous Agent:** Prompts explicitly state that you are building on the previous agent's output and to take that output into consideration.

These prompts should provide a good starting point for generating diverse and insightful error analyses for your Aquarc platform. Remember to test and refine these prompts based on the actual output you receive. Good luck!


Here I've picked the 5 best and most creative prompts to evaluate:

In [22]:
prompts = [ 
    {
        "prompt": """
You are an expert SAT tutor analyzing a student's mistake on an SAT question. You are provided with:

1.  The SAT question, including the text, answer choices, and the correct answer.
2.  The official rationale for the correct answer and why the other choices are incorrect.
3.  The student's chosen answer and their rationale for choosing that answer.
4.  An argument contrasting the user's answer with the correct answer.

Based on this information, identify the potential thinking processes that could have led the student to their incorrect answer, as well as the thinking process for the correct answer. Focus on common SAT error patterns (e.g., misreading the question, applying incorrect grammar rules, making unwarranted assumptions, etc.).

Output a JSON array of objects. Each object represents a distinct thinking process and whether it leads to the user's answer or the correct answer. Include as many plausible thinking processes as possible, even if they seem obvious.

Format:
```json
[
    {
        "thinking_process": "[Here you will define a process step by step to get one of the answers]",
        "leads_to": "user" | "correct"
    },
    {
        "thinking_process": "[Here you will define another process step by step to get one of the answers]",
        "leads_to": "user" | "correct"
    },
    {
        "thinking_process": "[Include as many of these as possible!]",
        "leads_to": "user" | "correct"
    },
]
```
Leading to "user" indicates that it leads to the incorrect answer, the one the user picked. For each "correct" thinking process path, there should be one "user" path that leads to the user answer but refutes why the user is wrong.
Be detailed in your thinking process, but don't say more that is necessary. If you feel more sentences are needed for precision then add some. But some won't need that many and some will need few.
        """,
        "avg_score": 0.0,
    },
    {
        "prompt": """
You are an expert SAT tutor analyzing a student's mistake on an SAT question. You are provided with:

1. The SAT question, including the text, answer choices, and the correct answer.
2. The official rationale for the correct answer and why the other choices are incorrect.
3. The student's chosen answer and their rationale for choosing that answer.
4. An argument contrasting the user's answer with the correct answer.

Deconstruct the official rationale and the student's rationale. Identify the key assumptions, logical steps, and potential misunderstandings in each. Then, reconstruct different thinking processes – both correct and incorrect – that could lead to each answer choice. Be detailed and explicit in each step.

Output a JSON array of objects. Each object represents a distinct thinking process and whether it leads to the user's answer or the correct answer. Include as many plausible thinking processes as possible, even if they seem obvious.

Format:
```json
[
    {
        "thinking_process": "[Here you will define a process step by step to get one of the answers]",
        "leads_to": "user" | "correct"
    },
    {
        "thinking_process": "[Here you will define another process step by step to get one of the answers]",
        "leads_to": "user" | "correct"
    },
    {
        "thinking_process": "[Include as many of these as possible!]",
        "leads_to": "user" | "correct"
    },
]
```
Leading to "user" indicates that it leads to the incorrect answer, the one the user picked. For each "correct" thinking process path, there should be one "user" path that leads to the user answer but refutes why the user is wrong.
Be detailed in your thinking process, but don't say more that is necessary. If you feel more sentences are needed for precision then add some. But some won't need that many and some will need few.
        """,
        "avg_score": 0.0,
    },
    {
        "prompt" : """
You are an expert SAT tutor analyzing a student's mistake on an SAT question. You are provided with:

1. The SAT question, including the text, answer choices, and the correct answer.
2. The official rationale for the correct answer and why the other choices are incorrect.
3. The student's chosen answer and their rationale for choosing that answer.
4. An argument contrasting the user's answer with the correct answer.

Consider potential cognitive biases (e.g., confirmation bias, anchoring bias, availability heuristic) that might have influenced the student's decision-making process. Develop multiple step-by-step thinking processes that incorporate these biases, leading to both the student's answer and the correct answer.

Output a JSON array of objects. Each object represents a distinct thinking process and whether it leads to the user's answer or the correct answer. Include as many plausible thinking processes as possible, even if they seem obvious.

Format:
```json
[
    {
        "thinking_process": "[Here you will define a process step by step to get one of the answers]",
        "leads_to": "user" | "correct"
    },
    {
        "thinking_process": "[Here you will define another process step by step to get one of the answers]",
        "leads_to": "user" | "correct"
    },
    {
        "thinking_process": "[Include as many of these as possible!]",
        "leads_to": "user" | "correct"
    },
]
```
Leading to "user" indicates that it leads to the incorrect answer, the one the user picked. For each "correct" thinking process path, there should be one "user" path that leads to the user answer but refutes why the user is wrong.
Be detailed in your thinking process, but don't say more that is necessary. If you feel more sentences are needed for precision then add some. But some won't need that many and some will need few.
        """,
        "avg_score": 0.0,
    },
    {
        "prompt": """
You are an expert SAT tutor analyzing a student's mistake on an SAT question. You are provided with:

1. The SAT question, including the text, answer choices, and the correct answer.
2. The official rationale for the correct answer and why the other choices are incorrect.
3. The student's chosen answer and their rationale for choosing that answer.
4. An argument contrasting the user's answer with the correct answer.

Elaborate on multiple potential "chains of thought" a student might follow when approaching the question. Some chains should lead to the correct answer, while others should lead to the student's incorrect answer. Make each step in the chain explicit and easy to follow.

Output a JSON array of objects. Each object represents a distinct thinking process and whether it leads to the user's answer or the correct answer. Include as many plausible thinking processes as possible, even if they seem obvious.

Format:
```json
[
    {
        "thinking_process": "[Here you will define a process step by step to get one of the answers]",
        "leads_to": "user" | "correct"
    },
    {
        "thinking_process": "[Here you will define another process step by step to get one of the answers]",
        "leads_to": "user" | "correct"
    },
    {
        "thinking_process": "[Include as many of these as possible!]",
        "leads_to": "user" | "correct"
    },
]
```
Leading to "user" indicates that it leads to the incorrect answer, the one the user picked. For each "correct" thinking process path, there should be one "user" path that leads to the user answer but refutes why the user is wrong.
Be detailed in your thinking process, but don't say more that is necessary. If you feel more sentences are needed for precision then add some. But some won't need that many and some will need few.
        """,
        "avg_score": 0.0,
    },
    { 
        "prompt": """
You are an expert SAT tutor analyzing a student's mistake on an SAT question. You are provided with:

    The SAT question, including the text, answer choices, and the correct answer.
    The official rationale for the correct answer and why the other choices are incorrect.
    The student's chosen answer and their rationale for choosing that answer.
    An argument contrasting the user's answer with the correct answer.

Synthesize multiple approaches to analyze the student's error. Consider:

1. Common SAT error patterns.
2. Deconstruction of the official and student rationales.
3. Potential cognitive biases.
4. Detailed chains of thought.

Develop as many distinct, step-by-step thinking processes as possible that could lead to each answer choice (both the student's and the correct one). Be exhaustive in your analysis.

Output a JSON array of objects. Each object represents a distinct thinking process and whether it leads to the user's answer or the correct answer. Include as many plausible thinking processes as possible, even if they seem obvious.

Format:
```json
[
    {
        "thinking_process": "[Here you will define a process step by step to get one of the answers]",
        "leads_to": "user" | "correct"
    },
    {
        "thinking_process": "[Here you will define another process step by step to get one of the answers]",
        "leads_to": "user" | "correct"
    },
    {
        "thinking_process": "[Include as many of these as possible!]",
        "leads_to": "user" | "correct"
    },
]
```
Leading to "user" indicates that it leads to the incorrect answer, the one the user picked. For each "correct" thinking process path, there should be one "user" path that leads to the user answer but refutes why the user is wrong.
Be detailed in your thinking process, but don't say more that is necessary. If you feel more sentences are needed for precision then add some. But some won't need that many and some will need few.

        """,
        "avg_score": 0.0,
    },
]

## Evaluation of ToT Prompts

Evaluate the ToT prompts against each other:

In [23]:
# Define the evaluation prompt
EVAL_PROMPT = """
# Instruction
Evaluate the AI’s analysis of a student’s SAT error. Focus on how well it explains why the correct answer is textually supported and why the user’s answer is incorrect.
This analysis is intended to build upon a previous debate style analysis where the user is presented upon a pro/con argument for their answer. These prompts will be used in the agent for when the user wants more clarity and multiple thinking processes.

# Evaluation
## **Metric Definition**
Assess the **accuracy**, **completeness**, **groundedness**, and **clarity** of the response. The AI must:
1. Correctly identify textual evidence for the correct answer.
2. Explain flaws in the user’s answer (and their rationale, if provided).
3. Follow the prompt’s instructions precisely.
4. Really help the user understand why they were wrong by looking at how other people can mess up / how they potentially messed up.


## **Criteria**  
1. **Instructional Creativity**:  
   - Does the creative approach (e.g., debate, step-by-step analysis) **enhance understanding** of why the correct answer is textually supported?  
   - Does it **strategically use the prompt’s structure** (e.g., arguments for/against) to highlight key SAT skills like evidence analysis or assumption identification?  

2. **Educational Effectiveness**:  
   - Does the creativity **directly serve the learning goal** (e.g., clarifying misconceptions, modeling SAT logic), or is it merely ornamental?  
   - Does it **engage the learner** while maintaining rigor (e.g., making complex reasoning more accessible)?  

## **Revised Rating Rubric**  
- **5 (Excellent)**:  
  - Creative structure (e.g., debate) **directly reinforces** why the correct answer is superior.  
  - Uses the format to **explicitly contrast** the user’s error with textual evidence (e.g., “The strongest argument *for* the user’s answer is X, but the text contradicts this because Y”).  
  - Balances creativity with precision and clarity.  

- **4 (Good)**:  
  - Creative approach is **mostly effective** but slightly misses opportunities to deepen understanding (e.g., lists arguments but doesn’t explicitly tie them to SAT skills).  
  - Minor clarity issues in linking creativity to the text.  

- **3 (Adequate)**:  
  - Creativity **distracts** slightly from the core analysis (e.g., overemphasizes hypothetical arguments without grounding in the text).  
  - Fails to fully leverage the creative structure to address the user’s error.  

- **2 (Poor)**:  
  - Creative format **obscures key points** (e.g., hypothetical arguments misrepresent the text).  
  - Prioritizes style over substance; minimal educational value.  

- **1 (Very Poor)**:  
  - Creativity **undermines accuracy** (e.g., invents textual evidence to support arguments).  

## Note
This evaluation rubric is purposefully vague, so take that as an opportunity to be strict and make sure the AI is really being useful for users.
Be harsh with your feedback, but make sure your feedback is grounded in reality.

## Evaluation Steps
STEP 1: Assess the response in aspects of instruction following, groundedness, conciseness, and verbosity according to the criteria.
STEP 2: Score based on the rubric.

# User Inputs and AI-generated Response
## User Inputs

### Prompt
{prompt}

## AI-generated Response
{response}
"""


Now for the bulk of the evaluation cycle:

In [24]:
# TODO: maybe share context so the LLM can compare different outputs against its own rating to be more accurate perhaps?

NUM_ITERATIONS = 8
#NUM_ITERATIONS=1

with open("/kaggle/working/totpromptlogs.txt", "a") as logs:
    
    for prompt in prompts:
        sum = 0
        logs.write(f"Prompt: {prompt}\n")
    
        for question in questions:
            logs.write(f"Question: {question}\n")
            for i in range(NUM_ITERATIONS):
                response = client.models.generate_content(
                    model="gemini-2.0-flash-001",
                    contents=str(question),
                    config=GenerateContentConfig(
                        system_instruction=[prompt["prompt"],],
                        temperature=0.1,
                    ),
                )

                text_eval, struct_eval = eval_summary(prompt=prompt["prompt"], ai_response=response)
                print(f"Iteration {i}: {struct_eval}")
                logs.write(f"Iteration {i}: {response.text}\n\n {text_eval}\n")
                sum += int(struct_eval.value)

        prompt["avg_score"] = sum / (len(questions) * NUM_ITERATIONS)
        print(f"Prompt:\n{prompt['prompt']}\nScore: {prompt['avg_score']}\n\n")
    
        logs.write("\n\n")

Iteration 0: SummaryRating.VERY_GOOD
Iteration 1: SummaryRating.GOOD
Iteration 2: SummaryRating.GOOD
Iteration 3: SummaryRating.GOOD
Iteration 4: SummaryRating.VERY_GOOD
Iteration 5: SummaryRating.GOOD
Iteration 6: SummaryRating.VERY_GOOD
Iteration 7: SummaryRating.GOOD
Prompt:

You are an expert SAT tutor analyzing a student's mistake on an SAT question. You are provided with:

1.  The SAT question, including the text, answer choices, and the correct answer.
2.  The official rationale for the correct answer and why the other choices are incorrect.
3.  The student's chosen answer and their rationale for choosing that answer.
4.  An argument contrasting the user's answer with the correct answer.

Based on this information, identify the potential thinking processes that could have led the student to their incorrect answer, as well as the thinking process for the correct answer. Focus on common SAT error patterns (e.g., misreading the question, applying incorrect grammar rules, making unw

## Future Direction
Fine-tuning may be necessary once we have some data on what "good" and "bad" output looks like. Reinforcement Learning with Human Feedback (RLHF) may be necessary to complement the AI evaluator here.

# TODO
Synthesize the prompt:

In [25]:
from google.genai.types import GenerateContentConfig

response = client.models.generate_content(
    model="gemini-2.0-flash-001",
    # no specs yet
    contents=f"{question['question']} {question['rationale']}\n The user got: {question['user_answer']}\n {question['user_rationale']}",
    config=GenerateContentConfig(
        system_instruction=[
            "You are an SAT expert tutor. Analyze questions using the official SAT framework. ",
            "Help students by:\n",
            "1. Identifying question type and skills tested\n",
            "2. Explaining why answers are correct/incorrect\n",
            "3. Providing actionable improvement strategies\n",
            "Use formal but friendly language. Reference the SAT specs when possible.",
        ]
    ),
)


Markdown(response.text)

Great job breaking down the question! I can see why you chose option C, and I'm here to clarify the subtle difference that makes option A the stronger choice according to the SAT's standards.

Here's a review of the question and why the College Board says option A is the best answer:

* **Question Type:** Grammar and Usage
* **Skill Tested:** Misplaced Modifiers
* **Best Answer:** A

Here's why:

*   **The Problem:** The sentence begins with a modifying phrase: "Upon recovering two years later..." This phrase needs to be followed by the noun it describes. If not, it's a dangling modifier.
*   **Why A is Correct:** "Henry resumed his reign" correctly places "Henry" immediately after the modifying phrase, so it is clear that *Henry* is the one recovering.
*   **Why you chose C:** "Henry's reign resumed" is grammatically correct on its own, but placing "Henry's reign" directly after the modifying phrase creates a dangling modifier. It illogically implies that the *reign* was the thing recovering. Although Henry is certainly the subject of the passage, the *reign* is the subject of the second half of the sentence.

**Actionable Strategy:**

*   **Focus on the closest noun:** When you see an introductory modifying phrase, ask yourself, "What is *doing* the action described in this phrase?" The answer should be the noun that comes immediately after the phrase.
*   **Read carefully for meaning:** Sometimes, the correct answer sounds a little less natural. The SAT prioritizes grammatical precision over conversational ease.

I hope this explanation helps clarify why option A is the best answer! Keep practicing, and you'll master these subtle grammar points.


Perhaps the model will be more effective with FAISS vector database searches for the PDF.

# Parse PDF
The following code uses lossy conversion to turn the PDF into readable text.

Information like tables and images will be lost in the process, as demonstrated by the following snippet.

In [26]:
from PyPDF2 import PdfReader
from io import BytesIO
import requests

url = "https://www.w3.org/WAI/WCAG20/Techniques/working-examples/PDF20/table.pdf"
response = requests.get(url)
pdf_bytes = BytesIO(response.content)

text = ""
pdf_reader = PdfReader(pdf_bytes)
for page in pdf_reader.pages:
    text += page.extract_text()

print(text[0:1000])

Example table  
This is an example of a data table. 
Disability 
Category Participants  Ballots 
Completed  Ballots 
Incomplete/  
Terminated  Results  
Accuracy  Time to 
complete 
Blind  5 1 4 34.5%, n=1  1199 sec, n=1  
Low Vision  5 2 3 98.3% n=2  
(97.7%, n=3)  1716 sec, n=3  
(1934 sec, n=2)  
Dexterity  5 4 1 98.3%, n=4  1672.1 sec, n=4  
Mobility  3 3 0 95.4%, n=3  1416 sec, n=3  
 


In [27]:
# Download the PDF using requests
url = "https://satsuite.collegeboard.org/media/pdf/assessment-framework-for-digital-sat-suite.pdf"

response = requests.get(url)
pdf_bytes = BytesIO(response.content)

print(type(pdf_bytes))

specs_text = ""
pdf_reader = PdfReader(pdf_bytes)
for page in pdf_reader.pages:
    specs_text += page.extract_text()

print(specs_text[0:1000])

<class '_io.BytesIO'>
Assessment Framework 
for the Digital SAT® SuiteAssessment Framework 
for the Digital SAT® Suite
Version 3.01, August 2024
About College Board
College Board reaches more than 7 million students a year, helping them 
navigate the path from high school to college and career. Our not-for-
profit membership organization was founded more than 120 years ago. 
We pioneered programs like the SAT® and AP® to expand opportunities 
for students and help them develop the skills they need. Our BigFuture® 
program helps students plan for college, pay for college, and explore 
careers. Learn more at cb.org .
Suggested Citation:  College Board. 2024. Assessment Framework for the 
Digital SAT Suite , version 3.01 (August 2024). New Y ork: College Board.
© 2024 College Board. College Board, Advanced Placement, AP , BigFuture, Landscape, Pre-AP , SAT, and 
the acorn logo are registered trademarks of College Board. AP Potential, Bluebook, Connections, PSAT, 
Skills Insight, Student S

This text data might be too big for the model to contain within the prompt.

In [28]:
client.models.count_tokens(
    model=model, contents=specs_text
).total_tokens

153273

That's a pretty significant size! The only way to incorporate the PDF properly is to chunk it.

# Chunk the Parsed PDF
In order to use the PDF, we need to "chunk" it so bits of relevant information can be accessed at a time. In order to maximize efficiency, a vector search database will be used, as the likelihood that any key words in the question will appear on the SAT is effectively zero. 

In [29]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=1024,
    chunk_overlap=102,
    length_function=len,
    is_separator_regex=False,
)

chunked_specs_text = text_splitter.split_text(specs_text)

Check out the chunks:

In [30]:
print(len(chunked_specs_text))
print(chunked_specs_text[0])

702
Assessment Framework 
for the Digital SAT® SuiteAssessment Framework 
for the Digital SAT® Suite
Version 3.01, August 2024
About College Board
College Board reaches more than 7 million students a year, helping them 
navigate the path from high school to college and career. Our not-for-
profit membership organization was founded more than 120 years ago. 
We pioneered programs like the SAT® and AP® to expand opportunities 
for students and help them develop the skills they need. Our BigFuture® 
program helps students plan for college, pay for college, and explore 
careers. Learn more at cb.org .
Suggested Citation:  College Board. 2024. Assessment Framework for the 
Digital SAT Suite , version 3.01 (August 2024). New Y ork: College Board.
© 2024 College Board. College Board, Advanced Placement, AP , BigFuture, Landscape, Pre-AP , SAT, and 
the acorn logo are registered trademarks of College Board. AP Potential, Bluebook, Connections, PSAT,


# Initialize ReAct agent
The model will instead be a ReAct agent and figure out what to search up and call that tool as an extension and we will see what happens

We will start by finding a gemini model for embedding.

In [31]:
for m in client.models.list():
    if "embedContent" in m.supported_actions:
        print(m.name)

models/embedding-001
models/text-embedding-004
models/gemini-embedding-exp-03-07
models/gemini-embedding-exp


Set up ChromaDB. We will use `text-embedding-004` to encode the "document" chunks into vectors.

In [32]:
from chromadb import Documents, EmbeddingFunction, Embeddings
from google.api_core import retry

from google.genai import types


class GeminiEmbeddingFunction(EmbeddingFunction):
    # Specify whether to generate embeddings for documents, or queries
    document_mode = True

    @retry.Retry(predicate=is_retriable)
    def __call__(self, input: Documents) -> Embeddings:
        if self.document_mode:
            embedding_task = "retrieval_document"
        else:
            embedding_task = "retrieval_query"

        response = client.models.embed_content(
            model="models/text-embedding-004",
            contents=input,
            config=types.EmbedContentConfig(
                task_type=embedding_task,
            ),
        )
        return [e.values for e in response.embeddings]

Now let's create a Chroma database client and add the document embeddings

In [33]:
import chromadb

SPECS_DB_NAME = "specs"

embed_fn = GeminiEmbeddingFunction()
embed_fn.document_mode = True

chroma_client = chromadb.Client()
#print(chroma_client.list_collections())
db = chroma_client.get_or_create_collection(name=SPECS_DB_NAME, embedding_function=embed_fn)
for i in range(int(len(chunked_specs_text) / 100)):
    db.add(documents=chunked_specs_text[i:100+i], ids=[str(i + j) for j in range(100)])

Check out the database

In [34]:
db.count()
db.peek(1)

{'ids': ['0'],
 'embeddings': array([[ 5.80166355e-02,  4.36070710e-02, -5.12756817e-02,
         -1.44314906e-03,  2.73717772e-02,  4.12952006e-02,
          5.71677238e-02,  7.64632318e-03,  9.56675038e-03,
         -5.80519577e-03, -4.39782254e-02,  9.93977953e-03,
          3.03114597e-02,  3.39544937e-02, -7.10375309e-02,
          1.01664122e-02,  2.03551948e-02,  5.44842966e-02,
         -1.12781808e-01, -2.91494676e-03, -2.88882591e-02,
         -3.38051245e-02, -1.49710365e-02, -1.39160426e-02,
         -1.64779034e-02,  7.98442122e-03,  7.00960010e-02,
          1.01699783e-02,  1.97551977e-02, -3.44269574e-02,
          2.98777758e-03,  9.17043630e-03, -2.51114983e-02,
         -8.72381590e-03,  7.50523322e-05,  3.42415981e-02,
         -2.42317207e-02, -3.20444927e-02,  7.41561651e-02,
         -4.53105606e-02, -4.95616272e-02, -4.72334400e-03,
         -1.52662601e-02, -1.15578976e-02, -2.57304776e-02,
          5.48221776e-03,  6.22319020e-02,  5.05198240e-02,
         -1

Let's look for *Information and Ideas* and see what relevant information the vector database finds on it.

In [35]:
# Switch to query mode when generating embeddings.
embed_fn.document_mode = False

# Search the Chroma DB using the specified query.
query = "What is the weighting of Information and Ideas?"

result = db.query(query_texts=[query], n_results=1)
[all_passages] = result["documents"]


Markdown(all_passages[0])

4.1.8. Word Count  ..................................................................................................................... 79
4.1.9. Informational Graphics  ............................................................................................... 79
4.1.10. T ext Complexity  .......................................................................................................... 79
4.2 Definitions  ..................................................................................................................................... 79
4.2.1. Construct  ......................................................................................................................... 79
4.2.2. Claims  ................................................................................................................................ 80
4.3 Content Domain Structure  ...................................................................................................... 81

Now let's look for something more generic, like scientific vocabulary or weighting of passages.

In [36]:
# Switch to query mode when generating embeddings.
embed_fn.document_mode = False

# Search the Chroma DB using the specified query.
query = "What scientific stuff are we tested on?"

result = db.query(query_texts=[query], n_results=2)
[all_passages] = result["documents"]

Markdown(all_passages[0])

standardized testing as well as many families, educators, and policymakers have 
raised concerns about the extent to which U.S. students are tested as part of 
K–12 education. Polling has suggested that the public’s doubts about the value of 
standardized testing in schools have grown over time, and the necessary relaxation 
of federal testing requirements under the successor Every Student Succeeds Act 
during the 2019–2020 and, to a lesser extent, 2020–2021 pandemic years has 
further contributed to those doubts. (See Bruno and Goldhaber 2021 for a brief 
recent overview.)A Living 
Document
This release of the Assessment 
Framework for the Digital SAT 
Suite  includes authoritative, 
up-to-date information about 
the digital suite. As College 
Board continues to research and 
implement the tests, updates 
will be made to this document 
(and disseminated through other 
means, such as our website, 
sat.org/digital ) to ensure that 
readers have as complete and 
accurate a picture as possible.

The information captured by the embedding model is not relevant to the question. Perhaps more results solve the problem?

In [37]:
result = db.query(query_texts=[query], n_results=2)
[all_passages] = result["documents"]

[print(passage) for passage in all_passages]

standardized testing as well as many families, educators, and policymakers have 
raised concerns about the extent to which U.S. students are tested as part of 
K–12 education. Polling has suggested that the public’s doubts about the value of 
standardized testing in schools have grown over time, and the necessary relaxation 
of federal testing requirements under the successor Every Student Succeeds Act 
during the 2019–2020 and, to a lesser extent, 2020–2021 pandemic years has 
further contributed to those doubts. (See Bruno and Goldhaber 2021 for a brief 
recent overview.)A Living 
Document
This release of the Assessment 
Framework for the Digital SAT 
Suite  includes authoritative, 
up-to-date information about 
the digital suite. As College 
Board continues to research and 
implement the tests, updates 
will be made to this document 
(and disseminated through other 
means, such as our website, 
sat.org/digital ) to ensure that 
readers have as complete and 
accurate a picture as pos

[None, None]

There are unnecessary details in the original PDF. "Garbage in, garbage out" 
The solution is perhaps to distill the relevant information from the PDF into a text document.

The below code distills the PDF.

In [38]:
# TODO

do some mathplotlib thingies