In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Some parts of this Codelab are (c) Google 2025 under the Apache License.
(c) Aquarc 2025

# Synopsis
Aquarc is an all-in-one SAT platform for high schoolers designed to minimize time spent using the software and maximizing practice and essential questions. In order to further this mission, Aquarc Intelligence was created to analyze mistakes within a question and to suggest similar questions for efficient practicing.

# Install the SDK 
We will be using Google's Gemini and utilities to build the model.

In [2]:
!pip uninstall -qqy jupyterlab  # Remove unused packages from Kaggle's base image that conflict
!pip install -U -q "google-genai==1.7.0" langchain PyPDF2 "chromadb==0.6.3" html-to-markdown

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m144.7/144.7 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m611.1/611.1 kB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m62.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m34.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.9/100.9 kB[0m [31m5.1 MB/s[0m

Import the SDK and set up the API key

In [3]:
from google import genai
from google.genai import types

from IPython.display import HTML, Markdown, display

Set up a retry helper so we can press "Run All" and not worry about hitting the quota. 

In [4]:
from google.api_core import retry


is_retriable = lambda e: (isinstance(e, genai.errors.APIError) and e.code in {429, 503})

genai.models.Models.generate_content = retry.Retry(
    predicate=is_retriable)(genai.models.Models.generate_content)

### Set up your API key

To run the following cell, your API key must be stored it in a [Kaggle secret](https://www.kaggle.com/discussions/product-feedback/114053) named `GOOGLE_API_KEY`.

If you don't already have an API key, you can grab one from [AI Studio](https://aistudio.google.com/app/apikey). You can find [detailed instructions in the docs](https://ai.google.dev/gemini-api/docs/api-key).

To make the key available through Kaggle secrets, choose `Secrets` from the `Add-ons` menu and follow the instructions to add your key or enable it for this notebook.

In [5]:
from kaggle_secrets import UserSecretsClient

GOOGLE_API_KEY = UserSecretsClient().get_secret("GOOGLE_API_KEY")

### Choose your model
Depending on what's available and your quota, choose a model that's effective for your purposes.

In [6]:
client = genai.Client(api_key=GOOGLE_API_KEY)

for model in client.models.list():
  print(model.name)

models/chat-bison-001
models/text-bison-001
models/embedding-gecko-001
models/gemini-1.0-pro-vision-latest
models/gemini-pro-vision
models/gemini-1.5-pro-latest
models/gemini-1.5-pro-001
models/gemini-1.5-pro-002
models/gemini-1.5-pro
models/gemini-1.5-flash-latest
models/gemini-1.5-flash-001
models/gemini-1.5-flash-001-tuning
models/gemini-1.5-flash
models/gemini-1.5-flash-002
models/gemini-1.5-flash-8b
models/gemini-1.5-flash-8b-001
models/gemini-1.5-flash-8b-latest
models/gemini-1.5-flash-8b-exp-0827
models/gemini-1.5-flash-8b-exp-0924
models/gemini-2.5-pro-exp-03-25
models/gemini-2.5-pro-preview-03-25
models/gemini-2.0-flash-exp
models/gemini-2.0-flash
models/gemini-2.0-flash-001
models/gemini-2.0-flash-exp-image-generation
models/gemini-2.0-flash-lite-001
models/gemini-2.0-flash-lite
models/gemini-2.0-flash-lite-preview-02-05
models/gemini-2.0-flash-lite-preview
models/gemini-2.0-pro-exp
models/gemini-2.0-pro-exp-02-05
models/gemini-exp-1206
models/gemini-2.0-flash-thinking-exp-01

Check out the detailed information for your model

In [7]:
for model in client.models.list():
  if model.name == 'models/gemini-2.0-flash':
    print(model.to_json_dict())
    break

# change this line if you want to use a different model
model = "gemini-2.0-flash"

{'name': 'models/gemini-2.0-flash', 'display_name': 'Gemini 2.0 Flash', 'description': 'Gemini 2.0 Flash', 'version': '2.0', 'tuned_model_info': {}, 'input_token_limit': 1048576, 'output_token_limit': 8192, 'supported_actions': ['generateContent', 'countTokens']}


Test your model

In [8]:
chat = client.chats.create(model=model, history=[])
response = chat.send_message('Hello! My name is Zlork.')
print(response.text)

Nice to meet you, Zlork! It's a pleasure to make your acquaintance. Is there anything I can help you with today?



You can use the `Markdown()` function to format it nicely in Kaggle.

In [9]:
response = chat.send_message('Hello! My name is Zlork.' + 
                             'Use some fancy markdown in your message')
Markdown(response.text)

Greetings, **Zlork**!

It is with great pleasure that I extend a *warm* and *cordial* welcome. I hope this day finds you well.  I am at your service.  Please, do not hesitate to avail yourself of my capabilities.

Perhaps you'd like me to:

*   Suggest a **literary masterpiece**?
*   Compose a **limerick** with a most *absurd* subject?
*   Help you explore the depths of **quantum physics**?

The possibilities are, as they say, *endless!*


# A fine prompt
**TODO: you should probably move this**

Let's start by writing our prompt. What should it include and what should it mention? Following the principles taught throughout the course, let's place an importance on giving **positive** instructions rather than **negative** instructions to maintain effectiveness. 

The model needs to do the following:
- [ ] Stick to the SAT: Understand weighting and importance of certain questions, categories, and ensure all advice is applicable to the bounds of the SAT. A document with the specifications of the SAT format can be used for Retrieval Augmented Generation (RAG) rather than potential hallucination over the exact requirements.
- [ ] Have access to the current question (and maybe the previous questions for even more context)
- [ ] Have access to the answers and time between each to estimate confidence.
- [x] Understand images or SVGs for Inference and other Reading/Writing questions.
- [ ] Use Tree of Thoughts (ToT) to generate multiple solving processes because multiple methods may be used to arrive at the same answer, and output these answering mechanisms in a JSON array to present on the website effectively (a TUI for this notebook)
- [ ] Find semantically related questions and present them to the user on the website using ReAct (a TUI for this notebook)

Let's start by finding an effective prompt. 
Then we can evaluate the effectiveness of including the SAT standards in a PDF to help the user answering a question. We will also evaluate whether a vector search database is useful for this document.

Below, the variables for the specific question we are testing are defined.

In [10]:
# All questions are (c) CollegeBoard 2025.

question = {
        "question" : """
In a paper about p-i-n planar perovskite solar cells (one of several perovskite cell architectures designed to collect and store solar power), Lyndsey McMillon-Brown et al. described a method for fabricating the cell’s electronic transport layer (ETL) using a spray coating. Conventional ETL fabrication is accomplished using a solution of nanoparticles. The process can result in a loss of up to 80% of the solution, increasing the cost of manufacturing at scale—an issue that may be obviated by spray coating fabrication, which the researchers describe as “highly reproducible, concise, and practical.”

What does the text most strongly suggest about conventional ETL fabrication?
A. It is less suitable for manufacturing large volumes of planar p-i-n perovskite solar cells than an alternative fabrication method may be.
B. It is more expensive when manufacturing at scale than are processes for fabricating ETLs used in other perovskite solar cell architectures.
C. It typically entails a greater loss of nanoparticle solution than do other established approaches for ETL fabrication.
D. It is somewhat imprecise and therefore limits the potential effectiveness of p-i-n planar perovskite solar cells at capturing and storing solar power.
""",
        "rationale" : """
Choice A is the best answer. Conventional solar cell fabrication increases “the cost of manufacturing at scale,” but spray coating might get rid of that problem.

Choice B is incorrect. This is not completely supported by the text. While it’s true that conventional ETL fabrication is expensive at scale, there’s nothing in the text that mentions other perovskite solar cell architectures. Choice C is incorrect. This choice does not match the text. Only one conventional method of ETL fabrication is described, so we can’t compare the solution loss in this method to that of other conventional methods. Choice D is incorrect. This choice isn’t supported by the text. The text never suggests that the effectiveness of solar cells changes based on their method of fabrication. 
""",
        "user_answer" : "C",
}

Ideally, the user will talk to the chatbot after it gets the question wrong (or before in some scenarios, too). Either way, the user will have some rationale as to his or her answer to the question. Don't expect this rationale to be well thought out - the objective of the intelligent agent is to draw out what they actually mean. It may be completely omitted as well.

In [11]:
question["user_rationale"] = "Isn't the new method of ETL fabrication the same as the 'established methods'"

# Use Reinforcement Learning with AI Feedback to Finetune a Prompt
Let's generate a couple prompts for Aquarc Intelligence to use, given these parameters. We will then evaluate the effectiveness of these prompts against each of the questions and the user's query.

In [12]:
from google.genai.types import GenerateContentConfig

response = client.models.generate_content(
    model="gemini-2.0-flash-001",
    # no specs yet
    contents=f"Generate prompts for a model that will do nothing more but take information about the current question, rationale, user answer and their rationale (if there is one) for the SAT. Come up with different variations for the prompt like more or less concise or multiple thought processes or just one, etc. Here is an example question to illustrate my point (although math and english still exist)\
                  {question['question']}\n \
              {question['rationale']}\n\
              The user got: {question['user_answer']}\n \
              {question['user_rationale']}",
    config=GenerateContentConfig(
        system_instruction=[
            "You are a prompt engineer's assistant. Help the prompt engineer generate some prompts for his AI-powered SAT learning platform called Aquarc. The platform currently holds an SAT question bank with over 5000 questions and tracks which questions you get wrong per category. While this feature is helpful it lacks the intelligence necessary to be a full fledged SAT platform",
        ],
        temperature=2.0,
        top_k=10,
    ),
)

Markdown(response.text)

Okay, here are several prompt variations designed to work with your Aquarc SAT platform, focusing on analyzing user answers and providing feedback based on the question, correct answer, user's answer, and associated rationales.

**Key Considerations for all Prompts:**

*   **Consistency:** You'll need to format the input data consistently across all questions.
*   **Clarity:** The more specific you are in the prompt, the better the model can perform.
*   **Temperature:** Start with a low temperature (e.g., 0.2) to ensure consistency, and increase if you want more creative or varied responses.
*   **Token Limit:** Be mindful of the token limits of your chosen LLM. Trim unnecessary text to stay within those limits.

Here's a question JSON representation which may make prompting simpler. Assume question categories are `reading`, `writing`, and `math`. Each time the user misses a question, make sure the 'incorrect questions list' is updated (easy enough to just put questions in the prompt once user answers).

```json
{
  "question": "In a paper about p-i-n planar perovskite solar cells...",
  "correct_answer": "A",
  "correct_answer_rationale": "Conventional solar cell fabrication increases...",
  "user_answer": "C",
  "user_answer_rationale": "Isn't the new method of ETL fabrication the same...",
  "choices": {
    "A": "It is less suitable for manufacturing large...",
    "B": "It is more expensive when manufacturing at scale...",
    "C": "It typically entails a greater loss of nanoparticle solution...",
    "D": "It is somewhat imprecise and therefore limits the potential..."
  },
  "category": "reading",
   "incorrect_questions": []
}
```

Now the prompts!

**Prompt Variations:**

**1. Concise, Single Thought Process:**

```prompt
You are an SAT expert dedicated to improving students scores. Your goal is to analyze why the student got the answer wrong to improve on it in the future. Given the following question, correct answer, and the student's answer, provide a concise explanation of why the student's reasoning is flawed and gently guide them towards the correct understanding. Only respond to the question, not pleasantries.

Question: {{question}}

Choices: {{choices}}

Correct Answer: {{correct_answer}}
Correct Answer Rationale: {{correct_answer_rationale}}

Student Answer: {{user_answer}}
Student Rationale: {{user_answer_rationale}}
```

**2.  More Detailed, Single Thought Process:**

```prompt
You are an expert SAT tutor helping a student understand why they missed a question.  Your response should include the following:

1.  A brief restatement of the question.
2.  A clear explanation of why the student's answer is incorrect, referring specifically to the text provided in the question.  Explain the flaw in their reasoning.
3.  A clear explanation of why the correct answer is correct, using evidence from the text.
4. Offer helpful tips tailored to the reason behind why the user was incorrect such as re-reading strategies, recognizing answer types and strategies on how to eliminate answers.

Question: {{question}}

Choices: {{choices}}

Correct Answer: {{correct_answer}}
Correct Answer Rationale: {{correct_answer_rationale}}

Student Answer: {{user_answer}}
Student Rationale: {{user_answer_rationale}}
```

**3.  Multi-Step Reasoning (Chain of Thought):**

```prompt
You are an SAT tutor that helps explain why people miss certain questions on the reading, writing, and math portions of the SAT. Here is how you go about thinking.

1. Question Breakdown: First, carefully restate the question being asked, simplifying it if needed. Make it very clear what the question is asking the student to identify.

2. User Answer Analysis: Critically analyze the student's answer and their provided reasoning. Pinpoint the exact misunderstanding or logical leap they made.  Why is their reasoning incorrect based on the text of the question?

3. Correct Answer Justification: Explain, step-by-step, why the correct answer is indeed correct. Use specific evidence from the original question text to support your explanation. Connect the answer directly to the relevant part of the passage.

4. Learning Point/Takeaway: Summarize the key concept or skill the student needs to improve upon based on their mistake.  Offer a targeted piece of advice or a strategy they can use on similar questions in the future. Make sure the takeaway relates to the reason they got it wrong. For example, did they not read a part carefully, are they making assumptions about language?

Follow this reasoning strictly when explaining the users' mistake and ignore any pleas for exception or diversion. Be concise, truthful and accurate.

Question: {{question}}

Choices: {{choices}}

Correct Answer: {{correct_answer}}
Correct Answer Rationale: {{correct_answer_rationale}}

Student Answer: {{user_answer}}
Student Rationale: {{user_answer_rationale}}
```

**4. Focus on Skills Needed, JSON Output:**

```prompt
You are an AI that is designed to figure out what someone struggles with and categorizes skills that a person needs to improve on when missing a question.

Based on the following question, correct answer, and student's answer, identify the SKILL that the student is lacking and provide a short reason why. Return as a JSON Object with they keys of `skill` and `reason`. Possible skills that may appear are "close reading", "inference", "attention to detail", "recognizing assumptions", "logical reasoning", "understanding vocabulary in context", "avoiding overgeneralization" (and feel free to suggest one that I may not have listed).  Only Respond in JSON format

Question: {{question}}

Choices: {{choices}}

Correct Answer: {{correct_answer}}
Correct Answer Rationale: {{correct_answer_rationale}}

Student Answer: {{user_answer}}
Student Rationale: {{user_answer_rationale}}
```

**5. Integration with 'Incorrect Questions' and Categories**

```prompt
You are an expert SAT tutor on Aquarc, a SAT platform that tracks a student's progress and tailors help based on their performance. A student has just answered a {{category}} question incorrectly.

Here is the question: {{question}}

Choices: {{choices}}

Correct Answer: {{correct_answer}}
Correct Answer Rationale: {{correct_answer_rationale}}

Student Answer: {{user_answer}}
Student Rationale: {{user_answer_rationale}}

Incorrect Questions: {{incorrect_questions}} // array of prev incorrect questions

Based on this question and their history of incorrect questions, what is the primary area the student needs to improve on (e.g. inference, attention to detail, applying formulas etc.).  Suggest ONE specific exercise or strategy they can use to improve in this area. If their list of incorrect questions in that {{category}} is >3, suggest retrying questions of a similar category. Give no additional fluff to this prompt besides this

```

**Important Notes:**

*   **Iterative Refinement:** Test these prompts thoroughly and refine them based on the actual output.  Pay attention to the quality of the explanations, the accuracy of the skill identification, and the helpfulness of the advice.
*   **Few-Shot Learning:**  Consider adding a few example questions *and expected outputs* to the prompt to further guide the model.  This can significantly improve performance, especially for the skill identification prompts.
*   **Error Handling:** Build in error handling for cases where the model fails to produce a coherent response or doesn't follow the requested format.

By experimenting with these prompt variations and continually refining them based on the model's output, you can create a powerful and effective learning experience for your Aquarc users!


Here are some sample prompts extracted from the Gemini output:

In [13]:
prompts = [ 
    {
        "prompt": "Analyze the user's error. Why is the correct answer better supported by the text than the user's answer? Be concise.",
        "avg_score": 0.0,
    },
    {
        "prompt": """
Consider the SAT question and the user's selected answer.
1. Identify the specific textual evidence that strongly supports the correct answer choice.
2. Identify any assumptions the user might be making that lead to their chosen answer.
3. Explain why the textual support for the correct answer is stronger or more direct than any implied support for the user's answer.  If the user provides their own rational, specifically address the rational and mention what part of the question makes the rational wrong.
        """,
        "avg_score": 0.0,
    },
    {
        "prompt" : """
Let's analyze this question step-by-step to understand the user's error.

1.  Summarize the main point of the passage in your own words.
2.  Identify the key phrase(s) in the question that guide you to the correct answer.
3.  Explain why the correct answer directly addresses the question based on the text.
4.  Explain what specific words, or phrases, may make the users' answer incorrect.
5. Given the user's answer, what misunderstanding might the student have in this section? What advice could you give to them in the future?
        """,
        "avg_score": 0.0,
    },
    {
        "prompt": """
Evaluate both the correct answer and the user's answer as potential responses to the question.

*   Present the strongest possible argument *in favor* of the user's answer.
*   Present the strongest possible argument *against* the user's answer.
*   Explain why, ultimately, the correct answer is the superior choice based on textual evidence.
        """,
        "avg_score": 0.0,
    },
    { 
        "prompt": """
Analyze the question, correct answer, and the user's answer.

Step 1: Summarize the core argument or concept being tested in the question.
Step 2: Identify the specific details in the question and correct answer rationale that are most crucial for arriving at the correct answer.
Step 3: Analyze the user's answer choice.  Explain why it is incorrect. If the user provided a rationale, identify where the user's reasoning is flawed, citing specific evidence from the question text or correct answer rationale.  If no rationale was provided, hypothesize potential reasons for the incorrect choice based on common SAT misconceptions or test-taking errors related to this question type.
Step 4: Explicitly state the mistake that the user made to reach their conclusion and how they can reach the proper conclusion by fixing the flaw.
        """,
        "avg_score": 0.0,
    },
]

Some sample questions varied in difficulty:

In [14]:
# All questions are (c) CollegeBoard 2025.
questions = [
    {
        "question": """
In 1453, English King Henry VI became unfit to rule after falling gravely ill. As a result, Parliament appointed Richard, Third Duke of York, who had a strong claim to the English throne, to rule as Lord Protector. Upon recovering two years later, ______ forcing an angered Richard from the royal court and precipitating a series of battles later known as the Wars of the Roses. \n\
Which choice completes the text so that it conforms to the conventions of Standard English?
A. Henry resumed his reign,
B. the reign of Henry resumed,
C. Henry’s reign resumed,
D. it was Henry who resumed his reign, 
            """,
        # Notice that the default rationale provided for Choice C doesn't explain it well enough
        "rationale": """
Choice A is the best answer. The convention being tested is subject-modifier placement. This choice ensures that the introductory phrase “upon recovering two years later” appears immediately before the noun it modifies (“Henry”), clearly establishing that Henry recovered two years later. 
Choice B is incorrect because it results in a dangling modifier. The placement of the noun phrase “the reign of Henry” immediately after the introductory phrase illogically suggests that the reign of Henry recovered two years later. 
Choice C is incorrect because it results in a dangling modifier. The placement of the noun phrase “Henry’s reign” immediately after the introductory phrase illogically suggests that Henry’s reign recovered two years later. 
Choice D is incorrect because it results in a dangling modifier. The placement of the function word “it” immediately after the introductory phrase illogically suggests that “it” recovered two years later. 
        """,
        "user_answer": "C",
        "user_rationale": "Isn't the subject that Henry got unfit to rule?",
    },
    # Feel free to uncomment the following

#    {
#        "question": """
#A study by a team including finance professor Madhu Veeraraghavan suggests that exposure to sunshine during the workday can lead to overly optimistic behavior. __Using data spanning from 1994 to 2010 for a set of US companies, the team compared over 29,000 annual earnings forecasts to the actual earnings later reported by those companies.__ The team found that the greater the exposure to sunshine at work in the two weeks before a manager submitted an earnings forecast, the more the manager’s forecast exceeded what the company actually earned that year.
#Which choice best states the function of the underlined sentence in the overall structure of the text? 
            
#A. To summarize the results of the team’s analysis
#B. To present a specific example that illustrates the study’s findings
#C. To explain part of the methodology used in the team’s study
#D. To call out a challenge the team faced in conducting its analysis
#            """,
#        "rationale": """
#Choice C is the best answer because it best describes how the underlined sentence functions in the text as a whole. The first sentence presents the implications of Veeraraghavan’s team’s study: sunshine exposure during work hours can cause overly optimistic behavior. The underlined sentence then describes the data the team consulted and how they were used (comparing predictions about earnings to what the companies actually earned), and the final sentence presents what the team found in their examination of the data. Thus, the underlined sentence mainly functions to explain part of the methodology used in the team’s study. 
#Choice A is incorrect because the underlined sentence explains in part how the team conducted their analysis of the effect of sunshine but doesn’t address what the team found; a broad summary is instead given in the other two sentences. 
#Choice B is incorrect because the underlined sentence doesn’t present any specific examples from the team’s comparisons of 29,000 earnings predictions to actual earnings; it simply explains in part how the team conducted their analysis. 
#Choice D is incorrect because the underlined sentence simply explains in part how the team conducted their analysis; the text never mentions any challenges that the team encountered in their study. 
#            """,
#        "user_answer": "A",
#        "user_rationale": "Is it not explaining the part of the experiment which illustrates what happens?"
#    },
#    {
#        "question" : """
#In a paper about p-i-n planar perovskite solar cells (one of several perovskite cell architectures designed to collect and store solar power), Lyndsey McMillon-Brown et al. described a method for fabricating the cell’s electronic transport layer (ETL) using a spray coating. Conventional ETL fabrication is accomplished using a solution of nanoparticles. The process can result in a loss of up to 80% of the solution, increasing the cost of manufacturing at scale—an issue that may be obviated by spray coating fabrication, which the researchers describe as “highly reproducible, concise, and practical.”

#What does the text most strongly suggest about conventional ETL fabrication?
#A. It is less suitable for manufacturing large volumes of planar p-i-n perovskite solar cells than an alternative fabrication method may be.
#B. It is more expensive when manufacturing at scale than are processes for fabricating ETLs used in other perovskite solar cell architectures.
#C. It typically entails a greater loss of nanoparticle solution than do other established approaches for ETL fabrication.
#D. It is somewhat imprecise and therefore limits the potential effectiveness of p-i-n planar perovskite solar cells at capturing and storing solar power.
#""",
#        "rationale" : """
#Choice A is the best answer. Conventional solar cell fabrication increases “the cost of manufacturing at scale,” but spray coating might get rid of that problem.

#Choice B is incorrect. This is not completely supported by the text. While it’s true that conventional ETL fabrication is expensive at scale, there’s nothing in the text that mentions other perovskite solar cell architectures. Choice C is incorrect. This choice does not match the text. Only one conventional method of ETL fabrication is described, so we can’t compare the solution loss in this method to that of other conventional methods. Choice D is incorrect. This choice isn’t supported by the text. The text never suggests that the effectiveness of solar cells changes based on their method of fabrication. 
#""",
#        "user_answer" : "C",
#        "user_rationale": "Isn't the new method of ETL fabrication the same as the 'established methods'",
#    },
]

## Evaluation 
The rubric is slightly biased in favor of the debate prompt, but it also hits the other points on the rubric and maintains clarity. If more prompts are to be tested, they can be added above.

In [15]:
import enum

# Define the evaluation prompt
EVAL_PROMPT = """
# Instruction
Evaluate the AI’s analysis of a student’s SAT error. Focus on how well it explains why the correct answer is textually supported and why the user’s answer is incorrect.

# Evaluation
## **Metric Definition**
Assess the **accuracy**, **completeness**, **groundedness**, and **clarity** of the response. The AI must:
1. Correctly identify textual evidence for the correct answer.
2. Explain flaws in the user’s answer (and their rationale, if provided).
3. Follow the prompt’s instructions precisely.


## **New Criteria**  
1. **Instructional Creativity**:  
   - Does the creative approach (e.g., debate, step-by-step analysis) **enhance understanding** of why the correct answer is textually supported?  
   - Does it **strategically use the prompt’s structure** (e.g., arguments for/against) to highlight key SAT skills like evidence analysis or assumption identification?  

2. **Educational Effectiveness**:  
   - Does the creativity **directly serve the learning goal** (e.g., clarifying misconceptions, modeling SAT logic), or is it merely ornamental?  
   - Does it **engage the learner** while maintaining rigor (e.g., making complex reasoning more accessible)?  

## **Revised Rating Rubric**  
- **5 (Excellent)**:  
  - Creative structure (e.g., debate) **directly reinforces** why the correct answer is superior.  
  - Uses the format to **explicitly contrast** the user’s error with textual evidence (e.g., “The strongest argument *for* the user’s answer is X, but the text contradicts this because Y”).  
  - Balances creativity with precision and clarity.  

- **4 (Good)**:  
  - Creative approach is **mostly effective** but slightly misses opportunities to deepen understanding (e.g., lists arguments but doesn’t explicitly tie them to SAT skills).  
  - Minor clarity issues in linking creativity to the text.  

- **3 (Adequate)**:  
  - Creativity **distracts** slightly from the core analysis (e.g., overemphasizes hypothetical arguments without grounding in the text).  
  - Fails to fully leverage the creative structure to address the user’s error.  

- **2 (Poor)**:  
  - Creative format **obscures key points** (e.g., hypothetical arguments misrepresent the text).  
  - Prioritizes style over substance; minimal educational value.  

- **1 (Very Poor)**:  
  - Creativity **undermines accuracy** (e.g., invents textual evidence to support arguments).  

## **Examples**  
### **Debate-Style Prompt (Question 4)**  
**User Answer**: C (“Henry’s reign resumed”)  
**Correct Answer**: A (“Henry resumed his reign”)  

**Good AI Response** (Rating 5):  
*“Argument FOR C: A student might think ‘Henry’s reign’ is the subject because the prior sentence mentions Parliament appointing a ruler.  
Argument AGAINST C: The modifier ‘upon recovering’ must refer to a person (Henry), not an abstract concept (‘reign’). The text says Henry fell ill, so only he—not his reign—can ‘recover.’  
Conclusion: While C seems plausible, the modifier rule and textual context make A correct.”*  

**Why it’s a 5**:  
- Uses debate structure to **preemptively address** the user’s assumption.  
- Directly ties arguments to **textual evidence** (Henry’s illness/recovery).  

**Poor AI Response** (Rating 2):  
*“FOR C: ‘Reign’ is a noun, so it matches the sentence structure.  
AGAINST C: It sounds awkward.  
Conclusion: A is better because it’s smoother.”*  

**Why it’s a 2**:  
- Creativity (debate) adds no educational value; arguments lack textual grounding.  
- Fails to explain grammar rules or modifier placement.  

## Evaluation Steps
STEP 1: Assess the response in aspects of instruction following, groundedness, conciseness, and verbosity according to the criteria.
STEP 2: Score based on the rubric.

# User Inputs and AI-generated Response
## User Inputs

### Prompt
{prompt}

## AI-generated Response
{response}
"""

# Define a structured enum class to capture the result.
class SummaryRating(enum.Enum):
  VERY_GOOD = '5'
  GOOD = '4'
  OK = '3'
  BAD = '2'
  VERY_BAD = '1'

# Coerce into the desired structure.
structured_output_config = types.GenerateContentConfig(
    response_mime_type="text/x.enum",
    response_schema=SummaryRating,
)

def eval_summary(prompt, ai_response):
  """Evaluate the generated summary against the prompt used."""

  eval_chat = client.chats.create(model='gemini-2.0-flash')
    
  # Generate the full text response.
  response = eval_chat.send_message(
      message=EVAL_PROMPT.format(prompt=prompt, response=ai_response)
  )
  verbose_eval = response.text


  response = eval_chat.send_message(
      message="Convert the final score.",
      config=structured_output_config,
  )
  structured_eval = response.parsed

  return verbose_eval, structured_eval

Evaluating the prompts against each other:

In [16]:
# TODO: maybe share context so the LLM can compare different outputs against its own rating to be more accurate perhaps?

#NUM_ITERATIONS = 5
NUM_ITERATIONS=1

with open("/kaggle/working/promptlogs.txt", "a") as logs:
    
    for prompt in prompts:
        sum = 0
        logs.write(f"Prompt: {prompt}\n")
    
        for question in questions:
            logs.write(f"Question: {question}\n")
            for i in range(NUM_ITERATIONS):
                response = client.models.generate_content(
                    model="gemini-2.0-flash-001",
                    contents=str(question),
                    config=GenerateContentConfig(
                        system_instruction=[prompt["prompt"],],
                        temperature=0.1,
                    ),
                )

                text_eval, struct_eval = eval_summary(prompt=prompt["prompt"], ai_response=response)
                print(f"Iteration {i}: {struct_eval}")
                logs.write(f"Iteration {i}: {text_eval}\n")
                sum += int(struct_eval.value)



        prompt["avg_score"] = sum / (len(questions) * NUM_ITERATIONS)
        print(f"Prompt:\n{prompt['prompt']}\nScore: {prompt['avg_score']}\n\n")
    
        logs.write("\n\n")

Iteration 0: SummaryRating.GOOD
Prompt:
Analyze the user's error. Why is the correct answer better supported by the text than the user's answer? Be concise.
Score: 4.0


Iteration 0: SummaryRating.GOOD
Prompt:

Consider the SAT question and the user's selected answer.
1. Identify the specific textual evidence that strongly supports the correct answer choice.
2. Identify any assumptions the user might be making that lead to their chosen answer.
3. Explain why the textual support for the correct answer is stronger or more direct than any implied support for the user's answer.  If the user provides their own rational, specifically address the rational and mention what part of the question makes the rational wrong.
        
Score: 4.0


Iteration 0: SummaryRating.GOOD
Prompt:

Let's analyze this question step-by-step to understand the user's error.

1.  Summarize the main point of the passage in your own words.
2.  Identify the key phrase(s) in the question that guide you to the correct an

The debate prompt consistently scores the highest, so that will be picked. Prompts can be re-generated and re-evaluated with ease if need be.

In [17]:
final_prompt = """
Evaluate both the correct answer and the user's answer as potential responses to the question.

*   Present the strongest possible argument *in favor* of the user's answer.
*   Present the strongest possible argument *against* the user's answer.
*   Explain why, ultimately, the correct answer is the superior choice based on textual evidence.
"""

## Note: "Smart" Questions
The majority of questions that Aquarc Intelligence has to process will contain HTML data. How does Aquarc Intelligence interact with "smart" questions (i.e. questions that make use of intelligent features like formatting, graphs, or other image representations of data)?

Most questions can be "cleansed" by converting to markdown like the following because there is text data available for most images (although the output is still quite messy):

In [18]:
# This question is (c) CollegeBoard 2025
rich_question = {
    "question": """
<figure class="image">
   <svg aria-label="Bar graph titled Municipalities’ Responses to Inquiries about Potential Incentives for Firm. The horizontal axis has no label. 3 data categories are shown. The vertical axis is labeled Number of municipalities. It ranges from 0 to 1,300 in increments of 100. Refer to long description." height="578.7376708984375" role="img" viewbox="0 0 400 578.7376708984375" width="400" xmlns="http://www.w3.org/2000/svg">
      <g data-name="Layer 1" id="ed420550-79eb-48d4-af01-cd27cdd08afd">
         <defs>
            +
            <pattern height="100" id="bar4" patterntransform="rotate(50)" patternunits="userSpaceOnUse" width="10" x="0" y="0">
               +
               <rect fill="#CDCDCD" height="100" width="5" x="0" y="0"></rect>
               +
               <rect fill="#444444" height="100" width="5" x="5" y="0"></rect>
               +
            </pattern>
            +
         </defs>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="99.75999450683594" x2="385" xmlns="http://www.w3.org/2000/svg" y1="72" y2="72"></line>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="93.75999450683594" x2="105.75999450683594" xmlns="http://www.w3.org/2000/svg" y1="72" y2="72"></line>
         <text fill="#000000" font-family="Crimson Text" font-size="19.84" text-anchor="end" transform="translate(87.75999450683594 78)">1,300</text>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="99.75999450683594" x2="385" xmlns="http://www.w3.org/2000/svg" y1="91.23076923076923" y2="91.23076923076923"></line>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="93.75999450683594" x2="105.75999450683594" xmlns="http://www.w3.org/2000/svg" y1="91.23076923076923" y2="91.23076923076923"></line>
         <text fill="#000000" font-family="Crimson Text" font-size="19.84" text-anchor="end" transform="translate(87.75999450683594 97.23076923076923)">1,200</text>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="99.75999450683594" x2="385" xmlns="http://www.w3.org/2000/svg" y1="110.46153846153845" y2="110.46153846153845"></line>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="93.75999450683594" x2="105.75999450683594" xmlns="http://www.w3.org/2000/svg" y1="110.46153846153845" y2="110.46153846153845"></line>
         <text fill="#000000" font-family="Crimson Text" font-size="19.84" text-anchor="end" transform="translate(87.75999450683594 116.46153846153845)">1,100</text>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="99.75999450683594" x2="385" xmlns="http://www.w3.org/2000/svg" y1="129.69230769230768" y2="129.69230769230768"></line>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="93.75999450683594" x2="105.75999450683594" xmlns="http://www.w3.org/2000/svg" y1="129.69230769230768" y2="129.69230769230768"></line>
         <text fill="#000000" font-family="Crimson Text" font-size="19.84" text-anchor="end" transform="translate(87.75999450683594 135.69230769230768)">1,000</text>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="99.75999450683594" x2="385" xmlns="http://www.w3.org/2000/svg" y1="148.9230769230769" y2="148.9230769230769"></line>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="93.75999450683594" x2="105.75999450683594" xmlns="http://www.w3.org/2000/svg" y1="148.9230769230769" y2="148.9230769230769"></line>
         <text fill="#000000" font-family="Crimson Text" font-size="19.84" text-anchor="end" transform="translate(87.75999450683594 154.9230769230769)">900</text>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="99.75999450683594" x2="385" xmlns="http://www.w3.org/2000/svg" y1="168.15384615384613" y2="168.15384615384613"></line>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="93.75999450683594" x2="105.75999450683594" xmlns="http://www.w3.org/2000/svg" y1="168.15384615384613" y2="168.15384615384613"></line>
         <text fill="#000000" font-family="Crimson Text" font-size="19.84" text-anchor="end" transform="translate(87.75999450683594 174.15384615384613)">800</text>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="99.75999450683594" x2="385" xmlns="http://www.w3.org/2000/svg" y1="187.3846153846154" y2="187.3846153846154"></line>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="93.75999450683594" x2="105.75999450683594" xmlns="http://www.w3.org/2000/svg" y1="187.3846153846154" y2="187.3846153846154"></line>
         <text fill="#000000" font-family="Crimson Text" font-size="19.84" text-anchor="end" transform="translate(87.75999450683594 193.3846153846154)">700</text>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="99.75999450683594" x2="385" xmlns="http://www.w3.org/2000/svg" y1="206.6153846153846" y2="206.6153846153846"></line>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="93.75999450683594" x2="105.75999450683594" xmlns="http://www.w3.org/2000/svg" y1="206.6153846153846" y2="206.6153846153846"></line>
         <text fill="#000000" font-family="Crimson Text" font-size="19.84" text-anchor="end" transform="translate(87.75999450683594 212.6153846153846)">600</text>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="99.75999450683594" x2="385" xmlns="http://www.w3.org/2000/svg" y1="225.84615384615384" y2="225.84615384615384"></line>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="93.75999450683594" x2="105.75999450683594" xmlns="http://www.w3.org/2000/svg" y1="225.84615384615384" y2="225.84615384615384"></line>
         <text fill="#000000" font-family="Crimson Text" font-size="19.84" text-anchor="end" transform="translate(87.75999450683594 231.84615384615384)">500</text>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="99.75999450683594" x2="385" xmlns="http://www.w3.org/2000/svg" y1="245.07692307692307" y2="245.07692307692307"></line>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="93.75999450683594" x2="105.75999450683594" xmlns="http://www.w3.org/2000/svg" y1="245.07692307692307" y2="245.07692307692307"></line>
         <text fill="#000000" font-family="Crimson Text" font-size="19.84" text-anchor="end" transform="translate(87.75999450683594 251.07692307692307)">400</text>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="99.75999450683594" x2="385" xmlns="http://www.w3.org/2000/svg" y1="264.30769230769226" y2="264.30769230769226"></line>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="93.75999450683594" x2="105.75999450683594" xmlns="http://www.w3.org/2000/svg" y1="264.30769230769226" y2="264.30769230769226"></line>
         <text fill="#000000" font-family="Crimson Text" font-size="19.84" text-anchor="end" transform="translate(87.75999450683594 270.30769230769226)">300</text>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="99.75999450683594" x2="385" xmlns="http://www.w3.org/2000/svg" y1="283.53846153846155" y2="283.53846153846155"></line>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="93.75999450683594" x2="105.75999450683594" xmlns="http://www.w3.org/2000/svg" y1="283.53846153846155" y2="283.53846153846155"></line>
         <text fill="#000000" font-family="Crimson Text" font-size="19.84" text-anchor="end" transform="translate(87.75999450683594 289.53846153846155)">200</text>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="99.75999450683594" x2="385" xmlns="http://www.w3.org/2000/svg" y1="302.7692307692308" y2="302.7692307692308"></line>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="93.75999450683594" x2="105.75999450683594" xmlns="http://www.w3.org/2000/svg" y1="302.7692307692308" y2="302.7692307692308"></line>
         <text fill="#000000" font-family="Crimson Text" font-size="19.84" text-anchor="end" transform="translate(87.75999450683594 308.7692307692308)">100</text>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="99.75999450683594" x2="385" xmlns="http://www.w3.org/2000/svg" y1="322" y2="322"></line>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="93.75999450683594" x2="105.75999450683594" xmlns="http://www.w3.org/2000/svg" y1="322" y2="322"></line>
         <text fill="#000000" font-family="Crimson Text" font-size="19.84" text-anchor="end" transform="translate(87.75999450683594 328)">0</text>
         <text fill="#000000" font-family="Crimson Text" font-size="19.84" text-anchor="middle" transform="translate(24 197) rotate(-90)">Number of municipalities</text>
         <line fill="none" stroke="#000000" stroke-linecap="round" stroke-linejoin="round" stroke-width="0.9" x1="99.75999450683594" x2="99.75999450683594" xmlns="http://www.w3.org/2000/svg" y1="72" y2="322"></line>
         <rect fill="#B3B3B3" height="240.76923076923077" stroke="#000000" stroke-linecap="round" stroke-linejoin="mitre" stroke-width="0.9" width="28.524000549316405" x="128.28399505615235" xmlns="http://www.w3.org/2000/svg" y="81.23076923076923"></rect>
         <rect fill="#333333" height="240" stroke="#000000" stroke-linecap="round" stroke-linejoin="mitre" stroke-width="0.9" width="28.524000549316405" x="156.80799560546876" xmlns="http://www.w3.org/2000/svg" y="82"></rect>
         <text fill="#000000" font-family="Crimson Text" font-size="19.84" text-anchor="end" transform="translate(166.72799560546875 341.84) rotate(-40)" x="0" xmlns="http://www.w3.org/2000/svg" y="0">no response</text>
         <rect fill="#B3B3B3" height="39.80769230769231" stroke="#000000" stroke-linecap="round" stroke-linejoin="mitre" stroke-width="0.9" width="28.524000549316405" x="213.85599670410159" xmlns="http://www.w3.org/2000/svg" y="282.1923076923077"></rect>
         <rect fill="#333333" height="39.42307692307692" stroke="#000000" stroke-linecap="round" stroke-linejoin="mitre" stroke-width="0.9" width="28.524000549316405" x="242.379997253418" xmlns="http://www.w3.org/2000/svg" y="282.5769230769231"></rect>
         <text fill="#000000" font-family="Crimson Text" font-size="19.84" text-anchor="end" transform="translate(252.29999725341798 341.84) rotate(-40)" x="0" xmlns="http://www.w3.org/2000/svg" y="0">responded to inquiry</text>
         <rect fill="#B3B3B3" height="24.23076923076923" stroke="#000000" stroke-linecap="round" stroke-linejoin="mitre" stroke-width="0.9" width="28.524000549316405" x="299.4279983520508" xmlns="http://www.w3.org/2000/svg" y="297.7692307692308"></rect>
         <rect fill="#333333" height="23.46153846153846" stroke="#000000" stroke-linecap="round" stroke-linejoin="mitre" stroke-width="0.9" width="28.524000549316405" x="327.9519989013672" xmlns="http://www.w3.org/2000/svg" y="298.53846153846155"></rect>
         <text fill="#000000" font-family="Crimson Text" font-size="19.84" text-anchor="end" transform="translate(337.8719989013672 341.84) rotate(-40)" x="0" xmlns="http://www.w3.org/2000/svg" y="0">offered incentive</text>
         <text fill="#000000" font-family="Crimson Text" font-size="19.84" text-anchor="middle" transform="translate(242.37999725341797 24)">Municipalities’ Responses to Inquiries </text>
         <text fill="#000000" font-family="Crimson Text" font-size="19.84" text-anchor="middle" transform="translate(242.37999725341797 48)">about Potential Incentives for Firm</text>
         <rect fill="none" height="71" stroke="#000000" stroke-linejoin="mitre" stroke-width="0.9" width="280.1479034423828" x="67.4260482788086" xmlns="http://www.w3.org/2000/svg" y="496.7376708984375"></rect>
         <rect fill="#B3B3B3" height="12" stroke="#000000" stroke-linejoin="mitre" stroke-width="0.9" width="12" x="74.4260482788086" xmlns="http://www.w3.org/2000/svg" y="508.7376708984375"></rect>
         <text fill="#000000" font-family="Crimson Text" font-size="19.84" text-anchor="top" transform="translate(96.4260482788086 520.7376708984375)"> announcement before election</text>
         <rect fill="#333333" height="12" stroke="#000000" stroke-linejoin="mitre" stroke-width="0.9" width="12" x="74.4260482788086" xmlns="http://www.w3.org/2000/svg" y="540.7376708984375"></rect>
         <text fill="#000000" font-family="Crimson Text" font-size="19.84" text-anchor="top" transform="translate(96.4260482788086 552.7376708984375)"> announcement after election</text>
      </g>
   </svg>
</figure>
<div aria-label="Long description for bar graph titled Municipalities’ Responses to Inquiries about Potential Incentives for Firm" class="sr-only" role="region">
   <ul>
      +
      <li>
         For each data category, the following bars are shown: <br/>                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           +
         <ul>
            +
            <li>announcement before election</li>
            +
            <li>announcement after election</li>
            +
         </ul>
         +
      </li>
      +
      <li>
         The data for the 3 categories are as follows: <br/>                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   +
         <ul>
            +
            <li>
               no response:                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          +
               <ul>
                  +
                  <li>announcement before election: 1,252</li>
                  +
                  <li>announcement after election: 1,248</li>
                  +
               </ul>
               +
            </li>
            +
            <li>
               responded to inquiry:                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 +
               <ul>
                  +
                  <li>announcement before election: 207</li>
                  +
                  <li>announcement after election: 205</li>
                  +
               </ul>
               +
            </li>
            +
            <li>
               offered incentive:                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    +
               <ul>
                  +
                  <li>announcement before election: 128</li>
                  +
                  <li>announcement after election: 122</li>
                  +
               </ul>
               +
            </li>
            +
         </ul>
         +
      </li>
      +
   </ul>
</div>
+
<p>In the United States, firms often seek incentives from municipal governments to expand to those municipalities. A team of political scientists hypothesized that municipalities are much more likely to respond to firms and offer incentives if expansions can be announced in time to benefit local elected officials than if they can’t. The team contacted officials in thousands of municipalities, inquiring about incentives for a firm looking to expand and indicating that the firm would announce its expansion on a date either just before or just after the next election. </p>

<p>Which choice best describes data from the graph that weaken the team&rsquo;s hypothesis?</p>

A. <p>A large majority of the municipalities that received an inquiry mentioning plans for an announcement before the next election didn&rsquo;t respond to the inquiry.</p>
B. <p>The proportion of municipalities that responded to the inquiry or offered incentives didn&rsquo;t substantially differ across the announcement timing conditions.&nbsp;</p>
C. <p>Only around half the municipalities that responded to inquiries mentioning plans for an announcement before the next election offered incentives.&nbsp;</p>
D. <p>Of the municipalities that received an inquiry mentioning plans for an announcement date after the next election, more than 1,200 didn&rsquo;t respond and only around 100 offered incentives.</p>
    """,
    "rationale": """
<p>Choice B is the best answer. The lighter bars show what happened when the announcement was to come before the election, and the darker bars show what happened when the announcement was to come after the election. For all three of the outcomes, the light and dark bars are virtually the same, demonstrating that the announcement timing didn&rsquo;t actually make a difference. </p>
<p>Choice A is incorrect. This accurately describes some data from the graph, but it doesn&rsquo;t weaken the hypothesis. It doesn&rsquo;t include the &ldquo;announcement after election&rdquo; data for comparison. Choice C is incorrect. This accurately describes some data from the graph, but it doesn&rsquo;t weaken the hypothesis. It doesn&rsquo;t include the &ldquo;announcement after election&rdquo; data for comparison. Choice D is incorrect. This accurately describes some data from the graph, but it doesn&rsquo;t weaken the hypothesis. It doesn&rsquo;t include the &ldquo;announcement before election&rdquo; data for comparison. </p>    
    """,
    "user_choice": "D",
    "user_rationale": "isn't this data objly true?"
}

Even with the SVG, Gemini can still interpret the noisy data:

In [19]:
sample_rich_answer = client.models.generate_content(
                    model="gemini-2.0-flash-001",
                    contents=str(rich_question),
                    config=GenerateContentConfig(
                        system_instruction=[final_prompt,],
                        temperature=0.1,
                    ),
                )

Markdown(sample_rich_answer.text)

Okay, let's analyze the question and the user's answer.

**Question:** Which choice best describes data from the graph that weaken the team’s hypothesis?

**Correct Answer:** B. The proportion of municipalities that responded to the inquiry or offered incentives didn’t substantially differ across the announcement timing conditions.

**User's Answer:** D. Of the municipalities that received an inquiry mentioning plans for an announcement date after the next election, more than 1,200 didn’t respond and only around 100 offered incentives.

**Arguments in favor of the user's answer (D):**

*   **Directly addresses the "after election" scenario:** The user's answer focuses specifically on the municipalities that received inquiries about announcements *after* the election. The hypothesis suggests these municipalities should be *less* likely to respond or offer incentives.
*   **Highlights a potential lack of interest:** The user's answer points out that a large number of municipalities didn't respond and only a small number offered incentives in the "after election" scenario. This could be interpreted as evidence supporting the idea that municipalities are less interested when there's no immediate electoral benefit.
*   **Based on factual data:** The user's answer accurately reflects the data presented in the graph. The graph does show a large number of "no response" and a smaller number of "offered incentive" for the "announcement after election" condition.

**Arguments against the user's answer (D):**

*   **Lacks a crucial comparison:** The primary weakness of the user's answer is that it only describes the "announcement after election" condition. To *weaken* the hypothesis, we need to show that the "announcement before election" condition *doesn't* have a significantly different outcome. Without this comparison, we can't say whether the timing of the announcement truly matters. The hypothesis is about the *difference* between the two conditions.
*   **Doesn't directly contradict the hypothesis:** While the user's answer shows a certain pattern in the "after election" data, it doesn't directly contradict the hypothesis. The hypothesis could still be true even if the "after election" group shows low response rates, as long as the "before election" group shows significantly *higher* response rates.

**Why the correct answer (B) is superior:**

*   **Directly addresses the core of the hypothesis:** The hypothesis is about whether the *timing* of the announcement affects municipal behavior. Choice B directly addresses this by stating that the proportion of municipalities that responded or offered incentives *didn't substantially differ* between the "before" and "after" conditions. This directly contradicts the hypothesis that timing matters.
*   **Comparative:** Choice B makes a direct comparison between the two announcement timing conditions, which is essential for weakening the hypothesis. It states that there's no substantial difference, implying that the timing of the announcement doesn't have a significant impact on the municipalities' responses.
*   **Supported by the graph:** By examining the graph, we can see that the bars for "announcement before election" and "announcement after election" are very similar for all three categories (no response, responded to inquiry, offered incentive). This visually supports the claim that there's no substantial difference between the two conditions.

**Conclusion:**

While the user's answer (D) is factually correct and relevant to the question, it doesn't provide a strong argument against the hypothesis because it lacks a crucial comparison. The correct answer (B) is superior because it directly addresses the core of the hypothesis by stating that there's no substantial difference in municipal behavior across the two announcement timing conditions, and it is supported by the data presented in the graph.


While cleansing the data may be smoother for the LLM, not all data can be easily cleansed:

In [20]:
from html_to_markdown import convert_to_markdown

rich_question = {
    "question": convert_to_markdown("""
<p><span role="region" aria-label="Referenced Content"><u>&ldquo;How lifelike are they?&rdquo;</u></span> Many computer animators prioritize this question as they strive to create ever more realistic environments and lighting. Generally, while characters in computer-animated films appear highly exaggerated, environments and lighting are carefully engineered to mimic reality. But some animators, such as Pixar&rsquo;s Sanjay Patel, are focused on a different question. Rather than asking first whether the environments and lighting they&rsquo;re creating are convincingly lifelike, Patel and others are asking whether these elements reflect their films&rsquo; unique stories.</p>

<p>Which choice best describes the function of the underlined question in the text as a whole?</p>

A. <p>It reflects a primary goal that many computer animators have for certain components of the animations they produce.</p>
B. <p>It represents a concern of computer animators who are more interested in creating unique backgrounds and lighting effects than realistic ones.</p>
C. <p>It conveys the uncertainty among many computer animators about how to create realistic animations using current technology.</p>
D. <p>It illustrates a reaction that audiences typically have to the appearance of characters created by computer animators.</p>
    """),
    "rationale": convert_to_markdown("""
<p>Choice A is the best answer because it most accurately describes the function of the underlined question in the text as a whole. The text begins with the underlined question, &ldquo;How lifelike are they?&rdquo; The text then explains that many computer animators pose this question about the environments and lighting that they create for animated films, striving for realistic animation of those components even if the characters themselves aren&rsquo;t portrayed in realistic terms. The focus of the text then shifts to describe how some animators strive to create environments and lighting that reflect the film&rsquo;s unique stories rather than making them appear realistic. Therefore, the function of the underlined question is to reflect a primary goal that many computer animators have for certain components of the animations they produce. </p><p>Choice B is incorrect because, as the text makes clear, the underlined question is one posed by computer animators who wish to create realistic backgrounds and lighting effects, not by those who, instead, wish to create effects that reflect films&rsquo; unique stories and aren&rsquo;t necessarily realistic; this latter group of animators is discussed later in the text. Choice C is incorrect. As the text explains, many computer animators strive for realistic environments and lighting, while others do not; this difference of approach relates to whether these components should be realistic, not to how realism can be achieved using current technology, and the text never suggests that animators are uncertain how to achieve it. Choice D is incorrect because the underlined question pertains to the perspective of computer animators, not the audience, and the text never considers audience&rsquo;s reactions to characters in animated films. </p>
    """),
    "user_choice": "C",
    "user_rationale": "why is it not C? Aren't they asking the question because they are uncertain about it?",
}

print(rich_question["question"])


“How lifelike are they?” Many computer animators prioritize this question as they strive to create ever more realistic environments and lighting. Generally, while characters in computer\-animated films appear highly exaggerated, environments and lighting are carefully engineered to mimic reality. But some animators, such as Pixar’s Sanjay Patel, are focused on a different question. Rather than asking first whether the environments and lighting they’re creating are convincingly lifelike, Patel and others are asking whether these elements reflect their films’ unique stories.


Which choice best describes the function of the underlined question in the text as a whole?



A. It reflects a primary goal that many computer animators have for certain components of the animations they produce.


B. It represents a concern of computer animators who are more interested in creating unique backgrounds and lighting effects than realistic ones.


C. It conveys the uncertainty among many computer ani

Perhaps in this scenario, "good enough" gets the job done.

# Multiple Reasoning Capabilities using Tree of Thoughts (ToT)
To understand questions better, the debate-style approach may not suffice; multiple, parallel thinking processes may be necessary for the user to truly grasp the problem with their current reasoning. This approach is less time consuming and may help the user better than walking through and typing their own rationale. Tree of Thoughts (ToT) is a technique for the agent to use multiple processes to approach the relevant answer for the user. The agent will **have to** output in JSON so that the user's screen can be separated and decluttered.

In [21]:
response = client.models.generate_content(
    model="gemini-2.0-flash-001",
    # no specs yet
    contents=f"""
**Objective**: Create SAT-focused prompt variations that generate multiple structured thinking processes for error analysis.
                
**Requirements for the agent that you will create a prompt for**:
1. The model you are creating a prompt for will analyze: 
   - Current SAT question
   - Official rationale
   - User's answer + rationale
2. Must output in the following format:
```json
[
    {{
        "thinking_process": "[Here you will define a process step by step to get one of the answers]",
        "leads_to": "user" | "correct"
    }},
    {{
        "thinking_process": "[Here you will define another process step by step to get one of the answers]",
        "leads_to": "user" | "correct"
    }},
    {{
        "thinking_process": "[Include as many of these as possible!]",
        "leads_to": "user" | "correct"
    }},
]
```

3. There will be a model before you that contrasts an argument in favor of the user's answer with an argument for the actual answer.
You will be given that output. Create a prompt such that this model will build upon it.

The model may not include anything outside the JSON.

\"leads_to\" represents that it leads to the users answer or the \"correct\" answer. There are no other options.

Example question your model may receive:

Question:
{question['question']}

Rationale:
{question['rationale']}

User Response:
- Answer: {question['user_answer']}
- User Rationale: {question['user_rationale']}

Sample Response from the Previous Agent (not related to the question):
{sample_rich_answer}.

Don't generate any code. Just generate multiple creative but logical prompts we can use to make an AI for this
              """,
    config=GenerateContentConfig(
        system_instruction=[
            "You are a prompt engineer's assistant. Help the prompt engineer generate some prompts for his AI-powered SAT learning platform called Aquarc. The platform currently holds an SAT question bank with over 5000 questions and tracks which questions you get wrong per category. While this feature is helpful it lacks the intelligence necessary to be a full fledged SAT platform",
        ],
        temperature=0.7,
        top_k=7,
    ),
)

Markdown(response.text)

Okay, here are some prompt variations designed to elicit structured thinking processes for error analysis from the AI model, building upon the previous agent's output:

**Prompt 1: Focus on pinpointing the error in reasoning**

```
You are an expert SAT tutor analyzing a student's mistake on a practice question. You are given the question, the official rationale, the student's answer and rationale, and a previous AI agent's analysis that includes arguments for and against the student's answer. Your task is to identify the precise flaw in the student's reasoning process that led to the incorrect answer.

Specifically, based on all the provided information, generate a JSON array of different step by step thought processes one could use to arrive at each answer. For each thought process, indicate whether it leads to the user's answer or the correct answer using the "leads_to" field.

The JSON should conform to the following format:

```json
[
    {
        "thinking_process": "[Here you will define a process step by step to get one of the answers]",
        "leads_to": "user" | "correct"
    },
    {
        "thinking_process": "[Here you will define another process step by step to get one of the answers]",
        "leads_to": "user" | "correct"
    },
    {
        "thinking_process": "[Include as many of these as possible!]",
        "leads_to": "user" | "correct"
    },
]
```

Here's the information for your analysis:

**Question:** [Insert Question Here]
**Rationale:** [Insert Rationale Here]
**User Answer:** [Insert User Answer Here]
**User Rationale:** [Insert User Rationale Here]
**Previous Agent's Analysis:** [Insert Previous Agent's Analysis Here]
```

**Prompt 2: Focus on alternative approaches to the question**

```
You are an expert SAT tutor helping a student understand different ways to approach a question they answered incorrectly. You are given the question, the official rationale, the student's answer and rationale, and a previous AI agent's analysis that includes arguments for and against the student's answer. Your task is to outline multiple distinct thought processes that one could use to arrive at each potential answer choice.

Specifically, based on all the provided information, generate a JSON array of different step by step thought processes one could use to arrive at each answer. For each thought process, indicate whether it leads to the user's answer or the correct answer using the "leads_to" field.

The JSON should conform to the following format:

```json
[
    {
        "thinking_process": "[Here you will define a process step by step to get one of the answers]",
        "leads_to": "user" | "correct"
    },
    {
        "thinking_process": "[Here you will define another process step by step to get one of the answers]",
        "leads_to": "user" | "correct"
    },
    {
        "thinking_process": "[Include as many of these as possible!]",
        "leads_to": "user" | "correct"
    },
]
```

Here's the information for your analysis:

**Question:** [Insert Question Here]
**Rationale:** [Insert Rationale Here]
**User Answer:** [Insert User Answer Here]
**User Rationale:** [Insert User Rationale Here]
**Previous Agent's Analysis:** [Insert Previous Agent's Analysis Here]
```

**Prompt 3: Focus on Misconceptions and Traps**

```
You are an expert SAT tutor identifying common misconceptions and traps that students fall into on SAT questions. You are given the question, the official rationale, the student's answer and rationale, and a previous AI agent's analysis that includes arguments for and against the student's answer. Your task is to outline thought processes that, while seemingly logical, lead to incorrect answer choices due to common misconceptions or test-taking traps.

Specifically, based on all the provided information, generate a JSON array of different step by step thought processes one could use to arrive at each answer. For each thought process, indicate whether it leads to the user's answer or the correct answer using the "leads_to" field.

The JSON should conform to the following format:

```json
[
    {
        "thinking_process": "[Here you will define a process step by step to get one of the answers, focusing on misconceptions or traps]",
        "leads_to": "user" | "correct"
    },
    {
        "thinking_process": "[Here you will define another process step by step to get one of the answers, focusing on misconceptions or traps]",
        "leads_to": "user" | "correct"
    },
    {
        "thinking_process": "[Include as many of these as possible!]",
        "leads_to": "user" | "correct"
    },
]
```

Here's the information for your analysis:

**Question:** [Insert Question Here]
**Rationale:** [Insert Rationale Here]
**User Answer:** [Insert User Answer Here]
**User Rationale:** [Insert User Rationale Here]
**Previous Agent's Analysis:** [Insert Previous Agent's Analysis Here]
```

**Prompt 4: Focus on different problem-solving strategies**

```
You are an expert SAT tutor demonstrating different problem-solving strategies for a single question. You are given the question, the official rationale, the student's answer and rationale, and a previous AI agent's analysis that includes arguments for and against the student's answer. Your task is to outline various approaches to solving the problem, highlighting different strategies (e.g., process of elimination, plugging in numbers, identifying keywords).

Specifically, based on all the provided information, generate a JSON array of different step by step thought processes one could use to arrive at each answer. For each thought process, indicate whether it leads to the user's answer or the correct answer using the "leads_to" field.

The JSON should conform to the following format:

```json
[
    {
        "thinking_process": "[Here you will define a process step by step to get one of the answers, highlighting a specific problem-solving strategy]",
        "leads_to": "user" | "correct"
    },
    {
        "thinking_process": "[Here you will define another process step by step to get one of the answers, highlighting a specific problem-solving strategy]",
        "leads_to": "user" | "correct"
    },
    {
        "thinking_process": "[Include as many of these as possible!]",
        "leads_to": "user" | "correct"
    },
]
```

Here's the information for your analysis:

**Question:** [Insert Question Here]
**Rationale:** [Insert Rationale Here]
**User Answer:** [Insert User Answer Here]
**User Rationale:** [Insert User Rationale Here]
**Previous Agent's Analysis:** [Insert Previous Agent's Analysis Here]
```

**Prompt 5: Combined Approach**

```
You are an expert SAT tutor providing a comprehensive error analysis. You are given the question, the official rationale, the student's answer and rationale, and a previous AI agent's analysis that includes arguments for and against the student's answer. Your task is to analyze the student's mistake from multiple angles, identifying flaws in reasoning, exploring alternative approaches, highlighting misconceptions, and demonstrating different problem-solving strategies.

Specifically, based on all the provided information, generate a JSON array of different step by step thought processes one could use to arrive at each answer. For each thought process, indicate whether it leads to the user's answer or the correct answer using the "leads_to" field. Aim for a diverse range of thinking processes.

The JSON should conform to the following format:

```json
[
    {
        "thinking_process": "[Here you will define a process step by step to get one of the answers]",
        "leads_to": "user" | "correct"
    },
    {
        "thinking_process": "[Here you will define another process step by step to get one of the answers]",
        "leads_to": "user" | "correct"
    },
    {
        "thinking_process": "[Include as many of these as possible!]",
        "leads_to": "user" | "correct"
    },
]
```

Here's the information for your analysis:

**Question:** [Insert Question Here]
**Rationale:** [Insert Rationale Here]
**User Answer:** [Insert User Answer Here]
**User Rationale:** [Insert User Rationale Here]
**Previous Agent's Analysis:** [Insert Previous Agent's Analysis Here]
```

Key improvements and considerations:

*   **Emphasis on Variety:** The prompts explicitly encourage the generation of *multiple* and *diverse* thinking processes.
*   **Clear Role Definition:** Each prompt establishes a specific role for the AI (e.g., identifying misconceptions, demonstrating strategies), which can help focus its analysis.
*   **Building on Previous Analysis:**  The prompts consistently include the output from the previous agent, ensuring that the AI builds upon existing information and doesn't repeat work.
*   **JSON Format Enforcement:** The prompts reiterate the required JSON format to ensure consistent output.
*   **Action-Oriented:** The prompts use action verbs like "identify," "outline," and "demonstrate" to encourage the AI to actively engage with the material.
*   **Example Question Omission**: I've omitted the example question, as it is not necessary.
*   **Clear Explanation of 'leads_to'**: I've ensured that the function of "leads_to" is clear and easily understood.


Here I've picked the 5 best and most creative prompts to evaluate:

In [22]:
prompts = [ 
    {
        "prompt": """
You are an expert SAT tutor analyzing a student's mistake on an SAT question. You are provided with:

1.  The SAT question, including the text, answer choices, and the correct answer.
2.  The official rationale for the correct answer and why the other choices are incorrect.
3.  The student's chosen answer and their rationale for choosing that answer.
4.  An argument contrasting the user's answer with the correct answer.

Based on this information, identify the potential thinking processes that could have led the student to their incorrect answer, as well as the thinking process for the correct answer. Focus on common SAT error patterns (e.g., misreading the question, applying incorrect grammar rules, making unwarranted assumptions, etc.).

Output a JSON array of objects. Each object represents a distinct thinking process and whether it leads to the user's answer or the correct answer. Include as many plausible thinking processes as possible, even if they seem obvious.

Format:
```json
[
    {
        "thinking_process": "[Here you will define a process step by step to get one of the answers]",
        "leads_to": "user" | "correct"
    },
    {
        "thinking_process": "[Here you will define another process step by step to get one of the answers]",
        "leads_to": "user" | "correct"
    },
    {
        "thinking_process": "[Include as many of these as possible!]",
        "leads_to": "user" | "correct"
    },
]
```
        """,
        "avg_score": 0.0,
    },
    {
        "prompt": """
You are an expert SAT tutor analyzing a student's mistake on an SAT question. You are provided with:

1. The SAT question, including the text, answer choices, and the correct answer.
2. The official rationale for the correct answer and why the other choices are incorrect.
3. The student's chosen answer and their rationale for choosing that answer.
4. An argument contrasting the user's answer with the correct answer.

Deconstruct the official rationale and the student's rationale. Identify the key assumptions, logical steps, and potential misunderstandings in each. Then, reconstruct different thinking processes – both correct and incorrect – that could lead to each answer choice. Be detailed and explicit in each step.

Output a JSON array of objects. Each object represents a distinct thinking process and whether it leads to the user's answer or the correct answer. Include as many plausible thinking processes as possible, even if they seem obvious.

Format:
```json
[
    {
        "thinking_process": "[Here you will define a process step by step to get one of the answers]",
        "leads_to": "user" | "correct"
    },
    {
        "thinking_process": "[Here you will define another process step by step to get one of the answers]",
        "leads_to": "user" | "correct"
    },
    {
        "thinking_process": "[Include as many of these as possible!]",
        "leads_to": "user" | "correct"
    },
]
```
        """,
        "avg_score": 0.0,
    },
    {
        "prompt" : """
You are an expert SAT tutor analyzing a student's mistake on an SAT question. You are provided with:

1. The SAT question, including the text, answer choices, and the correct answer.
2. The official rationale for the correct answer and why the other choices are incorrect.
3. The student's chosen answer and their rationale for choosing that answer.
4. An argument contrasting the user's answer with the correct answer.

Consider potential cognitive biases (e.g., confirmation bias, anchoring bias, availability heuristic) that might have influenced the student's decision-making process. Develop multiple step-by-step thinking processes that incorporate these biases, leading to both the student's answer and the correct answer.

Output a JSON array of objects. Each object represents a distinct thinking process and whether it leads to the user's answer or the correct answer. Include as many plausible thinking processes as possible, even if they seem obvious.

Format:
```json
[
    {
        "thinking_process": "[Here you will define a process step by step to get one of the answers]",
        "leads_to": "user" | "correct"
    },
    {
        "thinking_process": "[Here you will define another process step by step to get one of the answers]",
        "leads_to": "user" | "correct"
    },
    {
        "thinking_process": "[Include as many of these as possible!]",
        "leads_to": "user" | "correct"
    },
]
```
        """,
        "avg_score": 0.0,
    },
    {
        "prompt": """
You are an expert SAT tutor analyzing a student's mistake on an SAT question. You are provided with:

1. The SAT question, including the text, answer choices, and the correct answer.
2. The official rationale for the correct answer and why the other choices are incorrect.
3. The student's chosen answer and their rationale for choosing that answer.
4. An argument contrasting the user's answer with the correct answer.

Elaborate on multiple potential "chains of thought" a student might follow when approaching the question. Some chains should lead to the correct answer, while others should lead to the student's incorrect answer. Make each step in the chain explicit and easy to follow.

Output a JSON array of objects. Each object represents a distinct thinking process and whether it leads to the user's answer or the correct answer. Include as many plausible thinking processes as possible, even if they seem obvious.

Format:
```json
[
    {
        "thinking_process": "[Here you will define a process step by step to get one of the answers]",
        "leads_to": "user" | "correct"
    },
    {
        "thinking_process": "[Here you will define another process step by step to get one of the answers]",
        "leads_to": "user" | "correct"
    },
    {
        "thinking_process": "[Include as many of these as possible!]",
        "leads_to": "user" | "correct"
    },
]
```
        """,
        "avg_score": 0.0,
    },
    { 
        "prompt": """
You are an expert SAT tutor analyzing a student's mistake on an SAT question. You are provided with:

    The SAT question, including the text, answer choices, and the correct answer.
    The official rationale for the correct answer and why the other choices are incorrect.
    The student's chosen answer and their rationale for choosing that answer.
    An argument contrasting the user's answer with the correct answer.

Synthesize multiple approaches to analyze the student's error. Consider:

1. Common SAT error patterns.
2. Deconstruction of the official and student rationales.
3. Potential cognitive biases.
4. Detailed chains of thought.

Develop as many distinct, step-by-step thinking processes as possible that could lead to each answer choice (both the student's and the correct one). Be exhaustive in your analysis.

Output a JSON array of objects. Each object represents a distinct thinking process and whether it leads to the user's answer or the correct answer. Include as many plausible thinking processes as possible, even if they seem obvious.

Format:
```json
[
    {
        "thinking_process": "[Here you will define a process step by step to get one of the answers]",
        "leads_to": "user" | "correct"
    },
    {
        "thinking_process": "[Here you will define another process step by step to get one of the answers]",
        "leads_to": "user" | "correct"
    },
    {
        "thinking_process": "[Include as many of these as possible!]",
        "leads_to": "user" | "correct"
    },
]
```

        """,
        "avg_score": 0.0,
    },
]

## Evaluation of ToT Prompts

Evaluate the ToT prompts against each other:

In [23]:
# Define the evaluation prompt
EVAL_PROMPT = """
# Instruction
Evaluate the AI’s analysis of a student’s SAT error. Focus on how well it explains why the correct answer is textually supported and why the user’s answer is incorrect.
This analysis is intended to build upon a previous debate style analysis where the user is presented upon a pro/con argument for their answer. These prompts will be used in the agent for when the user wants more clarity and multiple thinking processes.

# Evaluation
## **Metric Definition**
Assess the **accuracy**, **completeness**, **groundedness**, and **clarity** of the response. The AI must:
1. Correctly identify textual evidence for the correct answer.
2. Explain flaws in the user’s answer (and their rationale, if provided).
3. Follow the prompt’s instructions precisely.
4. Really help the user understand why they were wrong by looking at how other people can mess up / how they potentially messed up.


## **Criteria**  
1. **Instructional Creativity**:  
   - Does the creative approach (e.g., debate, step-by-step analysis) **enhance understanding** of why the correct answer is textually supported?  
   - Does it **strategically use the prompt’s structure** (e.g., arguments for/against) to highlight key SAT skills like evidence analysis or assumption identification?  

2. **Educational Effectiveness**:  
   - Does the creativity **directly serve the learning goal** (e.g., clarifying misconceptions, modeling SAT logic), or is it merely ornamental?  
   - Does it **engage the learner** while maintaining rigor (e.g., making complex reasoning more accessible)?  

## **Revised Rating Rubric**  
- **5 (Excellent)**:  
  - Creative structure (e.g., debate) **directly reinforces** why the correct answer is superior.  
  - Uses the format to **explicitly contrast** the user’s error with textual evidence (e.g., “The strongest argument *for* the user’s answer is X, but the text contradicts this because Y”).  
  - Balances creativity with precision and clarity.  

- **4 (Good)**:  
  - Creative approach is **mostly effective** but slightly misses opportunities to deepen understanding (e.g., lists arguments but doesn’t explicitly tie them to SAT skills).  
  - Minor clarity issues in linking creativity to the text.  

- **3 (Adequate)**:  
  - Creativity **distracts** slightly from the core analysis (e.g., overemphasizes hypothetical arguments without grounding in the text).  
  - Fails to fully leverage the creative structure to address the user’s error.  

- **2 (Poor)**:  
  - Creative format **obscures key points** (e.g., hypothetical arguments misrepresent the text).  
  - Prioritizes style over substance; minimal educational value.  

- **1 (Very Poor)**:  
  - Creativity **undermines accuracy** (e.g., invents textual evidence to support arguments).  

## Note
This evaluation rubric is purposefully vague, so take that as an opportunity to be strict and make sure the AI is really being useful for users.
Be harsh with your feedback, but make sure your feedback is grounded in reality.

## Evaluation Steps
STEP 1: Assess the response in aspects of instruction following, groundedness, conciseness, and verbosity according to the criteria.
STEP 2: Score based on the rubric.

# User Inputs and AI-generated Response
## User Inputs

### Prompt
{prompt}

## AI-generated Response
{response}
"""


Now for the bulk of the evaluation cycle:

In [24]:
# TODO: maybe share context so the LLM can compare different outputs against its own rating to be more accurate perhaps?

NUM_ITERATIONS = 6
#NUM_ITERATIONS=1

with open("/kaggle/working/totpromptlogs.txt", "a") as logs:
    
    for prompt in prompts:
        sum = 0
        logs.write(f"Prompt: {prompt}\n")
    
        for question in questions:
            logs.write(f"Question: {question}\n")
            for i in range(NUM_ITERATIONS):
                response = client.models.generate_content(
                    model="gemini-2.0-flash-001",
                    contents=str(question),
                    config=GenerateContentConfig(
                        system_instruction=[prompt["prompt"],],
                        temperature=0.1,
                    ),
                )

                text_eval, struct_eval = eval_summary(prompt=prompt["prompt"], ai_response=response)
                print(f"Iteration {i}: {struct_eval}")
                logs.write(f"Iteration {i}: {text_eval}\n")
                sum += int(struct_eval.value)



        prompt["avg_score"] = sum / (len(questions) * NUM_ITERATIONS)
        print(f"Prompt:\n{prompt['prompt']}\nScore: {prompt['avg_score']}\n\n")
    
        logs.write("\n\n")

Iteration 0: SummaryRating.VERY_GOOD
Iteration 1: SummaryRating.VERY_GOOD
Iteration 2: SummaryRating.GOOD
Iteration 3: SummaryRating.GOOD
Iteration 4: SummaryRating.GOOD
Iteration 5: SummaryRating.GOOD
Prompt:

You are an expert SAT tutor analyzing a student's mistake on an SAT question. You are provided with:

1.  The SAT question, including the text, answer choices, and the correct answer.
2.  The official rationale for the correct answer and why the other choices are incorrect.
3.  The student's chosen answer and their rationale for choosing that answer.
4.  An argument contrasting the user's answer with the correct answer.

Based on this information, identify the potential thinking processes that could have led the student to their incorrect answer, as well as the thinking process for the correct answer. Focus on common SAT error patterns (e.g., misreading the question, applying incorrect grammar rules, making unwarranted assumptions, etc.).

Output a JSON array of objects. Each ob

## Future Direction
Fine-tuning may be necessary once we have some data on what "good" and "bad" output looks like. Reinforcement Learning with Human Feedback (RLHF) may be necessary to complement the AI evaluator here.

# TODO
Synthesize the prompt:

In [25]:
from google.genai.types import GenerateContentConfig

response = client.models.generate_content(
    model="gemini-2.0-flash-001",
    # no specs yet
    contents=f"{question['question']} {question['rationale']}\n The user got: {question['user_answer']}\n {question['user_rationale']}",
    config=GenerateContentConfig(
        system_instruction=[
            "You are an SAT expert tutor. Analyze questions using the official SAT framework. ",
            "Help students by:\n",
            "1. Identifying question type and skills tested\n",
            "2. Explaining why answers are correct/incorrect\n",
            "3. Providing actionable improvement strategies\n",
            "Use formal but friendly language. Reference the SAT specs when possible.",
        ]
    ),
)


Markdown(response.text)

Okay, let's break down this SAT writing question and address your concern.

**1. Question Type and Skills Tested**

*   **Category:** Standard English Conventions
*   **Subcategory:** Sentence Structure
*   **Specific Skill:** Modifier Placement (Avoiding Dangling Modifiers)

The question tests your ability to ensure that modifying phrases clearly and logically refer to the correct subject in the sentence. According to the College Board, being able to correct errors in sentence structure is a key skill for success in the SAT writing section.

**2. Why Choice A is Correct**

*   **Clarity and Logic:** The phrase "Upon recovering two years later" is a modifying phrase. It describes *who* recovered. Choice A, "Henry resumed his reign," places "Henry" immediately after the modifying phrase. This makes it clear that *Henry* is the one who recovered.

**3. Why Other Choices are Incorrect**

*   **Dangling Modifiers:** Choices B, C, and D all create dangling modifiers. A dangling modifier is a phrase that doesn't clearly and logically attach to the subject it's intended to modify.

    *   **Choice B:** "the reign of Henry resumed" - This implies the *reign* recovered, which doesn't make sense.
    *   **Choice C:** "Henry's reign resumed" - Same as B, the *reign* cannot recover.
    *   **Choice D:** "it was Henry who resumed his reign" - The word "it" is placed immediately after the introductory phrase, illogically suggesting that "it" recovered two years later.

**4. Addressing Your Concern**

You said: "Isn't the subject that Henry got unfit to rule?"

You're right that Henry was initially unfit to rule. However, the sentence is structured with the phrase "Upon recovering two years later..." This phrase *must* logically modify the subject that comes immediately after it. The sentence is about what happened *after* Henry's illness, specifically, what happened when he got better.

**5. Actionable Improvement Strategies**

*   **Practice Identifying Modifiers:** Pay close attention to phrases that describe actions or states (like "Upon recovering...").
*   **Check for Logical Connection:** After placing the subject, ask yourself, "Does it make sense for *this subject* to be doing the action described in the modifying phrase?"
*   **Read the Sentence Aloud:** Sometimes, reading the sentence aloud can help you hear awkward phrasing or dangling modifiers.
*   **Targeted Practice:** Do practice questions specifically focused on modifier placement. Review explanations carefully to understand why certain placements create errors.

**In Summary**

The SAT writing section is very precise. Even if you understand the general idea of the sentence, you need to make sure the grammar and sentence structure are perfectly correct. In this case, the key was recognizing the modifying phrase and ensuring it logically connected to the correct subject ("Henry").


Perhaps the model will be more effective with FAISS vector database searches for the PDF.

# Parse PDF
The following code uses lossy conversion to turn the PDF into readable text.

Information like tables and images will be lost in the process, as demonstrated by the following snippet.

In [26]:
from PyPDF2 import PdfReader
from io import BytesIO
import requests

url = "https://www.w3.org/WAI/WCAG20/Techniques/working-examples/PDF20/table.pdf"
response = requests.get(url)
pdf_bytes = BytesIO(response.content)

text = ""
pdf_reader = PdfReader(pdf_bytes)
for page in pdf_reader.pages:
    text += page.extract_text()

print(text[0:1000])

Example table  
This is an example of a data table. 
Disability 
Category Participants  Ballots 
Completed  Ballots 
Incomplete/  
Terminated  Results  
Accuracy  Time to 
complete 
Blind  5 1 4 34.5%, n=1  1199 sec, n=1  
Low Vision  5 2 3 98.3% n=2  
(97.7%, n=3)  1716 sec, n=3  
(1934 sec, n=2)  
Dexterity  5 4 1 98.3%, n=4  1672.1 sec, n=4  
Mobility  3 3 0 95.4%, n=3  1416 sec, n=3  
 


In [27]:
# Download the PDF using requests
url = "https://satsuite.collegeboard.org/media/pdf/assessment-framework-for-digital-sat-suite.pdf"

response = requests.get(url)
pdf_bytes = BytesIO(response.content)

print(type(pdf_bytes))

specs_text = ""
pdf_reader = PdfReader(pdf_bytes)
for page in pdf_reader.pages:
    specs_text += page.extract_text()

print(specs_text[0:1000])

<class '_io.BytesIO'>
Assessment Framework 
for the Digital SAT® SuiteAssessment Framework 
for the Digital SAT® Suite
Version 3.01, August 2024
About College Board
College Board reaches more than 7 million students a year, helping them 
navigate the path from high school to college and career. Our not-for-
profit membership organization was founded more than 120 years ago. 
We pioneered programs like the SAT® and AP® to expand opportunities 
for students and help them develop the skills they need. Our BigFuture® 
program helps students plan for college, pay for college, and explore 
careers. Learn more at cb.org .
Suggested Citation:  College Board. 2024. Assessment Framework for the 
Digital SAT Suite , version 3.01 (August 2024). New Y ork: College Board.
© 2024 College Board. College Board, Advanced Placement, AP , BigFuture, Landscape, Pre-AP , SAT, and 
the acorn logo are registered trademarks of College Board. AP Potential, Bluebook, Connections, PSAT, 
Skills Insight, Student S

This text data might be too big for the model to contain within the prompt.

In [28]:
client.models.count_tokens(
    model=model, contents=specs_text
).total_tokens

153273

That's a pretty significant size! The only way to incorporate the PDF properly is to chunk it.

# Chunk the Parsed PDF
In order to use the PDF, we need to "chunk" it so bits of relevant information can be accessed at a time. In order to maximize efficiency, a vector search database will be used, as the likelihood that any key words in the question will appear on the SAT is effectively zero. 

In [29]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=1024,
    chunk_overlap=102,
    length_function=len,
    is_separator_regex=False,
)

chunked_specs_text = text_splitter.split_text(specs_text)

Check out the chunks:

In [30]:
print(len(chunked_specs_text))
print(chunked_specs_text[0])

702
Assessment Framework 
for the Digital SAT® SuiteAssessment Framework 
for the Digital SAT® Suite
Version 3.01, August 2024
About College Board
College Board reaches more than 7 million students a year, helping them 
navigate the path from high school to college and career. Our not-for-
profit membership organization was founded more than 120 years ago. 
We pioneered programs like the SAT® and AP® to expand opportunities 
for students and help them develop the skills they need. Our BigFuture® 
program helps students plan for college, pay for college, and explore 
careers. Learn more at cb.org .
Suggested Citation:  College Board. 2024. Assessment Framework for the 
Digital SAT Suite , version 3.01 (August 2024). New Y ork: College Board.
© 2024 College Board. College Board, Advanced Placement, AP , BigFuture, Landscape, Pre-AP , SAT, and 
the acorn logo are registered trademarks of College Board. AP Potential, Bluebook, Connections, PSAT,


# Initialize ReAct agent
The model will instead be a ReAct agent and figure out what to search up and call that tool as an extension and we will see what happens

We will start by finding a gemini model for embedding.

In [31]:
for m in client.models.list():
    if "embedContent" in m.supported_actions:
        print(m.name)

models/embedding-001
models/text-embedding-004
models/gemini-embedding-exp-03-07
models/gemini-embedding-exp


Set up ChromaDB. We will use `text-embedding-004` to encode the "document" chunks into vectors.

In [32]:
from chromadb import Documents, EmbeddingFunction, Embeddings
from google.api_core import retry

from google.genai import types


class GeminiEmbeddingFunction(EmbeddingFunction):
    # Specify whether to generate embeddings for documents, or queries
    document_mode = True

    @retry.Retry(predicate=is_retriable)
    def __call__(self, input: Documents) -> Embeddings:
        if self.document_mode:
            embedding_task = "retrieval_document"
        else:
            embedding_task = "retrieval_query"

        response = client.models.embed_content(
            model="models/text-embedding-004",
            contents=input,
            config=types.EmbedContentConfig(
                task_type=embedding_task,
            ),
        )
        return [e.values for e in response.embeddings]

Now let's create a Chroma database client and add the document embeddings

In [33]:
import chromadb

SPECS_DB_NAME = "specs"

embed_fn = GeminiEmbeddingFunction()
embed_fn.document_mode = True

chroma_client = chromadb.Client()
#print(chroma_client.list_collections())
db = chroma_client.get_or_create_collection(name=SPECS_DB_NAME, embedding_function=embed_fn)
for i in range(int(len(chunked_specs_text) / 100)):
    db.add(documents=chunked_specs_text[i:100+i], ids=[str(i + j) for j in range(100)])

Check out the database

In [34]:
db.count()
db.peek(1)

{'ids': ['0'],
 'embeddings': array([[ 5.80166355e-02,  4.36070710e-02, -5.12756817e-02,
         -1.44314906e-03,  2.73717772e-02,  4.12952006e-02,
          5.71677238e-02,  7.64632318e-03,  9.56675038e-03,
         -5.80519577e-03, -4.39782254e-02,  9.93977953e-03,
          3.03114597e-02,  3.39544937e-02, -7.10375309e-02,
          1.01664122e-02,  2.03551948e-02,  5.44842966e-02,
         -1.12781808e-01, -2.91494676e-03, -2.88882591e-02,
         -3.38051245e-02, -1.49710365e-02, -1.39160426e-02,
         -1.64779034e-02,  7.98442122e-03,  7.00960010e-02,
          1.01699783e-02,  1.97551977e-02, -3.44269574e-02,
          2.98777758e-03,  9.17043630e-03, -2.51114983e-02,
         -8.72381590e-03,  7.50523322e-05,  3.42415981e-02,
         -2.42317207e-02, -3.20444927e-02,  7.41561651e-02,
         -4.53105606e-02, -4.95616272e-02, -4.72334400e-03,
         -1.52662601e-02, -1.15578976e-02, -2.57304776e-02,
          5.48221776e-03,  6.22319020e-02,  5.05198240e-02,
         -1

Let's look for *Information and Ideas* and see what relevant information the vector database finds on it.

In [35]:
# Switch to query mode when generating embeddings.
embed_fn.document_mode = False

# Search the Chroma DB using the specified query.
query = "What is the weighting of Information and Ideas?"

result = db.query(query_texts=[query], n_results=1)
[all_passages] = result["documents"]


Markdown(all_passages[0])

4.1.8. Word Count  ..................................................................................................................... 79
4.1.9. Informational Graphics  ............................................................................................... 79
4.1.10. T ext Complexity  .......................................................................................................... 79
4.2 Definitions  ..................................................................................................................................... 79
4.2.1. Construct  ......................................................................................................................... 79
4.2.2. Claims  ................................................................................................................................ 80
4.3 Content Domain Structure  ...................................................................................................... 81

Now let's look for something more generic, like scientific vocabulary or weighting of passages.

In [36]:
# Switch to query mode when generating embeddings.
embed_fn.document_mode = False

# Search the Chroma DB using the specified query.
query = "What scientific stuff are we tested on?"

result = db.query(query_texts=[query], n_results=2)
[all_passages] = result["documents"]

Markdown(all_passages[0])

standardized testing as well as many families, educators, and policymakers have 
raised concerns about the extent to which U.S. students are tested as part of 
K–12 education. Polling has suggested that the public’s doubts about the value of 
standardized testing in schools have grown over time, and the necessary relaxation 
of federal testing requirements under the successor Every Student Succeeds Act 
during the 2019–2020 and, to a lesser extent, 2020–2021 pandemic years has 
further contributed to those doubts. (See Bruno and Goldhaber 2021 for a brief 
recent overview.)A Living 
Document
This release of the Assessment 
Framework for the Digital SAT 
Suite  includes authoritative, 
up-to-date information about 
the digital suite. As College 
Board continues to research and 
implement the tests, updates 
will be made to this document 
(and disseminated through other 
means, such as our website, 
sat.org/digital ) to ensure that 
readers have as complete and 
accurate a picture as possible.

The information captured by the embedding model is not relevant to the question. Perhaps more results solve the problem?

In [37]:
result = db.query(query_texts=[query], n_results=2)
[all_passages] = result["documents"]

[print(passage) for passage in all_passages]

standardized testing as well as many families, educators, and policymakers have 
raised concerns about the extent to which U.S. students are tested as part of 
K–12 education. Polling has suggested that the public’s doubts about the value of 
standardized testing in schools have grown over time, and the necessary relaxation 
of federal testing requirements under the successor Every Student Succeeds Act 
during the 2019–2020 and, to a lesser extent, 2020–2021 pandemic years has 
further contributed to those doubts. (See Bruno and Goldhaber 2021 for a brief 
recent overview.)A Living 
Document
This release of the Assessment 
Framework for the Digital SAT 
Suite  includes authoritative, 
up-to-date information about 
the digital suite. As College 
Board continues to research and 
implement the tests, updates 
will be made to this document 
(and disseminated through other 
means, such as our website, 
sat.org/digital ) to ensure that 
readers have as complete and 
accurate a picture as pos

[None, None]

There are unnecessary details in the original PDF. "Garbage in, garbage out" 
The solution is perhaps to distill the relevant information from the PDF into a text document.

The below code distills the PDF.

In [38]:
# TODO

do some mathplotlib thingies