In [3]:
import torch
import os
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import train_test_split
from openai import OpenAI

In [4]:
import pandas as pd

# Read the JSONL file into a DataFrame
processed_test = pd.read_json("processed_test.jsonl", lines=True)

# Display the first few rows
print(processed_test.head())


                                            question  \
0  What do these two changes have in common?\ncom...   
1  Using only these supplies, which question can ...   
2  Is the following trait inherited or acquired?\...   
3  What do these two changes have in common?\nbut...   
4                                  Select the plant.   

                                             choices  answer             task  \
0  [Both are caused by cooling., Both are only ph...       3  multiple choice   
1  [Does a pie crust made with white flour burn m...       0  multiple choice   
2                              [acquired, inherited]       0  multiple choice   
3  [Both are caused by cooling., Both are only ph...       1  multiple choice   
4  [Oak trees can have thick branches., Orcas swi...       0  multiple choice   

   grade                              topic                      category  \
0      4                          chemistry  Physical and chemical change   
1      8  science-and-

In [5]:
processed_test.head()

Unnamed: 0,question,choices,answer,task,grade,topic,category,skill,Bloom's Taxonomy,DOK Level
0,What do these two changes have in common?\ncom...,"[Both are caused by cooling., Both are only ph...",3,multiple choice,4,chemistry,Physical and chemical change,Compare physical and chemical changes,analyze,2
1,"Using only these supplies, which question can ...",[Does a pie crust made with white flour burn m...,0,multiple choice,8,science-and-engineering-practices,Designing experiments,Identify questions that can be investigated wi...,create,3
2,Is the following trait inherited or acquired?\...,"[acquired, inherited]",0,multiple choice,4,biology,Traits and heredity,Identify inherited and acquired traits,understand,2
3,What do these two changes have in common?\nbut...,"[Both are caused by cooling., Both are only ph...",1,multiple choice,3,chemistry,Physical and chemical change,Compare physical and chemical changes,analyze,2
4,Select the plant.,"[Oak trees can have thick branches., Orcas swi...",0,multiple choice,2,biology,Classification,Identify plants and animals,remember,1


In [6]:
processed_test["input"] = processed_test.apply(
    lambda row: f"Generate a science question and its answer using the following metadata: grade: {row['grade']}, task: {row['task']}, topic: {row['topic']}, category: {row['category']}, skill: {row['skill']}, bloom's taxonomy: {row["Bloom's Taxonomy"]}, dok:{row['DOK Level']}",
    axis=1
)

In [7]:
processed_test["output"] = processed_test.apply(
    lambda row: f"Question: {row['question']},\n Choices: \n{row['choices']},\n Answer: \n{row['answer']}",
    axis=1
)

In [8]:
system_info = {
    "role": "You are an expert science assessment specialist that generates science questions based on specified metadata provided by the user. Your role is to ensure that the generated questions are of high quality, align with the intended learning objectives, and adhere to scientifically rigorous standards."
    }

In [9]:
system_info_test = {
    "task_description": "Improve low-scoring questions using high-performing questions as a guide. Given metadata about a low-scoring question, regenerate a new question that aligns with the high-scoring examples.",
    
    "high_performing_examples": [
        {
            "question": "Which of these changes is a physical change?",
            "choices": ["a banana turning brown", "a candle melting", "a piece of metal rusting"],
            "answer": "a candle melting",
            "explanation": "The question aligns well with learning outcomes related to distinguishing physical and chemical changes, which is a common topic in middle school science curriculums. The question is appropriate for middle school science, fitting well with curriculum goals focused on teaching basic concepts of physical and chemical changes. The question and the answer provided are accurate. A candle melting is indeed a physical change, while the other options represent chemical changes. The question stem is clear, concise, and free from unnecessary complexity, making it easy for students to understand what is being asked. The distractors are plausible and meaningful as they represent common misconceptions about physical vs. chemical changes. The answer choices are free from cultural, gender, or regional biases, being strictly scientific. The answer choices are balanced in length, structure, and complexity, maintaining fairness in the response options. The language used in the question is grammatically correct and suitable for the intended audience. The vocabulary is appropriate for middle school students and is clear for the intended learner demographic. The question formatting could be improved by providing each option on a separate line to enhance readability, although it is generally clear. The question avoids common errors such as unclear distractors and the use of options like 'All of the above'. The question has a single, unambiguous correct answer, promoting clarity and accuracy in assessment. The content of the question is free from stereotypes, cultural insensitivity, or potentially offensive material. The question is fair and accessible to a diverse learner group, without any biased content or language. The question appears to be unique and not a rephrased version of an existing one. While the question introduces a common topic, it doesn't present a particularly novel or fresh perspective but is adequately engaging for its purpose. The question appropriately reflects the 'Recall' level of Depth of Knowledge (DOK), suitable for identifying physical changes. The question aligns with the 'Understand' level of Bloom's Taxonomy, requiring students to comprehend and distinguish between types of changes."
        },
        {
            "question": "Which of these changes is a chemical change?",
            "choices": ["A banana is cut into pieces.", "A candle is lit.", "A piece of iron is heated in a fire."],
            "answer": "A candle is lit.",
            "explanation": "The question aligns well with common science learning outcomes related to understanding chemical changes, which is a fundamental concept in chemistry education. The question is relevant to middle school or early high school science curricula, where students learn to distinguish between physical and chemical changes. The content of the question and the correct answer choice are accurate. Lighting a candle involves a chemical reaction (combustion), which is a chemical change. The question stem is clear and concise, asking specifically about chemical changes, without any unnecessary complexity. The distractors are plausible in the context. Cutting a banana and heating iron involve physical changes, which makes them clearly incorrect but reasonable choices. The answer choices are free from any cultural, gender, or regional biases. All answer choices are balanced in terms of length and complexity, each describing a simple scientific action. The language used is grammatically correct and suitable for the intended audience. The vocabulary is appropriate for the grade level, using simple language that is accessible to students at the target educational level. The formatting of the question and the options is clear and consistent, making it easy to read and understand. The question avoids common errors such as unclear distractors or inappropriate options like 'All of the above.' The question has a single, unambiguous correct answer, with only one option involving a chemical change. The question is free from stereotypes, cultural insensitivity, or potentially offensive material. The question is fair and accessible, accommodating diverse learner groups without requiring any specific background knowledge beyond the curriculum. While the question is likely unique and not a direct copy, it is based on a common educational scenario, which might lack a sense of novelty. The topic of distinguishing between physical and chemical changes is standard in educational contexts, providing little in terms of novelty or a fresh perspective. The question reflects the appropriate Depth of Knowledge level by requiring students to distinguish between types of changes based on their understanding, rather than just recall. The question aligns with the -Understand- level of Bloom's Taxonomy, requiring students to comprehend and apply their knowledge about physical and chemical changes."
        },
        {
            "question": "Select the animal that is an invertebrate.",
            "choices": ["a jellyfish", "a rabbit", "a robin"],
            "answer": "a jellyfish",
            "explanation": "The question aligns well with biology topics commonly included in learning outcomes for understanding animal classifications. The question is suitable for middle school life sciences curricula, which cover vertebrates and invertebrates. The content of the question is accurate; jellyfish are indeed invertebrates while rabbits and robins are not. The question stem is concise and clearly asks for the selection of an invertebrate from the options provided. The distractors (rabbit and robin) are plausible since they are commonly known animals but are clearly incorrect as invertebrates. The answer choices are free from cultural, gender, or regional biases and are purely focused on biological classification. The answer choices are balanced in terms of length and complexity, each being a single common animal name. While generally correct, there's a minor inconsistency in format due to lack of 'a' before 'robin' in the presentation of options in the list. The vocabulary is appropriate for the intended grade level and demographic, using simple animal names. The formatting is mostly clear, but there's slight inconsistency in option presentation ('a robin' lacks formatted article). The question avoids common errors like unclear distractors or using options like 'All of the above.' There is a single, unambiguous correct answer (a jellyfish), meeting the single answer requirement. The question is free from stereotypes, cultural insensitivity, or potentially offensive material, focusing solely on factual biological classification. The question is fair and accessible, presented equally to all students without any unintentional exclusionary language or complexity. The question is unique and not obviously a rephrased version of a common existing question. While the topic itself is standard, the inclusion of a jellyfish provides a slightly fresher aspect than if using common textbook examples like worms or insects. This question reflects an appropriate DOK level of Recall, as it requires basic knowledge of animal classifications. It aligns with the 'Remember' level of Bloom's Taxonomy, asking students to recall factual information."
        },
        {
            "question": "Which object is moving faster?",
            "choices": ["a horse running at 10 m/s", "a car driving at 5 m/s"],
            "answer": "a horse running at 10 m/s",
            "explanation": "The question is relevant and aligns with learning outcomes related to understanding speed and comparisons, a fundamental concept in physics for many grade levels. This question fits well within the typical curriculum goals for middle school science or math, where speed and comparisons of different objects are often discussed. The question is accurate, clearly comparing the speeds of two different moving objects without any ambiguity. The question stem is simple and straightforward, making it easy for students to understand what is being asked without unnecessary complexity. The distractors are plausible. Both options represent realistic speeds for their respective objects, although one is clearly faster. The answer choices are free from cultural, gender, or regional biases. They are neutral in nature. The two choices are balanced in terms of length, structure, and complexity, facilitating an unbiased comparison. The language used in the question and answer choices is grammatically correct and suitable for the target audience. The vocabulary and sentence structure are appropriate for the intended grade level and learner demographic. The formatting is clear and consistent, with no distractions or errors in presentation. The question avoids common errors, such as unclear distractors or inappropriate use of options like 'All of the above.' The question has a single, unambiguous correct answer as intended, given the higher speed of the horse. The question is free from stereotypes, cultural insensitivity, or potentially offensive material. The question ensures fairness and accessibility, not disadvantaged by language or cultural factors for diverse learner groups. The question is unique and not simply a rephrased version of an existing question, presenting a fresh scenario. While the concept of speed comparison is standard, framing it with a horse and a car offers a fresh, engaging perspective to students. The question reflects a Recall DOK level, suitable for its context but not challenging strategic or extended thinking. The question aligns with the 'Understand' level, requiring students to comprehend and compare different speeds."
        },
        {
            "question": "Which material has the most thermal energy?",
            "choices": ["a cup of water at 80°F", "a cup of water at 90°F", "a cup of water at 100°F"],
            "answer": "a cup of water at 100°F",
            "explanation": "The question aligns well with the learning outcomes focused on understanding thermal energy, which is a fundamental concept in physics and science education. The question is appropriate for middle or high school science curricula where the concept of thermal energy is introduced and explored. The content is accurate; it correctly identifies temperature as a measure of thermal energy without ambiguity. The question stem is clear and concise, asking specifically about thermal energy in a straightforward manner. The distractors are plausible, as all options represent common temperatures for water and are slightly varied, making them good distractors for deeper thinking. The answer choices do not exhibit cultural, gender, or regional biases; they are based purely on scientific measurement. The answer choices are balanced as they all have the same structure and ask the same type of question with different temperature values. The language is grammatically correct and appropriate for the intended audience. The vocabulary and concepts used are appropriate for the grade level, assuming the learners have basic exposure to the topic. The formatting of the question is clear and consistent with typical multiple-choice layout, avoiding distractions. The question avoids common errors such as unclear distractors and does not utilize problematic options like 'All of the above'. There is a single, unambiguous correct answer, which increases the reliability of the assessment. The content is free from stereotypes, cultural insensitivity, and potentially offensive material. The question is fair and accessible to diverse learner groups; understanding thermal energy is a universal concept. The question is unique and not a rephrased version of existing standard questions; it asks a novel implementation question within a common topic. The question introduces a novel perspective by comparing water temperatures, encouraging the application of thermal energy knowledge. The question reflects a basic recall level rather than higher-order thinking skills, suitable for this concept but not extending into application or analysis. The question aligns with the 'Remember' level of Bloom's Taxonomy, requiring recall rather than deeper understanding or application."
        }
    ],
    
    "low_scoring_examples": [
        {
            "question": "Based on the information provided",
            "choices": ["When the salt is dissolved in water.", "When the salt is stirred in the water.", "When the salt is heated in a pan."],
            "answer": "When the salt is stirred in the water.",
            "issues": "The question lacks sufficient context, making it challenging to determine alignment with learning outcomes or educational standards. Without specific subject or grade level context, it's unclear how this question fits within the curriculum goals. The answer choices lack context, causing uncertainty about the accuracy without any clear information provided. The question stem is missing, leading to confusion since the stem is required for clarity. Two distractors are moderately plausible, but without context, it's difficult to assess their meaningfulness fully. The choices appear free from obvious cultural, gender, or regional biases. There is some balance in the structure of choices, but one option is slightly more detailed ('heated in a pan') compared to others. The language and syntax are correct and appropriate for the intended audience. The vocabulary is simple, but due to missing context, readability's suitability for a particular grade level cannot be fully assessed. The formatting is mostly clear; however, the question format could be more standardized. The absence of context affects the functionality, increasing the potential for errors like ambiguity. Without a clear question, it's difficult to determine if there's a singular correct answer based on the choice provided. The content appears free from stereotypes and cultural insensitivity. Without context, it's challenging to ensure fairness or accessibility for diverse learners. The question seems unique, though verifying its originality is challenging without additional context. Due to missing context, there is no opportunity to assess if the question provides novelty or a fresh perspective. Insufficient context makes it impossible to determine the depth of knowledge required for this question. Without knowing the question's intent or context, it's unfeasible to align it with a level of Bloom's Taxonomy."
        },
        {
            "question": "What is the inherited trait that this rabbit has?",
            "choices": ["The rabbit has white fur.", "The rabbit has a long body."],
            "answer": "The rabbit has white fur.",
            "issues": "The question is somewhat relevant to the genetics topic, which might align with learning outcomes related to inherited traits. However, it lacks specificity regarding the genetic concepts intended to be tested. The question can fit a biology curriculum discussing inheritance at a basic level, but the grade level appropriateness is unclear due to its simplicity. The question's wording is somewhat ambiguous. It's unclear if 'white fur' or 'long body' are genetically inherited or if this is the main educational point being tested. The question stem is clear, but more context would improve understanding. It lacks clarity regarding what genetic information is being questioned. The distractors are plausible as both options are features a rabbit might have, and one is clearly emphasized over the other. The answer choices are free from cultural, gender, or regional biases, focusing solely on physical traits. Both answer choices are balanced in length and structure, maintaining uniformity in presentation. The grammar and syntax are mostly correct, but the structure of the choices could be improved by using a phrase such as 'because of' to indicate causation. The vocabulary used is simple and appropriate for most educational levels concerned with genetics in a basic science class. The formatting is mostly clear, but listing the choices with a clearer distinction or numbering could improve readability. The question largely avoids common errors but presents ambiguity about exact inherited traits versus environmental factors. The question attempts a single correct answer, but the lack of context makes it unclear how the students are to deduce this. Content is free of stereotypes, cultural insensitivity, and offensive material. While the question is generally fair, the lack of additional context could disadvantage students without prior detailed knowledge of specific rabbit traits. There is no evidence of direct copying, but the question lacks a unique spin, potentially resembling basic textbook examples. This question lacks any novel or unique approach and is a basic presentation of a potential trait inheritance scenario. The question targets a low Depth of Knowledge level, more about recall with no deeper understanding or application of concepts. The question aligns with the 'Remember' level of Bloom's Taxonomy but lacks depth for a higher cognitive task."
        },
        {
            "question": "What happens to the sugar?",
            "choices": ["It is a physical change.", "It is a chemical change."],
            "answer": "It is a chemical change.",
            "issues": "The question is moderately aligned with common learning outcomes that discuss physical and chemical changes, but lacks context or a specific scenario to fully demonstrate its relevance. The question fits within science curricula that address changes in matter, likely appropriate for late elementary or middle school, but without a clear grade-level context. The question is overly simplistic and ambiguous, as it doesn't specify the context or process involving sugar (e.g., dissolving in water, burning), making it difficult to determine the correct answer. The question stem is simple but lacks context, which reduces clarity. More specific phrasing could make it clearer. Both distractors are plausible explanations for changes involving sugar, given the right context. The answer choices are free from cultural, gender, or regional biases. Both answer choices are balanced in terms of length and structure. The language used in the question and answer choices is grammatically correct and suitable for students. The vocabulary is appropriate for a general science audience and likely fits the intended grade level. The formatting is straightforward but includes a redundant listing of answer choices, which could be streamlined for clarity. The question fails to provide enough context to avoid ambiguity, a common error in question design. Without context, it is unclear what the single correct answer is, as sugar can undergo both physical and chemical changes in different scenarios. The content of the question is free from stereotypes or cultural insensitivity. The lack of context could make it confusing for some students, potentially impacting fairness. The question appears to be original and not a direct rephrase of an existing one. The question lacks novelty or a fresh perspective, as it doesn't present a specific scenario involving sugar. The question lacks depth, as it does not engage students beyond basic recall due to the lack of context. The question aligns with the 'Remember' level of Bloom's Taxonomy but does not engage higher levels of thinking due to its simplicity and lack of context."
        }
    ],

    "instruction": "Now, generate a new question based on the metadata and the high-scoring examples above while avoiding the mistakes seen in the low-scoring examples."
}



In [10]:
# Convert the system info to a formatted string
system_info_str = json.dumps(system_info_test, indent=2)

processed_test["input"] = processed_test.apply(
    lambda row: f"""{system_info_str}

 metadata: grade: {row['grade']}, task: {row['task']}, topic: {row['topic']}, category: {row['category']}, skill: {row['skill']}, bloom's taxonomy: {row["Bloom's Taxonomy"]}, dok: {row['DOK Level']}""",
    axis=1
)

In [11]:
# Assigning the dictionary to the DataFrame
processed_test["instruction"] = str(system_info)

In [12]:
# Drop unnecessary columns
columns_to_keep = ["input", "output", "instruction"]
processed_test = processed_test[columns_to_keep]

In [13]:
processed_test.head()

Unnamed: 0,input,output,instruction
0,"{\n ""task_description"": ""Improve low-scoring ...",Question: What do these two changes have in co...,{'role': 'You are an expert science assessment...
1,"{\n ""task_description"": ""Improve low-scoring ...","Question: Using only these supplies, which que...",{'role': 'You are an expert science assessment...
2,"{\n ""task_description"": ""Improve low-scoring ...",Question: Is the following trait inherited or ...,{'role': 'You are an expert science assessment...
3,"{\n ""task_description"": ""Improve low-scoring ...",Question: What do these two changes have in co...,{'role': 'You are an expert science assessment...
4,"{\n ""task_description"": ""Improve low-scoring ...","Question: Select the plant.,\n Choices: \n['Oa...",{'role': 'You are an expert science assessment...


In [14]:
# Function to format a row into the desired JSON structure
def format_row(row):
    messages = [
        {"role": "instruction", "content": row["instruction"]},
        {"role": "input", "content": row["input"]},
        {"role": "output", "content": row["output"]}
    ]
    return {"messages": messages}

# Function to format and save a DataFrame as a JSONL file
def format_and_save(df, output_file):
    formatted_data = df.apply(format_row, axis=1).tolist()
    with open(output_file, "w", encoding="utf-8") as f:
        for entry in formatted_data:
            json.dump(entry, f, ensure_ascii=False)
            f.write("\n")
    print(f"Data saved to {output_file}")

In [15]:
format_and_save(processed_test, "few_shot_test.jsonl")

Data saved to few_shot_test.jsonl
