In [1]:
%pip install pandas fsspec huggingface_hub accelerate peft bitsandbytes transformers trl

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import torch
import os
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import train_test_split
from openai import OpenAI

In [None]:
from huggingface_hub import login

# Log in to Hugging Face. The token is read from the HF_TOKEN environment variable
# so that no credential is stored in the notebook; an unset or empty variable is
# passed on as None.
login(token=os.environ.get("HF_TOKEN") or None)


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Parquet file backing each dataset split
splits = {
    'train': 'data/train-00000-of-00001-a31a321e61f8f5b1.parquet',
    'validation': 'data/validation-00000-of-00001-0d4b3eeedc9eb301.parquet',
    'test': 'data/test-00000-of-00001-60817eb9b00359a0.parquet'
}

# All three files share the same dataset location prefix
dataset_root = "hf://datasets/tasksource/ScienceQA_text_only/"

# Read the train, validation and test splits (in that order) into DataFrames
train_df, validation_df, test_df = (
    pd.read_parquet(dataset_root + splits[split_name])
    for split_name in ("train", "validation", "test")
)


In [5]:
# Preview the first rows of the freshly loaded training split
train_df.head()

Unnamed: 0,question,choices,answer,hint,task,grade,subject,topic,category,skill,lecture,solution
0,Which tense does the sentence use?\nMona will ...,"[present tense, future tense, past tense]",1,,closed choice,grade2,language science,verbs,Verb tense,"Is the sentence in the past, present, or futur...",Present tense verbs tell you about something t...,The sentence is in future tense. You can tell ...
1,Complete the sentence.\nSewing an apron is a ().,"[chemical change, physical change]",1,,closed choice,grade4,natural science,chemistry,Physical and chemical change,Identify physical and chemical changes,Chemical changes and physical changes are two ...,Sewing an apron is a physical change. The fabr...
2,Which of the following contains a vague pronou...,[Steven's brother Jim wondered whether he ran ...,0,,closed choice,grade11,language science,writing-strategies,Pronouns,Identify vague pronoun references,"When writing, make sure to avoid vague pronoun...",The first answer choice contains a vague prono...
3,What do these two changes have in common?\ntea...,"[Both are only physical changes., Both are che...",0,,closed choice,grade4,natural science,chemistry,Physical and chemical change,Compare physical and chemical changes,Chemical changes and physical changes are two ...,Step 1: Think about each change.\nTearing a pi...
4,What is the volume of a large soup pot?,"[7 liters, 7 milliliters]",0,Select the better estimate.,closed choice,grade5,natural science,units-and-measurement,Units and measurement,Choose metric units of volume,Measurements are written with both a number an...,The better estimate for the volume of a large ...


In [6]:
# List the distinct values of the 'subject' column in the training split
train_df['subject'].unique()

array(['language science', 'natural science', 'social science'],
      dtype=object)

In [7]:
# List the distinct values of the 'topic' column in the training split
train_df['topic'].unique()

array(['verbs', 'chemistry', 'writing-strategies',
       'units-and-measurement', 'biology', 'physics',
       'figurative-language', 'reference-skills',
       'science-and-engineering-practices', 'phonological-awareness',
       'grammar', 'economics', 'earth-science', 'punctuation',
       'capitalization', 'world-history', 'us-history', 'civics',
       'vocabulary', 'reading-comprehension', 'word-study', 'culture',
       'pronouns', 'literacy-in-science', 'geography'], dtype=object)

In [8]:
# List the distinct values of the 'task' column in the training split
train_df['task'].unique()

array(['closed choice', 'yes or no', 'true-or false'], dtype=object)

In [9]:
# Topics whose rows are retained
topics_to_keep = ['chemistry', 'physics', 'biology', 'science-and-engineering-practices']

# Apply the same topic filter to every split, keeping an independent copy of each
train_df, validation_df, test_df = (
    frame[frame['topic'].isin(topics_to_keep)].copy()
    for frame in (train_df, validation_df, test_df)
)


In [10]:
# Turn empty or whitespace-only string cells into NaN in every split
train_df, validation_df, test_df = (
    frame.replace(r'^\s*$', np.nan, regex=True)
    for frame in (train_df, validation_df, test_df)
)

In [11]:
# Remove the 'subject' column from every split
train_df, validation_df, test_df = (
    frame.drop(columns=['subject'])
    for frame in (train_df, validation_df, test_df)
)

In [12]:
# Preview the training split after the topic filter and the 'subject' drop
train_df.head()

Unnamed: 0,question,choices,answer,hint,task,grade,topic,category,skill,lecture,solution
1,Complete the sentence.\nSewing an apron is a ().,"[chemical change, physical change]",1,,closed choice,grade4,chemistry,Physical and chemical change,Identify physical and chemical changes,Chemical changes and physical changes are two ...,Sewing an apron is a physical change. The fabr...
3,What do these two changes have in common?\ntea...,"[Both are only physical changes., Both are che...",0,,closed choice,grade4,chemistry,Physical and chemical change,Compare physical and chemical changes,Chemical changes and physical changes are two ...,Step 1: Think about each change.\nTearing a pi...
6,What information supports the conclusion that ...,[Logan's mother has blue eyes. She passed this...,0,Read the description of a trait.\nLogan has bl...,closed choice,grade4,biology,Traits and heredity,Inherited and acquired traits: use evidence to...,"Organisms, including people, have both inherit...",
7,Is the following trait inherited or acquired?\...,"[inherited, acquired]",1,Hint: Playing soccer takes practice.,closed choice,grade3,biology,Heredity,Identify inherited and acquired traits,"Organisms, including people, have both inherit...",People are not born knowing how to play soccer...
8,"Based on this information, what is Holly's phe...","[straight whiskers, curved whiskers]",0,"In a group of Syrian hamsters, some individual...",closed choice,grade8,biology,Genes to traits,Genetics vocabulary: dominant and recessive,All organisms have pieces of hereditary materi...,You need to determine Holly's phenotype for th...


In [13]:
# Convert labels such as "grade4" to the integer 4 in each split
for split_frame in (train_df, test_df, validation_df):
    grade_text = split_frame["grade"].str.replace("grade", "")
    split_frame["grade"] = grade_text.astype(int)

In [14]:
# Rename "closed" to "multiple" inside the task labels of each split
for split_frame in (train_df, test_df, validation_df):
    relabelled = split_frame["task"].str.replace("closed", "multiple")
    split_frame["task"] = relabelled.astype(str)

In [15]:
# Task labels whose rows are retained
tasks_to_keep = ['multiple choice']

# Apply the same task filter to every split, keeping an independent copy of each
train_df, validation_df, test_df = (
    frame[frame['task'].isin(tasks_to_keep)].copy()
    for frame in (train_df, validation_df, test_df)
)

In [16]:
# Drop the hint, solution and lecture columns from every split
train_df, test_df, validation_df = (
    frame.drop(columns=['hint', 'solution', 'lecture'])
    for frame in (train_df, test_df, validation_df)
)

In [17]:
# Preview the cleaned training split
train_df.head()

Unnamed: 0,question,choices,answer,task,grade,topic,category,skill
1,Complete the sentence.\nSewing an apron is a ().,"[chemical change, physical change]",1,multiple choice,4,chemistry,Physical and chemical change,Identify physical and chemical changes
3,What do these two changes have in common?\ntea...,"[Both are only physical changes., Both are che...",0,multiple choice,4,chemistry,Physical and chemical change,Compare physical and chemical changes
6,What information supports the conclusion that ...,[Logan's mother has blue eyes. She passed this...,0,multiple choice,4,biology,Traits and heredity,Inherited and acquired traits: use evidence to...
7,Is the following trait inherited or acquired?\...,"[inherited, acquired]",1,multiple choice,3,biology,Heredity,Identify inherited and acquired traits
8,"Based on this information, what is Holly's phe...","[straight whiskers, curved whiskers]",0,multiple choice,8,biology,Genes to traits,Genetics vocabulary: dominant and recessive


In [18]:
# Preview the cleaned test split
test_df.head()

Unnamed: 0,question,choices,answer,task,grade,topic,category,skill
5,What do these two changes have in common?\ncom...,"[Both are caused by cooling., Both are only ph...",3,multiple choice,4,chemistry,Physical and chemical change,Compare physical and chemical changes
7,"Using only these supplies, which question can ...",[Does a pie crust made with white flour burn m...,0,multiple choice,8,science-and-engineering-practices,Designing experiments,Identify questions that can be investigated wi...
8,Is the following trait inherited or acquired?\...,"[acquired, inherited]",0,multiple choice,4,biology,Traits and heredity,Identify inherited and acquired traits
11,What do these two changes have in common?\nbut...,"[Both are caused by cooling., Both are only ph...",1,multiple choice,3,chemistry,Physical and chemical change,Compare physical and chemical changes
12,Select the plant.,"[Oak trees can have thick branches., Orcas swi...",0,multiple choice,2,biology,Classification,Identify plants and animals


In [19]:
# Preview the cleaned validation split
validation_df.head()

Unnamed: 0,question,choices,answer,task,grade,topic,category,skill
3,"Based on this information, what is Daffodil's ...","[not having horns, having horns]",0,multiple choice,7,biology,Genes to traits,Genetics vocabulary: dominant and recessive
9,What information supports the conclusion that ...,"[Austen can cook food over a fire., Austen lea...",1,multiple choice,6,biology,Genes to traits,Inherited and acquired traits: use evidence to...
10,Which of the following is true about seeds?,[Seeds come in many shapes. But all seeds are ...,1,multiple choice,4,biology,Plants,Describe and construct flowering plant life cy...
12,Compare the motion of three ships. Which ship ...,[a ship that moved 555kilometers west in 10hou...,1,multiple choice,3,physics,Force and motion,Compare the speeds of moving objects
13,"Based on this information, what is Sasha's phe...","[Ff, short fur]",1,multiple choice,7,biology,Genes to traits,Genetics vocabulary: genotype and phenotype


In [20]:
# Print the index range, non-null counts and dtypes of the training split
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2342 entries, 1 to 6506
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   question  2342 non-null   object
 1   choices   2342 non-null   object
 2   answer    2342 non-null   int8  
 3   task      2342 non-null   object
 4   grade     2342 non-null   int32 
 5   topic     2342 non-null   object
 6   category  2342 non-null   object
 7   skill     2342 non-null   object
dtypes: int32(1), int8(1), object(6)
memory usage: 139.5+ KB


In [21]:
# Print the index range, non-null counts and dtypes of the test split
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 767 entries, 5 to 2220
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   question  767 non-null    object
 1   choices   767 non-null    object
 2   answer    767 non-null    int8  
 3   task      767 non-null    object
 4   grade     767 non-null    int32 
 5   topic     767 non-null    object
 6   category  767 non-null    object
 7   skill     767 non-null    object
dtypes: int32(1), int8(1), object(6)
memory usage: 45.7+ KB


In [22]:
# Print the index range, non-null counts and dtypes of the validation split
validation_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 820 entries, 3 to 2143
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   question  820 non-null    object
 1   choices   820 non-null    object
 2   answer    820 non-null    int8  
 3   task      820 non-null    object
 4   grade     820 non-null    int32 
 5   topic     820 non-null    object
 6   category  820 non-null    object
 7   skill     820 non-null    object
dtypes: int32(1), int8(1), object(6)
memory usage: 48.8+ KB


In [23]:
# Count the missing values in each column of the training split
nan_counts = train_df.isna().sum()
print("NaN counts in each column:", nan_counts, sep="\n")

NaN counts in each column:
question    0
choices     0
answer      0
task        0
grade       0
topic       0
category    0
skill       0
dtype: int64


In [24]:
# Count the missing values in each column of the validation split
nan_counts = validation_df.isna().sum()
print("NaN counts in each column:", nan_counts, sep="\n")

NaN counts in each column:
question    0
choices     0
answer      0
task        0
grade       0
topic       0
category    0
skill       0
dtype: int64


In [25]:
# Count the missing values in each column of the test split
nan_counts = test_df.isna().sum()
print("NaN counts in each column:", nan_counts, sep="\n")

NaN counts in each column:
question    0
choices     0
answer      0
task        0
grade       0
topic       0
category    0
skill       0
dtype: int64


In [26]:
# Confirm which topics remain in the training split after filtering
train_df['topic'].unique()

array(['chemistry', 'biology', 'physics',
       'science-and-engineering-practices'], dtype=object)

In [55]:
# List the distinct values of the 'category' column in the training split
train_df['category'].unique()

array(['Physical and chemical change', 'Traits and heredity', 'Heredity',
       'Genes to traits', 'States of matter', 'Classification', 'Animals',
       'Heat and thermal energy', 'Designing experiments',
       'Velocity, acceleration, and forces', 'Chemical reactions',
       'Kinetic and potential energy', 'Force and motion',
       'Thermal energy', 'Materials', 'Photosynthesis',
       'Atoms and molecules', 'Mixtures', 'Cells', 'Plants', 'Ecosystems',
       'Biochemistry', 'Traits', 'Adaptations'], dtype=object)

In [27]:
# Stratified sampling by the 'topic' column: draw 200 training rows and 80
# validation rows whose topic proportions follow the full splits.
# The sizes are given as integer train_size values so that exactly that many rows
# are selected; a float test_size of 1 - n / len(df) is subject to floating-point
# rounding and can come out one row short.
stratified_train, _ = train_test_split(
    train_df, train_size=200, stratify=train_df['topic'], random_state=42
)
stratified_train = stratified_train.reset_index(drop=True)
stratified_validation, _ = train_test_split(
    validation_df, train_size=80, stratify=validation_df['topic'], random_state=42
)
stratified_validation = stratified_validation.reset_index(drop=True)

In [28]:
# Print the index range, non-null counts and dtypes of the sampled training subset
stratified_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   question  200 non-null    object
 1   choices   200 non-null    object
 2   answer    200 non-null    int8  
 3   task      200 non-null    object
 4   grade     200 non-null    int32 
 5   topic     200 non-null    object
 6   category  200 non-null    object
 7   skill     200 non-null    object
dtypes: int32(1), int8(1), object(6)
memory usage: 10.5+ KB


In [29]:
# Preview the sampled training subset
stratified_train.head()

Unnamed: 0,question,choices,answer,task,grade,topic,category,skill
0,Which is the stickiest?,"[leather belt, gum, silk kimono]",1,multiple choice,4,physics,Materials,Compare properties of materials
1,Is the following trait inherited or acquired?\...,"[inherited, acquired]",0,multiple choice,4,biology,Traits and heredity,Identify inherited and acquired traits
2,"Based on this information, what is Admiral's g...","[a gray body, BB]",1,multiple choice,6,biology,Genes to traits,Genetics vocabulary: genotype and phenotype
3,Is the following trait inherited or acquired?\...,"[acquired, inherited]",1,multiple choice,5,biology,Traits and heredity,Identify inherited and acquired traits
4,What information supports the conclusion that ...,[Colin has two pet fish. The fish live in a fi...,1,multiple choice,3,biology,Heredity,Inherited and acquired traits: use evidence to...


In [30]:
# Preview the sampled validation subset
stratified_validation.head()

Unnamed: 0,question,choices,answer,task,grade,topic,category,skill
0,Complete the sentence.\nBurning food on a stov...,"[physical change, chemical change]",1,multiple choice,2,chemistry,Physical and chemical change,Identify physical and chemical changes
1,What information supports the conclusion that ...,"[Tim's mother speaks one language., Tim learne...",1,multiple choice,4,biology,Traits and heredity,Inherited and acquired traits: use evidence to...
2,"Based on this information, what is Deion's gen...","[not having sickle-cell disease, Aa]",1,multiple choice,8,biology,Genes to traits,Genetics vocabulary: genotype and phenotype
3,Which cherry pie has a higher temperature?,"[the cherry pie with more thermal energy, the ...",0,multiple choice,6,physics,Thermal energy,How are temperature and mass related to therma...
4,Compare the motion of three motorboats. Which ...,[a motorboat that moved 185kilometers west in ...,2,multiple choice,3,physics,Force and motion,Compare the speeds of moving objects


In [31]:
# Write each frame to a JSON Lines file, one record per line
for frame_to_export, jsonl_path in (
    (stratified_train, 'stratified_train.jsonl'),
    (stratified_validation, 'stratified_validation.jsonl'),
    (test_df, 'test_df_pre_blooms.jsonl'),
):
    frame_to_export.to_json(jsonl_path, orient='records', lines=True)

In [None]:
# Initialize the OpenAI client.
# The key is taken from the OPENAI_API_KEY environment variable so that no
# credential is stored in the notebook; an unset or empty variable is passed on
# as None.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY") or None
)

def _parse_bloom_dok(classification_text):
    """Extract ``(bloom_level, dok_level)`` from the model's free-text reply.

    Lines are matched case-insensitively against "Bloom's Taxonomy Level:" and
    "DOK Level:" after markdown emphasis characters are removed, so replies such
    as ``**DOK Level:** Level 2`` are still understood.

    Args:
        classification_text: Reply text, or None when the reply has no content.

    Returns:
        tuple: ``(bloom_level, dok_level)`` where ``bloom_level`` is a lowercase
        string ("unknown" when no Bloom line is found) and ``dok_level`` is an
        int in 1..4 (1 when no usable DOK value is found).
    """
    import re  # local import: `re` is not among the notebook's top-level imports

    known_levels = ("remember", "understand", "apply", "analyze", "evaluate", "create")
    level_pattern = r"\b(" + "|".join(known_levels) + r")\b"

    # Defaults used when the reply does not contain a usable value.
    bloom_level, dok_level = "unknown", 1

    for raw_line in (classification_text or "").splitlines():
        # Normalise typographic apostrophes and drop markdown emphasis markers.
        line = raw_line.replace("\u2019", "'")
        for mark in ("*", "_", "`"):
            line = line.replace(mark, "")

        bloom_match = re.search(r"bloom's taxonomy level\s*:\s*(.*)", line, flags=re.IGNORECASE)
        dok_match = re.search(r"dok level\s*:\s*(.*)", line, flags=re.IGNORECASE)

        if bloom_match:
            value = bloom_match.group(1).strip().lower()
            known = re.search(level_pattern, value)
            if known:
                bloom_level = known.group(1)
            elif value:
                # Unrecognised label: keep the cleaned text rather than discard it.
                bloom_level = value
        elif dok_match:
            # Accept the first standalone digit in 1..4, e.g. "2" or "Level 2".
            digit = re.search(r"(?<!\d)[1-4](?!\d)", dok_match.group(1))
            if digit:
                dok_level = int(digit.group())

    return bloom_level, dok_level


def assign_bloom_dok(question):
    """Classify a question by Bloom's Taxonomy level and Depth of Knowledge (DOK).

    Sends the question to the "gpt-4o" chat model through the module-level
    ``client`` and parses the two labelled lines requested in the prompt.

    Args:
        question: The question text to classify.

    Returns:
        tuple: ``(bloom_level, dok_level)``; ``bloom_level`` is a lowercase string
        ("unknown" if the reply has no Bloom line) and ``dok_level`` is an int
        (1 if the reply has no usable DOK value).
    """
    prompt = {
        "model": "gpt-4o",
        "messages": [
            {
                "role": "system",
                "content": "You are an expert in educational assessment. Based on the revised Bloom's Taxonomy and Depth of Knowledge (DOK) framework, classify the given question into its appropriate cognitive level. Make sure to assign the BLoom's Taxonomy level from the following list: remember, understand, apply, analyze, evaluate, create (in lowercase). The DOK level must be numeric the values range from 1 to 4 appropriately."
            },
            {
                "role": "user",
                "content": f"""
                Classify the following question according to the revised Bloom's Taxonomy and Depth of Knowledge (DOK):

                Question: {question}

                Provide the classification in this format:
                Bloom's Taxonomy Level: <Level>
                DOK Level: <Level>
                """
            }
        ]
    }

    response = client.chat.completions.create(**prompt, max_tokens=100)
    # The content can be None; the parser then falls back to the defaults.
    classification_text = response.choices[0].message.content

    return _parse_bloom_dok(classification_text)

# Function to add the new columns to your original DataFrame
def add_classification_columns(df):
    """Add "Bloom's Taxonomy" and "DOK Level" columns to ``df`` in place.

    ``assign_bloom_dok`` is called once per value of the "question" column, in
    row order. The results are collected into plain lists before assignment,
    which also works when ``df`` has no rows.

    Args:
        df: DataFrame with a "question" column. It is modified in place.

    Returns:
        The same DataFrame object that was passed in, with the two columns added.
    """
    classifications = [assign_bloom_dok(question) for question in df['question']]
    # List assignment is positional, so a non-default index on df is preserved.
    df["Bloom's Taxonomy"] = [bloom for bloom, _ in classifications]
    df['DOK Level'] = [dok for _, dok in classifications]
    return df

# Update your original DataFrames by adding the two new columns.
# add_classification_columns assigns the columns onto the frame it receives and
# returns that same object, so stratified_train, stratified_validation and test_df
# are modified here and each processed_* name refers to the frame that was passed in.
processed_train = add_classification_columns(stratified_train)
processed_validation = add_classification_columns(stratified_validation)
processed_test = add_classification_columns(test_df)



In [33]:
# Wrap each processed frame in a new pd.DataFrame object under the same name.
# NOTE(review): the inputs are already DataFrames; presumably this is meant to detach
# the processed_* names from stratified_train / stratified_validation / test_df so
# that columns added later do not appear on those frames — TODO confirm.
processed_train=pd.DataFrame(processed_train)
processed_validation=pd.DataFrame(processed_validation)
processed_test=pd.DataFrame(processed_test)

In [34]:
def save_dataframe_as_jsonl(df, output_filename):
    """
    Write 'df' to 'output_filename' in JSON Lines format.
    Every row becomes one JSON object on its own line.
    """
    export_options = {"orient": "records", "lines": True}
    df.to_json(output_filename, **export_options)

# Export the three processed frames, one JSONL file each
for processed_frame, jsonl_name in (
    (processed_train, "processed_train.jsonl"),
    (processed_validation, "processed_validation.jsonl"),
    (processed_test, "processed_test.jsonl"),
):
    save_dataframe_as_jsonl(processed_frame, jsonl_name)

In [35]:
# Preview the training subset with the Bloom's Taxonomy and DOK Level columns added
processed_train.head()

Unnamed: 0,question,choices,answer,task,grade,topic,category,skill,Bloom's Taxonomy,DOK Level
0,Which is the stickiest?,"[leather belt, gum, silk kimono]",1,multiple choice,4,physics,Materials,Compare properties of materials,evaluate,3
1,Is the following trait inherited or acquired?\...,"[inherited, acquired]",0,multiple choice,4,biology,Traits and heredity,Identify inherited and acquired traits,understand,2
2,"Based on this information, what is Admiral's g...","[a gray body, BB]",1,multiple choice,6,biology,Genes to traits,Genetics vocabulary: genotype and phenotype,analyze,3
3,Is the following trait inherited or acquired?\...,"[acquired, inherited]",1,multiple choice,5,biology,Traits and heredity,Identify inherited and acquired traits,understand,2
4,What information supports the conclusion that ...,[Colin has two pet fish. The fish live in a fi...,1,multiple choice,3,biology,Heredity,Inherited and acquired traits: use evidence to...,analyze,3


In [36]:
# Preview the validation subset with the Bloom's Taxonomy and DOK Level columns added
processed_validation.head()

Unnamed: 0,question,choices,answer,task,grade,topic,category,skill,Bloom's Taxonomy,DOK Level
0,Complete the sentence.\nBurning food on a stov...,"[physical change, chemical change]",1,multiple choice,2,chemistry,Physical and chemical change,Identify physical and chemical changes,apply,2
1,What information supports the conclusion that ...,"[Tim's mother speaks one language., Tim learne...",1,multiple choice,4,biology,Traits and heredity,Inherited and acquired traits: use evidence to...,analyze,3
2,"Based on this information, what is Deion's gen...","[not having sickle-cell disease, Aa]",1,multiple choice,8,biology,Genes to traits,Genetics vocabulary: genotype and phenotype,analyze,3
3,Which cherry pie has a higher temperature?,"[the cherry pie with more thermal energy, the ...",0,multiple choice,6,physics,Thermal energy,How are temperature and mass related to therma...,understand,2
4,Compare the motion of three motorboats. Which ...,[a motorboat that moved 185kilometers west in ...,2,multiple choice,3,physics,Force and motion,Compare the speeds of moving objects,analyze,3


In [37]:
# NOTE(review): this repeats the previous cell exactly; processed_test was
# probably the intended frame here — confirm.
processed_validation.head()

Unnamed: 0,question,choices,answer,task,grade,topic,category,skill,Bloom's Taxonomy,DOK Level
0,Complete the sentence.\nBurning food on a stov...,"[physical change, chemical change]",1,multiple choice,2,chemistry,Physical and chemical change,Identify physical and chemical changes,apply,2
1,What information supports the conclusion that ...,"[Tim's mother speaks one language., Tim learne...",1,multiple choice,4,biology,Traits and heredity,Inherited and acquired traits: use evidence to...,analyze,3
2,"Based on this information, what is Deion's gen...","[not having sickle-cell disease, Aa]",1,multiple choice,8,biology,Genes to traits,Genetics vocabulary: genotype and phenotype,analyze,3
3,Which cherry pie has a higher temperature?,"[the cherry pie with more thermal energy, the ...",0,multiple choice,6,physics,Thermal energy,How are temperature and mass related to therma...,understand,2
4,Compare the motion of three motorboats. Which ...,[a motorboat that moved 185kilometers west in ...,2,multiple choice,3,physics,Force and motion,Compare the speeds of moving objects,analyze,3


In [38]:
# List the distinct Bloom's Taxonomy labels assigned in the training subset
processed_train["Bloom's Taxonomy"].unique()

array(['evaluate', 'understand', 'analyze', 'remember', 'apply', 'create'],
      dtype=object)

In [39]:
# List the distinct Bloom's Taxonomy labels assigned in the validation subset
processed_validation["Bloom's Taxonomy"].unique()

array(['apply', 'analyze', 'understand', 'create', 'remember', 'evaluate'],
      dtype=object)

In [40]:
# List the distinct Bloom's Taxonomy labels assigned in the test split
processed_test["Bloom's Taxonomy"].unique()

array(['analyze', 'create', 'understand', 'remember', 'apply', 'evaluate'],
      dtype=object)

In [41]:
# List the distinct DOK levels assigned in the training subset
processed_train["DOK Level"].unique()

array([3, 2, 1, 4], dtype=int64)

In [42]:
# List the distinct DOK levels assigned in the validation subset
processed_validation["DOK Level"].unique()

array([2, 3, 1], dtype=int64)

In [43]:
# List the distinct DOK levels assigned in the test split
processed_test["DOK Level"].unique()

array([2, 3, 1, 4], dtype=int64)

In [44]:
def build_generation_prompt(row):
    """Build the instruction text for one row from its metadata columns.

    The Bloom's Taxonomy value is read into a local variable first: indexing with
    a double-quoted key inside a double-quoted f-string is only valid syntax from
    Python 3.12 onwards.

    Args:
        row: A DataFrame row providing 'grade', 'task', 'topic', 'category',
            'skill', "Bloom's Taxonomy" and 'DOK Level'.

    Returns:
        str: The prompt asking for a science question that matches the metadata.
    """
    bloom_level = row["Bloom's Taxonomy"]
    return (
        "Generate a science question and its answer using the following metadata: "
        f"grade: {row['grade']}, task: {row['task']}, topic: {row['topic']}, "
        f"category: {row['category']}, skill: {row['skill']}, "
        f"bloom's taxonomy: {bloom_level}, dok:{row['DOK Level']}"
    )


# Add the prompt as the "input" column of each processed frame
processed_train["input"] = processed_train.apply(build_generation_prompt, axis=1)
processed_validation["input"] = processed_validation.apply(build_generation_prompt, axis=1)
processed_test["input"] = processed_test.apply(build_generation_prompt, axis=1)

In [45]:
def _format_question_record(row):
    """Render a row's question, choices and answer values as one text block."""
    return f"Question: {row['question']},\n Choices: \n{row['choices']},\n Answer: \n{row['answer']}"


# Add the rendered text as the "output" column of each processed frame
for processed_frame in (processed_train, processed_validation, processed_test):
    processed_frame["output"] = processed_frame.apply(_format_question_record, axis=1)

In [46]:
# Persona text stored under the single "role" key
system_info = dict(
    role=(
        "You are an expert science assessment specialist that generates science questions "
        "based on specified metadata provided by the user. "
        "Your role is to ensure that the generated questions are of high quality, "
        "align with the intended learning objectives, and adhere to scientifically rigorous standards."
    )
)

In [47]:
# system_info_test = {
#     "role": "You are an expert science assessment specialist. Your task is to generate high-quality science questions based on given metadata (e.g., grade level, subject, educational standards, difficulty level) and then rigorously evaluate your own question. In this process, you will first assign weights to each evaluation criterion based on its importance for the given context. The more important a criterion is to the learning outcomes or curriculum, the higher weight it should receive.",
    
#     "evaluation_instructions": {
#         "steps": [
#             "Step 1: Assign Importance Weights to each evaluation criterion. For each of the following criteria, choose a weight between 0 and 1 such that the sum of all weights equals 1.0. The weights should reflect the relative importance of each criterion given the provided metadata.",
#             "Step 2: Generate a science question along with a set of answer choices and indicate the correct answer. Ensure the question aligns with the intended learning outcomes and metadata.",
#             "Step 3: For each of the 18 evaluation criteria (listed below), evaluate your generated question by assigning a score between 0 and 3 (0 = Poor, 3 = Excellent). Provide a clear justification (a brief reason) for each score.",
#             "Step 4: For each criterion, calculate the weighted score as: (Score × the weight you assigned).",
#             "Step 5: Sum all weighted scores to get a Total Weighted Score.",
#             "Step 6: Calculate the Maximum Possible Weighted Score as (3 × 1.0) = 3.0 (since each criterion’s maximum score is 3 and the total weight is 1). Then compute the Weighted Percentage as: (Total Weighted Score / 3.0) × 100%.",
#             "Step 7: If any criterion scores 1 or 0, provide specific improvement suggestions for that criterion."
#         ],
#         "output_format": "Your final output must include: (a) the self-assigned weights for each criterion, (b) the Generated Question (with answer choices and correct answer), (c) a Detailed Evaluation table (each criterion with its score, weighted score, and justification), (d) the Final Weighted Score and Weighted Percentage, and (e) any Improvement Suggestions."
#     },
    
#     "evaluation_criteria": {
#         "1. Relevance Grade": "Does the question align with the intended learning outcomes or educational standards?",
#         "2. Curriculum Fit Grade": "Is the question relevant to the grade level, subject, and curriculum goals?",
#         "3. Accuracy Grade": "Is the content of the question accurate and free from ambiguity?",
#         "4. Clarity Grade": "Is the question stem clear, concise, and free from unnecessary complexity?",
#         "5. Bias-Free (Answer Choices) Grade": "Are the answer choices free from cultural, gender, or regional biases?",
#         "6. Grammar and Syntax Grade": "Is the language grammatically correct and suitable for the intended audience?",
#         "7. Single Correct Answer Grade": "Does the question have a single, unambiguous correct answer?",
#         "8. Avoiding Common Errors Grade": "Does the question avoid common errors (e.g., unclear distractors, use of 'All of the above')?",
#         "9. Readability Grade": "Is the vocabulary appropriate for the target grade level and demographic?",
#         "10. Plausibility Grade": "Are the distractors plausible and meaningful while clearly incorrect?",
#         "11. Balance (Answer Choices) Grade": "Are the answer choices balanced in length, structure, and complexity?",
#         "12. Bias-Free Content Grade": "Is the question free from stereotypes, cultural insensitivity, or potentially offensive material?",
#         "13. Formatting Grade": "Is the formatting of the question consistent, clear, and free from distractions?",
#         "14. Fairness Grade": "Does the question ensure fairness and accessibility for diverse learner groups?",
#         "15. Avoiding Plagiarism Grade": "Is the question unique and not simply a rephrased version of an existing one?",
#         "16. Novelty Grade": "Does the question introduce a novel perspective or fresh twist on the topic?",
#         "17. DOK Level Grade": "Does the question reflect the appropriate Depth of Knowledge (DOK) level?",
#         "18. Bloom's Taxonomy Grade": "Does the question align with the appropriate level of Bloom's Taxonomy?"
#     },
    
#     "final_output_format": {
#         "sections": [
#             "Self-Assigned Weights (list each criterion with its weight)",
#             "Generated Question (with answer choices and correct answer)",
#             "Detailed Evaluation (for each criterion: assigned score, weighted score, and justification)",
#             "Final Weighted Score (total of weighted scores)",
#             "Weighted Percentage (Final Weighted Score / 3.0 × 100%)",
#             "Improvement Suggestions (if any criterion scores 1 or 0)"
#         ]
#     },
    
#     "few_shot_examples": [
#         {
#             "Example_Description": "Example 1: A poorly defined question due to lack of context.",
#             "Self_Assigned_Weights": {
#                 "Relevance Grade": 0.12,
#                 "Curriculum Fit Grade": 0.12,
#                 "Accuracy Grade": 0.10,
#                 "Clarity Grade": 0.09,
#                 "Bias-Free (Answer Choices) Grade": 0.06,
#                 "Grammar and Syntax Grade": 0.04,
#                 "Single Correct Answer Grade": 0.08,
#                 "Avoiding Common Errors Grade": 0.04,
#                 "Readability Grade": 0.04,
#                 "Plausibility Grade": 0.04,
#                 "Balance (Answer Choices) Grade": 0.04,
#                 "Bias-Free Content Grade": 0.04,
#                 "Formatting Grade": 0.03,
#                 "Fairness Grade": 0.04,
#                 "Avoiding Plagiarism Grade": 0.03,
#                 "Novelty Grade": 0.01,
#                 "DOK Level Grade": 0.06,
#                 "Bloom's Taxonomy Grade": 0.04
#             },
#             "Generated Question": "Based on the information provided, which scenario best describes a change in the state of matter?",
#             "Answer_Choices": "['When the salt is dissolved in water.', 'When the salt is stirred in the water.', 'When the salt is heated in a pan.']",
#             "Correct_Answer": "1",
#             "Detailed_Evaluation": {
#                 "Relevance Grade": {"Score": 1, "Weighted Score": "1 × 0.12 = 0.12", "Justification": "Lacks context for clear alignment with learning outcomes."},
#                 "Curriculum Fit Grade": {"Score": 1, "Weighted Score": "1 × 0.12 = 0.12", "Justification": "Unclear connection to specific grade-level or curriculum goals."},
#                 "Accuracy Grade": {"Score": 0, "Weighted Score": "0 × 0.10 = 0", "Justification": "Incomplete question that fails to specify what is being asked."},
#                 "Clarity Grade": {"Score": 0, "Weighted Score": "0 × 0.09 = 0", "Justification": "Missing question stem leads to significant ambiguity."},
#                 "Plausibility Grade": {"Score": 2, "Weighted Score": "2 × 0.04 = 0.08", "Justification": "Distractors are somewhat plausible but lack clear context."},
#                 "Bias-Free (Answer Choices) Grade": {"Score": 3, "Weighted Score": "3 × 0.06 = 0.18", "Justification": "Answer choices are free from biases."},
#                 "Balance (Answer Choices) Grade": {"Score": 2, "Weighted Score": "2 × 0.04 = 0.08", "Justification": "Choices are fairly balanced but impacted by missing context."},
#                 "Grammar and Syntax Grade": {"Score": 2, "Weighted Score": "2 × 0.04 = 0.08", "Justification": "Grammatically correct language despite fragmentary context."},
#                 "Readability Grade": {"Score": 2, "Weighted Score": "2 × 0.04 = 0.08", "Justification": "Vocabulary is acceptable, but clarity suffers overall."},
#                 "Avoiding Common Errors Grade": {"Score": 1, "Weighted Score": "1 × 0.04 = 0.04", "Justification": "Errors present due to unclear structure and context."},
#                 "Single Correct Answer Grade": {"Score": 0, "Weighted Score": "0 × 0.08 = 0", "Justification": "Not possible to determine a single correct answer without context."},
#                 "Bias-Free Content Grade": {"Score": 1, "Weighted Score": "1 × 0.04 = 0.04", "Justification": "Cannot fully assess bias without context."},
#                 "Formatting Grade": {"Score": 2, "Weighted Score": "2 × 0.03 = 0.06", "Justification": "Formatting is concise but hampered by missing question stem."},
#                 "Fairness Grade": {"Score": 1, "Weighted Score": "1 × 0.04 = 0.04", "Justification": "Lack of context affects fairness for diverse learners."},
#                 "Avoiding Plagiarism Grade": {"Score": 2, "Weighted Score": "2 × 0.03 = 0.06", "Justification": "Appears original despite insufficient context."},
#                 "Novelty Grade": {"Score": 1, "Weighted Score": "1 × 0.01 = 0.01", "Justification": "Lacks novelty due to basic phrasing."},
#                 "DOK Level Grade": {"Score": 0, "Weighted Score": "0 × 0.06 = 0", "Justification": "DOK level cannot be determined without a complete question."},
#                 "Bloom's Taxonomy Grade": {"Score": 0, "Weighted Score": "0 × 0.04 = 0", "Justification": "Cannot assess Bloom's alignment with inadequate question details."}
#             },
#             "Final_Weighted_Score": "Sum of weighted scores (example: 0.12+0.12+0+0+... = X)",
#             "Weighted_Percentage": "Calculated as (Total Weighted Score / 3.0) × 100%",
#             "Improvement_Suggestions": "Improve context and clarity to allow a precise, complete question; ensure a single correct answer can be determined."
#         },
#         {
#             "Example_Description": "Example 2: A well-constructed question with good alignment.",
#             "Self_Assigned_Weights": {
#                 "Relevance Grade": 0.14,
#                 "Curriculum Fit Grade": 0.13,
#                 "Accuracy Grade": 0.11,
#                 "Clarity Grade": 0.09,
#                 "Bias-Free (Answer Choices) Grade": 0.06,
#                 "Grammar and Syntax Grade": 0.04,
#                 "Single Correct Answer Grade": 0.08,
#                 "Avoiding Common Errors Grade": 0.04,
#                 "Readability Grade": 0.04,
#                 "Plausibility Grade": 0.05,
#                 "Balance (Answer Choices) Grade": 0.05,
#                 "Bias-Free Content Grade": 0.05,
#                 "Formatting Grade": 0.03,
#                 "Fairness Grade": 0.04,
#                 "Avoiding Plagiarism Grade": 0.03,
#                 "Novelty Grade": 0.01,
#                 "DOK Level Grade": 0.06,
#                 "Bloom's Taxonomy Grade": 0.04
#             },
#             "Generated Question": "Which of these questions can be investigated with the materials provided?",
#             "Answer_Choices": "['What is the average height of a tomato plant?', 'Does the height of a tomato plant affect the number of tomatoes it produces?', 'Which tomato plant will grow the tallest?']",
#             "Correct_Answer": "2",
#             "Detailed_Evaluation": {
#                 "Relevance Grade": {"Score": 2, "Weighted Score": "2 × 0.14 = 0.28", "Justification": "The question partially aligns with the learning outcome but lacks full context."},
#                 "Curriculum Fit Grade": {"Score": 2, "Weighted Score": "2 × 0.13 = 0.26", "Justification": "Suitable for science curricula, though more specific grade context would help."},
#                 "Accuracy Grade": {"Score": 2, "Weighted Score": "2 × 0.11 = 0.22", "Justification": "Generally accurate, but some ambiguity remains without material details."},
#                 "Clarity Grade": {"Score": 3, "Weighted Score": "3 × 0.09 = 0.27", "Justification": "Clear and concise question stem."},
#                 "Bias-Free (Answer Choices) Grade": {"Score": 3, "Weighted Score": "3 × 0.06 = 0.18", "Justification": "Answer choices are free from bias."},
#                 "Grammar and Syntax Grade": {"Score": 3, "Weighted Score": "3 × 0.04 = 0.12", "Justification": "Language is grammatically correct."},
#                 "Single Correct Answer Grade": {"Score": 2, "Weighted Score": "2 × 0.08 = 0.16", "Justification": "There is a presumed single correct answer, though slight context ambiguity exists."},
#                 "Avoiding Common Errors Grade": {"Score": 2, "Weighted Score": "2 × 0.04 = 0.08", "Justification": "Minor errors in presentation, such as repeated information."},
#                 "Readability Grade": {"Score": 3, "Weighted Score": "3 × 0.04 = 0.12", "Justification": "Vocabulary is appropriate for the target audience."},
#                 "Plausibility Grade": {"Score": 3, "Weighted Score": "3 × 0.05 = 0.15", "Justification": "Distractors are plausible."},
#                 "Balance (Answer Choices) Grade": {"Score": 3, "Weighted Score": "3 × 0.05 = 0.15", "Justification": "Choices are well-balanced."},
#                 "Bias-Free Content Grade": {"Score": 3, "Weighted Score": "3 × 0.05 = 0.15", "Justification": "The content is neutral and unbiased."},
#                 "Formatting Grade": {"Score": 3, "Weighted Score": "3 × 0.03 = 0.09", "Justification": "Consistent and clear formatting."},
#                 "Fairness Grade": {"Score": 2, "Weighted Score": "2 × 0.04 = 0.08", "Justification": "Fair overall, though context could be more explicit."},
#                 "Avoiding Plagiarism Grade": {"Score": 3, "Weighted Score": "3 × 0.03 = 0.09", "Justification": "The question appears original."},
#                 "Novelty Grade": {"Score": 1, "Weighted Score": "1 × 0.01 = 0.01", "Justification": "Lacks a unique twist, addressing a standard topic."},
#                 "DOK Level Grade": {"Score": 2, "Weighted Score": "2 × 0.06 = 0.12", "Justification": "Reflects a basic depth of knowledge."},
#                 "Bloom's Taxonomy Grade": {"Score": 2, "Weighted Score": "2 × 0.04 = 0.08", "Justification": "Aligns with the 'Understand' level of Bloom's Taxonomy."}
#             },
#             "Final_Weighted_Score": "Sum of weighted scores (example: 0.28+0.26+... = Y)",
#             "Weighted_Percentage": "Calculated as (Total Weighted Score / 3.0) × 100%",
#             "Improvement_Suggestions": "Clarify material context to enhance accuracy and ensure a single, unambiguous correct answer."
#         }
#     ]
# }


In [48]:
# # Convert the system info to a formatted string
# system_info_str = json.dumps(system_info_test, indent=2)

# processed_test["input"] = processed_test.apply(
#     lambda row: f"""{system_info_str}

# Generate a science question and its answer using the following metadata: grade: {row['grade']}, task: {row['task']}, topic: {row['topic']}, category: {row['category']}, skill: {row['skill']}, bloom's taxonomy: {row["Bloom's Taxonomy"]}, dok: {row['DOK Level']}""",
#     axis=1
# )

In [49]:
# Attach the same system prompt, rendered once via str(), to every split
# under the "instruction" column.
instruction_text = str(system_info)
for split_frame in (processed_train, processed_validation, processed_test):
    split_frame["instruction"] = instruction_text

In [50]:
# Narrow every split down to the three fields that make up one example.
columns_to_keep = ["input", "output", "instruction"]
processed_train, processed_validation, processed_test = (
    split_frame[columns_to_keep]
    for split_frame in (processed_train, processed_validation, processed_test)
)

In [51]:
# Preview the first rows of the training split after the column selection
# (expected columns: input, output, instruction).
processed_train.head()

Unnamed: 0,input,output,instruction
0,Generate a science question and its answer usi...,"Question: Which is the stickiest?,\n Choices: ...",{'role': 'You are an expert science assessment...
1,Generate a science question and its answer usi...,Question: Is the following trait inherited or ...,{'role': 'You are an expert science assessment...
2,Generate a science question and its answer usi...,"Question: Based on this information, what is A...",{'role': 'You are an expert science assessment...
3,Generate a science question and its answer usi...,Question: Is the following trait inherited or ...,{'role': 'You are an expert science assessment...
4,Generate a science question and its answer usi...,Question: What information supports the conclu...,{'role': 'You are an expert science assessment...


In [52]:
# Preview the first rows of the test split after the column selection
# (expected columns: input, output, instruction).
processed_test.head()

Unnamed: 0,input,output,instruction
5,Generate a science question and its answer usi...,Question: What do these two changes have in co...,{'role': 'You are an expert science assessment...
7,Generate a science question and its answer usi...,"Question: Using only these supplies, which que...",{'role': 'You are an expert science assessment...
8,Generate a science question and its answer usi...,Question: Is the following trait inherited or ...,{'role': 'You are an expert science assessment...
11,Generate a science question and its answer usi...,Question: What do these two changes have in co...,{'role': 'You are an expert science assessment...
12,Generate a science question and its answer usi...,"Question: Select the plant.,\n Choices: \n['Oa...",{'role': 'You are an expert science assessment...


In [53]:
# Build the chat-style record for a single example row
def format_row(row):
    """Wrap one example's three text fields in a ``{"messages": [...]}`` record.

    Args:
        row: Mapping-like object (for example a DataFrame row) that can be
            indexed by "instruction", "input" and "output".

    Returns:
        dict: ``{"messages": [...]}`` holding three ``{"role", "content"}``
        entries in the order instruction, input, output. Each entry's role is
        the name of the field its content was read from.
    """
    # NOTE(review): the role names mirror the column names rather than the
    # system/user/assistant roles used by common chat templates -- confirm
    # that the downstream consumer expects exactly these roles.
    field_order = ("instruction", "input", "output")
    return {
        "messages": [
            {"role": field, "content": row[field]} for field in field_order
        ]
    }

# Serialise a DataFrame to disk as JSON Lines, one chat record per row
def format_and_save(df, output_file):
    """Convert every row of ``df`` with ``format_row`` and write the results as JSONL.

    Args:
        df: DataFrame whose rows provide the fields that ``format_row`` reads.
        output_file: Path of the file to create or overwrite.

    Each record is written on its own line as UTF-8 JSON with non-ASCII
    characters left unescaped. A confirmation message is printed afterwards.
    """
    # All rows are converted before the output file is opened for writing.
    records = df.apply(format_row, axis=1).tolist()
    with open(output_file, "w", encoding="utf-8") as sink:
        for record in records:
            sink.write(json.dumps(record, ensure_ascii=False) + "\n")
    print(f"Data saved to {output_file}")

In [54]:
# Export each split to its own JSONL file (train, then validation, then test).
for split_frame, jsonl_path in (
    (processed_train, "train.jsonl"),
    (processed_validation, "validation.jsonl"),
    (processed_test, "test.jsonl"),
):
    format_and_save(split_frame, jsonl_path)

Data saved to train.jsonl
Data saved to validation.jsonl
Data saved to test.jsonl
