# Using the chat API to grade student responses
The code has been simplified and cleaned up using custom modules.

### Load packages

In [1]:
#setup all the packages
import os
from dotenv import load_dotenv
from langchain_openai import AzureChatOpenAI
from langchain_core.prompts.chat import ChatPromptTemplate
import pandas as pd
import re #this handles regular expression.
import dill as pickle #pickle cannot handle functions, dill can. 
#import pipe

load_dotenv() #loads the dotenv for API keys as environmental variables

True

### Load custom modules

In [2]:
import binary_output #used to check or extract binary grading outcomes.
import calc_price #calculate the price of each run based on price models.
import pickle_tools #used to save and load a list of variables, avoids variables that cannot be serialized.

Define some global variables

In [7]:
project_folder = 'chatAPI_scriptCleaning' #This is used as the folder name to save pickled variables
nRubricItems = 3 #How many items are there in the rubric, used for checking/extracting binary grading outcome.

In [8]:
#run this chunk to load previously saved variables
# globals() is passed so the loader can reach this notebook's namespace
# (presumably to restore the saved variables into it; confirm in pickle_tools).
# A missing pickle folder/file is not fatal: the error is printed and the notebook continues.
try:
    pickle_tools.load_from_pickle(folderName=project_folder, globalVars = globals())
except FileNotFoundError as err:
    print(err)

## Load student responses

In [12]:
student_responses = pd.read_csv("./data/StudentResponse_Full.csv")

In [None]:
student_responses

## Setup all the components of the grading prompt.
For the chat API, the prompt template currently contains two messages: a system message and a human message (for few-shot learning it can contain multiple human and AI messages).
The system message is a fixed message for now (selected from a list of candidates).
The human message is built from a prompt template with the following variables:
* Problem body
* Rubric
* Requirements
* Student Response

Note 7/22/2024: I'm currently not utilizing output formatting options as they are not necessary for this task.

In [14]:
#prompt message and template dictionary
# 'sys_messages' holds alternative system messages; one is selected by index when the chat prompt is built.
# 'human_prompt_template' maps a template name to the human-message template text. The empty {} values
# below are placeholders only; both entries are overwritten with template strings right after this literal.
prompt_dict ={
    'sys_messages': [
        # [0]: generic grader persona, no instruction about the output format.
        """You are a college introductory level physics teacher who is grading a student's written explanation to a physics problem based on a grading rubric, following the given instruction.""",
        # [1]: same persona, additionally asks for a trailing comma separated binary vector.
        """You are a college introductory physics teacher who is grading a student's written explanation to a physics problem based on a grading rubric. Your grading always ends with a comma separated binary vector."""
    ],
    'human_prompt_template': {
        'no-formatting': {},
        'with-formatting': {}
    }
}

prompt_dict['human_prompt_template']['no-formatting'] =  """Here is a college introductory level physics problem: 
"{ProblemBody}"
Students are instructed to provide an explanation to their answer.
Student explanations are being graded based on the following rubric:
"{Rubric}"
Grading is performed strictly according to the following requirements: 
# The grading must start with the evaluation of each individual rubric item.
{Requirements}
# For each rubric item, the student explanation will receive 1 point if the explanation satisfies the rubric, or 0 point if the explanation does not satisfy the rubric. Never assign a 0.5 for an item. 
# Each rubric item is graded only once.  
# Steps or sentences in student's explanation may not follow the same order as the rubric. 
# Conclude the grading response with vector of length 3, included in curly brackets and separated by commas, such as {{0,0,0}} or {{1,0,1}} or {{1,1,1}}. The vector summarizes the grading of each of the three rubric items. 
Student response:
"{StudentResponse}"
Grading:
"""

#This "with_formatting part is not needed here."
prompt_dict['human_prompt_template']['with-formatting'] = """Here is a college introductory level physics problem: 
"{ProblemBody}"
Students are instructed to provide an explanation to their answer.
Student explanations are being graded based on the following rubric:
"{Rubric}"
Grading is performed strictly according to the following requirements: 
# The grading must start with the evaluation of each individual rubric item.
{Requirements}
# For each rubric item, the student explanation will receive 1 point if the explanation satisfies the rubric, or 0 point if the explanation does not satisfy the rubric. Never assign a 0.5 for an item. 
# Each rubric item is graded only once.  
# Steps or sentences in student's explanation may not follow the same order as the rubric. 

{Format_Instructions}

Student response:
"{StudentResponse}"
"""


In [15]:
#create the langchain prompt template with four or five input variables
# (the 'no-formatting' template uses ProblemBody, Rubric, Requirements and StudentResponse;
#  the 'with-formatting' template additionally uses Format_Instructions).
#this one is without json output parsing. See if removing "partial credit" will stop it from giving 0.5s.
prompt_template_noformatting = ChatPromptTemplate.from_messages([
    ("system", prompt_dict['sys_messages'][1]), #use the system message for no output formatting.
    ("human", prompt_dict['human_prompt_template']['no-formatting']) #use the no formatting human message template
])
print(prompt_template_noformatting.input_variables) #list the input variables.

['ProblemBody', 'Requirements', 'Rubric', 'StudentResponse']


### Here are different versions of the rubric and grading requirements for testing.

In [16]:
#changeable components for problem body, rubric and requirements
prompt_components_dict = {
    'ProblemBody':"""Swimmers at a water park have a choice of two frictionless water slides. Both slides drop over the same height h: slide 1 is straight while slide 2 is curved, dropping quickly at first and then leveling out. How does the speed v1 of a swimmer reaching the bottom of slide 1 compare with v2, the speed of a swimmer reaching the end of slide 2?""",
    'Rubric':{},
    'Requirements':{}
}

prompt_components_dict['Rubric']['simple'] ="""
# Item 1: The student should mention either one of the following: 
  * conservation of energy OR
  * work and kinetic energy theorem 

# Item 2: The student mentioned either one of the following: 
   * No net external non-conservative work is being done, so mechanical energy  is conserved for the system  OR 
   * the slide is frictionless/smooth OR
   * gravity is the only force that does work on the girl. 

# Item 3: The student indicated either one of the following: 
   * potential energy is converted into kinetic energy OR  
   * Work done by gravity/gravitational force is equal to the change in kinetic energy of the girl"""

prompt_components_dict['Rubric']['detailed']="""
# Item 1: 
	"* The student should mention either one of the following in the explanation: 
		** Conservation of energy/mechanical energy. Conservation of Energy can be expressed in mathematical forms such as mgh = 1/2 mv^2, mghi+ ½mvi^2=mghf+ ½mvf^2, or MEi = MEf
		** work and kinetic energy theorem. 
		** The student could explicitly mention both (gravitational) potential energy and kinetic energy, or mention both work and kinetic energy, without explicilty saying conservation of energy or name of the theorem.
	* Only mentioning the term potential energy will NOT satisfy this rubric. 
	* The explanation cannot mention momentum, linear momentum, or centripetal forces"

# Item 2: 
	"* The student mentioned either one of the following in the explanation: 
		** No net external non-conservative work is being done, so mechanical energy is conserved for the system. 
		** the slide is frictionless, or that the slide is smooth, so that mechanical energy is conserved. Note that the explanation must have indicated using mechanical energy/enery/work principles.   
		** gravity is the only force that does work on the girl.
		** No non-conservative forces do work on the system."
    
# Item 3: 
	* The student explanation indicated either one of the following: 
		** potential energy or gravitational potential energy is converted or turned into kinetic energy. 
		** Work done by gravity or gravitational force is equal to the change in kinetic energy of the girl or the swimmer. 
		** Discussed relation between work done by gravity and kinetic energy or the velocity of the girl/swimmer.
		** Discussed the relation between potential energy or height, and the final kinetic energy (or the girl's velocity), when the rest of the explanation resolves around energy concepts.
	* The student can express potential energy as mgh or mgy, and kinetic energy as 1/2mv^2 or 0.5 mv^2. They can write expression such as mgh = 1/2mv^2"
    """

## I'm not sure what this "detailed_2" rubric is doing.
prompt_components_dict['Rubric']['detailed_2']="""
# Item 1: "The student should mention one of the following: 
	 * conservation of energy/mechanical energy
     * work and kinetic energy theorem. 
    The student could also indicate that potential energy is converted or tranformed in to kinetic energy."

# Item 2: "The student mentioned one of the following: 
	* No net external non-conservative work is being done, so mechanical energy is conserved for the system. 
	* the slide is frictionless, or that the slide is smooth, so mechanical energy is conserved. Student's explanation must explicitly contain "frictionless" or "smooth" or a similar phrase
	* gravity is the only force that does work on the girl."
    
# Item 3: "The student explanation indicated one of the following: 
	* potential energy is converted into kinetic energy. The student must explicitly mention either gravitational potential energy is converted or turned into kinetic energy,
	* Work done by gravity or gravitational force is equal to the change in kinetic energy of the girl. 
    * Discussed relation between work done by gravity and kinetic energy of the girl."
"""

prompt_components_dict['Requirements']['test'] = """# For each rubric item, first say the exact same words "This is a test", then give it a grade of 1."""
prompt_components_dict['Requirements']['naive_cot']="""
# For each rubric item, first write step by step reasoning on why or why not the student explanation satisfies or contradicts the item. Then assign a binary grade of either 0 or 1, with 1 indicating the student explanation satisfied the rubric item, and 0 otherwise.
"""
prompt_components_dict['Requirements']['comparison']="""
# For each rubric item, first compare student explanation with the rubric item description, then conclude if the explanation satisfies or didn't satisfy the rubric item. Finally, with 1 indicating the student explanation satisfied the rubric item, and 0 otherwise.
"""

prompt_components_dict['Requirements']['force_compare'] = """
# For each rubric item, write the grading statement strictly following the order of the statements below: 
  ## First, state one of the following two:  
     "For item <<item number>>, the rubric states that <<quote from the rubric item description>>. The most relevant parts in the student explanation are <<direct quote or quotes from student explanation>>.
     "For item <<item number>>, the rubric states that <<quote from the rubric item description>>. No part in the students' explanation is relevant to the rubric"
  ## then state one of the following: 
     "the student explanation is similar to this part of the rubric description <<most similar part of the rubric>>", 
     "the student explanation and the rubric description are very different" 
     "the student explanation and the rubric description are irrelevant"
  ## Finally, conclude with a binary score: 
     "so the grade is 1"
     "so the grade is 0"
"""


The dictionary below combines the components above into named grading conditions. Each entry pairs one rubric style with one requirements (prompt) style, and is the input that will be sent to the LLM chain.

In [17]:

# Named grading conditions. Each value supplies the 'Rubric' and 'Requirements' prompt variables;
# the keys of this dictionary are the prompt-style names that do_grading looks up.
# Note that the values reference the strings stored in prompt_components_dict.
grading_conditions_noparser = {
    # simple rubric + step-by-step reasoning requirement
    'naive_cot_1': {
        'Rubric' : prompt_components_dict['Rubric']['simple'],
        'Requirements': prompt_components_dict['Requirements']['naive_cot']
    },
    # detailed rubric + step-by-step reasoning requirement
    'naive_cot_2': {
        'Rubric' : prompt_components_dict['Rubric']['detailed'],
        'Requirements': prompt_components_dict['Requirements']['naive_cot']
    },
    # detailed rubric + compare-then-conclude requirement
    'detailed_compare' : {
        'Rubric' : prompt_components_dict['Rubric']['detailed'],
        'Requirements' : prompt_components_dict['Requirements']['comparison']
    },
    # detailed rubric + fixed sentence-by-sentence comparison script
    'forced_compare' : {
        'Rubric' : prompt_components_dict['Rubric']['detailed'],
        'Requirements' : prompt_components_dict['Requirements']['force_compare']
    }
}


In [24]:
#get the grading condition names for easier labeling of columns in the output dataframe
# (list order follows the insertion order of grading_conditions_noparser, so index 2 is 'detailed_compare', etc.)
grading_condition_names = list(grading_conditions_noparser.keys())

In [25]:
grading_condition_names

['naive_cot_1', 'naive_cot_2', 'detailed_compare', 'forced_compare']

## Create LLM communication and LLM chain. 

In [18]:
#define the llm chain and test one grading.
#setup the llm connection and basic parameters. 
#Empirically, temperature 0.8 seems to be a good balance. This is of course a guess. 
#max_tokens caps the length of each grading. An earlier note here said 800 tokens, but the value actually set below is 1000.
#only 4 stop sequences are allowed for Azure via API. This is an API issue
# NOTE(review): "Grading:" is both the last line of the human prompt and a stop sequence. If the model
# itself writes "Grading:" the output would presumably be cut off before the binary vector, forcing a
# re-grade - confirm whether this contributes to the repeated "re-do grading" messages.
llm_gpt35_chat = AzureChatOpenAI(    
    api_version="2024-02-01", #latest stable version (at the time this was written)
    deployment_name = "zchen-gpt35-chat",
    max_tokens = 1000,
    temperature= 0.8,
    #model_version = '0301',
    model_kwargs= {"stop": ["<|im_end|>", "Student response:", "Grading:", "Example "]}
    )

#define the grading chain. If parsing is needed, it can be connected to the output parser
# The chain pipes the filled-in prompt template into the chat model.
grading_chain_gpt35 = prompt_template_noformatting | llm_gpt35_chat

The following function sends one student response to the LLM for grading, and repeats the request until the output contains a binary grading vector.

In [19]:
#A function to take the student response and grading input as parameters and invoke the chain to grade. 
def grade_by_llmChain(response: str, grading_input, chain, nItems = nRubricItems, problem = prompt_components_dict['ProblemBody'], max_attempts = None):
    """Grade one student response with an LLM chain, retrying until a binary vector is found.

    Parameters
    ----------
    response : str
        The student's written explanation; sent as the 'StudentResponse' prompt variable.
    grading_input : mapping
        The remaining prompt variables (the entries of grading_conditions_noparser
        provide 'Rubric' and 'Requirements'). It is NOT modified by this function.
    chain :
        Object with an ``invoke(input=...)`` method whose result has a ``.content`` string.
    nItems : int
        Number of rubric items, passed to ``binary_output.Create_Search`` to build the search pattern.
    problem : str
        Problem text; sent as the 'ProblemBody' prompt variable.
    max_attempts : int or None
        Maximum number of chain invocations. ``None`` (default) retries without limit,
        which is the original behaviour.

    Returns
    -------
    The first chain output whose ``.content`` matches the binary-vector pattern.

    Raises
    ------
    RuntimeError
        If ``max_attempts`` is set and no output matched within that many invocations.
    """
    # Build a fresh payload rather than writing into grading_input: the same dictionary is shared
    # by every call for a grading condition, and writing into it would leave the last student
    # response stored inside grading_conditions_noparser.
    chain_input = {**grading_input, 'StudentResponse': response, 'ProblemBody': problem}
    # Build the search pattern before the first (billable) request so a bad nItems fails early.
    binaryPattern = binary_output.Create_Search(nItems) #using the binary output module to create search pattern
    grading_output = chain.invoke(input=chain_input) #invoke the llm chain to produce the grading.
    attempts = 1
    #check if the grading contains a binary output. If not, redo grading.
    while not re.search(pattern=binaryPattern, string=grading_output.content):
        if max_attempts is not None and attempts >= max_attempts:
            raise RuntimeError(f"no binary grading vector found after {attempts} attempts")
        print("proper grading output not found, re-do grading again.")
        grading_output = chain.invoke(input=chain_input)
        attempts += 1
    return grading_output

In [20]:
#function to extract the grading text and grading outcome in one step.
def extract_info(df : pd.DataFrame, outcomeName : str, nItems = nRubricItems):
    """Add '<outcomeName>_text' and '<outcomeName>_grade' columns to df and return df.

    The text column holds the ``.content`` of every object in column ``outcomeName``;
    the grade column holds ``binary_output.Extract_binary`` applied to that text.
    The columns are written into ``df`` itself.
    """
    text_col = f'{outcomeName}_text'
    grade_col = f'{outcomeName}_grade'

    def _to_grade(text):
        # delegate the binary-vector extraction to the custom module
        return binary_output.Extract_binary(text, nItems= nItems)

    raw_outputs = df.loc[:, outcomeName]
    df.loc[:, text_col] = raw_outputs.apply(lambda msg: msg.content)
    df.loc[:, grade_col] = df.loc[:, text_col].apply(_to_grade)
    return df

In [21]:
# This function automates the grading and information-extraction steps to avoid code copying errors.
def do_grading(promptStyle : str, llm_chain, response_df : pd.DataFrame, name_append = '', nItems = nRubricItems):
    """Grade every row of response_df under one grading condition and extract the results.

    Parameters
    ----------
    promptStyle : str
        Key into grading_conditions_noparser (a KeyError is raised for an unknown key).
    llm_chain :
        Chain passed on to grade_by_llmChain.
    response_df : pd.DataFrame
        Must contain a 'response' column. It is modified in place: the raw chain outputs
        go into column ``colName`` and extract_info adds '<colName>_text' and '<colName>_grade'.
    name_append : str
        Optional suffix; when non-empty the column name becomes '<promptStyle>_<name_append>'.
    nItems : int
        Number of rubric items, forwarded to grade_by_llmChain and extract_info.

    Returns
    -------
    pd.DataFrame
        The same ``response_df`` object with the new columns. Previously the function
        returned None; existing callers that ignore the return value are unaffected.
    """
    colName = promptStyle if name_append == "" else f"{promptStyle}_{name_append}"
    # Series.apply forwards the extra keyword arguments to grade_by_llmChain for each response.
    response_df[colName] = response_df['response'].apply(grade_by_llmChain, grading_input = grading_conditions_noparser[promptStyle], chain = llm_chain, nItems = nItems)
    return extract_info(response_df, colName, nItems = nItems)

A test case with one answer

In [17]:
test_grading = grade_by_llmChain(
    response=student_responses['response'][67],
    chain = grading_chain_gpt35,
    grading_input=grading_conditions_noparser['naive_cot_2']
)

In [None]:
print(student_responses['response'][67])
print(test_grading.content)

Test a random sample of 3 student answers.

In [None]:
batch_grading_test = student_responses.sample(3)
batch_grading_test['naive_cot_2'] = batch_grading_test['response'].apply(grade_by_llmChain, chain = grading_chain_gpt35, grading_input = grading_conditions_noparser['naive_cot_2'])
batch_grading_test = extract_info(batch_grading_test, 'naive_cot_2')
#batch_grading_test['naive_cot_2_text'] = batch_grading_test['naive_cot_2'].apply(lambda x:x.content)
print(batch_grading_test)

In [50]:
batch_grading_test.to_csv("./data_chatAPI/test_grading.csv")

In [51]:
#calculate the price of this test run
calc_price.Calc_Price(batch_grading_test['naive_cot_2'], modelUsed= 'gpt35')

0.006888

In [None]:
#read the grading outcome. This is useful for prompt engineering.
batch_grading_test['naive_cot_2_text'].apply(lambda x: print(x))

## Grade all responses using GPT-35

In [171]:
# run this line only once as it creates a new variable
# full_grading_gpt35 = student_responses.copy() #use copy to create a new variable with new variable id.

In [192]:
# grade all the student responses using gpt-35chat, naive_cot_2
# Note: The grading process ran about 10 minutes.
full_grading_gpt35['naive_cot_2'] = full_grading_gpt35['response'].apply(grade_by_llmChain, grading_input = grading_conditions_noparser['naive_cot_2'], chain = grading_chain_gpt35) #grade
full_grading_gpt35 = extract_info(full_grading_gpt35, 'naive_cot_2')
full_grading_gpt35.to_csv('./data_chatAPI/full_grading.csv')

In [None]:
calc_price.Calc_Price(full_grading_gpt35['naive_cot_2'], modelUsed="gpt35")

In [26]:
grading_condition_names[2]

'detailed_compare'

In [32]:
do_grading(promptStyle=grading_condition_names[2], llm_chain=grading_chain_gpt35, response_df= full_grading_gpt35)

In [None]:
full_grading_gpt35

In [34]:
full_grading_gpt35.to_csv("./data_chatAPI/full_grading.csv")

## The code below grades with gpt-4o model

In [5]:
#setup the llm connection with gpt-4o. 
#empirically, temperature 0.8 seems to be a good balance. This is of course a guess. 
#only 4 stop sequences are allowed for Azure via API. This is an API issue. Under chat API this seems to not be a problem.
# Apart from the deployment name, the parameters are the same as for the gpt-35 connection.
gpt4_model = AzureChatOpenAI(    
    api_version="2024-02-01", #latest stable version (at the time this was written)
    deployment_name = "zchen-test-gpt-4o",
    max_tokens = 1000,
    temperature= 0.8,
    #model_version = '0301',
    model_kwargs= {"stop": ["<|im_end|>", "Student response:", "Grading:", "Example "]}
    )

#define the grading chain. If parsing is needed, it can be connected to the output parser
# Same prompt template as the gpt-35 chain, piped into the gpt-4o model.
grading_chain_gpt4o = prompt_template_noformatting | gpt4_model

test grading of one answer

In [26]:
test_grading_gpt4 = grade_by_llmChain(
    response=student_responses.query('student == 57') ['response'].iloc[0],
    grading_input=grading_conditions_noparser['naive_cot_1'],
    chain= grading_chain_gpt4o
)
print(test_grading_gpt4.content)

Let's evaluate the student's response based on the three rubric items step by step.

**Item 1:** The student should mention either one of the following: 
  * conservation of energy OR
  * work and kinetic energy theorem 

- The student explicitly states that "the Mechanical energy of the system is preserved all the way through," referencing the conservation of energy.
- Therefore, the student's explanation satisfies the first rubric item.

Grade for Item 1: 1

**Item 2:** The student should mention either one of the following:
   * No net external non-conservative work is being done, so mechanical energy is conserved for the system OR 
   * the slide is frictionless/smooth OR
   * gravity is the only force that does work on the girl.

- The student mentions "the slide is frictionless," satisfying the requirement that the slide is frictionless/smooth.
- Therefore, the student's explanation satisfies the second rubric item.

Grade for Item 2: 1

**Item 3:** The student should indicate ei

Test grading of multiple answers

In [30]:
test_grading_gpt4o = student_responses.sample(3)

In [None]:
test_grading_gpt4o['naive_cot_2'] = test_grading_gpt4o['response'].apply(grade_by_llmChain, grading_input = grading_conditions_noparser['naive_cot_2'], chain = grading_chain_gpt4o) #grade
test_grading_gpt4o = extract_info(test_grading_gpt4o, 'naive_cot_2', nItems=3)
#test_grading_gpt4o['naive_cot_2_text'] = test_grading_gpt4o['naive_cot_2'].apply(lambda x:x.content) #extract the content of the grading
#test_grading_gpt4o['naive_cot_2_grade'] = test_grading_gpt4o['naive_cot_2_text'].apply(extract_binary_vector)

In [52]:
#save for reproducibility
test_grading_gpt4o.to_csv("./data_chatAPI/test_grading_gpt4_3response.csv")

In [48]:
#calculate the cost of running
calc_price.Calc_Price(test_grading_gpt4o['naive_cot_2'], 'gpt4o')

0.02874

## Using GPT-4o to do full grading now

In [90]:
#full_grading_gpt4o = student_responses.copy() #Need to use the copy method to create a different variable, not an alias of the same variable.

In [10]:
# Reload the previously saved gpt-4o grading table.
# NOTE(review): this file is written elsewhere in the notebook with to_csv() and no index argument,
# so the saved index presumably comes back as an extra unnamed column on every save/load round trip - confirm.
full_grading_gpt4o = pd.read_csv("./data_chatAPI/full_grading_gpt4o.csv")

In [None]:
full_grading_gpt4o

gpt-4o on Azure took 22m36s to produce grading for 96 student responses

In [91]:
full_grading_gpt4o['naive_cot_2'] = full_grading_gpt4o['response'].apply(grade_by_llmChain, grading_input = grading_conditions_noparser['naive_cot_2'], chain = grading_chain_gpt4o, nItems = 3) #grade
full_grading_gpt4o = extract_info(full_grading_gpt4o, 'naive_cot_2', nItems=3)
#full_grading_gpt4o['naive_cot_2_text'] = full_grading_gpt4o['naive_cot_2'].apply(lambda x:x.content) #extract the content of the grading
#full_grading_gpt4o['naive_cot_2_grade'] = full_grading_gpt4o['naive_cot_2_text'].apply(extract_binary_vector) #extract the binary vector in the grading.

#save to csv file
full_grading_gpt4o.to_csv("./data_chatAPI/full_grading_gpt4o.csv")

grading output not found, re-do grading again.
grading output not found, re-do grading again.
grading output not found, re-do grading again.
grading output not found, re-do grading again.
grading output not found, re-do grading again.
grading output not found, re-do grading again.
grading output not found, re-do grading again.
grading output not found, re-do grading again.
grading output not found, re-do grading again.
grading output not found, re-do grading again.


In [36]:
do_grading(promptStyle=grading_condition_names[3], llm_chain=grading_chain_gpt4o, response_df= full_grading_gpt4o)

## GPT-4o takes significantly longer to grade. Check whether I have a stricter rate-limit quota on gpt-4o than on the gpt-35 model.

In [None]:
full_grading_gpt4o

In [38]:
#save to csv file
full_grading_gpt4o.to_csv("./data_chatAPI/full_grading_gpt4o.csv")

## Serializing variables using pickle

In [39]:
#use the globalVars parameter to allow the function to access the current global variables list.
# The objects to save are passed by value; globals() is presumably used by save_as_pickle to
# recover their variable names - confirm in pickle_tools.
# An IndexError raised by save_as_pickle is printed instead of stopping the notebook.
try:
    pickle_tools.save_as_pickle(
        variables=[student_responses, 
                   prompt_components_dict, 
                   prompt_dict, 
                   prompt_template_noformatting, 
                   grading_conditions_noparser, 
                   batch_grading_test, 
                   extract_info, 
                   grade_by_llmChain,
                   full_grading_gpt35, 
                   test_grading_gpt4o,
                   full_grading_gpt4o], 
        folderName = project_folder,
        globalVars=globals())
except IndexError as err:
    print(err)

# pickled_varNames is not assigned anywhere in this notebook; presumably it is created by
# pickle_tools through the globalVars dictionary - verify, otherwise this line raises NameError.
print(pickled_varNames)

['student_responses', 'prompt_components_dict', 'prompt_dict', 'prompt_template_noformatting', 'grading_conditions_noparser', 'batch_grading_test', 'extract_info', 'grade_by_llmChain', 'full_grading_gpt35', 'test_grading_gpt4o', 'full_grading_gpt4o']
