# Using chatAPI to grade Q8 on the Final Exam
The code is simplified and cleaned using custom modules.

### Load packages

In [1]:
#setup all the packages
import os
from dotenv import load_dotenv
from langchain_openai import AzureChatOpenAI
from langchain_core.prompts.chat import ChatPromptTemplate
import pandas as pd
import re #this handles regular expression.
import dill as pickle #pickle cannot handle functions, dill can. 
#import pipe

load_dotenv() #loads the dotenv for API keys as environmental variables

True

### Load custom modules

In [2]:
import binary_output #used to check or extract binary grading outcomes.
import calc_price #calculate the price of each run based on price models.
import pickle_tools #used to save and load a list of variables, avoids variables that cannot be serialized.
import print_response #used to print the response and grading in a very readable format.

Define some global variables

In [214]:
project_folder = 'FinalQ8' #This is used as the folder name to save pickled (serialized) variables
nRubricItems = 3 #How many items are there in the rubric, used for checking/extracting binary grading outcome.

## Load key variables from previous session. 
If the chunk below is run, then there is no need to load data frames or lists. The GPT connections still need to be re-defined.

In [4]:
#Run this chunk to load previously saved variables from the project folder.
# globals() is handed to the helper, presumably so it can restore the saved names
# directly into this notebook's namespace -- TODO confirm against pickle_tools.
# NOTE(review): pickle/dill files can execute code when loaded; only load folders you created.
try:
    pickle_tools.load_from_pickle(folderName=project_folder, globalVars = globals())
except FileNotFoundError as err:
    # Nothing has been saved yet (or the folder is missing): report it and continue.
    print(err)

## Load student responses

In [4]:
#Note: These are the student responses without the alternate solution.
# The printed preview shows three columns: student, outcome, response.
# NOTE(review): one previewed response contains "Â" before a space, which looks like a
# mis-decoded non-breaking space -- confirm the CSV's encoding before relying on exact text.
student_responses = pd.read_csv("./data/Final Q8/Q8-StudentResponse-Small.csv")
print(student_responses)

    student  outcome                                           response
0    292286        0  found the x momentum originally\n\nfound the y...
1    379046        0  for this question it took me about 5 steps to ...
2    545257        0  I'm solving for the final magnitude ofÂ  veloc...
3    108612        0  This problem took about 4 steps.\n\nThe proble...
4    842697        0  For each boulder respectively I multiplied the...
..      ...      ...                                                ...
72   714706        1  1) The problem was asking for the final veloci...
73   671085        1  We're given the mass and velocities of both ob...
74   177232        1  Since the boulders stick together, we must cal...
75   953782        0  I solved this problem by taking the velocities...
76   105981        1  Solving for v_f, we are given m1, m2, v1, v2, ...

[77 rows x 3 columns]


### Use a random sample of 5 responses for prompt engineering

In [5]:
#Run this cell to draw a new set of 5 random responses. This is for prompt engineering.
#No random_state is passed to sample(), so each run selects a different set of rows.
randomResponse_5 = student_responses.sample(5)

In [88]:
#randomResponse_5 = batch_grading_test[['student', 'response', 'outcome']]

## Setup all the components of the grading prompt.
For chat API, the prompt template currently contains two messages, a system message and a human message (in few shot learning it can contain multiple human and ai messages).
The system message will just be a message for now (select a message from a list)
The human message will be from a prompt template with the following variables:
* Problem body
* Rubric
* Requirements
* Student Response

Note 7/22/2024: I'm currently not utilizing output formatting options as they are not necessary for this task.

In [6]:
#prompt message and template dictionary
prompt_dict ={
    'sys_messages': [
        """You are a college introductory level physics teacher who is grading a student's written explanation to a physics problem based on a grading rubric, following the given instruction.""",
        """You are a college introductory physics teacher who is grading a student's written explanation to a physics problem based on a grading rubric. Your grading always ends with a comma separated binary vector."""
    ],
    'human_prompt_template': {
        'no-formatting': {},
        'with-formatting': {}
    }
}

prompt_dict['human_prompt_template']['no-formatting'] =  """Here is a college introductory level physics problem: 
"{ProblemBody}"
Students are instructed to provide an explanation to their answer.
Student explanations are being graded based on the following rubric:
"{Rubric}"
Grading is performed strictly according to the following requirements: 
# The grading must start with the evaluation of each individual rubric item.
{Requirements}
# For each rubric item, the student explanation will receive 1 point if the explanation satisfies the rubric, or 0 point if the explanation does not satisfy the rubric. Never assign a 0.5 for an item. 
# Each rubric item is graded only once.  
# Steps or sentences in student's explanation may not follow the same order as the rubric. 
# Conclude the grading response with vector of length 3, included in curly brackets and separated by commas, such as {{0,0,0}} or {{1,0,1}} or {{1,1,1}}. The vector summarizes the grading of each of the three rubric items. 
Student response:
"{StudentResponse}"
Grading:
"""

#The 'with-formatting' template below is not needed for this task; it is kept for reference.
prompt_dict['human_prompt_template']['with-formatting'] = """Here is a college introductory level physics problem: 
"{ProblemBody}"
Students are instructed to provide an explanation to their answer.
Student explanations are being graded based on the following rubric:
"{Rubric}"
Grading is performed strictly according to the following requirements: 
# The grading must start with the evaluation of each individual rubric item.
{Requirements}
# For each rubric item, the student explanation will receive 1 point if the explanation satisfies the rubric, or 0 point if the explanation does not satisfy the rubric. Never assign a 0.5 for an item. 
# Each rubric item is graded only once.  
# Steps or sentences in student's explanation may not follow the same order as the rubric. 

{Format_Instructions}

Student response:
"{StudentResponse}"
"""


In [7]:
#create the langchain prompt template with four or five input variables
#this one is without json output parsing. See if removing "partial credit" will stop it from giving 0.5s.
prompt_template_noformatting = ChatPromptTemplate.from_messages([
    ("system", prompt_dict['sys_messages'][1]), #use the system message for no output formatting.
    ("human", prompt_dict['human_prompt_template']['no-formatting']) #use the no formatting human message template
])
print(prompt_template_noformatting.input_variables) #list the input variables.

['ProblemBody', 'Requirements', 'Rubric', 'StudentResponse']


### Here are different versions of the rubric and grading requirements for testing.

In [186]:
#Changeable components for problem body, rubric and requirements.
# 'ProblemBody' is the problem text inserted into the {ProblemBody} slot of the prompt.
# 'Rubric' and 'Requirements' start empty and are filled in, one named variant at a time,
# by the assignments that follow this cell.
# NOTE: the problem text previously contained a stray "LaTeX: " fragment before the second
# "m/s" and a missing space after the first sentence's "m/s."; both were copy-paste artifacts
# that were sent to the model verbatim. Gradings recorded before this fix used the old text.
prompt_components_dict = {
    'ProblemBody':"""Two icy boulders in Saturn's rings approach each other, collide, and stick together as shown in the figure below. The first has a mass of [m1] kg and velocity of [v1] m/s. The second has a mass of [m2] kg and velocity of [v2] m/s. The angle between the two velocities is theta. Determine the magnitude of their velocity after they collide. Round your answer to the nearest 1 decimal place in units of m/s.""",
    'Rubric':{},
    'Requirements':{}
}

prompt_components_dict['Rubric']['simple'] ="""
# Item 1: The student solution decomposed the initial linear momentum of boulder 2 into its x and y components. 

# Item 2: The student wrote down conservation of linear momentum equation for both the x and y directions independently.

# Item 3: The student used Pythagorean theorem to find the magnitude of the final velocity.
"""

prompt_components_dict['Rubric']['detailed']="""
# Item 1: The student considered the x and y components of the linear momentum of the second boulder separately. 
   * The student could write down the x and y components of linear momentum of the second boulder in two separate conservation of linear momentum equations. 
   * The student could decompose the linear momentum of the second boulder using trigonometry, for example m_2v2cos(theta), m2v2sin(theta), or mvcos(theta), mvsin(theta) for the second boulder. 
   * The student could explicitly state decomposing linear momentum of the second boulder into its x and y components.
   * The student may also decompose the velocity of the second boulder into its x and y components.
 
# Item 2: The student wrote down conservation of linear momentum equations for both the x and y directions independently.
   * The student must write down equations for both x and y directions.
   * Conservation of linear momentum equations can take forms such as m1v1x + m2v2x = (m1 + m2)vx, and m2v2y = (m1 + m2)vy.
   * The student could imply that linear momentum is conserved on both x and y directions, or conservation of linear momentum procedure is applied to both directions.
   * Only stating that linear momentum is conserved or writing down a single equation such as m1v1 + m2v2 = (m1 + m2)V do not satisfy this rubric.

# Item 3: The student used Pythagorean theorem to find the magnitude of the final velocity.
   * The student could write equations such as v_2f^2 = v_2xf^2 + v_2yf^2, or v2f = sqrt(v2xf^2 + v2yf^2)
   * The student could also state that the final velocity is obtained using the pythagorean theorem, or by taking the square root of the velocity squares.
   * The student could also state that the final velocity is the vector sum of the x component and y component velocities.
   * The student could also apply pythagorean theorem directly to the linear momentums, and divide the final momentum by the mass of the boulders.
   * The student could write equations such as (m2v2)^2 = (m2v2x)^2 + (m2v2y)^2
"""

## This is the improved detailed rubric based on reviewing error grades.
## Major improvements to item 2 and 3.
prompt_components_dict['Rubric']['detailed_2']="""
# Item 1: The student considered the x and y components of the linear momentum of the second boulder separately. 
   * The student could write down the x and y components of linear momentum of the second boulder in two separate conservation of linear momentum equations. 
   * The student could decompose the linear momentum of the second boulder using trigonometry, for example m_2v2cos(theta), m2v2sin(theta), or mvcos(theta), mvsin(theta) for the second boulder. 
   * The student could explicitly state decomposing linear momentum of the second boulder into its x and y components.
   * The student could also decompose the velocity of the second boulder into its x and y components.
 
# Item 2: The student applied conservation of linear momentum to both the x and y directions independently.
   * Conservation of linear momentum equations can take forms such as m1v1x + m2v2x = (m1 + m2)vx, and m2v2y = (m1 + m2)vy.
   * The student could imply that linear momentum is conserved on both x and y directions, or conservation of linear momentum procedure is applied to both directions.
   * Only stating that linear momentum is conserved or writing down a single equation such as m1v1 + m2v2 = (m1 + m2)V do not satisfy this rubric.
   * The student could write down one conservation of linear momentum equation, and either immediately or later indicate that this equation is applied to both x and y directions.

# Item 3: The student used Pythagorean theorem to find the magnitude of the final velocity.
   * The student could write equations such as v_2f^2 = v_2xf^2 + v_2yf^2, or v2f = sqrt(v2xf^2 + v2yf^2)
   * The student could also state that the final velocity is obtained using the pythagorean theorem, or by taking the square root of the velocity squares.
   * The student could also state that the final velocity is the vector sum of the x component and y component velocities.
   * The student could write equations such as (m2v2)^2 = (m2v2x)^2 + (m2v2y)^2
   * The student could also apply pythagorean theorem directly to the linear momentums, and divide the final momentum by the mass of the boulders.
   * The student cannot apply pythagorean theorem directly to p1 and p2, or the linear momentum of boulders 1 and 2. The pythagorean theorem must by applied to the components of velocity or momentum.
   * Simply stating "obtain the magnitude" of the velocity do not satisfy this rubric. 
"""

prompt_components_dict['Rubric']['detailed_3']="""
# Item 1: The student solution decomposed the initial linear momentum of boulder 2 into its x and y components. 
   * The student could also write m2v2x and m2v2y in conservation of linear momentum equations without explicitly decomposing the momentum or the velocities.

# Item 2: The student wrote down conservation of linear momentum equation for both the x and y directions independently.
   * The student could also imply that linear momentum is conserved on both x and y directions separately, and may not explicitly write down conservation equations.
   * Only saying "momentum equation" or "used momentum" does not satisfy this rubric.

# Item 3: The student used Pythagorean theorem to find the magnitude of the final velocity.
   * Just saying "put them together" does not satisfy this rubric
"""

#The 'test' item is added here to verify that the code is correct and that different prompts are actually being passed to the LLM.
prompt_components_dict['Requirements']['test'] = """# For each rubric item, first say the exact same words "This is a test", then give it a grade of 1."""
prompt_components_dict['Requirements']['naive_cot']="""
# For each rubric item, first write step by step reasoning on why or why not the student explanation satisfies or contradicts the item. Then assign a binary grade of either 0 or 1, with 1 indicating the student explanation satisfied the rubric item, and 0 otherwise.
"""
prompt_components_dict['Requirements']['comparison']="""
# For each rubric item, first compare student explanation with the rubric item and the item description, then conclude if the explanation satisfies or didn't satisfy the rubric item. Finally, assign a binary grade of either 0 or 1, with 1 indicating the student explanation satisfied the rubric item, and 0 otherwise.
"""

prompt_components_dict['Requirements']['force_compare'] = """
# For each rubric item, write the grading statement strictly following the order of the statements below: 
  ## First, state one of the following two:  
     "For item <<item number>>, the rubric states that <<quote from the rubric item description>>. The most relevant parts in the student explanation are <<direct quote or quotes from student explanation>>.
     "For item <<item number>>, the rubric states that <<quote from the rubric item description>>. No part in the students' explanation is relevant to the rubric"
  ## then state one of the following: 
     "the student explanation is similar to this part of the rubric description <<most similar part of the rubric>>", 
     "the student explanation and the rubric description are very different" 
     "the student explanation and the rubric description are irrelevant"
  ## Finally, conclude with a binary score: 
     "so the grade is 1"
     "so the grade is 0"
"""


Below is a dictionary that toggles the components to form grading condition dictionary that will be sent to the llm chain. Each element is a combination of a rubric style and a prompt style.

In [201]:

# Each grading condition pairs one rubric version with one requirements (prompt) style.
# Rows are (condition name, rubric key, requirements key). The row order is preserved in
# the resulting dict, and later cells pick conditions by position via grading_condition_names.
grading_conditions_noparser = {
    condition: {
        'Rubric': prompt_components_dict['Rubric'][rubric_key],
        'Requirements': prompt_components_dict['Requirements'][requirements_key],
    }
    for condition, rubric_key, requirements_key in (
        ('naive_cot_1', 'simple', 'naive_cot'),
        ('naive_cot_2', 'detailed', 'naive_cot'),
        ('detailed_compare', 'detailed', 'comparison'),
        ('forced_compare', 'detailed', 'force_compare'),
        ('detailed_compare_2', 'detailed_2', 'comparison'),
        ('detailed_compare_3', 'simple', 'comparison'),
        ('naive_cot_3', 'detailed_3', 'naive_cot'),
        ('detailed_compare_4', 'detailed_3', 'comparison'),
    )
}


In [202]:
#get the grading condition names for easier labeling of columns in the output dataframe
grading_condition_names = list(grading_conditions_noparser.keys())

In [203]:
grading_condition_names

['naive_cot_1',
 'naive_cot_2',
 'detailed_compare',
 'forced_compare',
 'detailed_compare_2',
 'detailed_compare_3',
 'naive_cot_3',
 'detailed_compare_4']

## Create LLM communication and LLM chain. 

In [12]:
#Define the llm connection and the grading chain for GPT-3.5.
#Set up the llm connection and basic parameters.
#Empirically, temperature 0.8 seems to be a good balance. This is of course a guess.
#NOTE(review): an earlier note here said the cap was changed to 800 tokens to save time and
#money, but max_tokens below is 1000 -- confirm which value is intended.
#Only 4 stop sequences are allowed for Azure via API. This is an API issue.
llm_gpt35_chat = AzureChatOpenAI(    
    api_version="2024-02-01", #latest stable version (when this was written)
    deployment_name = "zchen-gpt35-chat",
    max_tokens = 1000,
    temperature= 0.8,
    #model_version = '0301',
    # Generation stops at any of these strings; two of them ("Student response:" and
    # "Grading:") are section labels used in the human prompt template.
    model_kwargs= {"stop": ["<|im_end|>", "Student response:", "Grading:", "Example "]}
    )

#Define the grading chain: the prompt template piped into the model.
#If parsing is needed, an output parser can be connected after the model.
grading_chain_gpt35 = prompt_template_noformatting | llm_gpt35_chat

The following function sends a response to llm for grading, and repeats the process until the outcome contains a binary string.

In [13]:
#A function to take the student response and grading input as parameters and invoke the chain to grade. 
def grade_by_llmChain(response: str, grading_input, chain, nItems = nRubricItems, problem = prompt_components_dict['ProblemBody'], max_retries = None):
    """Grade one student response with an LLM chain, retrying until a binary vector appears.

    Parameters
    ----------
    response : str
        The student's written explanation; sent as the 'StudentResponse' prompt variable.
    grading_input : dict
        Prompt variables for one grading condition (e.g. 'Rubric' and 'Requirements').
        It is NOT modified: a shallow copy is made before the response and problem body
        are added, so a shared entry of the grading-conditions dictionary stays clean.
    chain :
        Object with an ``invoke(input=...)`` method whose result has a ``.content`` string.
    nItems : int
        Number of rubric items; passed to ``binary_output.Create_Search`` to build the
        pattern the grading text must contain.
    problem : str
        Problem body; sent as the 'ProblemBody' prompt variable.
    max_retries : int or None
        Maximum number of re-grading attempts after the first call. ``None`` (default)
        keeps the original behaviour of retrying without limit.

    Returns
    -------
    The chain output of the first attempt whose ``.content`` matches the binary pattern.

    Raises
    ------
    RuntimeError
        Only when ``max_retries`` is set and every attempt lacked a binary grading vector.
    """
    # Copy first: the caller typically passes an entry of the shared conditions dictionary,
    # and writing the student response into it would leak that response into the saved config.
    chain_input = dict(grading_input)
    chain_input['StudentResponse'] = response
    chain_input['ProblemBody'] = problem

    # Build the search pattern before the first (paid) call so a bad nItems fails early.
    binaryPattern = binary_output.Create_Search(nItems)

    grading_output = chain.invoke(input=chain_input)
    retries_done = 0
    #check if the grading contains a binary output. If not, redo grading.
    while not re.search(pattern=binaryPattern, string=grading_output.content):
        if max_retries is not None and retries_done >= max_retries:
            raise RuntimeError(
                f"No binary grading vector found after {retries_done + 1} attempts."
            )
        print("proper grading output not found, re-do grading again.")
        grading_output = chain.invoke(input=chain_input)
        retries_done += 1
    return grading_output

In [14]:
#function to extract the grading text and grading outcome in one step.
def extract_info(df : pd.DataFrame, outcomeName : str, nItems = nRubricItems):
    """Add the grading text and the extracted binary grade for one outcome column.

    Reads the LLM outputs stored in ``df[outcomeName]``, writes their ``.content`` to
    ``<outcomeName>_text`` and the result of ``binary_output.Extract_binary`` on that text
    to ``<outcomeName>_grade``. The frame is modified in place and also returned.
    """
    text_col = f'{outcomeName}_text'
    grade_col = f'{outcomeName}_grade'

    def _content_of(message):
        # The stored grading output carries its text in .content
        return message.content

    def _grade_of(grading_text):
        return binary_output.Extract_binary(grading_text, nItems= nItems)

    df.loc[:, text_col] = df.loc[:, outcomeName].apply(_content_of)
    df.loc[:, grade_col] = df.loc[:, text_col].apply(_grade_of)
    return df

In [21]:
### This function automates the grading and extract of information process to avoid code copying errors.
def do_grading(promptStyle : str, llm_chain, response_df : pd.DataFrame, name_append = '', nItems = nRubricItems):
    """Grade every row of ``response_df`` under one grading condition, in place.

    Parameters
    ----------
    promptStyle : str
        Key into ``grading_conditions_noparser`` selecting the rubric/requirements pair.
    llm_chain :
        The chain passed on to ``grade_by_llmChain`` for each response.
    response_df : pd.DataFrame
        Must have a 'response' column. Three columns are added to this same frame:
        ``<col>`` (raw LLM output), ``<col>_text`` and ``<col>_grade``, where ``<col>`` is
        ``promptStyle`` or ``promptStyle_name_append`` when ``name_append`` is given.
    name_append : str
        Optional suffix, e.g. "run_1", to keep repeated runs in separate columns.
    nItems : int
        Number of rubric items, forwarded to grading and extraction.

    Returns
    -------
    None. Results are written into ``response_df``; nothing is returned so that calling
    this as the last line of a cell does not display the whole frame.
    """
    colName = promptStyle if name_append == "" else f"{promptStyle}_{name_append}"
    # Hand the grader a private copy of the condition: the grader writes the student
    # response and problem body into the dict it receives, and the shared conditions
    # dictionary must not accumulate that per-student data.
    condition_input = dict(grading_conditions_noparser[promptStyle])
    response_df[colName] = response_df['response'].apply(grade_by_llmChain, grading_input = condition_input, chain = llm_chain, nItems = nItems)
    # extract_info adds the _text and _grade columns to response_df in place.
    extract_info(response_df, colName, nItems = nItems)

A test case with one answer

In [19]:
randomResponse = student_responses.sample(1)

In [22]:
do_grading(promptStyle= grading_condition_names[2], llm_chain=grading_chain_gpt35, response_df=randomResponse)

In [None]:
print_response.print_gradingOutcome(randomResponse, grading_colName="detailed_compare_text")

Test a random sample of 5 student answers.

In [26]:
grading_test_gpt35 = randomResponse_5.copy()

In [27]:
do_grading(promptStyle=grading_condition_names[2], llm_chain=grading_chain_gpt35,response_df=grading_test_gpt35)

In [None]:
grading_test_gpt35

In [None]:
print_response.print_gradingOutcome(grading_test_gpt35, grading_colName = f"{grading_condition_names[2]}_text")

In [31]:
#calculate the price of this test run
calc_price.Calc_Price(grading_test_gpt35[grading_condition_names[2]], modelUsed= 'gpt35')

0.012586

## Grade all responses using GPT-35

In [99]:
# Create the GPT-3.5 results frame only if it does not exist yet, so this cell is safe to
# re-run: grades already collected in this session (or restored from pickle) are kept,
# and a fresh kernel no longer fails with NameError at the first do_grading call.
if 'full_grading_gpt35' not in globals():
    full_grading_gpt35 = student_responses.copy() #use copy to create a new variable with new variable id.

In [100]:
do_grading(promptStyle=grading_condition_names[0], llm_chain=grading_chain_gpt35, response_df= full_grading_gpt35)

proper grading output not found, re-do grading again.
proper grading output not found, re-do grading again.
proper grading output not found, re-do grading again.
proper grading output not found, re-do grading again.
proper grading output not found, re-do grading again.


In [104]:
do_grading(promptStyle=grading_condition_names[1], llm_chain=grading_chain_gpt35, response_df= full_grading_gpt35)

proper grading output not found, re-do grading again.
proper grading output not found, re-do grading again.
proper grading output not found, re-do grading again.


In [107]:
do_grading(promptStyle=grading_condition_names[2], llm_chain=grading_chain_gpt35, response_df= full_grading_gpt35)

proper grading output not found, re-do grading again.
proper grading output not found, re-do grading again.
proper grading output not found, re-do grading again.
proper grading output not found, re-do grading again.


In [110]:
do_grading(promptStyle=grading_condition_names[3], llm_chain=grading_chain_gpt35, response_df= full_grading_gpt35)

GPT-3.5 typically takes about 2-3 minutes in grading all the answers.

In [111]:
full_grading_gpt35.to_csv('./data_chatAPI/FinalQ8/full_grading_gpt35_new.csv')

In [112]:
calc_price.Calc_Price(full_grading_gpt35[grading_condition_names[3]], modelUsed="gpt35")

0.23141399999999998

GPT-35 took 9 minutes to grade and cost $0.19 

## The code below grades with gpt-4o model

In [32]:
#Set up the llm connection with gpt-4o.
#Empirically, temperature 0.8 seems to be a good balance. This is of course a guess.
#Only 4 stop sequences are allowed for Azure via API. This is an API issue. Under chat API this seems to not be a problem.
gpt4_model = AzureChatOpenAI(    
    api_version="2024-02-01", #latest stable version (when this was written)
    deployment_name = "zchen-test-gpt-4o",
    max_tokens = 1000,
    temperature= 0.8,
    #model_version = '0301',
    # Same stop strings as the GPT-3.5 connection defined earlier in this notebook.
    model_kwargs= {"stop": ["<|im_end|>", "Student response:", "Grading:", "Example "]}
    )

#Define the grading chain: the prompt template piped into the model.
#If parsing is needed, an output parser can be connected after the model.
grading_chain_gpt4o = prompt_template_noformatting | gpt4_model

In [33]:
test_grading_gpt4o = randomResponse_5.copy()

In [103]:
grading_condition_names

['naive_cot_1', 'naive_cot_2', 'detailed_compare', 'forced_compare']

In [97]:
do_grading(promptStyle=grading_condition_names[2], llm_chain=grading_chain_gpt4o, response_df=test_grading_gpt4o)


In [None]:
print_response.print_gradingOutcome(test_grading_gpt4o, grading_colName=f'{grading_condition_names[2]}_text')

In [96]:
test_grading_gpt4o.to_csv('./data_chatAPI/FinalQ8/grading_test_gpt4o.csv')

In [66]:
test_oneResponse = student_responses.iloc[[21]]

In [None]:
do_grading(grading_condition_names[2], llm_chain= grading_chain_gpt4o, response_df= test_oneResponse)

In [None]:
print_response.print_gradingOutcome(test_oneResponse, "detailed_compare_text")

### Despite our best efforts at prompt engineering, GPT-4o refuses to acknowledge that the student considered the linear momentum of the second boulder separately, unless the exact form of the student's written formula is included in the explanation of the first rubric item. Most likely the word "decompose" is strongly associated with trigonometric expressions, so GPT does not recognize m2v2x and m2v2y as a valid form of considering the momentum components separately.  

In [95]:
test_oneResponse.to_csv("./data_chatAPI/FinalQ8/test_oneCase.csv")

In [108]:
#save for reproducibility
test_grading_gpt4o.to_csv("./data_chatAPI/FinalQ8/grading_test_gpt4o.csv")

In [115]:
#calculate the cost of running
#Note: I need a utility to calculate the price of multiple runs, and output a list
calc_price.Calc_Price(test_grading_gpt4o['detailed_compare'], 'gpt4o')
#calc_price.Calc_Price(test_grading_gpt4o['forced_compare'], 'gpt4o')

0.046485

## Using GPT-4o to do full grading now

In [116]:
# Create the GPT-4o results frame only if it does not exist yet, so this cell is safe to
# re-run: grades already collected in this session (or restored from pickle) are kept,
# and a fresh kernel no longer fails with NameError at the first do_grading call.
if 'full_grading_gpt4o' not in globals():
    full_grading_gpt4o = student_responses.copy() #Need to use the copy method to create a different variable, not an alias of the same variable.

In [117]:
do_grading(promptStyle=grading_condition_names[0], llm_chain=grading_chain_gpt4o, response_df= full_grading_gpt4o)

proper grading output not found, re-do grading again.
proper grading output not found, re-do grading again.


In [205]:

#save to csv file
full_grading_gpt4o.to_csv("./data_chatAPI/FinalQ8/full_grading_gpt4o_new.csv")

In [125]:
calc_price.Calc_Price(full_grading_gpt4o['detailed_compare'], modelUsed='gpt4o')

0.776285

GPT-4o took 18 minutes to perform the grading

In [120]:
do_grading(promptStyle=grading_condition_names[1], llm_chain=grading_chain_gpt4o, response_df= full_grading_gpt4o)

proper grading output not found, re-do grading again.
proper grading output not found, re-do grading again.
proper grading output not found, re-do grading again.
proper grading output not found, re-do grading again.


In [121]:
do_grading(promptStyle=grading_condition_names[2], llm_chain=grading_chain_gpt4o, response_df= full_grading_gpt4o)

In [126]:
do_grading(promptStyle=grading_condition_names[3], llm_chain=grading_chain_gpt4o, response_df= full_grading_gpt4o)

proper grading output not found, re-do grading again.


In [None]:
full_grading_gpt4o

# Testing improved detailed rubric

The improved rubric was tested on 5 cases in which the detailed compare grading with gpt-4o differed from both human raters and from the outcome of naive_COT_simple. The rubric explanation was iteratively improved until all 5 cases were graded satisfactorily by detailed compare. The main improvements focused on the explanations of items 2 and 3. For item 2, the requirement to "explicitly write equations for both x and y directions" was softened. For item 3, the explanation now states that the Pythagorean theorem cannot be applied directly to the linear momenta of boulders 1 and 2.  

In [167]:
test_newRubric_grading = student_responses[student_responses['student'].isin([412565, 995849, 671085, 177232, 105981])].copy()

In [194]:
test_newRubric_grading_2 = student_responses[student_responses['student'].isin([620513, 292286, 673497, 714773, 553249])].copy()

In [189]:
grading_condition_names

['naive_cot_1',
 'naive_cot_2',
 'detailed_compare',
 'forced_compare',
 'detailed_compare_2',
 'detailed_compare_3']

In [173]:
do_grading(grading_condition_names[4], llm_chain = grading_chain_gpt4o, response_df= test_newRubric_grading)

In [None]:
print_response.print_gradingOutcome(test_newRubric_grading, grading_colName=f'{grading_condition_names[4]}_text')

In [None]:
do_grading(grading_condition_names[5], llm_chain = grading_chain_gpt4o, response_df= test_newRubric_grading)
print_response.print_gradingOutcome(test_newRubric_grading, grading_colName=f'{grading_condition_names[5]}_text')

In [None]:
do_grading(promptStyle=grading_condition_names[6], llm_chain=grading_chain_gpt4o, response_df=test_newRubric_grading_2)
print_response.print_gradingOutcome(test_newRubric_grading_2, "naive_cot_3_text")

## Do several more rounds of grading with the improved rubrics, and with detailed compare on the simple rubric

In [176]:
do_grading(grading_condition_names[4], llm_chain = grading_chain_gpt4o, response_df = full_grading_gpt4o)

In [179]:
do_grading(grading_condition_names[5], llm_chain = grading_chain_gpt4o, response_df = full_grading_gpt4o)

proper grading output not found, re-do grading again.


In [197]:
do_grading(grading_condition_names[6], llm_chain = grading_chain_gpt4o, response_df = full_grading_gpt4o)

proper grading output not found, re-do grading again.
proper grading output not found, re-do grading again.
proper grading output not found, re-do grading again.


In [204]:
do_grading(grading_condition_names[7], llm_chain = grading_chain_gpt4o, response_df = full_grading_gpt4o)

proper grading output not found, re-do grading again.


In [None]:
full_grading_gpt4o

### Conclusion: more rubric explanation does not always improve performance, especially when naive COT performance is already high (i.e., the GPT-4o model's interpretation of the original rubric is close to that of human experts). Naive-COT-level performance is reached with a simple rubric that addresses the shortcomings of the original rubric, combined with detailed compare.

# Self-consistency using naive_cot_1 (nct 1)

In [183]:
self_consistency = student_responses.copy()

In [221]:
#do_grading(promptStyle=grading_condition_names[0], llm_chain=grading_chain_gpt4o, response_df= self_consistency, name_append="run_1") #first run
#do_grading(promptStyle=grading_condition_names[0], llm_chain=grading_chain_gpt4o, response_df= self_consistency, name_append="run_2") #second run 
#do_grading(promptStyle=grading_condition_names[0], llm_chain=grading_chain_gpt4o, response_df= self_consistency, name_append="run_3") #third run
#do_grading(promptStyle=grading_condition_names[0], llm_chain=grading_chain_gpt4o, response_df= self_consistency, name_append="run_4") #fourth run
#do_grading(promptStyle=grading_condition_names[0], llm_chain=grading_chain_gpt4o, response_df= self_consistency, name_append="run_5") #fifth run

proper grading output not found, re-do grading again.
proper grading output not found, re-do grading again.
proper grading output not found, re-do grading again.


In [224]:
self_consistency.to_csv(f'./data_chatAPI/FinalQ8/self_consistency_nct1.csv')

In [226]:
self_consistency_2 = student_responses.copy()

In [239]:
#do_grading(promptStyle=grading_condition_names[7], llm_chain=grading_chain_gpt4o, response_df= self_consistency_2, name_append="run_1") #first run
#do_grading(promptStyle=grading_condition_names[7], llm_chain=grading_chain_gpt4o, response_df= self_consistency_2, name_append="run_2") #second run 
#do_grading(promptStyle=grading_condition_names[7], llm_chain=grading_chain_gpt4o, response_df= self_consistency_2, name_append="run_3") #third run
#do_grading(promptStyle=grading_condition_names[7], llm_chain=grading_chain_gpt4o, response_df= self_consistency_2, name_append="run_4") #fourth run
#do_grading(promptStyle=grading_condition_names[7], llm_chain=grading_chain_gpt4o, response_df= self_consistency_2, name_append="run_5") #fifth run

proper grading output not found, re-do grading again.


In [None]:
self_consistency_2

In [242]:
self_consistency_2.to_csv(f'./data_chatAPI/FinalQ8/self_consistency_dc4.csv')

## Serializing variables using pickle

In [243]:
#Save the key variables of this session so a later session can restore them.
#Use the globalVars parameter to allow the function to access the current global variables list.
# NOTE(review): every name listed below must already exist in this kernel, otherwise building
# the list raises NameError before save_as_pickle is even called.
# NOTE(review): grade_by_llmChain is not in the list although do_grading (which calls it) is;
# after loading, grade_by_llmChain still has to be re-defined by running its cell.
try:
    pickle_tools.save_as_pickle(
        variables=[student_responses, 
                   randomResponse, 
                   randomResponse_5, 
                   prompt_components_dict, 
                   prompt_dict, 
                   prompt_template_noformatting, 
                   grading_conditions_noparser, 
                   grading_condition_names,
                   do_grading, 
                   extract_info,
                   test_oneResponse,
                   grading_test_gpt35,
                   full_grading_gpt35,
                   full_grading_gpt4o,
                   test_newRubric_grading, 
                   test_newRubric_grading_2,
                   test_grading_gpt4o,
                   self_consistency,
                   self_consistency_2
                   ], 
        folderName = project_folder,
        globalVars=globals())
except IndexError as err:
    # Only IndexError is handled here; any other failure propagates.
    print(err)

# pickled_varNames is never assigned in this notebook; presumably save_as_pickle writes it
# into the namespace passed as globalVars -- TODO confirm. If the save failed before that
# point, this line would raise NameError.
print(pickled_varNames)

['student_responses', 'randomResponse', 'randomResponse_5', 'prompt_components_dict', 'prompt_dict', 'prompt_template_noformatting', 'grading_conditions_noparser', 'grading_condition_names', 'do_grading', 'extract_info', 'test_oneResponse', 'grading_test_gpt35', 'full_grading_gpt35', 'full_grading_gpt4o', 'test_newRubric_grading', 'test_newRubric_grading_2', 'test_grading_gpt4o', 'self_consistency', 'self_consistency_2']
