# Using chatAPI to grade Q10 on the Final Exam
The code is simplified and cleaned using custom modules, based on grading of Q8

### Load packages

In [5]:
#setup all the packages
import os
from dotenv import load_dotenv
from langchain_openai import AzureChatOpenAI
from langchain_core.prompts.chat import ChatPromptTemplate
import pandas as pd
import re #this handles regular expression.
import dill as pickle #pickle cannot handle functions, dill can. 
#import pipe

load_dotenv() #loads the dotenv for API keys as environmental variables

True

### Load custom modules

In [6]:
import binary_output #used to check or extract binary grading outcomes.
import calc_price #calculate the price of each run based on price models.
import pickle_tools #used to save and load a list of variables, avoids variables that cannot be serialized.
import print_response #used to print the response and grading in a very readable format.

Define some global variables

In [7]:
project_folder = 'Final_Q10' #This is used as the folder name to save pickled (serialized) variables
nRubricItems = 3 #How many items are there in the rubric, used for checking/extracting binary grading outcome.

In [8]:
if not os.path.exists(f"./data_chatAPI/{project_folder}"): 
    os.makedirs(f"./data_chatAPI/{project_folder}")
    print(f"created folder ./data_chatAPI/{project_folder}")

## Load key variables from previous session. 
If the chunck below is run, then there is no need to load data frames or lists. The GPT connections need to be re-defined.

In [9]:
#run this chunck to load previously saved variables
try:
    pickle_tools.load_from_pickle(folderName=project_folder, globalVars = globals())
except FileNotFoundError as err:
    print(err)

## Load student responses

In [None]:
student_responses = pd.read_csv("./data/Final Q10/Q10-StudentResponse.csv")

In [None]:
student_responses

### Use a random sample of 5 responses for prompt engineering

In [None]:
#run this code if want a new set of random response. This is for prompt engineering.
randomResponse_5 = student_responses.sample(5)
randomResponse_5

## Setup all the components of the grading prompt.
For chat API, the prompt template currently contains two messages, a system message and a human message (in few shot learning it can contain multiple human and ai messages).
The system message will just be a message for now (select a message from a list)
The human message will be from a prompt template with the following variables:
* Problem body
* Rubric
* Requirements
* Student Response

Note 7/22/2024: I'm currently not utilizing output formatting options as they are not necessary for this task.

In [11]:
#prompt message and template dictionary
prompt_dict ={
    'sys_messages': [
        """You are a college introductory level physics teacher who is grading a student's written explanation to a physics problem based on a grading rubric, following the given instruction.""",
        """You are a college introductory physics teacher who is grading a student's written explanation to a physics problem based on a grading rubric. Your grading always ends with a comma separated binary vector."""
    ],
    'human_prompt_template': {
        'no-formatting': {},
        'with-formatting': {}
    }
}

prompt_dict['human_prompt_template']['no-formatting'] =  """Here is a college introductory level physics problem: 
"{ProblemBody}"
Students are instructed to provide an explanation to their answer.
Student explanations are being graded based on the following rubric:
"{Rubric}"
Grading is performed strictly according to the following requirements: 
# The grading must start with the evaluation of each individual rubric item.
{Requirements}
# For each rubric item, the student explanation will receive 1 point if the explanation satisfies the rubric, or 0 point if the explanation does not satisfy the rubric. Never assign a 0.5 for an item. 
# Each rubric item is graded only once.  
# Steps or sentences in student's explanation may not follow the same order as the rubric. 
# Conclude the grading response with vector of length 3, included in curly brackets and separated by commas, such as {{0,0,0}} or {{1,0,1}} or {{1,1,1}}. The vector summarizes the grading of each of the three rubric items. 
Student response:
"{StudentResponse}"
Grading:
"""

#This "with_formatting part" is not needed here.
prompt_dict['human_prompt_template']['with-formatting'] = """Here is a college introductory level physics problem: 
"{ProblemBody}"
Students are instructed to provide an explanation to their answer.
Student explanations are being graded based on the following rubric:
"{Rubric}"
Grading is performed strictly according to the following requirements: 
# The grading must start with the evaluation of each individual rubric item.
{Requirements}
# For each rubric item, the student explanation will receive 1 point if the explanation satisfies the rubric, or 0 point if the explanation does not satisfy the rubric. Never assign a 0.5 for an item. 
# Each rubric item is graded only once.  
# Steps or sentences in student's explanation may not follow the same order as the rubric. 

{Format_Instructions}

Student response:
"{StudentResponse}"
"""


In [8]:
#create the langchain prompt template with four or five input variables
#this one is without json output parsing. See if removing "partial credit" will stop it from giving 0.5s.
prompt_template_noformatting = ChatPromptTemplate.from_messages([
    ("system", prompt_dict['sys_messages'][1]), #use the system message for no output formatting.
    ("human", prompt_dict['human_prompt_template']['no-formatting']) #use the no formatting human message template
])
print(prompt_template_noformatting.input_variables) #list the input variables.

['ProblemBody', 'Requirements', 'Rubric', 'StudentResponse']


### Here are different versions of the rubric and grading requirements for testing.

In [67]:
#changeable components for problem body, rubric and requirements
prompt_components_dict = {
    'ProblemBody':"""A small massive ball of mass [m] kg is dropped straight down into a tube containing an ideal spring. The spring has spring constant k = [k] N/m and relaxed length of L0 = [L0] meters. The ball was launched to a height of h = [h] meters above the top of the spring. When the spring was compressed to a length of L = [L] meters, what is the magnitude of velocity v of the ball at that time in units of meters per second?""",
    'Rubric':{},
    'Requirements':{}
}

prompt_components_dict['Rubric']['simple'] ="""
# Item 1: The student wrote down conservation of mechanical energy equation or indicated that mechanical energy can be used to solve the problem 

# Item 2: The potential energy term of the conservation of mechanical energy formula contains both a gravitational potential energy term and an elastic potential energy term.

# Item 3: The gravitational potential energy term contains an expression similar to mg(h + L - L_0), and shouldn't be just mgh or mgL
"""

prompt_components_dict['Rubric']['detailed']="""
# Item 1: The student wrote down conservation of mechanical energy equation or indicated that mechanical energy can be used to solve the problem 
   * The student could write mathematical expressions such as MEi = MEf, ME_i - ME_f = 0, or KE_i + PE_i = KE_f + PE_f, or other similar forms  
   * Students could use terms such as "Energy", or "Mechanical Energy". 
 
# Item 2: The potential energy term of the conservation of mechanical energy formula contains both a gravitational potential energy term and an elastic potential energy term.
   * The student must mention both gravitational potential energy and elastic potential energy in the solution, mentioning only one of the two will not satisfy this rubric item.
   * Elastic potential energy could be implied in mathematical expressions such as 1/2k(L-L0)^2, or 0.5*k*x^2, or 0.5*k(L0-L)^2.
   * For this rubric item only, gravitational potential energy could be implied in mathematical forms consisting of mg multiplied by a height or distance measure, such as mgh, mg(L-L0), or mg(h-L-L0)

# Item 3: The gravitational potential energy term contains an expression similar to mg(h + L - L_0), and shouldn't be just mgh or mgL
   * The gravitational potential energy term could take forms such as mg(h + L0 - L) or m*g*(h-l+l0), or other forms that involves modifications to the height h.
   * The student could also write expressions such as mgh + mg(L-L0) or mgh + mg*(L_0 - L) or similar forms
   * Stating that gravitational potential energy is mgh, or including mgh alone will not satisfy this rubric item.
"""

## This is the improved detailed rubric based on reviewing error grades.
## Major improvements to item 3: rubric item states the cognitive process, rather than using a specific representation of said operation.
prompt_components_dict['Rubric']['detailed_2']="""
# Item 1: The student wrote down conservation of mechanical energy equation or indicated that mechanical energy can be used to solve the problem 
   * The student could write mathematical expressions such as MEi = MEf, ME_i - ME_f = 0, or KE_i + PE_i = KE_f + PE_f, or other similar forms  
   * Students could use terms such as "Energy", or "Mechanical Energy". 
   * The student could also write the kinetic and potential energy terms, such as 0.5mv^2, 1/2kx^2, mgh, mg(h-L), or similar terms, without explicitly mentioning mechanical energy.
 
# Item 2: The student solution included both a gravitational potential energy term and an elastic potential energy term.
   * The student must mention both gravitational potential energy and elastic potential energy in the solution, mentioning only one of the two will not satisfy this rubric item.
   * Elastic potential energy could be implied in mathematical expressions such as 1/2k(L-L0)^2, or 0.5*k*x^2, or 0.5*k(L0-L)^2. 
   * For this rubric item only, gravitational potential energy could be implied in mathematical forms consisting of mg multiplied by a height or distance measure, such as mgh, mg(L-L0), mg(h-L-L0), mg(h-L0)
   * The final elastic potential energy is zero, so elastic potential energy can be omitted in the final potential energy of the situation.

# Item 3: The student indicated that the calculation of the gravitational potential energy involves a modification to the height h.
   * The gravitational potential energy term could take forms such as mg(h + L0 - L) or m*g*(h-l+l0), mg(h-L), mg(h+L0) or other forms that involves the term mg multiplied by a height that is not just h.
   * The student could also write expressions such as mgh + mg(L-L0) or mgh + mg*(L_0 - L), or include both an mgh term and mg(l-l0) on different sides of the equation. 
   * The student could also indicate in words that the height in gravitational potential energy can be calculated by adding the spring compression length to the height of the ball. 
   * Stating that gravitational potential energy is just mgh, or including mgh alone in the equation will not satisfy this rubric item.
"""

#the 'test item is added here to verify that the code is correct and different prompts are acutally being passed to the LLM.
prompt_components_dict['Requirements']['test'] = """# For each rubric item, first say the exact same words "This is a test", then give it a grade of 1."""
prompt_components_dict['Requirements']['naive_cot']="""
# For each rubric item, first write step by step reasoning on why or why not the student explanation satisfies or contradicts the item. Then assign a binary grade of either 0 or 1, with 1 indicating the student explanation satisfied the rubric item, and 0 otherwise.
"""
prompt_components_dict['Requirements']['comparison']="""
# For each rubric item, first compare student explanation with the rubric item and the item description, then conclude if the explanation satisfies or didn't satisfy the rubric item. Finally, assign a binary grade of either 0 or 1, with 1 indicating the student explanation satisfied the rubric item, and 0 otherwise.
"""

prompt_components_dict['Requirements']['force_compare'] = """
# For each rubric item, write the grading statement strictly following the order of the statements below: 
  ## First, state one of the following two:  
     "For item <<item number>>, the rubric states that <<quote from the rubric item description>>. The most relevant parts in the student explanation are <<direct quote or quotes from student explanation>>.
     "For item <<item number>>, the rubric states that <<quote from the rubric item description>>. No part in the students' explanation is relevant to the rubric"
  ## then state one of the following: 
     "the student explanation is similar to this part of the rubric description <<most similar part of the rubric>>", 
     "the student explanation and the rubric description are very different" 
     "the student explanation and the rubric description are irrelevant"
  ## Finally, conclude with a binary score: 
     "so the grade is 1"
     "so the grade is 0"
"""


Below is a dictionary that toggles the components to form grading condition dictionary that will be sent to the llm chain. Each element is a combination of a rubric style and a prompt style.

In [68]:

grading_conditions_noparser = {
    'naive_cot_1': {
        'Rubric' : prompt_components_dict['Rubric']['simple'],
        'Requirements': prompt_components_dict['Requirements']['naive_cot']
    },
    'naive_cot_2': {
        'Rubric' : prompt_components_dict['Rubric']['detailed'],
        'Requirements': prompt_components_dict['Requirements']['naive_cot']
    },
    'detailed_compare' : {
        'Rubric' : prompt_components_dict['Rubric']['detailed'],
        'Requirements' : prompt_components_dict['Requirements']['comparison']
    },
    'forced_compare' : {
        'Rubric' : prompt_components_dict['Rubric']['detailed'],
        'Requirements' : prompt_components_dict['Requirements']['force_compare']
    },
    'detailed_compare_2' : {
        'Rubric' : prompt_components_dict['Rubric']['detailed_2'],
        'Requirements' : prompt_components_dict['Requirements']['comparison']
    }
}


In [14]:
#get the grading contion names for easier labeling of columns in the output dataframe
grading_condition_names = list(grading_conditions_noparser.keys())

In [15]:
grading_condition_names

['naive_cot_1',
 'naive_cot_2',
 'detailed_compare',
 'forced_compare',
 'detailed_compare_2']

## Create LLM communication and LLM chain. 

In [16]:
#define the llm chain and test one grading.
#setup the llm connection and basic parameters. 
#Empirically, temperatur 0.8 seems to be a good balance. This is of course a guess. 
#change to 800 tokens max to save some time and money based on previous experience.
#only 4 stop sequences are allowed for Azure via API. This is an API issue
llm_gpt35_chat = AzureChatOpenAI(    
    api_version="2024-02-01", #latest stable version
    deployment_name = "zchen-gpt35-chat",
    max_tokens = 1000,
    temperature= 0.8,
    #model_version = '0301',
    model_kwargs= {"stop": ["<|im_end|>", "Student response:", "Grading:", "Example "]}
    )

#define the grading chain. If parsing is needed, it can be connected to the output parser
grading_chain_gpt35 = prompt_template_noformatting | llm_gpt35_chat

The following function sends a response to llm for grading, and repeats the process until the outcome contains a binary string.

In [17]:
#A function to take the student response and grading input as parameters and invoke the chain to grade. 
def grade_by_llmChain(response: str, grading_input, chain, nItems = nRubricItems, problem = prompt_components_dict['ProblemBody']):
    grading_input['StudentResponse'] = response # student response is the parameter to feed to the function. grading_input is from the grainding_condition list
    grading_input['ProblemBody'] = problem #add the problem body
    grading_output = chain.invoke(input=grading_input) #invoke the llm chain to produce the grading.
    #check if the grading contains a binary output. If not, redo grading.
    binaryPattern = binary_output.Create_Search(nItems) #using the binary outuput module to create search pattern
    while not re.search(pattern=binaryPattern, string= grading_output.content):
        print("proper grading output not found, re-do grading again.")
        grading_output = chain.invoke(input=grading_input)
    return(grading_output)

In [18]:
#function to extract the grading text and grading outcome in one step.
def extract_info(df : pd.DataFrame, outcomeName : str, nItems = nRubricItems):
    df.loc[:,f'{outcomeName}_text'] = df.loc[:,outcomeName].apply(lambda x:x.content)
    df.loc[:,f'{outcomeName}_grade'] = df.loc[:,f'{outcomeName}_text'].apply(lambda x:binary_output.Extract_binary(x, nItems= nItems))
    return(df)

Test a random sample of 5 student answers.

In [19]:
batch_grading_test_gpt35 = randomResponse_5.copy()
promptStyle = 'detailed_compare'
batch_grading_test_gpt35[promptStyle] = batch_grading_test_gpt35['response'].apply(grade_by_llmChain, chain = grading_chain_gpt35, grading_input = grading_conditions_noparser[promptStyle])
batch_grading_test_gpt35 = extract_info(batch_grading_test_gpt35, promptStyle)
#print(batch_grading_test)

In [None]:
batch_grading_test_gpt35

In [None]:
print_response.print_gradingOutcome(batch_grading_test_gpt35, grading_colName = f"{promptStyle}_text")

In [23]:
#calculate the price of this test run
calc_price.Calc_Price(batch_grading_test_gpt35[promptStyle], modelUsed= 'gpt35')

0.01088

### This function automates the grading and extract of information process to avoid code copying errors.

In [71]:
def do_grading(promptStyle : str, llm_chain, response_df : pd.DataFrame, name_append = '', nItems = nRubricItems):
    colName = promptStyle if name_append == "" else f"{promptStyle}_{name_append}"
    response_df[colName] = response_df['response'].apply(grade_by_llmChain, grading_input = grading_conditions_noparser[promptStyle], chain = llm_chain, nItems = nItems)
    response_df = extract_info(response_df, colName, nItems = nItems)

In [73]:
do_grading(promptStyle=grading_condition_names[1], llm_chain=grading_chain_gpt35, response_df= batch_grading_test_gpt35)

In [None]:
batch_grading_test_gpt35

In [35]:
batch_grading_test_gpt35.to_csv(f"./data_chatAPI/{project_folder}/grading_test_gpt35.csv")

In [18]:
do_grading(promptStyle=grading_condition_names[4], llm_chain=grading_chain_gpt35, response_df= batch_grading_test_gpt35)

In [None]:
batch_grading_test_gpt35

In [None]:
print_response.print_gradingOutcome(batch_grading_test_gpt35,'detailed_compare_2')

## Grade all responses using GPT-35

In [39]:
# run this line only once as it creates a new variable
#full_grading_gpt35 = student_responses.copy() #use copy to create a new variable with new variable id.

In [45]:
grading_condition_names

['naive_cot_1', 'naive_cot_2', 'detailed_compare', 'forced_compare']

In [40]:
#naive COT
do_grading(promptStyle=grading_condition_names[0], llm_chain=grading_chain_gpt35, response_df= full_grading_gpt35)

proper grading output not found, re-do grading again.
proper grading output not found, re-do grading again.
proper grading output not found, re-do grading again.


Naive COT took 8m 26.7s to grade.

In [41]:
calc_price.Calc_Price(full_grading_gpt35[grading_condition_names[0]], modelUsed="gpt35")

0.14133

In [42]:
full_grading_gpt35.to_csv(f"./data_chatAPI/{project_folder}/full_grading_gpt35.csv")

In [44]:
do_grading(promptStyle=grading_condition_names[1], llm_chain=grading_chain_gpt35, response_df= full_grading_gpt35)

In [49]:
calc_price.Calc_Price(full_grading_gpt35[grading_condition_names[1]], modelUsed="gpt35")


0.188128

In [50]:
full_grading_gpt35.to_csv(f"./data_chatAPI/{project_folder}/full_grading_gpt35.csv")

In [51]:
do_grading(promptStyle=grading_condition_names[2], llm_chain=grading_chain_gpt35, response_df= full_grading_gpt35)

proper grading output not found, re-do grading again.


In [None]:
calc_price.Calc_Price(full_grading_gpt35[grading_condition_names[2]], modelUsed="gpt35")


In [54]:
do_grading(promptStyle=grading_condition_names[3], llm_chain=grading_chain_gpt35, response_df= full_grading_gpt35)

In [55]:
full_grading_gpt35.to_csv(f"./data_chatAPI/{project_folder}/full_grading_gpt35.csv")

In [None]:
full_grading_gpt35

## The code below grades with gpt-4o model

In [21]:
#setup the llm connection with gpt-4o. 
#empirically, temperatur 0.8 seems to be a good balance. This is of course a guess. 
#only 4 stop sequences are allowed for Azure via API. This is an API issue. Under chat API this seems to not be a problem.
gpt4_model = AzureChatOpenAI(    
    api_version="2024-02-01", #latest stable version
    deployment_name = "zchen-test-gpt-4o",
    max_tokens = 1000,
    temperature= 0.8,
    #model_version = '0301',
    model_kwargs= {"stop": ["<|im_end|>", "Student response:", "Grading:", "Example "]}
    )

#define the grading chain. If parsing is needed, it can be connected to the output parser
grading_chain_gpt4o = prompt_template_noformatting | gpt4_model

Test grading of multiple answers

In [27]:
test_grading_gpt4o = randomResponse_5.copy()

In [28]:
grading_condition_names

['naive_cot_1', 'naive_cot_2', 'detailed_compare', 'forced_compare']

In [29]:
promptStyle = grading_condition_names[2] # detailed_compare
test_grading_gpt4o[promptStyle] = test_grading_gpt4o['response'].apply(grade_by_llmChain, grading_input = grading_conditions_noparser[promptStyle], chain = grading_chain_gpt4o, nItems = nRubricItems) #grade
test_grading_gpt4o = extract_info(test_grading_gpt4o, promptStyle, nItems=nRubricItems)


In [None]:
test_grading_gpt4o

In [None]:
print_response.print_gradingOutcome(test_grading_gpt4o, grading_colName=f'{promptStyle}_text')

GPT-4o did a really good job of grading the 5 response. All of them are actually correct, even for the edge case where the verbal expression is confusing.
However, it took 29 seconds (as opposed to 7.9s) and costs 4 times as much as GPT 35.

In [36]:
#save for reproducibility
test_grading_gpt4o.to_csv(f"./data_chatAPI/{project_folder}/grading_test_gpt4o.csv")

In [37]:
#calculate the cost of running
#Note: I need a utility to calculate the price of multiple runs, and output a list
calc_price.Calc_Price(test_grading_gpt4o['detailed_compare'], 'gpt4o')
#calc_price.Calc_Price(test_grading_gpt4o['forced_compare'], 'gpt4o')

0.04315

Upload the identified more problematic items as a test

In [22]:
problematic_items = pd.read_csv("./data_chatAPI/Final_Q10/diff_both_low_entropy.csv")

In [None]:
problematic_items

In [None]:
problematic_items['response'][0]

In [70]:
grade_by_llmChain(problematic_items['response'][10], grading_input= grading_conditions_noparser['detailed_compare_2'], chain= grading_chain_gpt4o)

AIMessage(content='Let\'s evaluate the student response against each rubric item:\n\n### Item 1: Conservation of Mechanical Energy\nThe student wrote down the kinetic energy and potential energy terms, such as 0.5mv^2, 1/2kx^2, and mgh. Although the student did not explicitly mention "Mechanical Energy" or write an equation such as MEi = MEf, the use of these terms indicates an understanding of energy conservation.\n- **Satisfies Item 1:** Yes\n- **Binary Score:** 1\n\n### Item 2: Inclusion of Both Gravitational and Elastic Potential Energy\nThe student included both gravitational potential energy (PE_g = mgh) and elastic potential energy (PE_s = 1/2k(L-L0)^2) terms in their explanation.\n- **Satisfies Item 2:** Yes\n- **Binary Score:** 1\n\n### Item 3: Modification to the Height in Gravitational Potential Energy\nThe student calculated the gravitational potential energy as mgh without modifying the height based on the compression of the spring. The term used is just mgh (6.05), which 

In [45]:
grading_condition_names[4]

'detailed_compare_2'

In [60]:
do_grading(promptStyle=grading_condition_names[4], llm_chain=grading_chain_gpt4o, response_df=problematic_items)

In [None]:
problematic_items

In [None]:
print_response.print_gradingOutcome(problematic_items, "detailed_compare_2_text")

## Using GPT-4o to do full grading now

In [57]:
#The following line is run only once at the initiation of the process
#full_grading_gpt4o = student_responses.copy() #Need to use the copy method to create a different variable, not an alias of the same variable.

In [59]:
do_grading(promptStyle=grading_condition_names[0], llm_chain=grading_chain_gpt4o, response_df=full_grading_gpt4o)

proper grading output not found, re-do grading again.
proper grading output not found, re-do grading again.
proper grading output not found, re-do grading again.
proper grading output not found, re-do grading again.
proper grading output not found, re-do grading again.


In [58]:
do_grading(promptStyle=grading_condition_names[1], llm_chain=grading_chain_gpt4o, response_df=full_grading_gpt4o)

proper grading output not found, re-do grading again.
proper grading output not found, re-do grading again.
proper grading output not found, re-do grading again.
proper grading output not found, re-do grading again.
proper grading output not found, re-do grading again.
proper grading output not found, re-do grading again.
proper grading output not found, re-do grading again.


In [63]:
calc_price.Calc_Price(full_grading_gpt4o[grading_condition_names[0]], modelUsed='gpt4o')
calc_price.Calc_Price(full_grading_gpt4o[grading_condition_names[1]], modelUsed='gpt4o')

0.7623699999999999

GPT-4o took 18 minutes to perform teach round of grading, and costs a little less than a dollar for each run.

In [64]:
do_grading(promptStyle=grading_condition_names[2], llm_chain=grading_chain_gpt4o, response_df= full_grading_gpt4o)

proper grading output not found, re-do grading again.
proper grading output not found, re-do grading again.
proper grading output not found, re-do grading again.
proper grading output not found, re-do grading again.
proper grading output not found, re-do grading again.


In [65]:
do_grading(promptStyle=grading_condition_names[3], llm_chain=grading_chain_gpt4o, response_df= full_grading_gpt4o)

proper grading output not found, re-do grading again.


In [72]:
do_grading(promptStyle=grading_condition_names[4], llm_chain=grading_chain_gpt4o, response_df= full_grading_gpt4o)

proper grading output not found, re-do grading again.
proper grading output not found, re-do grading again.
proper grading output not found, re-do grading again.


In [None]:
full_grading_gpt4o

In [78]:
calc_price.Calc_Price(full_grading_gpt4o['detailed_compare_2'], modelUsed='gpt4o')

0.7608900000000001

In [74]:

#save to csv file
full_grading_gpt4o.to_csv(f"./data_chatAPI/{project_folder}/full_grading_gpt4o.csv")

## Self-consistency run with detailed compare

In [76]:
self_consistency = student_responses.copy()

In [85]:
#do_grading(promptStyle=grading_condition_names[2], llm_chain=grading_chain_gpt4o, response_df= self_consistency, name_append="run_1") #first run
#do_grading(promptStyle=grading_condition_names[2], llm_chain=grading_chain_gpt4o, response_df= self_consistency, name_append="run_2") #second run 
#do_grading(promptStyle=grading_condition_names[2], llm_chain=grading_chain_gpt4o, response_df= self_consistency, name_append="run_3") #third run
#do_grading(promptStyle=grading_condition_names[2], llm_chain=grading_chain_gpt4o, response_df= self_consistency, name_append="run_4") #fourth run
#do_grading(promptStyle=grading_condition_names[2], llm_chain=grading_chain_gpt4o, response_df= self_consistency, name_append="run_5") #fifth run


proper grading output not found, re-do grading again.
proper grading output not found, re-do grading again.


Do detailed compare rubric 2. 

In [83]:
#do_grading(promptStyle=grading_condition_names[4], llm_chain=grading_chain_gpt4o, response_df= self_consistency, name_append="run_1") #first run
#do_grading(promptStyle=grading_condition_names[4], llm_chain=grading_chain_gpt4o, response_df= self_consistency, name_append="run_2") #second run 
#do_grading(promptStyle=grading_condition_names[4], llm_chain=grading_chain_gpt4o, response_df= self_consistency, name_append="run_3") #third run
#do_grading(promptStyle=grading_condition_names[4], llm_chain=grading_chain_gpt4o, response_df= self_consistency, name_append="run_4") #fourth run
#do_grading(promptStyle=grading_condition_names[4], llm_chain=grading_chain_gpt4o, response_df= self_consistency, name_append="run_5") #fifth run

proper grading output not found, re-do grading again.


In [None]:
self_consistency

In [85]:
self_consistency.to_csv(f"./data_chatAPI/{project_folder}/self_consistency_grading.csv")

## Serializing variables using pickle

In [86]:
#use the globalVars parameter to allow the function to access the current global variables list.
try:
    pickle_tools.save_as_pickle(
        variables=[student_responses, 
                   randomResponse_5, 
                   prompt_components_dict, 
                   prompt_dict, 
                   prompt_template_noformatting, 
                   grading_conditions_noparser, 
                   grading_condition_names,
                   do_grading, 
                   extract_info,
                   batch_grading_test_gpt35,
                   full_grading_gpt35, 
                   test_grading_gpt4o,
                   problematic_items,
                   full_grading_gpt4o,
                   self_consistency
                   ], 
        folderName = project_folder,
        globalVars=globals())
except IndexError as err:
    print(err)

print(pickled_varNames)

['student_responses', 'randomResponse_5', 'prompt_components_dict', 'prompt_dict', 'prompt_template_noformatting', 'grading_conditions_noparser', 'grading_condition_names', 'do_grading', 'extract_info', 'batch_grading_test_gpt35', 'full_grading_gpt35', 'test_grading_gpt4o', 'problematic_items', 'full_grading_gpt4o', 'self_consistency']
