# Testing LLMs on Chinese National Social Work Examination

## Imports and Setup

In [None]:
import pandas as pd
from openai import OpenAI
import time
from datetime import datetime
import glob
from typing import Dict
import os
from dotenv import load_dotenv
import qianfan
import anthropic
from mistralai import Mistral
import google.generativeai as genai 
from pathlib import Path
import json

## API configuration for foundation models

In this section we configure a total of eight foundation (web-based) models: four Chinese models and four Western-centric models.

In [2]:
# Credentials are read from the process environment; load_dotenv() is called
# first so that values from a .env file are available to the lookups below.
load_dotenv()

# Sampling parameters shared by every model entry (the same dict object is
# referenced from each entry's "settings" key).
COMMON_SETTINGS = dict(
    temperature=0,  # Keep 0 for getting the most consistent answers
    max_tokens=450,  # Comfortable space for answer + 150 characters explanation
    frequency_penalty=0.1,  # Small penalty for better explanation readability
    presence_penalty=0,  # Keep 0 to stay focused on the question topic
)

# Per-model connection details, keyed by the short model name used when
# calling the processing functions.
MODEL_CONFIGS = dict(
    kimi=dict(
        api_key=os.environ.get("MOONSHOT_API_KEY"),
        base_url="https://api.moonshot.cn/v1",
        model_name="moonshot-v1-8k",
        settings=COMMON_SETTINGS,
    ),
    deepseek=dict(
        api_key=os.environ.get("DEEPSEEK_API_KEY"),
        base_url="https://api.deepseek.com",
        model_name="deepseek-chat",
        settings=COMMON_SETTINGS,
    ),
    qwen=dict(
        api_key=os.environ.get("QWEN_API_KEY"),
        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
        model_name="qwen-max",
        settings=COMMON_SETTINGS,
    ),
    # ERNIE authenticates with an access/secret key pair instead of an API key
    ernie=dict(
        access_key=os.environ.get("QIANFAN_ACCESS_KEY"),
        secret_key=os.environ.get("QIANFAN_SECRET_KEY"),
        model_name="ERNIE-4.0-8K",
        settings=COMMON_SETTINGS,
    ),
    gpt=dict(
        api_key=os.environ.get("OPENAI_API_KEY"),
        base_url=None,
        model_name="gpt-4o",
        settings=COMMON_SETTINGS,
    ),
    anthropic=dict(
        api_key=os.environ.get("ANTHROPIC_API_KEY"),
        base_url="https://api.anthropic.com/v1",
        model_name="claude-3-5-sonnet-20241022",
        settings=COMMON_SETTINGS,
    ),
    mistral=dict(
        api_key=os.environ.get("MISTRAL_API_KEY"),
        base_url="https://api.mistral.ai/v1/chat/completions",
        model_name="mistral-large-latest",
        settings=COMMON_SETTINGS,
    ),
    # Gemini has no base_url entry
    gemini=dict(
        api_key=os.environ.get("GEMINI_API_KEY"),
        model_name="gemini-1.5-pro",
        settings=COMMON_SETTINGS,
    ),
)

## Prompt Configurations

This code defines a structured configuration for an exam-taking assistant system in Chinese. The configurations are organized into two distinct scenarios: single-choice questions and multiple-choice questions. For each type, there's a system prompt that establishes the AI's role as an exam expert and specifies the exact response format, followed by a user prompt template that will be filled with the actual question and answer choices. The general configuration allows for flexible answer selection, the single-choice configuration enforces selection of exactly one answer while explaining why others were rejected, and the multiple-choice configuration requires selecting all correct answers with justification for both selected and unselected options. Each prompt template requires responses to follow a strict format with the selected answer(s) and a roughly 150-character explanation of the reasoning. The templates use string formatting placeholders for dynamic insertion of questions and answer choices.

In [None]:
def load_prompts(prompts_dir="/user_prompts"):
    """Load every ``.txt`` file in *prompts_dir* as a prompt template.

    Each file becomes one entry keyed by its filename without the ``.txt``
    extension. A file may hold a system prompt and a user prompt separated by
    a line containing only ``---``; only the FIRST such separator splits the
    file, so the user prompt itself may contain further ``---`` lines. Files
    without a separator are treated as a user prompt and paired with a
    default system prompt.

    Args:
        prompts_dir: Directory that is scanned (non-recursively) for ``.txt`` files.

    Returns:
        dict mapping prompt name -> {"system": str, "user": str}, with both
        strings stripped of surrounding whitespace.

    Raises:
        OSError: If *prompts_dir* cannot be listed (e.g. it does not exist).
        Failures to read or decode an individual file are reported with
        ``print`` and that file is skipped.
    """
    delimiter = '\n---\n'
    # Default system prompt if not specified in file
    default_system_prompt = "你是一位精通中国大陆社会政策和社会工作领域的专家, 你正在参加中国社会工作者职业水平考试。你只能以指定的JSON格式回答，不能有任何其他对话或说明。"
    prompts = {}

    # Sorted so prompts are loaded and reported in a stable order
    txt_files = sorted(f for f in os.listdir(prompts_dir) if f.endswith('.txt'))

    for filename in txt_files:
        file_path = os.path.join(prompts_dir, filename)
        prompt_key = filename[:-4]  # Remove .txt extension

        # Only the file read can fail here, so only that is guarded
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read().strip()
        except (OSError, UnicodeError) as e:
            print(f"Error loading {filename}: {str(e)}")
            continue

        if delimiter in content:
            # maxsplit=1: extra separators stay inside the user prompt
            # instead of breaking the two-value unpacking
            system_prompt, user_prompt = content.split(delimiter, 1)
        else:
            system_prompt, user_prompt = default_system_prompt, content

        prompts[prompt_key] = {
            "system": system_prompt.strip(),
            "user": user_prompt.strip()
        }
        print(f"Loaded prompt: {prompt_key}")

    return prompts

# Load all prompt templates from the default prompts directory when this cell runs;
# the processing functions look prompts up in this dict by prompt name.
EXAM_PROMPTS = load_prompts()


## Response parsing

In [4]:
def parse_model_response(response: str, prompt_name: str = "") -> tuple:
    """Extract the answer fields from a model reply containing a JSON object.

    The reply may wrap the JSON in extra text (e.g. a Markdown code fence);
    everything from the first ``{`` to the last ``}`` is parsed as JSON and
    the Chinese keys ``答案`` (answer), ``理由`` (reason) and ``信心``
    (confidence) are read from it. Missing keys yield ``''``.

    Args:
        response: Raw text returned by the model.
        prompt_name: Name of the prompt that produced the reply. Prompts whose
            name starts with ``"condition_4"`` have no confidence field.

    Returns:
        ``(answer, reason)`` for ``condition_4*`` prompts, otherwise
        ``(answer, reason, confidence)``.

    Raises:
        ValueError: If the reply contains no ``{...}`` span or the span is not
            valid JSON. (Previously invalid JSON made the function return
            ``None``, which surfaced as an unrelated unpacking error in callers.)
    """
    try:
        text = response.strip()

        # Look for JSON structure
        start_idx = text.find('{')
        end_idx = text.rfind('}')
        # Both braces must exist and be in order; rfind() returns -1 when absent
        if start_idx == -1 or end_idx < start_idx:
            raise ValueError("No JSON structure found in response")

        try:
            parsed = json.loads(text[start_idx:end_idx + 1])
        except json.JSONDecodeError as je:
            raise ValueError(f"Invalid JSON in response: {je}") from je

        answer = parsed.get('答案', '')
        reason = parsed.get('理由', '')
        if prompt_name.startswith("condition_4"):
            return answer, reason
        return answer, reason, parsed.get('信心', '')

    except Exception as e:
        # Log here so the failure is visible even when the caller only reports a summary
        print(f"\nError in parse_model_response: {str(e)}")
        raise

In [5]:
class ResultCreator:
    """Builds one flat result record per exam question for a fixed model/prompt run."""

    def __init__(self, model_name, prompt_name, timestamp):
        # Run-level metadata copied into every record
        self.timestamp = timestamp
        self.prompt_name = prompt_name
        self.model_name = model_name

    def create_dict(self, row, model_response):
        """Combine an exam row with the parsed model reply.

        Args:
            row: Mapping-like exam row providing 'Question_ID', 'Question',
                'Selections', 'Answer' and 'Explanation'.
            model_response: Raw model reply, handed to parse_model_response.

        Returns:
            dict with the question fields, run metadata, 'Model_Answer' and
            'Model_Explanation'; 'Model_Confidence' is added unless the prompt
            name starts with "condition_4".
        """
        record = dict(
            Question_ID=row['Question_ID'],
            Question=row['Question'],
            Selections=row['Selections'],
            Correct_Answer=row['Answer'],
            Official_Explanation=row['Explanation'],
            Model=self.model_name,
            Prompt=self.prompt_name,
            Timestamp=self.timestamp,
        )

        # condition_4 prompts yield a 2-tuple (no confidence); all others a 3-tuple
        without_confidence = self.prompt_name.startswith("condition_4")
        parsed = parse_model_response(model_response, self.prompt_name)

        if without_confidence:
            answer, explanation = parsed
            record['Model_Answer'] = answer
            record['Model_Explanation'] = explanation
            return record

        answer, explanation, confidence = parsed
        record['Model_Answer'] = answer
        record['Model_Explanation'] = explanation
        record['Model_Confidence'] = confidence
        return record

In [6]:
def process_exam(exam_df: pd.DataFrame, model_name: str, prompt_name: str, df_name: str = "sample") -> pd.DataFrame:
    """Run every question in *exam_df* through a model served via the OpenAI client.

    Args:
        exam_df: Exam questions; each row must provide the columns read here
            ('Question', 'Selections', 'Question_ID') and by ResultCreator.
        model_name: Key into MODEL_CONFIGS.
        prompt_name: Key into EXAM_PROMPTS.
        df_name: Label for the dataset, used only in the output filename.

    Returns:
        DataFrame with one row per successfully processed question. The same
        data is written to
        ``frontier_results_datafiles/results_{df_name}_{model_name}_{prompt_name}.csv``.

    Raises:
        ValueError: If *model_name* or *prompt_name* is not configured.
        Per-question failures are printed and the question is skipped.
    """
    # Reject unknown names before creating a client
    if model_name not in MODEL_CONFIGS:
        raise ValueError(f"Model '{model_name}' not found. Available models: {list(MODEL_CONFIGS.keys())}")
    if prompt_name not in EXAM_PROMPTS:
        raise ValueError(f"Prompt '{prompt_name}' not found. Available prompts: {list(EXAM_PROMPTS.keys())}")

    config = MODEL_CONFIGS[model_name]
    template = EXAM_PROMPTS[prompt_name]

    # "gpt" uses the client's default endpoint; every other model passes its own base_url
    client_kwargs = {"api_key": config["api_key"]}
    if model_name != "gpt":
        client_kwargs["base_url"] = config["base_url"]
    client = OpenAI(**client_kwargs)

    run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    question_count = len(exam_df)
    print(f"Starting to process {question_count} questions...")

    result_creator = ResultCreator(model_name, prompt_name, run_stamp)
    collected = []

    for _, row in exam_df.iterrows():
        try:
            user_prompt = template["user"].format(
                question=row['Question'],
                selections=row['Selections'],
            )
            chat_messages = [
                {"role": "system", "content": template["system"]},
                {"role": "user", "content": user_prompt},
            ]
            completion = client.chat.completions.create(
                model=config["model_name"],
                messages=chat_messages,
                **config["settings"],
            )
            reply_text = completion.choices[0].message.content
            collected.append(result_creator.create_dict(row, reply_text))
        except Exception as e:
            # One bad question (API error, unparsable reply) must not abort the run
            print(f"Error processing question {row['Question_ID']}: {str(e)}")

    output_dir = "frontier_results_datafiles"
    os.makedirs(output_dir, exist_ok=True)
    filename = os.path.join(output_dir, f"results_{df_name}_{model_name}_{prompt_name}.csv")
    results_df = pd.DataFrame(collected)
    results_df.to_csv(filename, index=False, encoding='utf-8-sig')
    print(f"\nResults saved to: {filename}")
    print(f"Total questions processed: {len(collected)}/{question_count}")

    return results_df

In [7]:
def process_exam_other(exam_df: pd.DataFrame, model_name: str, prompt_name: str, df_name: str = "sample") -> pd.DataFrame:
    """Process an exam with ERNIE, Claude, Mistral or Gemini via their own SDKs.

    Args:
        exam_df: Exam questions; each row must provide the columns read here
            ('Question', 'Selections', 'Question_ID') and by ResultCreator.
        model_name: One of "ernie", "anthropic", "mistral" or "gemini".
        prompt_name: Key into EXAM_PROMPTS.
        df_name: Label for the dataset, used only in the output filename.

    Returns:
        DataFrame with one row per successfully processed question. The same
        data is written to
        ``frontier_results_datafiles/results_{df_name}_{model_name}_{prompt_name}.csv``.

    Raises:
        ValueError: If *model_name* is not handled by this function or
            *prompt_name* is not loaded.
        Per-question failures are printed and the question is skipped.
    """
    print(f"\nStarting process with model: {model_name}, prompt: {prompt_name}")

    # Validate up front: without this an unsupported model leaves `client`
    # unbound and every question fails, silently producing an empty CSV.
    supported_models = ("ernie", "anthropic", "mistral", "gemini")
    if model_name not in supported_models:
        raise ValueError(
            f"Model '{model_name}' is not supported by process_exam_other. "
            f"Supported models: {list(supported_models)}"
        )
    if prompt_name not in EXAM_PROMPTS:
        raise ValueError(f"Prompt '{prompt_name}' not found. Available prompts: {list(EXAM_PROMPTS.keys())}")

    # Setup
    template = EXAM_PROMPTS[prompt_name]
    config = MODEL_CONFIGS[model_name]
    result_creator = ResultCreator(model_name, prompt_name, datetime.now().strftime("%Y%m%d_%H%M%S"))
    results = []

    # Initialize client
    if model_name == "ernie":
        # Credentials are set as environment variables before the qianfan client is created
        os.environ["QIANFAN_ACCESS_KEY"] = config["access_key"]
        os.environ["QIANFAN_SECRET_KEY"] = config["secret_key"]
        client = qianfan.ChatCompletion()
    elif model_name == "anthropic":
        client = anthropic.Anthropic(api_key=config["api_key"])
    elif model_name == "mistral":
        client = Mistral(api_key=config["api_key"])
    else:  # gemini
        genai.configure(api_key=config["api_key"])
        # Model name comes from the config rather than a duplicated literal;
        # the system prompt is bound to the model object once, not sent per request
        client = genai.GenerativeModel(config["model_name"],
                                       system_instruction=template["system"])

    print(f"Starting to process {len(exam_df)} questions...")

    for _, row in exam_df.iterrows():
        try:
            user_prompt = template["user"].format(
                question=row['Question'],
                selections=row['Selections']
            )

            # Get model response based on model type
            if model_name == "ernie":
                # NOTE(review): ERNIE uses a hard-coded temperature of 0.1 instead of
                # config["settings"]["temperature"] — confirm this is intentional
                response = client.do(
                    model=config["model_name"],
                    system=template["system"],
                    messages=[{"role": "user", "content": user_prompt}],
                    temperature=0.1,
                )
                model_response = response["body"]["result"]

            elif model_name == "anthropic":
                response = client.messages.create(
                    model=config["model_name"],
                    messages=[{"role": "user", "content": user_prompt}],
                    system=template["system"],
                    temperature=config["settings"]["temperature"],
                    max_tokens=config["settings"]["max_tokens"]
                )
                model_response = response.content[0].text

            elif model_name == "mistral":
                # The system prompt is prepended to the user message rather than
                # sent as a separate system-role message
                response = client.chat.complete(
                    model=config["model_name"],
                    messages=[{"role": "user", "content": f"{template['system']}\n\n{user_prompt}"}],
                    temperature=config["settings"]["temperature"],
                    max_tokens=config["settings"]["max_tokens"]
                )
                model_response = response.choices[0].message.content

            else:  # gemini
                response = client.generate_content(
                    user_prompt,
                    generation_config={
                        "temperature": config["settings"]["temperature"],
                        "max_output_tokens": config["settings"]["max_tokens"]
                    }
                )
                model_response = response.text

            results.append(result_creator.create_dict(row, model_response))

        except Exception as e:
            # One bad question (API error, unparsable reply) must not abort the run
            print(f"Error processing question {row['Question_ID']}: {str(e)}")

    # Save results
    results_df = pd.DataFrame(results)
    os.makedirs("frontier_results_datafiles", exist_ok=True)
    filename = os.path.join("frontier_results_datafiles", f"results_{df_name}_{model_name}_{prompt_name}.csv")
    results_df.to_csv(filename, index=False, encoding='utf-8-sig')
    print(f"\nProcessed {len(results)}/{len(exam_df)} questions")

    return results_df

## Usage
Models are tested under the following conditions on both the Jurisprudence and Applied Knowledge sections:<br/>
- Condition 1: Models received explicit instructions regarding question type (single-select or SATA) and were directed to answer every question<br/>
- Condition 2: While maintaining similar procedures to Condition 1, models could skip questions they couldn't answer with reasonable confidence<br/>
- Condition 3: Models are presented with only answer options, withholding the actual question stems.

In [None]:
# Read both 2023 exam papers from the exam_datafiles directory
policy_exam_df = pd.read_csv('exam_datafiles/2023_policy_exam.csv')
comprehensive_exam_df = pd.read_csv('exam_datafiles/2023_comprehensive_exam.csv')

# Split each paper by position: the first 60 rows form the "single" subset,
# all remaining rows form the "multiple" subset
policy_exam_single, policy_exam_multiple = policy_exam_df.iloc[:60], policy_exam_df.iloc[60:]
comprehensive_exam_single, comprehensive_exam_multiple = (
    comprehensive_exam_df.iloc[:60],
    comprehensive_exam_df.iloc[60:],
)

# Quick sanity summary of what was loaded
print("Policy Exam Questions:", len(policy_exam_df))
print("Comprehensive Exam Questions:", len(comprehensive_exam_df))
print("Available prompts:", list(EXAM_PROMPTS.keys()))

##### Condition 1 - with Jurisprudence(Policy) Questions

In [None]:
# Run settings for this cell
df_name = "policy"
model_names = ["deepseek", "gpt", "qwen", "kimi"]
exam_dfs = [policy_exam_multiple, policy_exam_single]
# Prompt names, listed in the same order as exam_dfs
prompt_names = ["condition_1_multiple", "condition_1_single"]

# Each subset (multiple first, then single) is run through every model
for exam_df, prompt_name in zip(exam_dfs, prompt_names):
    for model_name in model_names:
        process_exam(exam_df=exam_df, model_name=model_name, prompt_name=prompt_name, df_name=df_name)

In [None]:
# Define the variables for the function parameters
exam_dfs = [policy_exam_multiple, policy_exam_single]
# Each model name must be its own list element: a missing comma between two
# string literals silently concatenates them into one unknown model name
model_names = ["mistral", "gemini", "ernie", "anthropic"]
df_name = "policy"

# Loop through all combinations and execute the function
for exam_df in exam_dfs:
    # Set prompt_name based on the DataFrame being used
    prompt_name = "condition_1_single" if exam_df is policy_exam_single else "condition_1_multiple"

    for model_name in model_names:
        process_exam_other(exam_df=exam_df, model_name=model_name, prompt_name=prompt_name, df_name=df_name)

##### Condition 1 - with Applied Skills(Comprehensive) Questions

In [None]:
# Run settings for this cell
df_name = "comprehensive"
model_names = ["qwen", "deepseek", "gpt", "kimi"]
exam_dfs = [comprehensive_exam_multiple, comprehensive_exam_single]
# Prompt names, listed in the same order as exam_dfs
prompt_names = ["condition_1_multiple", "condition_1_single"]

# Each subset (multiple first, then single) is run through every model
for exam_df, prompt_name in zip(exam_dfs, prompt_names):
    for model_name in model_names:
        process_exam(exam_df=exam_df, model_name=model_name, prompt_name=prompt_name, df_name=df_name)

In [None]:
# Run settings for this cell
df_name = "comprehensive"
model_names = ["mistral", "ernie", "anthropic", "gemini"]
exam_dfs = [comprehensive_exam_multiple, comprehensive_exam_single]
# Prompt names, listed in the same order as exam_dfs
prompt_names = ["condition_1_multiple", "condition_1_single"]

# Each subset (multiple first, then single) is run through every model
for exam_df, prompt_name in zip(exam_dfs, prompt_names):
    for model_name in model_names:
        process_exam_other(exam_df=exam_df, model_name=model_name, prompt_name=prompt_name, df_name=df_name)

##### Condition 2 - with Jurisprudence(Policy) Questions

In [None]:
# Run settings for this cell
df_name = "policy"
model_names = ["qwen", "kimi", "deepseek", "gpt"]
exam_dfs = [policy_exam_multiple, policy_exam_single]
# Prompt names, listed in the same order as exam_dfs
prompt_names = ["condition_2_multiple", "condition_2_single"]

# Each subset (multiple first, then single) is run through every model
for exam_df, prompt_name in zip(exam_dfs, prompt_names):
    for model_name in model_names:
        process_exam(exam_df=exam_df, model_name=model_name, prompt_name=prompt_name, df_name=df_name)

In [None]:
# Run settings for this cell
df_name = "policy"
model_names = ["ernie", "mistral", "anthropic", "gemini"]
exam_dfs = [policy_exam_multiple, policy_exam_single]
# Prompt names, listed in the same order as exam_dfs
prompt_names = ["condition_2_multiple", "condition_2_single"]

# Each subset (multiple first, then single) is run through every model
for exam_df, prompt_name in zip(exam_dfs, prompt_names):
    for model_name in model_names:
        process_exam_other(exam_df=exam_df, model_name=model_name, prompt_name=prompt_name, df_name=df_name)

##### Condition 2 - with Applied Skills(Comprehensive) Questions

In [None]:
# Run settings for this cell
df_name = "comprehensive"
model_names = ["qwen", "kimi", "deepseek", "gpt"]
exam_dfs = [comprehensive_exam_multiple, comprehensive_exam_single]
# Prompt names, listed in the same order as exam_dfs
prompt_names = ["condition_2_multiple", "condition_2_single"]

# Each subset (multiple first, then single) is run through every model
for exam_df, prompt_name in zip(exam_dfs, prompt_names):
    for model_name in model_names:
        process_exam(exam_df=exam_df, model_name=model_name, prompt_name=prompt_name, df_name=df_name)

In [None]:
# Run settings for this cell
df_name = "comprehensive"
model_names = ["ernie", "mistral", "anthropic", "gemini"]
exam_dfs = [comprehensive_exam_multiple, comprehensive_exam_single]
# Prompt names, listed in the same order as exam_dfs
prompt_names = ["condition_2_multiple", "condition_2_single"]

# Each subset (multiple first, then single) is run through every model
for exam_df, prompt_name in zip(exam_dfs, prompt_names):
    for model_name in model_names:
        process_exam_other(exam_df=exam_df, model_name=model_name, prompt_name=prompt_name, df_name=df_name)

##### Condition 3 - with Jurisprudence(Policy) Questions

In [None]:
# Run settings for this cell
df_name = "policy"
model_names = ["qwen", "kimi", "deepseek", "gpt"]
exam_dfs = [policy_exam_multiple, policy_exam_single]
# Prompt names, listed in the same order as exam_dfs
prompt_names = ["condition_3_multiple", "condition_3_single"]

# Each subset (multiple first, then single) is run through every model
for exam_df, prompt_name in zip(exam_dfs, prompt_names):
    for model_name in model_names:
        process_exam(exam_df=exam_df, model_name=model_name, prompt_name=prompt_name, df_name=df_name)

In [None]:
# Run settings for this cell
df_name = "policy"
model_names = ["ernie", "mistral", "anthropic", "gemini"]
exam_dfs = [policy_exam_multiple, policy_exam_single]
# Prompt names, listed in the same order as exam_dfs
prompt_names = ["condition_3_multiple", "condition_3_single"]

# Each subset (multiple first, then single) is run through every model
for exam_df, prompt_name in zip(exam_dfs, prompt_names):
    for model_name in model_names:
        process_exam_other(exam_df=exam_df, model_name=model_name, prompt_name=prompt_name, df_name=df_name)

##### Condition 3 - with Applied Skills(Comprehensive) Questions

In [None]:
# Run settings for this cell
df_name = "comprehensive"
model_names = ["qwen", "kimi", "deepseek", "gpt"]
exam_dfs = [comprehensive_exam_multiple, comprehensive_exam_single]
# Prompt names, listed in the same order as exam_dfs
prompt_names = ["condition_3_multiple", "condition_3_single"]

# Each subset (multiple first, then single) is run through every model
for exam_df, prompt_name in zip(exam_dfs, prompt_names):
    for model_name in model_names:
        process_exam(exam_df=exam_df, model_name=model_name, prompt_name=prompt_name, df_name=df_name)

In [None]:
# Run settings for this cell
df_name = "comprehensive"
model_names = ["ernie", "mistral", "anthropic", "gemini"]
exam_dfs = [comprehensive_exam_multiple, comprehensive_exam_single]
# Prompt names, listed in the same order as exam_dfs
prompt_names = ["condition_3_multiple", "condition_3_single"]

# Each subset (multiple first, then single) is run through every model
for exam_df, prompt_name in zip(exam_dfs, prompt_names):
    for model_name in model_names:
        process_exam_other(exam_df=exam_df, model_name=model_name, prompt_name=prompt_name, df_name=df_name)