# Qualitative Analysis of LLM Performance

This notebook analyzes how different LLMs perform on the Chinese Social Work Licensing Exam questions. The workflow has four steps:
1. Load and clean the raw results data
2. Generate balanced samples for qualitative analysis 
3. Process the labeled results
4. Analyze distributions across exam types and question types

## Imports and Setup

In [1]:
import pandas as pd
import glob
import os
from typing import Dict

## Data Loading and Cleaning Functions

In [9]:
def clean_answer(answer: str) -> str:
    """
    Normalize an answer so that only its alphabetic characters remain.

    Spaces, commas, digits and punctuation are dropped; a missing value
    (NaN/None) is handed back unchanged.
    Example: 'A, B, C' -> 'ABC'
    """
    if pd.isna(answer):
        # Leave missing answers missing instead of turning them into ''.
        return answer
    letters = filter(str.isalpha, str(answer))
    return ''.join(letters)

def clean_confidence_value(x):
    """
    Coerce a confidence value to float.

    Strings are first reduced to their digits and decimal points (so '95%'
    becomes 95.0 and any sign is discarded). Missing values, and anything
    that still cannot be parsed as a float, yield None.
    """
    if pd.isna(x):
        return None

    candidate = x
    if isinstance(candidate, str):
        # Keep only digits and '.', dropping units, signs and whitespace.
        candidate = ''.join(
            filter(lambda ch: ch == '.' or ch.isdigit(), candidate)
        )

    try:
        return float(candidate)
    except (ValueError, TypeError):
        return None

def load_and_process_files(
    pattern: str = r'/Users/ziaqi/Library/CloudStorage/OneDrive-Personal/Data Projects 2024/chinese_sw_licensing_exam/frontier_results_datafiles/results_*.csv',
) -> pd.DataFrame:
    """
    Load the condition1 result CSVs and combine them into one dataframe.

    File names must split into exactly six underscore-separated parts, read
    here as ``results_<exam>_<model>_condition_<n>_<question type>.csv``.
    Files with another shape, or whose condition number is not ``1``, are
    skipped.

    Args:
        pattern: Glob pattern of the result files. Defaults to the original
            hard-coded location, so calling with no arguments is unchanged.

    Returns:
        A single dataframe whose ``Model_Answer``, ``Model_Confidence`` and
        ``Correct_Answer`` columns have been cleaned and which carries the
        added ``Condition``, ``Exam_Type``, ``Question_Type`` and
        ``Is_Correct`` columns. An empty dataframe if no file qualifies.
    """
    dfs = []

    # glob returns paths in an arbitrary, filesystem-dependent order. Sorting
    # keeps the concatenated row order (and so any seeded sampling performed
    # on the result) reproducible across machines and runs.
    csv_files = sorted(glob.glob(pattern))

    for file in csv_files:
        filename = os.path.basename(file)
        parts = filename.replace('.csv', '').split('_')

        if len(parts) != 6:
            print(f"Skipping {file} - parts are not matched in filename")
            continue

        # Extract required information from the filename
        exam_type = parts[1]               # policy or comprehensive
        model = parts[2]                   # qwen, mistral, etc.
        condition = parts[3] + parts[4]    # condition1, condition2, etc.
        question_type = parts[5]

        # Only condition1 files are kept. IndexError covers a filename whose
        # last part is the literal 'condition' (no number after it).
        try:
            condition_num = parts[parts.index('condition') + 1]
        except (ValueError, IndexError):
            print(f"Could not find condition number in {filename}")
            continue
        if condition_num != '1':
            continue

        print(f"Processing: {filename}")

        df = pd.read_csv(file)

        # Normalize the answer/confidence columns before comparing them
        df['Model_Answer'] = df['Model_Answer'].apply(clean_answer)
        df['Model_Confidence'] = df['Model_Confidence'].apply(clean_confidence_value)
        df['Correct_Answer'] = df['Correct_Answer'].apply(clean_answer)

        # Add the filename-derived columns
        df['Condition'] = condition
        df['Exam_Type'] = exam_type
        df['Question_Type'] = question_type
        # create_samples groups on 'Model'; fall back to the filename
        # component only when the CSV does not already provide that column.
        if 'Model' not in df.columns:
            df['Model'] = model

        # Exact match of the cleaned answers
        df['Is_Correct'] = df['Model_Answer'] == df['Correct_Answer']

        dfs.append(df)

    if not dfs:
        print("Warning: No condition1 files found")
        return pd.DataFrame()  # Return empty dataframe if no files found

    final_df = pd.concat(dfs, ignore_index=True)
    print(f"Concatenated dataframe: {len(final_df)} rows")
    return final_df


In [None]:
# Usage: build the combined condition1 dataframe that the sampling cells below consume
df = load_and_process_files()

## Generate Samples for Analysis

### Sampling

In [12]:
def create_samples(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Create the correct/incorrect sample sets for qualitative coding.

    Sampling rules, applied per model:
      * correct answers: a random sample of at most 30 rows (random_state=1);
      * incorrect answers: every row whose ``Model_Answer`` does not contain
        "跳过".

    Both results get empty ``code`` and ``comment`` columns, are sorted by
    Model, Exam_Type (ascending) and Question_Type (descending), and receive
    a ``Sample_ID`` ("S001", "S002", ...) numbered continuously across the
    correct set first and the incorrect set second.

    Args:
        df (pd.DataFrame): Input DataFrame with all records. Must contain
            ``Model``, ``Exam_Type``, ``Question_Type``, ``Model_Answer``
            and ``Is_Correct``.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: (true_samples, false_samples).
        A side with no rows is returned as an empty frame that still has
        the input columns plus ``code``, ``comment`` and ``Sample_ID``.
    """
    true_samples = []
    false_samples = []

    models = df['Model'].unique()

    for model in models:
        subset = df[df['Model'] == model]

        # Randomly sample correct answers (at most 30 per model)
        correct_answers = subset[subset['Is_Correct'].eq(True)]
        if not correct_answers.empty:
            true_samples.append(
                correct_answers.sample(n=min(30, len(correct_answers)), random_state=1)
            )

        # Keep all incorrect answers that do not contain "跳过" in "Model_Answer"
        skipped = subset['Model_Answer'].str.contains("跳过", na=False)
        incorrect_answers = subset[subset['Is_Correct'].eq(False) & ~skipped]
        if not incorrect_answers.empty:
            false_samples.append(incorrect_answers)

    # An empty side keeps the input's columns; a bare pd.DataFrame() would
    # make the sort and the per-model breakdown below fail with KeyError.
    empty_like_input = df.iloc[0:0]
    true_samples_df = (
        pd.concat(true_samples, ignore_index=True) if true_samples else empty_like_input.copy()
    )
    false_samples_df = (
        pd.concat(false_samples, ignore_index=True) if false_samples else empty_like_input.copy()
    )

    sort_columns = ['Model', 'Exam_Type', 'Question_Type']
    sort_ascending = [True, True, False]
    prepared = []
    for samples_df in (true_samples_df, false_samples_df):
        # Blank columns for the raters to fill in
        samples_df['code'] = None
        samples_df['comment'] = None
        prepared.append(
            samples_df.sort_values(by=sort_columns, ignore_index=True, ascending=sort_ascending)
        )
    true_samples_df, false_samples_df = prepared

    # Sample_IDs run continuously: correct samples first, then incorrect ones.
    n_true = len(true_samples_df)
    n_false = len(false_samples_df)
    true_samples_df['Sample_ID'] = [f"S{i:03d}" for i in range(1, n_true + 1)]
    false_samples_df['Sample_ID'] = [
        f"S{i:03d}" for i in range(n_true + 1, n_true + n_false + 1)
    ]

    # Print sampling statistics
    print("\nSampling Statistics:")
    print(f"Total True Samples: {n_true}")
    print(f"Total False Samples: {n_false}")

    # Print detailed breakdown by model
    print("\nDetailed Breakdown:")
    for model in models:
        true_count = len(true_samples_df[true_samples_df['Model'] == model])
        false_count = len(false_samples_df[false_samples_df['Model'] == model])
        print(f"\nModel: {model}")
        print(f"True Samples: {true_count}")
        print(f"False Samples: {false_count}")

    return true_samples_df, false_samples_df


In [None]:
# Usage: split the loaded records into the correct-answer and incorrect-answer sample sets
true_samples, false_samples = create_samples(df)

In [14]:
# Columns kept in the exported sample files, in output order (same for both files)
sample_columns = ['Sample_ID', 'Question_ID', 'Question', 'Selections', 'Correct_Answer', 'Model_Answer', 'Official_Explanation', 'Model_Explanation', 'Model', 'Condition', 'Exam_Type', 'Question_Type', 'Is_Correct', 'code', 'comment']
true_samples = true_samples[sample_columns]
false_samples = false_samples[sample_columns]

# to_csv does not create missing parent directories, so make sure the target exists
os.makedirs('samples', exist_ok=True)
# utf-8-sig writes a BOM at the start of each file
true_samples.to_csv('samples/sample_correct.csv', index=False, encoding='utf-8-sig')
false_samples.to_csv('samples/sample_incorrect.csv', index=False, encoding='utf-8-sig')

### Sample n=30 Records for Reliability Testing

In [None]:
# Randomly sample up to 30 records from true_samples and false_samples.
# min() guards against a set with fewer than 30 rows, where sample(n=30)
# would raise ValueError; create_samples applies the same guard.
# NOTE: the name `true_raliability` (sic) is kept because later cells refer to it.
true_raliability = true_samples.sample(n=min(30, len(true_samples)), random_state=1)
false_reliability = false_samples.sample(n=min(30, len(false_samples)), random_state=1)

# Keep only the required columns
reliability_columns = ['Sample_ID', 'Question', 'Selections', 'Correct_Answer', 'Model_Answer', 'Official_Explanation', 'Model_Explanation', 'code', 'comment']
true_raliability = true_raliability[reliability_columns]
false_reliability = false_reliability[reliability_columns]

In [None]:
# Export both reliability subsets (utf-8-sig, no index column)
for reliability_frame, reliability_path in (
    (true_raliability, 'samples/sample_correct_reliability.csv'),
    (false_reliability, 'samples/sample_incorrect_reliability.csv'),
):
    reliability_frame.to_csv(reliability_path, index=False, encoding='utf-8-sig')

### Shuffle Remaining Records and Create Sub-files for Raters

In [None]:
# Exclude the records already drawn for reliability testing, matching on Sample_ID
reliability_ids_true = true_raliability['Sample_ID']
reliability_ids_false = false_reliability['Sample_ID']

keep_true = ~true_samples['Sample_ID'].isin(reliability_ids_true)
keep_false = ~false_samples['Sample_ID'].isin(reliability_ids_false)

true_samples_rater = true_samples.loc[keep_true]
false_samples_rater = false_samples.loc[keep_false]

In [None]:
# Columns handed to the raters, in output order
rater_columns = ['Sample_ID', 'Question', 'Selections', 'Correct_Answer', 'Model_Answer', 'Official_Explanation', 'Model_Explanation', 'code', 'comment']

# Shuffle with a fixed seed, renumber the index, then trim to the rater columns
shuffled_true = true_samples_rater.sample(frac=1, random_state=77)
shuffled_false = false_samples_rater.sample(frac=1, random_state=77)

true_samples_rater = shuffled_true.reset_index(drop=True)[rater_columns]
false_samples_rater = shuffled_false.reset_index(drop=True)[rater_columns]

In [None]:
# Save the rater records to CSV files of at most 100 records each.
batch_size = 100
# Ceiling division: the previous `len // batch_size + 1` produced one extra,
# empty file whenever the row count was an exact multiple of batch_size
# (including zero rows).
num_batches_true = (len(true_samples_rater) + batch_size - 1) // batch_size
num_batches_false = (len(false_samples_rater) + batch_size - 1) // batch_size

for i in range(num_batches_true):
    start_idx = i * batch_size
    end_idx = start_idx + batch_size  # iloc clips the final slice at the last row
    true_samples_rater.iloc[start_idx:end_idx].to_csv(f'samples/subsample_correct_batch{i+1}.csv', index=False, encoding='utf-8-sig')

for i in range(num_batches_false):
    start_idx = i * batch_size
    end_idx = start_idx + batch_size
    false_samples_rater.iloc[start_idx:end_idx].to_csv(f'samples/subsample_incorrect_batch{i+1}.csv', index=False, encoding='utf-8-sig')

## Prepare Data after Labeling

In [54]:
# Read the labeled data files into dataframes using glob.
# glob returns paths in arbitrary order, so sort them before indexing by
# position; otherwise "correct" and "incorrect" could silently swap.
# NOTE(review): assumes the correct-answer file sorts first, e.g.
# labeled_correct*.csv before labeled_incorrect*.csv - confirm the file names.
labeled_files = sorted(glob.glob('/qualitative_analysis/labeled/labeled_*.csv'))
if len(labeled_files) < 2:
    raise FileNotFoundError(
        f"Expected two labeled_*.csv files, found {len(labeled_files)}: {labeled_files}"
    )
# Turn the files into dataframes
labeled_dfs = [pd.read_csv(file) for file in labeled_files]
# Access the "correct" and "incorrect" dataframes
labeled_correct = labeled_dfs[0]
labeled_incorrect = labeled_dfs[1]

# Correct the column names "QuestionID" to "Sample_ID"
labeled_correct = labeled_correct.rename(columns={'QuestionID': 'Sample_ID'})
labeled_incorrect = labeled_incorrect.rename(columns={'QuestionID': 'Sample_ID'})


In [None]:
def _pick_label(row, raters):
    """Return the first non-missing rating in *raters* order; the last rater is the unconditional fallback."""
    *preferred, fallback = raters
    for rater in preferred:
        if pd.notna(row[rater]):
            return row[rater]
    return row[fallback]


# Collapse the per-rater columns into one "label" column, row by row.
# Precedence for correct answers: Zia, then Miao.
labeled_correct['label'] = labeled_correct.apply(
    _pick_label, axis=1, args=(('Zia', 'Miao'),)
)

# Precedence for incorrect answers: Sitao, Cao, Miao, then Zia.
labeled_incorrect['label'] = labeled_incorrect.apply(
    _pick_label, axis=1, args=(('Sitao', 'Cao', 'Miao', 'Zia'),)
)

# Show any rows that still have no label
print(labeled_correct[labeled_correct['label'].isna()])
print(labeled_incorrect[labeled_incorrect['label'].isna()])

In [None]:
# Attach "Exam_Type" and "Question_Type" from the sampled records (left join on Sample_ID)
exam_info_columns = ['Sample_ID', 'Exam_Type', 'Question_Type']
correct_output_columns = ['Sample_ID', 'Question', 'Options', 'Correct_Answer', 'Official_Explanation', 'Model_Explanation', 'Model', 'label', 'Exam_Type', 'Question_Type']
incorrect_output_columns = ['Sample_ID', 'Question', 'Options', 'Correct_Answer', 'Model_Answer', 'Official_Explanation', 'Model_Explanation', 'Model', 'label', 'Exam_Type', 'Question_Type']

merged_correct = labeled_correct.merge(true_samples[exam_info_columns], on='Sample_ID', how='left')
merged_incorrect = labeled_incorrect.merge(false_samples[exam_info_columns], on='Sample_ID', how='left')

labeled_correct = merged_correct[correct_output_columns]
labeled_incorrect = merged_incorrect[incorrect_output_columns]
# Display the resulting shapes as the cell output
labeled_correct.shape, labeled_incorrect.shape

In [60]:
# Build the combined frame used for visualization: tag each set with its
# correctness flag, stack them, and keep only the plotting columns.
combined_columns = ['Sample_ID', 'Model', 'label', 'Exam_Type', 'Question_Type', 'Is_Correct']
labeled_correct = labeled_correct.assign(Is_Correct=True)
labeled_incorrect = labeled_incorrect.assign(Is_Correct=False)
stacked_labels = pd.concat([labeled_correct, labeled_incorrect], ignore_index=True)
labeled_combined = stacked_labels[combined_columns]

In [None]:
# Write the cleaned label tables to CSV.
# to_csv does not create missing parent directories, so make sure the target exists
os.makedirs('labeled', exist_ok=True)
labeled_correct.to_csv('labeled/clean_labeled_correct.csv', index=False, encoding='utf-8-sig')
labeled_incorrect.to_csv('labeled/clean_labeled_incorrect.csv', index=False, encoding='utf-8-sig')
labeled_combined.to_csv('labeled/clean_labeled_combined.csv', index=False, encoding='utf-8-sig')