# Compare LLM categorizations of harms with human categorizations

In [1]:
import pandas as pd

In [None]:
def compare(report_name: str, llm_name: str) -> None:
    """
    Compare LLM categorization of harms vs human categorizations.

    Reads the human labels from ``<report_name>_harms.xlsx`` (spaces in the
    report name replaced by underscores) and the LLM labels from
    ``reports/<report_name>/harm_analysis/<llm_name>.csv``, then compares
    their ``meaningful`` columns row by row and prints the agreement.

    Human labels are matched ignoring case and surrounding whitespace
    ('y' -> True, 'n' -> False). Any other value (including an empty cell)
    cannot be mapped; such rows count as disagreements and a warning reports
    how many there are.

    Args:
        report_name (str): Name of the report to analyze
        llm_name (str): Name of the LLM model used for categorization

    Returns:
        None: Prints similarity metrics to console

    Raises:
        ValueError: If the two files do not contain the same number of rows.
    """
    import warnings

    # Load human and LLM results
    human_frame = pd.read_excel(f'{report_name.replace(" ", "_")}_harms.xlsx')  # type: ignore
    llm_frame = pd.read_csv(f'reports/{report_name}/harm_analysis/{llm_name}.csv')  # type: ignore

    # Normalize hand-typed labels so that 'Y', ' y', 'N ' etc. are recognized;
    # non-string cells are passed through and end up unmapped (NaN) below.
    normalized = human_frame['meaningful'].map(  # type: ignore
        lambda label: label.strip().lower() if isinstance(label, str) else label
    )

    # Convert human results to boolean format
    human_results = normalized.map({'y': True, 'n': False})  # type: ignore
    llm_results = llm_frame['meaningful']  # type: ignore

    # A row-by-row comparison is only meaningful when both files have the
    # same number of rows; fail with a clear message instead of pandas'
    # generic "identically-labeled Series" error.
    if len(human_results) != len(llm_results):
        raise ValueError(
            f"Row count mismatch for {report_name!r} / {llm_name!r}: "
            f"{len(human_results)} human labels vs {len(llm_results)} LLM labels"
        )

    # Unmapped labels are NaN, which never equals an LLM label; surface them
    # so a lowered score can be traced back to the annotation file.
    unmapped_count = int(human_results.isna().sum())  # type: ignore
    if unmapped_count:
        warnings.warn(
            f"{unmapped_count} human label(s) for {report_name!r} are not 'y'/'n' "
            "and are counted as disagreements",
            stacklevel=2,
        )

    # Calculate similarity metrics
    similar_elements = human_results == llm_results
    similarity_count = similar_elements.sum()  # type: ignore
    similarity_percentage = similar_elements.mean() * 100  # type: ignore

    # Display results
    print(f"Similar elements count: {similarity_count}/{len(human_results)}")
    print(f"Exact Similarity: {similarity_percentage:.2f}%")

## IT (Mistral Small)

In [5]:
# Human vs. claude-sonnet-4-20250514 labels for the 'IT (Mistral Small)' report.
compare(report_name='IT (Mistral Small)', llm_name='claude-sonnet-4-20250514')

Similar elements count: 128/150
Exact Similarity: 85.33%


In [9]:
# Human vs. o3-pro labels for the 'IT (Mistral Small)' report.
compare(report_name='IT (Mistral Small)', llm_name='o3-pro')

Similar elements count: 145/150
Exact Similarity: 96.67%


In [12]:
# Human vs. gemini-2.5-pro-preview-06-05 labels for the 'IT (Mistral Small)' report.
compare(report_name='IT (Mistral Small)', llm_name='gemini-2.5-pro-preview-06-05')

Similar elements count: 139/150
Exact Similarity: 92.67%


## IT (Gemma3)

In [6]:
# Human vs. claude-sonnet-4-20250514 labels for the 'IT (Gemma3)' report.
compare(report_name='IT (Gemma3)', llm_name='claude-sonnet-4-20250514')

Similar elements count: 145/150
Exact Similarity: 96.67%


In [10]:
# Human vs. o3-pro labels for the 'IT (Gemma3)' report.
compare(report_name='IT (Gemma3)', llm_name='o3-pro')

Similar elements count: 148/150
Exact Similarity: 98.67%


In [13]:
# Human vs. gemini-2.5-pro-preview-06-05 labels for the 'IT (Gemma3)' report.
compare(report_name='IT (Gemma3)', llm_name='gemini-2.5-pro-preview-06-05')

Similar elements count: 148/150
Exact Similarity: 98.67%


## IT (Qwen3)

In [7]:
# Human vs. claude-sonnet-4-20250514 labels for the 'IT (Qwen3)' report.
compare(report_name='IT (Qwen3)', llm_name='claude-sonnet-4-20250514')

Similar elements count: 138/150
Exact Similarity: 92.00%


In [11]:
# Human vs. o3-pro labels for the 'IT (Qwen3)' report.
compare(report_name='IT (Qwen3)', llm_name='o3-pro')

Similar elements count: 150/150
Exact Similarity: 100.00%


In [14]:
# Human vs. gemini-2.5-pro-preview-06-05 labels for the 'IT (Qwen3)' report.
compare(report_name='IT (Qwen3)', llm_name='gemini-2.5-pro-preview-06-05')

Similar elements count: 144/150
Exact Similarity: 96.00%


## Scanbike (Mistral Small)

In [15]:
# Human vs. claude-sonnet-4-20250514 labels for the 'scanbike (Mistral Small)' report.
compare(report_name='scanbike (Mistral Small)', llm_name='claude-sonnet-4-20250514')

Similar elements count: 133/150
Exact Similarity: 88.67%


In [21]:
# Human vs. o3-pro labels for the 'scanbike (Mistral Small)' report.
compare(report_name='scanbike (Mistral Small)', llm_name='o3-pro')

Similar elements count: 137/150
Exact Similarity: 91.33%


In [19]:
# Human vs. gemini-2.5-pro-preview-06-05 labels for the 'scanbike (Mistral Small)' report.
compare(report_name='scanbike (Mistral Small)', llm_name='gemini-2.5-pro-preview-06-05')

Similar elements count: 131/150
Exact Similarity: 87.33%


## Scanbike (Gemma3)

In [16]:
# Human vs. claude-sonnet-4-20250514 labels for the 'scanbike (Gemma3)' report.
compare(report_name='scanbike (Gemma3)', llm_name='claude-sonnet-4-20250514')

Similar elements count: 142/150
Exact Similarity: 94.67%


In [22]:
# Human vs. o3-pro labels for the 'scanbike (Gemma3)' report.
compare(report_name='scanbike (Gemma3)', llm_name='o3-pro')

Similar elements count: 144/150
Exact Similarity: 96.00%


In [18]:
# Human vs. gemini-2.5-pro-preview-06-05 labels for the 'scanbike (Gemma3)' report.
compare(report_name='scanbike (Gemma3)', llm_name='gemini-2.5-pro-preview-06-05')

Similar elements count: 142/150
Exact Similarity: 94.67%


## Scanbike (Qwen3)

In [17]:
# Human vs. claude-sonnet-4-20250514 labels for the 'scanbike (Qwen3)' report.
compare(report_name='scanbike (Qwen3)', llm_name='claude-sonnet-4-20250514')

Similar elements count: 143/150
Exact Similarity: 95.33%


In [23]:
# Human vs. o3-pro labels for the 'scanbike (Qwen3)' report.
compare(report_name='scanbike (Qwen3)', llm_name='o3-pro')

Similar elements count: 147/150
Exact Similarity: 98.00%


In [20]:
# Human vs. gemini-2.5-pro-preview-06-05 labels for the 'scanbike (Qwen3)' report.
compare(report_name='scanbike (Qwen3)', llm_name='gemini-2.5-pro-preview-06-05')

Similar elements count: 143/150
Exact Similarity: 95.33%
