# Analysing the human harm classifications

#### Imports

In [5]:
import pandas as pd
import glob

#### Function

In [6]:
def calculate_meaningful_harms(excel_file) -> None:
    """Print how many rows of an Excel sheet are marked as meaningful harms.

    The sheet is loaded with ``pd.read_excel`` and its 'meaningful' column is
    inspected. Every row counts as one harm; rows whose 'meaningful' value is
    exactly 'y' count as meaningful. The count and the percentage (two
    decimals) are printed to stdout.

    Args:
        excel_file: Path (or any object accepted by ``pd.read_excel``) of the
            sheet to analyse.

    Returns:
        None. Results are only printed. If the 'meaningful' column is missing
        or holds anything other than 'y' / 'n' (empty cells included), a
        WARNING line is printed instead and no statistics are reported.

    NOTE(review): the printed labels ("Similar elements count",
    "Exact Similarity") look carried over from a different analysis; they are
    kept unchanged here so that existing recorded outputs stay comparable.
    """
    # Read the Excel file
    df = pd.read_excel(excel_file)

    # Guard against a sheet without the expected column: indexing it would
    # raise KeyError and abort a whole batch run instead of skipping one file.
    if 'meaningful' not in df.columns:
        print(f"WARNING: The 'meaningful' column is missing from {excel_file}")
        return None

    labels = df['meaningful']

    # Check for invalid values in 'meaningful' column
    valid_values = ['y', 'n']
    invalid_values = labels[~labels.isin(valid_values)].unique()

    if len(invalid_values) > 0:
        print(f"WARNING: The 'meaningful' column contains values other than 'y' or 'n': {list(invalid_values)}")
        return None

    # Count total harms and the ones flagged as meaningful ('y')
    total_harms = len(df)
    meaningful_harms = int((labels == 'y').sum())

    # Calculate percentage; an empty sheet reports 0 rather than dividing by zero
    percentage = (meaningful_harms / total_harms) * 100 if total_harms > 0 else 0

    # Display results
    print(f"Similar elements count: {meaningful_harms}/{total_harms}")
    print(f"Exact Similarity: {percentage:.2f}%")


#### Analysis

In [7]:
# Find all Excel files in the current directory. glob returns matches in
# arbitrary (filesystem-dependent) order, so sort them to make the report
# order reproducible across runs and machines.
excel_files = sorted(glob.glob("*.xlsx"))

# Process each file; each call prints its own summary (or a warning)
for file in excel_files:
    print(f"\nAnalysing harms for {file}")
    calculate_meaningful_harms(file)


Analysing harms for IT_(Mistral_Small)_harms.xlsx
Similar elements count: 145/150
Exact Similarity: 96.67%

Analysing harms for scanbike_(Gemma3)_harms.xlsx
Similar elements count: 143/150
Exact Similarity: 95.33%

Analysing harms for scanbike_(Mistral_Small)_harms.xlsx
Similar elements count: 145/150
Exact Similarity: 96.67%

Analysing harms for IT_(Qwen3)_harms.xlsx
Similar elements count: 150/150
Exact Similarity: 100.00%

Analysing harms for scanbike_(Qwen3)_harms.xlsx
Similar elements count: 147/150
Exact Similarity: 98.00%

Analysing harms for IT_(Gemma3)_harms.xlsx
Similar elements count: 145/150
Exact Similarity: 96.67%
