In [22]:
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# CSV inputs referenced under /content/drive/MyDrive are readable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [23]:
# Input locations on the mounted Drive: the GPT-4 scoring CSV and the
# manually annotated sample CSV.
gpt4_path, manual_path = (
    "/content/drive/MyDrive/gpt4_full_transparency_scoring.csv",
    "/content/drive/MyDrive/sampled_50_unique_filtered.csv",
)


In [24]:
# Parse the manually annotated file into `manual_df`

import pandas as pd
import csv

# File path
manual_path = "/content/drive/MyDrive/sampled_50_unique_filtered.csv"

# The file is semicolon-delimited with quoted multiline fields. The csv module
# requires the file object to be opened with newline='' so that line breaks
# embedded inside quoted fields are handed to the reader untouched.
with open(manual_path, mode='r', encoding='utf-8', newline='') as f:
    rows = list(csv.reader(f, delimiter=';', quotechar='"'))

# First row = header, remaining rows = records
header, data = rows[0], rows[1:]

# Convert to DataFrame (every value is still a string at this point)
manual_df = pd.DataFrame(data, columns=header)

# Clean up name column: trim surrounding whitespace and lowercase
manual_df['name'] = manual_df['name'].str.strip().str.lower()

# Rename scoring columns with a "manual_" prefix; note that `risks_score`
# becomes `manual_limitations_score`.
manual_df = manual_df.rename(columns={
    'goals_score': 'manual_goals_score',
    'logic_score': 'manual_logic_score',
    'risks_score': 'manual_limitations_score'
})

# Convert scores to numeric; values that cannot be parsed become NaN
for col in ['manual_goals_score', 'manual_logic_score', 'manual_limitations_score']:
    manual_df[col] = pd.to_numeric(manual_df[col], errors='coerce')

# A bare `manual_df.shape` expression in the middle of a cell is never
# displayed, so print it explicitly.
print(manual_df.shape)  # You should now see around (50, 10)

# Preview first 10 rows
manual_df.head(10)


Unnamed: 0,name,goal,manual_goals_score,Justification,methods_and_models,manual_logic_score,Justification.1,risks,manual_limitations_score,Justification.2
0,emptying underground waste containers,The aim is to predict as best as possible when...,2,"Specific, contextualized purpose with clear po...",The number of times a rubbish card is read by...,2,Clearly states the process and input data.,Failures of the system are monitored. Periodi...,1,Some limitations are suggested but not context...
1,passport for work,Passport for Work aims to help people distant ...,2,"Specific, contextualized purpose with clear po...",\nBelow is the functional explanation of how t...,2,"Clear explanation of inputs, process, and mode...",\n\nCircumstances while completing the tests c...,2,Risks are explicitly defined.
2,chatbot - environment act (ai - version),Support customer service. Assist residents in ...,2,"Specific, contextualized purpose with clear po...",FeedAdministrator enters sources via the under...,2,"Logic is mentioned as well as model type,",We estimate the risk is low. People are warned...,2,Risks are explicitly defined and it is mention...
3,camera surveillance traffic measure (closed),Goal is to digitally enforce the local closure...,2,"Specific, contextualized purpose with clear po...",Image recognition. ANPR cameras at four differ...,2,The model type and process are clearly explain...,"Because personal data is processed, extra atte...",1,Risks vaguely mentioned but not specified or c...
4,digital applications for civil affairs,The purpose of the algorithm is to help reside...,2,"Specific, contextualized purpose with clear po...","Through the municipality's website, a resident...",1,Input data is vaguely mentioned. Process is de...,,0,Field left empty.
5,traffic model accessibility control,The purpose of the model is to see the impact ...,2,"Specific, contextualized purpose with clear po...",This is an internally used model serving as in...,2,Input and output data including the decision a...,There are no specific risks in terms of privac...,1,Risks are vaguely mentioned but not specified ...
6,pressure measurement - group dynamics estimation,It was a pilot. The algorithm recognises wheth...,2,"Specific, contextualized purpose with clear po...",The algorithm detects people and their speed o...,2,"Input and output data is described, logic is m...",A DPIA has been carried out (DPIA pressure mea...,1,Risks are vaguely mentioned.
7,benefit recipients legality prediction,The model made a prediction for the chance of ...,2,"Specific, contextualized purpose with clear po...",The municipality of Rotterdam no longer uses t...,2,"Model type, input data and logic clearly descr...","To determine the risk of an algorithm, the mun...",2,Risks are clearly described and contextualized.
8,information supported decision - short-stay (s...,The algorithm is used to support the processin...,2,"Specific, contextualized purpose with clear po...",Comparison with the data present is done throu...,2,Input data and logic are clearly described.,Final decisions on an application are monitore...,1,Mitigation of risks mentioned without specifyi...
9,cryptshare,The purpose of this algorithm is to help ensur...,2,"Specific, contextualized purpose with clear po...","When composing a new e-mail, the terms in the ...",2,Input data and logic are clearly described.,The overall performance of the algorithm is mo...,2,Mentions risks of incorrect classifications.


In [30]:
import pandas as pd
from sklearn.metrics import cohen_kappa_score
import csv

# === 1. Load and Parse Manual CSV ===
manual_path = "/content/drive/MyDrive/sampled_50_unique_filtered.csv"

# Semicolon-delimited file with quoted multiline fields. newline='' is what the
# csv module requires so that line breaks inside quoted fields reach the
# reader unmodified.
with open(manual_path, mode='r', encoding='utf-8', newline='') as f:
    rows = list(csv.reader(f, delimiter=';', quotechar='"'))

# First row is the header; everything after it is data (all strings for now)
header, data = rows[0], rows[1:]
manual_df = pd.DataFrame(data, columns=header)

# Normalize and clean manual dataset: trimmed, lowercased names are used as
# the join key further down.
manual_df['name'] = manual_df['name'].str.strip().str.lower()

# Prefix the score columns with "manual_"; `risks_score` is renamed to
# `manual_limitations_score`.
manual_df = manual_df.rename(columns={
    'goals_score': 'manual_goals_score',
    'logic_score': 'manual_logic_score',
    'risks_score': 'manual_limitations_score'
})

# Scores arrive as strings from csv.reader; unparseable values become NaN
for col in ['manual_goals_score', 'manual_logic_score', 'manual_limitations_score']:
    manual_df[col] = pd.to_numeric(manual_df[col], errors='coerce')

# === 2. Load and Clean GPT Dataset ===
gpt4_path = "/content/drive/MyDrive/gpt4_full_transparency_scoring.csv"
gpt4_df = pd.read_csv(gpt4_path)

# Trim and lowercase the names so they can serve as the grouping/join key
gpt4_df['name'] = gpt4_df['name'].str.strip().str.lower()

gpt_score_cols = ['goals_score', 'logic_score', 'limitations_score']

# Force the score columns to numeric; anything unparseable becomes NaN
for col in gpt_score_cols:
    gpt4_df[col] = pd.to_numeric(gpt4_df[col], errors='coerce')

# === 3. Average Duplicate GPT Entries ===
# One row per name, each score replaced by the mean over that name's rows
gpt4_avg_df = gpt4_df.groupby('name', as_index=False).agg(
    {score_col: 'mean' for score_col in gpt_score_cols}
)

# Bring the averaged scores back to whole numbers.
# NOTE(review): .round(0) rounds exact halves to the nearest even value
# (0.5 -> 0, 1.5 -> 2), so ties between duplicate scores are not broken in a
# consistent direction -- confirm this is the intended tie-breaking rule.
gpt4_avg_df[gpt_score_cols] = (
    gpt4_avg_df[gpt_score_cols].round(0).astype('Int64')  # nullable int for safety
)

# === 4. Merge Both Datasets ===
# Keep only the join key and the three score columns from each side
manual_scores = manual_df[
    ['name', 'manual_goals_score', 'manual_logic_score', 'manual_limitations_score']
]
gpt_scores = gpt4_avg_df[['name', 'goals_score', 'logic_score', 'limitations_score']]

# Inner join on name: entries missing from either side are dropped
merged_df = manual_scores.merge(gpt_scores, on='name', how='inner')

print(f"Merged {len(merged_df)} entries after averaging.")

# === 5. Compute Cohen's Kappa ===
def print_kappa(label, col1, col2):
    """Print Cohen's kappa between two raters' scores for one dimension.

    Pairs in which either rater's score is missing are removed before anything
    else happens, so the single-class guard and the kappa computation both see
    exactly the same data (cohen_kappa_score cannot handle missing values).

    Args:
        label: Name of the scored dimension, used in the printed message.
        col1: Scores of the first rater (pandas Series or array-like).
        col2: Scores of the second rater, paired with ``col1`` by position.

    Returns:
        The kappa value as a float, or None when it is reported as not
        computable because at least one rater used fewer than two distinct
        scores among the pairs rated by both.
    """
    # Pair by position (not by index label), matching how cohen_kappa_score
    # treats its two inputs.
    rater_a = pd.Series(col1).reset_index(drop=True)
    rater_b = pd.Series(col2).reset_index(drop=True)

    # Keep only the items that both raters actually scored
    both_rated = rater_a.notna() & rater_b.notna()
    rater_a = rater_a[both_rated]
    rater_b = rater_b[both_rated]

    if rater_a.nunique() < 2 or rater_b.nunique() < 2:
        print(f"Cohen's Kappa for {label}: Not computable (only one class present)")
        return None

    kappa = cohen_kappa_score(rater_a, rater_b)
    print(f"Cohen's Kappa for {label}: {kappa:.3f}")
    return kappa

# Report agreement per dimension: (label, manual column, GPT column)
for dimension, manual_col, gpt_col in [
    ("Goals", 'manual_goals_score', 'goals_score'),
    ("Logic", 'manual_logic_score', 'logic_score'),
    ("Limitations", 'manual_limitations_score', 'limitations_score'),
]:
    print_kappa(dimension, merged_df[manual_col], merged_df[gpt_col])

# === 6. Show Mismatches ===
# A row is a mismatch when the two scores differ on at least one dimension
differs = (
    merged_df['manual_goals_score'].ne(merged_df['goals_score'])
    | merged_df['manual_logic_score'].ne(merged_df['logic_score'])
    | merged_df['manual_limitations_score'].ne(merged_df['limitations_score'])
)
mismatches = merged_df[differs]

print(f"\n🔍 Mismatched entries after averaging: {len(mismatches)}")
mismatches.head(10)


Merged 50 entries after averaging.
Cohen's Kappa for Goals: 0.545
Cohen's Kappa for Logic: 0.828
Cohen's Kappa for Limitations: 0.751

🔍 Mismatched entries after averaging: 13


Unnamed: 0,name,manual_goals_score,manual_logic_score,manual_limitations_score,goals_score,logic_score,limitations_score
4,digital applications for civil affairs,2,1,0,2,2,0
6,pressure measurement - group dynamics estimation,2,2,1,2,2,2
9,cryptshare,2,2,2,2,2,1
12,sounding language,1,2,1,2,2,1
18,textimprovementtool,1,2,0,2,2,0
22,advising on data breach notification,2,2,1,2,2,0
24,aibobjection,2,2,2,2,2,1
25,summarising legal objection opinions,2,2,2,2,1,2
26,invoice processing,2,1,1,2,2,1
27,improved bicycle traffic flow,1,2,1,2,2,1


In [31]:
from sklearn.metrics import cohen_kappa_score
import numpy as np

# Manual column / GPT column for each scored dimension
score_pairs = {
    "Goals": ('manual_goals_score', 'goals_score'),
    "Logic": ('manual_logic_score', 'logic_score'),
    "Limitations": ('manual_limitations_score', 'limitations_score'),
}

# Kappa per dimension, computed on the merged frame
kappas = {
    dimension: cohen_kappa_score(merged_df[manual_col], merged_df[gpt_col])
    for dimension, (manual_col, gpt_col) in score_pairs.items()
}

# Leave out any dimension whose kappa came back as NaN
valid_kappas = {}
for k, v in kappas.items():
    if np.isnan(v):
        continue
    valid_kappas[k] = v

# Unweighted mean over the dimensions that remain
if not valid_kappas:
    print("No valid Kappa scores available for averaging.")
else:
    kappa_avg = sum(valid_kappas.values()) / len(valid_kappas)
    print("Individual Cohen's Kappa scores:")
    for k, v in valid_kappas.items():
        print(f"  {k}: {v:.3f}")
    print(f"\n Average of valid Cohen's Kappa values: {kappa_avg:.3f}")


Individual Cohen's Kappa scores:
  Goals: 0.545
  Logic: 0.828
  Limitations: 0.751

 Average of valid Cohen's Kappa values: 0.708
