# Evaluation of the annotations

In [None]:
# Load the label files: every line of a file becomes one list entry with
# its surrounding whitespace (including the trailing newline) removed.
with open("../data/labels_p.txt", "r") as f:
    labels_p = list(map(str.strip, f))
with open("../data/labels_j.txt", "r") as f:
    labels_j = list(map(str.strip, f))
# Note: the third list is named labels_g but is read from labels_gp.txt.
with open("../data/labels_gp.txt", "r") as f:
    labels_g = list(map(str.strip, f))

In [2]:
from statsmodels.stats.inter_rater import fleiss_kappa
import numpy as np
from collections import Counter


# Combine the labels of annotators p and j into a single list of lists.
# Only these two lists enter the Fleiss' kappa below; labels_g is not included.
annotations = [labels_p, labels_j]

# zip() silently stops at the shortest input, which would drop trailing
# sentences without any warning, so refuse to continue when the annotators'
# lists differ in length.
if len({len(rater_labels) for rater_labels in annotations}) != 1:
    raise ValueError(
        "Annotators labelled different numbers of sentences: "
        + ", ".join(str(len(rater_labels)) for rater_labels in annotations)
    )
annotations = list(zip(*annotations))  # Transpose so each sentence has its annotations grouped

# Define all possible topics; their order fixes the column order of the count matrix
categories = ["Physical Action", "Directional Metaphor", "Cultural/Convention", "Interactive Entities", "No Analogy/Explanation"]

# Create the matrix of counts (rows: sentences, columns: categories)
def create_fleiss_matrix(annotations, categories):
    """Build the sentences-by-categories count table used for Fleiss' kappa.

    Parameters
    ----------
    annotations : iterable of iterables
        One entry per sentence, holding the label each rater assigned to it.
    categories : sequence
        The allowed labels; their order fixes the column order of the result.

    Returns
    -------
    numpy.ndarray
        Integer array of shape (n_sentences, n_categories). Entry [i, j] is
        the number of raters who gave sentence i the label categories[j].
        An empty ``annotations`` yields an array of shape (0, n_categories).

    Raises
    ------
    ValueError
        If a sentence carries a label that is not listed in ``categories``.
        Such labels used to be dropped silently, which made the row sums
        smaller than the number of raters and the resulting kappa meaningless.
    """
    categories = list(categories)
    known_labels = set(categories)

    rows = []
    for sentence_index, sentence_annotations in enumerate(annotations):
        count = Counter(sentence_annotations)

        # Fail loudly on typos / stray labels instead of discarding the vote.
        unknown = [label for label in count if label not in known_labels]
        if unknown:
            raise ValueError(
                f"Sentence {sentence_index} has labels not in categories: {unknown!r}"
            )

        # Counter returns 0 for absent keys, so every category gets a column
        # and the counts stay aligned with the category order.
        rows.append([count[cat] for cat in categories])

    # The reshape keeps the result 2-D even when there are no sentences.
    return np.array(rows, dtype=int).reshape(len(rows), len(categories))

# Count, for every sentence, how many raters chose each category
fleiss_matrix = create_fleiss_matrix(annotations, categories)

# Show the count table under its heading
print("Fleiss' Matrix:", fleiss_matrix, sep="\n")

# Agreement across raters, computed from the count table
kappa = fleiss_kappa(fleiss_matrix, method="fleiss")
print("Fleiss' Kappa:", kappa)


Fleiss' Matrix:
[[1 0 1 0 0]
 [2 0 0 0 0]
 [2 0 0 0 0]
 [2 0 0 0 0]
 [0 0 2 0 0]
 [0 0 2 0 0]
 [2 0 0 0 0]
 [0 0 0 2 0]
 [0 0 2 0 0]
 [1 0 0 1 0]
 [2 0 0 0 0]
 [0 0 2 0 0]
 [2 0 0 0 0]
 [2 0 0 0 0]
 [1 0 1 0 0]
 [0 0 0 2 0]
 [1 0 0 1 0]
 [0 0 2 0 0]
 [0 0 2 0 0]
 [0 0 2 0 0]
 [0 0 2 0 0]
 [0 0 2 0 0]
 [1 0 0 0 1]
 [0 0 2 0 0]
 [0 0 0 2 0]
 [0 0 2 0 0]
 [1 0 1 0 0]
 [0 0 2 0 0]
 [1 0 0 1 0]
 [0 0 2 0 0]]
Fleiss' Kappa: 0.6243291592128801


In [3]:
from sklearn.metrics import cohen_kappa_score

# Pairwise agreement (Cohen's Kappa) between every two label lists
kappa_j_p = cohen_kappa_score(y1=labels_j, y2=labels_p)
kappa_j_g = cohen_kappa_score(y1=labels_j, y2=labels_g)
kappa_p_g = cohen_kappa_score(y1=labels_p, y2=labels_g)

# Report one line per pair
print(
    *(
        " ".join((heading, str(score)))
        for heading, score in (
            ("Cohen's Kappa (j-p):", kappa_j_p),
            ("Cohen's Kappa (j-g):", kappa_j_g),
            ("Cohen's Kappa (p-g):", kappa_p_g),
        )
    ),
    sep="\n",
)



Cohen's Kappa (j-p): 0.6276595744680851
Cohen's Kappa (j-g): -0.01851851851851838
Cohen's Kappa (p-g): 0.12408759124087587


In [4]:
# Sanity check: number of labels per list, one per line (g, j, p)
print(len(labels_g), len(labels_j), len(labels_p), sep="\n")


30
30
30
