In [1]:
import numpy as np
import pandas as pd
import itertools
from sklearn.metrics import cohen_kappa_score
from scipy.stats import kendalltau
from statsmodels.stats.inter_rater import fleiss_kappa

In [2]:
# Load the annotation table and keep only the nine per-annotator label columns;
# any other column present in Data.csv is discarded by the selection below.
df = pd.read_csv("Data.csv").loc[
    :,
    [
        'label_Andreas',
        'label_Jan',
        'label_Konrad',
        'label_Michael',
        'label_Niklas',
        'label_Peng',
        'label_Rui',
        'label_Valentin',
        'label_Yijun',
    ],
]
df

Unnamed: 0,label_Andreas,label_Jan,label_Konrad,label_Michael,label_Niklas,label_Peng,label_Rui,label_Valentin,label_Yijun
0,0.0,0.0,,,,0.0,0.0,0.0,
1,0.0,,,0.0,,0.0,,9.0,
2,,9.0,0.0,,,,1.0,,1.0
3,,0.0,0.0,,,0.0,1.0,,
4,,0.0,,,1.0,1.0,,,1.0
...,...,...,...,...,...,...,...,...,...
5330,,,,1.0,,0.0,,0.0,0.0
5331,0.0,,0.0,,1.0,,,0.0,
5332,,,,,,,9.0,9.0,9.0
5333,,9.0,9.0,,9.0,9.0,,9.0,


In [3]:
# Translate textual sentiment labels into numeric codes.
# In the displayed output the frame already holds numeric codes, so the visible
# rows come out of this replacement unchanged.
# NOTE(review): a later cell redefines label_map as
# {'positive': 2, 'neutral': 1, 'negative': 0}, which disagrees with the coding
# used here -- confirm which coding the numeric values in Data.csv follow.
# NOTE(review): the value 9.0 occurs in the data but has no entry in this map;
# presumably a "not applicable" marker -- confirm before treating it as a score.
label_map = {
    'positive': 1,
    'neutral': 0,
    'negative': 2,
}
data_numeric = df.replace(to_replace=label_map)
data_numeric

Unnamed: 0,label_Andreas,label_Jan,label_Konrad,label_Michael,label_Niklas,label_Peng,label_Rui,label_Valentin,label_Yijun
0,0.0,0.0,,,,0.0,0.0,0.0,
1,0.0,,,0.0,,0.0,,9.0,
2,,9.0,0.0,,,,1.0,,1.0
3,,0.0,0.0,,,0.0,1.0,,
4,,0.0,,,1.0,1.0,,,1.0
...,...,...,...,...,...,...,...,...,...
5330,,,,1.0,,0.0,,0.0,0.0
5331,0.0,,0.0,,1.0,,,0.0,
5332,,,,,,,9.0,9.0,9.0
5333,,9.0,9.0,,9.0,9.0,,9.0,


In [5]:
def compute_icc(data):
    """Estimate an intra-class correlation from a subjects x annotators table.

    Missing ratings in a row are replaced by that row's most frequent rating
    (the smallest one on ties); rows without any rating are left as they are.
    The coefficient is then built from the variance of the row means (scaled
    by the number of annotators) and the mean of the per-row variances.

    Note: a later cell in this notebook defines another function with the same
    name, which replaces this one once that cell has run.

    Parameters
    ----------
    data : pd.DataFrame
        Rows = rated items, columns = annotators.

    Returns
    -------
    float
        The ICC estimate, or np.nan when the table holds no rating at all.
    """
    def _fill_with_row_mode(row):
        # A row nobody rated has no mode to borrow from; pass it through.
        if row.dropna().empty:
            return row
        return row.fillna(row.mode()[0])

    imputed = data.apply(_fill_with_row_mode, axis=1)

    # Nothing to measure when every cell is still missing after imputation.
    if imputed.isna().all().all():
        return np.nan

    n_annotators = imputed.shape[1]
    row_means = imputed.mean(axis=1)
    row_variances = imputed.var(axis=1, ddof=1)

    between = np.var(row_means, ddof=1) * n_annotators
    within = np.mean(row_variances)

    return (between - within) / (between + (n_annotators - 1) * within)

# Run the ICC defined in this cell on the numeric annotation table and report
# it with four decimals.
icc_value = compute_icc(data=data_numeric)
print(
    f"Intra-Class Correlation Coefficient (ICC): {icc_value:.4f}"
)

Intra-Class Correlation Coefficient (ICC): 0.6299


In [9]:
import numpy as np
import pandas as pd

def compute_icc(data, type="consistency"):
    """
    Compute a single-rater Intra-Class Correlation Coefficient (ICC) from a
    two-way (subjects x raters) ANOVA decomposition.

    Missing ratings are imputed per subject with that subject's most frequent
    rating (smallest value on ties). Subjects without any rating cannot be
    imputed and are excluded from the computation.

    NOTE(review): mode imputation makes every imputed rater agree with the
    majority by construction, which can only push the coefficient upwards.
    Treat the result as optimistic when many cells are missing.

    Parameters
    ----------
    data : pd.DataFrame
        Rows = subjects, Columns = raters/annotators
    type : str, "consistency" or "agreement"
        ICC type to compute. "agreement" additionally penalises systematic
        differences between rater means.

    Returns
    -------
    float
        ICC value. np.nan when it is undefined: fewer than two rated subjects,
        fewer than two raters, or a zero denominator (no variance at all).

    Raises
    ------
    ValueError
        If `type` is neither "consistency" nor "agreement".
    """
    # Fail fast on a bad option instead of after the full computation.
    if type not in ("consistency", "agreement"):
        raise ValueError("type must be 'consistency' or 'agreement'")

    # Ensure numeric dtype only; anything unparsable becomes NaN.
    data = data.apply(pd.to_numeric, errors="coerce")

    # Fill missing values per row with the row mode.
    data_filled = data.apply(
        lambda row: row.fillna(row.mode().iloc[0]) if not row.dropna().empty else row,
        axis=1
    )

    # Rows nobody rated are still all-NaN here. Left in place they would turn
    # the grand mean (and therefore the result) into NaN, so drop them.
    data_filled = data_filled.dropna(how="all")

    n, k = data_filled.shape  # n rated subjects, k raters
    if n < 2 or k < 2:
        # The degrees of freedom below would be zero.
        return np.nan

    values = data_filled.to_numpy(dtype=float)
    grand_mean = float(values.mean())
    subject_means = values.mean(axis=1)
    rater_means = values.mean(axis=0)

    # Two-way ANOVA sums of squares: subjects, raters, residual.
    ss_between = float(k * np.sum((subject_means - grand_mean) ** 2))
    ss_rater = float(n * np.sum((rater_means - grand_mean) ** 2))
    residuals = values - subject_means[:, None] - rater_means[None, :] + grand_mean
    ss_within = float(np.sum(residuals ** 2))

    ms_between = ss_between / (n - 1)
    ms_rater = ss_rater / (k - 1)
    ms_within = ss_within / ((n - 1) * (k - 1))

    numerator = ms_between - ms_within
    denominator = ms_between + (k - 1) * ms_within
    if type == "agreement":
        # Extra term accounts for variance between rater means.
        denominator += k * (ms_rater - ms_within) / n

    if denominator == 0:
        # E.g. every rating identical: the ratio is 0/0, i.e. undefined.
        return np.nan

    # Return scalar float
    return float(numerator / denominator)


# Example usage: evaluate both ICC variants on the same annotation table.
icc_consistency = compute_icc(data_numeric, type="consistency")
icc_agreement = compute_icc(data_numeric, type="agreement")

# The labels are padded so the two values line up in the printed output.
print(
    f"ICC (Consistency): {icc_consistency:.4f}",
    f"ICC (Agreement):   {icc_agreement:.4f}",
    sep="\n",
)


ICC (Consistency): 0.6320
ICC (Agreement):   0.6301


In [10]:
import numpy as np
import pandas as pd

def compute_reliability_coeffs(data):
    """
    Compute two variance-ratio reliability coefficients, reported under the
    names Robinson's A and Finn's coefficient.

    Missing ratings are imputed per subject with that subject's most frequent
    rating (smallest value on ties). Subjects without any rating cannot be
    imputed and are excluded, so they no longer turn both results into NaN.

    What is actually computed (one-way decomposition, subjects only):
      * "robinsons_a":       MS_between / (MS_between + MS_within)
      * "finns_coefficient": 1 - SS_within / SS_total

    NOTE(review): these do not look like the published definitions as I
    recall them -- Robinson's A as SS_subjects / (SS_subjects + SS_residual)
    with rater effects removed, and Finn's r as
    1 - MS_within / ((c**2 - 1) / 12) for a scale with c levels. Confirm
    which quantities are intended before reporting them under these names.

    Parameters
    ----------
    data : pd.DataFrame
        Rows = subjects, Columns = raters/annotators.

    Returns
    -------
    dict
        {
            "robinsons_a": float,
            "finns_coefficient": float
        }
        A value is NaN when its ratio is undefined (no rated subjects, or a
        zero denominator such as a single subject or zero total variance).
    """
    def _ratio(numerator, denominator):
        # Undefined ratios become NaN quietly instead of a divide warning.
        if denominator == 0:
            return float("nan")
        return numerator / denominator

    # Ensure numeric
    data = data.apply(pd.to_numeric, errors="coerce")

    # Fill missing per row with mode, then drop rows nobody rated.
    data_filled = data.apply(
        lambda row: row.fillna(row.mode().iloc[0]) if not row.dropna().empty else row,
        axis=1
    ).dropna(how="all")

    # Shape is taken after the drop so unrated rows do not count as subjects.
    n, k = data_filled.shape
    if n == 0 or k == 0:
        return {
            "robinsons_a": float("nan"),
            "finns_coefficient": float("nan")
        }

    # Work on a plain float array: reducing a DataFrame through np.sum relies
    # on deprecated axis=None behaviour in pandas.
    values = data_filled.to_numpy(dtype=float)
    subject_means = data_filled.mean(axis=1).to_numpy()
    grand_mean = float(subject_means.mean())

    ss_between = float(k * np.sum((subject_means - grand_mean) ** 2))
    ss_within = float(np.sum((values - subject_means[:, None]) ** 2))
    ss_total = float(np.sum((values - grand_mean) ** 2))

    # --- "Robinson's A" (see NOTE in the docstring) ---
    var_between = _ratio(ss_between, n - 1)
    var_within = _ratio(ss_within, n * (k - 1))
    robinsons_a = _ratio(var_between, var_between + var_within)

    # --- "Finn's Coefficient" (see NOTE in the docstring) ---
    finns_coeff = 1 - _ratio(ss_within, ss_total)

    return {
        "robinsons_a": float(robinsons_a),
        "finns_coefficient": float(finns_coeff)
    }

# Example usage: compute both coefficients once, then report them with
# labels padded to the same width.
coeffs = compute_reliability_coeffs(data_numeric)
print(
    f"Robinson's A:       {coeffs['robinsons_a']:.4f}",
    f"Finn's Coefficient: {coeffs['finns_coefficient']:.4f}",
    sep="\n",
)

Robinson's A:       0.9423
Finn's Coefficient: 0.6710


  return reduction(axis=axis, out=out, **passkwargs)


In [10]:
# Numeric code for each sentiment label; compute_pairwise_agreement reads this
# mapping to decide which codes belong to a category pair. Adjust it to match
# the coding of the actual dataset.
# NOTE(review): an earlier cell defines label_map as
# {'positive': 1, 'neutral': 0, 'negative': 2}; the two codings disagree, so
# confirm which one the numeric values in the data follow.
label_map = {
    'positive': 2,
    'neutral': 1,
    'negative': 0,
}

# Function to compute pairwise category agreement (positive vs. negative, negative vs. neutral, etc.)
def compute_pairwise_agreement(df, categories, mapping=None):
    """Mean raw agreement between annotator pairs, restricted to two categories.

    For every category pair and every pair of annotator columns, only the rows
    where both annotators gave a label and both labels belong to the category
    pair are kept; the agreement of that annotator pair is the share of those
    rows on which the two labels are equal. The score of a category pair is
    the unweighted mean over the annotator pairs that had at least one such
    row.

    Side effect: prints the mean agreement over all (category pair, annotator
    pair) combinations that contributed a value.

    Parameters
    ----------
    df : pd.DataFrame
        Rows = items, columns = annotators, values = numeric label codes
        (NaN = not annotated).
    categories : iterable of (str, str)
        Category-name pairs to evaluate; each name must be a key of the
        label mapping.
    mapping : dict, optional
        Category name -> numeric code. Defaults to the notebook-level
        `label_map`, which is what the function always used before this
        parameter existed.

    Returns
    -------
    list of (str, float)
        ("<cat1> vs. <cat2>", score) per category pair, in input order. The
        score is np.nan when no annotator pair had any usable row (previously
        this case raised ZeroDivisionError).
    """
    codes = label_map if mapping is None else mapping

    # The jointly-annotated rows of an annotator pair do not depend on the
    # category pair, so build them once instead of once per category pair.
    jointly_rated = [
        (a1, a2, df[[a1, a2]].dropna())
        for a1, a2 in itertools.combinations(df.columns, 2)
    ]

    pairwise_scores = []
    overall_agreement = 0
    overall_pairs = 0
    for cat1, cat2 in categories:
        wanted_codes = [codes[cat1], codes[cat2]]
        agreement_sum = 0
        contributing_pairs = 0
        for a1, a2, both_rated in jointly_rated:
            # Keep rows where BOTH annotators chose one of the two categories.
            subset = both_rated[both_rated.isin(wanted_codes).all(axis=1)]
            if subset.empty:
                # This annotator pair says nothing about this category pair.
                continue
            agreement_sum += (subset[a1] == subset[a2]).mean()
            contributing_pairs += 1

        overall_agreement += agreement_sum
        overall_pairs += contributing_pairs
        score = agreement_sum / contributing_pairs if contributing_pairs else np.nan
        pairwise_scores.append((f"{cat1} vs. {cat2}", score))

    print(overall_agreement / overall_pairs if overall_pairs else np.nan)
    return pairwise_scores

# Example: the numeric annotation table is scored for every two-category
# contrast listed below, using the notebook-level label_map.

# Category contrasts to evaluate
categories_to_compare = [
    ('positive', 'negative'),
    ('negative', 'neutral'),
    ('positive', 'neutral'),
]

# One (label, score) tuple per contrast; the call also prints the overall mean
pairwise_agreements = compute_pairwise_agreement(
    data_numeric,
    categories_to_compare,
)

# Report each contrast with three decimals
for category_pair, score in pairwise_agreements:
    print(f"{category_pair}: {score:.3f}")

0.7580599664842887
positive vs. negative: 0.727
negative vs. neutral: 0.678
positive vs. neutral: 0.868
