In [4]:
import pandas as pd
iaa_df = pd.read_csv("data/inter_annotator_agreement.csv")
indiv_df = pd.read_csv("data/test_data.csv")

## Basic stats

In [5]:
import pandas as pd

def get_stats(df):
    # Specify columns to exclude
    exclude_columns = ['source sentence', 'simplified sentence', 'Comment', 'Commentaire', 'Annotator', 'run_id', 'snt_id']
    
    # Select only the columns that represent errors (i.e., True/False values)
    error_columns = [col for col in df.columns if col not in exclude_columns]
    
    # Compute statistics for error columns
    error_stats = df[error_columns].apply(lambda col: {
        'Total Count': len(col),
        'True Count': (col == True).sum(),
        'False Count': (col == False).sum(),
        'Percentage True': ((col == True).sum() / len(col)) * 100
    }).to_dict()
    
    # Convert stats to a DataFrame for better visualization
    error_stats_df = pd.DataFrame.from_dict(error_stats, orient='index')

    return error_stats_df
    
    return error_stats_df

In [12]:
stats = get_stats(indiv_df)
display(stats)
display(f"number of columns in annotated datasets: {stats.shape[0]}")

Unnamed: 0,Total Count,True Count,False Count,Percentage True
Unnamed: 0.1,610,1,1,0.163934
Unnamed: 0,610,1,1,0.163934
No error,610,263,347,43.114754
A1. Random generation,610,39,571,6.393443
A2. Syntax error,610,48,562,7.868852
A3. Contradiction,610,3,607,0.491803
A4. Simple punctuation / grammar errors,610,45,565,7.377049
A5. Redundancy,610,30,580,4.918033
B1. Format misalignement,610,24,586,3.934426
B2. Prompt misalignement,610,11,599,1.803279


'number of columns in annotated datasets: 17'

In [13]:
# Count the number of rows per annotator for error and no-error cases
annotator_counts_indiv = indiv_df["Annotator"].value_counts()
# Count the number of rows per annotator for error and no-error cases
annotator_counts_iaa = iaa_df["Annotator"].value_counts()

# Display results
print("Annotator counts indiv:")
display(annotator_counts_indiv)

# Display results
print("Annotator counts IAA:")
display(annotator_counts_iaa)

Annotator counts indiv:


Annotator
B    281
D     96
C     92
E     49
F     49
A     43
Name: count, dtype: int64

Annotator counts IAA:


Annotator
A    104
D    104
C    104
B    104
E     52
F     23
Name: count, dtype: int64

# Analyzing Inter Annotators Agreement 

In [14]:
import pandas as pd
import numpy as np
import warnings
from sklearn.metrics import cohen_kappa_score
from statsmodels.stats.inter_rater import fleiss_kappa
import krippendorff  # pip install krippendorff


In [15]:
def pivot_annotations(df, value_col):
    """
    Pivot the annotation DataFrame so that each row corresponds to a unique item 
    (using 'item_id', which is a concatenation of snt_id and run_id) and 
    each column corresponds to an annotator’s rating for the given value_col.
    """
    pivot = df.pivot(index='item_id', columns='Annotator', values=value_col)
    return pivot


def compute_pairwise_cohen(pivot_df):
    """
    Compute pairwise Cohen's kappa between annotators from the pivot table.
    Also compute raw agreement (the fraction of items where the two annotators agree)
    and the number of overlapping items.
    
    If the two annotators’ ratings do not span both possible labels [0, 1],
    Cohen's kappa is set to NaN.
    
    Returns a DataFrame with the following columns:
      - Annotator1
      - Annotator2
      - CohenKappa: the chance-corrected agreement metric.
      - RawAgreement: the raw percentage agreement (0–1).
      - N_Items: the number of items on which both annotators rated.
    """
    results = []
    annotators = pivot_df.columns.tolist()
    for i in range(len(annotators)):
        for j in range(i + 1, len(annotators)):
            pair_data = pivot_df[[annotators[i], annotators[j]]].dropna()
            union_labels = sorted(set(pair_data[annotators[i]].unique()).union(set(pair_data[annotators[j]].unique())))
            if len(union_labels) < 2:
                kappa = np.nan
            else:
                kappa = cohen_kappa_score(pair_data[annotators[i]], pair_data[annotators[j]], labels=[0, 1])
            raw = pair_data.apply(lambda row: row[annotators[i]] == row[annotators[j]], axis=1).mean() if len(pair_data) > 0 else np.nan
            results.append({
                'Annotator1': annotators[i],
                'Annotator2': annotators[j],
                'CohenKappa': kappa,
                'RawAgreement': raw,
                'N_Items': len(pair_data)
            })
    return pd.DataFrame(results)


def compute_fleiss(pivot_df):
    """
    Compute Fleiss' kappa using only items rated by at least 2 annotators.
    For each item, build a count vector over the domain {0, 1}.
    If no item qualifies, return NaN.
    """
    matrix = []
    for idx, row in pivot_df.iterrows():
        ratings = row.dropna()
        if len(ratings) < 2:
            continue  # Need at least 2 ratings.
        counts = ratings.value_counts().reindex([0, 1], fill_value=0)
        matrix.append(counts.values)
    if len(matrix) == 0:
        return np.nan
    matrix = np.array(matrix)
    try:
        fk = fleiss_kappa(matrix)
        return fk
    except Exception:
        return np.nan


def compute_krippendorff(pivot_df):
    """
    Compute Krippendorff's alpha.
    The data is transposed so that rows represent raters.
    If, after removing missing data, there is only one unique value, return NaN.
    """
    data = pivot_df.to_numpy().T  # shape: (n_raters, n_items)
    flattened = data[~np.isnan(data)]
    if len(np.unique(flattened)) < 2:
        return np.nan
    try:
        alpha = krippendorff.alpha(reliability_data=data, level_of_measurement='nominal')
        return alpha
    except Exception:
        return np.nan


def analyze_agreement_for_label(df, value_col, analysis_label):
    """
    For a given column (value_col) in the DataFrame, pivot the data and compute:
      - Pairwise Cohen's kappa (per annotator pair), along with raw agreement and N_Items.
      - Fleiss' kappa (multi-rater)
      - Krippendorff's alpha (multi-rater)
    
    Returns a dictionary containing:
      - 'label': the analysis label (e.g., 'Binary', 'Aggregated_A', or a specific error name)
      - 'pivot': the pivoted DataFrame (indexed by item_id)
      - 'pairwise': a DataFrame with pairwise metrics (see compute_pairwise_cohen)
      - 'fleiss': the Fleiss' kappa value
      - 'krippendorff': the Krippendorff's alpha value
    """
    pivot_df = pivot_annotations(df, value_col)
    pairwise = compute_pairwise_cohen(pivot_df)
    fleiss_val = compute_fleiss(pivot_df)
    kripp_val = compute_krippendorff(pivot_df)
    return {
         'label': analysis_label,
         'pivot': pivot_df,
         'pairwise': pairwise,
         'fleiss': fleiss_val,
         'krippendorff': kripp_val
    }


In [16]:
def run_agreement_analysis(df):
    """
    Prepare the DataFrame by:
      - Creating an 'item_id' column that combines 'snt_id' and 'run_id'
      - Creating a binary classification column: 0 if 'No error' is True, 1 otherwise.
      - Creating aggregated error columns for each branch (A, B, C, D)
      - Converting specific error columns to integers.
      
    Then, for each of the following analyses, compute agreement metrics:
      - Binary classification
      - Aggregated error types (A, B, C, D)
      - Each specific error type
      
    Returns a dictionary with:
      - 'pairwise': a DataFrame with pairwise Cohen's kappa (with raw agreement and N_Items) for each analysis category.
      - 'multi_rater': a DataFrame with multi-rater metrics (Fleiss' and Krippendorff's alpha) for each category.
      - 'detailed': a list of dictionaries (one per analysis category) containing the pivot tables and metrics.
    """
    df = df.copy()
    
    # Create a unique identifier for each simplification.
    df['item_id'] = df['snt_id'].astype(str) + "_" + df['run_id'].astype(str)
    
    df = df.drop_duplicates(subset=['item_id', 'Annotator'])
    
    # --- Prepare Data ---
    # Binary classification: 0 if "No error" is True, 1 otherwise.
    df['binary'] = df['No error'].apply(lambda x: 0 if x else 1)
    
    # Aggregated error types: for each letter A, B, C, D, mark as 1 if any error in that branch is True.
    for letter in ['A', 'B', 'C', 'D']:
        error_cols = df.filter(regex=f'^{letter}').columns
        df[f'error_{letter}'] = df[error_cols].any(axis=1).astype(int)
    
    # Specific error types: list of specific error columns.
    specific_error_cols = [
        'A1. Random generation', 'A2. Syntax error', 'A3. Contradiction',
        'A4. Simple punctuation / grammar errors', 'A5. Redundancy',
        'B1. Format misalignement', 'B2. Prompt misalignement',
        'C1. Factuality hallucination', 'C2. Faithfulness hallucination', 'C3. Topic shift',
        'D1.1. Overgeneralization', 'D1.2 Overspecification of Concepts',
        'D2.1. Loss of Informative Content', 'D2.2. Out-of-Scope Generation'
    ]
    for col in specific_error_cols:
        if col in df.columns:
            df[col] = df[col].astype(int)
    
    # --- Compute Agreement ---
    analysis_results = []
    
    # 1. Binary classification
    analysis_results.append(analyze_agreement_for_label(df, 'binary', 'Binary'))
    
    # 2. Aggregated error types (A, B, C, D)
    for letter in ['A', 'B', 'C', 'D']:
        analysis_results.append(analyze_agreement_for_label(df, f'error_{letter}', f'Aggregated_{letter}'))
    
    # 3. Specific error types (each individual error)
    for col in specific_error_cols:
        if col in df.columns:
            analysis_results.append(analyze_agreement_for_label(df, col, col))
    
    # --- Combine Results into DataFrames ---
    pairwise_list = []
    multi_list = []
    
    for res in analysis_results:
        if res['pairwise'] is not None and not res['pairwise'].empty:
            temp = res['pairwise'].copy()
            temp['Analysis'] = res['label']
            pairwise_list.append(temp)
        multi_list.append({
            'Analysis': res['label'],
            'FleissKappa': res['fleiss'],
            'KrippendorffAlpha': res['krippendorff']
        })
    
    df_pairwise = pd.concat(pairwise_list, ignore_index=True) if pairwise_list else pd.DataFrame()
    df_multi = pd.DataFrame(multi_list)
    
    return {
        'pairwise': df_pairwise,
        'multi_rater': df_multi,
        'detailed': analysis_results
    }


In [17]:
def compute_unanimity(pivot_df):
    """
    For the given pivot table, compute the percentage of items (with at least 2 ratings)
    that are unanimously rated (i.e. all non-NaN ratings in the row are equal).
    
    Returns a percentage (0 to 100) or NaN if no items have at least 2 ratings.
    """
    unanimous = 0
    total = 0
    for idx, row in pivot_df.iterrows():
        ratings = row.dropna()
        if len(ratings) >= 2:
            total += 1
            if len(ratings.unique()) == 1:
                unanimous += 1
    return (unanimous / total * 100) if total > 0 else np.nan


In [18]:
def generate_paper_summary(results):
    """
    Given the detailed results from run_agreement_analysis, generate a concise summary DataFrame
    suitable for a paper. For each analysis category, the summary includes:
      - Analysis: the label
      - N_Items: number of unique annotated items
      - Mean_Annotators: average number of annotators per item
      - FleissKappa: the multi-rater Fleiss' kappa
      - KrippendorffAlpha: the multi-rater Krippendorff's alpha
      - Percent_Unanimous: percentage of items (with ≥2 ratings) where all annotators agree
      - Note: if a metric is NaN, this field indicates if it is likely because annotators fully agree.
    """
    summary_rows = []
    for res in results['detailed']:
        pivot_df = res['pivot']
        n_items = pivot_df.shape[0]
        mean_annotators = pivot_df.count(axis=1).mean() if n_items > 0 else np.nan
        percent_unanimous = compute_unanimity(pivot_df)
        
        note = ""
        # If either multi-rater metric is NaN, check if 100% unanimous items explain this.
        if (np.isnan(res['fleiss']) or np.isnan(res['krippendorff'])):
            if percent_unanimous == 100:
                note = "NaN due to complete agreement"
            else:
                note = "NaN due to insufficient overlap/variability"
                
        summary_rows.append({
            "Analysis": res['label'],
            "N_Items": n_items,
            "Mean_Annotators": mean_annotators,
            "FleissKappa": res['fleiss'],
            "KrippendorffAlpha": res['krippendorff'],
            "Percent_Unanimous": percent_unanimous,
            "Note": note
        })
    return pd.DataFrame(summary_rows)


In [19]:
def generate_pairwise_summary_by_category(results):
    """
    Generate a summary table of pairwise agreements for each analysis category.
    For each pair (Annotator1, Annotator2) in each category, a summary is created formatted as:
       "CohenKappa/RawAgreement/N_Items"
    Returns a dictionary mapping analysis label to the corresponding summary DataFrame.
    """
    summaries = {}
    for res in results['detailed']:
        label = res['label']
        pairwise_df = res['pairwise']
        if pairwise_df is None or pairwise_df.empty:
            continue
        # Create a summary string column.
        def format_summary(row):
            ck = row['CohenKappa']
            ra = row['RawAgreement']
            n = row['N_Items']
            ck_str = f"{ck:.2f}" if not pd.isna(ck) else "NaN"
            ra_str = f"{ra:.2f}" if not pd.isna(ra) else "NaN"
            return f"{ck_str}/{ra_str}/{n}"
        pairwise_df = pairwise_df.copy()
        pairwise_df['Summary'] = pairwise_df.apply(format_summary, axis=1)
        # Keep only the relevant columns.
        summary_df = pairwise_df[['Annotator1', 'Annotator2', 'Summary']]
        summaries[label] = summary_df
    return summaries


In [20]:
def generate_concise_pairwise_summary_table(results, nan_format="verbose", output_cohen_kappa_only=False):
    """
    Generate a single DataFrame where each row corresponds to a unique pair of annotators,
    and each additional column is an analysis category. Each cell follows the format:

       "CohenKappa/RawAgreement/N_Items"
       
    If `output_cohen_kappa_only` is True, only CohenKappa is output, with NaN formatted according to `nan_format`.
    
    If Cohen's kappa is NaN, it can be formatted differently based on `nan_format`:
      - "verbose" (default): "NaN (reason)"
      - "short": "io" (insufficient overlap), "p" (complete agreement), "iv" (insufficient variability)
      - None: Leaves NaNs as-is ("NaN")
    
    Parameters:
       results (dict): Output of `run_agreement_analysis`
       nan_format (str or None): Formatting for NaN values ("verbose", "short", or None)
       output_cohen_kappa_only (bool): If True, only CohenKappa is output.
    
    Returns:
       pd.DataFrame: Annotator pairwise agreement table
    """
    summaries = {}
    reason_map = {
        "insufficient overlap": "io",
        "complete agreement": "p",
        "insufficient variability": "iv",
    }

    for res in results['detailed']:
        label = res['label']
        pairwise_df = res['pairwise']
        if pairwise_df is None or pairwise_df.empty:
            continue

        def format_summary(row):
            ck = row['CohenKappa']
            ra = row['RawAgreement']
            n = row['N_Items']
            
            if pd.isna(ck):
                if n == 0:
                    reason = "insufficient overlap"
                elif ra == 1.0:
                    reason = "complete agreement"
                else:
                    reason = "insufficient variability"
                
                if nan_format == "verbose":
                    ck_str = f"NaN ({reason})"
                elif nan_format == "short":
                    ck_str = reason_map[reason]
                else:
                    ck_str = "NaN"
            else:
                ck_str = f"{ck:.2f}"
            
            if output_cohen_kappa_only:
                return ck_str
            
            ra_str = f"{ra:.2f}" if not pd.isna(ra) else "NaN"
            return f"{ck_str}/{ra_str}/{n}"

        df_temp = pairwise_df.copy()
        df_temp['Summary'] = df_temp.apply(format_summary, axis=1)
        df_temp = df_temp[['Annotator1', 'Annotator2', 'Summary']].rename(columns={'Summary': label})
        summaries[label] = df_temp

    final_df = None
    for label, df_temp in summaries.items():
        if final_df is None:
            final_df = df_temp.copy()
        else:
            final_df = pd.merge(final_df, df_temp, on=['Annotator1', 'Annotator2'], how='outer')

    return final_df


In [21]:
import pandas as pd

def compute_annotator_coherence(df, annotation_cols=None):
    """
    Compute the self-consistency rate for each annotator based on duplicate rows.

    For each annotator, this function identifies duplicate entries (rows with the same 
    'snt_id' and 'run_id') and checks if the annotation columns are consistent across duplicates.
    The consistency rate is computed as the fraction of duplicate groups with identical annotations.

    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame containing annotated data. It must have the columns 'Annotator', 'snt_id',
        'run_id', and the annotation columns.
    annotation_cols : list of str, optional
        List of columns that hold the annotation data. If not provided, a default list is used.
    
    Returns:
    --------
    pd.DataFrame
        A DataFrame with annotators as index and a column 'Consistency Rate' that shows 
        the self-consistency rate.
    """
    
    # Default annotation columns (adjust as needed)
    if annotation_cols is None:
        annotation_cols = [
            'No error',
            'A1. Random generation',
            'A2. Syntax error',
            'A3. Contradiction',
            'A4. Simple punctuation / grammar errors',
            'A5. Redundancy',
            'B1. Format misalignement',
            'B2. Prompt misalignement',
            'C1. Factuality hallucination',
            'C2. Faithfulness hallucination',
            'C3. Topic shift',
            'D1.1. Overgeneralization',
            'D1.2 Overspecification of Concepts',
            'D2.1. Loss of Informative Content',
            'D2.2. Out-of-Scope Generation'
        ]
    
    # Group the DataFrame by Annotator, snt_id, and run_id.
    grouped = df.groupby(['Annotator', 'snt_id', 'run_id'])
    
    # Dictionary to hold consistency results per annotator
    coherence_results = {}
    
    # For each group, if there are duplicates (group size > 1), check consistency of annotation columns.
    for (annotator, snt_id, run_id), group in grouped:
        if len(group) > 1:
            # Check if each annotation column has a single unique value in the group.
            is_consistent = (group[annotation_cols].nunique() == 1).all()
            coherence_results.setdefault(annotator, []).append(is_consistent)
    
    # Compute the consistency rate per annotator.
    annotator_coherence = {}
    for annotator, results in coherence_results.items():
        if results:
            consistency_rate = sum(results) / len(results)
        else:
            consistency_rate = None  # Or you can set to 0 if preferred.
        annotator_coherence[annotator] = consistency_rate
    
    # Create and return a DataFrame with the results.
    coherence_df = pd.DataFrame.from_dict(annotator_coherence, orient='index', 
                                            columns=['Consistency Rate'])
    return coherence_df

In [22]:
#Keepin only annotators who annotated everything
iaa_df = pd.read_csv("data/inter_annotator_agreement.csv")
iaa_df = iaa_df[iaa_df['Annotator'].map(iaa_df['Annotator'].value_counts()) == 104]

In [23]:
# Measuring self-consistency rate for each annotator
coherence_df = compute_annotator_coherence(iaa_df)
display(coherence_df)

Unnamed: 0,Consistency Rate
A,0.777778
B,1.0
C,0.555556
D,1.0


In [24]:
# Optionally, suppress warnings during computation:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    results = run_agreement_analysis(iaa_df)

In [25]:
# Multi-rater (overall) metrics summary (for a paper):
df_summary = generate_paper_summary(results)
display(df_summary)

Unnamed: 0,Analysis,N_Items,Mean_Annotators,FleissKappa,KrippendorffAlpha,Percent_Unanimous,Note
0,Binary,95,4.0,0.28868,0.290552,41.052632,
1,Aggregated_A,95,4.0,,,100.0,NaN due to complete agreement
2,Aggregated_B,95,4.0,0.434698,0.436185,80.0,
3,Aggregated_C,95,4.0,-0.266667,-0.263333,15.789474,
4,Aggregated_D,95,4.0,0.218107,0.220165,30.526316,
5,A1. Random generation,95,4.0,0.254227,0.25619,91.578947,
6,A2. Syntax error,95,4.0,0.024758,0.027325,88.421053,
7,A3. Contradiction,95,4.0,-0.005291,-0.002646,97.894737,
8,A4. Simple punctuation / grammar errors,95,4.0,0.135536,0.137811,74.736842,
9,A5. Redundancy,95,4.0,-0.010638,-0.007979,95.789474,


In [26]:
# Generate a single DataFrame for all pairwise agreement summaries.
concise_pairwise_df = generate_concise_pairwise_summary_table(results, nan_format="short", output_cohen_kappa_only=True)

# Display the table.
display(concise_pairwise_df.T)

# Optionally, print an explanation:
print("Each cell is formatted as 'CohenKappa/RawAgreement/N_Items', e.g., '0.70/0.95/50'.")
print(" insufficient overlap = io, complete agreement = p (for perfect), insufficient variability = iv")

Unnamed: 0,0,1,2,3,4,5
Annotator1,A,A,A,B,B,C
Annotator2,B,C,D,C,D,D
Binary,0.49,0.30,0.16,0.66,0.28,0.23
Aggregated_A,p,p,p,p,p,p
Aggregated_B,0.37,0.58,1.00,0.25,0.37,0.58
Aggregated_C,p,p,0.00,p,0.00,0.00
Aggregated_D,0.26,0.24,0.14,0.49,0.19,0.20
A1. Random generation,-0.03,0.26,1.00,-0.03,-0.03,0.26
A2. Syntax error,-0.02,0.00,-0.01,0.00,0.17,0.00
A3. Contradiction,p,0.00,p,0.00,p,0.00


Each cell is formatted as 'CohenKappa/RawAgreement/N_Items', e.g., '0.70/0.95/50'.
 insufficient overlap = io, complete agreement = p (for perfect), insufficient variability = iv
