In [1]:
import pandas as pd
import krippendorff

In [2]:
results = pd.read_csv("Reliability Step 2_April 24, 2025_10.00.csv")

In [3]:
results.columns

Index(['StartDate', 'EndDate', 'Status', 'IPAddress', 'Progress',
       'Duration (in seconds)', 'Finished', 'RecordedDate', 'ResponseId',
       'RecipientLastName', 'RecipientFirstName', 'RecipientEmail',
       'ExternalReference', 'LocationLatitude', 'LocationLongitude',
       'DistributionChannel', 'UserLanguage', 'Coder_ID', 'Post_ID',
       'Attribution_Presence', 'Attribution_Number', 'AttributionType_1',
       'SenderAccountMatch_1', 'Evaluation_1', 'PresenceIndividual_1',
       'SpecificIndividual_1', 'PresenceCollective_1', 'SpecificCollective_1',
       'PresenceSystems_1', 'SpecificSystem_1', 'PresenceNetwork_1',
       'SpecificNetworks_1', 'Economy_1', 'SocialPolicies_1',
       'PoliticsHumanRight_1', 'ExternalRelations_1', 'PastTense_1',
       'PresentTense_1', 'FutureTense_1'],
      dtype='object')

In [4]:
results['Coder_ID'].unique()

array(['Coder ID', '{"ImportId":"QID1"}', 'Luke', 'Francesco', 'Ada',
       'Mare'], dtype=object)

In [5]:
results = results.loc[:, 'Coder_ID':'FutureTense_1']

In [6]:
results = results.iloc[2:]

In [7]:
results.head()

Unnamed: 0,Coder_ID,Post_ID,Attribution_Presence,Attribution_Number,AttributionType_1,SenderAccountMatch_1,Evaluation_1,PresenceIndividual_1,SpecificIndividual_1,PresenceCollective_1,...,SpecificSystem_1,PresenceNetwork_1,SpecificNetworks_1,Economy_1,SocialPolicies_1,PoliticsHumanRight_1,ExternalRelations_1,PastTense_1,PresentTense_1,FutureTense_1
2,Luke,1,Yes,First,Affirmed causal responsibility,No,Negatively,No,,Yes,...,,,,No,No,Yes,,No,Yes,
3,Luke,2,Yes,First,Affirmed causal responsibility,No,Negatively,No,,Yes,...,,,,No,No,Yes,,No,Yes,
4,Luke,3,Yes,First,Affirmed causal responsibility,Yes,Negatively,No,,Yes,...,,,,No,No,Yes,,Yes,,
5,Francesco,1,Yes,First,Affirmed causal responsibility,Yes,Neutrally,,,,...,,,,,,,,,,
6,Luke,4,Yes,First,Affirmed causal responsibility,Yes,Positively,Yes,Other Individuals,,...,,,,No,No,Yes,,No,Yes,


In [8]:
results.columns

Index(['Coder_ID', 'Post_ID', 'Attribution_Presence', 'Attribution_Number',
       'AttributionType_1', 'SenderAccountMatch_1', 'Evaluation_1',
       'PresenceIndividual_1', 'SpecificIndividual_1', 'PresenceCollective_1',
       'SpecificCollective_1', 'PresenceSystems_1', 'SpecificSystem_1',
       'PresenceNetwork_1', 'SpecificNetworks_1', 'Economy_1',
       'SocialPolicies_1', 'PoliticsHumanRight_1', 'ExternalRelations_1',
       'PastTense_1', 'PresentTense_1', 'FutureTense_1'],
      dtype='object')

In [9]:
results_filtered = results[results['Coder_ID'] != 'Mare']

In [10]:
#Create a Unique Attribution Identifier
## This ensures annotations on the same post + attribution number
results_filtered['PostAttr_ID'] = results_filtered['Post_ID'].astype(str) + '_' + results_filtered['Attribution_Number']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results_filtered['PostAttr_ID'] = results_filtered['Post_ID'].astype(str) + '_' + results_filtered['Attribution_Number']


In [11]:
pivot_df = results_filtered.pivot_table(
    index='PostAttr_ID',
    columns='Coder_ID',
    values='SpecificCollective_1',  # or any other column you want to compare
    aggfunc='first'  # just in case there's only one per coder
).dropna()  # drop rows where either coder didn’t code

In [12]:
pivot_df

Coder_ID,Ada,Francesco,Luke
PostAttr_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10_First,Progressive: Democrats/Liberals/Left-wing ideo...,Progressive: Democrats/Liberals/Left-wing ideo...,Progressive: Democrats/Liberals/Left-wing ideo...
11_First,Progressive: Democrats/Liberals/Left-wing ideo...,Progressive: Democrats/Liberals/Left-wing ideo...,Progressive: Democrats/Liberals/Left-wing ideo...
12_Second,Progressive: Democrats/Liberals/Left-wing ideo...,Progressive: Democrats/Liberals/Left-wing ideo...,Progressive: Democrats/Liberals/Left-wing ideo...
17_First,Progressive: Democrats/Liberals/Left-wing ideo...,Progressive: Democrats/Liberals/Left-wing ideo...,Progressive: Democrats/Liberals/Left-wing ideo...
19_First,Progressive: Democrats/Liberals/Left-wing ideo...,Progressive: Democrats/Liberals/Left-wing ideo...,Progressive: Democrats/Liberals/Left-wing ideo...
24_First,Conservative: Republicans/Right-wing ideologists,The Trump-Administration,Conservative: Republicans/Right-wing ideologists
24_Second,Conservative: Republicans/Right-wing ideologists,The Trump-Administration,The Trump-Administration
26_First,Other Countries/Country Unions,Other Collectives,Other Countries/Country Unions
27_First,USA,The Trump-Administration,The Trump-Administration
2_First,Other Collectives,Other Collectives,Other Political/Ideological Groups


In [13]:
agreement_AdaLuke = (pivot_df['Ada'] == pivot_df['Luke']).mean()
print(f'Percent agreement: {agreement_AdaLuke:.2%}')

Percent agreement: 73.33%


In [14]:
agreement_AdaFrancesco = (pivot_df['Ada'] == pivot_df['Francesco']).mean()
print(f'Percent agreement: {agreement_AdaFrancesco:.2%}')

Percent agreement: 73.33%


In [15]:
agreement_FrancescoLuke = (pivot_df['Francesco'] == pivot_df['Luke']).mean()
print(f'Percent agreement: {agreement_FrancescoLuke:.2%}')

Percent agreement: 73.33%


In [16]:
# Each inner list is a unit (i.e., one PostAttr_ID)
data = pivot_df.transpose().values.tolist()

# Calculate K-alpha (nominal scale)
alpha = krippendorff.alpha(reliability_data=data, level_of_measurement='nominal')

print(f"Krippendorff's alpha: {alpha:.3f}")

Krippendorff's alpha: 0.648


In [17]:
def get_disagreement_rows(df):
    return df[df.nunique(axis=1) > 1]

disagreement_rows = get_disagreement_rows(pivot_df)
print(disagreement_rows)

Coder_ID                                                  Ada  \
PostAttr_ID                                                     
24_First     Conservative: Republicans/Right-wing ideologists   
24_Second    Conservative: Republicans/Right-wing ideologists   
26_First                       Other Countries/Country Unions   
27_First                                                  USA   
2_First                                     Other Collectives   
6_First                                                   USA   

Coder_ID                    Francesco  \
PostAttr_ID                             
24_First     The Trump-Administration   
24_Second    The Trump-Administration   
26_First            Other Collectives   
27_First     The Trump-Administration   
2_First             Other Collectives   
6_First                           USA   

Coder_ID                                                 Luke  
PostAttr_ID                                                    
24_First     Conservat

In [18]:
from sklearn.metrics import cohen_kappa_score
import itertools

# 1. Overall agreement
def overall_agreement(df):
    agreement = df.apply(lambda row: len(set(row)) == 1, axis=1)
    return agreement.mean()

# 2. Pairwise Cohen's Kappa
def pairwise_kappa(df):
    coders = df.columns
    results = {}
    for c1, c2 in itertools.combinations(coders, 2):
        kappa = cohen_kappa_score(df[c1], df[c2])
        results[f"{c1} vs {c2}"] = kappa
    return results

# Usage
overall_agree = overall_agreement(pivot_df)
pairwise_kappas = pairwise_kappa(pivot_df)

print(f"Overall Agreement: {overall_agree:.2%}")
print("Pairwise Cohen's Kappa:")
for pair, score in pairwise_kappas.items():
    print(f"{pair}: {score:.3f}")

Overall Agreement: 60.00%
Pairwise Cohen's Kappa:
Ada vs Francesco: 0.647
Ada vs Luke: 0.649
Francesco vs Luke: 0.636
