In [1]:
!pip install simpledorff



In [2]:
import pandas as pd
import simpledorff

In [3]:
# Path of the raw coder-response export, resolved relative to the working directory.
RESULTS_CSV = "Reliability Step 1_April 23, 2025_10.27.csv"
results = pd.read_csv(RESULTS_CSV)

In [4]:
# Inspect the column labels of the loaded export.
results.columns

Index(['StartDate', 'EndDate', 'Status', 'IPAddress', 'Progress',
       'Duration (in seconds)', 'Finished', 'RecordedDate', 'ResponseId',
       'RecipientLastName', 'RecipientFirstName', 'RecipientEmail',
       'ExternalReference', 'LocationLatitude', 'LocationLongitude',
       'DistributionChannel', 'UserLanguage', 'Coder ID', 'Post ID',
       'Attribution Presence', 'Number Attributions', 'Attribution1',
       'Attribution2', 'Attribution3', 'Attribution4', 'Attribution5',
       'Attribution6', 'Attribution7', 'AttributionOther'],
      dtype='object')

In [5]:
# Distinct values found in the 'Coder ID' column. In the output recorded with this
# notebook the first two values ('Coder ID' and an ImportId JSON string) are not
# coder names; apparently they come from extra header rows that were read as data.
results['Coder ID'].unique()

array(['Coder ID', '{"ImportId":"QID17"}', 'Francesco', 'Mare', 'Luke',
       'Ada'], dtype=object)

In [6]:
# Keep only the coding columns. A label-based slice includes both endpoints, so this
# retains everything from 'Coder ID' through 'AttributionOther' and drops the columns
# that precede 'Coder ID' (StartDate ... UserLanguage).
coding_columns = slice('Coder ID', 'AttributionOther')
results = results.loc[:, coding_columns]

In [7]:
# Drop the two extra header rows that the export places below the column names
# (their 'Coder ID' cells hold the question label and the ImportId JSON string).
# Filtering by value instead of by position (`iloc[2:]`) keeps this cell idempotent:
# re-running it no longer removes two more real responses each time.
export_header_values = ['Coder ID', '{"ImportId":"QID17"}']
results = results[~results['Coder ID'].isin(export_header_values)]

In [8]:
# Preview the first rows of the trimmed table (head() shows five rows by default).
results.head()

Unnamed: 0,Coder ID,Post ID,Attribution Presence,Number Attributions,Attribution1,Attribution2,Attribution3,Attribution4,Attribution5,Attribution6,Attribution7,AttributionOther
2,Francesco,1,No,,,,,,,,,
3,Francesco,2,No,,,,,,,,,
4,Francesco,3,Yes,1.0,...Donald Trump delivered the American Dream. ...,,,,,,,
5,Francesco,4,No,,,,,,,,,
6,Francesco,5,No,,,,,,,,,


In [9]:
# Krippendorff's alpha for the 'Attribution Presence' label across all coders:
# each post is a unit, each coder an annotator. No metric is passed, so simpledorff's
# default distance metric applies.
simpledorff.calculate_krippendorffs_alpha_for_df(
    results,
    experiment_col='Post ID',
    annotator_col='Coder ID',
    class_col='Attribution Presence',
)

np.float64(0.4801269398970548)

In [10]:
# Reshape long -> wide: one row per post, one column per coder, cells holding that
# coder's 'Attribution Presence' label.
# NOTE(review): in the recorded output the index sorts as 1, 10, 100, 11, ... which
# suggests 'Post ID' is stored as text - confirm before relying on row order.
results_Q1 = results.pivot(
    index='Post ID',
    columns='Coder ID',
    values='Attribution Presence',
)

In [11]:
# Display the pivoted table (one row per post, one column per coder).
results_Q1

Coder ID,Ada,Francesco,Luke,Mare
Post ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,No,No,No,No
10,No,No,No,Yes
100,No,No,No,No
11,No,No,No,No
12,Yes,Yes,Yes,Yes
...,...,...,...,...
95,No,Yes,No,No
96,No,No,No,No
97,Yes,No,No,No
98,No,No,No,No


In [12]:
def get_disagreement_rows(df):
    """Return the rows of ``df`` that hold more than one distinct value.

    Distinct values are counted per row with ``DataFrame.nunique(axis=1)``, whose
    default ignores missing values; a row whose only difference is a missing cell
    is therefore not reported.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one row per item and one column per rater.

    Returns
    -------
    pandas.DataFrame
        The subset of rows with at least two different non-missing values, with
        the original index and column labels.
    """
    distinct_per_row = df.nunique(axis=1)
    has_disagreement = distinct_per_row.gt(1)
    return df.loc[has_disagreement]

# Posts on which the coders did not all give the same label.
disagreement_rows = get_disagreement_rows(results_Q1)
# A bare last expression renders the frame as a notebook table; print() produced
# only a plain-text dump.
disagreement_rows

Coder ID  Ada Francesco Luke Mare
Post ID                          
10         No        No   No  Yes
13        Yes       Yes  Yes   No
14        Yes       Yes   No   No
18         No        No  Yes   No
19        Yes        No   No   No
23         No       Yes  Yes   No
26         No        No  Yes  Yes
28        Yes        No   No   No
3          No       Yes  Yes  Yes
31         No       Yes   No   No
35         No        No   No  Yes
40         No        No   No  Yes
51        Yes        No   No  Yes
53         No        No   No  Yes
55        Yes        No  Yes   No
59        Yes        No  Yes  Yes
60         No       Yes   No   No
66         No        No  Yes   No
70        Yes        No  Yes  Yes
72         No        No   No  Yes
76         No        No  Yes   No
79         No        No   No  Yes
8          No        No   No  Yes
80         No        No  Yes   No
82         No        No  Yes  Yes
86        Yes        No  Yes   No
87         No       Yes  Yes   No
88         No 

In [13]:
# Number of posts on which the coders did not all give the same label.
len(disagreement_rows)

31

In [14]:
# Share of posts on which every coder gave the same label. Both counts are read
# from the data instead of being typed in from earlier outputs, so the figure stays
# correct if the export or the coding changes.
(len(results_Q1) - len(disagreement_rows)) / len(results_Q1)

0.69

In [15]:
from itertools import combinations

# The pivot's column labels are the coder names.
coders = list(results_Q1.columns)
# Denominator is the number of posts in the pivot; it does not change per pair.
total = len(results_Q1)

# Percentage of posts on which each unordered pair of coders gave the same label.
agreeability = {}
for coder1, coder2 in combinations(coders, 2):
    matches = results_Q1[coder1].eq(results_Q1[coder2]).sum()
    agreeability[(coder1, coder2)] = matches / total * 100

# One row per coder pair, highest agreement first.
agree_df = (
    pd.DataFrame.from_dict(agreeability, orient='index', columns=['Agreeability (%)'])
    .sort_values(by='Agreeability (%)', ascending=False)
)
agree_df


Unnamed: 0,Agreeability (%)
"(Ada, Luke)",85.0
"(Francesco, Luke)",85.0
"(Ada, Francesco)",84.0
"(Luke, Mare)",83.0
"(Ada, Mare)",82.0
"(Francesco, Mare)",80.0


In [16]:
# Long-format responses without coder 'Mare', used to recompute alpha for the
# remaining coders.
is_not_mare = results['Coder ID'].ne('Mare')
results_filtered = results.loc[is_not_mare]

In [17]:
# Same alpha computation as for the full table, restricted to the responses left in
# `results_filtered` (coder 'Mare' excluded).
simpledorff.calculate_krippendorffs_alpha_for_df(
    results_filtered,
    experiment_col='Post ID',
    annotator_col='Coder ID',
    class_col='Attribution Presence',
)

np.float64(0.5163513608551937)

In [18]:
# A post counts only when every coder column holds 'Yes' for it.
unanimous_yes = results_Q1.eq('Yes').all(axis=1)
all_yes_count = unanimous_yes.sum()
print(f"Number of tweets coded as 'Yes' by all coders: {all_yes_count}")

Number of tweets coded as 'Yes' by all coders: 8


In [19]:
# Take the coder columns from the pivot itself instead of a hand-typed name list, so
# this selection always covers the same coders as the unanimous-'Yes' count and
# cannot silently skip a coder who is added or renamed.
coder_cols = results_Q1.columns.tolist()
# Rows in which every coder column equals 'Yes'.
all_yes_df = results_Q1[results_Q1[coder_cols].eq('Yes').all(axis=1)]

In [20]:
# Show the posts that every coder labelled 'Yes'.
all_yes_df

Coder ID,Ada,Francesco,Luke,Mare
Post ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
12,Yes,Yes,Yes,Yes
15,Yes,Yes,Yes,Yes
20,Yes,Yes,Yes,Yes
41,Yes,Yes,Yes,Yes
54,Yes,Yes,Yes,Yes
61,Yes,Yes,Yes,Yes
63,Yes,Yes,Yes,Yes
83,Yes,Yes,Yes,Yes
