# Calculate agreement
Original author: Sara Lafia

Uses Krippendorff's alpha, as implemented in the [simpledorff](https://www.lighttag.io/blog/krippendorffs-alpha/) library, to assess the reliability of the annotations.

In [1]:
# One-time setup: uncomment to install the dependency into the kernel's environment.
# The recorded output below shows that version 0.0.2 was installed.
# %pip install simpledorff==0.0.2

Defaulting to user installation because normal site-packages is not writeable
Collecting simpledorff
  Downloading simpledorff-0.0.2-py3-none-any.whl (5.6 kB)
Installing collected packages: simpledorff
Successfully installed simpledorff-0.0.2


In [1]:
import simpledorff
import pandas as pd

In [2]:
# Both annotators' labels live in one workbook: sheet 0 is loaded as df_sara
# and sheet 1 as df_leo.
ANNOTATION_WORKBOOK = 'ICPSR_bib_data_citation_rhetoric_v03_top96_samples.xlsx'

# Requesting a list of sheets parses the workbook once and returns a dict of
# DataFrames keyed by the requested sheet positions.
annotation_sheets = pd.read_excel(ANNOTATION_WORKBOOK, sheet_name=[0, 1])
df_sara = annotation_sheets[0]
df_leo = annotation_sheets[1]

# Column summary of the second sheet.
df_leo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 220 entries, 0 to 219
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   paper_id             220 non-null    float64
 1   sent_id              220 non-null    float64
 2   dataset_prediction   220 non-null    object 
 3   sentence_text        220 non-null    object 
 4   subject              220 non-null    object 
 5   relation             220 non-null    object 
 6   object               220 non-null    object 
 7   subject_category     220 non-null    object 
 8   relation_categories  220 non-null    object 
 9   object_category      220 non-null    object 
 10  AEO_category         220 non-null    object 
 11  SO_Category          220 non-null    float64
 12  verb_lemmas          220 non-null    object 
 13  Category             220 non-null    object 
 14  Proximity            220 non-null    object 
 15  Function             220 non-null    obj

In [3]:
# Column summary of the first sheet (df_sara), for comparison with df_leo.
df_sara.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 220 entries, 0 to 219
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   paper_id             220 non-null    float64
 1   sent_id              220 non-null    float64
 2   dataset_prediction   220 non-null    object 
 3   sentence_text        220 non-null    object 
 4   subject              220 non-null    object 
 5   relation             220 non-null    object 
 6   object               220 non-null    object 
 7   subject_category     220 non-null    object 
 8   relation_categories  220 non-null    object 
 9   object_category      220 non-null    object 
 10  AEO_category         220 non-null    object 
 11  SO_Category          220 non-null    float64
 12  verb_lemmas          220 non-null    object 
 13  Category             220 non-null    object 
 14  is_rel_label         0 non-null      float64
 15  Proximity            220 non-null    obj

#### Bad triple rate

- Sara: 51/220
- Leo: 18/220

### Inter-coder reliability (ICR)

In [4]:
# Build the long-format table (one row per sample and annotator) consumed by
# the agreement computation. A sample is excluded when EITHER annotator marked
# it with "x" in the "Bad triple" column (union of the two flags).
#
# NOTE(review): the recorded display of annotation_df reaches row index 402 for
# sample 219, i.e. roughly 202 samples kept and 18 dropped, while the markdown
# notes report 51 bad triples for Sara alone. Confirm that the flag value is a
# lowercase "x" in both sheets, or that the saved output is not stale.

# Samples are paired by row position, so the two sheets must line up.
assert len(df_sara) == len(df_leo), "annotator sheets differ in length"

is_bad_triple = df_sara["Bad triple"].eq("x") | df_leo["Bad triple"].eq("x")
is_valid = ~is_bad_triple


def _annotator_rows(labels, annotator, proximity_col, function_col):
    """Return one annotator's labels in long format.

    Parameters
    ----------
    labels : pd.DataFrame
        Rows to keep for this annotator; the row index becomes `document_id`.
    annotator : str
        Value written to the `annotator_id` column of every row.
    proximity_col, function_col : str
        Names of the columns in `labels` holding the two annotations.

    Returns
    -------
    pd.DataFrame
        Columns `document_id`, `annotator_id`, `Proximity`, `Function`.
    """
    return pd.DataFrame({
        "document_id": labels.index.to_numpy(),
        "annotator_id": annotator,
        "Proximity": labels[proximity_col].to_numpy(),
        "Function": labels[function_col].to_numpy(),
    })


# Leo's labels are read from "Proximity.1" / "Function.1" -- presumably the
# pandas-renamed second copies of duplicated column headers in his sheet;
# verify against the workbook.
annotation_df = (
    pd.concat([
        _annotator_rows(df_sara[is_valid], "Sara", "Proximity", "Function"),
        _annotator_rows(df_leo[is_valid], "Leo", "Proximity.1", "Function.1"),
    ])
    # A stable sort keeps Sara's row ahead of Leo's within each sample.
    .sort_values("document_id", kind="mergesort")
    .reset_index(drop=True)
)

In [5]:
# Display the long-format annotation table as a visual check.
annotation_df

Unnamed: 0,document_id,annotator_id,Proximity,Function
0,3,Sara,Indirect,Context
1,3,Leo,Indirect,Context
2,4,Sara,Direct,Interaction
3,4,Leo,Direct,Interaction
4,5,Sara,Indirect,Context
...,...,...,...,...
399,217,Leo,Indirect,Context
400,218,Sara,Indirect,Interaction
401,218,Leo,Indirect,Context
402,219,Sara,Indirect,Interaction


Krippendorff's alpha for `Proximity`

In [6]:
# Krippendorff's alpha over the Proximity labels: document_id identifies the
# rated sample and annotator_id the rater.
proximity_agreement = simpledorff.calculate_krippendorffs_alpha_for_df(
    annotation_df,
    experiment_col='document_id',
    annotator_col='annotator_id',
    class_col='Proximity',
)
# Show the coefficient rounded to two decimals.
round(proximity_agreement, 2)

0.92

Krippendorff's alpha for `Function`

In [7]:
# Krippendorff's alpha over the Function labels: document_id identifies the
# rated sample and annotator_id the rater.
function_agreement = simpledorff.calculate_krippendorffs_alpha_for_df(
    annotation_df,
    experiment_col='document_id',
    annotator_col='annotator_id',
    class_col='Function',
)
# Show the coefficient rounded to two decimals.
round(function_agreement, 2)

0.71

In [8]:
# Collect the distinct sample ids present in annotation_df. These are the
# samples that survived the bad-triple filter (both annotators accepted the
# triple), not only those where the Proximity/Function labels match.
agree_index = list(set(annotation_df["document_id"]))

In [11]:
# Export the retained samples; the rows (and therefore the label columns) are
# taken from df_sara only.
df_sara.loc[df_sara.index.isin(agree_index)].to_csv("agree_samples.csv", index=False)