### Imports

In [78]:
import json
import os
from itertools import combinations

import pandas as pd
from tqdm import tqdm

In [79]:
# Mount Google Drive inside the Colab runtime
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [80]:
# Change the notebook's working directory to the project data folder on the
# mounted Google Drive, so that relative paths such as "annotated/" used by
# later cells resolve against it. (%cd is an IPython magic, not plain Python.)
%cd /content/drive/MyDrive/Work/Frontline/data/

/content/drive/.shortcut-targets-by-id/1WfnZsqpG1r110J63sMbfS5TpsDOkveiV/data


### Import Annotations

Merge all annotation files

In [81]:
# Collect one DataFrame per annotation export found in "annotated/".
dfs = []
# sorted() fixes the processing order (os.listdir returns entries in arbitrary
# order), so the concatenated row order is reproducible between runs.
for doc in sorted(os.listdir("annotated")):
  if not doc.startswith("annotations"):
    continue
  # read json data; the context manager closes the file handle after parsing
  with open(os.path.join("annotated", doc), encoding="utf-8") as handle:
    json_data = json.load(handle)
  # convert to dataframe (a loop-local name, so the global `data` is not clobbered)
  doc_df = pd.DataFrame(json_data["documents"])
  # for now: filter out paragraphs that have not been annotated
  doc_df = doc_df[doc_df["annotations"].apply(len) > 0]
  dfs.append(doc_df)


In [153]:
# merge jsons: stack the per-file frames and renumber the rows 0..n-1 in one step
data = pd.concat(dfs, ignore_index=True)

### Extract Labels by Annotator
Extract the set of labels that each annotator assigned to a paragraph, so that the agreement between our own annotations can be compared.

In [83]:
import numpy as np

In [154]:
def extract_annotations(test):
  """Group the labels of one paragraph's annotations by annotator.

  Each element of ``test`` is read via ``ann["annotator"]["name"]`` and
  ``ann["concept"]["preferred_label"]["name"]``. Annotators are keyed by the
  first character of their name, and every label is passed through
  ``update_label`` before it is stored.

  Returns a dict mapping annotator key -> set of labels, so a label assigned
  several times by the same annotator is kept only once.
  """
  labels_by_annotator = {}
  for ann in test:
    initial = ann["annotator"]["name"][0]
    # Rename old label names
    current_label = update_label(ann["concept"]["preferred_label"]["name"])
    # Collecting straight into a set removes duplicates as they arrive.
    labels_by_annotator.setdefault(initial, set()).add(current_label)
  return labels_by_annotator

In [155]:
# Old label names and the names that replace them.
_LABEL_RENAMES = {
  "NA": "Domestic Violence",
  "Victim blaming": 'Statement of responsibility',
}

def update_label(label):
  """Return the current name for ``label``; labels without a rename pass through unchanged."""
  return _LABEL_RENAMES.get(label, label)

In [156]:
# Replace each paragraph's raw annotation records with a
# {annotator: set of labels} dict. Values that are already dicts are kept
# as they are, so re-running this cell does not pass converted rows through
# extract_annotations a second time (which would raise).
data["annotations"]=data["annotations"].apply(
  lambda anns: anns if isinstance(anns, dict) else extract_annotations(anns)
)

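### Intraclass Correlation Coefficient (own annotations)
Compare all co-assigned paragraphs, i.e. paragraphs that were labelled by more than one annotator. Agreement is measured with the Jaccard and Dice similarity of the annotators' label sets (set-overlap measures rather than an intraclass correlation coefficient in the strict statistical sense).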

In [94]:
def jaccard_similarity(sets):
    """Jaccard similarity of a collection of sets: |intersection| / |union|.

    Parameters
    ----------
    sets : sequence of set-like
        The label sets to compare; the intersection and union are taken
        over all of them at once.

    Returns
    -------
    float
        A value between 0 and 1, or ``nan`` when the ratio is undefined:
        ``sets`` is empty, or every set in it is empty (0 / 0).
    """
    label_sets = [set(s) for s in sets]
    # set.union(*[]) / set.intersection(*[]) raise TypeError, so bail out first.
    if not label_sets:
        return float("nan")

    # Calculate the union; an empty union would mean dividing by zero.
    union = set.union(*label_sets)
    if not union:
        return float("nan")

    # Calculate the intersection
    intersection = set.intersection(*label_sets)

    # Compute the Jaccard similarity
    return len(intersection) / len(union)

In [162]:
def calculate_similarity(annotations: dict, sim="jaccard"):
  """Agreement score between the annotators of one paragraph.

  Parameters
  ----------
  annotations : dict
      Maps an annotator key to that annotator's set of labels.
  sim : str
      ``"jaccard"`` (default) or ``"dice"``.

  Returns
  -------
  float
      The chosen similarity over all annotators' label sets, or ``np.nan``
      when fewer than two annotators labelled the paragraph.

  Raises
  ------
  ValueError
      If ``sim`` is not one of the supported measures (previously this
      silently produced ``None``).
  """
  if sim not in ("jaccard", "dice"):
    raise ValueError(f"Unknown similarity measure {sim!r}; expected 'jaccard' or 'dice'.")
  # if no co-annotation (also covers an empty dict, which has nothing to compare)
  if len(annotations) < 2:
    return np.nan
  label_sets = list(annotations.values())
  if sim == "jaccard":
    return jaccard_similarity(label_sets)
  return dice_similarity_multiple(label_sets)

In [104]:
def dice_similarity_multiple(sets):
    """Average pairwise Dice coefficient over a collection of sets.

    For every unordered pair of sets the Dice coefficient
    ``2 * |A & B| / (|A| + |B|)`` is computed; the result is the mean over
    all pairs.

    Parameters
    ----------
    sets : sequence of set-like
        The label sets to compare.

    Returns
    -------
    float
        The mean pairwise Dice coefficient, or ``nan`` when it is undefined:
        fewer than two sets (no pairs), or a pair in which both sets are
        empty (0 / 0).
    """
    label_sets = [set(s) for s in sets]
    pair_scores = []

    # Pairwise comparisons
    for set1, set2 in combinations(label_sets, 2):
        # Calculate the sum of set sizes
        set_sum = len(set1) + len(set2)
        if set_sum == 0:
            # Both sets are empty: this pair's coefficient is undefined,
            # and so is the average.
            return float("nan")

        # Compute the Dice similarity coefficient
        pair_scores.append(2 * len(set1 & set2) / set_sum)

    # No pairs to average (zero or one set).
    if not pair_scores:
        return float("nan")

    # Calculate the average similarity
    return sum(pair_scores) / len(pair_scores)

In [163]:
# Score every paragraph with both set-overlap measures; the measure is selected
# through calculate_similarity's `sim` keyword.
data["jaccard"] = data["annotations"].apply(calculate_similarity, sim="jaccard")
data["dice"] = data["annotations"].apply(calculate_similarity, sim="dice")

In [170]:
# Preview the first rows that received a Jaccard score
data[data["jaccard"].notna()].head()

Unnamed: 0,id,text,annotations,attributes_flat,jaccard,dice
3,043e3909-bcdd-4c6b-a54f-f947d46ad18e,Das Schöffengericht hatte es in diesem Fall of...,"{'J': {'Sensationalist', 'Statement of respons...",{'artikel_id': 'F3A4578D33A8603DF0573D3DE3CB26...,0.0,0.444444
20,4dfd6faf-e631-4f0e-83cc-cee4b69a9632,"Berlin - Sie war jung, hübsch und intelligent...","{'K': {'Sensationalist'}, 'J': {'Sensationalis...",{'artikel_id': 'MDS-A-A696462F-F0DB-42E7-BADD-...,0.5,0.666667
23,0cb62255-3781-4466-b9ac-e29c29714d9f,Möglicherweise war ihr Job in Kiel auch der Gr...,"{'K': {'Sensationalist', 'Statement of respons...",{'artikel_id': 'MDS-A-A696462F-F0DB-42E7-BADD-...,0.5,0.666667
44,0964c70c-687f-4d9b-9c9d-555b3853a456,Das Frauenhaus Osterode sei durchgängig voll b...,"{'J': {'Domestic Violence'}, 'B': {'Domestic V...",{'artikel_id': '721580E0DCDD055C72CE05C3135E41...,1.0,1.0
50,0b701b26-4e70-4514-8c9b-355bf042acbc,Frankfurt. Von der 43-Jährigen fehlt noch imme...,"{'J': {'Domestic Violence'}, 'B': {'Domestic V...","{'artikel_id': '20271339FA53F1033000', 'name':...",1.0,1.0


In [177]:
# Column means skip NaN, so only rows that received a score are averaged.
mean_jaccard = round(data["jaccard"].mean(), 2)
mean_dice = round(data["dice"].mean(), 2)
print("Average jaccard similarity: ", mean_jaccard)
print("Average dice similarity: ", mean_dice)

Average jaccard similarity:  0.7
Average dice similarity:  0.78


### Own Annotations and Amazon's Annotations

In [None]:
# TODO: import the Amazon annotations

### Amazon's Annotations