### Imports

In [223]:
import json
import os
from itertools import combinations

import numpy as np
import pandas as pd
from tqdm import tqdm

In [224]:
# Mount Google Drive into the runtime at /content/drive so the project data can be read.
# NOTE(review): google.colab is only available on Colab; this cell will not run elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [225]:
# Change the working directory to the data folder so that the relative paths used
# below (e.g. "annotated/") resolve.
# NOTE(review): hard-coded, user-specific Drive path; adjust when running from another account.
%cd /content/drive/MyDrive/Work/Frontline/data/

/content/drive/.shortcut-targets-by-id/1WfnZsqpG1r110J63sMbfS5TpsDOkveiV/data


### Import Annotations

Merge all annotation files

In [226]:
# Collect one DataFrame per annotation export found in the "annotated" folder
dfs=[]
# os.listdir returns entries in arbitrary order; sort so the merged row order is reproducible
for doc in sorted(os.listdir("annotated")):
  # only files whose name starts with "annotations" are annotation exports
  if not doc.startswith("annotations"):
    continue
  # read json data; the context manager makes sure the file handle is closed again
  with open(os.path.join("annotated", doc), encoding="utf-8") as json_file:
    json_data=json.load(json_file)
  # convert the "documents" entry to a dataframe
  doc_frame=pd.DataFrame(json_data["documents"])
  # for now: filter out paragraphs that have not been annotated
  doc_frame=doc_frame[doc_frame["annotations"].apply(len)>0]
  dfs.append(doc_frame)


In [227]:
# Stack the per-file frames into one table and number the rows 0..n-1 afresh
data=pd.concat(dfs, ignore_index=True)

### Extract Labels by Annotator
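Extract the set of labels assigned by each annotator so that we can measure the agreement between our own annotations (computed below as Jaccard and Dice similarity, used here in place of an intraclass correlation coefficient)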

In [228]:
def extract_annotations(test):
  """Group the labels of one paragraph by annotator.

  Args:
  - test (iterable of dict): the annotation records of a paragraph; each record is read
    via record["annotator"]["name"] and record["concept"]["preferred_label"]["name"]

  Returns:
  - dict: maps the first character of the annotator name to the set of (renamed)
    labels that annotator assigned
  """
  labels_by_annotator={}
  for record in test:
    ### FOR NOW: later change to id
    initial=record["annotator"]["name"][0]
    # rename old label names before collecting them
    current=update_label(record["concept"]["preferred_label"]["name"])
    # collecting straight into a set drops duplicate labels
    labels_by_annotator.setdefault(initial, set()).add(current)
  return labels_by_annotator

In [229]:
def update_label(label):
  """Translate an outdated label name into its current name.

  Args:
  - label (str): label name as stored in the annotation export

  Returns:
  - str: the current name for the two renamed labels, otherwise `label` unchanged
  """
  # old name -> current name
  renamed={
    "NA": "Domestic Violence",
    "Victim blaming": 'Statement of responsibility',
  }
  return renamed.get(label, label)

In [230]:
# Replace each paragraph's raw annotation records with a dict {annotator initial: set of labels}.
# NOTE(review): the column is overwritten in place, so this cell is not idempotent;
# rebuild `data` (re-run the merge cell) before running it a second time.
data["annotations"]=data["annotations"].apply(extract_annotations)

In [231]:
# Preview the first rows of the merged table with the per-annotator label sets
data.head()

Unnamed: 0,id,text,annotations,attributes_flat
0,4572dea4-6a08-4f1e-b312-5821112bb5f5,Ein Mann (25) ist jetzt vom Schöffengericht am...,{'J': {'Domestic Violence'}},{'artikel_id': 'F3A4578D33A8603DF0573D3DE3CB26...
1,0bcada32-8dc5-41cf-b83b-67d2e742bada,Als Zeugin trat die Ex-Lebensgefährtin des Syr...,{'J': {'Statement of responsibility'}},{'artikel_id': 'F3A4578D33A8603DF0573D3DE3CB26...
2,a30791b9-522e-45c1-8b33-79d4165282af,"Zunächst leugnete der Angeklagte, dass es über...",{'J': {'Statement of responsibility'}},{'artikel_id': 'F3A4578D33A8603DF0573D3DE3CB26...
3,043e3909-bcdd-4c6b-a54f-f947d46ad18e,Das Schöffengericht hatte es in diesem Fall of...,"{'J': {'Sensationalist', 'Statement of respons...",{'artikel_id': 'F3A4578D33A8603DF0573D3DE3CB26...
4,08cef91c-6d73-472c-8349-07a5b72009d1,"""Gewalt in der Familie ist weder Privatsache n...",{'J': {'Domestic Violence'}},"{'artikel_id': 'IRA-82182598', 'name': 'SÜDWES..."


### Intraclass Correlation Coefficient (own annotations)
Compare the label sets of all paragraphs that were annotated by more than one annotator

In [212]:
def jaccard_similarity(sets):
    """Jaccard similarity |intersection| / |union| across all given label sets.

    Args:
    - sets (iterable of set-like): the label sets to compare, e.g. a list of sets or a
      DataFrame row whose cells are sets; it is traversed only once, so generators work too

    Returns:
    - float between 0 and 1; 1.0 if every set is empty, because empty sets are
      identical to each other

    Raises:
    - TypeError: if `sets` contains no set at all
    """
    # Materialise once so any iterable can feed both the intersection and the union
    collections = [set(s) for s in sets]
    if not collections:
        raise TypeError("jaccard_similarity() needs at least one set")

    shared = set.intersection(*collections)
    combined = set.union(*collections)

    # An empty union means all sets are empty: report full agreement instead of dividing by zero
    if not combined:
        return 1.0

    return len(shared) / len(combined)

In [213]:
def calculate_similarity(annotations: dict, sim="jaccard"):
  """Agreement between the annotators of one paragraph.

  Args:
  - annotations (dict): label sets keyed by annotator
  - sim (str): similarity measure to use, either "jaccard" (default) or "dice"

  Returns:
  - float: the similarity of the annotators' label sets
  - NaN: if fewer than two annotators labelled the paragraph (no co-annotation)

  Raises:
  - ValueError: if `sim` is not a supported measure
  """
  # fail loudly on a misspelled measure instead of silently returning None
  if sim not in ("jaccard", "dice"):
    raise ValueError(f"unknown similarity measure: {sim!r} (expected 'jaccard' or 'dice')")
  # no co-annotation (a single annotator or none at all): nothing to compare
  if len(annotations)<2:
    return np.nan
  label_sets=list(annotations.values())
  if sim=="jaccard":
    return jaccard_similarity(label_sets)
  return dice_similarity_multiple(label_sets)

In [214]:
def dice_similarity_multiple(sets):
    """Average pairwise Dice coefficient 2*|A & B| / (|A| + |B|) over all pairs of sets.

    Args:
    - sets (iterable of set): the label sets to compare

    Returns:
    - float: mean Dice coefficient over every unordered pair of sets; a pair of two
      empty sets counts as 1.0 because the sets are identical
    - NaN: if fewer than two sets are given, as there is no pair to compare
    """
    label_sets = list(sets)
    # Without at least one pair the average is undefined (would divide by zero)
    if len(label_sets) < 2:
        return float("nan")

    pair_scores = []
    for first, second in combinations(label_sets, 2):
        size_sum = len(first) + len(second)
        if size_sum == 0:
            # both sets empty: identical, and the formula would divide by zero
            pair_scores.append(1.0)
        else:
            pair_scores.append(2 * len(first.intersection(second)) / size_sum)

    return sum(pair_scores) / len(pair_scores)

In [215]:
# Score every paragraph with both agreement measures; the column is named after the measure
for metric in ("jaccard", "dice"):
  data[metric]=data["annotations"].apply(calculate_similarity, args=(metric,))

In [216]:
# Show the first paragraphs that received an agreement score
data[data["jaccard"].notna()].head()

Unnamed: 0,id,text,annotations,attributes_flat,jaccard,dice
3,043e3909-bcdd-4c6b-a54f-f947d46ad18e,Das Schöffengericht hatte es in diesem Fall of...,"{'J': {'Sensationalist', 'Statement of respons...",{'artikel_id': 'F3A4578D33A8603DF0573D3DE3CB26...,0.0,0.444444
20,4dfd6faf-e631-4f0e-83cc-cee4b69a9632,"Berlin - Sie war jung, hübsch und intelligent...","{'K': {'Sensationalist'}, 'J': {'Sensationalis...",{'artikel_id': 'MDS-A-A696462F-F0DB-42E7-BADD-...,0.5,0.666667
23,0cb62255-3781-4466-b9ac-e29c29714d9f,Möglicherweise war ihr Job in Kiel auch der Gr...,"{'K': {'Sensationalist', 'Statement of respons...",{'artikel_id': 'MDS-A-A696462F-F0DB-42E7-BADD-...,0.5,0.666667
44,0964c70c-687f-4d9b-9c9d-555b3853a456,Das Frauenhaus Osterode sei durchgängig voll b...,"{'J': {'Domestic Violence'}, 'B': {'Domestic V...",{'artikel_id': '721580E0DCDD055C72CE05C3135E41...,1.0,1.0
50,0b701b26-4e70-4514-8c9b-355bf042acbc,Frankfurt. Von der 43-Jährigen fehlt noch imme...,"{'J': {'Domestic Violence'}, 'B': {'Domestic V...","{'artikel_id': '20271339FA53F1033000', 'name':...",1.0,1.0


In [217]:
# Series.mean() skips NaN by default, so only scored paragraphs enter the averages
jaccard_mean=round(data["jaccard"].mean(),2)
dice_mean=round(data["dice"].mean(),2)
print("Average jaccard similarity: ", jaccard_mean)
print("Average dice similarity: ", dice_mean)

Average jaccard similarity:  0.7
Average dice similarity:  0.78


### Evaluating Amazon's Annotations

Extracting Ground Truth

In [219]:
def ground_truth_filter(entry, min_coannotation=1, min_similarity=0.5, similarity="jaccard"):
  """
      Decide whether an annotated sample qualifies as ground truth and, if so, return its labels.

      Two filters are applied:
      - the text must have been annotated by at least `min_coannotation` people
      - the stored agreement score must not fall below `min_similarity`

      Args:
      - entry (row / mapping): provides entry["annotations"], a dict of label sets keyed by
        annotator initial, and the agreement score under entry[similarity]
      - min_coannotation (int): minimum number of annotators of a text, by default 1, so all annotations are considered
      - min_similarity (float): minimum agreement score required; a NaN score is never
        smaller than the threshold and therefore passes this check
      - similarity (str): name of the field of `entry` that holds the score, by default "jaccard"

      Returns:
      - either:
        - set: the union of the labels of all annotators, considered ground truth
        - NaN: if the sample does not fulfill the conditions set for ground truth
  """
  label_sets=entry["annotations"]
  # too few annotators
  if len(label_sets)<min_coannotation:
    return np.nan
  # annotators disagree too much
  if entry[similarity]<min_similarity:
    return np.nan
  return set().union(*label_sets.values())

In [220]:
# Build the ground truth in a new frame instead of aliasing `data`: the per-annotator
# dicts in data["annotations"] stay intact and this cell can be re-run safely.
# Keep only paragraphs labelled by at least 2 annotators whose Dice agreement is at least 0.6.
ground_truth_labels=data.apply(ground_truth_filter, axis=1, min_coannotation=2, min_similarity=0.6, similarity="dice")
ground_truth=data[["id"]].assign(annotations=ground_truth_labels)
# rows that failed the filter carry NaN and are dropped
ground_truth=ground_truth[ground_truth["annotations"].notna()]


In [371]:
# `ground_truth` holds the label sets that passed the previous filter;
# from here on they are addressed as the "ground truth" column
ground_truth=ground_truth.rename({"annotations":"ground truth"}, axis="columns")
ground_truth.head()

Unnamed: 0,id,ground truth
20,4dfd6faf-e631-4f0e-83cc-cee4b69a9632,"{Sensationalist, Statement of responsibility}"
23,0cb62255-3781-4466-b9ac-e29c29714d9f,"{Sensationalist, Statement of responsibility}"
44,0964c70c-687f-4d9b-9c9d-555b3853a456,{Domestic Violence}
50,0b701b26-4e70-4514-8c9b-355bf042acbc,{Domestic Violence}
82,1c2f201d-b408-4c6f-a455-f45a947b41f6,{Sensationalist}


Importing Annotations

In [373]:
# TODO: import the real amazon data here (e.g. with pd.read_csv).
# For testing, our own annotation files stand in for the amazon data.
amazon_data_all=pd.concat(dfs).assign(
  annotations=lambda frame: frame["annotations"].apply(extract_annotations)
)
# slim view with only the columns needed for the comparison
amazon_data=amazon_data_all[["id","annotations"]]

In [363]:
# Every annotator key that occurs in at least one paragraph's annotation dict
all_annotators={initial for per_annotator in amazon_data["annotations"] for initial in per_annotator}


In [368]:
# One row per (paragraph, annotator key); it is the same for every annotator, so build it once
annotator_rows=amazon_data.explode("annotations")
# a set has no defined iteration order; sort for a stable print order
for annotator in sorted(all_annotators):
  # ids of the paragraphs this annotator labelled, joined back to their full annotation dicts
  by_annotator=annotator_rows.loc[annotator_rows["annotations"]==annotator, ["id"]]
  by_annotator=by_annotator.merge(amazon_data, on="id")
  # keep only this annotator's own label set
  by_annotator["annotations"]=[labels[annotator] for labels in by_annotator["annotations"]]
  # restrict to paragraphs that have a ground truth and compare row by row
  common_ann=by_annotator.merge(ground_truth, on="id")
  common_ann["similarity"]=common_ann[["annotations","ground truth"]].apply(jaccard_similarity, axis=1)
  print(annotator, common_ann["similarity"].mean())

B 1.0
J 0.9375
K 0.9


### Amazon's Annotations

Check annotations that do not have a ground truth value

In [404]:
# Vectorised membership test: True for rows whose id has no ground truth entry.
# to_numpy() makes the mask purely positional, independent of the frame's index labels.
mask=~amazon_data_all["id"].isin(ground_truth["id"]).to_numpy()
amazon_no_gtruth=amazon_data_all[mask]
# fixed seed -> the same example rows on every run; never ask for more rows than exist
amazon_no_gtruth[["id","text","annotations"]].sample(min(5, len(amazon_no_gtruth)), random_state=42)

Unnamed: 0,id,text,annotations
1931,9f62e5f6-d76a-4b61-a130-f64f4ee8b52b,Die Beleuchtung des Rathauses ist eine gemeins...,{'K': {'Domestic Violence'}}
1883,1e8bddf1-2df2-4e38-9b65-4b73c5e07d2b,Der Jüngere zog eine Schreckschusswaffe und be...,{'K': {'Domestic Violence'}}
623,c3276b88-01c4-47bb-93a2-9eeef5912f13,Hagen-Mitte Eine Frau wurde am Montagmorgen im...,{'K': {'Graphic'}}
1914,3118f15b-b2d2-45f9-940f-67a50cd079c5,Auch die Eigensicherung für Berater und deren ...,{'K': {'Domestic Violence'}}
1181,3f865052-1050-417b-afae-23589286270c,Der 50-Jährige hat die Tat bislang nicht zugeg...,{'K': {'Domestic Violence'}}
