In [1]:
import json
import os
import pandas as pd
import shutil
import textwrap
import spacy
from string import punctuation
from spacy.lang.de import stop_words
import numpy as np
# Load the spaCy pipeline named 'de_core_news_sm' once for the whole notebook.
nlp = spacy.load('de_core_news_sm')
# NOTE: this rebinds `stop_words` from the imported `spacy.lang.de.stop_words`
# module to its STOP_WORDS collection; the module itself is no longer reachable
# under this name afterwards.
stop_words = stop_words.STOP_WORDS
# Every character of `string.punctuation` as a list of single-character strings.
punctuations = list(punctuation)

# Read annotation data
The code below reads all annotation files from the annotations directory and matches them to the annotated samples.

In [32]:
# Directory that is scanned for annotator sub-directories. expanduser() only
# changes paths that start with "~", so this relative path is kept as written.
anno_root_dir = "../../data/annotation_results/anno_02"
anno_root_dir = os.path.expanduser(anno_root_dir)

In [33]:
# Read all annotations into one flat list of records (one record per annotation).
ANNOTATION_COLUMNS = ["annotator", "dataset", "sample", "rumor", "label"]
annotations = []

# Every sub-directory of anno_root_dir is named after one annotator. Sorting the
# listings makes the row order of anno_df independent of the file system.
for annotator_dir in sorted(os.listdir(anno_root_dir)):
    annotator_path = os.path.join(anno_root_dir, annotator_dir)

    # Skip stray files and hidden folders (e.g. .ipynb_checkpoints).
    if annotator_dir.startswith(".") or not os.path.isdir(annotator_path):
        continue

    # Go through the dataset files of this annotator and read the annotations.
    for anno_file in sorted(os.listdir(annotator_path)):
        anno_path = os.path.join(annotator_path, anno_file)

        # Only regular, non-hidden JSON files are parsed; this also covers the
        # .DS_Store files created by macOS.
        if (
            anno_file.startswith(".")
            or not anno_file.endswith(".json")
            or not os.path.isfile(anno_path)
        ):
            continue

        # Read as UTF-8 explicitly instead of relying on the platform default.
        with open(anno_path, encoding="utf-8") as f:
            curr_annotated_samples = json.load(f)["annotatedSamples"]

        for sample in curr_annotated_samples:
            for annotation in sample["annotations"]:
                annotations.append({
                    "annotator": annotator_dir,
                    "dataset": anno_file,
                    "sample": sample["sampleIndex"],
                    "rumor": annotation["rumorIndex"],
                    "label": annotation["label"]
                })

# Explicit columns keep the filters below working even if nothing was found.
anno_df = pd.DataFrame(annotations, columns=ANNOTATION_COLUMNS)

# Split the records by the name of the file they were read from.
main_annotations = anno_df[anno_df["dataset"] == "anno_2.json"]
additional_annotations = anno_df[anno_df["dataset"] == "anno_2_additional.json"]

In [34]:
# Read datasets
def _read_sample_set(csv_path):
    """Load one sample CSV and renumber its rows 0..n-1.

    The first CSV column is consumed as the index and then dropped by
    `reset_index(drop=True)`, so the resulting row labels are plain positions.
    """
    sample_set = pd.read_csv(csv_path, index_col=0)
    return sample_set.reset_index(drop=True)


main_samples = _read_sample_set("../../data/main_dataset/anno_2_set.csv")
additional_samples = _read_sample_set("../../data/main_dataset/anno_2_set-additional.csv")


In [35]:
# Read task
# The parsed JSON is kept in `task`; its "rumors" entry is what
# rumor_index_to_rumor() indexes into. The file is read as UTF-8 explicitly so
# that non-ASCII text does not depend on the platform's default encoding.
with open("../../data/tasks/task_v2.json", encoding="utf-8") as f:
    task = json.load(f)

In [None]:
def get_annotations_for_annotator(row, anno_df, annotator):
    """Return the rumor indices one annotator marked as "related" for one sample.

    Args:
        row: A sample row; its `name` (index label) is matched against the
            "sample" column of `anno_df`.
        anno_df: Annotation records with at least the columns "sample",
            "annotator", "label" and "rumor".
        annotator: Value of the "annotator" column to select.

    Returns:
        A sorted list of the distinct "rumor" values of all matching records
        whose label is "related"; an empty list if there are none.
    """
    is_this_sample = anno_df["sample"] == row.name
    is_this_annotator = anno_df["annotator"] == annotator
    is_related = anno_df["label"] == "related"

    rumor_column = anno_df.loc[is_this_sample & is_this_annotator & is_related, "rumor"]
    return sorted(set(rumor_column.tolist()))

# Attach, for each sample set, one list-valued column per annotator holding the
# rumor indices that annotator labelled as "related".
for sample_set, sample_set_annotations in (
    (main_samples, main_annotations),
    (additional_samples, additional_annotations),
):
    for annotator_name in ("alex", "louisa"):
        sample_set[annotator_name] = sample_set.apply(
            lambda row: get_annotations_for_annotator(row, sample_set_annotations, annotator_name),
            axis=1,
        )


def _split_by_agreement(sample_set):
    """Split a sample set by how the "alex" and "louisa" columns relate.

    Returns:
        A tuple `(same, different, annotated)`: the rows where both lists are
        equal, the rows where they differ, and the rows where at least one of
        the two lists is non-empty.
    """
    def has_rumors(rumor_list):
        return len(rumor_list) > 0

    alex_lists = sample_set["alex"]
    louisa_lists = sample_set["louisa"]

    same = sample_set[alex_lists == louisa_lists]
    different = sample_set[alex_lists != louisa_lists]
    annotated = sample_set[alex_lists.apply(has_rumors) | louisa_lists.apply(has_rumors)]
    return same, different, annotated


# Agreements, differences and presence of any annotation, per sample set.
main_same, main_different, main_any = _split_by_agreement(main_samples)
additional_same, additional_different, additional_any = _split_by_agreement(additional_samples)

total_rows_with_annotations = len(main_any) + len(additional_any)
total_differences = len(main_different) + len(additional_different)

print(f"Total rows with any annotations: {total_rows_with_annotations}")
print(f"Total differences: {total_differences}")

# Difference inspection

In [42]:
"""
Get the rumor text for a given index
"""
def rumor_index_to_rumor(index):
    """Return the rumor stored at position `index` of `task["rumors"]`.

    Args:
        index: Position in the "rumors" entry of the module-level `task`.

    Returns:
        Whatever `task["rumors"]` holds at that position.
    """
    return task["rumors"][index]

"""
Print a text wrapped to fit the terminal width
"""
def print_wrapped_to_fit_terminal(text):
    """Print `text` wrapped to the current terminal width.

    Args:
        text: The value to print; anything that is not a `str` is converted
            with `str()` first.

    The width comes from `shutil.get_terminal_size`, falling back to 80
    columns when no terminal size can be determined. Wrapping is done with
    `textwrap.fill`, which returns a single re-flowed paragraph.
    """
    if not isinstance(text, str):
        text = str(text)

    width = shutil.get_terminal_size(fallback=(80, 20)).columns
    print(textwrap.fill(text, width=width))

"""
Show differences between two annotators.
Example: show_differences(additional_samples, "alex", "louisa")
"""
def show_differences(df, annotator_1, annotator_2):
    """Print every row of `df` on which the two given annotators disagree.

    Args:
        df: Samples with the columns "message_date" and "message_text" plus one
            column of rumor-index lists per annotator.
        annotator_1: Name of the first annotator's column.
        annotator_2: Name of the second annotator's column.

    For each differing row this prints a separator, the row label, the message
    date, the wrapped message text and, per annotator, the rumors behind their
    indices. The printed labels are the capitalised column names, so the
    columns "alex" and "louisa" are shown as "Alex" and "Louisa".

    Example: show_differences(additional_samples, "alex", "louisa")
    """
    for i, row in df.iterrows():
        if row[annotator_1] != row[annotator_2]:
            print("---")
            print(i)
            print(row["message_date"])
            print_wrapped_to_fit_terminal(row["message_text"])
            # Read the columns that were actually requested instead of fixed names.
            for annotator in (annotator_1, annotator_2):
                print(
                    f"{str(annotator).capitalize()}: ",
                    [rumor_index_to_rumor(rumor) for rumor in row[annotator]],
                )
            print()

# Difference correction
The following code applies the annotation changes that the annotators agreed on during their discussion.

In [None]:
# Row labels whose disagreement was settled in favour of one annotator. A row
# listed under "..._AS_ALEX" gets alex's annotation copied into louisa's
# post-discussion column, and vice versa for "..._AS_LOUISA".
MAIN_ROWS_RESOLVED_AS_ALEX = [
    20, 44, 95, 241, 273, 304, 361, 365, 367, 378,
    402, 420, 430, 457, 470, 516, 522, 593,
]
MAIN_ROWS_RESOLVED_AS_LOUISA = [34, 205, 258, 502]

ADDITIONAL_ROWS_RESOLVED_AS_ALEX = [7, 18, 47, 49, 52]
ADDITIONAL_ROWS_RESOLVED_AS_LOUISA = [51, 59]


def _adopt_annotations(samples, rows, source, target):
    """Copy the value of column `source` into column `target` for each row in `rows`.

    Args:
        samples: DataFrame that is modified in place.
        rows: Row labels to correct.
        source: Column whose value is taken over.
        target: Column that is overwritten.

    Raises:
        KeyError: If a label in `rows` is not part of `samples.index`. Without
            this check an assignment through `.at` could add a new row instead
            of correcting an existing one.
    """
    missing_rows = [row for row in rows if row not in samples.index]
    if missing_rows:
        raise KeyError(f"Rows to correct are missing from the dataset: {missing_rows}")

    for row in rows:
        samples.at[row, target] = samples.at[row, source]


# Start the post-discussion columns from the original annotations. They are
# re-assigned on every run, so re-running this cell gives the same result.
main_samples["alex_after"] = main_samples["alex"]
main_samples["louisa_after"] = main_samples["louisa"]

additional_samples["alex_after"] = additional_samples["alex"]
additional_samples["louisa_after"] = additional_samples["louisa"]

# Every row label appears in exactly one list, so the order of these calls
# does not affect the result.
_adopt_annotations(main_samples, MAIN_ROWS_RESOLVED_AS_ALEX, source="alex_after", target="louisa_after")
_adopt_annotations(main_samples, MAIN_ROWS_RESOLVED_AS_LOUISA, source="louisa_after", target="alex_after")

_adopt_annotations(additional_samples, ADDITIONAL_ROWS_RESOLVED_AS_ALEX, source="alex_after", target="louisa_after")
_adopt_annotations(additional_samples, ADDITIONAL_ROWS_RESOLVED_AS_LOUISA, source="louisa_after", target="alex_after")

# Stack both sample sets (row labels of the two frames are kept as they are).
all_after_discussion = pd.concat([main_samples, additional_samples])

alex_final = all_after_discussion["alex_after"]
louisa_final = all_after_discussion["louisa_after"]

differences_after_discussion = all_after_discussion[alex_final != louisa_final]
agreed_after_discussion = all_after_discussion[alex_final == louisa_final]


def _print_rumor_distribution(title, rumor_counts):
    """Print `title` followed by one "<rumor>: <count>" line per counted rumor index."""
    print(title)
    for rumor_index, count in rumor_counts.items():
        print(f"{rumor_index_to_rumor(rumor_index)}: {count}")


# Number of rows on which the annotators still disagree.
print(f"Unresolved differences after discussion: {len(differences_after_discussion)}")
print()

# Rumor distribution over the agreed rows; both columns are equal there, so
# reading alex's column is enough.
agreed_rumors_after_discussion = []
for rumor_list in agreed_after_discussion["alex_after"]:
    agreed_rumors_after_discussion.extend(rumor_list)
rumor_counts_after_discussion = pd.Series(agreed_rumors_after_discussion).value_counts()

_print_rumor_distribution("Distribution of agreed rumors after discussion:", rumor_counts_after_discussion)

print()

# Rumors involved in unresolved differences: all of alex's lists first, then
# all of louisa's.
different_rumors_after_discussion = []
for annotator_column in ("alex_after", "louisa_after"):
    for rumor_list in differences_after_discussion[annotator_column]:
        different_rumors_after_discussion.extend(rumor_list)
different_rumor_counts_after_discussion = pd.Series(different_rumors_after_discussion).value_counts()

_print_rumor_distribution("Distribution of different rumors after discussion:", different_rumor_counts_after_discussion)


# Export annotated dataset

In [57]:
# Write the combined, corrected samples to disk; the index is not written.
annotated_export_path = "../../data/annotation_results/anno_02_annotated.csv"
all_after_discussion.to_csv(annotated_export_path, index=False)