In [1]:
import json
import os
import pandas as pd
import shutil
import textwrap
import spacy
from string import punctuation
from spacy.lang.de import stop_words
import numpy as np
# Load the spaCy pipeline named 'de_core_news_sm' (the model package must be installed).
nlp = spacy.load('de_core_news_sm')
# Rebind `stop_words` from the imported module to the STOP_WORDS collection it defines.
stop_words = stop_words.STOP_WORDS
# The characters of `string.punctuation`, split into a list of single-character strings.
punctuations = list(punctuation)

# Read annotation data
The code below reads all annotation files from the annotations directory and matches them to the annotated samples.

In [12]:
# Root directory of the annotation results; the reading loop below treats each
# sub-directory as one annotator. `expanduser` only expands a leading "~", so this
# relative path is passed through unchanged.
anno_root_dir = os.path.expanduser('../../data/annotation_results/anno_03')

In [18]:
# Read all annotations into a flat list of records (one record per annotated rumor)
annotations = []

# Column layout of the resulting frame; passing it explicitly keeps the columns
# present even when no annotation was found at all.
annotation_columns = ["annotator", "dataset", "sample", "rumor", "label"]

# Go through the annotator directories. os.listdir() returns entries in arbitrary
# order, so sort them to make the row order of `anno_df` reproducible.
for annotator_dir in sorted(os.listdir(anno_root_dir)):
    annotator_path = os.path.join(anno_root_dir, annotator_dir)

    # if not directory, skip
    if not os.path.isdir(annotator_path):
        continue

    # go through the dataset files of this annotator and read their annotations
    for anno_file in sorted(os.listdir(annotator_path)):
        anno_path = os.path.join(annotator_path, anno_file)

        # Skip hidden entries (e.g. ".DS_Store") and anything that is not a regular
        # file (e.g. nested directories), which cannot be parsed as JSON.
        if anno_file.startswith(".") or not os.path.isfile(anno_path):
            continue

        # JSON is UTF-8 by specification; do not rely on the platform default encoding.
        with open(anno_path, encoding="utf-8") as f:
            curr_annotated_samples = json.load(f)["annotatedSamples"]

        for sample in curr_annotated_samples:
            for annotation in sample["annotations"]:
                annotations.append({
                    "annotator": annotator_dir,
                    "dataset": anno_file,
                    "sample": sample["sampleIndex"],
                    "rumor": annotation["rumorIndex"],
                    "label": annotation["label"]
                })

anno_df = pd.DataFrame(annotations, columns=annotation_columns)

In [19]:
# Read samples data. The first CSV column is consumed as the index and then dropped,
# so rows are renumbered 0..n-1; annotations are later matched against this index.
samples = pd.read_csv("../../data/main_dataset/anno_3_set.csv", index_col=0).reset_index(drop=True)

# Read task. JSON is UTF-8 by specification, so the encoding is given explicitly
# instead of relying on the platform default.
with open("../../data/tasks/task_v2.json", encoding="utf-8") as f:
    task = json.load(f)

In [None]:
def get_annotations_for_annotator(row, anno_df, annotator):
    """Collect the rumor indices one annotator marked as "related" for one sample.

    Args:
        row: A row of the samples frame; its ``name`` (index label) is matched
            against the ``sample`` column of ``anno_df``.
        anno_df: Frame with at least the columns ``sample``, ``annotator``,
            ``label`` and ``rumor``.
        annotator: Annotator name to filter on.

    Returns:
        A sorted list of the distinct ``rumor`` values of all matching rows.
    """
    is_match = (
        anno_df["sample"].eq(row.name)
        & anno_df["annotator"].eq(annotator)
        & anno_df["label"].eq("related")
    )
    rumor_ids = anno_df.loc[is_match, "rumor"].tolist()
    # De-duplicate before sorting so every rumor index appears at most once.
    return sorted(set(rumor_ids))

# Attach each annotator's list of "related" rumor indices as a column of `samples`
for annotator_name in ("alex", "louisa"):
    samples[annotator_name] = samples.apply(
        get_annotations_for_annotator, axis=1, args=(anno_df, annotator_name)
    )

# Split the samples into agreements, disagreements and rows with any annotation at all
annotations_match = samples["alex"] == samples["louisa"]
same_anno = samples[annotations_match]
different_anno = samples[~annotations_match]

alex_has_any = samples["alex"].map(len) > 0
louisa_has_any = samples["louisa"].map(len) > 0
any_anno = samples[alex_has_any | louisa_has_any]

total_rows_with_annotations = any_anno.shape[0]
total_differences = different_anno.shape[0]

print(f"Total rows with any annotations: {total_rows_with_annotations}")
print(f"Total differences: {total_differences}")

# Difference Inspection

In [22]:
"""
Get the rumor text for a given index
"""
def rumor_index_to_rumor(index):
    """Return the entry of ``task["rumors"]`` stored under ``index`` (the rumor text)."""
    rumors = task["rumors"]
    return rumors[index]

"""
Print a text wrapped to fit the terminal width
"""
def print_wrapped_to_fit_terminal(text):
    """Print ``text`` word-wrapped to the current terminal width.

    Non-string values are converted with ``str()`` first. The width comes from
    ``shutil.get_terminal_size`` with a fallback of 80 columns.
    """
    content = text if isinstance(text, str) else str(text)
    columns = shutil.get_terminal_size(fallback=(80, 20)).columns
    wrapped_lines = textwrap.wrap(content, width=columns)
    print("\n".join(wrapped_lines))

"""
Show differences between two annotators.
Example: show_differences(samples, "alex", "louisa")
"""
def show_differences(df, annotator_1, annotator_2):
    """Print every row of ``df`` on which two annotation columns disagree.

    For each differing row this prints a separator, the row index, the
    ``message_date``, the wrapped ``message_text`` and, per annotator, the rumors
    looked up for the indices stored in that annotator's column.

    Args:
        df: Frame with the columns ``message_date``, ``message_text`` and the two
            annotation columns.
        annotator_1: Name of the first annotation column (e.g. ``"alex"``).
        annotator_2: Name of the second annotation column (e.g. ``"louisa"``).

    Example: show_differences(samples, "alex", "louisa")
    """
    for i, row in df.iterrows():
        if row[annotator_1] != row[annotator_2]:
            print("---")
            print(i)
            print(row["message_date"])
            print_wrapped_to_fit_terminal(row["message_text"])
            # Print the columns that were actually compared; the capitalized column
            # name is the label, so "alex" / "louisa" still print "Alex: " / "Louisa: ".
            for annotator in (annotator_1, annotator_2):
                print(f"{annotator.capitalize()}: ", [rumor_index_to_rumor(rumor) for rumor in row[annotator]])
            print()

# Difference correction
The following code adds the annotation changes agreed on during the discussion.

In [None]:
# Start the post-discussion columns from the original annotations
samples["alex_after"] = samples["alex"]
samples["louisa_after"] = samples["louisa"]

# Sample indices where Louisa's annotation is replaced by Alex's
louisa_takes_alex = [
    9, 69, 105, 147, 214, 231, 250, 256, 274, 307,
    314, 348, 410, 449, 450, 492, 498, 516, 579,
]

# Sample indices where Alex's annotation is replaced by Louisa's
alex_takes_louisa = [
    257, 318, 367, 405, 408, 422, 439, 502, 511, 527,
    566, 575, 592,
]

# The two index lists are disjoint, so the order of the assignments does not matter.
for sample_idx in louisa_takes_alex:
    samples.at[sample_idx, "louisa_after"] = samples.at[sample_idx, "alex_after"]

for sample_idx in alex_takes_louisa:
    samples.at[sample_idx, "alex_after"] = samples.at[sample_idx, "louisa_after"]


# Split the samples by whether both annotators now hold identical rumor lists
resolved_mask = samples["alex_after"] == samples["louisa_after"]
differences_after_discussion = samples[~resolved_mask]
agreed_after_discussion = samples[resolved_mask]

print(f"Unresolved differences after discussion: {differences_after_discussion.shape[0]}")
print()

# Distribution of rumors over the agreed rows (both columns are equal there, so
# Alex's column is used)
agreed_rumors_after_discussion = []
for rumor_list in agreed_after_discussion["alex_after"]:
    agreed_rumors_after_discussion.extend(rumor_list)
rumor_counts_after_discussion = pd.Series(agreed_rumors_after_discussion).value_counts()

print("Distribution of agreed rumors after discussion:")
for rumor_index, count in rumor_counts_after_discussion.items():
    rumor = rumor_index_to_rumor(rumor_index)
    print(f"{rumor}: {count}")

print()

# Distribution of rumors involved in unresolved differences: every rumor named by
# either annotator on a still-differing row is counted (Alex's lists first, then Louisa's)
different_rumors_after_discussion = []
for annotation_column in ("alex_after", "louisa_after"):
    for rumor_list in differences_after_discussion[annotation_column]:
        different_rumors_after_discussion.extend(rumor_list)
different_rumor_counts_after_discussion = pd.Series(different_rumors_after_discussion).value_counts()

print("Distribution of different rumors after discussion:")
for rumor_index, count in different_rumor_counts_after_discussion.items():
    rumor = rumor_index_to_rumor(rumor_index)
    print(f"{rumor}: {count}")

# Export annotated dataset

In [30]:
# Write the samples, including the annotation columns added above, to CSV;
# index=False leaves the row index out of the file.
# NOTE(review): the list-valued annotation columns are presumably serialized as their
# string form (e.g. "[1, 2]") and need parsing when the file is read back — confirm.
samples.to_csv("../../data/annotation_results/anno_03_annotated.csv", index=False)