In [140]:
import json
import os
import pandas as pd
import shutil
import textwrap
import spacy
from string import punctuation
from spacy.lang.de import stop_words
import numpy as np
# Punctuation characters as a list (later cells look tokens up in it).
punctuations = [*punctuation]
# German spaCy pipeline; the lemmatisation cell calls it on every post text.
nlp = spacy.load('de_core_news_sm')
# NOTE: this rebinds the name `stop_words` from the imported spaCy module to its
# STOP_WORDS collection, so the import above has to be re-run before this line.
stop_words = stop_words.STOP_WORDS

# Read annotation data
The code below reads all annotation files from the annotations directory and matches them to the annotated samples, creating one annotated dataframe per topic.

In [2]:
# Root directory of the annotation results. The loading code walks it two levels
# deep: one sub-directory per annotator, each containing one file per dataset.
anno_root_dir = '../../data/annotation_results/anno_01'

In [3]:
# Read all annotations
ANNOTATION_COLUMNS = ["annotator", "dataset", "sample", "rumor", "label"]


def _read_annotation_records(root_dir):
    """Collect every annotation stored below ``root_dir`` as a list of dicts.

    Layout read by this function: ``root_dir/<annotator>/<dataset file>``.
    Each dataset file is JSON with an ``"annotatedSamples"`` list; every
    sample carries a ``"sampleIndex"`` and a list of ``"annotations"``, each
    with a ``"rumorIndex"`` and a ``"label"``.

    Hidden entries, non-directories on annotator level and anything that is
    not a regular ``.json`` file on dataset level (e.g. ``.DS_Store`` or
    ``.ipynb_checkpoints``) are skipped instead of crashing the load.

    Returns:
        list[dict]: one record per annotation with the keys listed in
        ``ANNOTATION_COLUMNS``.
    """
    records = []
    # os.listdir returns entries in arbitrary order; sort for reproducible rows.
    for annotator_dir in sorted(os.listdir(root_dir)):
        annotator_path = os.path.join(root_dir, annotator_dir)
        if annotator_dir.startswith(".") or not os.path.isdir(annotator_path):
            continue

        for anno_file in sorted(os.listdir(annotator_path)):
            anno_path = os.path.join(annotator_path, anno_file)
            is_annotation_file = (
                anno_file.endswith(".json")
                and not anno_file.startswith(".")
                and os.path.isfile(anno_path)
            )
            if not is_annotation_file:
                continue

            # Explicit encoding: the default depends on the platform locale.
            with open(anno_path, encoding="utf-8") as f:
                annotated_samples = json.load(f)["annotatedSamples"]

            for sample in annotated_samples:
                for annotation in sample["annotations"]:
                    records.append({
                        "annotator": annotator_dir,
                        "dataset": anno_file,
                        "sample": sample["sampleIndex"],
                        "rumor": annotation["rumorIndex"],
                        "label": annotation["label"]
                    })
    return records


annotations = _read_annotation_records(anno_root_dir)

# Passing the columns explicitly keeps the filters below working even when no
# annotation was found (an empty frame would otherwise have no columns).
anno_df = pd.DataFrame(annotations, columns=ANNOTATION_COLUMNS)

# One annotation frame per topic, selected by the dataset file name
migrant_anno_df = anno_df[anno_df["dataset"] == "tfidf_migrant.json"]
vacc_anno_df = anno_df[anno_df["dataset"] == "tfidf_vacc.json"]
brandenburg_anno_df = anno_df[anno_df["dataset"] == "tfidf_brandenburg.json"]
trans_anno_df = anno_df[anno_df["dataset"] == "tfidf_trans.json"]

In [4]:
# Read datasets
def _load_annotated_subset(csv_path):
    """Return the first 150 rows of the CSV at ``csv_path`` with a fresh 0-based index.

    The first CSV column is read as the index and then discarded by the
    index reset, so row labels run from 0 upwards.
    """
    # presumably only the first 150 rows per topic were annotated -- TODO confirm
    full_df = pd.read_csv(csv_path, index_col=0)
    return full_df.head(150).reset_index(drop=True)


migrant_df = _load_annotated_subset("../../data/main_dataset/anno_1_set-migration.csv")
vacc_df = _load_annotated_subset("../../data/main_dataset/anno_1_set-vaccination.csv")
brandenburg_df = _load_annotated_subset("../../data/main_dataset/anno_1_set-brandenburg.csv")
trans_df = _load_annotated_subset("../../data/main_dataset/anno_1_set-trans.csv")


In [5]:
# Read task
# The task definition is later indexed as task["rumors"][rumor_index] to turn
# rumor indices into rumor texts. The encoding is given explicitly because the
# default of open() depends on the platform locale, while JSON is UTF-8.
with open("../../data/tasks/task_v1.json", encoding="utf-8") as f:
    task = json.load(f)

In [None]:
def get_annotations_for_annotator(row, anno_df, annotator):
    """Return the sorted, de-duplicated rumor indices one annotator marked as related.

    Args:
        row: dataframe row whose label (``row.name``) is matched against the
            ``"sample"`` column of ``anno_df``.
        anno_df: annotation records with the columns ``"sample"``,
            ``"annotator"``, ``"label"`` and ``"rumor"``.
        annotator: annotator name to filter on.

    Returns:
        list: ascending rumor indices labelled ``"related"`` for this sample
        by this annotator; empty if there are none.
    """
    is_match = (
        (anno_df["sample"] == row.name)
        & (anno_df["annotator"] == annotator)
        & (anno_df["label"] == "related")
    )
    rumor_indices = set(anno_df.loc[is_match, "rumor"].tolist())
    return sorted(rumor_indices)

# Add annotation columns for each annotator to the datasets: every cell holds
# the sorted list of rumor indices that annotator marked as related for the row.
for topic_df, topic_anno_df in (
    (migrant_df, migrant_anno_df),
    (vacc_df, vacc_anno_df),
    (brandenburg_df, brandenburg_anno_df),
    (trans_df, trans_anno_df),
):
    for annotator in ("alex", "louisa"):
        # apply() passes each row first, followed by the extra positional args
        topic_df[annotator] = topic_df.apply(
            get_annotations_for_annotator,
            axis=1,
            args=(topic_anno_df, annotator),
        )

# Compute annotator agreements, differences and general presence of annotations
def _agreement_subsets(topic_df):
    """Split ``topic_df`` into three row subsets.

    Returns:
        tuple: ``(same, different, any)`` -- rows where both annotators'
        rumor lists are equal, rows where they differ, and rows where at
        least one annotator assigned a rumor.
    """
    alex_labels = topic_df["alex"]
    louisa_labels = topic_df["louisa"]
    has_annotation = (alex_labels.map(len) > 0) | (louisa_labels.map(len) > 0)
    return (
        topic_df[alex_labels == louisa_labels],
        topic_df[alex_labels != louisa_labels],
        topic_df[has_annotation],
    )


migrant_same, migrant_different, migrant_any = _agreement_subsets(migrant_df)
vacc_same, vacc_different, vacc_any = _agreement_subsets(vacc_df)
brandenburg_same, brandenburg_different, brandenburg_any = _agreement_subsets(brandenburg_df)
trans_same, trans_different, trans_any = _agreement_subsets(trans_df)

# Totals over all four topics
total_rows_with_annotations = sum(
    len(subset) for subset in (migrant_any, vacc_any, brandenburg_any, trans_any)
)
total_differences = sum(
    len(subset)
    for subset in (migrant_different, vacc_different, brandenburg_different, trans_different)
)

print(f"Total rows with any annotations: {total_rows_with_annotations}")
print(f"Total differences: {total_differences}")

# Difference Inspection

In [8]:
def rumor_index_to_rumor(index):
    """Get the rumor text for a given index.

    Args:
        index: position of the rumor in the ``"rumors"`` list of the
            module-level ``task`` definition.

    Returns:
        The entry ``task["rumors"][index]``.
    """
    return task["rumors"][index]

def print_wrapped_to_fit_terminal(text):
    """Print a text wrapped to fit the terminal width.

    Non-string values are converted with ``str()`` first. If the terminal
    size cannot be determined, a width of 80 columns is used.

    Args:
        text: the value to print.
    """
    if not isinstance(text, str):
        text = str(text)

    width = shutil.get_terminal_size(fallback=(80, 20)).columns
    print(textwrap.fill(text, width=width))

def show_differences(df, annotator_1, annotator_2):
    """Show differences between two annotators.

    For every row whose two annotation lists differ, prints a separator, the
    row label, the message date, the wrapped text and, per annotator, the
    rumor texts belonging to the annotated rumor indices.

    Example: show_differences(migrant_df, "alex", "louisa")

    Args:
        df: dataframe with the columns ``"message_date"``, ``"text"`` and one
            column of rumor-index lists per annotator.
        annotator_1: column name of the first annotator.
        annotator_2: column name of the second annotator.
    """
    for i, row in df.iterrows():
        if row[annotator_1] == row[annotator_2]:
            continue

        print("---")
        print(i)
        print(row["message_date"])
        print_wrapped_to_fit_terminal(row["text"])
        # Use the requested annotator columns (previously "alex"/"louisa" were
        # hard-coded, so the parameters only affected the comparison).
        for annotator in (annotator_1, annotator_2):
            rumors = [rumor_index_to_rumor(rumor) for rumor in row[annotator]]
            print(f"{annotator.capitalize()}: ", rumors)
        print()

# Difference correction
The following code applies the annotation changes that were agreed on during the discussion.

In [None]:
# Manual corrections, applied to copies of the per-topic dataframes so the
# original annotations stay available. Row labels are the 0-based sample
# indices. Most lines adopt one annotator's rumor list for the other annotator;
# a few set an explicit list instead.
# DataFrame.copy() does not copy the list objects stored in the cells; this is
# safe here because cells are only re-assigned, never mutated in place.

# --- migration ---
migrant_df_after_discussion = migrant_df.copy()
migrant_df_after_discussion.at[1, "louisa"] = migrant_df_after_discussion.at[1, "alex"]
migrant_df_after_discussion.at[19, "louisa"] = migrant_df_after_discussion.at[19, "alex"]
migrant_df_after_discussion.at[31, "alex"] = migrant_df_after_discussion.at[31, "louisa"]
migrant_df_after_discussion.at[32, "alex"] = migrant_df_after_discussion.at[32, "louisa"]
migrant_df_after_discussion.at[55, "louisa"] = migrant_df_after_discussion.at[55, "alex"]
migrant_df_after_discussion.at[63, "alex"] = migrant_df_after_discussion.at[63, "louisa"]
migrant_df_after_discussion.at[64, "louisa"] = migrant_df_after_discussion.at[64, "alex"]
migrant_df_after_discussion.at[69, "alex"] = migrant_df_after_discussion.at[69, "louisa"]
migrant_df_after_discussion.at[124, "louisa"] = migrant_df_after_discussion.at[124, "alex"]
migrant_df_after_discussion.at[140, "alex"] = migrant_df_after_discussion.at[140, "louisa"]

# --- trans ---
trans_df_after_discussion = trans_df.copy()
trans_df_after_discussion.at[4, "louisa"] = trans_df_after_discussion.at[4, "alex"]
trans_df_after_discussion.at[14, "alex"] = trans_df_after_discussion.at[14, "louisa"]
trans_df_after_discussion.at[65, "louisa"] = trans_df_after_discussion.at[65, "alex"]
trans_df_after_discussion.at[99, "louisa"] = trans_df_after_discussion.at[99, "alex"]
trans_df_after_discussion.at[123, "louisa"] = trans_df_after_discussion.at[123, "alex"]
trans_df_after_discussion.at[135, "louisa"] = trans_df_after_discussion.at[135, "alex"]
trans_df_after_discussion.at[138, "louisa"] = trans_df_after_discussion.at[138, "alex"]

# --- brandenburg ---
brandenburg_df_after_discussion = brandenburg_df.copy()
brandenburg_df_after_discussion.at[14, "louisa"] = brandenburg_df_after_discussion.at[14, "alex"]
brandenburg_df_after_discussion.at[19, "louisa"] = brandenburg_df_after_discussion.at[19, "alex"]
# Row 28: both annotators are set to rumor index 0.
brandenburg_df_after_discussion.at[28, "alex"] = [0]
brandenburg_df_after_discussion.at[28, "louisa"] = [0]
brandenburg_df_after_discussion.at[33, "louisa"] = brandenburg_df_after_discussion.at[33, "alex"]
brandenburg_df_after_discussion.at[46, "louisa"] = brandenburg_df_after_discussion.at[46, "alex"]
brandenburg_df_after_discussion.at[47, "louisa"] = brandenburg_df_after_discussion.at[47, "alex"]
brandenburg_df_after_discussion.at[55, "louisa"] = brandenburg_df_after_discussion.at[55, "alex"]
# NOTE(review): rows 58 and 59 only overwrite "alex" with the concatenation of
# both lists (not sorted, not de-duplicated); "louisa" is left unchanged, so the
# two columns end up equal only if alex's list was empty -- confirm intended.
brandenburg_df_after_discussion.at[58, "alex"] = brandenburg_df_after_discussion.at[58, "alex"] + brandenburg_df_after_discussion.at[58, "louisa"]
brandenburg_df_after_discussion.at[59, "alex"] = brandenburg_df_after_discussion.at[59, "alex"] + brandenburg_df_after_discussion.at[59, "louisa"]
brandenburg_df_after_discussion.at[89, "louisa"] = brandenburg_df_after_discussion.at[89, "alex"]
brandenburg_df_after_discussion.at[105, "alex"] = brandenburg_df_after_discussion.at[105, "louisa"]
# NOTE(review): row 111 only sets "louisa"; presumably alex already has [1] -- confirm.
brandenburg_df_after_discussion.at[111, "louisa"] = [1]
# Row 112: both annotators are set to rumor index 0.
brandenburg_df_after_discussion.at[112, "alex"] = [0]
brandenburg_df_after_discussion.at[112, "louisa"] = [0]
brandenburg_df_after_discussion.at[117, "louisa"] = brandenburg_df_after_discussion.at[117, "alex"]
brandenburg_df_after_discussion.at[124, "alex"] = brandenburg_df_after_discussion.at[124, "louisa"]
brandenburg_df_after_discussion.at[133, "alex"] = brandenburg_df_after_discussion.at[133, "louisa"]
# NOTE(review): row 136 appends rumor index 1 to louisa's list and leaves "alex"
# untouched; the result matches alex only if alex's list is louisa's old list
# followed by 1 -- confirm.
brandenburg_df_after_discussion.at[136, "louisa"] = brandenburg_df_after_discussion.at[136, "louisa"] + [1]

# --- vaccination ---
vacc_df_after_discussion = vacc_df.copy()
vacc_df_after_discussion.at[5, "alex"] = vacc_df_after_discussion.at[5, "louisa"]
vacc_df_after_discussion.at[23, "louisa"] = vacc_df_after_discussion.at[23, "alex"]
vacc_df_after_discussion.at[25, "louisa"] = vacc_df_after_discussion.at[25, "alex"]
vacc_df_after_discussion.at[31, "alex"] = vacc_df_after_discussion.at[31, "louisa"]
vacc_df_after_discussion.at[68, "louisa"] = vacc_df_after_discussion.at[68, "alex"]
vacc_df_after_discussion.at[129, "louisa"] = vacc_df_after_discussion.at[129, "alex"]
vacc_df_after_discussion.at[135, "louisa"] = vacc_df_after_discussion.at[135, "alex"]
vacc_df_after_discussion.at[148, "alex"] = vacc_df_after_discussion.at[148, "louisa"]


def _print_rumor_counts(title, rumor_counts):
    """Print ``title`` followed by one "<rumor text>: <count>" line per rumor index."""
    print(title)
    for rumor_index, count in rumor_counts.items():
        print(f"{rumor_index_to_rumor(rumor_index)}: {count}")


# Stack the corrected topic dataframes (row labels repeat across topics)
all_after_discussion = pd.concat([
    migrant_df_after_discussion,
    vacc_df_after_discussion,
    brandenburg_df_after_discussion,
    trans_df_after_discussion,
])
_agrees_after = all_after_discussion["alex"] == all_after_discussion["louisa"]
_differs_after = all_after_discussion["alex"] != all_after_discussion["louisa"]
all_differences = len(all_after_discussion[_differs_after])

# Rumor indices of all rows the annotators agree on, flattened into one list
equal_vals_after_discussion = [
    rumor
    for rumors in all_after_discussion[_agrees_after]["alex"].values.tolist()
    for rumor in rumors
]
rumor_counts_after_discussion = pd.Series(equal_vals_after_discussion).value_counts()

# Print info about the distribution of rumors after discussion
_print_rumor_counts("RUMOR DISTRIBUTION AFTER DISCUSSION:", rumor_counts_after_discussion)

print()

# Print the number of rows on which the annotators still disagree after discussion
differences_after_discussion = all_after_discussion[_differs_after]
print(f"Differences after discussion: {len(differences_after_discussion)}")

print()

# Rumor indices of both annotators on the unresolved rows, flattened into one list
different_vals_after_discussion = [
    rumor
    for rumors in (
        differences_after_discussion["alex"].values.tolist()
        + differences_after_discussion["louisa"].values.tolist()
    )
    for rumor in rumors
]
diff_rumor_counts_after_discussion = pd.Series(different_vals_after_discussion).value_counts()

# Print info about the distribution of rumors involved in unresolved differences
_print_rumor_counts("RUMORS INVOLVED IN UNRESOLVED DIFFERENCES:", diff_rumor_counts_after_discussion)

# Creating and exporting the final annotated dataframe

In [12]:
# Store the post-discussion labels next to the original annotator columns
for topic_df, discussed_df in (
    (trans_df, trans_df_after_discussion),
    (vacc_df, vacc_df_after_discussion),
    (brandenburg_df, brandenburg_df_after_discussion),
    (migrant_df, migrant_df_after_discussion),
):
    topic_df["alex_after"] = discussed_df["alex"]
    topic_df["louisa_after"] = discussed_df["louisa"]

# Combine all topics into one dataframe (row labels repeat across topics)
topic_dfs = [migrant_df, vacc_df, brandenburg_df, trans_df]
all_df = pd.concat(topic_dfs)

# True where both annotators have the same rumor list after the discussion
all_df["agree_after"] = all_df["alex_after"] == all_df["louisa_after"]

# export full annotated dataset (without the row labels)
all_df.to_csv("../../data/annotation_results/anno_01_annotated.csv", index=False)


# Experimental negative lemma research
The code below is an experiment to find lemmas that are negatively correlated with positive samples, i.e. samples annotated as containing one of the rumors. This was later used to filter out off-topic posts.

In [189]:
# Lemmatizing the text
# Extra symbols that are filtered out like punctuation. They are appended only
# if missing, so re-running this cell does not keep growing the shared list.
# ("nan" presumably removes stringified missing values -- TODO confirm.)
for extra_symbol in ["\u2796", "--", "ⓜ", "⚡", "nan", "\uFEFF"]:
    if extra_symbol not in punctuations:
        punctuations.append(extra_symbol)


def lemmatize(text):
    """Return the lower-cased lemmas of ``text`` without stop words and punctuation.

    The text is processed with the module-level spaCy pipeline ``nlp``. Each
    token contributes its lower-cased, stripped lemma; tokens whose lemma is
    the placeholder "-PRON-" contribute their lower-cased surface form
    instead. Lemmas found in ``stop_words`` or ``punctuations`` are dropped.

    Args:
        text: the text to lemmatize.

    Returns:
        list[str]: the remaining lemmas in token order.
    """
    # Snapshot the list as a set once per call for constant-time lookups.
    ignored_symbols = set(punctuations)
    lemmas = [
        token.lemma_.lower().strip() if token.lemma_ != "-PRON-" else token.lower_
        for token in nlp(text)
    ]
    return [
        lemma
        for lemma in lemmas
        if lemma not in stop_words and lemma not in ignored_symbols
    ]


all_df["lemma"] = all_df["text"].apply(lemmatize)

In [None]:
# Samples the annotators agree on after the discussion ("valid") and, among
# them, the samples annotated with at least one rumor ("positive")
valid = all_df[all_df["agree_after"]]
positive = valid[valid["alex_after"].map(len) > 0]

# Every lemma is counted once per text, so the counts below are the number of
# texts that contain the lemma
positive_lemma = [lemma for lemmas in positive["lemma"] for lemma in set(lemmas)]
all_lemma = [lemma for lemmas in valid["lemma"] for lemma in set(lemmas)]

positive_lemma_counts = pd.Series(positive_lemma).value_counts()
all_lemma_counts = pd.Series(all_lemma).value_counts()

# Sum of all counts in the positive and valid sets, and the vocabulary size
total_pos = positive_lemma_counts.sum()
total_valid = all_lemma_counts.sum()
unique_lemma = len(all_lemma_counts)

alpha = 0.001 # smoothing parameter

# Smoothed denominators are the same for every token
_pos_denominator = total_pos + alpha * unique_lemma
_valid_denominator = total_valid + alpha * unique_lemma


def _smoothed_log_ratio(token):
    """Log of the smoothed share of ``token`` in the positive set over its share in the valid set.

    Negative values mean the token is relatively rarer in positive samples.
    """
    p_token_pos = (positive_lemma_counts.get(token, 0) + alpha) / _pos_denominator
    p_token_valid = (all_lemma_counts.get(token, 0) + alpha) / _valid_denominator
    return np.log(p_token_pos / p_token_valid)


# One value per lemma of the vocabulary (named "log-odds" throughout this cell)
log_odds_ratios = {token: _smoothed_log_ratio(token) for token in all_lemma_counts.index}

# Convert to a dataframe, most negative values first
log_odds_df = pd.DataFrame({
    'token': list(log_odds_ratios),
    'log_odds': list(log_odds_ratios.values()),
}).sort_values(by='log_odds', ascending=True)

# Add count: How often does lemma appear in positive and valid sets?
log_odds_df["positive_count"] = log_odds_df["token"].map(
    lambda lemma: positive_lemma_counts.get(lemma, 0)
)
log_odds_df["valid_count"] = log_odds_df["token"].map(
    lambda lemma: all_lemma_counts.get(lemma, 0)
)

# Inspect the 200 tokens with the most negative values
print("Most negatively correlated tokens:")
most_negative = log_odds_df.head(200)
for _, row in most_negative.iterrows():
    print(f"{row['token']}: {row['positive_count']} / {row['valid_count']}")