In [135]:
import pandas as pd
import ast
import spacy
from string import punctuation
from spacy.lang.de import stop_words
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# One list entry per character of string.punctuation.
punctuations = [char for char in punctuation]
# Rebind the imported module name to the stop-word collection it exposes.
stop_words = stop_words.STOP_WORDS
# Load the spaCy pipeline that is used below to tokenise and lemmatise text.
nlp = spacy.load('de_core_news_sm')

# Load datasets

In [136]:
# read the dataset
df = pd.read_csv('../../data/main_dataset/main_dataset-prepro-5000-10000.csv')

# read the annotation results from iteration one
anno_1_results = pd.read_csv('../../data/annotation_results/anno_01_annotated.csv', index_col=0)

# A row counts as a positive example when its "alex_after" and "louisa_after"
# values are identical (presumably the two annotators' final labels - confirm)
# and the "alex_after" value, parsed as a Python literal, is non-empty.
annotations_agree = anno_1_results["alex_after"] == anno_1_results["louisa_after"]
has_labels = anno_1_results["alex_after"].apply(ast.literal_eval).apply(len) > 0

# .copy() makes this an independent frame: columns are added to it further
# down, and writing into a boolean-mask selection of another frame triggers
# pandas' SettingWithCopyWarning.
anno_1_positive = anno_1_results[annotations_agree & has_labels].copy()

# Lemmatize dataset

In [71]:
"""
Get the text of a sample
"""
def get_row_text(row):
  """
  Build the text of a sample from its message and webpage description.

  The 'message_text' and 'webpage_description' fields are combined in that
  order. A field that is not a string (e.g. a missing value that was read as
  NaN) or that is empty is skipped.

  Parameters
  ----------
  row : mapping-like (for example a DataFrame row)
    Must provide the keys 'message_text' and 'webpage_description'.

  Returns
  -------
  str
    The usable fields joined by a single space; '' if there are none.
  """
  fields = (row['message_text'], row['webpage_description'])
  # Join with a space: plain concatenation would fuse the last word of the
  # message with the first word of the description into one bogus token.
  return ' '.join(field for field in fields if isinstance(field, str) and field)

# Attach the combined text of every positive example as a "text" column.
positive_texts = anno_1_positive.apply(get_row_text, axis=1)
anno_1_positive["text"] = positive_texts

In [74]:
# Noise tokens that are removed in addition to stop words and punctuation
stop_lemmas = ["\u2796", "--", "ⓜ", "⚡", "nan", "\uFEFF"]

def lemmatize(row):
  """
  Turn a sample into a list of cleaned, lower-cased lemmas.

  The sample's text (see get_row_text) is run through the spaCy pipeline
  `nlp`; each token is replaced by its lower-cased, stripped lemma. Entries
  that are stop words, punctuation characters, listed in `stop_lemmas`, or
  empty are removed.

  Parameters
  ----------
  row : mapping-like (for example a DataFrame row)
    The sample; passed unchanged to get_row_text.

  Returns
  -------
  list of str
    The remaining lemmas in token order.
  """
  doc = nlp(get_row_text(row))
  # lemmatizing; "-PRON-" is presumably the pronoun placeholder lemma emitted
  # by older spaCy versions - in that case the lower-cased token text is used
  lemmas = [ token.lower_ if token.lemma_ == "-PRON-" else token.lemma_.lower().strip() for token in doc ]
  # removing stop words, punctuation and noise tokens; the leading truthiness
  # check also drops whitespace-only tokens, whose lemma strips down to ''
  return [ lemma for lemma in lemmas if lemma and lemma not in stop_words and lemma not in punctuations and lemma not in stop_lemmas ]

# Give both frames a "lemma" column (list of lemmas per sample) and a
# "lemma_joined" column (the same lemmas as one space-separated string).
for frame in (df, anno_1_positive):
  frame["lemma"] = frame.apply(lemmatize, axis=1)
  frame["lemma_joined"] = frame["lemma"].apply(" ".join)


# Remove negatively associated lemmas
The following code removes samples containing lemmas which were found to be negatively correlated with the rumors (see anno_1_eval.ipynb). This is meant to remove off-topic samples, mostly related to geopolitics.

In [None]:
keywords = ["russland", "usa", "israel", "trump", "biden", "harris", "hisbollah", "libanon", "amerikanisch", "amerika", "militärisch", "militär", "ukraine", "ukrainisch", "russisch", "inflation", "klimaschutz", "euro", "diddy", "putin", "krieg", "soldat", "nasrallah", "illuminat"]

# Filter a copy so that the unfiltered frame stays available as `df`.
df_filtered = df.copy()

# The keywords are applied one after another, so each printed count is the
# number of samples a keyword removes from what the previous ones left over.
for keyword in keywords:
  keyword_absent = df_filtered["lemma"].apply(lambda lemmas: keyword not in lemmas)
  survivors = df_filtered[keyword_absent]
  print(f"'{keyword}': {len(df_filtered) - len(survivors)}")
  df_filtered = survivors

print("Remaining:", len(df_filtered))

# Create annotation set
The code below computes the cosine similarity between the tf-idf vectors of the positive examples from the previous iteration and the tf-idf vectors of the lemmatized dataset. For each sample, the highest similarity to any positive example is taken, and the dataset is ranked by this similarity. An annotation dataset of 600 samples is then created: with the configured ratio of 0.75, it consists of the 450 most similar samples as well as 150 random samples.

In [93]:
total_samples = 600
tfidf_ratio = 0.75
# Fixed seed so that the random draw and the final shuffle are reproducible
random_seed = 42


# Fit one vocabulary on candidates and positive examples together
vectorizer = TfidfVectorizer()
vectorizer.fit(pd.concat([df_filtered["lemma_joined"], anno_1_positive["lemma_joined"]]))

samples_tfidf = vectorizer.transform(df_filtered["lemma_joined"])
positive_tfidf = vectorizer.transform(anno_1_positive["lemma_joined"])

# One row per candidate sample, one column per positive example
cosine_similarities = cosine_similarity(samples_tfidf, positive_tfidf)

# The mean is kept for inspection only; the ranking uses the maximum
av_similarities = np.mean(cosine_similarities, axis=1)
max_similarities = np.max(cosine_similarities, axis=1)

# Negate so that argsort's ascending order yields the most similar first
sorted_indices = np.argsort(-max_similarities)
sorted_df = df_filtered.iloc[sorted_indices]

# Derive both quotas from the same integer so they always add up to
# total_samples (int(total * (1 - ratio)) can be one too small because of
# float rounding), and clamp them to the number of rows actually available.
n_tfidf = min(int(total_samples * tfidf_ratio), len(sorted_df))
n_random = min(total_samples - int(total_samples * tfidf_ratio), len(sorted_df) - n_tfidf)

tf_idf_samples = sorted_df.head(n_tfidf)
# Draw the random part from everything below the top-ranked block. Sampling
# k rows from tail(k) would merely return the k least similar samples.
remaining_samples = sorted_df.iloc[n_tfidf:].sample(n_random, random_state=random_seed)

# Shuffle so that the position in the set does not reveal the origin
final_df = pd.concat([tf_idf_samples, remaining_samples]).sample(frac=1, random_state=random_seed)

In [96]:
# Write the annotation set for the second iteration to disk.
anno_2_path = '../../data/main_dataset/anno_2_set.csv'
final_df.to_csv(anno_2_path)

# Create additional rumors annotation set
The code below searches the dataset for possible examples of two new rumors introduced in the second annotation round and creates an additional annotation set specifically for these rumors.

In [None]:
long_covid_keywords = ["long", "long-covid", "lauterbach", "gesundheitsminister"]

vacc_danger_keywords = [
    "schweigekartell", "impfschaede", "impfnebenwirkung", "impfgeschädigte",
    "impfschade", "impfopfer", "impfstaat", "impfschäde", "impfzwang",
    "impfschaed", "impfschäden", "impfschaden", "dunkelziffer", "berninger",
    "mut-ärztin", "mut-arzt", "mut-psychologin", "mut-ärzt", "mut-politiker",
    "mut", "nebenwirkung", "nebenwirkungsfrei"
]

def _mentions_any(lemmas, keywords):
  """Return True if at least one entry of `keywords` is an element of `lemmas`."""
  for keyword in keywords:
    if keyword in lemmas:
      return True
  return False

# Per rumor, keep the samples whose lemma list contains one of its keywords
long_covid_df = df_filtered[df_filtered["lemma"].apply(lambda lemmas: _mentions_any(lemmas, long_covid_keywords))]
vacc_danger_df = df_filtered[df_filtered["lemma"].apply(lambda lemmas: _mentions_any(lemmas, vacc_danger_keywords))]

# Combine both selections; a sample matched by both rumors is kept once
keyword_hits = pd.concat([long_covid_df, vacc_danger_df])
anno_2_keywords_df = keyword_hits.drop_duplicates(subset=["chat_handle", "telegram_message_id"])

# Anti-join: discard the samples that are already part of final_df
key_columns = ['chat_handle', 'telegram_message_id']
merged_df = anno_2_keywords_df.merge(final_df[key_columns], on=key_columns, how='left', indicator=True)
only_in_keyword_set = merged_df['_merge'] == 'left_only'
result_df = merged_df[only_in_keyword_set].drop(columns='_merge')

# Export data
result_df.to_csv('../../data/main_dataset/anno_2_set-additional.csv')
