In [123]:
import pandas as pd
import spacy
import re
from string import punctuation
from spacy.lang.de import stop_words
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# German spaCy pipeline used for tokenization and lemmatization below.
nlp = spacy.load('de_core_news_sm')

# Rebind the imported `stop_words` name to its STOP_WORDS collection so that
# later cells can do plain membership checks against it.
stop_words = stop_words.STOP_WORDS

# Each character of string.punctuation as its own list entry.
punctuations = [*punctuation]

# Load datasets
- Samples for third annotation iteration
- Positive samples from second iteration, cleaned, meaning message footers removed

In [125]:
# Samples for the third annotation iteration.
samples_path = '../../data/main_dataset/main_dataset-prepro-10000-15000.csv'
df = pd.read_csv(samples_path)

# Cleaned positive samples from the second iteration; the first CSV column
# is used as the index.
positives_path = '../../data/annotation_results/anno_02_positive-cleaned.csv'
positive_cleaned = pd.read_csv(positives_path, index_col=0)

# Lemmatize text

In [126]:
"""
Get the text of a sample
"""
def get_row_text(row):
    text = ""
    if type(row["message_text"]) == str:
        text += row["message_text"]

    if type(row["webpage_description"]) == str:
        text += row["webpage_description"]

    return text

In [127]:
# Lemmas that are dropped in addition to stop words and punctuation.
stop_lemmas = ["\u2796", "--", "ⓜ", "⚡", "nan", "\uFEFF"]

def lemmatize(row):
  """
  Lemmatize the text of a sample and drop noise tokens.

  The sample text (see get_row_text) is run through the spaCy pipeline. Each
  token contributes its lowercased, stripped lemma; tokens whose lemma is the
  placeholder "-PRON-" contribute their lowercased surface form instead.

  A lemma is kept only if it
    - is not a stop word, a punctuation character or one of `stop_lemmas`,
    - contains none of the link/handle fragments listed below,
    - contains at least one ASCII letter or digit, and
    - is longer than one character.

  Returns the kept lemmas as a list, in document order.
  """
  blocked_fragments = ("t.me", "http", "www", "@", ".html", "utm_source")

  def keep(lemma):
    if lemma in stop_words or lemma in punctuations or lemma in stop_lemmas:
      return False
    if any(fragment in lemma for fragment in blocked_fragments):
      return False
    if re.search("[A-Za-z0-9]", lemma) is None:
      return False
    return len(lemma) > 1

  lemmas = []
  for token in nlp(get_row_text(row)):
    if token.lemma_ == "-PRON-":
      lemma = token.lower_
    else:
      lemma = token.lemma_.lower().strip()

    if keep(lemma):
      lemmas.append(lemma)

  return lemmas

# Add two columns to both frames: "lemma" (list of lemmas per sample) and
# "lemma_joined" (the same lemmas as one space-separated string).
for frame in (df, positive_cleaned):
  frame["lemma"] = frame.apply(lemmatize, axis=1)
  frame["lemma_joined"] = frame["lemma"].apply(" ".join)

# Remove negatively associated lemmas
The following code removes samples containing lemmas which were found to be negatively correlated with the rumors (see anno_1_eval.ipynb). This is meant to remove off-topic samples, mostly related to geopolitics.

In [None]:
# Lemmas whose presence causes a sample to be dropped.
keywords = [
  "russland", "usa", "israel", "trump", "biden", "harris", "hisbollah",
  "libanon", "amerikanisch", "amerika", "militärisch", "militär", "ukraine",
  "ukrainisch", "russisch", "inflation", "klimaschutz", "euro", "diddy",
  "putin", "krieg", "soldat", "nasrallah", "illuminat",
]

# Work on a copy so the unfiltered frame stays available.
df_filtered = df.copy()

for keyword in keywords:
  # A sample survives only if the keyword is not one of its lemmas
  # (exact list membership, not a substring match).
  keep_mask = df_filtered["lemma"].apply(lambda lemmas: keyword not in lemmas)
  kept = df_filtered[keep_mask]
  removed = len(df_filtered) - len(kept)
  df_filtered = kept

  # Report how many samples this keyword removed.
  print(f"'{keyword}': {removed}")

print("Remaining:", len(df_filtered))

# Create annotation set
The code below computes the cosine similarity between the tf-idf vectors of the positive examples from the previous iteration and the tf-idf vectors of the lemmatized dataset. For each sample, the average of its three highest similarities to positive samples is taken, and the dataset is ranked by this score. With `total_samples = 600` and `tfidf_ratio = 0.75`, the annotation dataset consists of the 450 most similar samples plus 150 random samples.

In [129]:
total_samples = 600
tfidf_ratio = 0.75
# Fixed seed so the random draw and the final shuffle are reproducible.
random_seed = 42

# Derive both split sizes from a single rounding step so they always add up
# to total_samples. Rounding total_samples * (1 - tfidf_ratio) separately can
# come out one short due to float error (e.g. 600 * (1 - 0.9) -> 59).
n_tfidf = int(total_samples * tfidf_ratio)
n_random = total_samples - n_tfidf

# Fit one vocabulary on both corpora so the two matrices share a feature space.
vectorizer = TfidfVectorizer()
vectorizer.fit(pd.concat([df_filtered["lemma_joined"], positive_cleaned["lemma_joined"]]))

samples_tfidf = vectorizer.transform(df_filtered["lemma_joined"])
positive_cleaned_tfidf = vectorizer.transform(positive_cleaned["lemma_joined"])

# Row i holds the similarities of sample i to every positive example.
cosine_similarities = cosine_similarity(samples_tfidf, positive_cleaned_tfidf)
# Sort each row in descending order and average its three largest values.
sorted_similarities = np.sort(cosine_similarities, axis=1)[:, ::-1]
top3_av_similarities = np.mean(sorted_similarities[:, :3], axis=1)

# Rank the samples from most to least similar.
sorted_indices = np.argsort(-top3_av_similarities)
sorted_df = df_filtered.iloc[sorted_indices]

tf_idf_samples = sorted_df.head(n_tfidf)

# Draw the random part from every sample that was not already selected.
# Taking tail(n).sample(n) would just return the n least similar samples in
# shuffled order, which is not a random sample of the dataset.
candidate_pool = sorted_df.iloc[n_tfidf:]
remaining_samples = candidate_pool.sample(
    n=min(n_random, len(candidate_pool)),  # never request more rows than exist
    random_state=random_seed,
)

# Shuffle so annotators cannot tell the two groups apart by position.
final_df = pd.concat([tf_idf_samples, remaining_samples]).sample(frac=1, random_state=random_seed)

In [None]:
# Persist the annotation set; the DataFrame index is written as the first
# CSV column.
annotation_set_path = '../../data/main_dataset/anno_3_set.csv'
final_df.to_csv(annotation_set_path)