In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import ast
import spacy
from string import punctuation
from spacy.lang.de import stop_words
# Load spaCy's 'de_core_news_sm' pipeline; `lemmatize` runs every text through it.
nlp = spacy.load('de_core_news_sm')
# Rebind `stop_words` from the imported spacy.lang.de module to its STOP_WORDS collection.
stop_words = stop_words.STOP_WORDS
# The individual characters of string.punctuation, held in a list.
punctuations = list(punctuation)

# Import datasets

In [152]:
# Read both main-dataset CSV files in one pass; the first column of each file
# becomes the DataFrame index.
df_samples, df_additional = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../../data/main_dataset/main_dataset-prepro-0-5000.csv',
        '../../data/main_dataset/main_dataset-additional-0-5000.csv',
    )
)

# Create topic representation
The following code creates topic representations for each of the four rumor topics in German and English.

In [2]:
# Rumor statements and keywords for every topic, keyed first by topic id
# ("vacc", "trans", "brandenburg", "migrant") and then by language ("en", "de").
# Each language entry holds a list of "rumors" (full sentences) and a list of
# "keywords" (single terms).
topic_representations = {
  "vacc": {
    "en": {
      "rumors": [
        'Doctors are being deliberately prevented from educating the public about the dangers of vaccines.',
        'The COVID-19 vaccine causes breast cancer in young women.',
        'The COVID-19 vaccine causes heart damage in children.',
        'The electronic vaccination pass in Austria is intended to be used for digital surveillance.'
      ],
      "keywords": ['Corona', 'Covid', 'Vaccine', 'Vaccinate', 'Vaccination', 'Pandemic', 'MRNA', 'Syringe']
    },
    "de": {
      "rumors": [
        'Ärzte werden gezielt an der Aufklärung über die Gefahren von Impfungen gehindert.',
        'Die Corona-Impfung löst Brustkrebs bei jungen Frauen aus.',
        'Die Corona-Impfung löst Herzschäden bei Kindern aus.',
        'Der elektronische Impfpass in Österreich soll für digitale Überwachung genutzt werden.'
      ],
      "keywords": ['Corona', 'Covid', 'Impfung', 'Impfen', 'Impfstoff', 'Pandemie', 'MRNA', 'Spritze']
    },
  },
  "trans": {
    "en": {
      "rumors": [
        'Transsexuality is a trend triggered by psychological manipulation.',
        'Trans propaganda is being deliberately spread in schools.',
        'Confronting children and adolescents with the topic of transsexuality harms them.'
      ],
      "keywords": ['Trans', 'Transgender', 'Transsexual', 'Gender', 'Binary', 'Nonbinary', 'Sex', 'Queer', 'LGBT', 'LGBTQ', 'LGBTQ+']
    },
    "de": {
      "rumors": [
        # Spelled "Transsexualität" (single "l") so the token matches the
        # vocabulary learned from the samples.
        'Transsexualität ist ein durch psychologische Manipulation ausgelöster Trend.',
        'Trans-Propaganda wird gezielt an Schulen verbreitet.',
        'Die Konfrontation mit dem Thema Transsexualität schadet Kindern und Jugendlichen.'
      ],
      "keywords": ['Trans', 'Transgender', 'Transsexuell', 'Gender', 'Binär', 'Nichtbinär', 'Geschlecht', 'Queer', 'LGBT', 'LGBTQ', 'LGBTQ+']
    },
  },
  "brandenburg": {
    "en": {
      "rumors": [
        'Targeted propaganda against the AfD is responsible for their election defeat in Brandenburg.',
        'There was election fraud in the Brandenburg elections.'
      ],
      "keywords": ['Brandenburg']
    },
    "de": {
      "rumors": [
        'Gezielte Propaganda gegen die AfD ist schuld an deren Wahlniederlage in Brandenburg.',
        'Bei den Wahlen in Brandenburg gab es Wahlbetrug.'
      ],
      "keywords": ['Brandenburg']
    },
  },
  "migrant": {
    "en": {
      "rumors": [
        'Financial support for migrants is a financial burden for German citizens.',
        'Knife attacks by migrants pose a growing threat to Germany.',
        'Turkey is ready to take back up to 500 Turkish citizens per week from Germany.',
        'The Mocro Mafia was responsible for the explosion in Cologne on 25.09.'
      ],
      "keywords": ['Migrant', 'Asylum', 'Refugee', 'Foreigner', 'Turkish', 'Arab', 'Muslim', 'Islam', 'Terror']
    },
    "de": {
      "rumors": [
        'Die finanzielle Unterstützung von Migranten stellt eine finanzielle Belastung für deutsche Bürger dar.',
        'Messerangriffe durch Migranten stellen eine wachsende Gefahr für Deutschland dar.',
        'Die Türkei ist bereit, bis zu 500 türkische Staatsbürger pro Woche aus Deutschland zurückzunehmen.',
        'Die Mocro-Mafia war für die Explosion in Köln am 25.09. verantwortlich.'
      ],
      "keywords": ['Migrant', 'Asyl', 'Flüchtling', 'Ausländer', 'Türkisch', 'Araber', 'Kanake', 'Muslim', 'Islam', 'Terror']
    },
  }
}

# Give every topic/language entry a "full" text: all of its rumors followed by
# all of its keywords, separated by single spaces.
for lang_entries in topic_representations.values():
  for entry in lang_entries.values():
    rumor_text = " ".join(entry["rumors"])
    keyword_text = " ".join(entry["keywords"])
    entry["full"] = rumor_text + " " + keyword_text

# Similarity computation
The following code lemmatizes the samples and the topic representations, vectorizes them with TF-IDF, and computes the cosine similarity between each sample and each topic representation. For each topic, a DataFrame of samples ranked by similarity is created.

In [142]:
"""
Get the text of a sample
"""
def get_row_text(row):
    """
    Get the text of a sample.

    Combines the ``message_text`` and ``webpage_description`` fields of
    ``row``. A field is used only when its value is a non-empty string, so
    non-string values (such as the float NaN pandas produces for empty CSV
    cells) never end up in the text.

    Parameters
    ----------
    row : mapping-like
        Any object indexable by ``'message_text'`` and
        ``'webpage_description'`` (for example a DataFrame row or a dict).

    Returns
    -------
    str
        The usable fields joined by a single space, or ``""`` when neither
        field holds a non-empty string.
    """
    fields = (row['message_text'], row['webpage_description'])

    # The space separator keeps the last word of the message from fusing with
    # the first word of the description into one bogus token.
    return " ".join(field for field in fields if isinstance(field, str) and field)

# Extra symbols to filter out in addition to the characters of `punctuation`.
# The list is rebuilt from `punctuation` rather than extended in place, so
# re-running this cell does not append the same symbols a second time.
punctuations = list(punctuation) + ["\u2796", "--", "ⓜ", "⚡", "\uFEFF"]

"""
Lemmatize text
"""
def lemmatize(text):
    """
    Lemmatize text.

    Runs ``text`` through the module-level spaCy pipeline ``nlp`` and returns
    the lower-cased, stripped lemma of every token, leaving out stop words
    (``stop_words``), punctuation (``punctuations``) and tokens that are empty
    after stripping.

    Parameters
    ----------
    text : str
        The raw text to process.

    Returns
    -------
    list of str
        The remaining lemmas in document order.
    """
    doc = nlp(text)

    lemmas = []
    for token in doc:
        # NOTE(review): "-PRON-" looks like the placeholder lemma spaCy v2
        # assigns to pronouns; for those the lower-cased surface form is kept.
        if token.lemma_ == "-PRON-":
            lemma = token.lower_
        else:
            lemma = token.lemma_.lower().strip()

        # Whitespace-only tokens strip down to "" and carry no information,
        # so they are dropped together with stop words and punctuation.
        if lemma and lemma not in stop_words and lemma not in punctuations:
            lemmas.append(lemma)

    return lemmas

In [144]:
# Build each sample's raw text with get_row_text, which only uses values that
# are strings. Concatenating str(...) of both columns instead would turn every
# empty (NaN) cell into the literal "nan" and glue it onto the adjacent word.
df_samples["text"] = df_samples.apply(get_row_text, axis=1)

# Lemmatize every sample; the lemmas are re-joined with spaces so that the
# TF-IDF vectorizer can tokenize them again.
df_samples['lemma'] = [' '.join(lemmatize(sample_text)) for sample_text in df_samples["text"]]

In [145]:
tfidf_vectorizer = TfidfVectorizer()

# TF-IDF vectorize the dataset. The vectorizer is fitted on the samples only;
# the topic representations below are transformed with that fitted vocabulary.
samples_tfidf = tfidf_vectorizer.fit_transform(df_samples['lemma'])

# TF-IDF vectorize each topic representation for each language and store its
# cosine similarity to every sample in a `tfidf_similarity_{topic}_{lang}` column.
for topic in topic_representations:
  for lang in topic_representations[topic]:
    curr_topic_lemma = [' '.join(lemmatize(topic_representations[topic][lang]["full"]))]
    curr_topic_lang_tfidf = tfidf_vectorizer.transform(curr_topic_lemma)
    curr_topic_lang_similarities = cosine_similarity(samples_tfidf, curr_topic_lang_tfidf)
    # One topic document gives a single-column matrix; flatten it so that a
    # plain 1-D array is assigned to the DataFrame column.
    df_samples[f"tfidf_similarity_{topic}_{lang}"] = curr_topic_lang_similarities.ravel()


def get_lang_aware_similarity(row, topic):
  """
  Pick the similarity score that matches a sample's confidently detected language.

  Parameters
  ----------
  row : mapping-like
      Must provide ``langset_confident`` plus the columns
      ``tfidf_similarity_{topic}_de`` and ``tfidf_similarity_{topic}_en``.
      ``langset_confident`` is parsed with ``ast.literal_eval`` when it is a
      string; an already parsed list/tuple/set is used directly. Any other
      value (for example the NaN of an empty CSV cell) is treated as "no
      confident language".
  topic : str
      Topic id used to build the column names.

  Returns
  -------
  The German similarity when the only confident language is ``'de'``, the
  English similarity when it is ``'en'``, otherwise the mean of both.

  Raises
  ------
  ValueError, SyntaxError
      If ``langset_confident`` is a string that ``ast.literal_eval`` cannot parse.
  """
  raw_langset = row["langset_confident"]

  if isinstance(raw_langset, str):
    raw_langset = ast.literal_eval(raw_langset)

  # list(...) makes the collection indexable even when it was stored as a set
  # or tuple; anything that is not a collection counts as "no language".
  if isinstance(raw_langset, (list, tuple, set, frozenset)):
    langs = list(raw_langset)
  else:
    langs = []

  if langs == ['de']:
    return row[f"tfidf_similarity_{topic}_de"]

  if langs == ['en']:
    return row[f"tfidf_similarity_{topic}_en"]

  return (row[f"tfidf_similarity_{topic}_de"] + row[f"tfidf_similarity_{topic}_en"]) / 2
  
# Collapse the per-language scores of every topic into a single
# `tfidf_similarity_{topic}` column, computed row by row.
for topic in topic_representations:
  df_samples[f'tfidf_similarity_{topic}'] = df_samples.apply(
    get_lang_aware_similarity,
    axis=1,
    args=(topic,),
  )

# Exporting topic datasets

In [146]:
# Number of highest-ranked samples written per topic; it is also part of the
# output file name.
top_n = 1000

# For every topic, export the `top_n` samples with the highest language-aware
# similarity. Looping over the topics replaces one copy-pasted line per topic.
for topic in topic_representations:
  (
    df_samples
    .sort_values(by=f'tfidf_similarity_{topic}', ascending=False)
    .head(top_n)
    .to_csv(f"../../data/main_dataset/tvrt_dataset-tfidf_{topic}-{top_n}.csv")
  )