In [1]:
import pandas as pd
import textwrap
import shutil
import re

# Location of the CSV file that is loaded as the working dataset (relative path).
data_path = '../../data/24_09_28-test_scrape/24_09_28-test_scrape-prepro.csv'

# Read the Dataset

In [4]:
# Load the CSV at data_path; index_col=0 makes the first CSV column the row index.
df = pd.read_csv(data_path, index_col=0)

# Topic queries
Below, a number of complex keyword queries for a set of topics are defined. These queries are used to filter the dataset for relevant documents. The topics are:
- Gender and sexual identity
- COVID-19 and vaccines
- Migration
- Brandenburg
- Greta Thunberg
- Alice Weidel

In [3]:
"""
Print a string to the terminal, wrapping it to fit the terminal width
"""
def print_wrapped_to_fit_terminal(text):
    """Print ``text`` wrapped to the width of the current terminal.

    Values that are not strings are converted with ``str()`` first. The width
    is taken from ``shutil.get_terminal_size`` and falls back to 80 columns
    when the terminal size cannot be determined.
    """
    rendered = text if isinstance(text, str) else str(text)
    columns, _ = shutil.get_terminal_size(fallback=(80, 20))
    wrapped = textwrap.fill(rendered, width=columns)
    print(wrapped)


"""
Filter a dataframe by a keyword query on a specified column. The keyword query can have the following rules:
- "all": all keywords must be present in the column
- "any": at least one of the keywords must be present in the column
- "none": none of the keywords can be present in the column
"""
def filter_by_keywords(df, column, keywords=None, ignore_case=True):
    """Return the rows of ``df`` whose ``column`` satisfies a keyword query.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.
    column : str
        Name of the text column to search. Missing values never match.
    keywords : dict, optional
        Maps rule names to lists of regular expressions:

        - ``"all"``: every pattern must be found in the column,
        - ``"any"``: at least one pattern must be found (an empty list
          therefore matches nothing),
        - ``"none"``: no pattern may be found.

        Rules that are absent impose no constraint; ``None`` or an empty
        dict returns every row.
    ignore_case : bool, default True
        When true, patterns are matched case-insensitively.

    Returns
    -------
    pandas.DataFrame
        The matching rows, in their original order and with their original
        index labels.
    """
    rules = keywords or {}
    text = df[column]

    def contains(pattern):
        # pandas' ``case`` flag means "case sensitive", i.e. the opposite of
        # ``ignore_case``.
        return text.str.contains(pattern, na=False, case=not ignore_case)

    # Build the masks on df's own index: a default RangeIndex would be
    # re-aligned against df.index by ``&=`` / ``|=`` and break for any frame
    # whose index is not exactly 0..n-1.
    mask = pd.Series(True, index=df.index)

    for pattern in rules.get("all", ()):
        mask &= contains(pattern)

    if "any" in rules:
        any_hit = pd.Series(False, index=df.index)
        for pattern in rules["any"]:
            any_hit |= contains(pattern)
        mask &= any_hit

    for pattern in rules.get("none", ()):
        mask &= ~contains(pattern)

    return df[mask]


"""
Filter a dataframe of messages by keywords in the message text, webpage title, and webpage description
"""
def filter_message_data_by_keywords(df, keywords=None, ignore_case=True):
    """Filter messages by a keyword query on their text and link preview.

    The query is evaluated separately on the ``message_text``,
    ``webpage_title`` and ``webpage_description`` columns, and a row is kept
    when at least one of these columns satisfies the complete query on its
    own. As a consequence, a ``"none"`` keyword that occurs in one column does
    not exclude a row that qualifies through another column.

    Parameters
    ----------
    df : pandas.DataFrame
        Message data containing the three columns named above. Not modified.
    keywords : dict, optional
        Query in the format accepted by ``filter_by_keywords`` (``"all"``,
        ``"any"`` and ``"none"`` lists of regular expressions).
    ignore_case : bool, default True
        Forwarded unchanged to ``filter_by_keywords``.

    Returns
    -------
    pandas.DataFrame
        Each matching row exactly once, with its original index label. Rows
        matched via ``message_text`` come first, then additional rows matched
        via ``webpage_title``, then via ``webpage_description``.
    """
    rules = {} if keywords is None else keywords
    searched_columns = ("message_text", "webpage_title", "webpage_description")

    # Work on a positionally indexed view so that every hit can be traced back
    # to its row position, whatever index labels the caller's frame carries.
    positional = df.reset_index(drop=True)
    hits = pd.concat(
        [filter_by_keywords(positional, name, rules, ignore_case) for name in searched_columns]
    ).index

    # A row that matched in several columns shows up several times. Drop the
    # repeats by row position rather than by cell values, so that distinct
    # messages with identical content are all kept.
    first_hits = hits[~hits.duplicated(keep="first")]
    return df.iloc[first_hits.to_numpy(dtype="int64")]


"""
Display the matches for a keyword/regular expression in the message text, webpage title, and webpage description columns of a dataframe
"""
def get_matches_for_keyword_in_message_data(df, keyword, ignore_case=True):
    """Collect every match of ``keyword`` together with some surrounding text.

    For each row, the ``message_text``, ``webpage_description`` and
    ``webpage_title`` cells are searched (in that order). Every match is
    reported as a snippet made of the match plus up to 20 characters of
    context on either side. Cells that are missing or not strings, and
    columns that do not exist, contribute no snippets.

    Parameters
    ----------
    df : pandas.DataFrame
        Message data. Side effect: the result is also stored in (or
        overwrites) the column ``df["all_matches"]``.
    keyword : str
        Regular expression to search for.
    ignore_case : bool, default True
        When true, the expression is matched case-insensitively.

    Returns
    -------
    pandas.Series
        ``df["all_matches"]``: one list of snippet strings per row.
    """
    searched_columns = ("message_text", "webpage_description", "webpage_title")
    context = 20  # characters of context kept before and after each match

    # Compile once instead of once per cell.
    pattern = re.compile(keyword, re.IGNORECASE if ignore_case else 0)

    def snippets(text):
        # Missing cells arrive as NaN/None, which the regex engine rejects.
        if not isinstance(text, str):
            return []
        return [
            text[max(0, match.start() - context):match.end() + context]
            for match in pattern.finditer(text)
        ]

    def find_matches(row):
        found = []
        for name in searched_columns:
            found.extend(snippets(row.get(name, "")))
        return found

    # Build the Series explicitly: DataFrame.apply(axis=1) returns a DataFrame
    # for an empty frame, which cannot be assigned to a single column.
    all_matches = pd.Series(
        [find_matches(row) for _, row in df.iterrows()],
        index=df.index,
        dtype=object,
    )
    df["all_matches"] = all_matches

    return df["all_matches"]


In [5]:
# Greta Thunberg: keep messages that mention either her surname or her first name.
# The character classes accept both the capitalised and the lower-case spelling.
greta_keywords = {
  "any": ['[Tt]hunberg', '[Gg]reta']
}
df_greta = filter_message_data_by_keywords(df, keywords=greta_keywords)

In [None]:
# Gender and sexual identity: a message qualifies if any pattern below matches.
# - 'trans(?!...)' matches "trans" unless it is directly followed by one of the
#   listed continuations (port, form, paren, ...), which filters out words such
#   as "transport" or "transform".
# - '(?<!...)(?<!...)gender' matches "gender" unless it is directly preceded by
#   one of the listed five-letter or three-letter strings. Two look-behinds are
#   needed because Python's re module requires all alternatives inside one
#   look-behind to have the same width.
# All patterns are written in lower case, so matching capitalised words relies
# on the filter comparing case-insensitively.
trans_keywords = {
  "any": [
      'trans(?!(port|form|paren|pir|action|it|kript|atlanti|fer|fusion|aktion|human|cript|lat|nation|nistri))',
      '(?<!(undle|vorra|leidi))(?<!(fol|rre|dri|ewe|sor|tra|tei))gender',
      'binär',
      'binary',
      'geschlecht',
      'queer',
      'lgbt'
      ]
}
df_trans = filter_message_data_by_keywords(df, keywords=trans_keywords)

In [None]:
# Migration: a single alternation of word stems. Because it is the only entry
# under "all", a message qualifies as soon as any one of the stems occurs.
migrant_keywords = {
  "all": [
     '(migrant|asyl|flücht|refug|ausländ|türk|arab|muslim|islam|terror)'
  ]
}
df_migrant = filter_message_data_by_keywords(df, keywords=migrant_keywords)

In [8]:

# Brandenburg: keep messages that mention "brandenburg". The pattern itself only
# looks for the place name; nothing election-specific is required.
brandenburg_election_keywords = {
   "any": ['brandenburg']
}

df_brandenburg_election = filter_message_data_by_keywords(df, keywords=brandenburg_election_keywords)

In [9]:

# Alice Weidel: keep messages that mention the surname. The pattern is written
# in lower case, so matching "Weidel" relies on case-insensitive filtering.
weidel_keywords = {
  "any": ['weidel']
}

weidel_df = filter_message_data_by_keywords(df, keywords=weidel_keywords)

In [None]:
# COVID-19 and vaccines: a message qualifies if any of the stems below occurs.
# '(?<!(sch|ngl))impf' matches "impf" unless it is directly preceded by "sch" or
# "ngl", which filters out words such as "schimpfen".
vaccine_keywords = {
  "any": ["corona", "covid","(?<!(sch|ngl))impf","vacc","pandem", "mrna", "spritze"]
}

vaccine_df = filter_message_data_by_keywords(df, keywords=vaccine_keywords)