In [98]:
import pandas as pd
import textwrap
import shutil
import re

In [99]:
# CSV file loaded by the notebook. The path is relative, so it resolves against
# the kernel's working directory.
# NOTE(review): the "prepro" suffix suggests this is the preprocessed scrape - confirm.
data_path = '../../data/24_09_28-test_scrape/24_09_28-test_scrape-prepro.csv'

In [100]:
def print_wrapped_to_fit_terminal(text):
    """Print ``text`` word-wrapped to the width of the current terminal.

    Values that are not strings are converted with ``str()`` first. When the
    terminal size cannot be determined, a width of 80 columns is used.
    """
    content = text if isinstance(text, str) else str(text)
    terminal = shutil.get_terminal_size(fallback=(80, 20))
    wrapped_lines = textwrap.wrap(content, width=terminal.columns)
    print("\n".join(wrapped_lines))


def filter_by_keywords(df, column, keywords=None, ignore_case=True):
    """Return the rows of ``df`` whose ``column`` satisfies the keyword rules.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.
    column : str
        Name of the text column to search.
    keywords : dict, optional
        Mapping with any of the keys ``"all"``, ``"any"`` and ``"none"``, each
        holding a list of regular expressions:

        * ``"all"``  - every pattern must match the cell,
        * ``"any"``  - at least one pattern must match the cell,
        * ``"none"`` - no pattern may match the cell.

        Missing keys impose no restriction; with no rules every row is kept.
    ignore_case : bool, default True
        Match case-insensitively when True.

    Returns
    -------
    pandas.DataFrame
        The matching rows, in their original order and with their original
        index. Cells that are missing (NaN/None) never match a pattern.
    """
    rules = keywords or {}

    # Cast to object so that a column parsed as all-NaN floats still exposes
    # the ``.str`` accessor instead of raising AttributeError.
    text = df[column].astype(object)

    def contains(pattern):
        # pandas' ``case`` flag means "case sensitive", i.e. the opposite of
        # ``ignore_case``.
        return text.str.contains(pattern, na=False, case=not ignore_case)

    # Build the masks on df's own index: a default RangeIndex would not align
    # with frames whose index is not 0..n-1 (e.g. after earlier filtering).
    mask = pd.Series(True, index=df.index)

    for keyword in rules.get("all", ()):
        mask &= contains(keyword)

    if "any" in rules:
        any_mask = pd.Series(False, index=df.index)
        for keyword in rules["any"]:
            any_mask |= contains(keyword)
        mask &= any_mask

    for keyword in rules.get("none", ()):
        mask &= ~contains(keyword)

    return df[mask]


def filter_message_data_by_keywords(df, keywords={}, ignore_case=True):
    """Filter a dataframe of messages by keywords in three text columns.

    The columns ``message_text``, ``webpage_title`` and
    ``webpage_description`` are each filtered separately with
    ``filter_by_keywords``; the per-column results are then stacked and rows
    with identical values are dropped, so a row that matches in several
    columns is returned once.

    Because every column is filtered on its own, the keyword rules are
    evaluated per column and not across the three columns together.
    """
    searched_columns = ("message_text", "webpage_title", "webpage_description")
    per_column_hits = [
        filter_by_keywords(df, name, keywords, ignore_case)
        for name in searched_columns
    ]
    stacked = pd.concat(per_column_hits)
    return stacked.drop_duplicates()


def get_matches_for_keyword_in_message_data(df, keyword, ignore_case=True):
    """Collect context snippets around every match of ``keyword`` per row.

    The columns ``message_text``, ``webpage_description`` and
    ``webpage_title`` are searched in that order. For each match the snippet
    consists of the matched text plus up to 20 characters before and after
    it, clipped to the bounds of the cell.

    Parameters
    ----------
    df : pandas.DataFrame
        Message data. Side effect: the column ``all_matches`` is added to (or
        overwritten in) this frame.
    keyword : str
        Regular expression to search for.
    ignore_case : bool, default True
        Match case-insensitively when True.

    Returns
    -------
    pandas.Series
        ``df["all_matches"]``: one list of snippet strings per row, sharing
        the index of ``df``. Rows without matches hold an empty list.
    """
    context_chars = 20
    searched_columns = ("message_text", "webpage_description", "webpage_title")
    # Compile once instead of re-parsing the pattern for every cell.
    pattern = re.compile(keyword, re.IGNORECASE if ignore_case else 0)

    def find_matches(row):
        snippets = []
        for column in searched_columns:
            text = row.get(column, "")
            if not isinstance(text, str):
                # Missing cells (NaN/None) or other non-text values cannot be
                # searched; passing them to ``re`` would raise TypeError.
                continue
            for match in pattern.finditer(text):
                start = max(0, match.start() - context_chars)
                end = min(len(text), match.end() + context_chars)
                snippets.append(text[start:end])
        return snippets

    # Building the Series explicitly keeps one list per row even for empty
    # frames, where ``DataFrame.apply`` has to guess the result shape.
    df["all_matches"] = pd.Series(
        [find_matches(row) for _, row in df.iterrows()],
        index=df.index,
        dtype=object,
    )

    return df["all_matches"]


In [None]:
# Load the message data; the first CSV column becomes the row index.
df = pd.read_csv(data_path, index_col=0)

# Topic Queries

In [None]:
# Rows mentioning "Thunberg" or "Greta"; the character classes accept either
# capitalisation of the first letter.
greta_keywords = {
  "any": ['[Tt]hunberg', '[Gg]reta']
}
df_greta = filter_message_data_by_keywords(df, keywords=greta_keywords)

In [None]:
# Rows matching at least one of the patterns below.
trans_keywords = {
  "any": [
      # "trans", unless directly followed by one of the listed continuations
      # (negative lookahead), e.g. "transport" or "transform".
      'trans(?!(port|form|paren|pir|action|it|kript|atlanti|fer|fusion|aktion|human|cript|lat|nation|nistri))',
      # "gender", unless directly preceded by one of the listed 5- or
      # 3-character sequences (two negative lookbehinds).
      '(?<!(undle|vorra|leidi))(?<!(fol|rre|dri|ewe|sor|tra|tei))gender',
      'binär',
      'binary',
      'geschlecht',
      'queer',
      'lgbt'
      ]
}
df_trans = filter_message_data_by_keywords(df, keywords=trans_keywords)


# The same patterns joined into a single alternation, kept for reference:
# '(trans(?!(port|form|paren|pir|action|it|kript|atlanti|fer|fusion|aktion|human|cript|lat|nation|nistri))|(?<!(undle|vorra|leidi))(?<!(fol|rre|dri|ewe|sor|tra|tei))gender|binär|binary|geschlecht|queer|lgbt)'

In [None]:
# The "all" list holds a single alternation, so a row qualifies as soon as one
# of the alternatives occurs in a searched column.
migrant_keywords = {
  "all": [
     '(migrant|asyl|flücht|refug|ausländ|türk|arab|kanak|muslim|islam|terror)'
     # Disabled second pattern; if enabled, "all" would additionally require it:
     #'messer'
  ]
}
df_migrant = filter_message_data_by_keywords(df, keywords=migrant_keywords)

# The pattern above, kept for reference:
# '(migrant|asyl|flücht|refug|ausländ|türk|arab|kanak|muslim|islam|terror)'

In [None]:

# Rows whose searched columns contain "brandenburg".
brandenburg_election_keywords = {
   "any": ['brandenburg']
}

df_brandenburg_election = filter_message_data_by_keywords(df, keywords=brandenburg_election_keywords)

In [None]:

# Rows whose searched columns contain "weidel".
weidel_keywords = {
  "any": ['weidel']
}

weidel_df = filter_message_data_by_keywords(df, keywords=weidel_keywords)

In [None]:
# Rows matching at least one pandemic/vaccination pattern. The negative
# lookbehind skips "impf" when it is directly preceded by "sch" or "ngl"
# (e.g. inside "schimpf...").
vaccine_keywords = {
  "any": ["corona", "covid","(?<!(sch|ngl))impf","vacc","pandem", "mrna", "spritze"]
}

vaccine_df = filter_message_data_by_keywords(df, keywords=vaccine_keywords)

# The same patterns joined into a single alternation, kept for reference:
# '(corona|covid|(?<!(sch|ngl))impf|vacc|pandem|mrna|spritze)'

In [None]:

# Show the vaccine-related rows, then print every message together with the
# text snippets in which a vaccine keyword was found.
display(vaccine_df)

context_chars = 20  # characters of context kept on each side of a match
# The three columns filter_message_data_by_keywords searches, so a row that
# was selected only through its webpage title still lists its match.
searched_columns = ('message_text', 'webpage_description', 'webpage_title')

for _, row in vaccine_df.iterrows():
    print_wrapped_to_fit_terminal(row['message_text'])
    print_wrapped_to_fit_terminal(row['webpage_description'])
    print(row['message_date'])

    all_matches = []

    for keyword in vaccine_keywords['any']:
        for column in searched_columns:
            text = row.get(column)
            if not isinstance(text, str):
                # Non-text cells (e.g. NaN for a missing value) have nothing
                # to search or slice.
                continue
            # Case-insensitive, in line with the ignore_case=True default the
            # filter call for vaccine_df was made with.
            for match in re.finditer(keyword, text, re.IGNORECASE):
                start = max(0, match.start() - context_chars)
                end = min(len(text), match.end() + context_chars)
                all_matches.append(text[start:end])

    print("")
    print("MATCHES:")
    for match in all_matches:
        print(match)

    print('---')
