In [2]:
import pandas as pd
from langdetect import DetectorFactory, detect_langs
from langdetect.lang_detect_exception import LangDetectException

In [3]:
# Relative path of the raw dataset CSV; it is read with pd.read_csv in the loading cell.
raw_data_path = '../../data/main_dataset/main_dataset.csv'

In [4]:
"""
Remove unused columns from dataset
"""
def exclude_columns(df):
  columns_to_exclude = ['collection_time', 'sender_first_name', 'sender_last_name', 'sender_display_name', 'sender_username', 'fwd_from_user_name', 'post_author', 'is_group_elem', 'message_group_id']
  return df.drop(columns=columns_to_exclude)

"""
Filter out messages that are replies to messages that are not in the dataset
"""
def filter_out_of_dataset_replies(df, df_full):
  replies = df[df['reply_to_message_id'].notna()]

  for index, row in replies.iterrows():
    reply_to_message_id = row['reply_to_message_id']
    chat_handle = row['chat_handle']
    same_chat_df = df_full[df_full['chat_handle'] == chat_handle]

    # if message is a reply to a message that is not in the dataset, remove it.
    if reply_to_message_id not in same_chat_df['telegram_message_id'].values:
      df = df.drop(index)

  return df

"""
Detect languages in text
"""
def detect_text_langs(text):
  if pd.isna(text):
    return None
  
  text = str(text)

  try:
    return detect_langs(text)
  except LangDetectException:
    return None

"""
Get the combined word count of a sample
"""
def get_word_count(row):
    count = 0
    if pd.notnull(row['message_text']):
        count += len(row['message_text'].split(" "))
    if pd.notnull(row['webpage_description']):
        count += len(row['webpage_description'].split(" "))

    return count



# General filtering

In [None]:
print("Loading data...")
df_raw = pd.read_csv(raw_data_path, index_col=0)

# preprocessing which applies to all data, so no filtering
print("Preprocessing")
print(f"Row count before: {len(df_raw)}")

print("Excluding columns...")
df_raw = exclude_columns(df_raw)

print("Make messages unique by message text and webpage description...")
df_raw = df_raw.drop_duplicates(subset=['message_text', 'webpage_description'])
print(f"Row count after: {len(df_raw)}")

# add language information
print("Detecting webpage description languages...")
df_raw['webpage_description_lang'] = df_raw['webpage_description'].apply(detect_text_langs)
print("Detecting message text languages...")
df_raw['message_text_lang'] = df_raw['message_text'].apply(detect_text_langs)

# Language filtering

In [None]:
"""
Filter detected languages for those with high confidence
"""
def get_confident_langs(detect_langs_result):
  if detect_langs_result is None:
    return []
  return sorted([lang.lang for lang in detect_langs_result if lang.prob > 0.5])

"""
Get the unique languages of a sample
"""
def get_langset(row):
  langs = set()
  if row['webpage_description_lang']:
    langs.update([lang.lang for lang in row['webpage_description_lang']])
  if row['message_text_lang']:
    langs.update([lang.lang for lang in row['message_text_lang']])
  return list(langs)

"""
Get the unique languages of a sample with high confidence
"""
def get_confident_langset(row):
  langs = set()
  if row['webpage_description_lang']:
    langs.update([lang.lang for lang in row['webpage_description_lang'] if lang.prob > 0.5])
  if row['message_text_lang']:
    langs.update([lang.lang for lang in row['message_text_lang'] if lang.prob > 0.5])
  return list(langs)


df_lang_filtering = df_raw.copy()

df_lang_filtering["langset"] = df_lang_filtering.apply(get_langset, axis=1)
df_lang_filtering["langset_confident"] = df_lang_filtering.apply(get_confident_langset, axis=1)

# discard messages where message text is not either confidently german or german and english only
# (the confident languages are computed once and both masks are built from the
# same, unfiltered frame - an earlier pre-filter on ['de'] made the second mask dead code)
confident_message_langs = df_lang_filtering['message_text_lang'].apply(get_confident_langs)
message_text_only_german = confident_message_langs.apply(lambda langs: langs == ['de']).astype(bool)
# NOTE(review): with a confidence threshold of 0.5, two languages can only both pass
# if their probabilities do not sum to 1 - this mask is presumably always empty; confirm
# whether a lower threshold was intended for the german + english case.
message_text_german_and_english = confident_message_langs.apply(lambda langs: langs == ['de', 'en']).astype(bool)

df_lang_filtering = df_lang_filtering[message_text_only_german | message_text_german_and_english]

df_raw = df_lang_filtering.copy()

print(f"Row count after: {len(df_raw)}")

# Additional filtering

In [None]:
print("Creating dfs for preprocessing and full dataset...")
df_prepro = df_raw.copy()
df_additional = df_raw.copy()

print(f"Row count before: {len(df_prepro)}")

# remove polls
print("Removing polls...")
df_prepro = df_prepro[df_prepro['message_media_type'] != 'MessageMediaPoll']
print(f"Row count after removing polls: {len(df_prepro)}")

# remove messages without text and webpage description
print("Removing messages without text and webpage description...")
df_prepro = df_prepro[df_prepro['message_text'].notna() | df_prepro['webpage_description'].notna()]
print(f"Row count after removing messages without text and webpage description: {len(df_prepro)}")

# remove messages where message text consists of only a url and no webpage description is attached
print("Removing messages with only a url...")
df_prepro = df_prepro[~(df_prepro['message_text'].str.contains('http') & ~df_prepro['message_text'].str.contains(' ') & df_prepro["webpage_description"].isna())]
print(f"Row count after removing messages with only a url: {len(df_prepro)}")

print("Removing messages with only a mention...")
df_prepro = df_prepro[~(df_prepro['message_text'].str.contains('@') & ~df_prepro['message_text'].str.contains(' ') & df_prepro['webpage_description'].isna())]
print(f"Row count after removing messages with only a mention: {len(df_prepro)}")

# remove messages shorter than 15 chars which are not replies to other messages and have no webpage attached
print("Removing short messages...")
df_prepro = df_prepro[
  ~(
    (df_prepro['message_text'].str.len() < 15) & 
    (df_prepro['reply_to_message_id'].isna()) & 
    (df_prepro['message_media_type'] != 'MessageMediaWebPage')
  )
]
print(f"Row count after removing short messages: {len(df_prepro)}")

print("Removing long messages...")
df_prepro['word_count'] = df_prepro.apply(get_word_count, axis=1)
df_prepro = df_prepro[df_prepro['word_count'] < 300]
df_prepro = df_prepro.drop(columns=['word_count'])
print(f"Row count after removing long messages: {len(df_prepro)}")


# remove messages that are replies to messages that are not in the dataset
print("Removing replies to messages not in dataset...")
df_prepro = filter_out_of_dataset_replies(df_prepro, df_additional)
print(f"Row count after removing replies to messages not in dataset: {len(df_prepro)}")

# shuffle dataset
print("Shuffling...")
df_prepro = df_prepro.sample(frac=1).reset_index(drop=True)

print("Done!")

# Retrieval of reply thread messages
Retrieve the messages that samples in the filtered dataset reply to (directly or through a chain of replies) from the dataset as it was before the additional filtering.

In [153]:
"""
From a dataframe of samples and a dataframe of additional rows, find all additional rows to which a reply is made in the samples
"""
def get_additional_required(samples, additional)
  rows_to_check = pd.DataFrame(columns=samples.columns)
  additional_required = pd.DataFrame(columns=samples.columns)

  # Iterate through samples and add reply rows to rows_to_check
  for index, row in samples.iterrows():
      if pd.notna(row['reply_to_message_id']):
          rows_to_check = pd.concat([rows_to_check, row.to_frame().T])

  # While rows_to_check is not empty, process the rows
  while not rows_to_check.empty:
      row_to_check = rows_to_check.iloc[0]
      rows_to_check = rows_to_check.iloc[1:]

      # Check if row_to_check is a reply
      if pd.notna(row_to_check['reply_to_message_id']):
          # Look for a matching row in df_additional
          matching_rows = additional[
              (additional['chat_handle'] == row_to_check['chat_handle']) &
              (additional['telegram_message_id'] == row_to_check['reply_to_message_id'])
          ]

          # If a matching row is found, add it to additional_required
          if not matching_rows.empty:
              additional_required = pd.concat([additional_required, matching_rows])

              # Check if the matching row is a reply, add it to rows_to_check if so
              for _, match_row in matching_rows.iterrows():
                  if pd.notna(match_row['reply_to_message_id']):
                      rows_to_check = pd.concat([rows_to_check, match_row.to_frame().T])

  # Make additional_required unique by 'chat_handle' and 'telegram_message_id'
  result = additional_required.drop_duplicates(subset=['chat_handle', 'telegram_message_id'])
  return result

# Splitting and exporting
Split the filtered dataset into slices of 5000 samples each and export every slice together with the additional messages its replies refer to.

In [None]:
# Reduce the additional rows to those that the filtered samples actually reply to
df_additional = get_additional_required(df_prepro, df_additional)

# Export the complete filtered dataset and its additional rows
df_prepro.to_csv('../../data/main_dataset/main_dataset-prepro.csv')
df_additional.to_csv('../../data/main_dataset/main_dataset-additional.csv')

slice_width = 5000
curr_start = 0

# Export consecutive slices; the file names carry the nominal slice bounds
while curr_start < len(df_prepro):
  print(f"Processing slice {curr_start} - {curr_start + slice_width}...")
  prepro_slice_path = f'../../data/main_dataset/main_dataset-prepro-{curr_start}-{curr_start + slice_width}.csv'
  additional_slice_path = f'../../data/main_dataset/main_dataset-additional-{curr_start}-{curr_start + slice_width}.csv'

  df_prepro_slice = df_prepro[curr_start:curr_start + slice_width]
  # Each slice gets only the additional rows its own replies refer to
  df_additional_slice = get_additional_required(df_prepro_slice, df_additional)

  df_prepro_slice.to_csv(prepro_slice_path)
  df_additional_slice.to_csv(additional_slice_path)

  curr_start += slice_width
