In [69]:
import time

import pandas as pd
from langdetect import DetectorFactory, detect, detect_langs
from langdetect.lang_detect_exception import LangDetectException

In [70]:
# Relative path of the raw dataset CSV; it is read into df_raw further down.
raw_data_path = '../../data/main_dataset/main_dataset.csv'

In [71]:
# exclude irrelevant columns from dataset
def exclude_columns(df):
  """Return ``df`` without the collection/sender metadata columns.

  The input frame is left untouched; ``DataFrame.drop`` builds a new frame.
  A ``KeyError`` is raised by pandas if any of the listed columns is missing.
  """
  irrelevant = [
    'collection_time',
    'sender_first_name',
    'sender_last_name',
    'sender_display_name',
    'sender_username',
    'fwd_from_user_name',
    'post_author',
    'is_group_elem',
    'message_group_id',
  ]
  return df.drop(irrelevant, axis='columns')

# filter out messages that are replies to messages that are not in the dataset
def filter_out_of_dataset_replies(df, df_full):
  """Drop replies whose parent message is not present in ``df_full``.

  A row of ``df`` is treated as a reply when its ``reply_to_message_id`` is
  not null. Such a row is kept only if ``df_full`` contains a row with the
  same ``chat_handle`` whose ``telegram_message_id`` equals that id. Rows
  that are not replies are always kept.

  Args:
    df: frame to filter; must have ``reply_to_message_id`` and ``chat_handle``.
    df_full: reference frame; must have ``chat_handle`` and
      ``telegram_message_id``.

  Returns:
    A new DataFrame holding the surviving rows of ``df`` in their original
    order and with their original index labels.
  """
  # Build the lookup once instead of re-filtering df_full for every reply.
  # An id read as float (column contains NaN) still matches its integer
  # counterpart because equal numbers hash and compare equal inside tuples.
  known_messages = set(zip(df_full['chat_handle'], df_full['telegram_message_id']))

  keep = pd.Series(
    [
      pd.isna(parent_id)
      # a reply without a chat handle can never be matched to a parent
      or (pd.notna(chat) and (chat, parent_id) in known_messages)
      for chat, parent_id in zip(df['chat_handle'], df['reply_to_message_id'])
    ],
    index=df.index,
    dtype=bool,
  )

  # Select positionally: dropping by index label would also remove unrelated
  # rows whenever the index contains duplicate labels.
  return df[keep.to_numpy()]

# detect languages of rows
def detect_row_langs(text):
  """Run language detection on a single cell value.

  Returns ``None`` for missing values and when langdetect raises
  ``LangDetectException``; otherwise returns the result of ``detect_langs``
  on the stringified value (consumed elsewhere in this notebook as an
  iterable of objects exposing ``.lang`` and ``.prob``).
  """
  if not pd.isna(text):
    try:
      return detect_langs(str(text))
    except LangDetectException:
      # detection failed for this text; treat it like a missing value
      pass
  return None
  
# get unique languages of row
def get_langset(row):
  """List every distinct language code detected for a row.

  Reads ``webpage_description_lang`` and ``message_text_lang`` from ``row``;
  a falsy entry (``None`` or empty) contributes nothing. The order of the
  returned list is not defined.
  """
  detections = (row['webpage_description_lang'], row['message_text_lang'])
  unique_codes = {
    candidate.lang
    for detected in detections
    if detected
    for candidate in detected
  }
  return list(unique_codes)

# get unique languages with confidence greater than 0.5 of row
def get_confident_langset(row, min_prob=0.5):
  """List the distinct language codes detected with probability above ``min_prob``.

  Reads ``webpage_description_lang`` and ``message_text_lang`` from ``row``;
  a falsy entry (``None`` or empty) contributes nothing.

  Args:
    row: mapping-like row whose two language entries are iterables of
      objects exposing ``.lang`` and ``.prob``.
    min_prob: exclusive lower bound on ``.prob``. Defaults to 0.5, the
      threshold that used to be hard-coded, so ``df.apply(..., axis=1)``
      callers behave as before.

  Returns:
    List of unique language codes; the order is not defined.
  """
  confident = set()
  for column in ('webpage_description_lang', 'message_text_lang'):
    detected = row[column]
    if detected:
      confident.update(lang.lang for lang in detected if lang.prob > min_prob)
  return list(confident)



In [None]:
print("Loading data...")
df_raw = pd.read_csv(raw_data_path, index_col=0)

# langdetect is non-deterministic by default: short or ambiguous texts can be
# classified differently on every run. Fixing the seed before the first
# detection makes the language columns and every filter below reproducible.
DetectorFactory.seed = 0

# languages a text may be written in for its message to stay in the dataset
ALLOWED_LANGS = ('de', 'en')


def _missing_or_allowed(langs):
  """True if no detection result exists or any detected language is allowed."""
  return langs is None or any(lang.lang in ALLOWED_LANGS for lang in langs)


# preprocessing shared by every dataset that is derived from df_raw later on
print("Preprocessing")
print(f"Row count before: {len(df_raw)}")

print("Excluding columns...")
df_raw = exclude_columns(df_raw)

# add language info
print("Detecting webpage description languages...")
df_raw['webpage_description_lang'] = df_raw['webpage_description'].apply(detect_row_langs)
print("Detecting message text languages...")
df_raw['message_text_lang'] = df_raw['message_text'].apply(detect_row_langs)
print("Detecting unique language set for each row...")
df_raw['langset'] = df_raw.apply(get_langset, axis=1)
print("Detecting unique language set with confidence > 0.5 for each row...")
df_raw['langset_confident'] = df_raw.apply(get_confident_langset, axis=1)

# filter out messages where none of the text is german or english
print("Filtering out messages where none of the text is german or english...")
df_raw = df_raw[df_raw['langset'].apply(lambda langs: any(lang in langs for lang in ALLOWED_LANGS))]
print(f"Row count after: {len(df_raw)}")

# rows without a detection result (None) pass the two per-column filters
print("Filter out messages where message text language is not german or english...")
df_raw = df_raw[df_raw['message_text_lang'].apply(_missing_or_allowed)]
print(f"Row count after: {len(df_raw)}")

print("Filter out messages where webpage description language is not german or english...")
df_raw = df_raw[df_raw['webpage_description_lang'].apply(_missing_or_allowed)]
print(f"Row count after: {len(df_raw)}")

# an empty confident set passes, since all() of nothing is True
print("Filter out messages where some of the text is confidently not german or english...")
df_raw = df_raw[df_raw['langset_confident'].apply(lambda langs: all(lang in ALLOWED_LANGS for lang in langs))]
print(f"Row count after: {len(df_raw)}")


In [None]:
# Seed for the final shuffle so the exported row order is reproducible.
SHUFFLE_SEED = 42

print("Creating dfs for preprocessing and full dataset...")
# df_prepro is filtered step by step below; df_additional stays as the
# unfiltered reference used to look up the parents of replies.
df_prepro = df_raw.copy()
df_additional = df_raw.copy()

print(f"Row count before: {len(df_prepro)}")

# remove polls
print("Removing polls...")
df_prepro = df_prepro[df_prepro['message_media_type'] != 'MessageMediaPoll']
print(f"Row count after removing polls: {len(df_prepro)}")

# remove messages without text and webpage description
print("Removing messages without text and webpage description...")
df_prepro = df_prepro[df_prepro['message_text'].notna() | df_prepro['webpage_description'].notna()]
print(f"Row count after removing messages without text and webpage description: {len(df_prepro)}")

# remove messages where message text consists of only a url and no webpage description is attached
print("Removing messages with only a url...")
# na=False: a row without message text would otherwise yield NaN here, and
# negating that result with ~ is not a valid boolean operation.
text_has_url = df_prepro['message_text'].str.contains('http', na=False)
text_has_space = df_prepro['message_text'].str.contains(' ', na=False)
is_bare_url = text_has_url & ~text_has_space & df_prepro["webpage_description"].isna()
df_prepro = df_prepro[~is_bare_url]
print(f"Row count after removing messages with only a url: {len(df_prepro)}")

# remove messages shorter than 15 chars which are not replies to other messages and have no webpage attached
print("Removing short messages...")
is_short_standalone = (
  (df_prepro['message_text'].str.len() < 15)
  & df_prepro['reply_to_message_id'].isna()
  & (df_prepro['message_media_type'] != 'MessageMediaWebPage')
)
df_prepro = df_prepro[~is_short_standalone]
print(f"Row count after removing short messages: {len(df_prepro)}")

# remove messages that are replies to messages that are not in the dataset
print("Removing replies to messages not in dataset...")
df_prepro = filter_out_of_dataset_replies(df_prepro, df_additional)
print(f"Row count after removing replies to messages not in dataset: {len(df_prepro)}")

# shuffle dataset
print("Shuffling...")
df_prepro = df_prepro.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

print("Done!")
  

In [76]:
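# Optional: drop the helper language columns again before exporting.
# Currently disabled, so these columns are still present when the frames are written out.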
#df_prepro = df_prepro.drop(columns=['webpage_description_lang', 'message_text_lang', 'langset', 'langset_confident'])
#df_additional = df_additional.drop(columns=['webpage_description_lang', 'message_text_lang', 'langset', 'langset_confident'])

In [77]:
# Write the preprocessed frame and the unfiltered reference frame to disk.
for frame, csv_path in (
  (df_prepro, '../../data/main_dataset/main_dataset-prepro.csv'),
  (df_additional, '../../data/main_dataset/main_dataset-additional.csv'),
):
  frame.to_csv(csv_path)

In [58]:
# Exploratory check: rows in which some text is confidently detected
# (probability > 0.5) as a language other than German or English.
#
# NOTE: the main filtering cell already removes exactly these rows from
# df_raw, so on a top-to-bottom run this selection is empty. To inspect
# foreign-language rows, run it against a frame taken before that filter.
df_something_foreign = df_raw.copy()

# get_confident_langset is the helper defined with the other language helpers
# near the top of the notebook; it is not redefined here, because a second
# definition would silently shadow the first.
df_something_foreign["confident_langset"] = df_something_foreign.apply(get_confident_langset, axis=1)
df_something_foreign = df_something_foreign[df_something_foreign['confident_langset'].apply(lambda langs: any(lang not in ['de', 'en'] for lang in langs))]

In [None]:
# Show how many rows were flagged, then dump the first 100 for manual review.
print(len(df_something_foreign))

# columns printed per row, in this order
inspected_columns = (
  "message_text_lang",
  "webpage_description_lang",
  "langset",
  "confident_langset",
  "message_text",
  "webpage_description",
)

for _, sample in df_something_foreign.head(100).iterrows():
  print("---")
  for column in inspected_columns:
    print(sample[column])
  print()