In [7]:
import pandas as pd

# Path to the raw scrape CSV; it is relative, so it resolves against the current working directory.
raw_data_path = '../../data/24_09_28-test_scrape/24_09_28-test_scrape-raw.csv'

# Reading the dataset

In [9]:
# Load the raw scrape; index_col=0 makes the first CSV column the row index.
df_raw = pd.read_csv(raw_data_path, index_col=0)

# Preprocessing and filtering

In [10]:
def exclude_columns(df):
  """
  Exclude unnecessary columns from the dataset.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame that contains every column listed below.

  Returns
  -------
  pandas.DataFrame
      A new frame without the listed columns; `df` itself is not modified.

  Raises
  ------
  KeyError
      If any of the listed columns is missing from `df`.
  """
  unwanted = [
    'collection_time',
    'sender_first_name',
    'sender_last_name',
    'sender_display_name',
    'sender_username',
    'fwd_from_user_name',
    'post_author',
    'is_group_elem',
    'message_group_id',
  ]
  return df.drop(labels=unwanted, axis='columns')

def filter_dataset(df, full_df):
  """
  Remove unwanted messages from the dataset.

  Rows that are dropped:
  - Polls
  - Messages without text (except for those referencing a webpage)
  - Messages that are replies to messages that are not in `full_df`
    (the target is looked up within the same chat only)

  Parameters
  ----------
  df : pandas.DataFrame
      Messages to filter. Must contain the columns 'message_media_type',
      'message_text', 'reply_to_message_id' and 'chat_handle'.
  full_df : pandas.DataFrame
      Reference frame used to resolve reply targets. Must contain the
      columns 'chat_handle' and 'telegram_message_id'.

  Returns
  -------
  pandas.DataFrame
      The surviving rows in their original order with a fresh 0..n-1 index.
      Neither input frame is modified.
  """
  # filter out polls
  df = df[df['message_media_type'] != 'MessageMediaPoll']

  # filter out messages without text except for those referencing a webpage
  df = df[df['message_text'].notna() | (df['message_media_type'] == 'MessageMediaWebPage')]

  # Collect every (chat, message id) pair of the full dataset once, so each
  # reply is resolved with a set lookup instead of re-scanning full_df.
  # Rows without a chat handle are left out: a missing handle never matches.
  with_chat = full_df[full_df['chat_handle'].notna()]
  known_messages = set(zip(with_chat['chat_handle'], with_chat['telegram_message_id']))

  # Keep a row if it is not a reply, or if its reply target exists in the same
  # chat. The mask is positional, so rows are selected correctly even when the
  # index contains duplicate labels (dropping by label would remove every row
  # sharing the label).
  is_reply = df['reply_to_message_id'].notna()
  keep = [
    (not reply) or ((chat, target) in known_messages)
    for reply, chat, target in zip(is_reply, df['chat_handle'], df['reply_to_message_id'])
  ]
  keep_mask = pd.Series(keep, index=df.index, dtype=bool)

  # reset index
  return df[keep_mask].reset_index(drop=True)


In [None]:
# Fixed seed so the shuffle below yields the same row order on every run.
SHUFFLE_SEED = 42

# preprocessing which applies to all data, so no filtering
df_prepro = exclude_columns(df_raw)
df_additional = exclude_columns(df_raw)

# filtering; reply targets are resolved against the raw frame, so a reply is
# kept even if the message it answers is itself removed by the filter
df_prepro = filter_dataset(df_prepro, df_raw)

# shuffle dataset (reproducibly) and renumber the rows
df_prepro = df_prepro.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)


# Exporting the data
- Filtered dataset for rumor extraction
- Unfiltered dataset for potentially resolving reply threads 

In [12]:
# Filtered dataset; to_csv writes the row index as the first column by default.
df_prepro.to_csv('../../data/24_09_28-test_scrape/24_09_28-test_scrape-prepro.csv')
# Unfiltered dataset (columns excluded only).
# NOTE(review): this target path has no '.csv' extension, unlike the one above — confirm
# whether that is intentional before renaming; other code may read the file by this name.
df_additional.to_csv('../../data/24_09_28-test_scrape/24_09_28-test_scrape-additional')