In [None]:
import pandas as pd
import ftfy
import re

# Clean Text Data and Create Smaller Task-Specific Datasets

In [None]:
# Read the raw Reddit comments export into a DataFrame.
# '{add directory}' is a placeholder to be replaced with the folder holding the CSV.
raw_comments_csv = '{add directory}/reddit_opinion_PSE_ISR_june.csv'
reddit_comments = pd.read_csv(raw_comments_csv)

## Remove Irrelevant Columns

In [None]:
# Author, vote and karma metadata columns to discard from the comments frame
columns_to_remove = [
    'author_name', 'controversiality', 'ups', 'downs',
    'user_is_verified', 'user_account_created_time', 'user_awardee_karma',
    'user_awarder_karma', 'user_link_karma', 'user_comment_karma',
    'user_total_karma', 'post_upvote_ratio', 'post_thumbs_ups',
    'post_total_awards_received'
]

# This cell overwrites `reddit_comments` with its own output, so a second run
# would look for columns that are already gone. errors='ignore' skips labels
# that are absent instead of raising KeyError, which keeps the cell re-runnable.
reddit_comments = reddit_comments.drop(columns=columns_to_remove, errors='ignore')


## FTFY Cleaning

In [None]:
# Free-text columns that go through the same three-step cleaning pipeline
text_columns = ['self_text', 'post_title', 'post_self_text']

# Compiled once and reused: matches runs of characters outside the 7-bit ASCII range
non_ascii_run = re.compile(r'[^\x00-\x7F]+')

for column in text_columns:
    # 1) NaN -> '' so every value handed to ftfy is a string
    cleaned = reddit_comments[column].fillna('')
    # 2) let ftfy repair the text
    cleaned = cleaned.apply(ftfy.fix_text)
    # 3) strip whatever non-ASCII characters remain after ftfy
    cleaned = cleaned.apply(lambda value: non_ascii_run.sub('', value))
    reddit_comments[column] = cleaned

In [None]:
# Unique posts dataset: post-level fields only, one row per post_id
# (the first row seen for each post_id is the one kept).
post_columns = ['post_id', 'post_title', 'post_self_text', 'subreddit']
unique_posts_df = reddit_comments.loc[:, post_columns].drop_duplicates(subset=['post_id'], keep='first')


## Filter Out Irrelevant Posts

In [None]:
# Vocabulary for the Israel-Palestine relevance filter.
# Each term is lower-cased as the list is built so it can be compared
# against lower-cased text (case-insensitive matching).
keywords = [
    term.lower()
    for term in (
        'Israel', 'Palestine', 'Gaza', 'West Bank', 'Hamas', 'IDF', 'Jerusalem', 'Zionist', 'Hezbollah',
        'Middle East conflict', 'two-state solution', 'Intifada', 'settlements', 'Nakba', 'al-Aqsa', 'peace talks',
        'Palestinian Authority', 'Netanyahu', 'Fatah', 'aipac', 'rafah', 'palestinian', 'zionism',
        'jabalia', 'protest', 'protestor', 'knesset', 'occupation', 'Natenyahu',
    )
]

# Function to check if any keyword is in a given text
def contains_keywords(text, keywords):
    """Report whether ``text`` contains at least one of ``keywords`` as a substring.

    ``text`` is lower-cased before matching; the keywords are compared exactly
    as given, so callers should pass them already lower-cased.

    Args:
        text: String to search. A missing value (per ``pd.isna``) yields ``False``.
        keywords: Iterable of substrings to look for.

    Returns:
        ``True`` if any keyword occurs in the lower-cased text, otherwise ``False``.
    """
    if not pd.isna(text):
        lowered = text.lower()
        for keyword in keywords:
            if keyword in lowered:
                return True
    return False

# Multi-topic subreddits: only posts from these communities are screened by keyword
target_subreddits = [
    'CrazyFuckingVideos',
    'CombatFootage',
    'PublicFreakout',
    'worldnewsvideo',
    'worldnews',
    'NonCredibleDefense',
    'NoahGetTheBoat',
    'AbruptChaos',
    'TerrifyingAsFuck',
    'ActualPublicFreakouts',
]

# Keep only posts from the multi-topic subreddits; these are the ones screened for relevance.
# .copy() makes this an independent frame, so adding the 'relevant' column below
# is a plain column assignment rather than a write onto a slice of unique_posts_df
# (the pattern that triggers pandas' SettingWithCopyWarning).
target_subreddit_df = unique_posts_df[unique_posts_df['subreddit'].isin(target_subreddits)].copy()

# A post is relevant when its title or its body mentions at least one keyword.
# The flags are built as an explicitly bool-typed Series aligned on the frame's index,
# so the column stays a valid boolean mask even when no post matched the subreddit filter.
target_subreddit_df['relevant'] = pd.Series(
    [
        contains_keywords(title, keywords) or contains_keywords(body, keywords)
        for title, body in zip(target_subreddit_df['post_title'], target_subreddit_df['post_self_text'])
    ],
    index=target_subreddit_df.index,
    dtype=bool,
)

# Posts from the target subreddits with no keyword hit are the irrelevant ones
irrelevant_posts = target_subreddit_df.loc[~target_subreddit_df['relevant']].drop(columns=['relevant'])

# Everything else (keyword hits plus all posts outside the target subreddits) is kept as relevant
relevant_posts = unique_posts_df.drop(index=irrelevant_posts.index)

# Save both datasets; every output path uses the same '{add directory}' placeholder prefix.
# The relevant posts dataset will be used for running the RoBERTa on posts.
relevant_posts.to_csv('{add directory}/relevant_posts.csv', index=False)
irrelevant_posts.to_csv('{add directory}/irrelevant_posts.csv', index=False)



## Making Subdatasets 

In [None]:
# Keep only comments whose post was not classified as irrelevant, then save the result
irrelevant_post_ids = irrelevant_posts['post_id']
keep_mask = ~reddit_comments['post_id'].isin(irrelevant_post_ids)
reddit_comments_updated = reddit_comments.loc[keep_mask]
reddit_comments_updated.to_csv('{add directory}/reddit_comments_updated.csv', index=False)


In [None]:
# 1. URL extraction dataset: a single 'post_id' column with one row per distinct post
post_id_frame = reddit_comments_updated.loc[:, ['post_id']]
url_extraction_df = post_id_frame.drop_duplicates()
url_extraction_df.to_csv('{add directory}/url_extraction_dfs.csv', index=False)

# 2. Unique comments dataset: comment id and text, one row per comment_id
comment_frame = reddit_comments_updated.loc[:, ['comment_id', 'self_text']]
unique_comments_df = comment_frame.drop_duplicates(subset=['comment_id'])
unique_comments_df.to_csv('{add directory}/unique_comments_df.csv', index=False)
