# Preprocessing


In [95]:
import pandas as pd
import json
import re

In [97]:
# Location of the raw subreddit export
file1 = 'Bopl_Battle.csv'

# Load the CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=file1)

In [99]:
# (rows, columns) of the frame as loaded from the CSV
df.shape

(520, 7)

In [101]:
# Drop duplicates: two rows are the same entry when type, author, title and
# body all match. score is left out of the key as it can change over time.
df = (
    df
    .drop_duplicates(subset=['type', 'author', 'title', 'body'])
    .reset_index(drop=True)
)


In [103]:
# (rows, columns) after dropping duplicate rows
df.shape

(519, 7)

In [105]:
# Exclude comments and posts that were removed or deleted, plus posts whose
# body is a single space (posts that are just links). Rows with a missing
# (NaN) body are kept here, exactly as with the original != comparisons.
# .copy() makes the result an independent frame so the assignment below
# writes to df itself rather than to a view of the previous frame.
df = df.loc[~df['body'].isin(['[removed]', ' ', '[deleted]'])].copy()

# Some posts have the body as just 'Title'. Blank ONLY the body of those rows:
# without the column indexer, .loc would overwrite every column of the row
# (type, author, title, score, url, ...) with ' '.
df.loc[df['body'] == 'Title', 'body'] = ' '

In [107]:
# Rows where the title or the body (or both) is missing
has_missing_text = df[['title', 'body']].isna().any(axis=1)
nan_in_columns = df[has_missing_text]
nan_in_columns

Unnamed: 0,type,author,title,body,created_date,score,url
61,submission,Schmiple,Petition for Mac Version,,2024-01-16 22:18:20,53,https://i.redd.it/ieyszht1nvcc1.jpeg
90,submission,,anyone wanna hop on some bopl battle?,,2024-01-16 04:12:29,4,https://www.reddit.com/r/Bopl_Battle/comments/...
108,submission,Diego-who,If SUPER SMASH had DEADLY SLIMES💣! ~ Bopl Battle,,2024-01-13 15:12:34,2,https://youtu.be/RCWQA-TAHpU
126,submission,Secret-Net-3660,Does anyone know what this ability is,,2024-01-09 17:26:28,20,https://i.redd.it/fo4648do8gbc1.jpeg
245,submission,BColen1c,Lookin for someone to play,,2023-12-27 20:26:04,3,https://www.reddit.com/r/Bopl_Battle/comments/...
250,submission,WackyModer,Bad Apple Bopl Battle (It sounds and looks bet...,,2023-12-24 22:12:36,30,https://v.redd.it/g2w50sfvgb8c1
266,submission,kurdistanmonke,My Friend Coping In Bopl Battle,,2023-12-23 21:41:34,2,https://www.youtube.com/watch?v=6q1RJk1ZKm8
267,submission,Serious-Thought-6138,can somebody play with me in bopl battle,,2023-12-23 07:34:45,5,https://www.reddit.com/r/Bopl_Battle/comments/...
272,submission,AmorguSUS,Can you cancle a grappling hook(your own and o...,,2023-12-23 01:10:58,4,https://www.reddit.com/r/Bopl_Battle/comments/...
275,submission,Other_East_6912,Particle accelerator,,2023-12-22 08:02:09,16,https://v.redd.it/ixp4ny2kzs7c1


In [109]:
# Replace missing body/title values with a single space so both columns
# contain only strings
for text_column in ('body', 'title'):
    df[text_column] = df[text_column].fillna(' ')


In [111]:
def remove_emoticons_and_url_links(text):
    """Return ``text`` with URLs and all non-ASCII characters deleted.

    URLs (anything starting with ``http://``, ``https://`` or ``www.`` up to
    the next whitespace) are removed first, then every remaining non-ASCII
    character, which includes emojis. Matches are deleted outright, so the
    whitespace around them is left as it was.
    """
    # URL: scheme-prefixed or www.-prefixed run of non-whitespace characters
    url_regex = r'https?://\S+|www\.\S+'

    # Any character outside 7-bit ASCII, or any astral-plane code point
    non_ascii_regex = r'[^\x00-\x7F]' r'|[\U00010000-\U0010FFFF]'

    without_urls = re.sub(url_regex, '', text)
    return re.sub(non_ascii_regex, '', without_urls)

In [113]:
# Strip URLs and non-ASCII characters from both free-text columns
for text_column in ('title', 'body'):
    df[text_column] = df[text_column].apply(remove_emoticons_and_url_links)

# Collecting YouTube-relevant data

In [115]:
# Define YouTube-related keywords
youtube_keywords = [
    'youtube', 'youtuber', 'video', 'channel', 'subscribe',
    'watched', 'stream', 'streamer', 'reaction', 'playthrough',
    "let's play", 'review', 'gameplay', 'walkthrough', 'tutorial',
    'live stream', 'commentary', 'episode', 'highlight', 'vlog',
    'content creator', 'subscribed', 'unboxing', 'collab',
    'featured in', 'reaction video', 'uploaded', 'livestreamed',
    'clip', 'clips', 'playback', 'fan of', 'mentioned in', 'shared by', 'suggested'
]

# One pattern that matches any keyword as a whole word/phrase. re.escape keeps
# every keyword literal even if one ever contains a regex metacharacter.
keyword_pattern = (
    r'\b(?:' + '|'.join(re.escape(keyword) for keyword in youtube_keywords) + r')\b'
)

# Combine title, body, and url columns for keyword search (one string per
# row). A missing url column contributes an empty string.
url_text = df['url'].astype(str) if 'url' in df.columns else ''
search_text = df['title'].astype(str) + ' ' + df['body'].astype(str) + ' ' + url_text

# Case-insensitive search over all rows at once instead of rebuilding each row
# with df.iloc[i] and running one regex per keyword.
is_relevant = search_text.str.contains(
    keyword_pattern, flags=re.IGNORECASE, regex=True, na=False
)

# List to store the (positional) indices of relevant rows
relevant_rows = [position for position, hit in enumerate(is_relevant) if hit]

# Create a new DataFrame with the relevant rows
youtube_relevant_df = df.iloc[relevant_rows]

# json.dump would serialise missing values (NaN) as the bare token NaN, which
# is not valid JSON; map them to None so they are written as null.
list_of_dicts = [
    {column: (None if pd.isna(value) else value) for column, value in record.items()}
    for record in youtube_relevant_df.to_dict(orient='records')
]

# Create the dictionary with the desired format
data_to_write = {"submissions": list_of_dicts}

# Write the dictionary to a JSON file
# NOTE(review): this file name has no '.json' extension, unlike the CSV below;
# left unchanged in case something downstream reads this exact path - confirm.
with open('processed_youtube_relevant_Bopl_Battle', 'w') as json_file:
    json.dump(data_to_write, json_file, indent=4)

# Save the filtered data to a new CSV file
output_path = "processed_youtube_relevant_Bopl_Battle.csv"
youtube_relevant_df.to_csv(output_path, index=False)

print(f"Number of relevant rows: {len(youtube_relevant_df)}")
print(f"Filtered data saved to: {output_path}")

Number of relevant rows: 15
Filtered data saved to: processed_youtube_relevant_Bopl_Battle.csv
