# About
This notebook creates a subset of the 2024 election subreddit threads dataset, restricted to a configurable list of subreddits, for fine-tuning a GPT-style conversational language model.

In [None]:
from loguru import logger

In [None]:
# Config
# Subreddit names to keep; a later cell drops every post whose
# post['metadata']['subreddit']['name'] is not in this list.
SUBREDDITS = [
    "hasan_piker",
]

In [None]:
from datasets import load_dataset
from utils import to_k

# Load the "train" split and remember its size so the filtering cell can
# report how many posts were removed.
dataset = load_dataset(
    "brianmatzelle/2024-election-subreddit-threads-643k",
    split="train",
)
prefilter_size = len(dataset)
logger.info(f"Prefilter dataset size: {to_k(prefilter_size)}")

# Last expression of the cell: show the first record to inspect its structure.
dataset[0]

In [None]:
from collections import Counter

# Count the posts in each subreddit and record its subscriber count in a
# single pass over the dataset. Previously the dataset was re-scanned once per
# subreddit inside the print loop just to look up `subscribers`.
subreddit_counts = Counter()
subreddit_subscribers = {}
for post in dataset:
    subreddit_info = post['metadata']['subreddit']
    name = subreddit_info['name']
    subreddit_counts[name] += 1
    # Later posts overwrite earlier ones, so the value kept is the one from
    # the last post seen for that subreddit (same as the old full-scan loop).
    subreddit_subscribers[name] = subreddit_info['subscribers']

# Sort by the number of posts in descending order; ties keep the order in
# which the subreddits were first encountered.
ranked_subreddits = subreddit_counts.most_common()

# Print the ranking
for i, (subreddit, count) in enumerate(ranked_subreddits, start=1):
    subscribers = subreddit_subscribers[subreddit]
    print(f"{i}. r/{subreddit}: {count} posts, {subscribers} subscribers")

# Also send the raw (name, count) ranking to the log
logger.info(f"Subreddit ranking:\n{ranked_subreddits}")

In [None]:
# Show the most controversial posts, overall and per subreddit
from collections import defaultdict
from pprint import pprint

# Maps subreddit name -> list of (post, controversiality) tuples
subreddit_posts = defaultdict(list)
# Every post tied for the highest controversiality seen so far
most_controversial_posts = []
highest_score = 0

for post in dataset:
    controversiality = post['metadata']['controversiality']
    # Use the same field as the other cells (ranking and filtering) so the
    # grouping key is the subreddit name rather than a top-level key.
    subreddit = post['metadata']['subreddit']['name']

    # Add the post to the corresponding subreddit list
    subreddit_posts[subreddit].append((post, controversiality))

    # Track the running maximum and every post that ties with it
    if controversiality > highest_score:
        highest_score = controversiality
        most_controversial_posts = [post]  # New maximum: restart the list
    elif controversiality == highest_score:
        most_controversial_posts.append(post)  # Tie with the current maximum

# Sort each subreddit's posts by controversiality, highest first. The sort is
# stable, so posts with equal scores keep their dataset order.
for posts in subreddit_posts.values():
    posts.sort(key=lambda entry: entry[1], reverse=True)

# Display the results
print("Most Controversial Posts Across All Subreddits:")
pprint(most_controversial_posts)

print("\nMost Controversial Posts By Subreddit:")
for subreddit, posts in subreddit_posts.items():
    print(f"Subreddit: {subreddit}")
    # Display top 5 most controversial posts per subreddit (scores dropped)
    pprint([top_post for top_post, _ in posts[:5]])


In [None]:
def _in_selected_subreddits(post):
    """Return True when the post's subreddit name is listed in SUBREDDITS."""
    return post['metadata']['subreddit']['name'] in SUBREDDITS


# Keep only the posts from the configured subreddits.
dataset = dataset.filter(_in_selected_subreddits)

# Report how many posts the filter removed, then drop the no-longer-needed
# pre-filter size.
removed_count = prefilter_size - len(dataset)
logger.info(f"Filtered {to_k(removed_count)} posts from the dataset")
del prefilter_size

logger.info(f"Dataset size: {to_k(len(dataset))} posts")

# Last expression of the cell: show the first remaining record.
dataset[0]