# About
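This notebook creates a subset of the `brianmatzelle/2024-election-subreddit-threads-643k` dataset, keeping only threads from the subreddits listed in `SUBREDDITS`, for finetuning a GPT-style conversational language model.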

In [36]:
from loguru import logger

In [37]:
# Configuration: names of the subreddits whose threads are kept in the subset.
# Add one entry per subreddit.
SUBREDDITS = [
    "hasan_piker",
]

In [38]:
from datasets import load_dataset
from utils import to_k

# Load the "train" split of the full election-thread dataset.
dataset = load_dataset(
    "brianmatzelle/2024-election-subreddit-threads-643k",
    split="train",
)

# Remember the unfiltered row count; the filtering cell uses it to report how
# many rows were removed.
prefilter_size = len(dataset)
logger.info(f"Prefilter dataset size: {to_k(prefilter_size)}")

# Display the first record as the cell output.
dataset[0]

[32m2024-11-27 00:01:57.342[0m | [1mINFO    [0m | [36mutils[0m:[36mto_k[0m:[36m36[0m - [1m643k rows[0m
[32m2024-11-27 00:01:57.343[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mPrefilter dataset size: 643k[0m


{'conversations': [{'content': 'You are a redditor, having a conversation with another redditor.',
   'role': 'system'},
  {'content': "On 270ToWin's website, North Carolina is now beige and South Carolina is now a lighter shade of red.",
   'role': 'user'},
  {'content': 'It would be awesome to clean sweep the east coast.',
   'role': 'assistant'}],
 'metadata': {'post': {'author': 'MrMockTurtle',
   'downvotes': 0,
   'flair': '📊 Poll',
   'score': 59,
   'suggested_sort': None,
   'upvote_ratio': 0.98,
   'upvotes': 59},
  'subreddit': {'name': 'democrats', 'subscribers': 465882}}}

In [None]:
from collections import Counter

# Gather the post count and the subscriber count of every subreddit in a single
# pass. Looking the subscriber count up by rescanning the whole dataset once per
# subreddit would iterate all rows again for every entry in the ranking.
subreddit_counts = Counter()
subreddit_subscribers = {}
for post in dataset:
    subreddit_meta = post['metadata']['subreddit']
    subreddit_name = subreddit_meta['name']
    subreddit_counts[subreddit_name] += 1
    # Later rows overwrite earlier ones, so the value reported is the one
    # attached to the last post seen for that subreddit.
    subreddit_subscribers[subreddit_name] = subreddit_meta['subscribers']

# (name, count) pairs sorted by post count, descending; ties keep the order in
# which the subreddits were first encountered.
ranked_subreddits = subreddit_counts.most_common()

# Print the ranking
for i, (subreddit, count) in enumerate(ranked_subreddits, start=1):
    print(f"{i}. r/{subreddit}: {count} posts, {subreddit_subscribers[subreddit]} subscribers")

# Optionally log the results if needed
logger.info(f"Subreddit ranking:\n{ranked_subreddits}")

[32m2024-11-27 00:02:19.503[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m13[0m - [1mSubreddit ranking:
[('politics', 308399), ('destiny', 89524), ('conservative', 43101), ('neoliberal', 40793), ('democrats', 34886), ('thedavidpakmanshow', 23745), ('vaushv', 13264), ('hasan_piker', 10740), ('republican', 10720), ('libertarian', 9904), ('jordanpeterson', 9247), ('samharris', 8719), ('worldnews', 7638), ('themajorityreport', 6382), ('socialism', 6035), ('news', 4728), ('seculartalk', 4242), ('joerogan', 4111), ('millenials', 3049), ('economics', 1761), ('daverubin', 1733), ('benshapiro', 716), ('progressive', 159)][0m


1. r/politics: 308399 posts
2. r/destiny: 89524 posts
3. r/conservative: 43101 posts
4. r/neoliberal: 40793 posts
5. r/democrats: 34886 posts
6. r/thedavidpakmanshow: 23745 posts
7. r/vaushv: 13264 posts
8. r/hasan_piker: 10740 posts
9. r/republican: 10720 posts
10. r/libertarian: 9904 posts
11. r/jordanpeterson: 9247 posts
12. r/samharris: 8719 posts
13. r/worldnews: 7638 posts
14. r/themajorityreport: 6382 posts
15. r/socialism: 6035 posts
16. r/news: 4728 posts
17. r/seculartalk: 4242 posts
18. r/joerogan: 4111 posts
19. r/millenials: 3049 posts
20. r/economics: 1761 posts
21. r/daverubin: 1733 posts
22. r/benshapiro: 716 posts
23. r/progressive: 159 posts


In [40]:
# Keep only the threads whose subreddit name is listed in SUBREDDITS.
# NOTE: `dataset` is rebound to the filtered subset here, so the unfiltered data
# is only available again after re-running the loading cell.
dataset = dataset.filter(lambda x: x['metadata']['subreddit']['name'] in SUBREDDITS)

# `prefilter_size` is deliberately NOT deleted: it is a plain int, and keeping it
# lets this cell be re-run without raising NameError. Re-running is safe because
# filtering an already-filtered dataset with the same predicate keeps every row.
logger.info(f"Filtered {to_k(prefilter_size - len(dataset))} posts from the dataset")

logger.info(f"Dataset size: {to_k(len(dataset))} posts")

# Display the first remaining record as the cell output.
dataset[0]

[32m2024-11-27 00:02:19.543[0m | [1mINFO    [0m | [36mutils[0m:[36mto_k[0m:[36m36[0m - [1m632k rows[0m
[32m2024-11-27 00:02:19.544[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mFiltered 632k posts from the dataset[0m
[32m2024-11-27 00:02:19.545[0m | [1mINFO    [0m | [36mutils[0m:[36mto_k[0m:[36m36[0m - [1m10k rows[0m
[32m2024-11-27 00:02:19.546[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mDataset size: 10k posts[0m


{'conversations': [{'content': 'You are a redditor, having a conversation with another redditor.',
   'role': 'system'},
  {'content': 'I just got done arguing in the comments of the r/codyko_snark about Hasans reaction to Deangelo’s video. Gen Z’s feel his reaction was untimely, and without substance. Specifically calling both of those “red flags,” and in general being dissatisfied with the way he shed light on it…. Am I the only one who thinks what he spoke about was completely sufficient? Like he’s a political commentator who’s been invited into live national tv shows, not a YouTube, drama, etc. channel. I know what happened between Cody and Tana was a crime, not drama. But would it not be incredibly inappropriate to stop talking about Palestinians getting murdered, and the dire spot our election is to dissect this entire situation? There are massive YouTube channels that are in that realm of content and have made their in depth analysis. Why are people attacking Hasan like this is 