# About
This notebook is used to create a subset of the dataset for finetuning a GPT-style conversational language model.

## Config

In [1]:
# Config
from typing import List

# Subreddits that appear in the base dataset (for reference when editing SUBREDDITS):
# ALL_SUBREDDITS = ['destiny', 'hasan_piker', 'politics', 'vaushv', 'millenials', 'news',
#   'worldnews', 'economics', 'socialism', 'conservative', 'libertarian',
#   'neoliberal', 'republican', 'democrats', 'progressive', 'daverubin',
#   'jordanpeterson', 'samharris', 'joerogan', 'thedavidpakmanshow', 'benshapiro',
#   'themajorityreport', 'seculartalk']
# Subreddits to keep in the curated subset.
SUBREDDITS: List[str] = ['hasan_piker']
# HUGGINGFACE_USER = 'brianmatzelle'
HUGGINGFACE_USER: str = 'BinghamtonUniversity'
# Size, in thousands, that is encoded in the base dataset's name.
K_COUNT: int = 173
HUGGINGFACE_BASE_DATASET: str = f'2024-election-subreddit-threads-{K_COUNT}k'

# Exports
SUBREDDIT_NAME: str = "-".join(SUBREDDITS)
HUGGINGFACE_SUBSET_DATASET: str = HUGGINGFACE_BASE_DATASET.replace("subreddit", SUBREDDIT_NAME)


# DO NOT EDIT BELOW THIS LINE
# Strip the base dataset's size (e.g. "173k") from the end of the subset name but
# keep the trailing "-": the filtering cell later appends the subset's own size.
# The number of characters removed is derived from K_COUNT, so the strip stays
# correct when K_COUNT does not have exactly three digits.
_size_suffix: str = f'{K_COUNT}k'
if HUGGINGFACE_SUBSET_DATASET.endswith(f'-{_size_suffix}'):
  HUGGINGFACE_SUBSET_DATASET = HUGGINGFACE_SUBSET_DATASET[:-len(_size_suffix)]

# Print config
print(f'SUBREDDITS: {SUBREDDITS}')
print(f'HUGGINGFACE_BASE_DATASET: {HUGGINGFACE_BASE_DATASET}')
print(f'HUGGINGFACE_SUBSET_DATASET: {HUGGINGFACE_SUBSET_DATASET}')

SUBREDDITS: ['hasan_piker']
HUGGINGFACE_BASE_DATASET: 2024-election-subreddit-threads-173k
HUGGINGFACE_SUBSET_DATASET: 2024-election-hasan_piker-threads-


In [2]:
from datasets import load_dataset, Dataset
from loguru import logger
from utils import to_k

# Load the "train" split of the base dataset configured in the config cell.
base_repo_id = f"{HUGGINGFACE_USER}/{HUGGINGFACE_BASE_DATASET}"
dataset: Dataset = load_dataset(base_repo_id, split = "train")
# Offline alternative (e.g. when working on a plane): load the same data from a local JSON file.
# dataset = load_dataset('json', data_files=f'data/datasets/2024-election-subreddit-threads-{K_COUNT}k.json', split='train')

# Report the size before any filtering, then display the first record as the cell output.
prefilter_size = to_k(len(dataset))
logger.info(f"Prefilter dataset size: {prefilter_size}")
dataset[0]

  from .autonotebook import tqdm as notebook_tqdm
[32m2024-12-05 21:53:48.700[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mPrefilter dataset size: 173k[0m


{'conversations': [{'content': 'The GOP\'s big, mysterious enemy: "They"',
   'metadata': {'author': 'Steve____Stifler',
    'controversiality': None,
    'created_utc': 1721334266,
    'downvotes': 0,
    'flair': 'News (US)',
    'is_submitter': None,
    'no_follow': True,
    'removed_by_category': 'moderator',
    'score': 343,
    'suggested_sort': None,
    'title': 'The GOP\'s big, mysterious enemy: "They"',
    'total_awards_received': 0,
    'upvote_ratio': 0.97,
    'upvotes': 343},
   'role': 'user'},
  {'content': '"Well who\'s \'THEY\'?? ... ... What the hell is an Aluminium Falcon?"',
   'metadata': {'author': 'mad_cheese_hattwe',
    'controversiality': 0,
    'created_utc': 1721336569,
    'downvotes': 0,
    'flair': None,
    'is_submitter': False,
    'no_follow': True,
    'removed_by_category': None,
    'score': 30,
    'suggested_sort': None,
    'title': None,
    'total_awards_received': 0,
    'upvote_ratio': None,
    'upvotes': 30},
   'role': 'assistant'},

## To Do (Analysis)
Give the user the option to run an analysis on the dataset.

Move these next two cells to an analysis function, add more/better analysis.

In [3]:
from collections import Counter
from typing import Dict, List, Tuple

# Tally posts per subreddit and record each subreddit's subscriber count in a
# single pass over the dataset (iterating every row is the expensive part, so
# the rows are only walked once).
subreddit_counts: Counter = Counter()
subs_map: Dict[str, int] = {}
for post in dataset:
    sr_metadata = post['metadata']['subreddit']
    subreddit_name = sr_metadata['name']
    subreddit_counts[subreddit_name] += 1
    # Keep the subscriber count from the first post seen for each subreddit.
    if subreddit_name not in subs_map:
        subs_map[subreddit_name] = sr_metadata['subscribers']

# Sort by the number of posts in descending order; ties keep first-seen order.
ranked_subreddits: List[Tuple[str, int]] = subreddit_counts.most_common()

for i, (subreddit, count) in enumerate(ranked_subreddits, start=1):
    subs = subs_map.get(subreddit, 0)
    print(f"{i}. r/{subreddit}: {count} posts, {subs} subs")

# Optionally log the results if needed
logger.info(f"Subreddit ranking:\n{ranked_subreddits}")

[('politics', 91061), ('destiny', 21991), ('neoliberal', 13620), ('conservative', 8825), ('thedavidpakmanshow', 5210), ('democrats', 5018), ('samharris', 3501), ('vaushv', 3330), ('worldnews', 3270), ('jordanpeterson', 3200), ('libertarian', 2855), ('hasan_piker', 2241), ('news', 1977), ('republican', 1265), ('joerogan', 1187), ('themajorityreport', 1094), ('economics', 1005), ('seculartalk', 859), ('millenials', 846), ('socialism', 824), ('daverubin', 289), ('benshapiro', 218), ('progressive', 27)]


[32m2024-12-05 21:54:44.459[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mSubreddit ranking:
[('politics', 91061), ('destiny', 21991), ('neoliberal', 13620), ('conservative', 8825), ('thedavidpakmanshow', 5210), ('democrats', 5018), ('samharris', 3501), ('vaushv', 3330), ('worldnews', 3270), ('jordanpeterson', 3200), ('libertarian', 2855), ('hasan_piker', 2241), ('news', 1977), ('republican', 1265), ('joerogan', 1187), ('themajorityreport', 1094), ('economics', 1005), ('seculartalk', 859), ('millenials', 846), ('socialism', 824), ('daverubin', 289), ('benshapiro', 218), ('progressive', 27)][0m


1. r/politics: 91061 posts, 8667968 subs
2. r/destiny: 21991 posts, 250789 subs
3. r/neoliberal: 13620 posts, 177896 subs
4. r/conservative: 8825 posts, 1128052 subs
5. r/thedavidpakmanshow: 5210 posts, 51685 subs
6. r/democrats: 5018 posts, 477550 subs
7. r/samharris: 3501 posts, 110082 subs
8. r/vaushv: 3330 posts, 66254 subs
9. r/worldnews: 3270 posts, 40162440 subs
10. r/jordanpeterson: 3200 posts, 305909 subs
11. r/libertarian: 2855 posts, 503892 subs
12. r/hasan_piker: 2241 posts, 143479 subs
13. r/news: 1977 posts, 28557108 subs
14. r/republican: 1265 posts, 199853 subs
15. r/joerogan: 1187 posts, 1316283 subs
16. r/themajorityreport: 1094 posts, 73354 subs
17. r/economics: 1005 posts, 4577817 subs
18. r/seculartalk: 859 posts, 26571 subs
19. r/millenials: 846 posts, 103891 subs
20. r/socialism: 824 posts, 458927 subs
21. r/daverubin: 289 posts, 22017 subs
22. r/benshapiro: 218 posts, 57525 subs
23. r/progressive: 27 posts, 77881 subs


In [4]:
# NOTE(review): this whole cell is disabled (every line is commented out).
# Before re-enabling it, check the following:
#   - `controversiality[...] += post` extends the list with the keys of `post`
#     rather than adding the post itself; `.append(post)` is probably intended.
#   - The sample record printed earlier in this notebook shows 'controversiality'
#     values of None and 0 inside the conversation entries; if the post-level
#     value can also be None, `< 90` raises a TypeError — TODO confirm the
#     field's location and range.
# # show most controversial posts
# from collections import defaultdict
# from pprint import pprint

# # Create a dictionary to store the posts for each subreddit
# controversiality = defaultdict(list)
# for post in dataset:
#     if post['metadata']['controversiality'] < 90:
#         continue
#     controversiality[post['metadata']['controversiality']] += post


# # print the controversiality
# pprint(controversiality)

## To Do (Dynamically choose refinement options)
Give the user (the one running this notebook) config options after viewing the data, so they can curate a dataset of their own.

In [None]:
# Keep only the posts whose subreddit is listed in SUBREDDITS.
subset = dataset.filter(lambda x: x['metadata']['subreddit']['name'] in SUBREDDITS)

# Fail early, before the base dataset is released and the name is modified,
# if the filter matched nothing (every later cell needs at least one row).
if len(subset) == 0:
    raise ValueError(f"No posts found for subreddits {SUBREDDITS}; check the names in the config cell")

logger.info(f"Filtered {to_k(len(dataset) - len(subset))} posts from the dataset")
# The unfiltered dataset is not used after this point.
del dataset

filtered_size_k = to_k(len(subset))
logger.info(f"Dataset size: {filtered_size_k} posts")

# Append the new size to the subset dataset name before saving.
# The check keeps this idempotent: if the dataset is reloaded and this cell is
# run again, the size is not appended a second time (e.g. "...-2k2k").
if not HUGGINGFACE_SUBSET_DATASET.endswith(f"-{filtered_size_k}"):
    HUGGINGFACE_SUBSET_DATASET += f"{filtered_size_k}"

subset[0]

In [None]:
# OPTIONAL: Prepare for OpenAI
import os

# Fixed seed so the exported train/test files are identical across runs.
OPENAI_SPLIT_SEED: int = 42

# Drop the top-level "metadata" column and rename "conversations" to "messages".
# NOTE(review): the sample record printed earlier in this notebook shows a
# 'metadata' key inside each conversation entry as well; remove_columns does not
# touch those nested keys — confirm the consumer of these files accepts them.
openai_subset = subset.remove_columns(["metadata"]).rename_column("conversations", "messages")
# add a new column for weight, make every value 1
# openai_subset = openai_subset.add_column("weight", [1] * len(openai_subset))

# All three exports share the same directory and file-name prefix.
openai_dir = f"data/subsets/openai/{SUBREDDIT_NAME}"
os.makedirs(openai_dir, exist_ok=True)
openai_prefix = f"{openai_dir}/{HUGGINGFACE_SUBSET_DATASET}-openai"

# save complete subset to jsonl file
openai_subset.to_json(f"{openai_prefix}.jsonl", lines=True)

# split into train and test (90% / 10%); select the splits by key rather than
# relying on the order of the returned mapping
openai_splits = openai_subset.train_test_split(test_size=0.1, seed=OPENAI_SPLIT_SEED)
train_subset = openai_splits["train"]
test_subset = openai_splits["test"]

# save train and test subsets to jsonl files
train_subset.to_json(f"{openai_prefix}-train.jsonl", lines=True)
test_subset.to_json(f"{openai_prefix}-test.jsonl", lines=True)

openai_subset[0]

## To Do
Give the user the option to analyze the dataset again after removing unwanted subreddits.

In [7]:
# save locally
# subset.to_json(f"data/subsets/{HUGGINGFACE_SUBSET_DATASET}.json")

## Push to Hub
Before you can run this cell, you need to

1. `pip install huggingface_hub`
2. `huggingface-cli login`

OR

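In your shell, run
1. `export HF_TOKEN=YOUR_WRITE_ACCESS_TOKEN_FROM_HUGGINGFACE`

Note: the cell below imports `dotenv` (`pip install python-dotenv`) and calls `load_dotenv()`, so `HF_TOKEN` can also be set in a local `.env` file instead of being exported.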

In [None]:
import os
from dotenv import load_dotenv

# Read variables from a local .env file (if any) into the process environment.
load_dotenv()

# push curated dataset to huggingface
hub_repo_id = f"{HUGGINGFACE_USER}/{HUGGINGFACE_SUBSET_DATASET}"
hf_token = os.getenv("HF_TOKEN")  # None when the variable is not set
subset.push_to_hub(hub_repo_id, token=hf_token)