# About
This notebook is used to create a subset of the dataset for finetuning a GPT-style conversational language model.

## Config

In [1]:
from typing import List
# Config
  # ALL SUBREDDITS == ['destiny' 'hasan_piker' 'politics' 'vaushv' 'millenials' 'news'
  # 'worldnews' 'economics' 'socialism' 'conservative' 'libertarian'
  # 'neoliberal' 'republican' 'democrats' 'progressive' 'daverubin'
  # 'jordanpeterson' 'samharris' 'joerogan' 'thedavidpakmanshow' 'benshapiro'
  # 'themajorityreport' 'seculartalk']
# Subreddits to keep in the subset (see the full list above).
SUBREDDITS: List[str] = ['hasan_piker']
# HUGGINGFACE_USER = 'brianmatzelle'
HUGGINGFACE_USER: str = 'BinghamtonUniversity'
# Size label (in thousands) that is part of the base dataset's name on the Hub.
K_COUNT: int = 173
HUGGINGFACE_BASE_DATASET: str = f'2024-election-subreddit-threads-{K_COUNT}k'

# Subset Config
# Multiple subreddits are joined with "-" to form a single name component.
SUBREDDIT_NAME: str = "-".join(SUBREDDITS)
HUGGINGFACE_SUBSET_DATASET: str = HUGGINGFACE_BASE_DATASET.replace("subreddit", SUBREDDIT_NAME)


# DO NOT EDIT BELOW THIS LINE
# Strip the base dataset's size label (e.g. "173k") from the subset name but
# keep the trailing "-": the filtering cell appends the subset's own size later.
# The slice length is derived from K_COUNT so this works for any number of digits
# (a hard-coded 4 only worked for three-digit K_COUNT values).
if HUGGINGFACE_SUBSET_DATASET.endswith(f'-{K_COUNT}k'):
  HUGGINGFACE_SUBSET_DATASET = HUGGINGFACE_SUBSET_DATASET[:-len(f'{K_COUNT}k')]

# Echo the effective configuration so it is recorded in the cell output.
print(
  f'SUBREDDITS: {SUBREDDITS}',
  f'HUGGINGFACE_BASE_DATASET: {HUGGINGFACE_BASE_DATASET}',
  f'HUGGINGFACE_SUBSET_DATASET: {HUGGINGFACE_SUBSET_DATASET}',
  sep='\n',
)

SUBREDDITS: ['hasan_piker']
HUGGINGFACE_BASE_DATASET: 2024-election-subreddit-threads-173k
HUGGINGFACE_SUBSET_DATASET: 2024-election-hasan_piker-threads-


In [2]:
from datasets import load_dataset, Dataset
from loguru import logger
from utils import to_k

# Fetch the train split of the base thread dataset from the Hugging Face Hub.
# Offline alternative: load the same data from a local JSON file instead.
# dataset = load_dataset('json', data_files=f'data/datasets/2024-election-subreddit-threads-{K_COUNT}k.json', split='train')
dataset: Dataset = load_dataset(f"{HUGGINGFACE_USER}/{HUGGINGFACE_BASE_DATASET}", split="train")
logger.info(f"Prefilter dataset size: {to_k(len(dataset))}")

# Show the first record as a sanity check of the schema.
dataset[0]

[32m2024-12-11 01:20:09.904[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mPrefilter dataset size: 173k[0m


{'conversations': [{'content': 'The GOP\'s big, mysterious enemy: "They"',
   'metadata': {'author': 'Steve____Stifler',
    'controversiality': None,
    'created_utc': 1721334266,
    'downvotes': 0,
    'flair': 'News (US)',
    'is_submitter': None,
    'no_follow': True,
    'removed_by_category': 'moderator',
    'score': 343,
    'suggested_sort': None,
    'title': 'The GOP\'s big, mysterious enemy: "They"',
    'total_awards_received': 0,
    'upvote_ratio': 0.97,
    'upvotes': 343},
   'role': 'user'},
  {'content': '"Well who\'s \'THEY\'?? ... ... What the hell is an Aluminium Falcon?"',
   'metadata': {'author': 'mad_cheese_hattwe',
    'controversiality': 0,
    'created_utc': 1721336569,
    'downvotes': 0,
    'flair': None,
    'is_submitter': False,
    'no_follow': True,
    'removed_by_category': None,
    'score': 30,
    'suggested_sort': None,
    'title': None,
    'total_awards_received': 0,
    'upvote_ratio': None,
    'upvotes': 30},
   'role': 'assistant'},

## To Do (Analysis)
Give the user the option to run an analysis on the dataset.

Move these next two cells to an analysis function, add more/better analysis.

In [3]:
# from typing import Dict
# from collections import Counter

# # Count the posts in each subreddit
# subreddit_counts: Counter = Counter(post['metadata']['subreddit']['name'] for post in dataset)

# # Sort by the number of posts in descending order
# ranked_subreddits: List[tuple[str, int]] = sorted(subreddit_counts.items(), key=lambda x: x[1], reverse=True)

# # Get subscriber count for each subreddit
# subs_map: Dict[str, int] = {}
# for post in dataset:
#     sr_metadata = post['metadata']['subreddit']
#     subreddit_name = sr_metadata['name']
#     if subreddit_name not in subs_map:
#         subs_map[subreddit_name] = sr_metadata.get('subscribers', 0)

# # Print rankings
# for i, (subreddit, count) in enumerate(ranked_subreddits, start=1):
#     subs = subs_map.get(subreddit, 0)
#     print(f"{i}. r/{subreddit}: {count} posts, {subs} subs")

# # Optionally log the results if needed
# logger.info(f"Subreddit ranking:\n{ranked_subreddits}")

In [4]:
# # show most controversial posts
# from collections import defaultdict
# from pprint import pprint

# # Create a dictionary to store the posts for each subreddit
# controversiality = defaultdict(list)
# for post in dataset:
#     if post['metadata']['controversiality'] < 90:
#         continue
#     controversiality[post['metadata']['controversiality']] += post


# # print the controversiality
# pprint(controversiality)

In [5]:
# To Do: Dynamically choose refinement options
# Give the user (the one running this notebook) config options after viewing the data, so they can curate a dataset of their own.

In [6]:
# Keep only posts whose subreddit is in SUBREDDITS (needs `dataset`, which is deleted below: re-run the load cell before re-running this one).
filtered_dataset = dataset.filter(lambda x: x['metadata']['subreddit']['name'] in SUBREDDITS)
logger.info(f"Filtered {to_k(len(dataset) - len(filtered_dataset))} posts from the dataset")
del dataset  # drop the reference to the full dataset; only the filtered view is used from here on

filtered_size_k = to_k(len(filtered_dataset))
logger.info(f"Dataset size: {filtered_size_k} posts")

# Append the new size to the subset dataset name before saving.
# The config cell leaves the name ending in "-", so after the append it ends in
# f"-{filtered_size_k}". Guarding on that suffix makes this cell idempotent:
# re-running it no longer produces names like "...-2k2k".
if not HUGGINGFACE_SUBSET_DATASET.endswith(f"-{filtered_size_k}"):
    HUGGINGFACE_SUBSET_DATASET += f"{filtered_size_k}"

# Show the first remaining record as a sanity check.
filtered_dataset[0]

[32m2024-12-11 01:20:09.964[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mFiltered 171k posts from the dataset[0m
[32m2024-12-11 01:20:09.965[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mDataset size: 2k posts[0m


{'conversations': [{'content': 'Bernie, Come back! We need u!',
   'metadata': {'author': 'jared10011980',
    'controversiality': None,
    'created_utc': 1721598961,
    'downvotes': 0,
    'flair': None,
    'is_submitter': None,
    'no_follow': False,
    'removed_by_category': None,
    'score': 433,
    'suggested_sort': None,
    'title': 'Bernie, Come back! We need u!',
    'total_awards_received': 0,
    'upvote_ratio': 0.81,
    'upvotes': 433},
   'role': 'user'},
  {'content': 'Kamala’s voting record isn’t bad and is certainly more progressive than *many* senators. She’s also not as rabidly pro-Israel as Biden (but that’s a low fucking bar). It’s very possible and even probable that she’ll govern more progressively than Biden.\n\nShe’s still a centrist neolib and a cop (prosecutor) that jailed a ton of minorities.\n\nBoth of these things can be true. We don’t need to be circle jerking Kamala, she’s not a leftist nor a beacon of progressivism. Voting for her is just a lesse

In [7]:
# Rebuild every thread keeping only each message's 'content' and 'role'; per-message metadata is dropped.
conversations_only = [[{'content': msg['content'], 'role': msg['role']} for msg in post['conversations']]
                      for post in filtered_dataset]
logger.info("Completely removed metadata from each message in the dataset")
subset = Dataset.from_dict({'conversations': conversations_only})
subset[0]

[32m2024-12-11 01:20:10.322[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mCompletely removed metadata from each message in the dataset[0m


{'conversations': [{'content': 'Bernie, Come back! We need u!',
   'role': 'user'},
  {'content': 'Kamala’s voting record isn’t bad and is certainly more progressive than *many* senators. She’s also not as rabidly pro-Israel as Biden (but that’s a low fucking bar). It’s very possible and even probable that she’ll govern more progressively than Biden.\n\nShe’s still a centrist neolib and a cop (prosecutor) that jailed a ton of minorities.\n\nBoth of these things can be true. We don’t need to be circle jerking Kamala, she’s not a leftist nor a beacon of progressivism. Voting for her is just a lesser of 2 evils situation, just like with Biden. It’s nice that she’s less evil than Biden though.',
   'role': 'assistant'},
  {'content': "If she's smart, she'll take a progressive VP and actually listen to them. Not only would she be more likely to win some of the younger voters over, but the US might even see some progressive policies come from it. Even if she's a Neo-Liberal, and her VP is pr

In [9]:
# OPTIONAL: Prepare for OpenAI
# Writes three JSONL files under data/subsets/openai/<SUBREDDIT_NAME>/:
# the complete subset, a 90% train split and a 10% test split.
import os

# The OpenAI export uses a "messages" column instead of "conversations".
# rename_column returns a new Dataset, so `subset` itself keeps its original column name.
openai_subset = subset.rename_column("conversations", "messages")
# add a new column for weight, make every value 1
# openai_subset = openai_subset.add_column("weight", [1] * len(openai_subset))

DIR_PATH: str = "data/subsets/openai"
# Fixed seed so the train/test split is identical on every run of the notebook.
SPLIT_SEED: int = 42
os.makedirs(f"{DIR_PATH}/{SUBREDDIT_NAME}", exist_ok=True)

# save complete subset to jsonl file
openai_subset.to_json(f"{DIR_PATH}/{SUBREDDIT_NAME}/{HUGGINGFACE_SUBSET_DATASET}-openai.jsonl", lines=True)

# split into train and test; read the splits by key rather than relying on
# the ordering of the returned mapping's values
openai_splits = openai_subset.train_test_split(test_size=0.1, seed=SPLIT_SEED)
train_subset, test_subset = openai_splits["train"], openai_splits["test"]

# save train and test subsets to jsonl files
train_subset.to_json(f"{DIR_PATH}/{SUBREDDIT_NAME}/{HUGGINGFACE_SUBSET_DATASET}-openai-train.jsonl", lines=True)
test_subset.to_json(f"{DIR_PATH}/{SUBREDDIT_NAME}/{HUGGINGFACE_SUBSET_DATASET}-openai-test.jsonl", lines=True)

# save openai_subset to jsonl file
# openai_subset.to_json(f"data/subsets/{HUGGINGFACE_SUBSET_DATASET}-openai.jsonl", lines=True)
openai_subset[0]

Creating json from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

{'messages': [{'content': 'Bernie, Come back! We need u!', 'role': 'user'},
  {'content': 'Kamala’s voting record isn’t bad and is certainly more progressive than *many* senators. She’s also not as rabidly pro-Israel as Biden (but that’s a low fucking bar). It’s very possible and even probable that she’ll govern more progressively than Biden.\n\nShe’s still a centrist neolib and a cop (prosecutor) that jailed a ton of minorities.\n\nBoth of these things can be true. We don’t need to be circle jerking Kamala, she’s not a leftist nor a beacon of progressivism. Voting for her is just a lesser of 2 evils situation, just like with Biden. It’s nice that she’s less evil than Biden though.',
   'role': 'assistant'},
  {'content': "If she's smart, she'll take a progressive VP and actually listen to them. Not only would she be more likely to win some of the younger voters over, but the US might even see some progressive policies come from it. Even if she's a Neo-Liberal, and her VP is progressiv

## To Do
Give the user the option to analyze the dataset again after the unwanted subreddits have been removed.

In [10]:
# save locally
# subset.to_json(f"data/subsets/{HUGGINGFACE_SUBSET_DATASET}.json")

## Push to Hub
Before you can run this cell, you need to do one of the following. Either:

1. `pip install huggingface_hub`
2. `huggingface-cli login`

OR

In your shell, run
1. `export HF_TOKEN=YOUR_WRITE_ACCESS_TOKEN_FROM_HUGGINGFACE`

In [11]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if one exists) so HF_TOKEN can be supplied that way.
load_dotenv()

# Publish the metadata-free subset under <HUGGINGFACE_USER>/<HUGGINGFACE_SUBSET_DATASET>.
# If HF_TOKEN is unset, token=None is passed — presumably the cached
# `huggingface-cli login` credentials are used in that case; confirm before relying on it.
hub_repo_id = f"{HUGGINGFACE_USER}/{HUGGINGFACE_SUBSET_DATASET}"
subset.push_to_hub(hub_repo_id, token=os.environ.get("HF_TOKEN"))

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/BinghamtonUniversity/2024-election-hasan_piker-threads-2k/commit/c8cd3f3c23a37a8f77a94d67309d50a5100e212a', commit_message='Upload dataset', commit_description='', oid='c8cd3f3c23a37a8f77a94d67309d50a5100e212a', pr_url=None, pr_revision=None, pr_num=None)