# About
This notebook is used to create a subset of the dataset for finetuning a GPT-style conversational language model.

## Config

In [None]:
# Config
# Subreddits that can be listed in SUBREDDITS (reference only):
#   ALL_SUBREDDITS = ['destiny', 'hasan_piker', 'politics', 'vaushv', 'millenials', 'news',
#                     'worldnews', 'economics', 'socialism', 'conservative', 'libertarian',
#                     'neoliberal', 'republican', 'democrats', 'progressive', 'daverubin',
#                     'jordanpeterson', 'samharris', 'joerogan', 'thedavidpakmanshow', 'benshapiro',
#                     'themajorityreport', 'seculartalk']
SUBREDDITS = ['hasan_piker']
# HUGGINGFACE_USER = 'brianmatzelle'
HUGGINGFACE_USER = 'BinghamtonUniversity'
K_COUNT = 173
HUGGINGFACE_BASE_DATASET = f'2024-election-subreddit-threads-{K_COUNT}k'

# Exports
SUBREDDIT_NAME = "-".join(SUBREDDITS)
HUGGINGFACE_SUBSET_DATASET = HUGGINGFACE_BASE_DATASET.replace("subreddit", SUBREDDIT_NAME)


# DO NOT EDIT BELOW THIS LINE
def strip_size_suffix(name, k_count):
    """Drop the trailing '<k_count>k' size tag from a dataset name.

    The dash in front of the tag is kept on purpose: a later cell appends the
    size of the filtered subset directly after it.

    Args:
        name: Dataset name, e.g. '2024-election-foo-threads-173k'.
        k_count: The number used in the size tag (173 in the example above).

    Returns:
        `name` without the '<k_count>k' tail (still ending in '-'), or `name`
        unchanged when it does not end with '-<k_count>k'.
    """
    size_tag = f'{k_count}k'
    if name.endswith(f'-{size_tag}'):
        # Slice by the real tag length so K_COUNT may have any number of digits.
        return name[:-len(size_tag)]
    return name


# The subset will get its own size tag once it has been filtered.
HUGGINGFACE_SUBSET_DATASET = strip_size_suffix(HUGGINGFACE_SUBSET_DATASET, K_COUNT)

# Print config
print(f'SUBREDDITS: {SUBREDDITS}')
print(f'HUGGINGFACE_BASE_DATASET: {HUGGINGFACE_BASE_DATASET}')
print(f'HUGGINGFACE_SUBSET_DATASET: {HUGGINGFACE_SUBSET_DATASET}')

In [None]:
from datasets import load_dataset
from loguru import logger

from utils import to_k

# Hub alternative to the local load below:
# dataset = load_dataset(f"{HUGGINGFACE_USER}/{HUGGINGFACE_BASE_DATASET}", split = "train")

# Load the local JSON export so the notebook also works offline.
# The filename is built from HUGGINGFACE_BASE_DATASET so it cannot drift from
# the name configured in the config cell.
dataset = load_dataset('json', data_files=f'data/datasets/{HUGGINGFACE_BASE_DATASET}.json', split='train')

logger.info(f"Prefilter dataset size: {to_k(len(dataset))}")

# Show the first record as a sanity check of the schema.
dataset[0]

## To Do (Analysis)
Give the user the option to run an analysis on the dataset.

Move these next two cells to an analysis function, add more/better analysis.

In [3]:
# Disabled analysis: rank subreddits by number of posts and print each one's
# subscriber count. Everything below is commented out and does not run.
# # TODO: there has to be a faster way to do this, it takes a few minutes to run
# NOTE(review): the slowness comes from the printing loop below, which re-scans
# the whole dataset once per subreddit just to look up `subscribers` (and keeps
# the value from the last matching post). Collecting subscribers in the same
# pass that counts posts would avoid the repeated scans.
# from collections import Counter

# # Count the posts in each subreddit
# subreddit_counts = Counter(post['metadata']['subreddit']['name'] for post in dataset)
# # Sort by the number of posts in descending order
# ranked_subreddits = sorted(subreddit_counts.items(), key=lambda x: x[1], reverse=True)

# # Print the ranking
# for i, (subreddit, count) in enumerate(ranked_subreddits, start=1):
#     for post in dataset:
#         if post['metadata']['subreddit']['name'] == subreddit:
#             subscribers = post['metadata']['subreddit']['subscribers']
#     print(f"{i}. r/{subreddit}: {count} posts, {subscribers} subscribers")

# # Optionally log the results if needed
# logger.info(f"Subreddit ranking:\n{ranked_subreddits}")

In [4]:
# Disabled analysis: everything below is commented out and does not run.
# # show most controversial posts
# from collections import defaultdict
# from pprint import pprint

# # Create a dictionary that groups posts by their controversiality score
# # (only posts scoring 90 or higher are kept)
# controversiality = defaultdict(list)
# for post in dataset:
#     if post['metadata']['controversiality'] < 90:
#         continue
#     # NOTE(review): `post` looks like a dict-like row, so `+= post` would extend
#     # the list with its keys; `.append(post)` is probably what was intended.
#     controversiality[post['metadata']['controversiality']] += post


# # print the controversiality
# pprint(controversiality)

## To Do (Dynamically choose refinement options)
Give the user (the one running this notebook) config options after viewing the data, so they can curate a dataset of their own.

In [None]:
# Keep only the posts whose subreddit is listed in SUBREDDITS (config cell).
subset = dataset.filter(lambda x: x['metadata']['subreddit']['name'] in SUBREDDITS)

# Fail early, while `dataset` still exists, so a misspelled subreddit can be
# fixed and this cell re-run without reloading the base dataset.
if len(subset) == 0:
  raise ValueError(f"No posts matched SUBREDDITS={SUBREDDITS}; check the names against the dataset")

logger.info(f"Filtered {to_k(len(dataset) - len(subset))} posts from the dataset")
del dataset

filtered_size_k = to_k(len(subset))
logger.info(f"Dataset size: {filtered_size_k} posts")

# Put the new size tag after the last '-' of the subset dataset name.
# Rebuilding the name from its prefix (instead of appending with `+=`) keeps it
# correct when this cell runs more than once in the same kernel.
# Assumes the size tag from to_k contains no '-' -- TODO confirm.
name_prefix, _, _ = HUGGINGFACE_SUBSET_DATASET.rpartition("-")
HUGGINGFACE_SUBSET_DATASET = f"{name_prefix}-{filtered_size_k}"

# Show the first filtered record as a sanity check.
subset[0]

In [None]:
# OPTIONAL: Prepare for OpenAI
# Writes three JSONL files under data/subsets/openai/<SUBREDDIT_NAME>/:
# the complete subset, a 90% train split and a 10% test split.
import os

# Fixed seed so the train/test split is the same on every run.
SPLIT_SEED = 42

# Drop the metadata column and rename "conversations" to "messages".
openai_subset = subset.remove_columns(["metadata"]).rename_column("conversations", "messages")
# add a new column for weight, make every value 1
# openai_subset = openai_subset.add_column("weight", [1] * len(openai_subset))

openai_dir = f"data/subsets/openai/{SUBREDDIT_NAME}"
os.makedirs(openai_dir, exist_ok=True)
# Shared path prefix for the three output files.
openai_prefix = f"{openai_dir}/{HUGGINGFACE_SUBSET_DATASET}-openai"

# save complete subset to jsonl file
openai_subset.to_json(f"{openai_prefix}.jsonl", lines=True)

# split into train and test; pick the splits by key rather than by dict order
openai_splits = openai_subset.train_test_split(test_size=0.1, seed=SPLIT_SEED)
train_subset = openai_splits["train"]
test_subset = openai_splits["test"]

# save train and test subsets to jsonl files
train_subset.to_json(f"{openai_prefix}-train.jsonl", lines=True)
test_subset.to_json(f"{openai_prefix}-test.jsonl", lines=True)

# Show the first converted record as a sanity check.
openai_subset[0]

## To Do
Give the user the option to analyze the dataset again after removing unwanted subreddits.

In [7]:
# save locally
# Disabled: uncomment the line below to also write the filtered subset
# (metadata included) to a local JSON file.
# subset.to_json(f"data/subsets/{HUGGINGFACE_SUBSET_DATASET}.json")

## Push to Hub
Before you can run this cell, you need to authenticate with Hugging Face. Either:

1. `pip install huggingface_hub`
2. `huggingface-cli login`

OR

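Set a write-access token in your shell (the cell below also calls `load_dotenv()`, so `HF_TOKEN=...` can live in a `.env` file instead):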
1. `export HF_TOKEN=YOUR_WRITE_ACCESS_TOKEN_FROM_HUGGINGFACE`

In [None]:
import os
from dotenv import load_dotenv

# Let python-dotenv populate the environment before HF_TOKEN is read.
load_dotenv()

# Upload the curated subset to <HUGGINGFACE_USER>/<HUGGINGFACE_SUBSET_DATASET>.
# HF_TOKEN may be unset, in which case None is passed as the token.
hub_repo_id = f"{HUGGINGFACE_USER}/{HUGGINGFACE_SUBSET_DATASET}"
hub_token = os.environ.get("HF_TOKEN")
subset.push_to_hub(hub_repo_id, token=hub_token)