# About
This notebook is used to create a subset of the dataset for finetuning a GPT-style conversational language model.

In [1]:
from loguru import logger

## Config

In [None]:
# Config
SUBREDDITS = ['hasan_piker']
HUGGINGFACE_USER = 'brianmatzelle'
HUGGINGFACE_BASE_DATASET = '2024-election-subreddit-threads-643k'

# Exports
# Name of the curated dataset: the base name with the "subreddit" placeholder
# replaced by the hyphen-joined list of selected subreddits.
HUGGINGFACE_SUBSET_DATASET = HUGGINGFACE_BASE_DATASET.replace("subreddit", "-".join(SUBREDDITS))


# DO NOT EDIT BELOW THIS LINE
def strip_size_tag(name: str) -> str:
    """Return `name` without a trailing size tag such as "643k".

    Only the tag itself is removed; the hyphen in front of it is kept so a
    new size can be appended directly (the filtering cell does this later).
    Names that do not end in "-<digits>k" are returned unchanged.
    """
    stem, hyphen, tag = name.rpartition('-')
    if hyphen and tag.endswith('k') and tag[:-1].isdigit():
        return stem + hyphen
    return name


# Drop the base dataset's size (e.g. "643k") from the subset name, whatever
# that size is, leaving the trailing hyphen in place.
HUGGINGFACE_SUBSET_DATASET = strip_size_tag(HUGGINGFACE_SUBSET_DATASET)

# Print config
print(f'SUBREDDITS: {SUBREDDITS}')
print(f'HUGGINGFACE_BASE_DATASET: {HUGGINGFACE_BASE_DATASET}')
print(f'HUGGINGFACE_SUBSET_DATASET: {HUGGINGFACE_SUBSET_DATASET}')

In [None]:
from datasets import load_dataset
from utils import to_k

# Load the train split of the full (unfiltered) base dataset.
dataset = load_dataset(
    f"{HUGGINGFACE_USER}/{HUGGINGFACE_BASE_DATASET}",
    split="train",
)

# Record the unfiltered row count; the filtering cell uses it to report how
# many posts were dropped.
prefilter_size = len(dataset)
logger.info(f"Prefilter dataset size: {to_k(prefilter_size)}")

# Display the first record as the cell output.
dataset[0]

## To Do (Analysis)
Give the user the option to run an analysis on the dataset.

Move these next two cells to an analysis function, add more/better analysis.

In [4]:
# from collections import Counter

# # Count the posts in each subreddit
# subreddit_counts = Counter(post['metadata']['subreddit']['name'] for post in dataset)
# # Sort by the number of posts in descending order
# ranked_subreddits = sorted(subreddit_counts.items(), key=lambda x: x[1], reverse=True)

# # Print the ranking
# for i, (subreddit, count) in enumerate(ranked_subreddits, start=1):
#     for post in dataset:
#         if post['metadata']['subreddit']['name'] == subreddit:
#             subscribers = post['metadata']['subreddit']['subscribers']
#     print(f"{i}. r/{subreddit}: {count} posts, {subscribers} subscribers")

# # Optionally log the results if needed
# logger.info(f"Subreddit ranking:\n{ranked_subreddits}")

In [5]:
# # show most controversial posts
# from collections import defaultdict
# from pprint import pprint

# # Group the highly controversial posts by their controversiality score
# controversiality = defaultdict(list)
# for post in dataset:
#     if post['metadata']['controversiality'] < 90:
#         continue
#     controversiality[post['metadata']['controversiality']].append(post)


# # print the controversiality
# pprint(controversiality)

## To Do (Dynamically choose refinement options)
Give the user (the one running this notebook) config options after viewing the data, so they can curate a dataset of their own.

In [None]:
# Keep only the posts whose subreddit is listed in SUBREDDITS.
dataset = dataset.filter(lambda x: x['metadata']['subreddit']['name'] in SUBREDDITS)

# prefilter_size is deliberately kept (not deleted): re-running this cell then
# still reports the drop relative to the unfiltered dataset instead of
# raising NameError.
logger.info(f"Filtered {to_k(prefilter_size - len(dataset))} posts from the dataset")

filtered_size_k = to_k(len(dataset))
logger.info(f"Dataset size: {filtered_size_k} posts")

# Append the new size to the subset dataset name before saving.
# Guarded so that re-running the cell does not append the size a second time
# (which would silently change the repo name used when pushing).
if not HUGGINGFACE_SUBSET_DATASET.endswith(str(filtered_size_k)):
    HUGGINGFACE_SUBSET_DATASET += str(filtered_size_k)

dataset[0]

## To Do
Give the user the option to analyze the dataset again after removing unwanted subreddits.

In [None]:
# save locally
# TODO: implement save locally to json

## Push to Hub
Before you can run this cell, you need to authenticate with Hugging Face in one of two ways. Either:

1. `pip install huggingface_hub`
2. `huggingface-cli login`

OR

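In your shell, run the following (or add the same `HF_TOKEN=...` assignment to a `.env` file, which the cell below loads via `load_dotenv()`):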
1. `export HF_TOKEN=YOUR_WRITE_ACCESS_TOKEN_FROM_HUGGINGFACE`

In [None]:
import os

from dotenv import load_dotenv

# Load variables from a local .env file (if one exists) into the environment,
# so HF_TOKEN can be supplied there as well as via the shell.
load_dotenv()

# Upload the curated subset under the configured user. The token argument is
# None when HF_TOKEN is not set in the environment.
dataset.push_to_hub(
    f"{HUGGINGFACE_USER}/{HUGGINGFACE_SUBSET_DATASET}",
    token=os.environ.get("HF_TOKEN"),
)