# About
This notebook is used to create a subset of the dataset for finetuning a GPT-style conversational language model.

## Config

In [1]:
# Config
SUBREDDITS = ['hasan_piker']
HUGGINGFACE_USER = 'brianmatzelle'
HUGGINGFACE_BASE_DATASET = '2024-election-subreddit-threads-643k'

# Exports
# Swap the "subreddit" placeholder in the base name for the selected subreddit(s).
HUGGINGFACE_SUBSET_DATASET = HUGGINGFACE_BASE_DATASET.replace("subreddit", "-".join(SUBREDDITS))


# DO NOT EDIT BELOW THIS LINE
import re


def strip_size_suffix(name):
    """Remove a trailing "<digits>k" row-count tag from a dataset name.

    Only a tag that directly follows a hyphen is removed, and the hyphen
    itself is kept: the filtering cell later appends the size of the
    filtered subset to HUGGINGFACE_SUBSET_DATASET, so the name must end
    with "-" at this point.

    Args:
        name: Dataset name, e.g. "2024-election-foo-threads-643k".

    Returns:
        The name without the size tag ("2024-election-foo-threads-"), or
        the name unchanged when it does not end in "-<digits>k".
    """
    return re.sub(r'(?<=-)\d+k$', '', name)


# Drop the base dataset's size tag (e.g. "643k"); the subset's own size is added later.
HUGGINGFACE_SUBSET_DATASET = strip_size_suffix(HUGGINGFACE_SUBSET_DATASET)

# Print config
print(f'SUBREDDITS: {SUBREDDITS}')
print(f'HUGGINGFACE_BASE_DATASET: {HUGGINGFACE_BASE_DATASET}')
print(f'HUGGINGFACE_SUBSET_DATASET: {HUGGINGFACE_SUBSET_DATASET}')

SUBREDDITS: ['hasan_piker']
HUGGINGFACE_BASE_DATASET: 2024-election-subreddit-threads-643k
HUGGINGFACE_SUBSET_DATASET: 2024-election-hasan_piker-threads-


In [2]:
from datasets import load_dataset
from loguru import logger

from utils import to_k

# Hub alternative (requires network access):
# dataset = load_dataset(f"{HUGGINGFACE_USER}/{HUGGINGFACE_BASE_DATASET}", split = "train")

# Load the local JSON copy instead, so no network access is needed.
# The file name is derived from HUGGINGFACE_BASE_DATASET so that changing the
# config cell changes which file is read here.
dataset = load_dataset('json', data_files=f'data/datasets/{HUGGINGFACE_BASE_DATASET}.json', split='train')

logger.info(f"Prefilter dataset size: {to_k(len(dataset))}")

# Show the first record as a quick sanity check of the schema.
dataset[0]

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 643627 examples [00:12, 51479.08 examples/s]
[32m2024-11-27 20:20:48.280[0m | [1mINFO    [0m | [36mutils[0m:[36mto_k[0m:[36m35[0m - [1m643k rows[0m
[32m2024-11-27 20:20:48.280[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mPrefilter dataset size: 643k[0m


{'conversations': [{'content': 'You are a redditor in a political subreddit, having a conversation with another redditor about politics.',
   'role': 'system'},
  {'content': 'The Old Ranting, Rambling Trump Was Back at the Republican Convention',
   'role': 'user'},
  {'content': "It was very bad. He had a chance to put a nail in the democrats' coffin. He had the crowd and everyone watching at home in the palm of his hands as he discussed the shooting incident. From there he turned into the drunk uncle/raving political lunatic that he has always been. Other than his base, Americans are tired of this. Even worse for Trump, his energy is nowhere near where it was 8 years ago. This is a Massive gift to the democrats. I wonder if any of trump's people have the balls to tell him how terrible this speech was. He has no business being the president again.",
   'role': 'assistant'},
  {'content': 'Spot on. \n\nThat speech was a fucking train wreck. And I’m not just saying that because it’s Tr

## To Do (Analysis)
Give the user the option to run an analysis on the dataset.

Move these next two cells to an analysis function, add more/better analysis.

In [3]:
# from collections import Counter

# # Count the posts in each subreddit
# subreddit_counts = Counter(post['metadata']['subreddit']['name'] for post in dataset)
# # Sort by the number of posts in descending order
# ranked_subreddits = sorted(subreddit_counts.items(), key=lambda x: x[1], reverse=True)

# # Print the ranking
# for i, (subreddit, count) in enumerate(ranked_subreddits, start=1):
#     for post in dataset:
#         if post['metadata']['subreddit']['name'] == subreddit:
#             subscribers = post['metadata']['subreddit']['subscribers']
#     print(f"{i}. r/{subreddit}: {count} posts, {subscribers} subscribers")

# # Optionally log the results if needed
# logger.info(f"Subreddit ranking:\n{ranked_subreddits}")

In [4]:
# # show most controversial posts
# from collections import defaultdict
# from pprint import pprint

# # Group the highly controversial posts (score >= 90) by their controversiality score
# controversiality = defaultdict(list)
# for post in dataset:
#     if post['metadata']['controversiality'] < 90:
#         continue
#     controversiality[post['metadata']['controversiality']].append(post)


# # print the controversiality
# pprint(controversiality)

## To Do (Dynamically choose refinement options)
Give the user (the one running this notebook) config options after viewing the data, so they can curate a dataset of their own.

In [5]:
def _in_selected_subreddits(example):
    """Return True when the row's subreddit name is listed in SUBREDDITS."""
    return example['metadata']['subreddit']['name'] in SUBREDDITS


# Keep only the rows that belong to the configured subreddits.
subset = dataset.filter(_in_selected_subreddits)
removed_count = len(dataset) - len(subset)
logger.info(f"Filtered {to_k(removed_count)} posts from the dataset")

# The full dataset is not used after this point; note that re-running this
# cell requires re-running the load cell first.
del dataset

filtered_size_k = to_k(len(subset))
logger.info(f"Dataset size: {filtered_size_k} posts")

# The subset name ends with "-" at this point; complete it with the subset's size.
HUGGINGFACE_SUBSET_DATASET = f"{HUGGINGFACE_SUBSET_DATASET}{filtered_size_k}"

# Show the first filtered record.
subset[0]

Filter: 100%|██████████| 643627/643627 [00:11<00:00, 53898.72 examples/s]
[32m2024-11-27 20:21:00.243[0m | [1mINFO    [0m | [36mutils[0m:[36mto_k[0m:[36m35[0m - [1m632k rows[0m
[32m2024-11-27 20:21:00.243[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mFiltered 632k posts from the dataset[0m
[32m2024-11-27 20:21:00.243[0m | [1mINFO    [0m | [36mutils[0m:[36mto_k[0m:[36m35[0m - [1m10k rows[0m
[32m2024-11-27 20:21:00.244[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mDataset size: 10k posts[0m


{'conversations': [{'content': 'You are a redditor in a political subreddit, having a conversation with another redditor about politics.',
   'role': 'system'},
  {'content': 'Dude is so stubborn.', 'role': 'user'},
  {'content': 'What do you expect him to say lmao', 'role': 'assistant'}],
 'metadata': {'controversiality': 0,
  'normalized_controversiality': 0.0,
  'post': {'author': 'Candid_Bicycle_6111',
   'downvotes': 0,
   'flair': None,
   'score': 1090,
   'suggested_sort': None,
   'upvote_ratio': 0.95,
   'upvotes': 1090},
  'subreddit': {'name': 'hasan_piker', 'subscribers': 139861}}}

In [6]:
# OPTIONAL: Prepare for OpenAI
import os

# Fixed seed so the train/test split is the same on every run.
SPLIT_SEED = 42

# Drop the metadata column and rename "conversations" to "messages".
openai_subset = subset.remove_columns(["metadata"])
openai_subset = openai_subset.rename_column("conversations", "messages")
# add a new column for weight, make every value 1
# openai_subset = openai_subset.add_column("weight", [1] * len(openai_subset))

# split into train and test (90% / 10%); select the splits by key rather than by position
splits = openai_subset.train_test_split(test_size=0.1, seed=SPLIT_SEED)
train_subset = splits["train"]
test_subset = splits["test"]

# save train and test subsets to jsonl files, creating the target folders if needed
train_path = f"data/subsets/train/{HUGGINGFACE_SUBSET_DATASET}-openai-train.jsonl"
test_path = f"data/subsets/test/{HUGGINGFACE_SUBSET_DATASET}-openai-test.jsonl"
for out_path in (train_path, test_path):
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

train_subset.to_json(train_path, lines=True)
test_subset.to_json(test_path, lines=True)

# save openai_subset to jsonl file
# openai_subset.to_json(f"data/subsets/{HUGGINGFACE_SUBSET_DATASET}-openai.jsonl", lines=True)
openai_subset[0]

Creating json from Arrow format: 100%|██████████| 10/10 [00:00<00:00, 101.24ba/s]
Creating json from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 190.37ba/s]


{'messages': [{'content': 'You are a redditor in a political subreddit, having a conversation with another redditor about politics.',
   'role': 'system'},
  {'content': 'Dude is so stubborn.', 'role': 'user'},
  {'content': 'What do you expect him to say lmao', 'role': 'assistant'}]}

## To Do
Give the user the option to analyze the dataset again after removing unwanted subreddits.

In [7]:
# save locally
# subset.to_json(f"data/subsets/{HUGGINGFACE_SUBSET_DATASET}.json")

## Push to Hub
Before you can run this cell, you need to

1. `pip install huggingface_hub`
2. `huggingface-cli login`

OR

In your shell, run
1. `export HF_TOKEN=YOUR_WRITE_ACCESS_TOKEN_FROM_HUGGINGFACE`

In [8]:
# import os
# from dotenv import load_dotenv
# load_dotenv()

# # push curated dataset to huggingface
# subset.push_to_hub(f"{HUGGINGFACE_USER}/{HUGGINGFACE_SUBSET_DATASET}", token=os.getenv("HF_TOKEN"))