# About
This notebook is used to create a dataset of reddit conversations for finetuning a GPT-style language model.

In [1]:
from config import PROCESSED_DATA_FILE, HUGGINGFACE_USERNAME
import json
from loguru import logger

with open(PROCESSED_DATA_FILE, 'r') as f:
  data = json.load(f)
logger.info(f"Loaded {len(data)} rows from {PROCESSED_DATA_FILE}")

import pandas as pd
df = pd.DataFrame(data)
del data
prefilter_len = len(df)
logger.info(f"Converted json to pandas DataFrame with {prefilter_len} rows")
df.head(1)

[32m2024-11-26 00:42:33.813[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mLoaded 54215 rows from data/processed/posts-11-13-2024-processed.json[0m
[32m2024-11-26 00:42:34.366[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m13[0m - [1mConverted json to pandas DataFrame with 54215 rows[0m


Unnamed: 0,id,subreddit,selftext,title,downs,name,upvote_ratio,ups,removed_by_category,link_flair_text,...,no_follow,created_utc,author_flair_text,author,num_comments,subreddit_subscribers,send_replies,is_video,deleted,comments
0,1dx1b0z,Destiny,,New Vegan,0,t3_1dx1b0z,0.95,121,,Shitpost,...,False,1720304607,,TuningsGaming,2,248289,True,False,False,"[{'id': 'lbyv8mn', 'total_awards_received': 0,..."


In [2]:
# alter columns so they're easier to work with
df['subreddit'] = df['subreddit'].str.lower()

# Show some values that might be helpful for customizing configuration
print(df['subreddit'].unique())

['destiny' 'hasan_piker' 'politics' 'vaushv' 'millenials' 'news'
 'worldnews' 'economics' 'socialism' 'conservative' 'libertarian'
 'neoliberal' 'republican' 'democrats' 'progressive' 'daverubin'
 'jordanpeterson' 'samharris' 'joerogan' 'thedavidpakmanshow' 'benshapiro'
 'themajorityreport' 'seculartalk']


In [3]:
from config import SUBREDDITS

# COMMENT THIS OUT TO USE ALL SUBREDDITS
# df = df[df['subreddit'].isin([sub.lower() for sub in SUBREDDITS])]

logger.info(f"subreddits remaining: {df['subreddit'].unique()}")
logger.info(f"Filtered out {prefilter_len - len(df)} rows")
logger.info(f"Remaining rows: {len(df)}")

from utils import to_k, get_conversations_file
posts_count = to_k(len(df), logger)
logger.info(f"Using dataset size: {posts_count}")

[32m2024-11-26 00:42:34.461[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1msubreddits remaining: ['destiny' 'hasan_piker' 'politics' 'vaushv' 'millenials' 'news'
 'worldnews' 'economics' 'socialism' 'conservative' 'libertarian'
 'neoliberal' 'republican' 'democrats' 'progressive' 'daverubin'
 'jordanpeterson' 'samharris' 'joerogan' 'thedavidpakmanshow' 'benshapiro'
 'themajorityreport' 'seculartalk'][0m
[32m2024-11-26 00:42:34.461[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mFiltered out 0 rows[0m
[32m2024-11-26 00:42:34.462[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mRemaining rows: 54215[0m
[32m2024-11-26 00:42:34.464[0m | [1mINFO    [0m | [36mutils[0m:[36mto_k[0m:[36m35[0m - [1m54k rows[0m
[32m2024-11-26 00:42:34.464[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m12[0m - [1mUsing dataset size: 54k[0m


loop through posts and create conversations by alternating user/assistant with every comment/reply

In [4]:
from typing import Dict, Generator, List
def Turn(role: str, value: str) -> Dict[str, str]:
  return {
    'from': role,
    'value': value
  }

def traverse_thread(comment: Dict, role: str = 'gpt') -> Generator[List[Dict[str, str]], None, None]:
    """
    Recursively traverse a comment thread and yield each individual thread.
    """
    if role not in {'gpt', 'human'}:
        raise ValueError("role must be 'gpt' or 'human'")
    
    if not comment.get('body'):
        return
    
    if comment['body'] == '[deleted]' or comment['body'] == '[removed]':
        return
    
    # Start the thread with the current comment
    current_thread = [Turn(role, comment['body'])]
    
    # If no replies, yield the current thread as-is
    if not comment.get('replies'):
        yield current_thread
        return
    
    # Recurse into replies, yielding a full thread for each reply chain
    for reply in comment['replies']:
        for sub_thread in traverse_thread(reply, 'human' if role == 'gpt' else 'gpt'):
            yield current_thread + sub_thread

In [5]:
from utils import is_post_valid

# set to prevent duplicates which can occur if the final comment is deleted or removed
conversations = set()
for i, post_row in df.iterrows():
    system_turn = Turn('system', f"You are a redditor on r/{post_row['subreddit']} and you are having a conversation with another redditor.")
    valid, reason = is_post_valid(post_row)
    if not valid:
        continue

    if post_row['selftext'] == '[deleted]' or post_row['selftext'] == '[removed]':
        for comment in post_row.get('comments', []):
            for thread in traverse_thread(comment, 'human'):
                # Serialize thread for hashable set element
                serialized_thread = json.dumps(thread)
                conversations.add(serialized_thread)
    else:
        initial_turn = Turn('human', post_row['selftext'] if post_row['selftext'] else post_row['title'])
        
        # Process comments
        for comment in post_row.get('comments', []):
            for thread in traverse_thread(comment):
                # Serialize thread for hashable set element
                serialized_thread = json.dumps([system_turn] + [initial_turn] + thread)
                conversations.add(serialized_thread)

# Deserialize conversations back into Python objects if needed
conversations = [json.loads(conv) for conv in conversations]
logger.info(f"Extracted {len(conversations)} conversations from {len(df)} posts")
conversations[0]

[32m2024-11-26 00:42:45.657[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m29[0m - [1mExtracted 642940 conversations from 54215 posts[0m


[{'from': 'system',
  'value': 'You are a redditor on r/democrats and you are having a conversation with another redditor.'},
 {'from': 'human', 'value': 'Honestly disappointed for our country'},
 {'from': 'gpt',
  'value': 'When all is said and done, donuts to dollars the young failed us again by not voting. Unbelievable.'},
 {'from': 'human',
  'value': 'Dems have to give people candidates that they can believe in. Biden should’ve dropped out way sooner so we could’ve had a primary. Couple the economy with not having a primary and you get a candidate that people are not inspired by. Kind of hard to believe in and have enthusiasm for a candidate when you’re worrying about being able to afford to eat or have a place to sleep.'},
 {'from': 'gpt',
  'value': "Even so, most older Gen Z and Young Millenials remember very well the shitshow Trump caused from 2017-2021. \n\nIt should be enough to not vote for him regardless of the other candidate's qualities at this point."},
 {'from': 'human

In [6]:
# TODO: implement this and more ways to judge conversations
# def judge_str_toxicity(text: str) -> float:
#     """
#     Judge the toxicity of a string.
#     """
#     from transformers import pipeline
#     classifier = pipeline('text-classification', model='persiainbert/toxic-mahyar', tokenizer='persiainbert/toxic-mahyar')
#     result = classifier(text)[0]
#     return result['score']

# def judge_convo_toxicity(convo: List[Dict[str, str]]) -> float:
#     """
#     Judge the toxicity of a conversation.
#     """
#     return sum(judge_str_toxicity(turn['value']) for turn in convo) / len(convo)

In [7]:
# save to jsonl file
import json
subreddits_str = '-'.join(SUBREDDITS)
size_str = to_k(len(conversations), logger)
logger.info(f"Saving {size_str} conversations to file")
conversations_file, name = get_conversations_file(subreddits_str, size_str)

json_obj = []
for i, conversation in enumerate(conversations):
  json_obj.append({
    "conversation": conversation,
    # TODO: add more fields
    # "toxicity_rating": judge_convo_toxicity(conversation), # TODO: implement this
  })
with open(conversations_file, 'w') as f:
  json.dump(json_obj, f, indent=2)

[32m2024-11-26 00:42:45.811[0m | [1mINFO    [0m | [36mutils[0m:[36mto_k[0m:[36m35[0m - [1m642k rows[0m
[32m2024-11-26 00:42:45.812[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mSaving 642k conversations to file[0m


In [8]:
# push to huggingface
from datasets import load_dataset
dataset = load_dataset('json', data_files=conversations_file)

Generating train split: 0 examples [00:00, ? examples/s]

In [9]:
import os
from dotenv import load_dotenv
load_dotenv()

if not os.getenv('HF_TOKEN'):
  logger.error("No Hugging Face token found, not pushing to hub")
else:
  dataset.push_to_hub(f"{HUGGINGFACE_USERNAME}/{name}".lower(), token=os.getenv('HF_TOKEN'))

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/643 [00:00<?, ?ba/s]