In [None]:
from config import PROCESSED_DATA_FILE
import json
from loguru import logger

# Read the processed dump. JSON text is UTF-8 by specification, so the encoding is
# named explicitly instead of relying on the platform's locale default.
with open(PROCESSED_DATA_FILE, 'r', encoding='utf-8') as f:
    data = json.load(f)
logger.info(f"Loaded {len(data)} rows from {PROCESSED_DATA_FILE}")

import pandas as pd
# assumes the top-level JSON value is a list of per-post records -- TODO confirm
df = pd.DataFrame(data)
# Drop the name for the raw parsed JSON; the DataFrame is the working copy from here on.
del data
# Row count before any filtering, kept so later cells can report how many rows were removed.
prefilter_len = len(df)
logger.info(f"Converted json to pandas DataFrame with {prefilter_len} rows")
df.head(1)

In [None]:
# Lower-case the subreddit column so it is easier to work with
subreddit_lower = df['subreddit'].str.lower()
df['subreddit'] = subreddit_lower

# Print the distinct subreddit values; they can help when customizing the configuration
print(subreddit_lower.unique())

In [None]:
from config import SUBREDDITS

# Keep only the rows whose subreddit matches one of the configured names.
# The configured names are lower-cased before the comparison.
wanted_subreddits = [name.lower() for name in SUBREDDITS]
keep_mask = df['subreddit'].isin(wanted_subreddits)
df = df.loc[keep_mask]

# Report what survived the filter and how many rows were dropped.
logger.info(f"subreddits remaining: {df['subreddit'].unique()}")
logger.info(f"Filtered out {prefilter_len - len(df)} rows")
logger.info(f"Remaining rows: {len(df)}")

from utils import to_k, get_conversations_file
# to_k turns the row count into the size label that is logged below.
posts_count = to_k(len(df), logger)
logger.info(f"Using dataset size: {posts_count}")

Loop through the posts and build conversations by alternating the user/assistant roles with every comment/reply.

In [None]:
from typing import Dict, Generator, List
def Turn(role: str, value: str) -> Dict[str, str]:
    """
    Build a single conversation turn.

    Args:
        role: Stored under the 'from' key.
        value: Stored under the 'value' key.

    Returns:
        A dict with exactly two entries, 'from' and 'value', in that order.
    """
    turn: Dict[str, str] = {'from': role}
    turn['value'] = value
    return turn

def traverse_thread(comment: Dict, role: str = 'gpt') -> Generator[List[Dict[str, str]], None, None]:
    """
    Recursively traverse a comment thread and yield one conversation per reply chain.

    Every yielded list starts with `comment` and follows a single path of replies,
    with the role flipping between 'gpt' and 'human' at each level. A comment whose
    body is missing, empty, '[deleted]' or '[removed]' yields nothing, which also
    cuts off everything nested beneath it.

    When a comment has replies but none of them produces a thread (for example,
    every reply is deleted), the comment is yielded on its own as a one-turn thread
    so that its content is not silently dropped.

    Args:
        comment: Mapping read through the 'body' and 'replies' keys. A non-empty
            'replies' value is iterated as nested comments of the same form.
        role: Role assigned to `comment` itself; must be 'gpt' or 'human'.

    Yields:
        Lists of turn dicts (as built by Turn), ordered from `comment` downward.

    Raises:
        ValueError: If `role` is not 'gpt' or 'human'. Because this is a generator,
            the error is raised on first iteration rather than at call time.
    """
    if role not in {'gpt', 'human'}:
        raise ValueError("role must be 'gpt' or 'human'")

    body = comment.get('body')
    if not body or body in ('[deleted]', '[removed]'):
        return

    # Start the thread with the current comment
    current_thread = [Turn(role, body)]
    next_role = 'human' if role == 'gpt' else 'gpt'

    # Recurse into replies, yielding a full thread for each reply chain
    yielded_any = False
    for reply in comment.get('replies') or ():
        for sub_thread in traverse_thread(reply, next_role):
            yielded_any = True
            yield current_thread + sub_thread

    # No replies at all, or every reply chain was unusable: this comment is
    # effectively a leaf, so emit it as-is instead of losing it.
    if not yielded_any:
        yield current_thread

In [None]:
from utils import is_post_valid

# Build the conversation list: one entry per reply chain found under each valid post.
conversations = []
for _row_idx, post_row in df.iterrows():
    # The second element returned by is_post_valid is not used here.
    valid, reason = is_post_valid(post_row)
    if not valid:
        continue

    # Series.get only returns its default when the label is absent. A row taken from
    # a DataFrame carries every column, so a post with no comment list may show up
    # as NaN/None rather than []. Skip such rows instead of failing in the loop below.
    comments = post_row.get('comments', [])
    if not isinstance(comments, (list, tuple)):
        continue

    if post_row['selftext'] in ('[deleted]', '[removed]'):
        # The post body is gone, so each top-level comment opens the conversation
        # as the 'human' turn and nothing is prepended.
        opening_turns, first_comment_role = [], 'human'
    else:
        # The post opens the conversation as the 'human' turn (the title is used when
        # selftext is empty), and top-level comments answer as 'gpt'.
        opening_turns = [Turn('human', post_row['selftext'] or post_row['title'])]
        first_comment_role = 'gpt'

    # Process comments
    for comment in comments:
        for thread in traverse_thread(comment, first_comment_role):
            conversations.append(opening_turns + thread)

conversations[:1]

In [None]:
# Write the conversations to a JSON Lines file: one JSON-encoded conversation per line
import json

subreddits_str = '-'.join(SUBREDDITS)
size_str = to_k(len(conversations), logger)
logger.info(f"Saving {size_str} conversations to file")

# The output path comes from get_conversations_file, given the subreddit and size labels.
conversations_file = get_conversations_file(subreddits_str, size_str)
with open(conversations_file, 'w') as f:
    f.writelines(json.dumps(conv) + '\n' for conv in conversations)