# About
This notebook is used to create a base dataset of reddit conversations, so that it can be filtered and subset in the future.

In [1]:
from config import PROCESSED_DATA_FILE, HUGGINGFACE_USERNAME
import json
from loguru import logger

with open(PROCESSED_DATA_FILE, 'r') as f:
  data = json.load(f)
logger.info(f"Loaded {len(data)} rows from {PROCESSED_DATA_FILE}")

import pandas as pd
df = pd.DataFrame(data)
del data
prefilter_len = len(df)
logger.info(f"Converted json to pandas DataFrame with {prefilter_len} rows")
df.head(1)

[32m2024-11-26 13:32:06.751[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mLoaded 54215 rows from data/processed/posts-11-13-2024-processed.json[0m
[32m2024-11-26 13:32:14.426[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m13[0m - [1mConverted json to pandas DataFrame with 54215 rows[0m


Unnamed: 0,id,subreddit,selftext,title,downs,name,upvote_ratio,ups,removed_by_category,link_flair_text,...,no_follow,created_utc,author_flair_text,author,num_comments,subreddit_subscribers,send_replies,is_video,deleted,comments
0,1dx1b0z,Destiny,,New Vegan,0,t3_1dx1b0z,0.95,121,,Shitpost,...,False,1720304607,,TuningsGaming,2,248289,True,False,False,"[{'id': 'lbyv8mn', 'total_awards_received': 0,..."


In [2]:
# alter columns so they're easier to work with
df['subreddit'] = df['subreddit'].str.lower()

# Show some values that might be helpful for customizing configuration
print(df['subreddit'].unique())

from utils import to_k
posts_count = to_k(len(df), logger)
logger.info(f"Dataset size (posts): {posts_count}")

[32m2024-11-26 13:32:14.451[0m | [1mINFO    [0m | [36mutils[0m:[36mto_k[0m:[36m35[0m - [1m54k rows[0m
[32m2024-11-26 13:32:14.451[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mDataset size (posts): 54k[0m


['destiny' 'hasan_piker' 'politics' 'vaushv' 'millenials' 'news'
 'worldnews' 'economics' 'socialism' 'conservative' 'libertarian'
 'neoliberal' 'republican' 'democrats' 'progressive' 'daverubin'
 'jordanpeterson' 'samharris' 'joerogan' 'thedavidpakmanshow' 'benshapiro'
 'themajorityreport' 'seculartalk']


loop through posts and create conversations by alternating user/assistant with every comment/reply

In [4]:
from typing import Dict, Generator, List
def Turn(role: str, content: str) -> Dict[str, str]:
  return {
    'role': role,
    'content': content
  }

def traverse_thread(comment: Dict, role: str = 'gpt') -> Generator[List[Dict[str, str]], None, None]:
    """
    Recursively traverse a comment thread and yield each individual thread.
    """
    if role not in {'gpt', 'human'}:
        raise ValueError("role must be 'gpt' or 'human'")
    
    if not comment.get('body'):
        return
    
    if comment['body'] == '[deleted]' or comment['body'] == '[removed]':
        return
    
    # Start the thread with the current comment
    current_thread = [Turn(role, comment['body'])]
    
    # If no replies, yield the current thread as-is
    if not comment.get('replies'):
        yield current_thread
        return
    
    # Recurse into replies, yielding a full thread for each reply chain
    for reply in comment['replies']:
        for sub_thread in traverse_thread(reply, 'human' if role == 'gpt' else 'gpt'):
            yield current_thread + sub_thread

## To Do
- Add custom metadata based on analysis of the conversation

In [5]:
from utils import is_post_valid

# set to prevent duplicates which can occur if the final comment is deleted or removed
conversations = set()
for i, post_row in df.iterrows():
    valid, reason = is_post_valid(post_row)
    if not valid:
        continue

    # Prepare metadata
    metadata = {
        "subreddit": {
            "name": post_row.get("subreddit", "unknown"),
            "subscribers": post_row.get("subreddit_subscribers", None),
        },
        "post": {
            "score": post_row.get("score", None),
            "upvotes": post_row.get("ups", None),
            "downvotes": post_row.get("downs", None),
            "upvote_ratio": post_row.get("upvote_ratio", None),
            "flair": post_row.get("link_flair_text", None),
            "author": post_row.get("author", "unknown"),
            "suggested_sort": post_row.get("suggested_sort", None),
        }
    }

    system_turn = Turn('system', f"You are a redditor, having a conversation with another redditor.")

    # if the post is deleted or removed, use the first comment as the initial turn
    if post_row['selftext'] == '[deleted]' or post_row['selftext'] == '[removed]':
        for comment in post_row.get('comments', []):
            for thread in traverse_thread(comment, 'human'):
                # Serialize thread with metadata
                serialized_thread = json.dumps({
                    "metadata": metadata,
                    "conversation": thread
                })
                conversations.add(serialized_thread)
    # otherwise, use the post title or selftext as the initial turn
    else:
        initial_turn = Turn('human', post_row['selftext'] if post_row['selftext'] else post_row['title'])
        # Process comments
        for comment in post_row.get('comments', []):
            for thread in traverse_thread(comment):
                # Serialize thread with metadata
                serialized_thread = json.dumps({
                    "metadata": metadata,
                    "conversation": [system_turn] + [initial_turn] + thread
                })
                conversations.add(serialized_thread)

# Deserialize conversations back into Python objects if needed
conversations = [json.loads(conv) for conv in conversations]
logger.info(f"Extracted {len(conversations)} conversations from {len(df)} posts")
conversations[0]

[32m2024-11-26 13:32:24.727[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m54[0m - [1mExtracted 643596 conversations from 54215 posts[0m


{'metadata': {'subreddit': {'name': 'democrats', 'subscribers': 473338},
  'post': {'score': 648,
   'upvotes': 648,
   'downvotes': 0,
   'upvote_ratio': 0.93,
   'flair': 'üó≥Ô∏è Beat Trump',
   'author': 'politicalthrow99',
   'suggested_sort': None}},
 'conversation': [{'role': 'system',
   'content': 'You are a redditor, having a conversation with another redditor.'},
  {'role': 'human',
   'content': 'Incredibly Different Standards Are Applied to Trump and Biden. WHY?'},
  {'role': 'gpt',
   'content': 'Presidential elections are about how the candidate makes you "feel" and how you think he will make you "feel" for the next four years.\n\nIdiot Trump makes those people "feel" one way. \n\nAfter the debate, Joe Biden made us "feel" a particular way.\n\nA new Democratic candidate will make us "feel" a whole new way - on top of how that idiot makes us "feel."'}]}

In [7]:
# Save to JSON file
import json

name = 'political-subreddit-threads'
size_str = to_k(len(conversations), logger)
logger.info(f"Saving {size_str} conversations to file")

from utils import make_dataset_path
dataset_path, hf_name = make_dataset_path(name, size_str)

# Create JSON object with metadata and conversation
json_obj = []
for conversation_data in conversations:
    # Each conversation_data should already include metadata and conversation structure
    json_obj.append({
        "metadata": conversation_data.get("metadata", {}),
        "conversation": conversation_data.get("conversation", [])
    })

# Save to file
with open(dataset_path, 'w') as f:
    json.dump(json_obj, f, indent=2)
logger.info(f"Conversations saved to {dataset_path}")

[32m2024-11-26 13:32:24.737[0m | [1mINFO    [0m | [36mutils[0m:[36mto_k[0m:[36m35[0m - [1m643k rows[0m
[32m2024-11-26 13:32:24.738[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mSaving 643k conversations to file[0m
[32m2024-11-26 13:32:39.252[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mConversations saved to data/datasets/political-subreddit-threads-643k.json[0m


In [11]:
# push to huggingface
from datasets import load_dataset
dataset = load_dataset('json', data_files=dataset_path)

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 643596 examples [00:10, 62483.21 examples/s]


In [12]:
import os
from dotenv import load_dotenv
load_dotenv()

if not os.getenv('HF_TOKEN'):
  logger.error("No Hugging Face token found, not pushing to hub")
else:
  dataset.push_to_hub(f"{HUGGINGFACE_USERNAME}/{hf_name}".lower(), token=os.getenv('HF_TOKEN'))

Creating parquet from Arrow format: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 322/322 [00:00<00:00, 442.86ba/s]
Creating parquet from Arrow format: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 322/322 [00:00<00:00, 427.51ba/s]
Uploading the dataset shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:14<00:00,  7.03s/it]
