# About
This notebook is used to create a base dataset of reddit conversations, so that it can be filtered and subset in the future.

### To Do
- Add custom metadata based on analysis of the conversation
  - [X] normalized controversiality
  - [ ] Fix normalized controversiality: some values are currently greater than 1

In [1]:
# CONFIG
# Description: Configuration for the dataset module, could eventually be used as flags

# Hugging Face account name; it becomes the namespace part of the repo id
# ("<username>/<dataset name>") when the dataset is pushed to the hub.
HUGGINGFACE_USERNAME = 'brianmatzelle'

# change if you know what you're doing
# Base name (no directory, no extension) from which both data paths below are derived.
RAW_DATA_FILE_NAME = 'posts-11-13-2024'

# DONT CHANGE
# Paths derived from RAW_DATA_FILE_NAME. PROCESSED_DATA_FILE is the JSON file this
# notebook loads; RAW_DATA_FILE is not read by any cell shown in this notebook.
RAW_DATA_FILE = f'data/raw/{RAW_DATA_FILE_NAME}.json'
PROCESSED_DATA_FILE = f'data/processed/{RAW_DATA_FILE_NAME}-processed.json'

In [None]:
import json

import pandas as pd
from loguru import logger

# Load the processed posts. The encoding is given explicitly because JSON is
# UTF-8 by specification, whereas open() otherwise falls back to the platform's
# locale encoding and can fail or mis-decode non-ASCII text.
with open(PROCESSED_DATA_FILE, 'r', encoding='utf-8') as f:
  data = json.load(f)
logger.info(f"Loaded {len(data)} rows from {PROCESSED_DATA_FILE}")

# Convert to a DataFrame, then drop the raw Python structure so that only one
# copy of the data stays in memory.
df = pd.DataFrame(data)
del data
prefilter_len = len(df)
logger.info(f"Converted json to pandas DataFrame with {prefilter_len} rows")
df.head(1)

In [None]:
from utils import to_k

# Lowercase the subreddit names so they are easier to work with
df['subreddit'] = df['subreddit'].str.lower()

# Print the distinct subreddits; handy when customizing the configuration
print(df['subreddit'].unique())

# Log the number of posts, formatted by the project's to_k helper
posts_count = to_k(len(df))
logger.info(f"Dataset size (posts): {posts_count}")

Loop through the posts and build conversations by alternating the user/assistant role with every comment/reply.

In [30]:
from typing import Dict, Generator, List
def Turn(role: str, content: str) -> Dict[str, str]:
  """Build one conversation turn as a dict with 'role' and 'content' keys."""
  return dict(role=role, content=content)

def traverse_thread(comment: Dict, controversiality_sum: Dict, role: str = 'assistant') -> Generator[List[Dict[str, str]], None, None]:
    """
    Recursively traverse a comment thread and yield each individual thread.

    One thread is yielded per chain of replies that runs from ``comment`` down
    to a comment without replies. Roles alternate between 'assistant' and
    'user' at every level, starting with ``role``. A comment whose body is
    empty, '[deleted]' or '[removed]' yields nothing, and neither does any
    chain that would have to pass through it.

    Args:
        comment: Comment dict. The keys read are 'body', 'controversiality'
            and 'replies' (a list of comment dicts of the same form).
        controversiality_sum: Mutable accumulator of the form ``{'val': number}``.
            While a thread is being yielded, ``val`` has been increased by the
            controversiality of exactly the comments in that thread, so it
            must be read at yield time. Each comment's contribution is removed
            again once its subtree has been fully traversed.
        role: Role assigned to ``comment``; 'assistant' or 'user'.

    Yields:
        A list of turns (as built by ``Turn``) for one thread.

    Raises:
        ValueError: If ``role`` is neither 'assistant' nor 'user'. As this is
            a generator, the error surfaces when it is first advanced.
    """
    if role not in {'assistant', 'user'}:
        raise ValueError("role must be 'assistant' or 'user'")

    body = comment.get('body')
    if not body or body in ('[deleted]', '[removed]'):
        return

    # A missing or null rating counts as 0 instead of raising TypeError on `+=`.
    rating = comment.get('controversiality') or 0
    controversiality_sum['val'] += rating
    try:
        # Start the thread with the current comment
        current_thread = [Turn(role, body)]

        # If no replies, yield the current thread as-is
        replies = comment.get('replies')
        if not replies:
            yield current_thread
            return

        # Recurse into replies, yielding a full thread for each reply chain
        next_role = 'user' if role == 'assistant' else 'assistant'
        for reply in replies:
            for sub_thread in traverse_thread(reply, controversiality_sum, next_role):
                yield current_thread + sub_thread
    finally:
        # Backtrack: without this the accumulator would keep the ratings of
        # sibling branches that were already traversed, inflating the sum seen
        # for every later thread. `finally` also covers a generator that is
        # closed before being exhausted.
        controversiality_sum['val'] -= rating

### To Do
- Add custom metadata based on analysis of the conversation

In [31]:
def get_metadata(post_row):
  """
  Build the metadata skeleton for conversations derived from one post.

  Values are read through ``post_row.get``. A missing subreddit name or author
  becomes "unknown"; every other missing field becomes None. Both
  controversiality entries start at 0.
  """
  read = post_row.get

  subreddit_info = {
    "name": read("subreddit", "unknown"),
    "subscribers": read("subreddit_subscribers"),
  }
  post_info = {
    "score": read("score"),
    "upvotes": read("ups"),
    "downvotes": read("downs"),
    "upvote_ratio": read("upvote_ratio"),
    "flair": read("link_flair_text"),
    "author": read("author", "unknown"),
    "suggested_sort": read("suggested_sort"),
  }

  return {
    "subreddit": subreddit_info,
    "post": post_info,
    "controversiality": 0,
    "normalized_controversiality": 0,
  }

In [None]:
from utils import is_post_valid
from lib.analysis import normalize_controversiality_rating

# Serialized conversations, deduplicated (duplicates can occur if the final
# comment is deleted or removed). A dict keyed by the serialized string
# deduplicates like a set but keeps insertion order, so the resulting dataset
# order is reproducible from run to run.
unique_serialized = {}

# The system turn is identical for every conversation, so build it once.
system_turn = Turn('system', "You are a redditor, having a conversation with another redditor.")

for _, post_row in df.iterrows():
    valid, _ = is_post_valid(post_row)
    if not valid:
        continue

    # Prepare metadata
    metadata = get_metadata(post_row)

    selftext = post_row['selftext']
    if selftext in ('[deleted]', '[removed]'):
        # The post is deleted or removed: the first comment becomes the initial turn (user)
        leading_turns = []
        first_comment_role = 'user'
    else:
        # Otherwise the post's selftext (or its title) is the initial turn (user).
        # A non-string selftext (e.g. NaN for a missing value) also falls back to the title.
        has_selftext = isinstance(selftext, str) and bool(selftext)
        leading_turns = [Turn('user', selftext if has_selftext else post_row['title'])]
        first_comment_role = 'assistant'

    # Process comments
    for comment in post_row.get('comments', []):
        # traverse_thread mutates this accumulator while it walks the comment
        # tree. It therefore has to exist before the generator is created, and
        # the same object has to be read (never rebound) when a thread is yielded.
        controversiality_sum = {"val": 0}
        for thread in traverse_thread(comment, controversiality_sum, first_comment_role):
            # set controversiality metadata; the post itself has no
            # controversiality rating, so only the comment turns are counted
            metadata["controversiality"] = controversiality_sum["val"]
            metadata["normalized_controversiality"] = normalize_controversiality_rating(
                sum=controversiality_sum["val"],
                thread_length=len(thread),
            )
            # Serialize thread with metadata (this snapshots `metadata`, which
            # is reused and overwritten for the next thread)
            serialized_thread = json.dumps({
                "metadata": metadata,
                "conversation": [system_turn] + leading_turns + thread
            })
            unique_serialized[serialized_thread] = None


# Deserialize conversations back into Python objects
conversations = [json.loads(conv) for conv in unique_serialized]
del unique_serialized
logger.info(f"Extracted {len(conversations)} conversations from {len(df)} posts")
logger.info("Deleting dataframe from memory since it hoards resources and is no longer needed")
del df
conversations[0]

In [None]:
# Save to JSON file
import json

from utils import make_dataset_path

name = '2024-election-subreddit-threads'
size_str = to_k(len(conversations))
dataset_path, hf_dataset_name = make_dataset_path(name, size_str)
logger.info(f"Writing {size_str} conversations to {dataset_path}...")

# Build the output records. Each entry keeps its metadata, and its turns are
# stored under the plural key "conversations" (read from "conversation").
json_obj = [
    {
        "metadata": record.get("metadata", {}),
        "conversations": record.get("conversation", []),
    }
    for record in conversations
]

# Write the records to disk
with open(dataset_path, 'w') as f:
    json.dump(json_obj, f, indent=2)
logger.info(f"Conversations saved to {dataset_path}")

In [None]:
# push to huggingface
import os

from datasets import load_dataset
from dotenv import load_dotenv

# Load the JSON file written above as a dataset
dataset = load_dataset('json', data_files=dataset_path)

# Pick up HF_TOKEN from a .env file if one is present
load_dotenv()
hf_token = os.getenv('HF_TOKEN')

if hf_token:
  repo_id = f"{HUGGINGFACE_USERNAME}/{hf_dataset_name}".lower()
  dataset.push_to_hub(repo_id, token=hf_token)
else:
  logger.error("No Hugging Face token found, not pushing to hub")