# About
This notebook is used to create a base dataset of reddit conversations, so that it can be filtered and subset in the future.

### To Do
- Add custom metadata based on analysis of the conversation
  - [X] normalized controversiality
  - [ ] fix normalized contr., currently some values are greater than 1

In [1]:
# CONFIG
# Description: Configuration for the dataset module, could eventually be used as flags
# HUGGINGFACE_USERNAME = 'BinghamtonUniversity'
HUGGINGFACE_USERNAME = 'brianmatzelle'

# change if you know what you're doing
RAW_DATA_FILE_NAME = 'posts-11-13-2024'

# DONT CHANGE
RAW_DATA_FILE = f'data/raw/{RAW_DATA_FILE_NAME}.json'
PROCESSED_DATA_FILE = f'data/processed/{RAW_DATA_FILE_NAME}-processed.json'

In [2]:
import json
from loguru import logger

with open(PROCESSED_DATA_FILE, 'r') as f:
  data = json.load(f)
logger.info(f"Loaded {len(data)} rows from {PROCESSED_DATA_FILE}")

import pandas as pd
df = pd.DataFrame(data)
del data
prefilter_len = len(df)
logger.info(f"Converted json to pandas DataFrame with {prefilter_len} rows")
df.head(1)

[32m2024-12-04 12:40:39.239[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mLoaded 54215 rows from data/processed/posts-11-13-2024-processed.json[0m
[32m2024-12-04 12:40:39.914[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m12[0m - [1mConverted json to pandas DataFrame with 54215 rows[0m


Unnamed: 0,id,subreddit,selftext,title,downs,name,upvote_ratio,ups,removed_by_category,link_flair_text,...,no_follow,created_utc,author_flair_text,author,num_comments,subreddit_subscribers,send_replies,is_video,deleted,comments
0,1dx1b0z,Destiny,,New Vegan,0,t3_1dx1b0z,0.95,121,,Shitpost,...,False,1720304607,,TuningsGaming,2,248289,True,False,False,"[{'id': 'lbyv8mn', 'total_awards_received': 0,..."


In [3]:
# alter columns so they're easier to work with
df['subreddit'] = df['subreddit'].str.lower()

# Show some values that might be helpful for customizing configuration
print(df['subreddit'].unique())

from utils import to_k
posts_count = to_k(len(df))
logger.info(f"Dataset size (posts): {posts_count}")

[32m2024-12-04 12:40:39.947[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mDataset size (posts): 54k[0m


['destiny' 'hasan_piker' 'politics' 'vaushv' 'millenials' 'news'
 'worldnews' 'economics' 'socialism' 'conservative' 'libertarian'
 'neoliberal' 'republican' 'democrats' 'progressive' 'daverubin'
 'jordanpeterson' 'samharris' 'joerogan' 'thedavidpakmanshow' 'benshapiro'
 'themajorityreport' 'seculartalk']


loop through posts and create conversations by alternating user/assistant with every comment/reply

## To Do:
- Generate synthetic data for threads that end in a user message. Currently we just remove this valuable data :/

In [4]:
# This code is straight from hell

from typing import Dict, Generator, List, Tuple
def Turn(role: str, content: str, metadata: Dict) -> Dict[str, str]:
  return {
    'role': role,
    'content': content,
    'metadata': metadata
  }

def get_comment_metadata(comment: Dict) -> Dict:
  return {
    "score": comment.get('score', None),
    "upvotes": comment.get('ups', None),
    "downvotes": comment.get('downs', None),
    "controversiality": comment.get('controversiality', None),
    "created_utc": comment.get('created_utc', None),
    "author": comment.get('author', None),
    "no_follow": comment.get('no_follow', None),
    "total_awards_received": comment.get('total_awards_received', None),
    "is_submitter": comment.get('is_submitter', None),
  }

def traverse_thread(comment: Dict, c_sum: int, role: str = 'assistant') -> Generator[Tuple[List[Dict[str, str]], int], None, None]:
    """
    Recursively traverse a comment thread and yield each individual thread.
    """
    if role not in {'assistant', 'user'}:
        raise ValueError("role must be 'assistant' or 'user'")
    
    if not comment.get('body'):
        return
    
    if comment['body'] == '[deleted]' or comment['body'] == '[removed]':
        return
  
    metadata = get_comment_metadata(comment)
    # Start the thread with the current comment
    current_thread = [Turn(role, comment['body'], metadata)]

    # Add controversiality only for the current comment
    current_c_sum = c_sum + comment.get('controversiality')
    
    # If no replies, yield the current thread with current_c_sum
    if not comment.get('replies'):
        # if the last message is from the user, remove it and subtract the controversiality
        if role == 'user':
            current_thread.pop()
            current_c_sum -= comment.get('controversiality')
        yield current_thread, current_c_sum
        return
    
    # Recurse into replies, but pass current_c_sum instead of c_sum
    for reply in comment['replies']:
        for sub_thread, sub_c_sum in traverse_thread(reply, current_c_sum, 'user' if role == 'assistant' else 'assistant'):
            yield current_thread + sub_thread, sub_c_sum

### To Do
- Add custom metadata based on analysis of the conversation

In [5]:
def get_post_metadata(post_row):
  return {
    "subreddit": {
      "name": post_row.get("subreddit", None),
      "subscribers": post_row.get("subreddit_subscribers", None),
    },
    "post": {
      "score": post_row.get("score", None),
      "upvotes": post_row.get("ups", None),
      "downvotes": post_row.get("downs", None),
      "upvote_ratio": post_row.get("upvote_ratio", None),
      "flair": post_row.get("link_flair_text", None),
      "author": post_row.get("author", "unknown"),
      "suggested_sort": post_row.get("suggested_sort", None),
      "title": post_row.get("title", None),
      "removed_by_category": post_row.get("removed_by_category", None),
      "created_utc": post_row.get("created_utc", None),
      "no_follow": post_row.get("no_follow", None),
      "total_awards_received": post_row.get("total_awards_received", None),
    },
    "controversiality": 0,
    "normalized_controversiality": 0
  }

In [6]:
from utils import is_post_valid
from lib.analysis import normalize_controversiality_rating

# set to prevent duplicates which can occur if the final comment is deleted or removed
conversations = set()
for i, post_row in df.iterrows():
    valid, reason = is_post_valid(post_row)
    if not valid:
        continue

    # Prepare metadata
    metadata = get_post_metadata(post_row)
    
    # if the post is deleted or removed, use the first comment as the initial turn (user)
    # deleted posts often still have a lot of comments, so we don't want to throw away the whole post
    post_turn = []
    if not post_row['selftext'] == '[deleted]' or post_row['selftext'] == '[removed]':
        post_turn = [Turn('user', post_row['selftext'] if post_row['selftext'] else post_row['title'], metadata=metadata.get('post', {}))]

    # first_turn = [Turn('system', f"You are a redditor in a political subreddit, having a conversation with another redditor about politics.")] + post_turn
    first_turn = post_turn

    bad_thread_count = 0
    for comment in post_row.get('comments', []):
        for thread, controversiality_sum in traverse_thread(comment, c_sum=0, role='assistant' if len(post_turn) > 0 else 'user'):
            if len(thread) < 2:
                # skip if the thread is it's only the system message
                bad_thread_count += 1
                continue
            # set controversiality metadata
            metadata["controversiality"] = controversiality_sum
            metadata["normalized_controversiality"] = normalize_controversiality_rating(sum=controversiality_sum, thread_length=len(thread))
            # Serialize thread with metadata
            serialized_thread = json.dumps({
                "metadata": metadata,
                "conversation": first_turn + thread
            })
            conversations.add(serialized_thread)
            
# Deserialize conversations back into Python objects if needed
conversations = [json.loads(conv) for conv in conversations]
logger.info(f"Skipped {bad_thread_count} bad threads")
logger.info(f"Extracted {len(conversations)} conversations from {len(df)} posts")
logger.info(f"Deleting dataframe from memory since it hoards resources and is no longer needed")
del df
conversations[0]

[32m2024-12-04 12:40:52.106[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m42[0m - [1mSkipped 2 bad threads[0m
[32m2024-12-04 12:40:52.107[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m43[0m - [1mExtracted 173713 conversations from 54215 posts[0m
[32m2024-12-04 12:40:52.107[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m44[0m - [1mDeleting dataframe from memory since it hoards resources and is no longer needed[0m


{'metadata': {'subreddit': {'name': 'neoliberal', 'subscribers': 175832},
  'post': {'score': 433,
   'upvotes': 433,
   'downvotes': 0,
   'upvote_ratio': 0.91,
   'flair': 'Megathread',
   'author': 'dubyahhh',
   'suggested_sort': 'new',
   'title': 'Biden Megathread V: The Establishment Strikes Back',
   'removed_by_category': None,
   'created_utc': 1720363953,
   'no_follow': False,
   'total_awards_received': 0},
  'controversiality': 0,
  'normalized_controversiality': 0.0},
 'conversation': [{'role': 'user',
   'content': 'Name is unrelated to anything, just wanted to make a Star Wars joke since these threads seem to never end\n\nHonestly just go touch grass, don’t even read anything beyond this, god save your filthy soul if you venture too deep',
   'metadata': {'score': 433,
    'upvotes': 433,
    'downvotes': 0,
    'upvote_ratio': 0.91,
    'flair': 'Megathread',
    'author': 'dubyahhh',
    'suggested_sort': 'new',
    'title': 'Biden Megathread V: The Establishment Str

## Tests
To Do - remove post processing

In [7]:
from tqdm import tqdm

last_msg_user_count, singleton_convo_count = 0, 0
# Wrap conversations with tqdm
for obj in tqdm(conversations, desc="Processing conversations"):
    convo = obj['conversation']
    # ... rest of the loop content stays the same ...
    if convo[-1]['role'] == 'user':
        last_msg_user_count += 1
        convo.pop()
    
    if len(convo) == 1:
        singleton_convo_count += 1
        conversations.remove(obj)

    # from convo[1-end], make sure role is alternating between user and assistant
    for i in range(1, len(convo)):
        if convo[i]['role'] == convo[i-1]['role']:
            raise ValueError(f"Non-alternating roles found in conversation: {convo}")

logger.info(f"Removed {last_msg_user_count} last messages from {len(conversations)} conversations")
logger.info(f"Removed {singleton_convo_count} singleton conversations from {len(conversations)} conversations")

Processing conversations: 100%|██████████| 173713/173713 [00:00<00:00, 954015.03it/s]
[32m2024-12-04 12:40:52.393[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m21[0m - [1mRemoved 0 last messages from 173713 conversations[0m
[32m2024-12-04 12:40:52.393[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m22[0m - [1mRemoved 0 singleton conversations from 173713 conversations[0m


In [8]:
# Save to JSON file
import json

name = '2024-election-subreddit-threads'
size_str = to_k(len(conversations))
from utils import make_dataset_path
dataset_path, hf_dataset_name = make_dataset_path(name, size_str)
logger.info(f"Writing {size_str} conversations to {dataset_path}...")


# Create JSON object with metadata and conversation
json_obj = []
for conversation_data in conversations:
    # Each conversation_data should already include metadata and conversation structure
    json_obj.append({
        "metadata": conversation_data.get("metadata", {}),
        "conversations": conversation_data.get("conversation", [])
    })

# Save to file
with open(dataset_path, 'w') as f:
    json.dump(json_obj, f, indent=2)
logger.info(f"Conversations saved to {dataset_path}")

[32m2024-12-04 12:40:52.399[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mWriting 173k conversations to data/datasets/2024-election-subreddit-threads-173k(2).json...[0m
[32m2024-12-04 12:41:04.724[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mConversations saved to data/datasets/2024-election-subreddit-threads-173k(2).json[0m


In [9]:
# push to huggingface
from datasets import load_dataset
dataset = load_dataset('json', data_files=dataset_path)

import os
from dotenv import load_dotenv
load_dotenv()

if not os.getenv('HF_TOKEN'):
  logger.error("No Hugging Face token found, not pushing to hub")
else:
  dataset.push_to_hub(f"{HUGGINGFACE_USERNAME}/{hf_dataset_name}".lower(), token=os.getenv('HF_TOKEN'))

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 173713 examples [00:11, 15226.82 examples/s]
Creating parquet from Arrow format: 100%|██████████| 174/174 [00:01<00:00, 165.09ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:08<00:00,  8.94s/it]
