# About
This notebook is used to create a base dataset of reddit conversations, so that it can be filtered and subset in the future.

### To Do
- Add custom metadata based on analysis of the conversation
  - [X] normalized controversiality
  - [ ] fix normalized contr., currently some values are greater than 1

In [1]:
# CONFIG
# Description: Configuration for the dataset module, could eventually be used as flags
HUGGINGFACE_USERNAME = 'brianmatzelle'

# change if you know what you're doing
RAW_DATA_FILE_NAME = 'posts-11-13-2024'

# DONT CHANGE
RAW_DATA_FILE = f'data/raw/{RAW_DATA_FILE_NAME}.json'
PROCESSED_DATA_FILE = f'data/processed/{RAW_DATA_FILE_NAME}-processed.json'

In [2]:
import json
from loguru import logger

with open(PROCESSED_DATA_FILE, 'r') as f:
  data = json.load(f)
logger.info(f"Loaded {len(data)} rows from {PROCESSED_DATA_FILE}")

import pandas as pd
df = pd.DataFrame(data)
del data
prefilter_len = len(df)
logger.info(f"Converted json to pandas DataFrame with {prefilter_len} rows")
df.head(1)

[32m2024-11-27 21:34:52.164[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mLoaded 54215 rows from data/processed/posts-11-13-2024-processed.json[0m
[32m2024-11-27 21:34:52.955[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m12[0m - [1mConverted json to pandas DataFrame with 54215 rows[0m


Unnamed: 0,id,subreddit,selftext,title,downs,name,upvote_ratio,ups,removed_by_category,link_flair_text,...,no_follow,created_utc,author_flair_text,author,num_comments,subreddit_subscribers,send_replies,is_video,deleted,comments
0,1dx1b0z,Destiny,,New Vegan,0,t3_1dx1b0z,0.95,121,,Shitpost,...,False,1720304607,,TuningsGaming,2,248289,True,False,False,"[{'id': 'lbyv8mn', 'total_awards_received': 0,..."


In [3]:
# alter columns so they're easier to work with
df['subreddit'] = df['subreddit'].str.lower()

# Show some values that might be helpful for customizing configuration
print(df['subreddit'].unique())

from utils import to_k
posts_count = to_k(len(df))
logger.info(f"Dataset size (posts): {posts_count}")

[32m2024-11-27 21:34:52.993[0m | [1mINFO    [0m | [36mutils[0m:[36mto_k[0m:[36m35[0m - [1m54k rows[0m
[32m2024-11-27 21:34:52.993[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mDataset size (posts): 54k[0m


['destiny' 'hasan_piker' 'politics' 'vaushv' 'millenials' 'news'
 'worldnews' 'economics' 'socialism' 'conservative' 'libertarian'
 'neoliberal' 'republican' 'democrats' 'progressive' 'daverubin'
 'jordanpeterson' 'samharris' 'joerogan' 'thedavidpakmanshow' 'benshapiro'
 'themajorityreport' 'seculartalk']


loop through posts and create conversations by alternating user/assistant with every comment/reply

In [4]:
# This code is straight from hell

from typing import Dict, Generator, List, Tuple
def Turn(role: str, content: str) -> Dict[str, str]:
  return {
    'role': role,
    'content': content
  }

def traverse_thread(comment: Dict, c_sum: int, role: str = 'assistant') -> Generator[Tuple[List[Dict[str, str]], int], None, None]:
    """
    Recursively traverse a comment thread and yield each individual thread.
    """
    if role not in {'assistant', 'user'}:
        raise ValueError("role must be 'assistant' or 'user'")
    
    if not comment.get('body'):
        return
    
    if comment['body'] == '[deleted]' or comment['body'] == '[removed]':
        return
  
        
    # Start the thread with the current comment
    current_thread = [Turn(role, comment['body'])]

        # Add controversiality only for the current comment
    current_c_sum = c_sum + comment.get('controversiality', 0)
    
    # If no replies, yield the current thread with current_c_sum
    if not comment.get('replies'):
        yield current_thread, current_c_sum
        return
    
    # Recurse into replies, but pass current_c_sum instead of c_sum
    for reply in comment['replies']:
        for sub_thread, sub_c_sum in traverse_thread(reply, current_c_sum, 'user' if role == 'assistant' else 'assistant'):
            yield current_thread + sub_thread, sub_c_sum

### To Do
- Add custom metadata based on analysis of the conversation

In [5]:
def get_metadata(post_row):
  return {
    "subreddit": {
      "name": post_row.get("subreddit", "unknown"),
      "subscribers": post_row.get("subreddit_subscribers", None),
    },
    "post": {
      "score": post_row.get("score", None),
      "upvotes": post_row.get("ups", None),
      "downvotes": post_row.get("downs", None),
      "upvote_ratio": post_row.get("upvote_ratio", None),
      "flair": post_row.get("link_flair_text", None),
      "author": post_row.get("author", "unknown"),
      "suggested_sort": post_row.get("suggested_sort", None),
    },
    "controversiality": 0,
    "normalized_controversiality": 0
  }

In [6]:
from utils import is_post_valid
from lib.analysis import normalize_controversiality_rating

# set to prevent duplicates which can occur if the final comment is deleted or removed
conversations = set()
for i, post_row in df.iterrows():
    valid, reason = is_post_valid(post_row)
    if not valid:
        continue

    # Prepare metadata
    metadata = get_metadata(post_row)
    
    initial_turn = None
    first_comment_type = 'user'
    # if the post is deleted or removed, use the first comment as the initial turn (user)
    if not post_row['selftext'] == '[deleted]' or post_row['selftext'] == '[removed]':
        initial_turn = Turn('user', post_row['selftext'] if post_row['selftext'] else post_row['title'])
        first_comment_type = 'assistant'

    system_turn = [Turn('system', f"You are a redditor, having a conversation with another redditor.")]

    if initial_turn is not None:
        system_turn.append(initial_turn)

    for comment in post_row.get('comments', []):
        for thread, c_sum in traverse_thread(comment, 0, first_comment_type):
            # set controversiality metadata
            metadata["controversiality"] = c_sum
            metadata["normalized_controversiality"] = normalize_controversiality_rating(sum=c_sum, thread_length=len(thread))
            # Serialize thread with metadata
            serialized_thread = json.dumps({
                "metadata": metadata,
                "conversation": system_turn + thread
            })
            conversations.add(serialized_thread)
            
# Deserialize conversations back into Python objects if needed
conversations = [json.loads(conv) for conv in conversations]
logger.info(f"Extracted {len(conversations)} conversations from {len(df)} posts")
logger.info(f"Deleting dataframe from memory since it hoards resources and is no longer needed")
del df
conversations[0]

[32m2024-11-27 21:35:13.709[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mExtracted 643627 conversations from 54215 posts[0m
[32m2024-11-27 21:35:13.710[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m41[0m - [1mDeleting dataframe from memory since it hoards resources and is no longer needed[0m


{'metadata': {'subreddit': {'name': 'politics', 'subscribers': 8585506},
  'post': {'score': 541,
   'upvotes': 541,
   'downvotes': 0,
   'upvote_ratio': 0.97,
   'flair': 'Soft Paywall',
   'author': 'croato87',
   'suggested_sort': None},
  'controversiality': 0,
  'normalized_controversiality': 0.0},
 'conversation': [{'role': 'system',
   'content': 'You are a redditor, having a conversation with another redditor.'},
  {'role': 'user',
   'content': 'Trump Loses It Over Devastating Fox News Poll on Kamala Harris'},
  {'role': 'assistant',
   'content': 'I went from feeling doomed for another Trump administration with Biden to feeling excited about a candidate for once, and like the wind is in our sails. \n\nTrump does dumb stuff when he’s cornered. Kamala’s campaign is killing it so far. I think this is going to be a trend and the gap will continue to widen. \n\nFeels good man. Feels good to be right about wanting Biden to step down. But even I’m surprised at how United dems have 

In [7]:
# Save to JSON file
import json

name = '2024-election-subreddit-threads'
size_str = to_k(len(conversations))
from utils import make_dataset_path
dataset_path, hf_dataset_name = make_dataset_path(name, size_str)
logger.info(f"Writing {size_str} conversations to {dataset_path}...")


# Create JSON object with metadata and conversation
json_obj = []
for conversation_data in conversations:
    # Each conversation_data should already include metadata and conversation structure
    json_obj.append({
        "metadata": conversation_data.get("metadata", {}),
        "conversations": conversation_data.get("conversation", [])
    })

# Save to file
with open(dataset_path, 'w') as f:
    json.dump(json_obj, f, indent=2)
logger.info(f"Conversations saved to {dataset_path}")

[32m2024-11-27 21:35:13.739[0m | [1mINFO    [0m | [36mutils[0m:[36mto_k[0m:[36m35[0m - [1m643k rows[0m
[32m2024-11-27 21:35:13.741[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mWriting 643k conversations to data/datasets/2024-election-subreddit-threads-643k(1).json...[0m
[32m2024-11-27 21:35:38.119[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mConversations saved to data/datasets/2024-election-subreddit-threads-643k(1).json[0m


In [8]:
# push to huggingface
from datasets import load_dataset
dataset = load_dataset('json', data_files=dataset_path)

import os
from dotenv import load_dotenv
load_dotenv()

if not os.getenv('HF_TOKEN'):
  logger.error("No Hugging Face token found, not pushing to hub")
else:
  dataset.push_to_hub(f"{HUGGINGFACE_USERNAME}/{hf_dataset_name}".lower(), token=os.getenv('HF_TOKEN'))

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 0 examples [00:00, ? examples/s]Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x10727f2b0>>
Traceback (most recent call last):
  File "/Users/brianmatzelle/anaconda3/envs/election/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 
