# About
This notebook builds a base dataset of Reddit conversations, so that it can be filtered and subsetted later.

### To Do
- Add custom metadata based on analysis of the conversation
  - [X] normalized controversiality
  - [ ] fix normalized controversiality; currently some values are greater than 1

In [1]:
# CONFIG
# Settings for the dataset module. These could eventually be exposed as flags.

# Hugging Face account used when building the hub repository id.
HUGGINGFACE_USERNAME = 'brianmatzelle'

# Base name (no directory, no extension) of the raw dump to process.
# Only change this if you know what you're doing.
RAW_DATA_FILE_NAME = 'posts-11-13-2024'

# Derived paths -- DON'T CHANGE; they follow from RAW_DATA_FILE_NAME.
RAW_DATA_FILE = f'data/raw/{RAW_DATA_FILE_NAME}.json'
PROCESSED_DATA_FILE = f'data/processed/{RAW_DATA_FILE_NAME}-processed.json'

In [None]:
import json

import pandas as pd
from loguru import logger

# Load the processed posts. The encoding is pinned to UTF-8 because open()
# otherwise falls back to the platform's locale encoding, which can fail or
# mis-decode non-ASCII text on systems where the locale is not UTF-8.
with open(PROCESSED_DATA_FILE, 'r', encoding='utf-8') as f:
    data = json.load(f)
logger.info(f"Loaded {len(data)} rows from {PROCESSED_DATA_FILE}")

# Convert to a DataFrame and drop the intermediate Python object so the raw
# JSON payload is not kept alive in kernel memory alongside the frame.
df = pd.DataFrame(data)
del data
prefilter_len = len(df)
logger.info(f"Converted json to pandas DataFrame with {prefilter_len} rows")
df.head(1)

[32m2024-11-27 12:18:12.224[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mLoaded 54215 rows from data/processed/posts-11-13-2024-processed.json[0m
[32m2024-11-27 12:18:12.573[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m12[0m - [1mConverted json to pandas DataFrame with 54215 rows[0m


Unnamed: 0,id,subreddit,selftext,title,downs,name,upvote_ratio,ups,removed_by_category,link_flair_text,...,no_follow,created_utc,author_flair_text,author,num_comments,subreddit_subscribers,send_replies,is_video,deleted,comments
0,1dx1b0z,Destiny,,New Vegan,0,t3_1dx1b0z,0.95,121,,Shitpost,...,False,1720304607,,TuningsGaming,2,248289,True,False,False,"[{'id': 'lbyv8mn', 'total_awards_received': 0,..."


In [3]:
from utils import to_k

# Lower-case subreddit names so later comparisons and filters are case-insensitive.
lowercased_subreddits = df['subreddit'].str.lower()
df['subreddit'] = lowercased_subreddits

# List the distinct subreddits present; useful when customizing the configuration.
print(df['subreddit'].unique())

# Human-readable size of the dataset (number of posts).
posts_count = to_k(len(df))
logger.info(f"Dataset size (posts): {posts_count}")

[32m2024-11-27 12:18:12.609[0m | [1mINFO    [0m | [36mutils[0m:[36mto_k[0m:[36m35[0m - [1m54k rows[0m
[32m2024-11-27 12:18:12.610[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mDataset size (posts): 54k[0m


['destiny' 'hasan_piker' 'politics' 'vaushv' 'millenials' 'news'
 'worldnews' 'economics' 'socialism' 'conservative' 'libertarian'
 'neoliberal' 'republican' 'democrats' 'progressive' 'daverubin'
 'jordanpeterson' 'samharris' 'joerogan' 'thedavidpakmanshow' 'benshapiro'
 'themajorityreport' 'seculartalk']


Loop through the posts and build conversations by alternating the `user` and `assistant` roles with every comment/reply.

In [4]:
from typing import Dict, Generator, List, Tuple


def Turn(role: str, content: str) -> Dict[str, str]:
    """Build a single chat turn as a ``{'role': ..., 'content': ...}`` dict."""
    return {
        'role': role,
        'content': content
    }


def _walk_thread(comment: Dict, role: str, path_sum: int) -> Generator[Tuple[List[Dict[str, str]], int], None, None]:
    """Yield ``(thread, controversiality_total)`` for every root-to-leaf reply chain under ``comment``.

    ``path_sum`` is the controversiality accumulated by the ancestors of
    ``comment`` on the current chain only (not by sibling branches).
    """
    body = comment.get('body')
    # Skip comments without text as well as deleted/removed ones. Any chain
    # that would pass through such a comment is dropped entirely.
    if not body or body in ('[deleted]', '[removed]'):
        return

    # A missing or None rating counts as 0 instead of raising TypeError.
    path_sum += comment.get('controversiality') or 0
    turn = Turn(role, body)

    replies = comment.get('replies')
    if not replies:
        # Leaf comment: the chain ends here.
        yield [turn], path_sum
        return

    next_role = 'user' if role == 'assistant' else 'assistant'
    for reply in replies:
        for sub_thread, total in _walk_thread(reply, next_role, path_sum):
            yield [turn] + sub_thread, total


def traverse_thread(comment: Dict, controversiality_sum: Dict, role: str = 'assistant') -> Generator[List[Dict[str, str]], None, None]:
    """
    Recursively traverse a comment thread and yield each individual thread.

    Every yielded thread is a list of turns following one reply chain from
    ``comment`` down to a leaf, with roles alternating starting at ``role``.

    Args:
        comment: Comment dict; the keys read are 'body', 'controversiality'
            and 'replies' (a list of comment dicts of the same shape).
        controversiality_sum: Mutable ``{'val': number}`` out-parameter. Right
            before each thread is yielded, ``'val'`` is set to the sum of the
            'controversiality' values of the comments in *that* thread, so the
            caller must read it while handling the yielded thread. Previously
            the value kept accumulating over every comment visited, which
            inflated the per-thread total beyond the thread's own comments.
        role: Role of the first turn, 'assistant' or 'user'.

    Yields:
        One list of ``Turn`` dicts per reply chain. Nothing is yielded for
        chains that reach an empty, '[deleted]' or '[removed]' comment.

    Raises:
        ValueError: If ``role`` is not 'assistant' or 'user' (raised when the
            generator is first advanced).
    """
    if role not in {'assistant', 'user'}:
        raise ValueError("role must be 'assistant' or 'user'")

    for thread, path_sum in _walk_thread(comment, role, 0):
        controversiality_sum['val'] = path_sum
        yield thread

### To Do
- Add custom metadata based on analysis of the conversation

In [5]:
from utils import is_post_valid
from lib.analysis import normalize_controversiality_rating

# Every conversation starts with the same system prompt, so build it once.
system_turn = Turn('system', "You are a redditor, having a conversation with another redditor.")

# Serialized conversations are used as dict keys to drop duplicates. A dict
# (insertion-ordered) is used instead of a set so the dataset order and the
# preview below do not depend on per-process string hash randomization.
serialized_conversations = {}
for _, post_row in df.iterrows():
    valid, _reason = is_post_valid(post_row)
    if not valid:
        continue

    # Post-level metadata shared by every conversation extracted from this post.
    post_metadata = {
        "subreddit": {
            "name": post_row.get("subreddit", "unknown"),
            "subscribers": post_row.get("subreddit_subscribers", None),
        },
        "post": {
            "score": post_row.get("score", None),
            "upvotes": post_row.get("ups", None),
            "downvotes": post_row.get("downs", None),
            "upvote_ratio": post_row.get("upvote_ratio", None),
            "flair": post_row.get("link_flair_text", None),
            "author": post_row.get("author", "unknown"),
            "suggested_sort": post_row.get("suggested_sort", None),
        },
    }

    if post_row['selftext'] == '[deleted]' or post_row['selftext'] == '[removed]':
        # The post body is gone: the first comment becomes the opening (user) turn.
        leading_turns = [system_turn]
        first_comment_role = 'user'
    else:
        # The post selftext (or its title when there is no selftext) is the
        # opening user turn, so the first comment answers as the assistant.
        leading_turns = [system_turn, Turn('user', post_row['selftext'] or post_row['title'])]
        first_comment_role = 'assistant'

    controversiality_sum = {"val": 0}
    for comment in post_row.get('comments', []):
        for thread in traverse_thread(comment, controversiality_sum, first_comment_role):
            # Read the accumulator right after the thread is yielded.
            controversiality = controversiality_sum["val"]
            # `thread` holds comment turns only; the post itself has no
            # controversiality rating and is therefore not counted in the length.
            normalized = normalize_controversiality_rating(sum=controversiality, thread_length=len(thread))
            serialized_thread = json.dumps({
                "metadata": {
                    **post_metadata,
                    "controversiality": controversiality,
                    "normalized_controversiality": normalized,
                },
                "conversation": leading_turns + thread
            })
            serialized_conversations[serialized_thread] = None

# Deserialize the unique conversations back into Python objects.
conversations = [json.loads(conv) for conv in serialized_conversations]
logger.info(f"Extracted {len(conversations)} conversations from {len(df)} posts")
conversations[0]

[32m2024-11-27 12:18:28.510[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m65[0m - [1mExtracted 643685 conversations from 54215 posts[0m


{'metadata': {'subreddit': {'name': 'politics', 'subscribers': 8599504},
  'post': {'score': 924,
   'upvotes': 924,
   'downvotes': 0,
   'upvote_ratio': 0.98,
   'flair': None,
   'author': 'PandaMuffin1',
   'suggested_sort': None},
  'controversiality': 0,
  'normalized_controversiality': 0.0},
 'conversation': [{'role': 'system',
   'content': 'You are a redditor, having a conversation with another redditor.'},
  {'role': 'user',
   'content': 'Harris leads Trump in Arizona, Gallego holds 11-point lead over Lake: Survey'},
  {'role': 'assistant',
   'content': "I am hoping for a blue wave across the board in Arizona. In addition to the presidency and the senate race, keep in mind that a blue wave would flip both chambers of the AZ state legislature. Right now they are controlled by the republicans by very slim margins (31-29 in the House, and 16-14 in the senate). \n\nVote blue, my friends, and let's make it happen\n\nhttps://azdem.org/"},
  {'role': 'user',
   'content': 'There i

In [6]:
# Duplicate check (currently disabled): written to test whether duplicate conversations are the reason controversiality normalization is off (it is sometimes higher than 1).
# import json
# from typing import List, Dict

# def scan_duplicates(conversations: List[Dict]) -> List[Dict]:
#     """
#     Scan conversations for duplicates based on the conversation content.
#     """
#     # Use a set for fast lookup of seen conversation contents
#     seen_contents = set()
#     duplicates = []

#     for conversation in conversations:
#         # Serialize the conversation content
#         conversation_content = json.dumps(conversation["conversation"], sort_keys=True)
        
#         # Check if the conversation content is already seen
#         if conversation_content in seen_contents:
#             duplicates.append(conversation)
#         else:
#             seen_contents.add(conversation_content)

#     return duplicates

# # Scan for duplicates and log results
# duplicate_conversations = scan_duplicates(conversations)
# logger.info(f"Found {len(duplicate_conversations)} duplicate conversations")

# # Save duplicate conversations for further analysis
# duplicate_conversations_path = f'data/datasets/{name}-duplicate-conversations.json'
# with open(duplicate_conversations_path, 'w') as f:
#     json.dump(duplicate_conversations, f, indent=2)

# logger.info(f"Duplicate conversations saved to {duplicate_conversations_path}")

In [7]:
# Write the extracted conversations to a JSON dataset file.
import json
from utils import make_dataset_path

name = '2024-election-subreddit-threads'
size_str = to_k(len(conversations))
logger.info(f"Saving {size_str} conversations to file")

# Path of the output file and the matching dataset name for the hub.
dataset_path, hf_name = make_dataset_path(name, size_str)

# One record per conversation. Each entry of `conversations` already carries
# "metadata" and "conversation"; the latter is stored as "conversations".
json_obj = [
    {
        "metadata": entry.get("metadata", {}),
        "conversations": entry.get("conversation", [])
    }
    for entry in conversations
]

with open(dataset_path, 'w') as out_file:
    json.dump(json_obj, out_file, indent=2)
logger.info(f"Conversations saved to {dataset_path}")

[32m2024-11-27 12:18:28.588[0m | [1mINFO    [0m | [36mutils[0m:[36mto_k[0m:[36m35[0m - [1m643k rows[0m
[32m2024-11-27 12:18:28.588[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mSaving 643k conversations to file[0m
[32m2024-11-27 12:18:46.161[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mConversations saved to data/datasets/2024-election-subreddit-threads-643k(1).json[0m


In [8]:
# Publish the saved dataset file to the Hugging Face Hub.
import os

from datasets import load_dataset
from dotenv import load_dotenv

dataset = load_dataset('json', data_files=dataset_path)

# Pull variables from a local .env file (if any) into the environment.
load_dotenv()
hf_token = os.getenv('HF_TOKEN')

if hf_token:
    # Hub repository id is "<username>/<dataset name>", lower-cased.
    dataset.push_to_hub(f"{HUGGINGFACE_USERNAME}/{hf_name}".lower(), token=hf_token)
else:
    logger.error("No Hugging Face token found, not pushing to hub")

Generating train split: 0 examples [00:00, ? examples/s]

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/322 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/322 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/970 [00:00<?, ?B/s]