In [25]:
import json
from datetime import datetime

In [26]:
def _format_date(utc_str):
    """Reduce a 'YYYY-MM-DD HH:MM:SS' timestamp string to 'YYYY-MM-DD'.

    Returns None for empty or missing values. Values that cannot be parsed
    (a string in another format, or a non-string such as a number) are
    returned unchanged so that no information is lost.
    """
    if not utc_str:
        return None
    try:
        return datetime.strptime(utc_str, '%Y-%m-%d %H:%M:%S').strftime('%Y-%m-%d')
    except (ValueError, TypeError):
        # ValueError: a string in an unexpected format.
        # TypeError: strptime() was handed a non-string value.
        return utc_str


def _extract_comment(item):
    """Pick the retained fields from a comment or reply dict.

    Comments and replies share the same field layout; the source key
    'body' is stored under 'text' in the result.
    """
    return {
        "id": item.get("id"),
        "name": item.get("name"),
        "parent_id": item.get("parent_id"),
        "link_id": item.get("link_id"),
        "text": item.get("body"),
        "author": item.get("author"),
        "score": item.get("score"),
        "date": _format_date(item.get("created_utc")),
    }


def process_reddit_json(input_file, output_file):
    """Trim a Reddit JSON export down to a fixed set of fields.

    Reads a JSON list of post dicts from ``input_file``. Each post may carry
    a "comments" list, and each comment a "replies" list. Only the selected
    fields are kept, 'created_utc' is shortened to a 'date' field, and the
    result is written to ``output_file`` as indented JSON.

    Args:
        input_file: Path of the JSON file to read.
        output_file: Path of the JSON file to write (overwritten if present).

    Returns:
        None. The result is only written to ``output_file``.
    """
    # Explicit UTF-8 so that decoding does not depend on the platform default.
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    processed_data = []

    for post in data:
        main_post = {
            "id": post.get("id"),
            "author": post.get("author"),
            "text": post.get("text"),
            "subreddit": post.get("subreddit"),
            "subreddit_id": post.get("subreddit_id"),
            "score": post.get("score"),
            "upvotes": post.get("upvotes"),
            "num_comments": post.get("num_comments"),
            "url": post.get("url"),
            "date": _format_date(post.get("created_utc")),
        }

        comments = []
        # `or []` also covers a key that is present but null in the JSON.
        for comment in post.get("comments") or []:
            comment_data = _extract_comment(comment)
            comment_data["replies"] = [
                _extract_comment(reply) for reply in comment.get("replies") or []
            ]
            comments.append(comment_data)

        main_post["comments"] = comments
        processed_data.append(main_post)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(processed_data, f, indent=4)

In [27]:
# Every batch file lives in this one directory; edit DATA_DIR to point at your copy.
DATA_DIR = "/Users/nathanielduan/Desktop/UPenn/Jin Jin Lab/Crowdsourcing/reddit_data_pcsk9_2015-07-01_2024-10-31"

# Clean each raw batch into its own output file next to the input.
for batch in (1, 2, 3):
    input_path = DATA_DIR + "/reddit_posts_batch_{}.json".format(batch)
    output_path = DATA_DIR + "/pcsk9_clean_reddit_data{}.json".format(batch)
    process_reddit_json(input_path, output_path)

In [30]:
def flatten_reddit_data(json_file, drop_nested=False):
    """Flatten posts, comments and replies from a JSON file into one list.

    Reads a JSON list of post dicts. Each post may carry a "comments" list,
    and each comment a "replies" list. Every post, comment and reply becomes
    one dict in the result, in depth-first order (a post, then each of its
    comments followed by that comment's replies).

    Each output dict is a shallow copy of the source item with a leading
    "type" key set to "post", "comment" or "reply". If an item already has
    its own "type" key, that value takes precedence over the label.

    Args:
        json_file: Path of the JSON file to read.
        drop_nested: When True, remove the nested "comments" list from post
            entries and the nested "replies" list from comment entries, so
            children appear only as their own rows. The default (False) keeps
            the nested lists in place, so children appear twice.

    Returns:
        A list of dicts, one per post, comment and reply.
    """
    # Explicit UTF-8 so that decoding does not depend on the platform default.
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    flattened_list = []

    def process_item(item, item_type, nested_key=None):
        # Copy the item so the source data (still being iterated) is untouched.
        flattened_item = {
            "type": item_type,
            **item
        }
        if drop_nested and nested_key is not None:
            flattened_item.pop(nested_key, None)
        flattened_list.append(flattened_item)

    # `or []` also covers a key that is present but null in the JSON.
    for post in data:
        process_item(post, "post", "comments")
        for comment in post.get("comments") or []:
            process_item(comment, "comment", "replies")
            for reply in comment.get("replies") or []:
                process_item(reply, "reply")

    return flattened_list

In [31]:
# The cleaned batch files all live in this one directory; edit DATA_DIR to point at your copy.
DATA_DIR = "/Users/nathanielduan/Desktop/UPenn/Jin Jin Lab/Crowdsourcing/reddit_data_pcsk9_2015-07-01_2024-10-31"

# Paths of the three cleaned batch files.
input_path1, input_path2, input_path3 = (
    DATA_DIR + "/pcsk9_clean_reddit_data{}.json".format(batch) for batch in (1, 2, 3)
)

# One flattened list (posts, comments and replies) per batch.
extracted_data1, extracted_data2, extracted_data3 = (
    flatten_reddit_data(path) for path in (input_path1, input_path2, input_path3)
)

In [32]:
# Preview only the first few entries; printing the whole list floods the cell output.
print(extracted_data1[:3])

[{'type': 'post', 'id': '1aj1s7o', 'author': 'Ecstatic_Humor_4288', 'text': '* **DOI/PMID/ISBN:** https://doi.org/10.1016/j.metabol.2023.155774\n\n* [**URL**](https://www.metabolismjournal.com/article/S0026-0495(23)00378-5/fulltext)', 'subreddit': 'Scholar', 'subreddit_id': 't5_2r0ev', 'score': 2, 'upvotes': 2, 'num_comments': 3, 'url': 'https://www.reddit.com/r/Scholar/comments/1aj1s7o/article_inhibition_of_pcsk9_prevents_and/', 'date': '2024-02-04', 'comments': [{'id': 'koza8qg', 'name': 't1_koza8qg', 'parent_id': 't3_1aj1s7o', 'link_id': 't3_1aj1s7o', 'text': '[Here](https://www.swisstransfer.com/d/d6781504-ca2a-4e01-aa75-4534fa29ec75) ya go!', 'author': 'SlimlineVan', 'score': 2, 'date': '2024-02-04', 'replies': [{'id': 'kp1ycpy', 'name': 't1_kp1ycpy', 'parent_id': 't1_koza8qg', 'link_id': 't3_1aj1s7o', 'text': 'Thanks solution verified', 'author': 'Ecstatic_Humor_4288', 'score': 3, 'date': '2024-02-05'}]}]}, {'type': 'comment', 'id': 'koza8qg', 'name': 't1_koza8qg', 'parent_id': '

In [33]:
# Sanity check: the flattened result is a plain Python list.
type(extracted_data1)

list