# About
In this notebook we parse and clean the raw data, which hasn't been processed since being scraped from reddit.
*make sure to run prepare-dev first unless running in production*

In [1]:
# ensure we're using the unsloth_env conda environment
!which python
!python --version

/home/brian/anaconda3/envs/unsloth_env/bin/python
Python 3.10.15


## Setup environment

In [2]:
import json
import os
from loguru import logger

In [None]:
COUNT = 25000
INPUT_FILE = f"json/local/raw_data_DEVELOPMENT-{COUNT}.json"
OUTPUT_FILE = f"json/local/cleaned_data_DEVELOPMENT-{COUNT}.json"

## Define classes
All variables correspond to fields in each class' respective example parsed object

### Post:
```json
{
  "id": "1dx1b0z",
  "subreddit": "Destiny",
  "selftext": "Second post sorry, anyways Vegan Gains commented on Trump's faculties in regards to forgetting and misnaming people which led me back to a Twitter post on it. Reddit filters won't let me just link Twitter https://x.com/RonFilipkowski/status/1764295156981723453",
  "title": "New Vegan",
  "downs": 0,
  "name": "t3_1dx1b0z",
  "upvote_ratio": "0.95",
  "ups": 121,
  "removed_by_category": "moderator",
  "link_flair_text": "Discussion",
  "score": 121,
  "author_premium": false,
  "edited": false,
  "total_awards_received": 0,
  "suggested_sort": "confidence",
  "no_follow": false,
  "created_utc": 1720304607,
  "author_flair_text": "A normie roaming\ud83d\udc38\ud83d\udcd5",
  "author": "TuningsGaming",
  "num_comments": 2,
  "subreddit_subscribers": 248289,
  "send_replies": true,
  "is_video": false
}
```

### Comment:
```json
{
  "id": "lbyv8mn",
  "total_awards_received": 0,
  "subreddit": "Destiny",
  "replies": "",
  "no_follow": false,
  "author": "ImOnYew",
  "can_mod_post": false,
  "created_utc": 1720309933,
  "send_replies": true,
  "parent_id": "t3_1dx1b0z",
  "score": 17,
  "author_fullname": "t2_skmh0wx7l",
  "collapsed": false,
  "body": "He tries so hard to be a bully, talking over everyone. It's good that destiny can deal with it.",
  "edited": false,
  "name": "t1_lbyv8mn",
  "is_submitter": false,
  "downs": 0,
  "body_html": "&lt;div class=\"md\"&gt;&lt;p&gt;He tries so hard to be a bully, talking over everyone. It&amp;#39;s good that destiny can deal with it.&lt;/p&gt;\n&lt;/div&gt;",
  "collapsed_reason": null,
  "author_premium": false,
  "permalink": "/r/Destiny/comments/1dx1b0z/new_vegan/lbyv8mn/",
  "created": 1720309933,
  "author_flair_text": null,
  "link_id": "t3_1dx1b0z",
  "controversiality": 0,
  "depth": 0,
  "ups": 17
}
```

In [4]:
class Post:
  def __init__(
      self, 
      id: str,
      subreddit: str, 
      selftext: str,
      title: str,
      downs: int,
      name: str,
      upvote_ratio: float,
      ups: int,
      removed_by_category: str,
      link_flair_text: str,
      score: int,
      author_premium: bool,
      edited: bool,
      total_awards_received: int,
      suggested_sort: str,
      no_follow: bool,
      created_utc: int,
      author_flair_text: str,
      author: str,
      num_comments: int,
      subreddit_subscribers: int,
      send_replies: bool,
      is_video: bool,
      deleted: bool,
      comments: list
    ):
    self.id: str = id
    self.subreddit: str = subreddit
    self.selftext: str = selftext
    self.title: str = title
    self.downs: int = downs
    self.name: str = name
    self.upvote_ratio: float = upvote_ratio
    self.ups: int = ups
    self.removed_by_category: str = removed_by_category
    self.link_flair_text: str = link_flair_text
    self.score: int = score
    self.author_premium: bool = author_premium
    self.edited: bool = edited
    self.total_awards_received: int = total_awards_received
    self.suggested_sort: str = suggested_sort
    self.no_follow: bool = no_follow
    self.created_utc: int = created_utc
    self.author_flair_text: str = author_flair_text
    self.author: str = author
    self.num_comments: int = num_comments
    self.subreddit_subscribers: int = subreddit_subscribers
    self.send_replies: bool = send_replies
    self.is_video: bool = is_video
    self.deleted: bool = deleted
    self.comments: list[Comment] = comments

  def to_dict(self):
    return {
      "id": self.id,
      "subreddit": self.subreddit,
      "selftext": self.selftext,
      "title": self.title,
      "downs": self.downs,
      "name": self.name,
      "upvote_ratio": self.upvote_ratio,
      "ups": self.ups,
      "removed_by_category": self.removed_by_category,
      "link_flair_text": self.link_flair_text,
      "score": self.score,
      "author_premium": self.author_premium,
      "edited": self.edited,
      "total_awards_received": self.total_awards_received,
      "suggested_sort": self.suggested_sort,
      "no_follow": self.no_follow,
      "created_utc": self.created_utc,
      "author_flair_text": self.author_flair_text,
      "author": self.author,
      "num_comments": self.num_comments,
      "subreddit_subscribers": self.subreddit_subscribers,
      "send_replies": self.send_replies,
      "is_video": self.is_video,
      "deleted": self.deleted,
      "comments": [comment.to_dict() for comment in self.comments]
    }

class Comment:
  def __init__(
    self,
    id: str,
    total_awards_received: int,
    subreddit: str,
    replies: list,
    no_follow: bool,
    author: str,
    can_mod_post: bool,
    created_utc: int,
    parent_id: str,
    score: int,
    author_fullname: str,
    collapsed: bool,
    body: str,
    edited: bool,
    name: str,
    is_submitter: bool,
    downs: int,
    body_html: str,
    collapsed_reason: str,
    author_premium: bool,
    created: int,
    author_flair_text: str,
    link_id: str,
    controversiality: int,
    depth: int,
    ups: int
  ):
    self.id: str = id
    self.total_awards_received: int = total_awards_received
    self.subreddit: str = subreddit
    self.replies: list[Comment] = replies
    self.no_follow: bool = no_follow
    self.author: str = author
    self.can_mod_post: bool = can_mod_post
    self.created_utc: int = created_utc
    self.parent_id: str = parent_id
    self.score: int = score
    self.author_fullname: str = author_fullname
    self.collapsed: bool = collapsed
    self.body: str = body
    self.edited: bool = edited
    self.name: str = name
    self.is_submitter: bool = is_submitter
    self.downs: int = downs
    self.body_html: str = body_html
    self.collapsed_reason: str = collapsed_reason
    self.author_premium: bool = author_premium
    self.created: int = created
    self.author_flair_text: str = author_flair_text
    self.link_id: str = link_id
    self.controversiality: int = controversiality
    self.depth: int = depth
    self.ups: int = ups

  def to_dict(self):
    return {
      "id": self.id,
      "total_awards_received": self.total_awards_received,
      "subreddit": self.subreddit,
      "replies": [reply.to_dict() for reply in self.replies],
      "no_follow": self.no_follow,
      "author": self.author,
      "can_mod_post": self.can_mod_post,
      "created_utc": self.created_utc,
      "parent_id": self.parent_id,
      "score": self.score,
      "author_fullname": self.author_fullname,
      "collapsed": self.collapsed,
      "body": self.body,
      "edited": self.edited,
      "name": self.name,
      "is_submitter": self.is_submitter,
      "downs": self.downs,
      "body_html": self.body_html,
      "collapsed_reason": self.collapsed_reason,
      "author_premium": self.author_premium,
      "created": self.created,
      "author_flair_text": self.author_flair_text,
      "link_id": self.link_id,
      "controversiality": self.controversiality,
      "depth": self.depth,
      "ups": self.ups
    }

In [5]:
def parse_comment(raw_comment):
    """Recursively parse a comment and its replies."""
    replies = []
    if raw_comment.get("replies"):  # Check if there are replies
        raw_replies = raw_comment["replies"].get("data", {}).get("children", [])
        replies = [parse_comment(reply["data"]) for reply in raw_replies if "data" in reply]
    
    return Comment(
        id=raw_comment["id"],
        total_awards_received=raw_comment.get("total_awards_received", 0),
        subreddit=raw_comment.get("subreddit", ""),
        replies=replies,
        no_follow=raw_comment.get("no_follow", False),
        author=raw_comment.get("author", "[deleted]"),
        can_mod_post=raw_comment.get("can_mod_post", False),
        created_utc=raw_comment.get("created_utc", 0),
        parent_id=raw_comment.get("parent_id", ""),
        score=raw_comment.get("score", 0),
        author_fullname=raw_comment.get("author_fullname", ""),
        collapsed=raw_comment.get("collapsed", False),
        body=raw_comment.get("body", ""),
        edited=raw_comment.get("edited", False),
        name=raw_comment.get("name", ""),
        is_submitter=raw_comment.get("is_submitter", False),
        downs=raw_comment.get("downs", 0),
        body_html=raw_comment.get("body_html", ""),
        collapsed_reason=raw_comment.get("collapsed_reason", None),
        author_premium=raw_comment.get("author_premium", False),
        created=raw_comment.get("created", 0),
        author_flair_text=raw_comment.get("author_flair_text", None),
        link_id=raw_comment.get("link_id", ""),
        controversiality=raw_comment.get("controversiality", 0),
        depth=raw_comment.get("depth", 0),
        ups=raw_comment.get("ups", 0)
    )

In [6]:
with open(INPUT_FILE, "r") as f:
  raw_posts: list = json.load(f)

logger.info(f"{len(raw_posts)} posts loaded from {INPUT_FILE}")
logger.info(f"raw post structure: {raw_posts[0].keys()}")
logger.info(f"raw comment structure: {raw_posts[0]['comments']['lbyv8mn'].keys()}")

[32m2024-11-25 11:17:32.176[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1m25000 posts loaded from json/local/raw_data_DEVELOPMENT-25000.json[0m
[32m2024-11-25 11:17:32.183[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mraw post structure: dict_keys(['_id', 'id', 'data', 'comments', 'finalized'])[0m
[32m2024-11-25 11:17:32.184[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mraw comment structure: dict_keys(['subreddit_id', 'approved_at_utc', 'author_is_blocked', 'comment_type', 'awarders', 'mod_reason_by', 'banned_by', 'author_flair_type', 'total_awards_received', 'subreddit', 'author_flair_template_id', 'likes', 'replies', 'user_reports', 'saved', 'id', 'banned_at_utc', 'mod_reason_title', 'gilded', 'archived', 'collapsed_reason_code', 'no_follow', 'author', 'can_mod_post', 'created_utc', 'send_replies', 'parent_id', 'score', 'author_fullname', 'approved_by', 'mod_note', 'all_awardings', '

In [7]:
cleaned_posts: list[Post] = []
for raw_post in raw_posts:
  post_data = raw_post["data"]
  try:
    if not post_data["author_premium"]:
      post_data["author_premium"] = False
    post = Post(
      id=post_data["id"],
      subreddit=post_data["subreddit"],
      selftext=post_data["selftext"],
      title=post_data["title"],
      downs=post_data["downs"],
      name=post_data["name"],
      upvote_ratio=post_data["upvote_ratio"],
      ups=post_data["ups"],
      removed_by_category=post_data["removed_by_category"],
      link_flair_text=post_data["link_flair_text"],
      score=post_data["score"],
      author_premium=post_data["author_premium"],
      edited=post_data["edited"],
      total_awards_received=post_data["total_awards_received"],
      suggested_sort=post_data["suggested_sort"],
      no_follow=post_data["no_follow"],
      created_utc=post_data["created_utc"],
      author_flair_text=post_data["author_flair_text"],
      author=post_data["author"],
      num_comments=post_data["num_comments"],
      subreddit_subscribers=post_data["subreddit_subscribers"],
      send_replies=post_data["send_replies"],
      is_video=post_data["is_video"],
      deleted=(post_data["removed_by_category"] == "deleted"),
      comments=[]
    )
  except KeyError as e:
    logger.error(f"SKIPPING post={post_data['title']}, error: {e}")
    continue

  # logger.info(f"{post.title} - {post.subreddit} parsed")
  comments = []
  for raw_comment in raw_post["comments"].values():
      try:
          if isinstance(raw_comment, dict) and "id" in raw_comment:  # Ensure valid comment structure
              comment = parse_comment(raw_comment)
              comments.append(comment)
      except KeyError as e:
          logger.error(f"SKIPPING comment={raw_comment.get('id', 'unknown')}, error: {e}")
          continue

  post.comments = comments

  cleaned_posts.append(post)
  
logger.info("done")

logger.info(f"cleaned post structure: {cleaned_posts[0].__dict__.keys()}")

[32m2024-11-25 11:17:32.192[0m | [31m[1mERROR   [0m | [36m__main__[0m:[36m<module>[0m:[36m35[0m - [31m[1mSKIPPING post=Robert Kennedy Jr's Troubled Marriages Detailed in New Book, error: 'author_premium'[0m
[32m2024-11-25 11:17:32.195[0m | [31m[1mERROR   [0m | [36m__main__[0m:[36m<module>[0m:[36m35[0m - [31m[1mSKIPPING post=Stop Talking About AI, It Hurts (Schizo Rant), error: 'author_premium'[0m
[32m2024-11-25 11:17:32.195[0m | [31m[1mERROR   [0m | [36m__main__[0m:[36m<module>[0m:[36m35[0m - [31m[1mSKIPPING post=To the trumpists saying that biden is too old..., error: 'author_premium'[0m
[32m2024-11-25 11:17:32.197[0m | [31m[1mERROR   [0m | [36m__main__[0m:[36m<module>[0m:[36m35[0m - [31m[1mSKIPPING post=Biden has the best chance of beating Trump, error: 'author_premium'[0m
[32m2024-11-25 11:17:32.202[0m | [31m[1mERROR   [0m | [36m__main__[0m:[36m<module>[0m:[36m35[0m - [31m[1mSKIPPING post=Doctor Strange Actor, Bene

In [8]:
cleaned_posts_dict = [post.to_dict() for post in cleaned_posts]
if not os.path.exists("json/output"):
  os.makedirs("json/output")
with open(OUTPUT_FILE, "w") as f:
  json.dump(cleaned_posts_dict, f, indent=2)

## Done
Continue to `models/subreddit/make-dataset.ipynb`