In [13]:
import sys
version = sys.version_info
if version.major < 3 or (version.major == 3 and version.minor < 10):
	raise RuntimeError("This script requires Python 3.10 or higher")
import json
import os
from typing import Iterable

from fileStreams import getFileJsonStream
from utils import FileProgressLog

In [15]:
def processPostFile(path: str, output_dir: str = '../data/processed/posts'):
    """Extract the non-deleted posts from a raw dump and write them to disk.

    Every row whose ``selftext`` is neither "[deleted]" nor "[removed]" is
    reduced to a fixed set of fields. Two files are then written into
    ``output_dir``, both named after the subreddit of the last kept post:

    * ``r_<subreddit>_posts.json``   - one JSON object per line
    * ``r_<subreddit>_post_ids.txt`` - one post id per line, in first-seen
      order and without duplicates

    Args:
        path: Raw dump to read; it is handed to ``getFileJsonStream``.
        output_dir: Directory that receives both output files. It is created
            when it does not exist yet.

    Returns:
        None. Nothing is written when ``getFileJsonStream`` returns None for
        the file, or when no post survives the filter.

    Raises:
        KeyError: If a kept row lacks one of the extracted fields.
    """
    posts = []

    print(f"Processing file {path}")
    with open(path, "rb") as f:
        jsonStream = getFileJsonStream(path, f)
        if jsonStream is None:
            print(f"Skipping unknown file {path}")
            return
        progressLog = FileProgressLog(path, f)
        for row in jsonStream:
            progressLog.onRow()

            body = row["selftext"]
            if body in ("[deleted]", "[removed]"):
                continue

            posts.append({
                "subreddit": row["subreddit"],
                # Previously the post id was stored under this key by mistake.
                "subreddit_id": row["subreddit_id"],
                "id": row["id"],
                "author": row["author"],
                "created": row["created_utc"],
                "score": row["score"],
                "ratio": row["upvote_ratio"],
                "title": row["title"],
                "body": body,
                "url": row["url"],
            })
        progressLog.logProgress("\n")

    # Without a kept post there is no subreddit to name the output files after.
    if not posts:
        print(f"No posts kept from {path}; nothing written")
        return

    subreddit = posts[-1]["subreddit"]
    os.makedirs(output_dir, exist_ok=True)

    posts_path = os.path.join(output_dir, f"r_{subreddit}_posts.json")
    with open(posts_path, "w", encoding="utf-8") as out:
        for post in posts:
            # json.dumps yields valid JSON; the dict repr (single quotes,
            # None/True/False) cannot be parsed by a JSON reader.
            out.write(json.dumps(post))
            out.write("\n")

    ids_path = os.path.join(output_dir, f"r_{subreddit}_post_ids.txt")
    with open(ids_path, "w", encoding="utf-8") as out:
        # dict.fromkeys de-duplicates while keeping a reproducible order.
        for post_id in dict.fromkeys(post["id"] for post in posts):
            out.write(f"{post_id}\n")

In [16]:
# Run the post extraction on the raw r/politics posts dump.
path = '../data/raw/posts/r_politics_posts.jsonl'
processPostFile(path)

Processing file ../data/raw/posts/r_politics_posts.jsonl
62,822 - 100.00% - elapsed: 896.96ms - remaining: 0s - 14.3µs/row     


In [None]:
def processCommentFile(path: str, post_id_path: str, output_dir: str = '../data/processed/posts'):
    """Extract the comments that belong to a known set of posts.

    A comment is kept when its ``body`` is neither "[deleted]" nor "[removed]"
    and its ``link_id`` (with or without the ``t3_`` prefix) appears in the id
    list read from ``post_id_path``. Kept comments are written, one JSON object
    per line, to ``r_<subreddit>_<month>comments.json`` inside ``output_dir``,
    where ``<subreddit>`` comes from the last kept comment.

    Args:
        path: Raw comment dump to read; it is handed to ``getFileJsonStream``.
            When the file name has more than three "_"-separated parts, the
            fourth part (up to its first ".") plus "_" becomes ``<month>``;
            otherwise ``<month>`` is empty.
        post_id_path: Text file holding one post id per line.
        output_dir: Directory that receives the output file. It is created
            when it does not exist yet.
            NOTE(review): the default still points at the posts directory, as
            the original path did - confirm that is where comments belong.

    Returns:
        None. Nothing is written when ``getFileJsonStream`` returns None for
        the file, or when no comment survives the filters.

    Raises:
        KeyError: If a kept row lacks one of the extracted fields.
    """
    with open(post_id_path, "r") as f:
        post_ids = {line.strip() for line in f if line.strip()}

    filename = os.path.basename(path)
    name_parts = filename.split("_")
    month = name_parts[3].split('.')[0] + '_' if len(name_parts) > 3 else ''

    comments = []

    print(f"Processing file {path}")
    with open(path, "rb") as f:
        jsonStream = getFileJsonStream(path, f)
        if jsonStream is None:
            print(f"Skipping unknown file {path}")
            return
        progressLog = FileProgressLog(path, f)
        for row in jsonStream:
            progressLog.onRow()

            body = row["body"]
            if body in ("[deleted]", "[removed]"):
                continue

            # link_id is the full name of the post (e.g. t3_abc123), whereas
            # the id file may hold bare ids (abc123), so both forms are tried.
            link_id = row["link_id"]
            if link_id not in post_ids and link_id.removeprefix("t3_") not in post_ids:
                continue

            comments.append({
                "subreddit": row["subreddit"],
                "id": row["id"],
                "link_id": link_id,
                # id/name of the parent comment or post (e.g. t3_abc123 or t1_abc123)
                "parent_id": row["parent_id"],
                "author": row["author"],
                "created": row["created_utc"],
                "score": row["score"],
                "body": body,
            })
        progressLog.logProgress("\n")

    # Without a kept comment there is no subreddit to name the output file after.
    if not comments:
        print(f"No comments kept from {path}; nothing written")
        return

    subreddit = comments[-1]["subreddit"]
    os.makedirs(output_dir, exist_ok=True)

    comments_path = os.path.join(output_dir, f"r_{subreddit}_{month}comments.json")
    with open(comments_path, "w", encoding="utf-8") as out:
        for comment in comments:
            out.write(json.dumps(comment))
            out.write("\n")