In [28]:
import sys
version = sys.version_info
if version.major < 3 or (version.major == 3 and version.minor < 10):
	raise RuntimeError("This script requires Python 3.10 or higher")
import os
from typing import Iterable

from fileStreams import getFileJsonStream
from utils import FileProgressLog
import json
from datetime import datetime

In [30]:
def processPostFile(path: str, only_text: bool = False):
    """Filter a raw Reddit post dump and write the kept posts plus their ids.

    Rows are streamed from ``path`` through ``getFileJsonStream``. A post is
    dropped when its ``selftext`` is ``"[deleted]"`` or ``"[removed]"``, and
    also when ``only_text`` is set and its ``selftext`` is empty. The kept
    posts are written to two files under ``../data/processed/posts/``:

    * ``r_<subreddit>_posts.json``: one JSON object per line.
    * ``r_<subreddit>_post_ids.txt``: one post ``name`` per line
      (the value ``processCommentFile`` compares against ``link_id``).

    ``<subreddit>`` is the ``subreddit`` value of the last kept row.

    Args:
        path: Path of the raw post file to read.
        only_text: When True, posts with an empty ``selftext`` are skipped too.

    Returns:
        None. Nothing is written when ``getFileJsonStream`` does not recognise
        the file or when no post survives the filters.

    Note:
        Rows are indexed by key, so a row that lacks one of the expected
        fields raises (presumably ``KeyError``; rows look like plain dicts).
    """
    new_json = []
    post_ids = set()
    # Bound before the loop so the "nothing kept" case can be detected below
    # instead of failing on an unbound name when the output paths are built.
    subreddit = None

    print(f"Processing file {path}")
    with open(path, "rb") as f:
        jsonStream = getFileJsonStream(path, f)
        if jsonStream is None:
            print(f"Skipping unknown file {path}")
            return
        progressLog = FileProgressLog(path, f)
        for row in jsonStream:
            progressLog.onRow()

            body = row["selftext"]

            # Skip deleted/removed posts.
            if body in ("[deleted]", "[removed]"):
                continue

            # Optionally skip posts that carry no text (e.g. link-only posts).
            if only_text and body == "":
                continue

            author = row["author"]
            subreddit = row["subreddit"]
            subreddit_id = row["subreddit_id"]
            post_id = row["id"]
            name = row["name"]
            created = row["created_utc"]
            # NOTE: fromtimestamp() without tz converts to the machine's local
            # time zone, so the "datetime" string depends on where this runs.
            datetime_object = datetime.fromtimestamp(created)
            score = row["score"]
            ratio = row["upvote_ratio"]
            title = row["title"]
            url = row["url"]

            new_json.append({
                "subreddit": subreddit,
                # Previously this field was filled with the post id by mistake.
                "subreddit_id": subreddit_id,
                "id": post_id,
                "name": name,
                "author": author,
                "created": created,
                "datetime": str(datetime_object),
                "score": score,
                "ratio": ratio,
                "title": title,
                "body": body,
                "url": url
            })

            post_ids.add(name)
        progressLog.logProgress("\n")

    if subreddit is None:
        # No row was kept, so there is no subreddit to name the outputs after.
        print(f"No posts kept from {path}; nothing written")
        return

    with open(f'../data/processed/posts/r_{subreddit}_posts.json', "w") as f:
        for record in new_json:
            f.write(json.dumps(record))
            f.write("\n")

    with open(f'../data/processed/posts/r_{subreddit}_post_ids.txt', "w") as f:
        # Sorted so the id file is identical from run to run (set order is not).
        for post_name in sorted(post_ids):
            f.write(f"{post_name}")
            f.write("\n")

# Process the r/politics post dump, keeping only posts that have body text.
path = '../data/raw/posts/r_politics_posts.jsonl'
processPostFile(path, only_text=True)

Processing file ../data/raw/posts/r_politics_posts.jsonl
62,822 - 100.00% - elapsed: 884.32ms - remaining: 0s - 14.1µs/row     


In [34]:
def processCommentFile(path: str, post_id_path: str):
    """Keep the comments that belong to known posts and write them as JSON lines.

    Rows are streamed from ``path`` through ``getFileJsonStream``. A comment is
    kept when its ``link_id`` appears in the id file at ``post_id_path`` and
    its ``body`` is neither ``"[deleted]"`` nor ``"[removed]"``. Kept comments
    are written, one JSON object per line, to
    ``../data/processed/comments/r_<subreddit>_<month>comments.json``.

    ``<subreddit>`` is the ``subreddit`` value of the last kept row.
    ``<month>`` comes from the fourth ``_``-separated part of the input file
    name (``r_politics_comments_june.jsonl`` gives ``june_``) and is empty
    when the name has fewer parts.

    Args:
        path: Path of the raw comment file to read.
        post_id_path: Text file with one post id per line, compared against
            each comment's ``link_id``.

    Returns:
        None. Nothing is written when ``getFileJsonStream`` does not recognise
        the file or when no comment survives the filters.

    Note:
        Rows are indexed by key, so a row that lacks one of the expected
        fields raises (presumably ``KeyError``; rows look like plain dicts).
    """
    with open(post_id_path, "r") as f:
        post_ids = {line.strip() for line in f}

    new_json = []
    # Bound before the loop so the "nothing kept" case can be detected below
    # instead of failing on an unbound name when the output path is built.
    subreddit = None

    # Optional month suffix taken from the input file name.
    filename = os.path.basename(path)
    name_parts = filename.split("_")
    month = ''
    if len(name_parts) > 3:
        month = name_parts[3].split('.')[0] + '_'

    print(f"Processing file {path}")
    with open(path, "rb") as f:
        jsonStream = getFileJsonStream(path, f)
        if jsonStream is None:
            print(f"Skipping unknown file {path}")
            return
        progressLog = FileProgressLog(path, f)
        for row in jsonStream:
            progressLog.onRow()

            # Only keep comments attached to one of the known posts.
            link_id = row["link_id"]
            if link_id not in post_ids:
                continue

            body = row["body"]
            if body in ("[deleted]", "[removed]"):
                continue

            parent = row["parent_id"]
            created = row["created_utc"]
            # NOTE: fromtimestamp() without tz converts to the machine's local
            # time zone, so the "datetime" string depends on where this runs.
            datetime_object = datetime.fromtimestamp(created)
            score = row["score"]
            upvotes = row["ups"]
            author = row["author"]
            subreddit = row["subreddit"]
            comment_id = row["name"]

            new_json.append({
                "subreddit": subreddit,
                "id": comment_id,
                "parent": parent,
                "author": author,
                "created": created,
                "datetime": str(datetime_object),
                "score": score,
                "upvotes": upvotes,
                "body": body
            })
        progressLog.logProgress("\n")

    if subreddit is None:
        # No row was kept, so there is no subreddit to name the output after.
        print(f"No comments kept from {path}; nothing written")
        return

    with open(f'../data/processed/comments/r_{subreddit}_{month}comments.json', "w") as f:
        for record in new_json:
            f.write(json.dumps(record))
            f.write("\n")

In [36]:
# Filter each monthly r/politics comment dump against the processed post-id list.
id_path = '../data/processed/posts/r_politics_post_ids.txt'
for month in ('june', 'july', 'august', 'september'):
    path = f'../data/raw/comments/r_politics_comments_{month}.jsonl'
    processCommentFile(path=path, post_id_path=id_path)

Processing file ../data/raw/comments/r_politics_comments_june.jsonl
760,573 - 100.00% - elapsed: 00:00:05 - remaining: 0s - 6.9µs/row     
Processing file ../data/raw/comments/r_politics_comments_july.jsonl
2,046,229 - 100.00% - elapsed: 00:00:13 - remaining: 0s - 6.6µs/row     
Processing file ../data/raw/comments/r_politics_comments_august.jsonl
1,538,001 - 100.00% - elapsed: 00:00:09 - remaining: 0s - 6.4µs/row     
Processing file ../data/raw/comments/r_politics_comments_september.jsonl
1,147,472 - 100.00% - elapsed: 00:00:07 - remaining: 0s - 6.3µs/row     
