In [1]:
import sys
version = sys.version_info
if version.major < 3 or (version.major == 3 and version.minor < 10):
	raise RuntimeError("This script requires Python 3.10 or higher")
import os
from typing import Iterable

from fileStreams import getFileJsonStream
from utils import FileProgressLog
import json
from datetime import datetime

In [3]:
def processPostFile(path: str, only_text: bool = False):
    """Filter a raw Reddit posts dump and write the kept posts and their ids.

    Rows are streamed from ``path`` through ``getFileJsonStream``. A post is
    dropped when its ``selftext`` is ``"[deleted]"`` or ``"[removed]"``, and,
    when ``only_text`` is true, also when its ``selftext`` is empty. For every
    kept post a fixed subset of fields is collected.

    Two files are written, both named after the ``subreddit`` value of the
    last kept row:

    * ``../data/filtered/posts/r_{subreddit}_posts.json`` - one JSON object
      per line.
    * ``../data/filtered/posts/r_{subreddit}_post_ids.txt`` - one post
      ``name`` per line, sorted so the file is identical between runs.

    Args:
        path: Path of the raw posts dump to read.
        only_text: When true, posts with an empty ``selftext`` are skipped.

    Returns:
        None. Returns early without writing anything when
        ``getFileJsonStream`` yields no stream for ``path`` or when no post
        passes the filters.

    Raises:
        KeyError: If a kept row lacks one of the fields read below.
    """
    kept_posts = []
    post_ids = set()
    # Stays None until a row is kept; the output file names depend on it.
    subreddit = None

    print(f"Processing file {path}")
    with open(path, "rb") as f:
        jsonStream = getFileJsonStream(path, f)
        if jsonStream is None:
            print(f"Skipping unknown file {path}")
            return
        progressLog = FileProgressLog(path, f)
        for row in jsonStream:
            progressLog.onRow()

            body = row["selftext"]
            if body in ("[deleted]", "[removed]"):  # skip deleted posts
                continue
            if only_text and body == "":
                continue

            subreddit = row["subreddit"]
            name = row["name"]
            created = row["created_utc"]
            # NOTE(review): fromtimestamp() without tz yields a naive *local*
            # time, so this string depends on the machine's timezone. Kept
            # as-is to avoid changing the output format - confirm intent.
            created_at = datetime.fromtimestamp(created)

            kept_posts.append({
                "subreddit": subreddit,
                "subreddit_id": row["subreddit_id"],
                "id": row["id"],
                "name": name,
                "author": row["author"],
                "created": created,
                "datetime": str(created_at),
                "score": row["score"],
                "ratio": row["upvote_ratio"],
                "title": row["title"],
                "body": body,
                "url": row["url"]
            })
            post_ids.add(name)

        progressLog.logProgress("\n")

    if subreddit is None:
        # Previously this case crashed with UnboundLocalError when building
        # the output path, because no row had ever assigned `subreddit`.
        print(f"No posts kept from {path}; nothing written")
        return

    with open(f'../data/filtered/posts/r_{subreddit}_posts.json', "w") as f:
        for post in kept_posts:
            f.write(json.dumps(post))
            f.write("\n")

    with open(f'../data/filtered/posts/r_{subreddit}_post_ids.txt', "w") as f:
        # Sorted because set iteration order is not stable across runs.
        for post_id in sorted(post_ids):
            f.write(f"{post_id}")
            f.write("\n")

# r/politics posts; only_text=True drops posts whose selftext is empty.
path = '../data/raw/reddit/r_politics_posts.jsonl'
processPostFile(path, only_text=True)

Processing file ../data/raw/reddit/r_politics_posts.jsonl
62,822 - 100.00% - elapsed: 829.84ms - remaining: 0s - 13.2µs/row     


In [4]:
# r/conservative posts; only_text=True drops posts whose selftext is empty.
path = '../data/raw/reddit/r_conservative_posts.jsonl'
processPostFile(path, only_text=True)

Processing file ../data/raw/reddit/r_conservative_posts.jsonl
38,243 - 100.00% - elapsed: 470.50ms - remaining: 0s - 12.3µs/row     


In [5]:
# r/progressive posts; only_text=True drops posts whose selftext is empty.
path = '../data/raw/reddit/r_progressive_posts.jsonl'
processPostFile(path, only_text=True)

Processing file ../data/raw/reddit/r_progressive_posts.jsonl
320 - 100.00% - elapsed: 5.99ms - remaining: 0s - 18.7µs/row


In [6]:
def processCommentFile(path: str, post_id_path: str):
    """Filter a raw Reddit comments dump down to comments on known posts.

    The stripped lines of ``post_id_path`` form the set of accepted post ids.
    Rows are streamed from ``path`` through ``getFileJsonStream``; a comment
    is kept when its ``link_id`` is in that set and its ``body`` is neither
    ``"[deleted]"`` nor ``"[removed]"``.

    Kept comments are written, one JSON object per line, to
    ``../data/filtered/comments/r_{subreddit}_{month}comments.json`` where
    ``subreddit`` comes from the last kept row. ``month`` is the fourth
    underscore-separated part of the input file name (text before its first
    dot) followed by ``_``, or empty when the name has three parts or fewer.

    Args:
        path: Path of the raw comments dump to read.
        post_id_path: Text file with one accepted post id per line.

    Returns:
        None. Returns early without writing anything when
        ``getFileJsonStream`` yields no stream for ``path`` or when no
        comment passes the filters.

    Raises:
        KeyError: If a kept row lacks one of the fields read below.
    """
    with open(post_id_path, "r") as f:
        post_ids = {line.strip() for line in f}

    kept_comments = []
    # Stays None until a row is kept; the output file name depends on it.
    subreddit = None

    # e.g. "r_politics_comments_june.jsonl" -> "june_"
    name_parts = os.path.basename(path).split("_")
    month = name_parts[3].split('.')[0] + '_' if len(name_parts) > 3 else ''

    print(f"Processing file {path}")
    with open(path, "rb") as f:
        jsonStream = getFileJsonStream(path, f)
        if jsonStream is None:
            print(f"Skipping unknown file {path}")
            return
        progressLog = FileProgressLog(path, f)
        for row in jsonStream:
            progressLog.onRow()

            link_id = row["link_id"]
            if link_id not in post_ids:
                continue

            body = row["body"]
            if body in ("[deleted]", "[removed]"):
                continue

            subreddit = row["subreddit"]
            created = row["created_utc"]
            # NOTE(review): fromtimestamp() without tz yields a naive *local*
            # time, so this string depends on the machine's timezone. Kept
            # as-is to avoid changing the output format - confirm intent.
            created_at = datetime.fromtimestamp(created)

            kept_comments.append({
                "subreddit": subreddit,
                "post_id": link_id,
                "id": row["name"],
                "parent": row["parent_id"],
                "author": row["author"],
                "created": created,
                "datetime": str(created_at),
                "score": row["score"],
                "upvotes": row["ups"],
                "body": body
            })
        progressLog.logProgress("\n")

    if subreddit is None:
        # Previously this case crashed with UnboundLocalError when building
        # the output path, because no row had ever assigned `subreddit`.
        print(f"No comments kept from {path}; nothing written")
        return

    with open(f'../data/filtered/comments/r_{subreddit}_{month}comments.json', "w") as f:
        for comment in kept_comments:
            f.write(json.dumps(comment))
            f.write("\n")

In [7]:
# r/politics comments are split into one raw dump per month; all of them are
# filtered against the same post-id file.
id_path = '../data/filtered/posts/r_politics_post_ids.txt'
for month in ('june', 'july', 'august', 'september'):
    path = '../data/raw/reddit/r_politics_comments_' + month + '.jsonl'
    processCommentFile(path, post_id_path=id_path)

Processing file ../data/raw/reddit/r_politics_comments_june.jsonl
760,573 - 100.00% - elapsed: 00:00:04 - remaining: 0s - 6.4µs/row     
Processing file ../data/raw/reddit/r_politics_comments_july.jsonl
2,046,229 - 100.00% - elapsed: 00:00:13 - remaining: 0s - 6.5µs/row     
Processing file ../data/raw/reddit/r_politics_comments_august.jsonl
1,538,001 - 100.00% - elapsed: 00:00:10 - remaining: 0s - 6.6µs/row     
Processing file ../data/raw/reddit/r_politics_comments_september.jsonl
1,147,472 - 100.00% - elapsed: 00:00:07 - remaining: 0s - 6.3µs/row     


In [8]:
# The id file name uses a capital "C" while the raw dump name is lower-case.
id_path = '../data/filtered/posts/r_Conservative_post_ids.txt'
path = '../data/raw/reddit/r_conservative_comments.jsonl'
processCommentFile(path, post_id_path=id_path)

Processing file ../data/raw/reddit/r_conservative_comments.jsonl
685,579 - 100.00% - elapsed: 00:00:04 - remaining: 0s - 6.0µs/row     


In [9]:
# r/progressive comments, filtered against the r/progressive post-id file.
id_path = '../data/filtered/posts/r_progressive_post_ids.txt'
path = '../data/raw/reddit/r_progressive_comments.jsonl'
processCommentFile(path, post_id_path=id_path)

Processing file ../data/raw/reddit/r_progressive_comments.jsonl
815 - 100.00% - elapsed: 5.00ms - remaining: 0s - 6.1µs/row


In [10]:
# Raw post fields read by processPostFile.
used_fields_posts = [
    "subreddit", "subreddit_id",
    "id", "name",
    "author",
    "created_utc",
    "score", "upvote_ratio",
    "title", "selftext", "url",
]

# Raw comment fields read by processCommentFile.
used_fields_comments = [
    "subreddit",
    "link_id", "name", "parent_id",
    "author",
    "created_utc",
    "score", "ups",
    "body",
]

In [11]:
def _find_unused_fields(path: str, used_fields: list) -> list:
    """Return the field names present in ``path`` but absent from ``used_fields``.

    Every row streamed from ``path`` by ``getFileJsonStream`` contributes its
    keys. The full set of keys seen is printed, and the keys not listed in
    ``used_fields`` are returned sorted so the result is the same on every
    run (set iteration order is not).
    """
    seen_fields = set()
    with open(path, 'rb') as f:
        for row in getFileJsonStream(path, f):
            seen_fields.update(row.keys())
    print(seen_fields)
    return sorted(seen_fields - set(used_fields))


# Fields available in the raw dumps that the filtering functions do not read.
unused_fields_posts = _find_unused_fields(
    '../data/raw/reddit/r_politics_posts.jsonl', used_fields_posts
)
unused_fields_comments = _find_unused_fields(
    '../data/raw/reddit/r_politics_comments_june.jsonl', used_fields_comments
)

            
            

{'banned_at_utc', 'archived', 'hide_score', 'ups', 'media_embed', 'upvote_ratio', 'distinguished', 'link_flair_text_color', 'no_follow', 'awarders', 'whitelist_status', 'can_mod_post', 'contest_mode', 'author_cakeday', 'secure_media', 'hidden', 'num_comments', 'content_categories', '_meta', 'pwls', 'author_flair_text', 'subreddit_subscribers', 'removed_by', 'selftext', 'author_patreon_flair', 'link_flair_richtext', 'thumbnail_height', 'downs', 'top_awarded_type', 'link_flair_template_id', 'wls', 'category', 'view_count', 'author_fullname', 'removed_by_category', 'crosspost_parent', 'preview', 'quarantine', 'url', 'num_reports', 'send_replies', 'author_flair_css_class', 'num_crossposts', 'crosspost_parent_list', 'removal_reason', 'discussion_type', 'all_awardings', 'visited', 'author_flair_template_id', 'domain', 'user_reports', 'link_flair_type', 'approved_at_utc', 'created_utc', 'is_video', 'pinned', 'treatment_tags', 'subreddit_type', 'total_awards_received', 'created', 'author_flair

In [None]:
# Write each unused-field list on a single line, every name wrapped in
# backticks and followed by ", " (including the last one).
with open('../data/filtered/unused_fields_posts.txt', 'w') as f:
    f.writelines(f"`{field}`, " for field in unused_fields_posts)

with open('../data/filtered/unused_fields_comments.txt', 'w') as f:
    f.writelines(f"`{field}`, " for field in unused_fields_comments)


In [3]:
def generate_sample_1000_lines(path: str, n_lines: int = 1000):
    """Copy the first ``n_lines`` lines of ``path`` into a ``*_sample.jsonl`` file.

    The output path is ``path`` with its extension replaced by
    ``_sample.jsonl`` (``dir/x.jsonl`` -> ``dir/x_sample.jsonl``); it is
    printed before writing. The input is streamed line by line, so large
    dumps are not loaded into memory, and files with fewer than ``n_lines``
    lines are copied in full instead of raising ``IndexError``.

    Args:
        path: Path of the file to sample.
        n_lines: Maximum number of leading lines to copy (default 1000).

    Returns:
        None.
    """
    root, _ext = os.path.splitext(path)
    new_path = f"{root}_sample.jsonl"
    print(new_path)
    # Explicit UTF-8 so the copy does not depend on the platform default
    # encoding.
    with open(path, 'r', encoding='utf-8') as src, \
            open(new_path, 'w', encoding='utf-8') as dst:
        for line_no, line in enumerate(src):
            if line_no >= n_lines:
                break
            dst.write(line)

# Raw comment dumps for which a sample file is generated.
docs = [
    'r_conservative_comments.jsonl',
    'r_politics_comments_august.jsonl',
    'r_politics_comments_july.jsonl',
    'r_politics_comments_june.jsonl',
    'r_politics_comments_september.jsonl',
]

for doc in docs:
    generate_sample_1000_lines('../data/raw/reddit/' + doc)