
Instructions for downloading data are [here](https://www.reddit.com/r/pushshift/comments/11ef9if/separate_dump_files_for_the_top_20k_subreddits/)

I have included the .torrent file in this GitHub repository.
Open a BitTorrent client and then open this torrent file. You can then select the subreddits you want to download.

This particular notebook reads in some number of lines and writes them to a CSV.
That's not particularly helpful. We probably want to work straight from the ZST files.

In [1]:
import zstandard
import os
import json
import sys
import csv
from datetime import datetime
import logging.handlers


# Location of the compressed dump to read, and the output path stem
# (the extension is chosen separately via output_format).
input_file = r"./subreddits/Mechwarrior5_submissions.zst"
output_file = r"./subreddits/Mechwarrior5_submissions_output"
output_format = "csv"

# Inclusive creation-time window; both bounds are naive datetimes at midnight.
from_date, to_date = (
    datetime.strptime(day, "%Y-%m-%d") for day in ("2005-01-01", "2025-01-01")
)

In [2]:
# Sets up logging to the console as well as a rotating file under ./logs.
log = logging.getLogger("bot")
log.setLevel(logging.INFO)

# logging.getLogger returns the same logger object on every call, so running
# this cell a second time would otherwise stack a second pair of handlers and
# emit every message twice. Drop (and close) whatever is already attached.
for stale_handler in list(log.handlers):
    log.removeHandler(stale_handler)
    stale_handler.close()

log_formatter = logging.Formatter('%(asctime)s - %(levelname)s: %(message)s')

# Console output.
log_str_handler = logging.StreamHandler()
log_str_handler.setFormatter(log_formatter)
log.addHandler(log_str_handler)

# File output: 16 MiB per file, keeping up to 5 rotated backups.
# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs("logs", exist_ok=True)
log_file_handler = logging.handlers.RotatingFileHandler(os.path.join("logs", "bot.log"), maxBytes=1024*1024*16, backupCount=5)
log_file_handler.setFormatter(log_formatter)
log.addHandler(log_file_handler)

In [3]:
def write_line_csv(writer, obj):
    """Write the values of mapping *obj* as one CSV row.

    Every value is converted with ``str`` and emitted in the mapping's own
    iteration order; keys are not written.
    """
    writer.writerow(list(map(str, obj.values())))

def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, bytes_read=0):
    """Read from *reader* until the accumulated bytes decode as text.

    A fixed-size read can end in the middle of a multi-byte character. When
    decoding fails, another chunk is read and appended, and decoding is
    retried on the combined bytes.

    Args:
        reader: object with a ``read(size)`` method returning bytes.
        chunk_size: number of bytes requested per read.
        max_window_size: once the running byte count exceeds this, give up.
        previous_chunk: undecoded bytes to prepend to the first read, if any.
        bytes_read: starting value for the running byte count.

    Returns:
        The decoded string (empty when the reader has nothing left).

    Raises:
        UnicodeError: if decoding still fails after the running count has
            passed ``max_window_size``.
    """
    pending = previous_chunk
    requested = bytes_read
    while True:
        data = reader.read(chunk_size)
        # The count tracks bytes requested, not bytes actually returned.
        requested += chunk_size
        if pending is not None:
            data = pending + data
        try:
            return data.decode()
        except UnicodeDecodeError:
            if requested > max_window_size:
                raise UnicodeError(f"Unable to decode frame after reading {requested:,} bytes")
            log.info(f"Decoding error with {requested:,} bytes, reading another chunk")
            pending = data


def read_lines_zst(file_name):
    """Yield ``(line, offset)`` pairs from a zstandard-compressed text file.

    The file is decompressed and decoded chunk by chunk, and the text is split
    on newlines. A partial line at the end of a chunk is carried over and
    completed by the next chunk.

    Args:
        file_name: path of the ``.zst`` file to read.

    Yields:
        Tuples of the whitespace-stripped line and the current position of the
        underlying compressed file handle (``file_handle.tell()``), which
        callers can use as a progress indicator.

    Raises:
        UnicodeError: propagated from ``read_and_decode`` when a chunk cannot
            be decoded.
    """
    with open(file_name, 'rb') as file_handle:
        reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle)
        # try/finally so the reader is released even when the consumer stops
        # iterating early (generator close) or decoding raises.
        try:
            buffer = ''
            while True:
                chunk = read_and_decode(reader, 2**27, (2**29) * 2)

                if not chunk:
                    break
                lines = (buffer + chunk).split("\n")

                for line in lines[:-1]:
                    yield line.strip(), file_handle.tell()

                # The last piece may be an incomplete line; keep it for the next chunk.
                buffer = lines[-1]

            # A final line without a trailing newline would otherwise be lost.
            tail = buffer.strip()
            if tail:
                yield tail, file_handle.tell()
        finally:
            reader.close()

In [4]:
# Read up to MAX_LINES_TO_PROCESS lines from the compressed dump, keep the
# records whose created_utc falls inside [from_date, to_date], and write each
# kept record's values as one CSV row.
output_path = f"{output_file}.{output_format}"

file_size = os.stat(input_file).st_size
file_bytes_processed = 0
created = None
matched_lines = 0
bad_lines = 0
total_lines = 0
counter = 0  # not used in this cell; kept in case other cells reference it
MAX_LINES_TO_PROCESS = 100

# The with-block guarantees the CSV is flushed and closed even if the loop raises.
with open(output_path, 'w', encoding='UTF-8', newline='') as handle:
    writer = csv.writer(handle)

    for line, file_bytes_processed in read_lines_zst(input_file):
        # Check the cap before counting, so exactly MAX_LINES_TO_PROCESS lines
        # are processed (counting first and breaking on >= skipped the last one).
        if total_lines >= MAX_LINES_TO_PROCESS:
            break
        total_lines += 1

        if total_lines % 1000 == 0:
            # created is still None if no line has parsed successfully yet.
            created_text = created.strftime('%Y-%m-%d %H:%M:%S') if created else "n/a"
            log.info(f"{created_text} : {total_lines:,} : {matched_lines:,} : {bad_lines:,} : {file_bytes_processed:,}:{(file_bytes_processed / file_size) * 100:.0f}%")

        # Only the parsing step is guarded. json.JSONDecodeError is a ValueError;
        # ValueError/TypeError also cover a non-numeric created_utc or a JSON
        # value that is not an object.
        try:
            obj = json.loads(line)
            created = datetime.utcfromtimestamp(int(obj['created_utc']))
        except (KeyError, ValueError, TypeError):
            bad_lines += 1
            continue

        if created < from_date or created > to_date:
            continue

        matched_lines += 1
        write_line_csv(writer, obj)

<_csv.writer at 0x1057359a0>