In [1]:
import os
import sys

# Absolute path of the directory that contains the current working directory.
# NOTE(review): this depends on where the kernel was started, not on where the
# notebook file lives — confirm the notebook is always run from its own folder.
PARENT_PATH = os.path.abspath(os.path.dirname(os.getcwd()))
# Make modules in that directory importable; presumably this is what lets the
# `utils` import below resolve — verify against the project layout.
sys.path.append(PARENT_PATH)

from utils import flat_subreddits

import requests
import libtorrent as lt
import time
import zstandard as zstd
from pathlib import Path


In [None]:
# Fetch the .torrent metadata file once and cache it on disk.
torrent_link = "https://academictorrents.com/download/1614740ac8c94505e4ecb9d88be8bed7b6afddd4.torrent"
torrent_file = f"{PARENT_PATH}/data/downloads/downloaded_content.torrent"

if not os.path.exists(torrent_file):
    # The downloads folder may not exist yet on a fresh checkout.
    os.makedirs(os.path.dirname(torrent_file), exist_ok=True)

    # Stream into a sibling ".part" file and rename only after the whole body
    # has been written. Otherwise an interrupted download would leave a
    # truncated .torrent that the existence check above accepts on the next run.
    partial_file = f"{torrent_file}.part"
    try:
        # The timeout stops a stalled connection from hanging the cell forever.
        with requests.get(torrent_link, stream=True, timeout=30) as r:
            r.raise_for_status()
            with open(partial_file, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.replace(partial_file, torrent_file)
    except BaseException:
        # Also covers KeyboardInterrupt (kernel interrupt): drop the partial
        # file so no half-written artefact is left behind, then re-raise.
        if os.path.exists(partial_file):
            os.remove(partial_file)
        raise
    print("Torrent file downloaded.")
else:
    print("Torrent file already exists, skipping download.")

In [None]:
# Download only the torrent files whose subreddit name is in `flat_subreddits`.
session = lt.session()
info = lt.torrent_info(f"{PARENT_PATH}/data/downloads/downloaded_content.torrent")

# Directory handed to libtorrent; each torrent file is stored at
# <save_path>/<file.path>.
save_path = f"{PARENT_PATH}/data/downloads"

matching_files = []    # base file names of the wanted files
matching_paths = []    # where each wanted file is expected on disk
file_priorities = {}   # file index -> 0 (skip) or 1 (download)
for index, file in enumerate(info.files()):
    file_name = os.path.basename(file.path)
    # Strip the "_submissions.zst" / "_comments.zst" suffix to recover the
    # subreddit name that is compared against `flat_subreddits`.
    base_name = file_name.replace('_submissions.zst', '').replace('_comments.zst', '')
    file_priorities[index] = 0

    if base_name in flat_subreddits:
        print(f"Found matching file: {base_name}")
        matching_files.append(file_name)
        # Take the on-disk location from the torrent's own relative path
        # instead of hard-coding a folder name. The previous hard-coded
        # "subreddits23" folder did not match the "subreddits24" folder the
        # extraction step reads, so the "already exist" check never succeeded.
        matching_paths.append(Path(save_path) / file.path)
        file_priorities[index] = 1

if not matching_files:
    print("No files in the torrent match the provided reddits list.")
else:
    # Skip the torrent session entirely when every wanted file is already present.
    all_files_exist = all(path.exists() for path in matching_paths)

    if all_files_exist:
        print("All matching files already exist. Skipping download.")
    else:
        print("Some files missing. Starting download...")
        handle = session.add_torrent({'ti': info, 'save_path': save_path})
        for index in file_priorities:
            handle.file_priority(index, file_priorities[index])
        # Poll once per second until the torrent reports completion.
        while not handle.is_seed():
            s = handle.status()
            progress = s.progress * 100
            print(f"Download progress: {progress:.2f}%")
            # Separate exit for the selective download, where is_seed() may
            # not become true because most files are skipped.
            if progress >= 100:
                break
            time.sleep(1)
        print("Download complete for matching files.")
print("Script finished.")

In [3]:
# Decompress every downloaded .zst archive into data/extracted, skipping
# archives that were already extracted on a previous run.
output_dir = Path(f"{PARENT_PATH}/data/extracted")
# exist_ok avoids the check-then-create race of a separate exists() test.
output_dir.mkdir(parents=True, exist_ok=True)

download_dir = Path(f"{PARENT_PATH}/data/downloads/reddit/subreddits24")
# Sorted so the processing order (and the log) is the same on every platform.
zst_files = sorted(download_dir.glob('*.zst'))

print(f"Found {len(zst_files)} .zst files to extract")

# One decompressor is reused for all files. The raised window limit allows
# frames compressed with a long window (up to 2 GiB), which the default limit
# rejects. NOTE(review): large Reddit dump archives are commonly compressed
# this way — confirm for this dataset.
dctx = zstd.ZstdDecompressor(max_window_size=2**31)

for zst_file in zst_files:

    # Output keeps the archive name without the ".zst" extension.
    output_file = output_dir / zst_file.stem

    if not output_file.exists():
        print(f"Extracting {zst_file.name}...")

        # Decompress into a ".part" file and rename on success. An interrupted
        # run then cannot leave a truncated file that the exists() check above
        # would treat as fully extracted.
        partial_file = output_dir / f"{zst_file.stem}.part"
        try:
            with open(zst_file, 'rb') as compressed:
                with open(partial_file, 'wb') as destination:
                    dctx.copy_stream(compressed, destination)
            partial_file.replace(output_file)
        except BaseException:
            # Also covers a kernel interrupt: remove the partial output, re-raise.
            if partial_file.exists():
                partial_file.unlink()
            raise

        print(f"Extracted to {output_file}")
    else:
        print(f"Skipping {zst_file.name} - already extracted")

print("All files processed")

Found 10 .zst files to extract
Extracting AskALiberal_comments.zst...
Extracted to c:\Users\andrew\Desktop\mechanicalskeptics\data\extracted\AskALiberal_comments
Extracting AskALiberal_submissions.zst...
Extracted to c:\Users\andrew\Desktop\mechanicalskeptics\data\extracted\AskALiberal_submissions
Extracting AskConservatives_comments.zst...
Extracted to c:\Users\andrew\Desktop\mechanicalskeptics\data\extracted\AskConservatives_comments
Extracting AskConservatives_submissions.zst...
Extracted to c:\Users\andrew\Desktop\mechanicalskeptics\data\extracted\AskConservatives_submissions
Extracting centrist_comments.zst...
Extracted to c:\Users\andrew\Desktop\mechanicalskeptics\data\extracted\centrist_comments
Extracting centrist_submissions.zst...
Extracted to c:\Users\andrew\Desktop\mechanicalskeptics\data\extracted\centrist_submissions
Extracting conservatives_comments.zst...
Extracted to c:\Users\andrew\Desktop\mechanicalskeptics\data\extracted\conservatives_comments
Extracting conservativ