In [18]:
import os
from tqdm import tqdm
from pydantic import BaseSettings

class Settings(BaseSettings):
    """Reddit API credentials used by this notebook.

    Both fields are required strings with no defaults. The nested ``Config``
    points pydantic's settings loader at a local ``.env`` file.
    """
    # Supplied to praw.Reddit as client_id / client_secret later in the notebook.
    CLIENT_ID: str
    CLIENT_SECRET: str

    class Config:
        # dotenv file consulted for the field values above
        env_file = '.env'

# Instantiating performs the load. Presumably raises a pydantic validation
# error when either credential is missing -- TODO confirm against the
# installed pydantic version.
settings = Settings()

In [19]:
import praw

def extract_comments(submission):
    """Collect comment text from a submission's comment forest.

    The result is a flat list of strings: first the bodies of every unlocked
    top-level comment, then the bodies of the direct replies of *all*
    top-level comments (locked or not). Deeper reply levels are not visited.

    Args:
        submission: object exposing ``comments``, an iterable forest that
            also provides ``replace_more()`` (e.g. a PRAW ``Submission``).

    Returns:
        list[str]: top-level comment bodies followed by first-level reply
        bodies, in forest order.

    Raises:
        Whatever ``replace_more()`` or attribute access on the comment
        objects raises (e.g. API/network errors); nothing is caught here.
    """
    forest = submission.comments
    # Resolve "load more comments" placeholders once, *before* iterating.
    # Doing it mid-loop mutated the forest being iterated and then fell
    # through to `.body` on the placeholder, which has no such attribute.
    forest.replace_more()

    comments = []
    replies = []
    for comment in forest:
        # Defensive: skip any placeholder that is still present (no text).
        if not hasattr(comment, "body"):
            continue
        if not comment.locked:
            comments.append(comment.body)
        # Replies are gathered even for locked parents, as before.
        replies.extend(
            reply.body for reply in comment.replies if hasattr(reply, "body")
        )
    return comments + replies
    
    
# Build the PRAW client from the credentials held in `settings`.
# NOTE(review): "USERAGENT" looks like a placeholder -- confirm whether a
# descriptive, unique user agent string should be used instead.
reddit_client_kwargs = {
    "client_id": settings.CLIENT_ID,
    "client_secret": settings.CLIENT_SECRET,
    "user_agent": "USERAGENT",
}
reddit = praw.Reddit(**reddit_client_kwargs)

In [20]:
# Handle for the subreddit whose "hot" listing is scraped in the next cell.
subreddit = reddit.subreddit("CryptoMoonShots")

In [21]:
# Drain the subreddit's "hot" listing (up to 1000 requested) into a list so
# later cells can iterate it repeatedly; tqdm reports progress while the
# listing is consumed.
submissions = list(tqdm(subreddit.hot(limit=1000)))

715it [01:35,  7.52it/s]


In [22]:
# Build one JSON-serialisable record per submission.
#   corpus             -- records for submissions that were processed successfully
#   dead_letter        -- submissions whose processing raised (same shape as before)
#   dead_letter_errors -- submission id -> repr of the exception, so failures can be
#                         diagnosed after the run instead of being silently dropped
corpus = []
dead_letter = []
dead_letter_errors = {}
for submission in tqdm(submissions):
    try:
        comments = extract_comments(submission)
        corpus.append({
            "id": submission.id,
            "url": submission.url,
            "title": submission.title,
            "text": submission.selftext,
            "created_utc": submission.created_utc,
            "num_comments": submission.num_comments,
            "upvote_ratio": submission.upvote_ratio,
            "score": submission.score,
            "is_original_content": submission.is_original_content,
            "locked": submission.locked,
            "stickied": submission.stickied,
            "comments": comments
        })
    except Exception as e:
        # Best-effort scrape: keep going, but remember what went wrong.
        dead_letter.append(submission)
        dead_letter_errors[submission.id] = repr(e)
        

100%|██████████| 715/715 [25:04<00:00,  2.10s/it]  


In [23]:
import json
# Make sure the output directory is present (no error if it already exists),
# then serialise the whole corpus into a single JSON file, overwriting any
# previous copy.
os.makedirs("data", exist_ok=True)

with open("data/corpus_v2.json", "w") as corpus_file:
    json.dump(corpus, corpus_file)