In [3]:
import praw
import asyncio
import asyncpraw
import pandas as pd
from dotenv import load_dotenv
import os


In [7]:
#######################
#ENVIRONMENT VARIABLES#
#######################

# Read the Reddit API credentials from a dotenv file so they are never
# hard-coded in the notebook. The path is relative to the kernel's working
# directory.
load_dotenv("config.env")

client_id = os.getenv("REDDIT_CLIENT_ID")
client_secret = os.getenv("REDDIT_CLIENT_SECRET")
user_agent = os.getenv("REDDIT_USER_AGENT")

# os.getenv returns None for anything that was not loaded. Check the settings
# the Reddit client cannot work without right here, so a missing or incomplete
# config.env is reported by variable name instead of surfacing later as a
# MissingRequiredAttributeException from the client constructor.
# REDDIT_CLIENT_SECRET is deliberately not enforced: Reddit "installed"
# applications authenticate without a secret.
_missing_settings = [
    env_name
    for env_name, value in (
        ("REDDIT_CLIENT_ID", client_id),
        ("REDDIT_USER_AGENT", user_agent),
    )
    if not value
]
if _missing_settings:
    raise RuntimeError(
        "Missing Reddit API settings: "
        + ", ".join(_missing_settings)
        + ". Define them in config.env (in the notebook's working directory) "
        "or export them before starting the kernel."
    )

In [8]:
###########################
#REDDIT API AUTHENTICATION#
###########################

# Module-level Async PRAW client, built from the credentials read from the
# environment. fetch_posts() and fetch_comments() look this name up as a
# global, so this cell must run successfully before either of them is called.
reddit = asyncpraw.Reddit(
    client_id=client_id,
    client_secret=client_secret,
    user_agent=user_agent
)

MissingRequiredAttributeException: Required configuration setting 'client_id' missing. 
This setting can be provided in a praw.ini file, as a keyword argument to the Reddit class constructor, or as an environment variable.

In [None]:
# Parameters
# Subreddits to scrape; main() writes <name>_posts.csv and <name>_comments.csv
# for each entry.
subreddits = ["SanDiego", "Portland"]
# Maximum number of new (not previously saved) posts collected per subreddit
# in one run.
post_limit = 500
# Maximum number of new comments collected per subreddit in one run; the
# budget is shared across all of that subreddit's newly fetched posts.
comments_limit = 1000

In [None]:
###########
#FUNCTIONS#
###########

def _read_id_column(csv_path, column):
    """Return the non-null values of ``column`` in ``csv_path`` as a set of strings.

    A missing or zero-byte file yields an empty set.
    """
    if not (os.path.exists(csv_path) and os.path.getsize(csv_path) > 0):
        return set()
    # dtype=str disables pandas' type inference. Reddit IDs are base-36 text,
    # so a column holding only values such as "123456" or "1e5" would
    # otherwise load as int/float and never compare equal to the string IDs
    # returned by the API.
    ids = pd.read_csv(csv_path, dtype=str)[column]
    return set(ids.dropna())


def load_existing_ids(subreddit_name):
    """Load the IDs already saved to disk for one subreddit.

    Looks for ``<subreddit_name>_posts.csv`` (column ``id``) and
    ``<subreddit_name>_comments.csv`` (column ``comment_id``) in the current
    working directory.

    Args:
        subreddit_name: Subreddit name used as the CSV filename prefix.

    Returns:
        A ``(existing_post_ids, existing_comment_ids)`` tuple of sets of
        strings. Each set is empty when its file is missing or empty.

    Raises:
        KeyError: If a file exists but lacks the expected ID column.
    """
    existing_post_ids = _read_id_column(f"{subreddit_name}_posts.csv", "id")
    existing_comment_ids = _read_id_column(
        f"{subreddit_name}_comments.csv", "comment_id"
    )
    return existing_post_ids, existing_comment_ids

async def fetch_posts(subreddit_name, existing_post_ids, limit=500):
    """Collect up to ``limit`` not-yet-saved posts from a subreddit's "new" listing.

    Args:
        subreddit_name: Name of the subreddit to read.
        existing_post_ids: Set of post IDs to skip. It is updated in place
            with the ID of every post that gets collected.
        limit: Maximum number of new posts to return.

    Returns:
        A list of dicts, one per collected post, with the keys ``id``,
        ``title``, ``score``, ``created_utc``, ``selftext``,
        ``num_comments`` and ``subreddit``.
    """
    print(f"Fetching posts from r/{subreddit_name}...")
    listing_source = await reddit.subreddit(subreddit_name)
    collected = []

    # The listing itself is requested without a cap; the cap applies to how
    # many *new* posts are kept, so already-saved posts do not use it up.
    async for submission in listing_source.new(limit=None):
        if len(collected) >= limit:
            break
        if submission.id not in existing_post_ids:
            record = {
                "id": submission.id,
                "title": submission.title,
                "score": submission.score,
                "created_utc": submission.created_utc,
                "selftext": submission.selftext,
                "num_comments": submission.num_comments,
                "subreddit": subreddit_name
            }
            collected.append(record)
            # Remember the ID right away so it cannot be collected twice.
            existing_post_ids.add(submission.id)

    print(f"Fetched {len(collected)} new posts from r/{subreddit_name}.")
    return collected

async def fetch_comments(post_ids, existing_comment_ids, limit=1000):
    """Collect up to ``limit`` not-yet-saved comments from the given posts.

    Posts are processed in the order given. For each one the submission is
    fetched, its "load more" placeholders are dropped
    (``replace_more(limit=0)``), and the flattened comment list is scanned.

    Args:
        post_ids: Iterable of submission IDs whose comments should be read.
        existing_comment_ids: Set of comment IDs to skip. It is updated in
            place with the ID of every comment that gets collected.
        limit: Maximum total number of new comments to return across all
            posts.

    Returns:
        A list of dicts with the keys ``post_id``, ``comment_id``, ``body``,
        ``score`` and ``created_utc``.
    """
    print("Fetching comments...")
    comments = []

    for post_id in post_ids:
        # Once the budget is used up, stop before requesting another
        # submission: every further fetch would be an API round-trip whose
        # result is thrown away.
        if len(comments) >= limit:
            break

        submission = await reddit.submission(id=post_id)
        await submission.comments.replace_more(limit=0)

        for comment in submission.comments.list():
            if len(comments) >= limit:
                break
            if comment.id in existing_comment_ids:
                continue  # Skip already saved comments
            comments.append({
                "post_id": post_id,
                "comment_id": comment.id,
                "body": comment.body,
                "score": comment.score,
                "created_utc": comment.created_utc
            })
            existing_comment_ids.add(comment.id)  # Track in-memory

    print(f"Fetched {len(comments)} new comments.")
    return comments

def _append_rows_to_csv(rows, csv_path):
    """Append ``rows`` (a list of dicts) to ``csv_path``, adding a header only if the file is missing or empty."""
    # A zero-byte file needs a header too: load_existing_ids() treats such a
    # file as "nothing saved yet", and rows appended without a header would
    # make the next read of the ID column fail.
    needs_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0
    pd.DataFrame(rows).to_csv(csv_path, mode="a", header=needs_header, index=False)


async def main():
    """Scrape new posts and their comments for every configured subreddit.

    For each name in the module-level ``subreddits`` list:

    1. load the post/comment IDs already stored in the subreddit's CSV files,
    2. fetch up to ``post_limit`` new posts and append them to
       ``<name>_posts.csv``,
    3. fetch up to ``comments_limit`` new comments belonging to those newly
       fetched posts and append them to ``<name>_comments.csv``.

    Nothing is written for a subreddit when no new rows were fetched.

    NOTE: comments are requested only for posts fetched in the current run,
    so posts saved by an earlier run are not revisited for comments.
    """
    for subreddit_name in subreddits:
        # Load existing IDs
        existing_post_ids, existing_comment_ids = load_existing_ids(subreddit_name)

        # Fetch new posts
        posts = await fetch_posts(subreddit_name, existing_post_ids, limit=post_limit)

        # Save posts (append to CSV)
        posts_file = f"{subreddit_name}_posts.csv"
        if posts:
            _append_rows_to_csv(posts, posts_file)
            print(f"Saved {len(posts)} posts to {posts_file}")

        # Fetch comments for new posts
        post_ids = [post["id"] for post in posts]
        comments = await fetch_comments(post_ids, existing_comment_ids, limit=comments_limit)

        # Save comments
        comments_file = f"{subreddit_name}_comments.csv"
        if comments:
            _append_rows_to_csv(comments, comments_file)
            print(f"Saved {len(comments)} comments to {comments_file}")

In [103]:
# Run the async main loop
# Top-level `await` is notebook-only syntax: it relies on IPython running the
# cell inside its own event loop. In a plain .py script this would have to be
# asyncio.run(main()) instead.
# NOTE(review): the `reddit` client is never closed in the visible cells —
# confirm whether an explicit `await reddit.close()` is wanted once scraping
# is finished.
await main()
print("Finished.")

Fetching posts from r/SanDiego...
Fetched 9 new posts from r/SanDiego.
Saved 9 posts to SanDiego_posts.csv
Fetching comments...
Fetched 136 new comments.
Saved 136 comments to SanDiego_comments.csv
Fetching posts from r/Portland...
Fetched 30 new posts from r/Portland.
Saved 30 posts to Portland_posts.csv
Fetching comments...
Fetched 522 new comments.
Saved 522 comments to Portland_comments.csv
Finished.
