In [1]:
# Standard library imports
import os
import time
from datetime import datetime, timedelta

# Third-party imports
import pandas as pd
import praw
from dotenv import load_dotenv


In [2]:
# Load environment variables from .env file
load_dotenv() 
today = datetime.now().strftime("%Y%m%d")
# Configuration Dictionary - Centralized settings for the scraper
CONFIG = {
    # Reddit API credentials (loaded from environment variables)
    "reddit_auth": {
        "client_id": os.getenv('REDDIT_CLIENT_ID'),          
        "client_secret": os.getenv('REDDIT_CLIENT_SECRET'),  
        "user_agent": os.getenv('REDDIT_USER_AGENT')         
    },
    
    # Scraping parameters
    "scrape_params": {
        "subreddits": ["solana", "cryptocurrency", "CryptoMarkets"],  # Target communities
        "search_query": '(selftext:Solana OR selftext:SOL) AND self:yes',                  # Search keywords
        "posts_per_day": 100,                             # Target posts per day per subreddit
        "days_to_scrape": 7,                              # Number of past days to scrape
        "requests_delay": 2,                              # Delay between API calls (avoid rate limits)
        'max_submissions_per_subreddit': 100,             # Max posts to scrape per subreddit
        "sort_by": "new",                                 # Sorting order for posts
    },
    
    # Output configuration
    "output": {
        "filename": f"outputs/solana_reddit_{today}.csv",          # Output CSV filename
    }
}

def scrape_daily_posts(config):
    
    # Initialize Reddit API client
    reddit = praw.Reddit(**config["reddit_auth"])
    all_posts = []  # Master list to store all collected posts
    
    # Loop through each subreddit in the configuration
    for subreddit in config["scrape_params"]["subreddits"]:
        print(f"\nScraping r/{subreddit}...")
        
        # Process each day in the requested time range
        for day_offset in range(config["scrape_params"]["days_to_scrape"]):
            # day_offset: #days to go back from today (0 = today, 1 = yesterday, etc.)
            # Calculate day window (UTC)
            day_end = datetime.now() - timedelta(days=day_offset)
            day_start = day_end - timedelta(days=1)
            day_str = day_end.strftime('%Y-%m-%d')  # Format as YYYY-MM-DD
            
            print(f"  Day {day_offset+1}: {day_str} (Target: {config['scrape_params']['posts_per_day']} posts)")
            
            day_posts = []  # Stores posts for current day
            last_utc = None  # Tracks pagination anchor
            
            # Continue scraping until we reach target or run out of posts
            while len(day_posts) < config["scrape_params"]["posts_per_day"]:
                try:
                    # We use 'after' and 'before' to limit the search based on sorting order is NEWESR first
                    params = {
                        'after': int(day_start.timestamp()),  # Start of day (Unix timestamp), posts should not be older than this
                        'before': int(last_utc) if last_utc else int(day_end.timestamp()) 
                        # Paginate backward through time, ensuring no gaps or duplicates between batches
                        # On first request: day_end timestamp, creating a window for the first batch
                        # Subsequent requests: last_utc timestamp, i.e. the oldest post from previous batch
                    }
                    
                    # Execute search with current parameters
                    submissions = reddit.subreddit(subreddit).search(
                        config["scrape_params"]["search_query"],
                        sort=config["scrape_params"]['sort_by'],      
                        limit=config["scrape_params"]['max_submissions_per_subreddit'],  # Max posts per request (Reddit limit)
                        params=params     # Time filters
                    )
                    
                    # Process batch of submissions
                    batch = []
                    for submission in submissions:
                        # Safety check for post date, skip if older than the start of the day
                        if submission.created_utc < day_start.timestamp():
                            continue
                        # Double-check it's a text post (even though we filtered in search)
                        if not submission.is_self:  # is_self=True means text post
                            continue
                            
                        # Additional quality filters for text posts
                        if len(submission.selftext) < 50:  # Minimum 50 characters
                            continue
                            
                        # Extract relevant fields
                        batch.append({
                            'id': submission.id,        # Unique Reddit post ID
                            'title': submission.title,  # Post title text
                            'content': submission.selftext,  # Main post content/text (empty for link posts)
                            'author': str(submission.author),  # Author's username, str() in case author is deleted: "None"
                            'subreddit': subreddit,     # Which subreddit this comes from
                            'upvotes': submission.ups,      # Raw upvote count
                            'downvotes': submission.downs,  # Raw downvote count
                            'score': submission.score,      # Net upvotes (upvotes - downvotes)
                            'upvote_ratio': submission.upvote_ratio,  # Percentage of upvotes (0-1)
                            'num_comments': submission.num_comments,  # Total number of comments
                            'created_utc': submission.created_utc,    # Creation time (Unix timestamp UTC)
                            'url': submission.url,          # URL to the post or external link
                            'is_self': submission.is_self,  # Boolean - True: text post, False: link post
                            'word_count': len(submission.selftext.split()),  # Add word count
                            'is_media': False,  # Explicitly mark as text

                            # Additional potentially useful attributes:
                            'flair': submission.link_flair_text,  # Post flair text predefined by moderators or users
                            'stickied': submission.stickied,      # Boolean if the post is pinned to the top of the subreddit by moderators
                            'over_18': submission.over_18,        # Boolean if NSFW marked
                            'spoiler': submission.spoiler,        # Boolean if spoiler marked
                            'locked': submission.locked,          # Boolean if comments are disabled (no one can reply)
                            'distinguished': submission.distinguished  # "moderator", "admin", or None (normal user post)
                        })
                        last_utc = submission.created_utc  # Update pagination anchor
                    
                    # Exit if no posts found
                    if not batch:
                        break
                    
                    # Add batch to daily collection
                    day_posts.extend(batch)
                    print(f"    → Collected {len(batch)} (Total: {len(day_posts)})")
                    
                    # Stop if we've reached the end of available posts
                    if len(batch) < config["scrape_params"]['max_submissions_per_subreddit']:
                        break
                        
                    # Respect API rate limits
                    time.sleep(config["scrape_params"]["requests_delay"])
                    
                except Exception as e:
                    print(f"    Error: {str(e)}")
                    time.sleep(10)  # Longer delay on error
                    continue
            
            # Trim to exact target if we collected extra
            collected_posts = day_posts[:config["scrape_params"]["posts_per_day"]]
            all_posts.extend(collected_posts)
            print(f"  Finished {day_str}: {len(collected_posts)} posts")
    
    # Convert to DataFrame and keep only specified columns
    return pd.DataFrame(all_posts)#[config["output"]["fields"]]

# Main execution block
if __name__ == "__main__":
    # Run scraper with configuration
    df = scrape_daily_posts(CONFIG)
    
    # Add human-readable datetime column
    df['created_date'] = pd.to_datetime(df['created_utc'], unit='s')
    
    # Save to CSV
    df.to_csv(CONFIG["output"]["filename"], index=False)
    
    # Print summary report
    print(f"\nReport:")
    print(f"Total posts collected: {len(df)}")


Scraping r/solana...
  Day 1: 2025-03-30 (Target: 100 posts)
    → Collected 1 (Total: 1)
  Finished 2025-03-30: 1 posts
  Day 2: 2025-03-29 (Target: 100 posts)
    → Collected 4 (Total: 4)
  Finished 2025-03-29: 4 posts
  Day 3: 2025-03-28 (Target: 100 posts)
    → Collected 10 (Total: 10)
  Finished 2025-03-28: 10 posts
  Day 4: 2025-03-27 (Target: 100 posts)
    → Collected 16 (Total: 16)
  Finished 2025-03-27: 16 posts
  Day 5: 2025-03-26 (Target: 100 posts)
    → Collected 22 (Total: 22)
  Finished 2025-03-26: 22 posts
  Day 6: 2025-03-25 (Target: 100 posts)
    → Collected 33 (Total: 33)
  Finished 2025-03-25: 33 posts
  Day 7: 2025-03-24 (Target: 100 posts)
    → Collected 37 (Total: 37)
  Finished 2025-03-24: 37 posts

Scraping r/cryptocurrency...
  Day 1: 2025-03-30 (Target: 100 posts)
  Finished 2025-03-30: 0 posts
  Day 2: 2025-03-29 (Target: 100 posts)
    → Collected 1 (Total: 1)
  Finished 2025-03-29: 1 posts
  Day 3: 2025-03-28 (Target: 100 posts)
    → Collected 1 (Tot

In [3]:
# Local application imports
from evaluator import evaluate_dataframe
from utils import save_to_csv

evaluated_reddit = evaluate_dataframe(df[:10])
evaluated_reddit_df = save_to_csv(evaluated_reddit, filename_prefix="evaluated_reddit")
print(f"Evaluated tweet dataframe has shape: {evaluated_reddit_df.shape}")

Evaluating 10 posts...
Evaluation error: invalid literal for int() with base 10: ''
Saved 10 tweets to outputs/evaluated_reddit_20250330.csv
Evaluated tweet dataframe has shape: (10, 30)
