#### 🎯 Objective
# This notebook collects top posts and comments from political subreddits during the 2024 election cycle
# to analyze whether extreme sentiment affects engagement.



### 📚 Libraries

In [5]:

import praw  # Reddit API Wrapper
import pandas as pd  # Data handling
from datetime import datetime  # Date formatting
import os  # Environment variable handling
from dotenv import load_dotenv  # Load .env file
import time
import requests



## 🔍 Research Design
### Research Question: 
### Do posts in political subreddits during the 2024 election cycle with extreme sentiment (very positive or negative) receive more engagement?

### Justification for Subreddit Selection:
### - Subreddits like 'politics', 'election2024', and 'PoliticalDiscussion' contain high-volume election-related discussions.
### - These communities represent diverse political opinions and engagement levels.

### Data Points Collected:
### - **Post-level data**: ID, title, score, number of comments, timestamp, and URL.
### - **Comment-level data**: ID, post ID, body, score, timestamp.

## 💻 API Functions
### Load Environment Variables

In [5]:
import os
import time
import json
import random
import praw
from dotenv import load_dotenv
from datetime import datetime

# Load environment variables
load_dotenv()

# ✅ Reddit API Authentication
def authenticate_reddit():
    """Authenticate with the Reddit API using credentials stored in environment variables."""
    return praw.Reddit(client_id=os.getenv("REDDIT_CLIENT_ID"),
                       client_secret=os.getenv("REDDIT_CLIENT_SECRET"),
                       user_agent="ElectionScraper")

# ✅ Filter Function for Date Range
def is_within_date_range(post, start_date, end_date):
    """Checks if a post's creation date is within the given range."""
    post_date = datetime.utcfromtimestamp(post.created_utc)
    return start_date <= post_date <= end_date

# ✅ Fetch Posts Until 50 Valid Ones Are Collected
def fetch_posts(reddit, subreddits, limit=50):
    """Continuously fetches posts until 50 valid ones are collected per subreddit."""
    
    start_date = datetime(2024, 3, 1)  # March 1, 2024
    end_date = datetime(2025, 1, 6)    # January 6, 2025
    
    collected_posts = []
    for subreddit in subreddits:
        print(f"🔄 Fetching top posts from r/{subreddit}...")

        subreddit_obj = reddit.subreddit(subreddit)
        potential_posts = []
        chunk_size = 500  # Fetch posts in chunks of 500
        total_fetched = 0

        while len(potential_posts) < limit:
            print(f"📊 Fetching next {chunk_size} posts from r/{subreddit}... (Total so far: {total_fetched})")

            for post in subreddit_obj.top(time_filter="year", limit=chunk_size):
                total_fetched += 1
                
                if (is_within_date_range(post, start_date, end_date) and 
                    not post.is_video and 
                    not post.url.endswith(('.gif', '.jpg', '.png', '.mp4', '.webm')) and 
                    len(post.selftext.split()) >= 5 and 
                    post.num_comments >= 100):  # Ensure at least 100 comments

                    potential_posts.append({
                        "id": post.id,
                        "subreddit": subreddit,
                        "title": post.title,
                        "score": post.score,
                        "num_comments": post.num_comments,
                        "created_utc": datetime.utcfromtimestamp(post.created_utc).isoformat(),
                        "text": post.selftext.strip(),
                        "url": post.url
                    })
                    
                    if len(potential_posts) >= limit:
                        break  # Stop if we reach the limit
                
                time.sleep(0.05)  # ⏳ Small delay to avoid rate-limiting

            if total_fetched >= 5000:  # Prevent infinite loops
                print(f"⚠️ Stopping search for r/{subreddit}, reached 5000 posts with only {len(potential_posts)} valid.")
                break

        print(f"✅ Collected {len(potential_posts)} valid posts from r/{subreddit}")

        collected_posts.extend(potential_posts)

    return collected_posts

# ✅ Fetch 100 Random Comments per Post
def fetch_random_comments(reddit, post_id, limit=100):
    """Fetches 100 random top-level comments for a given Reddit post."""
    post = reddit.submission(id=post_id)
    
    for attempt in range(3):  # Auto-retry up to 3 times
        try:
            post.comments.replace_more(limit=0)  # Load all top-level comments
            break  # If successful, exit retry loop
        except Exception as e:
            print(f"⚠️ Error fetching comments for {post_id}: {e}. Retrying...")
            time.sleep(2 ** attempt)  # Exponential backoff

    all_comments = [
        comment for comment in post.comments 
        if comment.author and comment.author.name.lower() != "automoderator"  # Exclude bot comments
        and len(comment.body.split()) >= 3  # Ensure meaningful comments
    ]
    
    if len(all_comments) < limit:
        print(f"⚠️ Only found {len(all_comments)} valid comments for post {post_id}.")

    random_comments = random.sample(all_comments, min(len(all_comments), limit))  # Randomly pick 100 comments

    comment_data = []
    for comment in random_comments:
        comment_data.append({
            "post_id": post_id,
            "comment_id": comment.id,
            "body": comment.body,
            "score": comment.score,
            "created_utc": datetime.utcfromtimestamp(comment.created_utc).isoformat()
        })

    time.sleep(0.05)  # ⏳ Small delay to avoid rate-limiting
    return comment_data

# ✅ Save Data as JSON
def save_to_json(data, filename):
    """Saves data as a JSON file."""
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)

# 📥 **Data Collection**
reddit = authenticate_reddit()
subreddits = ["politics", "Conservative"]  # Balanced subreddit selection

# **Step 1: Fetch Posts Until 50 Valid Are Found Per Subreddit**
all_posts = fetch_posts(reddit, subreddits, limit=50)
print(f"✅ Collected {len(all_posts)} posts after filtering.")

# **Step 2: Fetch 100 Random Comments per Post**
all_comments = []
for post in all_posts:
    print(f"📥 Fetching 100 random comments for post {post['id']}...")
    comments = fetch_random_comments(reddit, post["id"], limit=100)
    all_comments.extend(comments)

# **Step 3: Save Data to JSON**
save_to_json(all_posts, "reddit_filtered_posts.json")
save_to_json(all_comments, "reddit_filtered_comments.json")

print("✅ Data collection complete! JSON files saved.")

# ✅ **Quality Check**
if os.path.exists("reddit_filtered_posts.json") and os.path.exists("reddit_filtered_comments.json"):
    print("✔️ JSON files successfully saved!")
else:
    print("❌ Error: Files not found!")


🔄 Fetching top posts from r/politics...
📊 Fetching next 500 posts from r/politics... (Total so far: 0)
📊 Fetching next 500 posts from r/politics... (Total so far: 500)
📊 Fetching next 500 posts from r/politics... (Total so far: 1000)
📊 Fetching next 500 posts from r/politics... (Total so far: 1500)
📊 Fetching next 500 posts from r/politics... (Total so far: 2000)
📊 Fetching next 500 posts from r/politics... (Total so far: 2500)
📊 Fetching next 500 posts from r/politics... (Total so far: 3000)
📊 Fetching next 500 posts from r/politics... (Total so far: 3500)
✅ Collected 50 valid posts from r/politics
🔄 Fetching top posts from r/Conservative...
📊 Fetching next 500 posts from r/Conservative... (Total so far: 0)
📊 Fetching next 500 posts from r/Conservative... (Total so far: 500)
✅ Collected 50 valid posts from r/Conservative
✅ Collected 100 posts after filtering.
📥 Fetching 100 random comments for post 1d4emcb...
📥 Fetching 100 random comments for post 1elhbeb...
📥 Fetching 100 random com