# 🎯 Objective

This notebook collects Reddit posts and comments from selected subreddits using the Reddit API.  
We authenticate, extract posts and comments, and save them in JSON format.  

---




## 📚 Libraries

In [1]:
import os  
import time  
import json  
import random 
import praw  
from dotenv import load_dotenv 
from datetime import datetime  
import re

# 🔍 Research Design

### **Research Question:**
How do discussions around political topics differ between left-leaning and right-leaning subreddits?

### **Why These Subreddits?**
I selected:
- **r/SandersForPresident** → A leftist political subreddit.
- **r/politics** → A liberal-leaning political subreddit.
- **r/PoliticalDiscussion** → A subreddit centred around political discussion.
- **r/Conservative** → A right-leaning political subreddit.

This helps ensure that both perspectives are catered for.

### **Data Points Collected**
For each **post**, I collect:
- `id` → Unique Reddit post ID
- `subreddit` → Subreddit name
- `title` → Title of the post
- `score` → Upvote count
- `num_comments` → Number of comments
- `created_utc` → Timestamp of creation
- `text` → Post content
- `url` → Link to the post

For each **comment**, I collect:
- `comment_id` → Unique ID of the comment
- `post_id` → ID of the related post
- `body` → Comment text
- `score` → Upvote count
- `created_utc` → Timestamp of creation


# 💻 API Functions

In [2]:
import os
import json
import time
import random
import re
from dotenv import load_dotenv
import praw
from datetime import datetime

# 💻 Reddit API Authentication
def authenticate_reddit():
    """
    Authenticate with the Reddit API using credentials stored in environment variables.

    Returns:
        praw.Reddit: An authenticated Reddit API client instance.
    """
    load_dotenv()
    return praw.Reddit(
        client_id=os.getenv("REDDIT_CLIENT_ID"),
        client_secret=os.getenv("REDDIT_CLIENT_SECRET"),
        user_agent="ElectionScraper"
    )


# 📥 Fetch Posts (Ensure No Duplicates)
def fetch_posts(reddit, subreddits, limit=50):
    """
    Fetches top posts from selected subreddits while ensuring no duplicates.

    Args:
        reddit (praw.Reddit): Authenticated Reddit API client.
        subreddits (list): List of subreddit names.
        limit (int, optional): Number of posts per subreddit.

    Returns:
        list: A list of unique dictionaries containing post details.
    """
    start_date = datetime(2023, 1, 1)
    end_date = datetime(2025, 1, 6)
    
    collected_posts = {}

    for subreddit in subreddits:
        print(f"🔄 Fetching top posts from r/{subreddit}...")

        subreddit_obj = reddit.subreddit(subreddit)
        total_fetched = 0
        temp_posts = []
        
        while len(temp_posts) < limit:
            print(f"📊 Fetching posts from r/{subreddit}... (Total so far: {total_fetched})")

            for post in subreddit_obj.top(time_filter="year", limit=500):
                total_fetched += 1

                if (post.id not in collected_posts and  # Ensure post is not already saved
                    post.num_comments >= 300 and
                    start_date <= datetime.utcfromtimestamp(post.created_utc) <= end_date):

                    # Store unique post
                    collected_posts[post.id] = {
                        "id": post.id,
                        "subreddit": subreddit,
                        "title": post.title,
                        "score": post.score,
                        "num_comments": post.num_comments,
                        "created_utc": datetime.utcfromtimestamp(post.created_utc).isoformat(),
                        "text": post.selftext.strip(),
                        "url": post.url
                    }

                    temp_posts.append(post.id)  # Track posts per subreddit
                    
                    if len(temp_posts) >= limit:
                        break  # Stop when reaching the limit

                time.sleep(0.05)  # ⏳ Prevent rate-limiting
            
            if total_fetched >= 5000:  # Prevent infinite loops
                print(f"Stopping search for r/{subreddit}, reached 5000 posts with only {len(temp_posts)} valid.")
                break

        print(f"✅ Collected {len(temp_posts)} unique posts from r/{subreddit}")

    return list(collected_posts.values())  # Convert dictionary back to list


# Fetch 300 Random Comments per Post (Ensuring Words Are Present)
def fetch_random_comments(reddit, post_id, limit=300):
    """
    Fetches 300 random top-level comments for a given Reddit post, ensuring they contain words.

    Args:
        reddit (praw.Reddit): Authenticated Reddit API client.
        post_id (str): ID of the post to fetch comments for.
        limit (int, optional): Number of comments to retrieve.

    Returns:
        list: A list of dictionaries containing comment details.
    """
    post = reddit.submission(id=post_id)

    try:
        post.comments.replace_more(limit=0)  # Load all top-level comments
    except Exception as e:
        print(f"Error fetching comments for {post_id}: {e}")
        return []

    all_comments = [
        comment for comment in post.comments
        if comment.author and 
        comment.author.name.lower() != "automoderator" and 
        len(comment.body.split()) >= 3 and
        re.search(r'[a-zA-Z]', comment.body)  # ✅ Ensure comment has words (not just links/gifs)
    ]

    random_comments = random.sample(all_comments, min(len(all_comments), limit))

    return [{
        "post_id": post_id,
        "comment_id": comment.id,
        "body": comment.body,
        "score": comment.score,
        "created_utc": datetime.utcfromtimestamp(comment.created_utc).isoformat()
    } for comment in random_comments]
    


# Save Data as JSON
def save_to_json(data, filename):
    """
    Saves a list of dictionaries as a JSON file in the specified directory.

    Args:
        data (list): The data to save.
        filename (str): The filename to save the data to.

    Returns:
        None
    """
    folder = "/files/ds105a-2024-alternative-summative-ajchan03/data/raw/"
    os.makedirs(folder, exist_ok=True)  # Ensure the directory exists
    file_path = os.path.join(folder, filename)

    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)

    


# 📥 Data Collection

In [3]:
# 📥 Step 1: Authenticate Reddit API
reddit = authenticate_reddit()
subreddits = ["politics", "Conservative", "PoliticalDiscussion", "SandersForPresident"]

# 📥 Step 2: Fetch Posts
all_posts = fetch_posts(reddit, subreddits, limit=300)
print(f"✅ Collected {len(all_posts)} posts after filtering.")

# 📥 Step 3: Fetch Comments
all_comments = []
for post in all_posts:
    print(f"📥 Fetching 300 comments for post {post['id']}...")
    comments = fetch_random_comments(reddit, post["id"], limit=300)
    all_comments.extend(comments)

# 📥 Step 4: Save Data
save_to_json(all_posts, "reddit_filtered_posts.json")
save_to_json(all_comments, "reddit_filtered_comments.json")

print("Data collection complete")


🔄 Fetching top posts from r/politics...
📊 Fetching posts from r/politics... (Total so far: 0)
✅ Collected 300 unique posts from r/politics
🔄 Fetching top posts from r/Conservative...
📊 Fetching posts from r/Conservative... (Total so far: 0)
📊 Fetching posts from r/Conservative... (Total so far: 500)
📊 Fetching posts from r/Conservative... (Total so far: 1000)
📊 Fetching posts from r/Conservative... (Total so far: 1500)
📊 Fetching posts from r/Conservative... (Total so far: 2000)
📊 Fetching posts from r/Conservative... (Total so far: 2500)
📊 Fetching posts from r/Conservative... (Total so far: 3000)
📊 Fetching posts from r/Conservative... (Total so far: 3500)
📊 Fetching posts from r/Conservative... (Total so far: 4000)
📊 Fetching posts from r/Conservative... (Total so far: 4500)
⚠️ Stopping search for r/Conservative, reached 5000 posts with only 223 valid.
✅ Collected 223 unique posts from r/Conservative
🔄 Fetching top posts from r/PoliticalDiscussion...
📊 Fetching posts from r/Politica

# ✅ Quality Check

I verify:
- JSON files exist in `/data/raw/`
- The collected data has expected structure
- The number of posts and comments is reasonable




In [5]:
# ✅ Check if JSON Files Exist in the Correct Directory
post_file = "/files/ds105a-2024-alternative-summative-ajchan03/data/raw/reddit_filtered_posts.json"
comment_file = "/files/ds105a-2024-alternative-summative-ajchan03/data/raw/reddit_filtered_comments.json"

if os.path.exists(post_file) and os.path.exists(comment_file):
    print("✔️ JSON files successfully saved in /files/ds105a-2024-alternative-summative-ajchan03/data/raw/")
else:
    print("❌ Error: Files not found!")


✔️ JSON files successfully saved in /files/ds105a-2024-alternative-summative-ajchan03/data/raw/
