Code to scrape Reddit threads using the PRAW API, filtering by keywords for a more focused search on political leaders and topics in the US elections between 2021 and 2024.

In [46]:
import os
import re
import time
from datetime import datetime, timezone

import pandas as pd
import praw

# Initialize Reddit API client.
# Credentials are taken from the environment (REDDIT_CLIENT_ID,
# REDDIT_CLIENT_SECRET, REDDIT_USER_AGENT) so they do not have to live in the
# source file.
# NOTE(review): the literal fallbacks below are the values that were previously
# hard-coded here; they are kept only so existing runs keep working. Because
# they have been stored in plain text, the secret should be rotated and the
# fallbacks removed.
reddit = praw.Reddit(
    client_id=os.environ.get("REDDIT_CLIENT_ID", "QBK7L08KqtcVUiTNv3R5bw"),
    client_secret=os.environ.get("REDDIT_CLIENT_SECRET", "yLGGWmBe7oKHArHhjCxQ5WOuZgpOLg"),
    user_agent=os.environ.get("REDDIT_USER_AGENT", "Thread_Scraper:v1.0 (by /u/Single_Instance4107)"),
)

# Function to extract hashtags from text
def extract_hashtags(text):
    """Return every ``#word`` token in *text*, in order of appearance.

    A token is a ``#`` immediately followed by one or more word characters
    (letters, digits or underscore). Duplicates are kept.
    """
    return [match.group() for match in re.finditer(r"#\w+", text)]

# Function to extract matching keywords from text
def extract_keywords(text, keywords):
    """Return the keywords that occur in *text*, ignoring case.

    Args:
        text: String to search.
        keywords: Iterable of keyword strings or phrases.

    Returns:
        A list of the matching keywords, spelled as given in *keywords* and
        in the same order. Matching is a plain substring test, so a keyword
        also matches inside a longer word.
    """
    # Lowercase the text once instead of once per keyword.
    haystack = text.lower()
    return [keyword for keyword in keywords if keyword.lower() in haystack]

# Function to scrape posts
def scrape_reddit_posts(subreddits, keywords, start_date=None, end_date=None, upvote_threshold=0, max_posts_per_subreddit=1000):
    """Collect keyword-matching submissions from each subreddit's newest listing.

    Args:
        subreddits: Iterable of subreddit names.
        keywords: Iterable of keyword strings or phrases. A submission is kept
            when at least one keyword occurs, case-insensitively, in its title
            or in its selftext.
        start_date: Naive ``datetime`` in UTC; submissions created before it
            are skipped. ``None`` means no lower bound.
        end_date: Naive ``datetime`` in UTC; submissions created after it are
            skipped. ``None`` means no upper bound.
        upvote_threshold: Minimum ``submission.score`` for a post to be kept.
        max_posts_per_subreddit: Stop reading a subreddit once this many posts
            have been kept from it. A value of zero or less collects nothing.

    Returns:
        A list of dicts, one per kept submission, with the keys ``title``,
        ``selftext``, ``upvotes``, ``num_comments``, ``flair``,
        ``created_utc`` (naive UTC ``datetime``), ``permalink``,
        ``subreddit``, ``hashtags`` and ``matched_keywords``.

    Any exception raised while reading a subreddit is printed and that
    subreddit is abandoned; posts collected up to that point are kept and the
    remaining subreddits are still processed.
    """
    posts = []
    if max_posts_per_subreddit <= 0:
        return posts

    # Materialise once so a one-shot iterable can be reused for every post,
    # and lowercase each keyword once instead of once per submission.
    keywords = list(keywords)
    lowered_keywords = [keyword.lower() for keyword in keywords]

    for subreddit in subreddits:
        try:
            subreddit_obj = reddit.subreddit(subreddit)
            count = 0
            print(f"Scraping subreddit: {subreddit}")  # Log current subreddit
            # NOTE(review): Reddit listings are believed to be capped at
            # roughly 1000 items, so old date ranges may be unreachable
            # through .new() - confirm against the PRAW documentation.
            for submission in subreddit_obj.new(limit=None):  # Fetch newest posts
                # created_utc is a POSIX timestamp; convert it to naive UTC so
                # the stored value matches its name and compares consistently
                # with the (naive) date bounds regardless of the local zone.
                created_time = datetime.fromtimestamp(
                    submission.created_utc, tz=timezone.utc
                ).replace(tzinfo=None)

                # Filter by date, upvotes, and keywords
                if start_date is not None and created_time < start_date:
                    continue
                if end_date is not None and created_time > end_date:
                    continue
                if submission.score < upvote_threshold:
                    continue

                title_lower = submission.title.lower()
                body_lower = submission.selftext.lower()
                if not any(
                    keyword in title_lower or keyword in body_lower
                    for keyword in lowered_keywords
                ):
                    continue

                full_text = submission.title + " " + submission.selftext
                posts.append({
                    "title": submission.title,
                    "selftext": submission.selftext,
                    "upvotes": submission.score,
                    "num_comments": submission.num_comments,
                    "flair": submission.link_flair_text,
                    "created_utc": created_time,
                    "permalink": submission.permalink,
                    "subreddit": subreddit,
                    "hashtags": extract_hashtags(full_text),
                    "matched_keywords": extract_keywords(full_text, keywords),
                })
                count += 1
                if count >= max_posts_per_subreddit:
                    break
            time.sleep(1)  # Respect Reddit's API rate limits
        except Exception as e:
            # Best effort: report the failure and move on to the next subreddit.
            print(f"Error scraping subreddit {subreddit}: {e}")
    return posts

# Scrape configuration

# Subreddits to read, in the order they are scraped
subreddits = [
    "politics",
    "PoliticalDiscussion",
    "news",
    "Conservative",
    "liberal",
    "election2024",
    # NOTE(review): the recorded run in this file logged
    # "Redirect to /subreddits/search" for the next name - confirm it exists.
    "PresidentialRace",
    "AskPolitics",
    "AmericanPolitics",
    "uspolitics",
    "worldpolitics",
    "VoteBlue",
    "VoteRed",
    "Republican",
    "PoliticalHumor",
    "Elections",
    "Healthcare",
    "Immigration",
    "ClimateActionPlan",
    "Environment",
]

# Keywords and phrases with a U.S. focus
keywords = [
    # Issues
    "election", "abortion", "healthcare reform", "climate change",
    "gun control", "economic policy", "reproductive rights",
    "voter suppression",
    # Political figures
    "Joe Biden", "Donald Trump", "Ron DeSantis", "Kamala Harris",
    "Mike Pence",
    # Events and talking points
    "January 6th", "Hunter Biden", "Trump indictment",
    "classified documents", "MAGA", "Bidenomics", "Ukraine aid",
    "Supreme Court rulings",
    # Parties
    "Democrat", "Republican",
    # Places
    "Texas", "Florida", "New York", "California", "America",
    "United States",
]

# Creation-time window for the posts to keep.
# NOTE(review): the description at the top of this file says 2021-2024, but
# the window starts on 2020-01-01 - confirm which is intended.
start_date = datetime(year=2020, month=1, day=1)
end_date = datetime(year=2024, month=11, day=5)

# Run the scrape with the configured subreddits, keywords and date window,
# keeping posts with a score of at least 10 and at most 1000 per subreddit
posts = scrape_reddit_posts(
    subreddits,
    keywords,
    start_date=start_date,
    end_date=end_date,
    upvote_threshold=10,
    max_posts_per_subreddit=1000,
)

# Tabulate the results: one row per collected post
df = pd.DataFrame(posts)

# Write the table to disk without the index column
df.to_csv(
    "reddit_2024_elections_posts_with_keywords.csv",
    index=False,
)

print(f"Scraped {len(df)} posts. Data saved to 'reddit_2024_elections_posts_with_keywords.csv'.")


Scraping subreddit: politics
Scraping subreddit: PoliticalDiscussion
Scraping subreddit: news
Scraping subreddit: Conservative
Scraping subreddit: liberal
Scraping subreddit: election2024
Scraping subreddit: PresidentialRace
Error scraping subreddit PresidentialRace: Redirect to /subreddits/search
Scraping subreddit: AskPolitics
Scraping subreddit: AmericanPolitics
Scraping subreddit: uspolitics
Scraping subreddit: worldpolitics
Scraping subreddit: VoteBlue
Scraping subreddit: VoteRed
Scraping subreddit: Republican
Scraping subreddit: PoliticalHumor
Scraping subreddit: Elections
Scraping subreddit: Healthcare
Scraping subreddit: Immigration
Scraping subreddit: ClimateActionPlan
Scraping subreddit: Environment
Scraped 1579 posts. Data saved to 'reddit_2024_elections_posts_with_keywords.csv'.
