In [4]:
# reddit_rF1_pipeline.py

%pip install textblob praw pandas re datetime

import os
import re
from datetime import datetime

import pandas as pd
import praw
from textblob import TextBlob




ERROR: Could not find a version that satisfies the requirement re (from versions: none)

[notice] A new release of pip is available: 25.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: No matching distribution found for re





In [None]:
# 1. Reddit API setup
# Credentials are read from the environment so real secrets never have to be
# typed into (and committed with) the notebook. Export REDDIT_CLIENT_ID and
# REDDIT_CLIENT_SECRET (optionally REDDIT_USER_AGENT) before starting the kernel.
# The fallbacks are the original placeholders, not working credentials.
CLIENT_ID = os.environ.get("REDDIT_CLIENT_ID", "your_client_id")
CLIENT_SECRET = os.environ.get("REDDIT_CLIENT_SECRET", "your_client_secret")
USER_AGENT = os.environ.get(
    "REDDIT_USER_AGENT", "ds464-rf1-project by u/Hefty_Affect350 v1.0"
)

reddit = praw.Reddit(
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
    user_agent=USER_AGENT,
)

# Data is pulled from r/formula1, which also carries F1 content.
# If we strictly want r/F1 and have access, switch the name here.
subreddit = reddit.subreddit("formula1")
# if we strictly want r/F1 and have access, we can switch the name


In [None]:
# 2. Helper functions for cleaning text
def clean_text(text):
    """Normalize a reddit post/comment body for sentiment scoring.

    Steps, in order: lowercase, normalize curly apostrophes, unwrap markdown
    links (keeping the visible link text), drop bare URLs, replace everything
    except ASCII letters, whitespace and apostrophes with a space, then
    collapse runs of whitespace.

    Args:
        text: Raw text. Anything that is not a ``str`` (``None``, a NaN float
            from pandas, ...) is treated as missing.

    Returns:
        str: The cleaned text, or ``""`` when the input is missing.
    """
    if not isinstance(text, str):
        return ""
    # convert to lowercase
    text = text.lower()
    # map the typographic apostrophe to the ASCII one so contractions such as
    # "don’t" survive the character filter below instead of becoming "don t"
    text = text.replace("\u2019", "'")
    # unwrap markdown links [text](url) -> text. This must run BEFORE the bare-URL
    # pass: otherwise "http\S+" also eats the closing ")" and the link pattern
    # can then swallow unrelated text up to the next ")" in the comment.
    text = re.sub(r"\[([^\]]*)\]\([^)]*\)", r" \1 ", text)
    # remove bare urls
    text = re.sub(r"http\S+|www\S+", " ", text)
    # keep only ASCII letters, whitespace and apostrophes; digits, punctuation
    # and non-ASCII characters (including accented letters) become spaces
    text = re.sub(r"[^a-zA-Z\s']", " ", text)
    # collapse extra spaces
    text = re.sub(r"\s+", " ", text).strip()

    return text

def get_sentiment(text):
    """Return a simple sentiment score for ``text`` using TextBlob.

    Args:
        text: Text to score (normally already cleaned).

    Returns:
        float: TextBlob polarity, from -1 (negative) to 1 (positive).
        Returns 0.0 when ``text`` is empty, ``None`` or not a string, so
        missing values do not reach TextBlob.
    """
    if not isinstance(text, str) or not text:
        return 0.0
    blob = TextBlob(text)
    return float(blob.sentiment.polarity)


[ 1  2 23]
26


In [None]:
# 3. Extract posts and comments
# Tunables for the pull: the window "top" is ranked over and how many posts to take.
TOP_TIME_FILTER = "year"
TOP_POST_LIMIT = 200
# Bodies equal to one of these markers are treated as removed/deleted and skipped.
REMOVED_MARKERS = ("[removed]", "[deleted]")

posts_data = []
comments_data = []

# we take top posts from the last year as a starting point
for submission in subreddit.top(time_filter=TOP_TIME_FILTER, limit=TOP_POST_LIMIT):
    # skip removed or deleted
    if submission.selftext in REMOVED_MARKERS:
        continue

    posts_data.append({
        "post_id": submission.id,
        "title": submission.title,
        "body": submission.selftext,
        "score": submission.score,
        "num_comments": submission.num_comments,
        # Epoch seconds -> naive timestamp expressed in UTC, matching the column
        # name. (datetime.fromtimestamp without a tz would give *local* time.)
        "created_utc": pd.to_datetime(submission.created_utc, unit="s"),
        "author": str(submission.author),
        "flair": str(submission.link_flair_text),
    })

    # Load comments for THIS post. The comment loop has to live inside the
    # submission loop; at top level it would only ever see the last submission.
    # replace_more(limit=0) discards the "load more comments" placeholders
    # instead of fetching them (see PRAW docs), so list() yields only comments.
    submission.comments.replace_more(limit=0)
    for comment in submission.comments.list():
        if isinstance(comment.body, str) and comment.body not in REMOVED_MARKERS:
            comments_data.append({
                "comment_id": comment.id,
                "post_id": submission.id,
                "author": str(comment.author),
                "body": comment.body,
                "score": comment.score,
                "created_utc": pd.to_datetime(comment.created_utc, unit="s"),
                "parent_id": comment.parent_id,
            })

# create dataframes (one row per kept post / comment)
posts_df = pd.DataFrame(posts_data)
comments_df = pd.DataFrame(comments_data)


In [None]:
# 4. Preprocess text and add sentiment
# clean titles and bodies
posts_df["clean_title"] = posts_df["title"].apply(clean_text)
posts_df["clean_body"] = posts_df["body"].apply(clean_text)

# sentiment for post body (average mood of original post)
posts_df["body_sentiment"] = posts_df["clean_body"].apply(get_sentiment)

# clean comments and add sentiment
comments_df["clean_body"] = comments_df["body"].apply(clean_text)
# The assignment must stay on one line: a bare `=` at end of line is a SyntaxError.
comments_df["sentiment"] = comments_df["clean_body"].apply(get_sentiment)


In [None]:
# 5. Derive simple engagement features
# Engagement score for posts (very simple version): comments per point of score.
# The +1 keeps the denominator non-zero for a post whose score is 0.
comment_counts = posts_df["num_comments"]
score_plus_one = posts_df["score"] + 1
posts_df["engagement_score"] = comment_counts / score_plus_one

# Basic time attributes, derived the same way for posts and for comments:
# calendar date, year and month of the creation timestamp.
for frame in (posts_df, comments_df):
    created = frame["created_utc"].dt
    frame["date"] = created.date
    frame["year"] = created.year
    frame["month"] = created.month


In [None]:
# 6. Save cleaned data for warehouse loading
# Each frame goes to its own CSV (relative path), without the index column.
for frame, csv_name in (
    (posts_df, "rf1_posts_clean.csv"),
    (comments_df, "rf1_comments_clean.csv"),
):
    frame.to_csv(csv_name, index=False)

print("Saved rf1_posts_clean.csv and rf1_comments_clean.csv")
print("Reddit r/F1 data extraction and preprocessing complete.")
