In [76]:
import praw as praw
import pandas as pd
import os
import datetime
import re
import emoji
import spacy
from better_profanity import profanity
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import credential

In [78]:
nlp = spacy.load("en_core_web_sm")
analyzer = SentimentIntensityAnalyzer()
profanity.load_censor_words()
def is_question_spacy(text):
    doc = nlp(text)
    return 1 if any(token.dep_ == "aux" and token.head.pos_ == "VERB" for token in doc) else 0

In [79]:
reddit = praw.Reddit(
    client_id= credential.REDDIT_CLIENT_ID,  
    client_secret= credential.REDDIT_CLIENT_SECRET,  
    user_agent= credential.REDDIT_USER_AGENT
)

reddit.read_only = True

In [80]:
subreddit_name = "datascience"
sort_by = "hot"  # Options: 'hot', 'new', 'top', 'rising'
num_posts = 10

In [81]:
data = []
subreddit = reddit.subreddit(subreddit_name)
posts = getattr(subreddit, sort_by)(limit=num_posts)

for post in posts:
    submission = reddit.submission(id=post.id)
    submission.comments.replace_more(limit=5)  # Limit deep threading

    # Get post creation time in UTC
    post_time_utc = datetime.datetime.fromtimestamp(submission.created_utc, datetime.UTC)

    for comment in submission.comments.list():
        # Text features
        text = comment.body
        sentiment_score = analyzer.polarity_scores(text)["compound"]
        text_length = len(text)
        num_words = len(text.split())
        contains_question = is_question_spacy(text)
        contains_emoji = 1 if any(emoji.is_emoji(char) for char in text) else 0
        contains_profanity = 1 if profanity.contains_profanity(text) else 0

        # Metadata features
        comment_time_utc = datetime.datetime.fromtimestamp(comment.created_utc, datetime.UTC)
        comment_age_hours = (comment_time_utc - post_time_utc).total_seconds() / 3600
        is_early_comment = 1 if comment_age_hours <= 1 else 0  # Early = within 1 hour

        comment_hour = comment_time_utc.hour
        comment_day = comment_time_utc.weekday()
        subreddit_name = submission.subreddit.display_name
        parent_score = comment.parent().score if comment.parent_id != submission.id else None

        # User-based features
        user_karma = None
        account_age = None
        
        if comment.author:
            user_karma = comment.author.comment_karma
            account_age = (datetime.datetime.utcnow() - datetime.datetime.utcfromtimestamp(comment.author.created_utc)).days

        # Engagement variables (dependent variables)
        comment_score = comment.score
        num_replies = len(comment.replies)

        # Store data
        data.append({
            "Comment Score": comment_score,
            "Number of Replies": num_replies,
            "Text" : text,
            "Sentiment Score": sentiment_score,
            "Text Length": text_length,
            "Word Count": num_words,
            "Contains Question": contains_question,
            "Contains Emoji": contains_emoji,
            "Contains Profanity": contains_profanity,
            "Comment Age (hours)": comment_age_hours,
            "Comment Hour": comment_hour,
            "Comment Day": comment_day,
            "Subreddit Name": subreddit_name,
            "Is Early Comment": is_early_comment,
            "Parent Score": parent_score,
            "User Karma": user_karma,
            "Account Age (days)": account_age,
        })

  account_age = (datetime.datetime.utcnow() - datetime.datetime.utcfromtimestamp(comment.author.created_utc)).days
  account_age = (datetime.datetime.utcnow() - datetime.datetime.utcfromtimestamp(comment.author.created_utc)).days


In [82]:
df_new = pd.DataFrame(data)

In [83]:
csv_file = "reddit_engagement_data.csv"

if os.path.exists(csv_file):
    df_existing = pd.read_csv(csv_file)
    df_combined = pd.concat([df_existing, df_new], ignore_index=True)
    
    # Remove duplicate rows
    df_combined.drop_duplicates(inplace=True)
    df_combined.to_csv(csv_file, index=False)
    print(f"✅ {len(df_new)} new rows checked. Unique dataset saved to '{csv_file}'.")

else:
    df_new.to_csv(csv_file, index=False)
    print(f"✅ First-time save: {len(df_new)} rows saved to '{csv_file}'.")

✅ 192 new rows checked. Unique dataset saved to 'reddit_engagement_data.csv'.


In [85]:
df_existing = pd.read_csv("reddit_engagement_data.csv")
len(df_existing)

334