#### Lab 4 (Part 1) - PRAW Search Function

In this lab, you will attempt to "search" posts based on one or more keywords.

Ref: https://www.reddit.com/dev/api/#GET_search
https://praw.readthedocs.io/en/latest/code_overview/models/subreddit.html#praw.models.Subreddit.search

In [1]:
# Make sure the praw package is installed in your Anaconda environment before running this cell.
import os
from datetime import datetime, timezone

import pandas as pd
import praw

# The values for the following fields are obtained from Reddit.
# You should have created a Reddit app and gotten these values.
# They are read from environment variables so that no secret is stored in the
# notebook; a missing variable raises KeyError right here instead of surfacing
# later as an authentication failure.
# NOTE: credentials that were ever saved in a notebook in plain text should be
# treated as compromised and rotated (new app secret, new account password).
# We first create a reddit object bound to variable `reddit`

reddit = praw.Reddit(
    client_id=os.environ["REDDIT_CLIENT_ID"],  # See slide 9 & 10 of Lab 2 Guide
    client_secret=os.environ["REDDIT_CLIENT_SECRET"],  # See slide 9 & 10 of Lab 2 Guide
    username=os.environ["REDDIT_USERNAME"],  # See slide 4 (step 3) of Lab 2 Guide
    password=os.environ["REDDIT_PASSWORD"],  # See slide 4 (step 3) of Lab 2 Guide
    user_agent="IS434_Project",  # See slide 9 & 10 of Lab 2 Guide
)

In [None]:
# GET RELEVANT REDDIT POSTS
# Searches every target subreddit for the keyword, records one row per matching
# submission, and writes the result to an Excel workbook.
subreddits = ['singapore', 'SingaporeFitness', 'askSingapore']
search_keyword = 'Anytime Fitness'
posts_data = []

for subreddit_name in subreddits:
    subreddit = reddit.subreddit(subreddit_name)
    for submission in subreddit.search(search_keyword, sort='relevance', time_filter='all', limit=None):
        # Only the net score (ups - downs) and the upvote ratio (ups / (ups + downs))
        # are available. Solving those two equations for downs gives
        #     downs = score * (1 - ratio) / (2 * ratio - 1)
        # which is undefined at ratio == 0.5 (the total vote count cannot be recovered).
        # The result is an estimate only and is clamped at zero.
        score = submission.score
        ratio = submission.upvote_ratio
        if ratio is None or ratio == 0.5:
            downvotes = None
        else:
            downvotes = max(0, round(score * (1 - ratio) / (2 * ratio - 1)))

        # created_utc is a Unix timestamp; convert it explicitly in UTC so the column
        # matches its name regardless of the machine's time zone. tzinfo is stripped
        # afterwards because Excel cannot store timezone-aware datetimes.
        created_utc = datetime.fromtimestamp(
            int(submission.created_utc), tz=timezone.utc
        ).replace(tzinfo=None)

        posts_data.append({
            'ID': submission.id,
            'Subreddit': subreddit_name,
            'Author': str(submission.author),
            'Created_UTC': created_utc,
            'Title': submission.title,
            'Content': submission.selftext,
            'Upvotes': score,  # Net score; Reddit API does not provide explicit upvotes
            'Downvotes': downvotes,  # Estimated from score and upvote ratio
            'Num_Comments': submission.num_comments,
            'Link': f"https://www.reddit.com{submission.permalink}"
        })

# Convert list of dicts to DataFrame
df = pd.DataFrame(posts_data)

# Save to Excel
df.to_excel('reddit_singapore_anytimefitness_posts.xlsx', index=False)

In [None]:
# AFTER CLEANING AND KEEPING RELEVANT POSTS, GET THE COMMENTS OF THESE POSTS

import pandas as pd
from datetime import datetime

# Load the cleaned list of relevant post IDs from your edited post file
posts_df = pd.read_excel('reddit_singapore_anytimefitness_posts.xlsx')
relevant_ids = set(posts_df['ID'].astype(str))  # ensure match by string type

rows = []
matched_ids = set()  # relevant IDs that the search actually returned

subreddits = ['singapore', 'SingaporeFitness', 'askSingapore']

for subreddit_name in subreddits:
    subreddit = reddit.subreddit(subreddit_name)
    # Re-run the keyword search and keep only the submissions whose IDs are still
    # present in the cleaned post file.
    for submission in subreddit.search('Anytime Fitness', sort='relevance', time_filter='all', limit=None):
        if submission.id not in relevant_ids:
            continue  # skip non-relevant posts
        matched_ids.add(submission.id)

        # Only the net score (ups - downs) and the upvote ratio (ups / (ups + downs))
        # are available. Solving those two equations for downs gives
        #     downs = score * (1 - ratio) / (2 * ratio - 1)
        # which is undefined at ratio == 0.5. The result is an estimate, clamped at zero.
        score = submission.score
        ratio = submission.upvote_ratio
        if ratio is None or ratio == 0.5:
            downvotes = None
        else:
            downvotes = max(0, round(score * (1 - ratio) / (2 * ratio - 1)))

        # Convert the Unix timestamp explicitly in UTC so the column matches its name;
        # tzinfo is stripped because Excel cannot store timezone-aware datetimes.
        created_utc = datetime.fromtimestamp(
            int(submission.created_utc), tz=timezone.utc
        ).replace(tzinfo=None)

        post_fields = {
            'ID': submission.id,
            'Subreddit': subreddit_name,
            'Author': str(submission.author),
            'Created_UTC': created_utc,
            'Title': submission.title,
            'Content': submission.selftext,
            'Upvotes': score,  # Net score; explicit upvotes are not available
            'Downvotes': downvotes,  # Estimated from score and upvote ratio
            'Num_Comments': submission.num_comments,
            'Link': f"https://www.reddit.com{submission.permalink}"
        }

        # Fetch and flatten all comments (one output row per comment; a post
        # without comments contributes no rows)
        submission.comments.replace_more(limit=None)
        flat_comments = submission.comments.list()
        for idx, comment in enumerate(flat_comments, 1):
            row = post_fields.copy()
            row['comment_number'] = idx
            row['comment'] = comment.body
            row['comment_upvotes'] = comment.score
            row['comment_downvotes'] = None  # Not available from Reddit API
            rows.append(row)

# Report relevant posts that the search did not return, instead of dropping them silently
missing_ids = relevant_ids - matched_ids
if missing_ids:
    print(f"Warning: {len(missing_ids)} relevant post(s) were not returned by the search: {sorted(missing_ids)}")

# Convert to DataFrame and save to Excel
columns = [
    'ID','Subreddit','Author','Created_UTC','Title','Content','Upvotes','Downvotes',
    'Num_Comments','Link','comment_number','comment','comment_upvotes','comment_downvotes'
]
df = pd.DataFrame(rows, columns=columns)
df.to_excel('reddit_singapore_anytimefitness_comments.xlsx', index=False)
print(f"Saved {len(df)} rows to reddit_singapore_anytimefitness_comments.xlsx")


Saved 3322 rows to reddit_singapore_anytimefitness_comments.xlsx


In [None]:
import nltk
# Download the 'vader_lexicon' resource through NLTK's downloader; the sentiment
# cell that follows builds a SentimentIntensityAnalyzer from nltk.sentiment.vader.
nltk.download('vader_lexicon')

In [None]:
# GET SENTIMENT SCORE OF COMMENTS

from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Build the VADER analyzer used by the scoring function defined in this cell
analyzer = SentimentIntensityAnalyzer()

# Read the comments workbook produced by the comment-collection cell
excel_filename = 'reddit_singapore_anytimefitness_comments.xlsx'
df = pd.read_excel(excel_filename)

# Score a single comment with VADER and bucket it into a sentiment label
def analyze_comment_sentiment(comment):
    """Return the VADER compound score and a sentiment label for one comment.

    The comment is converted with ``str()`` before scoring. The label is
    "Positive" for a compound score >= 0.05, "Negative" for a score <= -0.05,
    and "Neutral" otherwise.

    Returns:
        pd.Series: two elements, ``[compound_score, label]``.
    """
    compound_score = analyzer.polarity_scores(str(comment))["compound"]
    if compound_score >= 0.05:
        return pd.Series([compound_score, "Positive"])
    if compound_score <= -0.05:
        return pd.Series([compound_score, "Negative"])
    return pd.Series([compound_score, "Neutral"])

# Score every comment; each call yields a (compound score, label) pair,
# so the result has one column per element
sentiment_columns = df['comment'].apply(analyze_comment_sentiment)
df[['comment_compound_score', 'comment_sentiment_label']] = sentiment_columns

# Write the enriched table back over the workbook it was loaded from
df.to_excel(excel_filename, index=False)

print("Sentiment columns added and Excel overwritten in place.")


Sentiment columns added and Excel overwritten in place.
