# This notebook is for extracting comments regarding stock discussions from Reddit.

In [2]:
import praw
import datetime
import os
import yaml
import pandas as pd
from datetime import timezone  # Import the timezone class

def fetch_comments(subreddit_name='WallStreetBets', limit=3000):
    """Collect comments from a subreddit via PRAW and return them as a DataFrame.

    The Reddit API credentials (``client_id`` and ``client_secret``) are read
    from ``../config/config.yaml`` on every call.

    Args:
        subreddit_name: Name of the subreddit whose comment listing is read.
        limit: Value passed as ``limit`` to ``subreddit.comments``.

    Returns:
        A DataFrame with the columns ``Datetime`` (timezone-aware UTC datetime
        built from each comment's ``created_utc``), ``Body`` and ``Score``.
    """
    with open('../config/config.yaml', 'r') as config_file:
        credentials = yaml.safe_load(config_file)

    client = praw.Reddit(
        client_id=credentials['client_id'],
        client_secret=credentials['client_secret'],
        user_agent="my user agent",
    )

    # One row per comment; the epoch timestamp is converted to an aware UTC
    # datetime so later date comparisons are unambiguous.
    rows = [
        [
            datetime.datetime.fromtimestamp(item.created_utc, timezone.utc),
            item.body,
            item.score,
        ]
        for item in client.subreddit(subreddit_name).comments(limit=limit)
    ]

    return pd.DataFrame(rows, columns=['Datetime', 'Body', 'Score'])

def read_csv_if_exists(filename):
    """Load ``filename`` as a DataFrame, or return an empty frame if it is absent.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        The parsed CSV when the path exists; otherwise an empty DataFrame
        with the columns ``Datetime``, ``Body`` and ``Score``.
    """
    if not os.path.exists(filename):
        # Nothing stored yet: hand back an empty frame with the expected columns.
        return pd.DataFrame(columns=['Datetime', 'Body', 'Score'])
    return pd.read_csv(filename)

def merge_and_deduplicate(original_df, new_df):
    """Combine two comment frames into one sorted, de-duplicated frame.

    Rows of ``original_df`` are placed before rows of ``new_df``; the
    ``Datetime`` column is converted to UTC datetimes; the result is sorted by
    ``Datetime`` then ``Body``; and for rows sharing the same
    (``Datetime``, ``Body``) pair only the last one is kept.

    Args:
        original_df: Previously stored comments.
        new_df: Newly fetched comments.

    Returns:
        A new DataFrame; the original row index labels are retained.
    """
    # An empty side is replaced by an empty frame that carries the other
    # side's columns, so both inputs share the same structure.
    if original_df.empty:
        original_df = pd.DataFrame(columns=new_df.columns)
    elif new_df.empty:
        new_df = pd.DataFrame(columns=original_df.columns)

    merged = pd.concat([original_df, new_df])

    # Values may be strings or datetime objects; normalise them to UTC
    # datetimes so sorting and duplicate detection compare like with like.
    merged['Datetime'] = pd.to_datetime(merged['Datetime'], utc=True)

    key_columns = ['Datetime', 'Body']
    # Sort first so that "last" in drop_duplicates refers to the sorted order.
    return (
        merged
        .sort_values(by=key_columns)
        .drop_duplicates(subset=key_columns, keep='last')
    )

def update_csv():
    """Fetch fresh comments and fold them into the per-day CSV files.

    Creates the ``comments`` directory if needed, fetches comments with
    ``fetch_comments()``, and, for the current UTC date and then the previous
    one, merges the comments created on that date into
    ``comments/<date>-wsb-comments.csv``. A day's file is only written when
    the fetch contains at least one comment from that day; comments from any
    other date are not written.
    """
    comments_dir = 'comments'
    os.makedirs(comments_dir, exist_ok=True)

    today = datetime.datetime.now(timezone.utc).date()
    yesterday = today - datetime.timedelta(days=1)

    fetched = fetch_comments()

    # UTC calendar date of every fetched comment, used only as a filter mask
    # (it is never added to the frame, so it does not end up in the CSV).
    comment_dates = pd.to_datetime(fetched['Datetime'], utc=True).dt.date

    for day in (today, yesterday):
        day_rows = fetched[comment_dates == day]
        if day_rows.empty:
            continue

        csv_path = os.path.join(comments_dir, f'{day}-wsb-comments.csv')
        stored = read_csv_if_exists(csv_path)
        merge_and_deduplicate(stored, day_rows).to_csv(csv_path, index=False)

# Run the CSV update only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    update_csv()

In [7]:
import praw
import datetime
import os
import yaml
import pandas as pd


# Load the Reddit API credentials (client_id / client_secret) from the config file.
with open('../config/config.yaml', 'r') as file:
    config = yaml.safe_load(file)

# Build the PRAW client from those credentials.
reddit = praw.Reddit(client_id=config['client_id'],
                        client_secret=config['client_secret'],
                        user_agent="my user agent")

# Subreddit whose comments are collected.
subreddit = reddit.subreddit('WallStreetBets')

# Comment bodies collected by the loop below.
comments_list = []

# Iterate the subreddit's comment listing with no explicit limit and collect each body.
# NOTE(review): this uses the `comments` listing, not `subreddit.stream`, so the loop
# ends when the listing is exhausted; the recorded run stopped at 959 comments --
# presumably Reddit's listing cap, confirm against the PRAW documentation.
for comment in subreddit.comments(limit=None):
    comments_list.append(comment.body)  # Keep only the text of the comment

    # Echo each comment to monitor progress
    print(comment.body)  # Alternatively print(len(comments_list)) to show the running count

!banbet
Yes. It’s trash.
what's your basis? ![img](emote|t5_2th52|4271)
Looks like RDDT is starting it's speed run to a penny stock. DJT up next.
What's Sbf?
I thought fb was clearing them out. ![img](emote|t5_2th52|4271)
Get 30 strikes. Less of them but better strike
I feel better now since no one with care if I grab the bags then
![img](emote|t5_2th52|4267)
The daily board is really exposing yall, some of you are making hundreds of comments a day, every day ![img](emote|t5_2th52|4640)
Why is the wedding in 2 years? Just do it now. If they really love you, they won't give a damn about the money.
gonna see if LULU can go over 400 tmrw on 0dte
Embarrassing for all the hoodlums who doubted you.
Why is a new porn movie level trending?
You better make them regret it at 9:30am ET on Jan 01, 10000
Need DJT to at least $80
I am happy for you 😭
Yeah, I think Papa pow just doesn’t want to spook the markets with all of this two rate cut nonsense
Lol dude sold almost half his bag. He truly.has no

In [8]:
# Number of comment bodies collected in comments_list.
len(comments_list)

959