# Collect r/wallstreetbets Data on Reddit in 2022

## Install praw for the Reddit API

In [1]:
!pip install praw

Defaulting to user installation because normal site-packages is not writeable


## Packages I used

In [10]:
!pip install pandas
!pip install tqdm

In [1]:
import os
import praw
import pandas as pd
import datetime as dt
from tqdm import tqdm
import time

## Environments setup for Reddit

### Reddit secret token

1. step 1

Go to this [url](https://www.reddit.com/prefs/apps) and click the "create new app..." button.

![image](images/2.png)

in the end you should get something like this

![image](images/3.png)


2. step 2

after that you should get this data

![image](images/4.png)

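    - the string shown below the type of app = REDDIT_PERSONAL_USE_SCRIPT_14_CHARS
    - secret = REDDIT_SECRET_KEY_27_CHARS
    - name = REDDIT_APP_NAME
    - username of your reddit profile = REDDIT_USER_NAME
    - password of your reddit profile = REDDIT_LOGIN_PASSWORD

    Store each value in an environment variable with the name shown on the right; the connection code below reads them from `os.environ`.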


## Configure the API connection for fetching data

In [4]:
def get_date(created):
    """Turn a Unix epoch timestamp into a ``datetime`` object.

    Args:
        created: Seconds since the Unix epoch (int or float).

    Returns:
        datetime.datetime: The same instant as a naive datetime in the
        local timezone of the machine running the notebook.
    """
    moment = dt.datetime.fromtimestamp(created)
    return moment

def reddit_connection():
    """Create a PRAW client from credentials stored in environment variables.

    Environment variables read (in this order):
        REDDIT_PERSONAL_USE_SCRIPT_14_CHARS: passed as ``client_id``.
        REDDIT_SECRET_KEY_27_CHARS: passed as ``client_secret``.
        REDDIT_APP_NAME: passed as ``user_agent``.
        REDDIT_USER_NAME: passed as ``username``.
        REDDIT_LOGIN_PASSWORD: passed as ``password``.

    Returns:
        The object returned by ``praw.Reddit(...)``.

    Raises:
        KeyError: If any of the environment variables above is not set.
    """
    reddit = praw.Reddit(
        client_id=os.environ["REDDIT_PERSONAL_USE_SCRIPT_14_CHARS"],
        client_secret=os.environ["REDDIT_SECRET_KEY_27_CHARS"],
        user_agent=os.environ["REDDIT_APP_NAME"],
        username=os.environ["REDDIT_USER_NAME"],
        # Forward the configured password rather than a blank string, so the
        # credential read from the environment is actually used.
        password=os.environ["REDDIT_LOGIN_PASSWORD"],
    )
    return reddit

## Build the dataset (daily update)

In [5]:
def build_dataset(reddit, search_words='wallstreetbets', items_limit=5000,
                  comments_limit=2000):
    """Fetch the newest submissions and comments of a subreddit into a DataFrame.

    Args:
        reddit: Client object exposing ``subreddit(name)``; the returned
            subreddit must provide ``new(limit=...)`` and
            ``comments(limit=...)``.
        search_words: Name of the subreddit to read.
        items_limit: Maximum number of submissions requested from ``new``.
        comments_limit: Maximum number of comments requested from
            ``comments``.

    Returns:
        pandas.DataFrame: One row per submission followed by one row per
        comment, with columns ``title``, ``score``, ``id``, ``url``,
        ``comms_num``, ``created``, ``body`` and ``timestamp``. Comment rows
        use the placeholder title ``"Comment"``, an empty ``url`` and a
        ``comms_num`` of 0. ``timestamp`` is ``get_date`` applied to
        ``created``.
    """
    subreddit = reddit.subreddit(search_words)
    columns = ["title", "score", "id", "url", "comms_num", "created", "body"]
    records = []

    print("featching data from wallstreet subreddit...")
    for submission in tqdm(subreddit.new(limit=items_limit)):
        records.append({
            "title": submission.title,
            "score": submission.score,
            "id": submission.id,
            "url": submission.url,
            "comms_num": submission.num_comments,
            "created": submission.created,
            "body": submission.selftext,
        })

    for comment in tqdm(subreddit.comments(limit=comments_limit)):
        records.append({
            "title": "Comment",
            "score": comment.score,
            "id": comment.id,
            "url": "",
            "comms_num": 0,
            "created": comment.created,
            "body": comment.body,
        })

    # Passing the column list keeps the schema intact even when nothing was
    # fetched, so the 'created' lookup below cannot fail on an empty result.
    topics_df = pd.DataFrame(records, columns=columns)
    print(f"featching data from wallstreet subreddit: {len(topics_df)}")
    topics_df['timestamp'] = topics_df['created'].apply(get_date)

    return topics_df

In [1]:
def update_and_save_dataset(topics_df, file_path="Stock Market.csv"):
    """Merge freshly fetched rows into the CSV at ``file_path`` and save it.

    If the file already exists, its rows are loaded, the new rows are
    appended, and rows sharing an ``id`` are collapsed so that only the last
    occurrence (the newly fetched one) is kept. If it does not exist,
    ``topics_df`` is written as-is.

    Args:
        topics_df: DataFrame of new rows; must contain an ``id`` column when
            the file already exists.
        file_path: CSV file that is both read (if present) and overwritten.
            Defaults to the path this notebook has always used.

    Returns:
        None. The result is written to ``file_path``.
    """
    if not os.path.exists(file_path):
        # First run: nothing to merge with.
        print(f"reddit posts: {topics_df.shape}")
        topics_df.to_csv(file_path, index=False)
        return

    topics_old_df = pd.read_csv(file_path)
    print(f"past reddit posts: {topics_old_df.shape}")
    topics_all_df = pd.concat([topics_old_df, topics_df], axis=0)
    print(f"new reddit posts: {topics_df.shape[0]} past posts: {topics_old_df.shape[0]} all posts: {topics_all_df.shape[0]}")
    # keep='last' lets a re-fetched row replace its stored version.
    topics_new_df = topics_all_df.drop_duplicates(subset=["id"], keep='last')
    print(f"all reddit posts: {topics_new_df.shape}")
    topics_new_df.to_csv(file_path, index=False)

## [Optional] Update and save dataset
We perform the following actions to merge the new data with old data from the past:

* Load old data
* Merge the two datasets
* Save the merged data

We also log information about the updated dataset here.

In [6]:
def update_and_save_dataset(
    topics_df,
    file_path="../input/wallstreetbets-2022/wallstreetbets_2022.csv",
    out_file_path="wallstreetbets_2022.csv",
):
    """Merge new rows with a previously saved dataset and log the row counts.

    Note: an earlier cell defines a function with this same name; whichever
    definition is executed last is the one that gets called.

    If ``file_path`` exists, its rows are loaded, ``topics_df`` is appended,
    and rows sharing an ``id`` are collapsed keeping the last occurrence. The
    result is written to ``out_file_path``. If ``file_path`` does not exist,
    ``topics_df`` alone is written.

    When a truthy module-level object named ``run`` exists, the shapes are
    stored on it via item assignment under the keys ``rows_new``/``cols_new``,
    ``rows_old``/``cols_old`` and ``rows_merged``/``cols_merged``.

    Args:
        topics_df: DataFrame of new rows; must contain an ``id`` column when
            the old file exists.
        file_path: CSV holding the previously collected data (read only).
        out_file_path: CSV that receives the merged data.

    Returns:
        None. The result is written to ``out_file_path``.
    """
    # The tracker is optional: look it up rather than referencing the bare
    # name, which raises NameError when no `run` object has been created.
    tracker = globals().get("run")

    if tracker:
        tracker["rows_new"] = topics_df.shape[0]
        tracker["cols_new"] = topics_df.shape[1]

    if not os.path.exists(file_path):
        print(f"wallstreetbets posts: {topics_df.shape}")
        topics_df.to_csv(out_file_path, index=False)
        return

    topics_old_df = pd.read_csv(file_path)
    if tracker:
        tracker["rows_old"] = topics_old_df.shape[0]
        tracker["cols_old"] = topics_old_df.shape[1]
    print(f"past reddit posts: {topics_old_df.shape}")

    topics_all_df = pd.concat([topics_old_df, topics_df], axis=0)
    print(f"new reddit posts: {topics_df.shape[0]} past posts: {topics_old_df.shape[0]} all posts: {topics_all_df.shape[0]}")
    # keep='last' lets a re-fetched row replace its stored version.
    topics_new_df = topics_all_df.drop_duplicates(subset=["id"], keep='last')
    print(f"all reddit posts: {topics_new_df.shape}")

    if tracker:
        # Log the shape of the merged frame (not the old one).
        tracker["rows_merged"] = topics_new_df.shape[0]
        tracker["cols_merged"] = topics_new_df.shape[1]
    topics_new_df.to_csv(out_file_path, index=False)

## Build connection with Reddit

In [11]:
# Entry point of the notebook: connect, fetch, then merge and save.
if __name__ == "__main__": 
    # Build the client from the REDDIT_* environment variables.
    reddit = reddit_connection()
    # Uses build_dataset's defaults for the subreddit name and item limit.
    topics_data_df = build_dataset(reddit)
    # Two cells define update_and_save_dataset; the call resolves to whichever
    # definition was executed most recently.
    update_and_save_dataset(topics_data_df)