# Collect r/wallstreetbets Data on Reddit in 2022

## Install praw

In [1]:
!pip install praw

Defaulting to user installation because normal site-packages is not writeable


## Packages I used

In [10]:
!pip install pandas
!pip install tqdm

In [1]:
import os
import praw
import pandas as pd
import datetime as dt
from tqdm import tqdm
import time

## Environment setup for Reddit and neptune.ai secrets
How to use secret API keys with Kaggle: [Feature Launch: User Secrets](https://www.kaggle.com/product-feedback/114053)

### neptune.ai secret token

* **What is neptune.ai**

Neptune is a metadata store that offers experiment tracking and a model registry for machine learning researchers and engineers. With Neptune, you can log, query, manage, display, and compare all your model metadata in a single place.

* Creating API token from neptune.ai

1. step 1
    - Sign up on the [neptune.ai website](https://app.neptune.ai/)
    
    - Go to dashboard
    
    - In your profile menu, select **Get your API token**
    
![image](images/get_api_token.png)

2. step 2

![image](images/1.png)


### Reddit secret token

1. step 1

Go to the [Reddit apps page](https://www.reddit.com/prefs/apps) and click **create new app...**

![image](images/2.png)

In the end you should see something like this:

![image](images/3.png)


2. step 2

After that you should have the following values:

![image](images/4.png)

    - string shown below the app type = REDDIT_PERSONAL_USE_SCRIPT_14_CHARS
    - secret = REDDIT_SECRET_KEY_27_CHARS
    - name = REDDIT_APP_NAME
    - username of reddit profile = REDDIT_USER_NAME
    - password reddit profile = REDDIT_LOGIN_PASSWORD


## Configure the API credentials for fetching data

In [4]:
# Credentials of the Reddit "script" app. Replace the placeholders with the
# values shown on https://www.reddit.com/prefs/apps (see the screenshots above).
# Do not commit real values; prefer loading them from a secrets store.
personal_use_script = "REDDIT_PERSONAL_USE_SCRIPT_14_CHARS"
client_secret = "REDDIT_SECRET_KEY_27_CHARS"
user_agent = "REDDIT_APP_NAME"
username = "REDDIT_USER_NAME"
password = "REDDIT_LOGIN_PASSWORD"


def reddit_connection():
    """Create a praw client from the credentials defined in this cell.

    Returns:
        praw.Reddit: client built from the client id, client secret,
        user agent, username and password configured above.
    """
    return praw.Reddit(
        client_id=personal_use_script,
        client_secret=client_secret,
        user_agent=user_agent,
        username=username,
        # Previously an empty string was passed here, which left the
        # `password` variable above unused.
        password=password,
    )


reddit = reddit_connection()

## Build the dataset (daily update)

In [5]:
def build_dataset(reddit, search_words='wallstreetbets', items_limit=4000):
    """Download the newest submissions and comments of a subreddit.

    Args:
        reddit: client exposing ``subreddit(name)``; the returned object must
            provide ``new(limit=...)`` and ``comments(limit=...)``.
        search_words: name of the subreddit to read.
        items_limit: maximum number of submissions requested, and separately
            the maximum number of comments requested.

    Returns:
        pandas.DataFrame with the columns ``title``, ``score``, ``id``,
        ``url``, ``comms_num``, ``created``, ``body`` and ``timestamp``.
        Submission rows come first, followed by comment rows. Comment rows
        use the fixed title ``"Comment"``, an empty ``url`` and
        ``comms_num`` 0.
    """
    columns = ["title", "score", "id", "url", "comms_num", "created", "body"]
    subreddit = reddit.subreddit(search_words)
    records = []

    print(f"retreive new reddit posts ...")
    for submission in tqdm(subreddit.new(limit=items_limit)):
        records.append({
            "title": submission.title,
            "score": submission.score,
            "id": submission.id,
            "url": submission.url,
            "comms_num": submission.num_comments,
            "created": submission.created,
            "body": submission.selftext,
        })

    for comment in tqdm(subreddit.comments(limit=items_limit)):
        records.append({
            "title": "Comment",
            "score": comment.score,
            "id": comment.id,
            "url": "",
            "comms_num": 0,
            "created": comment.created,
            "body": comment.body,
        })

    # Explicit columns keep the schema stable even when nothing was fetched.
    topics_df = pd.DataFrame(records, columns=columns)
    print(f"new reddit posts retrieved: {len(topics_df)}")
    # Convert the epoch seconds in `created` inline: the `get_date` helper
    # this line used to call is not defined in the notebook.
    topics_df['timestamp'] = topics_df['created'].apply(
        lambda created: dt.datetime.fromtimestamp(created)
    )

    return topics_df

## [Option] Update and save dataset
We perform the following actions to merge the new data with the previously collected data:

* Load old data
* Merge the two datasets
* Save the merged data

We also log information about the updated dataset here.

In [6]:
def update_and_save_dataset(topics_df,
                            file_path="../input/wallstreetbets-2022/wallstreetbets_2022.csv",
                            out_file_path="wallstreetbets_2022.csv",
                            run=None):
    """Merge freshly collected rows with a previously saved CSV and write the result.

    Args:
        topics_df: DataFrame with the new rows; must contain an ``id`` column
            when a previous file exists.
        file_path: CSV with the previously collected data. When it does not
            exist, ``topics_df`` is written unchanged.
        out_file_path: destination CSV (written without the index).
        run: optional mapping-like logger that receives row/column counts.
            When omitted, a notebook-level ``run`` variable is used if one
            exists; otherwise nothing is logged.
    """
    if run is None:
        # Look the tracker up lazily so that a missing global `run`
        # no longer raises NameError.
        run = globals().get("run")

    if run:
        run["rows_new"] = topics_df.shape[0]
        run["cols_new"] = topics_df.shape[1]

    if not os.path.exists(file_path):
        print(f"reddit posts: {topics_df.shape}")
        topics_df.to_csv(out_file_path, index=False)
        return

    topics_old_df = pd.read_csv(file_path)
    if run:
        run["rows_old"] = topics_old_df.shape[0]
        run["cols_old"] = topics_old_df.shape[1]
    print(f"past reddit posts: {topics_old_df.shape}")

    topics_all_df = pd.concat([topics_old_df, topics_df], axis=0)
    print(f"new reddit posts: {topics_df.shape[0]} past posts: {topics_old_df.shape[0]} all posts: {topics_all_df.shape[0]}")

    # New rows are concatenated last, so keep='last' lets the fresh copy of
    # a post replace the stored one.
    topics_new_df = topics_all_df.drop_duplicates(subset=["id"], keep='last', inplace=False)
    print(f"all reddit posts: {topics_new_df.shape}")
    if run:
        # Log the merged frame's shape (the old frame's shape was logged here before).
        run["rows_merged"] = topics_new_df.shape[0]
        run["cols_merged"] = topics_new_df.shape[1]
    topics_new_df.to_csv(out_file_path, index=False)

## Run it all

* Initialize connection
* Build the dataset
* Update and save the dataset

In [11]:
# Reuse the `reddit` client created in the configuration cell, then collect
# the newest posts and comments and merge them into the saved CSV.
topics_data_df = build_dataset(reddit)
update_and_save_dataset(topics_data_df)