In [1]:
import pandas as pd 
import praw
import re
import requests 
from nltk.corpus import stopwords

In [2]:
#import nltk
#nltk.download('stopwords')

In [3]:
# Get programatic access to reddit via https://www.reddit.com/prefs/apps
# read the configuration from config file in format
# [PRAW CONFIG]
# client_id = <client id created>
# client_secret = <client secret created>

import configparser
config = configparser.RawConfigParser()
config.read('config.properties')
praw_config = dict(config.items('PRAW CONFIG'))

In [4]:
_user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36"

reddit = praw.Reddit(
  client_id = praw_config['client_id'],
  client_secret = praw_config['client_secret'],
  user_agent = _user_agent
)

print(reddit.read_only)

True


In [5]:
data = []
for post in reddit.subreddit('wallstreetbets').hot(limit=500):
    content = {
        "title" : post.title,
        "text" : post.selftext
    }
    data.append(content)

df = pd.DataFrame(data)

In [6]:
regex = re.compile('[^a-zA-Z0-9 ]')
word_dict = {}

for (index, row) in df.iterrows():
    # titles
    title = row['title']
    
    title = regex.sub('', title)
    title_words = title.split()
    
    # content
    content = row['text']
    
    content = regex.sub('', content)
    content_words = content.split()
    
    # combine
    words = title_words + content_words
    # keep the case of the words intentionally - remove stopwords from lowercase only as uppercase stopwords could be tickers
    filtered_words = [word for word in words if word not in stopwords.words('english')]
    
    for x in filtered_words:
        if x in word_dict:
            word_dict[x] += 1
        else:
            word_dict[x] = 1

word_df = pd.DataFrame.from_dict(list(word_dict.items())).rename(columns = {0:"Term", 1:"Frequency"})

In [7]:
#word_df.sort_values(by="Frequency", ascending = False).head(50)

In [8]:
# Get tickers from NASDAQ api to match
import requests

headers = {
    "User-Agent" : _user_agent,
}

url = "https://api.nasdaq.com/api/screener/stocks?tableonly=true&limit=7600"
r = requests.get(url, headers=headers)
j = r.json()

In [9]:
table = j['data']['table']['rows']
#table
df_tickers = pd.DataFrame(table)
#df_tickers

In [10]:
df_tickers = df_tickers.rename(columns = {"symbol":"Term", "name":"Company_Name"})
df_tickers = df_tickers[['Term', 'Company_Name', 'netchange', 'pctchange', 'marketCap', 'lastsale']]

In [11]:
trending_stocks_df = pd.merge(df_tickers, word_df, on="Term")

In [12]:
trending_stocks_df = trending_stocks_df.sort_values(by="Frequency", ascending = False)
trending_stocks_df = trending_stocks_df.reset_index(drop=True)

In [15]:
trending_stocks_df = trending_stocks_df[trending_stocks_df['Frequency'] > 10]

In [16]:
from IPython.display import display, HTML
display(HTML(trending_stocks_df.to_html()))

Unnamed: 0,Term,Company_Name,netchange,pctchange,marketCap,lastsale,Frequency
0,DD,"DuPont de Nemours, Inc. Common Stock",1.62,1.985%,44295528049,$83.24,49
1,A,"Agilent Technologies, Inc. Common Stock",1.13,0.869%,39961079092,$131.15,43
2,UWMC,UWM Holdings Corporation Class A Common Stock,0.09,1.213%,12054856680,$7.51,41
3,GME,GameStop Corporation Common Stock,-4.58,-2.784%,11317822738,$159.92,38
4,CLOV,"Clover Health Investments, Corp. Class A Commo...",0.21,2.893%,3292029000,$7.47,35
5,PLTR,Palantir Technologies Inc. Class A Common Stock,1.71,9.309%,37495509440,$20.08,28
6,TSLA,"Tesla, Inc. Common Stock",18.05,3.157%,568114498404,$589.74,23
7,TA,TravelCenters of America Inc. Common Stock,1.04,4.177%,377734622,$25.94,17
8,AMC,"AMC Entertainment Holdings, Inc. Class A Commo...",0.21,1.644%,5844637515,$12.98,15
9,VIAC,ViacomCBS Inc. Class B Common Stock,0.4,1.039%,25147292886,$38.91,13
