In [1]:
import praw
from data import *
import time
import pandas as pd
import matplotlib.pyplot as plt
import squarify
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
import emoji    # removes emojis
import re   # removes links
import en_core_web_sm
import string
import yfinance as yf
import pyfolio as pf
from datetime import date
from datetime import timedelta
def data_extractor(reddit):
    '''extracts all the data from reddit
    Parameter: reddt: reddit obj
    Return:    posts, c_analyzed, tickers, titles, a_comments, picks, subs, picks_ayz
                
                posts: int: # of posts analyzed
                 c_analyzed: int: # of comments analyzed
                 tickers: dict: all the tickers found
                titles: list: list of the title of posts analyzed 
                 a_comments: dict: all the comments to analyze
                 picks: int: top picks to analyze
                 subs: int: # of subreddits analyzed
                picks_ayz: int: top picks to analyze
    
    '''
    
    '''############################################################################'''
    # set the program parameters
    subs = ['wallstreetbets','stocks' ]     # sub-reddit to search
    post_flairs = {'Daily Discussion', 'Weekend Discussion', 'Discussion'}    # posts flairs to search || None flair is automatically considered
    goodAuth = {'AutoModerator'}   # authors whom comments are allowed more than once
    uniqueCmt = True                # allow one comment per author per symbol
    ignoreAuthP = {'example'}       # authors to ignore for posts 
    ignoreAuthC = {'example'}       # authors to ignore for comment 
    upvoteRatio = 0.70         # upvote ratio for post to be considered, 0.70 = 70%
    ups = 20       # define # of upvotes, post is considered if upvotes exceed this #
    limit = 1     # define the limit, comments 'replace more' limit
    upvotes = 2     # define # of upvotes, comment is considered if upvotes exceed this #
    picks = 5     # define # of picks here, prints as "Top ## picks are:"
    picks_ayz = 5   # define # of picks for sentiment analysis
    '''############################################################################'''     
    
    posts, count, c_analyzed, tickers, titles, a_comments = 0, 0, 0, {}, [], {}
    cmt_auth = {}
    
    for sub in subs:
        subreddit = reddit.subreddit(sub)
        hot_python = subreddit.hot()    # sorting posts by hot
        # Extracting comments, symbols from subreddit
        for submission in hot_python:
            flair = submission.link_flair_text 
            author = submission.author.name         
            
            # checking: post upvote ratio # of upvotes, post flair, and author 
            if submission.upvote_ratio >= upvoteRatio and submission.ups > ups and (flair in post_flairs or flair is None) and author not in ignoreAuthP:   
                submission.comment_sort = 'new'     
                comments = submission.comments
                titles.append(submission.title)
                posts += 1
                try: 
                    submission.comments.replace_more(limit=limit)   
                    for comment in comments:
                        # try except for deleted account?
                        try: auth = comment.author.name
                        except: pass
                        c_analyzed += 1
                        
                        # checking: comment upvotes and author
                        if comment.score > upvotes and auth not in ignoreAuthC:      
                            split = comment.body.split(" ")
                            for word in split:
                                word = word.replace("$", "")        
                                # upper = ticker, length of ticker <= 5, excluded words,                     
                                if word.isupper() and len(word) <= 5 and word not in blacklist and word in us:
                                    
                                    # unique comments, try/except for key errors
                                    if uniqueCmt and auth not in goodAuth:
                                        try: 
                                            if auth in cmt_auth[word]: break
                                        except: pass
                                        
                                    # counting tickers
                                    if word in tickers:
                                        tickers[word] += 1
                                        a_comments[word].append(comment.body)
                                        cmt_auth[word].append(auth)
                                        count += 1
                                    else:                               
                                        tickers[word] = 1
                                        cmt_auth[word] = [auth]
                                        a_comments[word] = [comment.body]
                                        count += 1   
                except Exception as e: print(e)
                
                           
    return posts, c_analyzed, tickers, titles, a_comments, picks, subs, picks_ayz



In [2]:
def print_helper(tickers, picks, c_analyzed, posts, subs, titles, time, start_time):
    '''prints out top tickers, and most mentioned tickers
    
    Parameter:   tickers: dict: all the tickers found
                 picks: int: top picks to analyze
                 c_analyzed: int: # of comments analyzed
                 posts: int: # of posts analyzed
                 subs: int: # of subreddits analyzed
                titles: list: list of the title of posts analyzed 
                 time: time obj: top picks to analyze
                start_time: time obj: prog start time

    Return: symbols: dict: dict of sorted tickers based on mentions
            times: list: include # of time top tickers is mentioned
            top: list: list of top tickers
    '''    

    # sorts the dictionary
    symbols = dict(sorted(tickers.items(), key=lambda item: item[1], reverse = True))
    top_picks = list(symbols.keys())[0:picks]
    time = (time.time() - start_time)
    
    # print top picks
    print("It took {t:.2f} seconds to analyze {c} comments in {p} posts in {s} subreddits.\n".format(t=time, c=c_analyzed, p=posts, s=len(subs)))
    print("Posts analyzed saved in titles")
    #for i in titles: print(i)  # prints the title of the posts analyzed
    
    
    print(f"\n{picks} most mentioned tickers: ")
    times = []
    top = []
    for i in top_picks:
        print(f"{i}: {symbols[i]}")
        times.append(symbols[i])
        top.append(f"{i}: {symbols[i]}")
   
    return symbols, times, top
    

In [3]:
#pip install pyfolio

In [4]:
    
def sentiment_analysis(picks_ayz, a_comments, symbols):
    '''analyzes sentiment anaylsis of top tickers
    
    Parameter:   picks_ayz: int: top picks to analyze
                 a_comments: dict: all the comments to analyze
                 symbols: dict: dict of sorted tickers based on mentions
    Return:      scores: dictionary: dictionary of all the sentiment analysis

    '''
    scores = {}
     
    vader = SentimentIntensityAnalyzer()
    vader.lexicon.update(new_words)     # adding custom words from data.py 
    picks_sentiment = list(symbols.keys())[0:picks_ayz]
    
    for symbol in picks_sentiment:
        stock_comments = a_comments[symbol]
        for cmnt in stock_comments:
    
            emojiless = emoji.get_emoji_regexp().sub(u'', cmnt) # remove emojis
            
            # remove punctuation
            text_punc  = "".join([char for char in emojiless if char not in string.punctuation])
            text_punc = re.sub('[0-9]+', '', text_punc)
                
            # tokenizeing and cleaning 
            tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|http\S+')
            tokenized_string = tokenizer.tokenize(text_punc)
            lower_tokenized = [word.lower() for word in tokenized_string] # convert to lower case
            
            # remove stop words
            nlp = en_core_web_sm.load()
            stopwords = nlp.Defaults.stop_words
            sw_removed = [word for word in lower_tokenized if not word in stopwords]
            
            # normalize the words using lematization
            lemmatizer = WordNetLemmatizer()
            lemmatized_tokens = ([lemmatizer.lemmatize(w) for w in sw_removed])
            
            # calculating sentiment of every word in comments n combining them
            score_cmnt = {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}
            
            word_count = 0
            for word in lemmatized_tokens:
                if word.upper() not in us:
                    score = vader.polarity_scores(word)
                    word_count += 1
                    for key, _ in score.items():
                        score_cmnt[key] += score[key]    
                else:
                    score_cmnt['pos'] = 2.0               
                    
            # calculating avg.
            try:        # handles: ZeroDivisionError: float division by zero
                for key in score_cmnt:
                    score_cmnt[key] = score_cmnt[key] / word_count
            except: pass
                
            
            # adding score the the specific symbol
            if symbol in scores:
                for key, _ in score_cmnt.items():
                    scores[symbol][key] += score_cmnt[key]
            else:
                scores[symbol] = score_cmnt        
    
        # calculating avg.
        for key in score_cmnt:
            scores[symbol][key] = scores[symbol][key] / symbols[symbol]
            scores[symbol][key]  = "{pol:.3f}".format(pol=scores[symbol][key])
            
    return scores

In [5]:
def sentiment(scores,picks_ayz):
    print(f"\nSentiment analysis of top {picks_ayz} picks:")
    df = pd.DataFrame(scores)
    df.index = ['Bearish', 'Neutral', 'Bullish', 'Total/Compound']
    df = df.T
    print(df)

In [6]:
def visualization(picks_ayz, scores, picks, times, top):
    '''prints sentiment analysis
       makes a most mentioned picks chart
       makes a chart of sentiment analysis of top picks
       
    Parameter:   picks_ayz: int: top picks to analyze
                 scores: dictionary: dictionary of all the sentiment analysis
                 picks: int: most mentioned picks
                times: list: include # of time top tickers is mentioned
                top: list: list of top tickers
    Return:       None
    '''
    
    # printing sentiment analysis 
    print(f"\nSentiment analysis of top {picks_ayz} picks:")
    df = pd.DataFrame(scores)
    df.index = ['Bearish', 'Neutral', 'Bullish', 'Total/Compound']
    df = df.T
    print(df)
    
    # Date Visualization
    # most mentioned picks    
    squarify.plot(sizes=times, label=top, alpha=.7 )
    plt.axis('off')
    plt.title(f"{picks} most mentioned picks")
    #plt.show()
    
    # Sentiment analysis
    df = df.astype(float)
    colors = ['red', 'springgreen', 'forestgreen', 'coral']
    df.plot(kind = 'bar', color=colors, title=f"Sentiment analysis of top {picks_ayz} picks:")
    
    
    #plt.show()

In [7]:
def SMA(data,period=30,column='Close'):
    return data[column].rolling(window=period).mean()
def EMA(data,period=20,column='Close'):
    return data[column].ewm(span=period,adjust=False).mean()

In [8]:
def StockPrice(symbols):
    '''prints data frame  
    Parameter:   symbols
    Return:       data frame for top 5 sentiment analys stock price 
    '''
    today_date = date.today()
    three_Month = str(today_date -  timedelta(days=90))
    period_long =26
    period_short = 12
    period_signal = 9
    month_period = 30
    sma_period = 30
    #five = symbols.item()
    top_5 = list(symbols.keys())[0:5]
    df_price = pd.DataFrame()
    for i in top_5:
        i_data = yf.download(tickers= i, start= three_Month, interval='1d')
        i_out_data = i_data.drop(columns = ['Open','High','Low','Adj Close','Volume'])
        i_out_data.rename(columns={'Close':i},inplace=True)
        delta = i_out_data[i].diff(1)
        delta = delta[1:]
        up = delta.copy()
        down = delta.copy()
        up[up<0] = 0
        down[down>0]=0
        i_out_data['UP'] = up
        i_out_data['down']=down
        AVG_Gain = SMA(i_out_data,month_period,column = 'UP')
        AVG_Loss = abs(SMA(i_out_data,month_period,column = 'down'))
        RS = AVG_Gain / AVG_Loss
        RSI = 100.0 - (100.0/(1.0 + RS))
        i_out_data['RSI']=  RSI
        i_out_data = i_out_data.drop(columns = ['UP','down'])
        ShortEMA = EMA(i_out_data,period_short,column=i)
        LongEMA = EMA(i_out_data,period_long,column = i)
        i_out_data['MACD'] = ShortEMA - LongEMA
        i_out_data['Signal_Line'] = EMA(i_out_data, period_signal ,column ='MACD')
        i_out_data['SMA_30'] = SMA(i_out_data,sma_period,column=i)
        std = i_out_data['SMA_30'].rolling(window= 30).std()
        i_out_data['Upper_Band'] =i_out_data['SMA_30']+std*2
        i_out_data['lower_Band'] =i_out_data['SMA_30']-std*2
        #print(i_out_data)
        df_price = pd.concat([i_out_data,df_price],axis='columns',join='outer',ignore_index = False)
        
        #df[i] = list(i_out_data)
        #df.append(i_out_data)
        df_price.dropna(inplace = True)
    
    return print(df_price.head())

    
    
    

In [9]:
#symbols ={'MRNA': 20, 'AMD': 15, 'ELY': 13, 'GME': 13, 'PFE': 10, 'AMZN': 9, 'MCFE': 8, 'PLTR': 7, 'FUBO': 6, 'MSFT': 6, 'AAPL': 5, 'CRSR': 4, 'NIO': 3, 'TSLA': 3, 'BB': 3, 'DKNG': 3, 'V': 3, 'WISH': 3, 'PYPL': 3, 'BNTX': 3, 'C': 2, 'MU': 2, 'R': 2, 'T': 2, 'CLNE': 2, 'UPST': 2, 'QS': 2, 'MVIS': 2, 'PRPL': 2, 'FSR': 2, 'M': 2, 'F': 2, 'SE': 2, 'ETSY': 2, 'PINS': 2, 'NVDA': 2, 'JPM': 2, 'BABA': 2, 'ASML': 2, 'APPS': 2, 'SQ': 2, 'ALB': 1, 'WEN': 1, 'BRO': 1, 'MARA': 1, 'TA': 1, 'CLF': 1, 'G': 1, 'ROKU': 1, 'SAVE': 1, 'PUBM': 1, 'DM': 1, 'RKT': 1, 'AN': 1, 'WKHS': 1, 'TLRY': 1, 'MS': 1, 'ET': 1, 'CASH': 1, 'EBAY': 1, 'IMVT': 1, 'ZNGA': 1, 'UGI': 1, 'Z': 1, 'RDFN': 1, 'LSPD': 1, 'NXPI': 1, 'KO': 1, 'JNJ': 1, 'AXP': 1, 'QCOM': 1, 'DVA': 1, 'UNH': 1, 'B': 1, 'AMAT': 1, 'GILD': 1, 'HBIO': 1, 'HOLX': 1, 'LITE': 1, 'RL': 1, 'CRM': 1, 'TGT': 1, 'PTON': 1, 'PKI': 1, 'BX': 1, 'GOOGL': 1, 'NDAQ': 1, 'MSCI': 1, 'BIO': 1, 'INFO': 1, 'TMO': 1, 'PJT': 1, 'CHTR': 1, 'TSCO': 1, 'POOL': 1, 'GNRC': 1, 'INMD': 1, 'CTAS': 1, 'SHOP': 1, 'MCO': 1, 'NXST': 1, 'PGR': 1, 'IDXX': 1, 'ZTS': 1, 'GM': 1, 'PLUG': 1, 'ABNB': 1, 'ASO': 1, 'O': 1, 'NKLA': 1, 'RNA': 1, 'TTD': 1, 'FSLY': 1, 'AI': 1, 'PRU': 1}
#top_5 = list(symbols.keys())[0:4]
#print(top_5)
#StockPrice(symbols)

In [10]:
#StockPrice(symbols)

In [11]:
def main():
    '''main function
    Parameter:   None
    Return:       None
    '''
    start_time = time.time()
    
    # reddit client
    reddit = praw.Reddit(user_agent="Comment Extraction",
                         client_id="VpHhcn4Pt6u42DfPBnbOrQ",
                         client_secret="YOFjc2YV0z3vxx0dU_eAT55VjprgJQ",
                         username="ayushshah1204",
                         password="Ayush@1204")

    posts, c_analyzed, tickers, titles, a_comments, picks, subs, picks_ayz = data_extractor(reddit)
    symbols, times, top = print_helper(tickers, picks, c_analyzed, posts, subs, titles, time, start_time)
    scores = sentiment_analysis(picks_ayz, a_comments, symbols)
    stock_price = StockPrice(symbols)
    sentiment(scores,picks_ayz)
    
    #visualization(picks_ayz, scores, picks, times, top)
    
if __name__ == '__main__':
    main()
 

It took 33.74 seconds to analyze 2056 comments in 23 posts in 2 subreddits.

Posts analyzed saved in titles

5 most mentioned tickers: 
MRNA: 21
GME: 14
AMD: 12
ELY: 12
AMZN: 9
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
                   AMZN        RSI       MACD  Signal_Line       SMA_30  \
Date                                                                      
2021-08-04  3354.719971  43.899061 -25.925996    19.483043  3555.837321   
2021-08-05  3375.989990  44.906639 -32.950331     8.996368  3551.576318   
2021-08-06  3344.939941  45.770717 -40.555152    -0.913936  3548.104980   
2021-08-09  3341.870117  47.489113 -46.296064    -9.990362  3546.118652   
2021-08-10  3320.679932  44.