# Reddit Wall Street Bets Sentiment Analysis

<img src="../images/reddit.jpg">

In [49]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import spacy
import re

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

### Data Cleaning / Text Preprocessing

In [71]:
df = pd.read_csv('../reddit_wsb.csv')

In [73]:
df.head(10)

Unnamed: 0,title,score,id,url,comms_num,created,body,timestamp
0,"It's not about the money, it's about sending a...",55,l6ulcx,https://v.redd.it/6j75regs72e61,6,1611863000.0,,2021-01-28 21:37:41
1,Math Professor Scott Steiner says the numbers ...,110,l6uibd,https://v.redd.it/ah50lyny62e61,23,1611862000.0,,2021-01-28 21:32:10
2,Exit the system,0,l6uhhn,https://www.reddit.com/r/wallstreetbets/commen...,47,1611862000.0,The CEO of NASDAQ pushed to halt trading “to g...,2021-01-28 21:30:35
3,NEW SEC FILING FOR GME! CAN SOMEONE LESS RETAR...,29,l6ugk6,https://sec.report/Document/0001193125-21-019848/,74,1611862000.0,,2021-01-28 21:28:57
4,"Not to distract from GME, just thought our AMC...",71,l6ufgy,https://i.redd.it/4h2sukb662e61.jpg,156,1611862000.0,,2021-01-28 21:26:56
5,WE BREAKING THROUGH,405,l6uf7d,https://i.redd.it/2wef8tc062e61.png,84,1611862000.0,,2021-01-28 21:26:30
6,SHORT STOCK DOESN'T HAVE AN EXPIRATION DATE,317,l6uf6d,https://www.reddit.com/r/wallstreetbets/commen...,53,1611862000.0,Hedgefund whales are spreading disinfo saying ...,2021-01-28 21:26:27
7,THIS IS THE MOMENT,405,l6ub9l,https://www.reddit.com/r/wallstreetbets/commen...,178,1611862000.0,Life isn't fair. My mother always told me that...,2021-01-28 21:19:31
8,Currently Holding AMC and NOK - Is it retarded...,200,l6ub4i,https://i.redd.it/6k2z7ouo42e61.png,161,1611862000.0,,2021-01-28 21:19:16
9,I have nothing to say but BRUH I am speechless...,291,l6uas9,https://i.redd.it/bfzzw2yo42e61.jpg,27,1611862000.0,,2021-01-28 21:18:37


In [74]:
df.describe()

Unnamed: 0,score,comms_num,created
count,46020.0,46020.0,46020.0
mean,1460.640026,242.780595,1613109000.0
std,8439.574089,2577.133269,1777287.0
min,0.0,0.0,1601340000.0
25%,1.0,2.0,1611881000.0
50%,32.0,14.0,1612318000.0
75%,194.0,52.0,1613953000.0
max,348241.0,93268.0,1619844000.0


In [75]:
# Drop unnecessary columns for analysis
df = df.drop(columns=['id', 'url', 'created'])

In [76]:
# Handle missing values
df.isnull().sum()

title            0
score            0
comms_num        0
body         24385
timestamp        0
dtype: int64

In [77]:
# Information about dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46020 entries, 0 to 46019
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   title      46020 non-null  object
 1   score      46020 non-null  int64 
 2   comms_num  46020 non-null  int64 
 3   body       21635 non-null  object
 4   timestamp  46020 non-null  object
dtypes: int64(2), object(3)
memory usage: 1.8+ MB


In [78]:
# Describe dataset
df.describe()

Unnamed: 0,score,comms_num
count,46020.0,46020.0
mean,1460.640026,242.780595
std,8439.574089,2577.133269
min,0.0,0.0
25%,1.0,2.0
50%,32.0,14.0
75%,194.0,52.0
max,348241.0,93268.0


### Data Cleaning / Data Preprocessing

In [79]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\Abhi
[nltk_data]     Joshi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [80]:
# Tokenization and remove punctuations
def tokenize(text):
    if (text == None):
        continue
    words = word_tokenize(str(text))
    # remove digits and other symbols except "@"--used to remove email
    words = [re.sub(r"[^A-Za-z@]", "", word) for word in words]
    # remove websites and email address
    words = [re.sub(r"\S+com", "", word) for word in words]
    words = [re.sub(r"\S+@\S+", "", word) for word in words]
    # remove empty spaces 
    words = [word for word in words if word!=" " and word!=""]
    return words

In [59]:
df['title'] = df['title'].apply(tokenize)
df['body'] = df['body'].apply(tokenize)

In [64]:
# Remove Stopwords
def remove_stopwords(text):
    stopwords = set(STOPWORDS)
    stopwords.update(nltk.corpus.stopwords.words('english'))
    stopwords_lower = [s.lower() for s in stopwords]
    words = [word.lower() for word in words if word.lower() not in stopwords_lower]

In [60]:
df.head(10)

Unnamed: 0,title,score,comms_num,body,timestamp,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11
0,"[It, s, not, about, the, money, it, s, about, ...",55,6,[nan],1/28/2021 21:37,,,,
1,"[Math, Professor, Scott, Steiner, says, the, n...",110,23,[nan],1/28/2021 21:32,,,,
2,"[Exit, the, system]",0,47,"[The, CEO, of, NASDAQ, pushed, to, halt, tradi...",1/28/2021 21:30,,,,
3,"[NEW, SEC, FILING, FOR, GME, CAN, SOMEONE, LES...",29,74,[nan],1/28/2021 21:28,,,,
4,"[Not, to, distract, from, GME, just, thought, ...",71,156,[nan],1/28/2021 21:26,,,,
5,"[WE, BREAKING, THROUGH]",405,84,[nan],1/28/2021 21:26,,,,
6,"[SHORT, STOCK, DOES, NT, HAVE, AN, EXPIRATION,...",317,53,"[Hedgefund, whales, are, spreading, disinfo, s...",1/28/2021 21:26,,,,
7,"[THIS, IS, THE, MOMENT]",405,178,"[Life, is, nt, fair, My, mother, always, told,...",1/28/2021 21:19,,,,
8,"[Currently, Holding, AMC, and, NOK, Is, it, re...",200,161,[nan],1/28/2021 21:19,,,,
9,"[I, have, nothing, to, say, but, BRUH, I, am, ...",291,27,[nan],1/28/2021 21:18,,,,


### Exploratory Data Analysis

In [None]:
# Extract titles and bodies of the wsbets posts
vis_df = df[['title', 'body']].copy()
vis_df.head(10)

In [None]:
# Combine into plain text column
vis_df = vis_df.dropna()
vis_df['combined'] = vis_df['title'] + ' ' + vis_df['body']
vis_df = vis_df.reset_index()

In [None]:
vis_df.head(10)

In [None]:
# Start with one post
text = vis_df.combined[0]

# Create and generate a word cloud image:
wordcloud = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(text)

# Display Figure
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Now let's run the same analysis for all posts
text = ' '.join(vis_df.combined)
print ("There are {} words in the combination of all posts and titles on r/wsbets.".format(len(text)))

In [None]:
# Create stopword list


# Generate a word cloud image
wordcloud = WordCloud(stopwords=stopwords, background_color="white").generate(text)

# Display the generated image
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()