# Reddit Wall Street Bets Sentiment Analysis

<img src="../images/reddit.jpg">

In [144]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import spacy
import re

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, RegexpTokenizer

### Data Cleaning / Text Preprocessing

In [145]:
df = pd.read_csv('../reddit_wsb.csv')

In [146]:
df.head(5)

Unnamed: 0,title,score,id,url,comms_num,created,body,timestamp
0,"It's not about the money, it's about sending a...",55,l6ulcx,https://v.redd.it/6j75regs72e61,6,1611863000.0,,2021-01-28 21:37:41
1,Math Professor Scott Steiner says the numbers ...,110,l6uibd,https://v.redd.it/ah50lyny62e61,23,1611862000.0,,2021-01-28 21:32:10
2,Exit the system,0,l6uhhn,https://www.reddit.com/r/wallstreetbets/commen...,47,1611862000.0,The CEO of NASDAQ pushed to halt trading “to g...,2021-01-28 21:30:35
3,NEW SEC FILING FOR GME! CAN SOMEONE LESS RETAR...,29,l6ugk6,https://sec.report/Document/0001193125-21-019848/,74,1611862000.0,,2021-01-28 21:28:57
4,"Not to distract from GME, just thought our AMC...",71,l6ufgy,https://i.redd.it/4h2sukb662e61.jpg,156,1611862000.0,,2021-01-28 21:26:56


In [147]:
# Drop unnecessary columns for analysis
df = df.drop(columns=['id', 'url', 'created'])

In [148]:
# Handle missing values
df.isnull().sum()

title            0
score            0
comms_num        0
body         24385
timestamp        0
dtype: int64

In [149]:
# Information about dataset
df.describe()

Unnamed: 0,score,comms_num
count,46020.0,46020.0
mean,1460.640026,242.780595
std,8439.574089,2577.133269
min,0.0,0.0
25%,1.0,2.0
50%,32.0,14.0
75%,194.0,52.0
max,348241.0,93268.0


In [150]:
# Describe dataset
df.describe()

Unnamed: 0,score,comms_num
count,46020.0,46020.0
mean,1460.640026,242.780595
std,8439.574089,2577.133269
min,0.0,0.0
25%,1.0,2.0
50%,32.0,14.0
75%,194.0,52.0
max,348241.0,93268.0


In [159]:
df.head(10)

Unnamed: 0,title,score,comms_num,body,timestamp,title_cleaned
0,"It's not about the money, it's about sending a...",55,6,,2021-01-28 21:37:41,money send message
1,Math Professor Scott Steiner says the numbers ...,110,23,,2021-01-28 21:32:10,math professor scott steiner say number spell ...
2,Exit the system,0,47,The CEO of NASDAQ pushed to halt trading “to g...,2021-01-28 21:30:35,exit system
3,NEW SEC FILING FOR GME! CAN SOMEONE LESS RETAR...,29,74,,2021-01-28 21:28:57,new sec file gme someon less retard pleas inte...
4,"Not to distract from GME, just thought our AMC...",71,156,,2021-01-28 21:26:56,distract gme thought amc brother awar thi
5,WE BREAKING THROUGH,405,84,,2021-01-28 21:26:30,break
6,SHORT STOCK DOESN'T HAVE AN EXPIRATION DATE,317,53,Hedgefund whales are spreading disinfo saying ...,2021-01-28 21:26:27,short stock doesnt expir date
7,THIS IS THE MOMENT,405,178,Life isn't fair. My mother always told me that...,2021-01-28 21:19:31,thi moment
8,Currently Holding AMC and NOK - Is it retarded...,200,161,,2021-01-28 21:19:16,current hold amc nok retard think move gme today
9,I have nothing to say but BRUH I am speechless...,291,27,,2021-01-28 21:18:37,noth say bruh speechless moon


### Data Cleaning / Data Preprocessing

In [160]:
nltk.download('punkt')
porter = nltk.PorterStemmer()

[nltk_data] Downloading package punkt to C:\Users\Abhi
[nltk_data]     Joshi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [161]:
def text_preprocessing(text): 
    # tokenization, remove punctuation, lemmatization
    words = word_tokenize(str(text))
    words = RegexpTokenizer(r'\w+')
    words = [porter.stem(t) for t in str(text).split()]
    
    # remove symbols, websites, email addresses 
    words = [re.sub(r"[^A-Za-z@]", "", word) for word in words] 
    words = [re.sub(r"\S+com", "", word) for word in words]
    words = [re.sub(r"\S+@\S+", "", word) for word in words] 
    words = [word for word in words if word != " " and word != ""]
    words = [word for word in words if len(word) != 0] 
    
    # remove stopwords     
    stopwords = set(STOPWORDS)
    stopwords.update(nltk.corpus.stopwords.words('english'))
    stopwords_lower = [s.lower() for s in stopwords]
    words=[word.lower() for word in words if word.lower() not in stopwords_lower]
    
    # combine a list into one string   
    string = " ".join(words)
    
    return string

In [162]:
df['title_cleaned'] = df['title'].apply(text_preprocessing)
df['body_cleaned'] = df['body'].apply(text_preprocessing)

In [163]:
df.head(10)

Unnamed: 0,title,score,comms_num,body,timestamp,title_cleaned,body_cleaned
0,"It's not about the money, it's about sending a...",55,6,,2021-01-28 21:37:41,money send message,
1,Math Professor Scott Steiner says the numbers ...,110,23,,2021-01-28 21:32:10,math professor scott steiner say number spell ...,
2,Exit the system,0,47,The CEO of NASDAQ pushed to halt trading “to g...,2021-01-28 21:30:35,exit system,ceo nasdaq push halt trade give investor chanc...
3,NEW SEC FILING FOR GME! CAN SOMEONE LESS RETAR...,29,74,,2021-01-28 21:28:57,new sec file gme someon less retard pleas inte...,
4,"Not to distract from GME, just thought our AMC...",71,156,,2021-01-28 21:26:56,distract gme thought amc brother awar thi,
5,WE BREAKING THROUGH,405,84,,2021-01-28 21:26:30,break,
6,SHORT STOCK DOESN'T HAVE AN EXPIRATION DATE,317,53,Hedgefund whales are spreading disinfo saying ...,2021-01-28 21:26:27,short stock doesnt expir date,hedgefund whale spread disinfo say friday make...
7,THIS IS THE MOMENT,405,178,Life isn't fair. My mother always told me that...,2021-01-28 21:19:31,thi moment,life isnt fair mother alway told complain arbi...
8,Currently Holding AMC and NOK - Is it retarded...,200,161,,2021-01-28 21:19:16,current hold amc nok retard think move gme today,
9,I have nothing to say but BRUH I am speechless...,291,27,,2021-01-28 21:18:37,noth say bruh speechless moon,


### Exploratory Data Analysis

In [None]:
# Extract titles and bodies of the wsbets posts
vis_df = df[['title', 'body']].copy()
vis_df.head(10)

In [None]:
# Combine into plain text column
vis_df = vis_df.dropna()
vis_df['combined'] = vis_df['title'] + ' ' + vis_df['body']
vis_df = vis_df.reset_index()

In [None]:
vis_df.head(10)

In [None]:
# Start with one post
text = vis_df.combined[0]

# Create and generate a word cloud image:
wordcloud = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(text)

# Display Figure
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Now let's run the same analysis for all posts
text = ' '.join(vis_df.combined)
print ("There are {} words in the combination of all posts and titles on r/wsbets.".format(len(text)))

In [None]:
# Create stopword list


# Generate a word cloud image
wordcloud = WordCloud(stopwords=stopwords, background_color="white").generate(text)

# Display the generated image
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()