# Reddit Wall Street Bets Sentiment Analysis

<img src="../images/reddit.jpg">

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import spacy
import re
import plotly.express as px

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, RegexpTokenizer
from matplotlib.pyplot import figure
from datetime import datetime
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

### Data Cleaning

In [None]:
# Load the Reddit WSB posts CSV (one directory up) into `df`.
df = pd.read_csv('../reddit_wsb.csv')

In [None]:
# Load the GME price CSV (one directory up) into `df_gme`.
df_gme = pd.read_csv('../gme.csv')

In [None]:
# Drop unnecessary columns for analysis
# NOTE: DataFrame.drop raises KeyError if any listed column is missing from
# the CSV, so this list must match the file's header exactly.
df = df.drop(columns=['id', 'url', 'created', 'Unnamed: 10', 'Dates'])

In [None]:
# Drop the price columns listed below; the price plot later in the notebook
# reads only 'Date' and 'Adj Close'.
df_gme = df_gme.drop(columns=['Open', 'High', 'Low', 'Close', 'Volume', 'Change'])

In [None]:
# Handle missing values
# Print the per-column null counts explicitly: a bare expression that is not
# the last statement of a notebook cell is evaluated but never displayed.
print(df.isnull().sum())
# Keep only rows that have both a 'timestamp' and a 'change (+/-)' value.
df = df.dropna(subset=['timestamp', 'change (+/-)'])

In [None]:
# Print the per-column null counts explicitly: a bare expression that is not
# the last statement of a notebook cell is evaluated but never displayed.
print(df_gme.isnull().sum())
# Drop every price row that has a missing value in any column.
df_gme = df_gme.dropna()

In [None]:
# Information about dataset: prints column names, dtypes and non-null counts
# for the posts frame.
df.info()

In [None]:
# Same structural summary (columns, dtypes, non-null counts) for the price frame.
df_gme.info()

In [None]:
# Describe dataset: summary statistics of the posts frame (pandas summarises
# the numeric columns by default).
df.describe()

In [None]:
# Summary statistics of the columns kept in the price frame.
df_gme.describe()

In [None]:
# Preview the first 10 rows of the posts frame.
df.head(10)

In [None]:
# Preview the first 10 rows of the price frame.
df_gme.head(10)

### Text Preprocessing

In [None]:
# Tokenizer models required by nltk's word_tokenize.
nltk.download('punkt')
# English stop-word corpus read via nltk.corpus.stopwords.words('english')
# later in the notebook; without it that call raises LookupError.
nltk.download('stopwords')
# Shared Porter stemmer instance used by text_preprocessing.
porter = nltk.PorterStemmer()

In [None]:
# Patterns are applied to the RAW whitespace-separated token, before any
# punctuation is stripped, so that "welcome" or "recommend" are not mistaken
# for a ".com" address.
_EMAIL_TOKEN = re.compile(r"\S+@\S+")
_URL_TOKEN = re.compile(r"https?://|www\.|\.com\b", re.IGNORECASE)
_NON_LETTERS = re.compile(r"[^A-Za-z]")
_STOP_WORD_CACHE = None


def _get_stop_words():
    """Return the lower-cased wordcloud + NLTK English stop words, built once."""
    global _STOP_WORD_CACHE
    if _STOP_WORD_CACHE is None:
        combined = set(STOPWORDS)
        combined.update(nltk.corpus.stopwords.words('english'))
        # A frozenset gives O(1) membership tests and is safe to share.
        _STOP_WORD_CACHE = frozenset(word.lower() for word in combined)
    return _STOP_WORD_CACHE


def text_preprocessing(text):
    """Normalise one post field into a space-joined string of stemmed tokens.

    Steps, per whitespace-separated token:
      1. drop tokens that look like e-mail addresses or web links;
      2. strip every non-letter character and lower-case the rest;
      3. drop empty tokens and stop words (wordcloud STOPWORDS plus the NLTK
         English list);
      4. Porter-stem what is left.

    Stop words are removed BEFORE stemming so that stems such as "thi"
    (from "this") cannot slip past the filter.

    Args:
        text: The raw text. ``None`` and float NaN (missing cells read by
            pandas) are treated as empty text instead of the string "nan".
            Any other value is converted with ``str()``.

    Returns:
        The cleaned tokens joined by single spaces; "" when nothing survives.
    """
    if text is None or (isinstance(text, float) and pd.isna(text)):
        return ""

    stop_words = _get_stop_words()
    cleaned = []
    for token in str(text).split():
        if _EMAIL_TOKEN.search(token) or _URL_TOKEN.search(token):
            continue
        lowered = token.lower()
        # Check the raw form too, so contractions like "don't" are caught
        # before the apostrophe is stripped.
        if lowered in stop_words:
            continue
        word = _NON_LETTERS.sub("", lowered)
        if not word or word in stop_words:
            continue
        cleaned.append(porter.stem(word))

    # combine the token list into one string
    return " ".join(cleaned)

In [None]:
# Clean the title and the body independently, then join the two cleaned
# strings with a single space into the column the later cells work on.
df['title_cleaned'] = df['title'].map(text_preprocessing)
df['body_cleaned'] = df['body'].map(text_preprocessing)
df['combined_cleaned'] = df['title_cleaned'].str.cat(df['body_cleaned'], sep=' ')

In [None]:
# Preview the first 10 rows, now including the *_cleaned columns.
df.head(10)

### Data Visualization

In [None]:
# Take an independent copy of the raw title/body/timestamp columns so the
# visualisation cells cannot modify `df`.
vis_df = df.loc[:, ['title', 'body', 'timestamp']].copy()
vis_df.head(10)

In [None]:
# Drop rows with any missing value, add the raw "title body" text as
# `combined`, and move the old row labels into an 'index' column.
vis_df = (
    vis_df.dropna()
    .assign(combined=lambda frame: frame['title'] + ' ' + frame['body'])
    .reset_index()
)

In [None]:
# Preview the first 10 rows of the visualisation frame.
vis_df.head(10)

In [None]:
# Join every post's combined title/body text into one corpus string
text = ' '.join(vis_df.combined)
# len(text) would count characters; split on whitespace to count words,
# which is what the message reports.
word_count = len(text.split())
print("There are {} words in the combination of all posts and titles on r/wsbets.".format(word_count))

In [None]:
# Stop words for the cloud: wordcloud's built-in list plus NLTK's English list.
# NOTE(review): this rebinds the name `stopwords`, shadowing the module
# imported from nltk.corpus at the top of the notebook.
stopwords = set(STOPWORDS) | set(nltk.corpus.stopwords.words('english'))

# Build the word cloud from the full corpus text
wordcloud = WordCloud(stopwords=stopwords, background_color="white")
wordcloud.generate(text)

# Render the cloud as an image without axes
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Visualize GME stock prices over time: line chart of the 'Adj Close' column
# against 'Date' from the price frame.
fig = px.line(df_gme, x='Date', y='Adj Close', title="GME Stock Prices over Time")
fig.show()

### GME Sentiment Analysis

In [None]:
# Helper function to get only GME posts
def getGME(text):
    """Return True if *text* mentions GameStop.

    A post counts as a GameStop post when it contains "gamestop" or
    "game stop" anywhere (case-insensitive), or the ticker "gme" as a
    stand-alone word. The ticker is matched on word boundaries so that
    unrelated words containing the letters "gme" (e.g. "segment",
    "fragment", "judgment") are not flagged.

    Args:
        text: The post text (a string).

    Returns:
        bool: whether the text refers to GameStop.
    """
    lowered = text.lower()
    if 'gamestop' in lowered or 'game stop' in lowered:
        return True
    return re.search(r'\bgme\b', lowered) is not None

In [None]:
# Start from the columns needed for the GME analysis, flag each post with
# getGME, and keep only the flagged rows (original row labels are preserved).
gme_sent_df = df[['timestamp', 'combined_cleaned', 'change (+/-)']].copy()
gme_sent_df['containsGME'] = df['combined_cleaned'].apply(getGME)
gme_sent_df = gme_sent_df[gme_sent_df['containsGME']].copy()

In [None]:
# Remove the helper flag column and renumber the rows from 0; the previous
# row labels are kept in a new 'index' column.
gme_sent_df = gme_sent_df.drop(columns=['containsGME']).reset_index()

In [None]:
# Preview the first 10 GME-related posts.
gme_sent_df.head(10)

In [None]:
# Train and test split (fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    gme_sent_df['combined_cleaned'],
    gme_sent_df['change (+/-)'],
    random_state=0,
)

# Build a machine learning pipeline: TF-IDF features feeding a logistic
# regression. lowercase=False because text_preprocessing already lower-cases.
est = Pipeline([
    ('vectorizer', TfidfVectorizer(lowercase=False)),
    ('classifier', LogisticRegression(solver='liblinear', max_iter=1000)),
])

# GridSearchCV with a transformer and an estimator
# (integer min_df = absolute document count, float = proportion of documents)
parameters = {
    'vectorizer__max_df': (0.8, 0.9),
    'vectorizer__min_df': [20, 50, 0.1],
    'classifier__C': np.logspace(-3, 3, 7),
    'classifier__penalty': ['l1', 'l2'],
}
gs = GridSearchCV(est, param_grid=parameters)

# Fit the training data
gs.fit(X_train, y_train)

# Evaluate the model
# The fitted search object wraps the whole best pipeline (refit=True is the
# default), so it vectorises raw text itself; the names `model` and `vect`
# used previously were never defined.
predictions = gs.predict(X_test)
# AUC is a ranking metric, so score it with the probability of the second
# class rather than hard labels.
# NOTE(review): assumes 'change (+/-)' is a binary label - confirm.
positive_scores = gs.predict_proba(X_test)[:, 1]
print('AUC: ', roc_auc_score(y_test, positive_scores))

### Tracking GME Sentiment Over Time

In [None]:
# Create functions to get subjectivity and polarity
def getSubjectivity(text):
    """Return the subjectivity score TextBlob assigns to *text*."""
    blob_sentiment = TextBlob(text).sentiment
    return blob_sentiment.subjectivity

def getPolarity(text):
    """Return the polarity score TextBlob assigns to *text*."""
    blob_sentiment = TextBlob(text).sentiment
    return blob_sentiment.polarity

In [None]:
# Get subjectivity and polarity of GME posts
# gme_sent_df has no 'combined' column (it was built from 'timestamp',
# 'combined_cleaned' and 'change (+/-)'), so score the cleaned text.
gme_sent_df['Subjectivity'] = gme_sent_df['combined_cleaned'].apply(getSubjectivity)
gme_sent_df['Polarity'] = gme_sent_df['combined_cleaned'].apply(getPolarity)

In [None]:
# Preview the first 10 GME posts with the new TextBlob columns.
gme_sent_df.head(10)

In [None]:
# Function to get the sentiment for each post
_sia_analyzer = None


def getSIA(text):
    """Return VADER's polarity scores for *text*.

    The analyzer is created on first use and reused afterwards, so the
    constructor does not run again for every post.

    Args:
        text: The post text to score.

    Returns:
        The dict produced by ``SentimentIntensityAnalyzer.polarity_scores``;
        callers in this notebook read its 'neg', 'neu', 'pos' and 'compound'
        keys.
    """
    global _sia_analyzer
    if _sia_analyzer is None:
        _sia_analyzer = SentimentIntensityAnalyzer()
    return _sia_analyzer.polarity_scores(text)

In [None]:
# Get sentiment scores for each post
compound = []
neg = []
neu = []  # was appended to without ever being created (NameError)
pos = []
sent = []
sent_value = []
# gme_sent_df has no 'combined' column; score the cleaned text instead.
# Iterating the column directly also avoids depending on the index labels.
for post_text in gme_sent_df['combined_cleaned']:
    SIA = getSIA(post_text)
    compound.append(SIA['compound'])
    neg.append(SIA['neg'])
    neu.append(SIA['neu'])
    pos.append(SIA['pos'])
    # The dominant score decides the label; ties (including 0 vs 0) are
    # labelled 'negative', exactly as before.
    max_value = max(SIA['neg'], SIA['pos'])
    sent_value.append(max_value)
    if max_value == SIA['neg']:
        sent.append('negative')
    else:
        sent.append('positive')
# Store sentiments inside dataframe (lists are assigned positionally)
gme_sent_df['Compound'] = compound
gme_sent_df['Positive'] = pos
gme_sent_df['Neutral'] = neu
gme_sent_df['Negative'] = neg
gme_sent_df['Sentiment'] = sent
gme_sent_df['Sentiment Value'] = sent_value

In [None]:
# Preview the first 10 GME posts with the VADER sentiment columns.
gme_sent_df.head(10)

In [None]:
# Scatter the dominant sentiment score over time for posts labelled positive
is_positive = gme_sent_df['Sentiment'] == 'positive'
pos_sent_df = gme_sent_df.loc[is_positive].reset_index()
fig = px.scatter(
    pos_sent_df,
    x='timestamp',
    y=['Sentiment Value'],
    title="GME Positive Sentiment over Time",
)
fig.show()

In [None]:
# Scatter the dominant sentiment score over time for posts labelled negative
is_negative = gme_sent_df['Sentiment'] == 'negative'
neg_sent_df = gme_sent_df.loc[is_negative].reset_index()
fig = px.scatter(
    neg_sent_df,
    x='timestamp',
    y=['Sentiment Value'],
    title="GME Negative Sentiment over Time",
)
fig.show()