# Sentiment analysis for stock price prediction


#### Introduction

This notebook ...

In [15]:
# Ensure correct packages and settings
import os
import sys
import pathlib
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Check if running within a virtual environment
if sys.prefix != sys.base_prefix:
    print(f"Running in a virtual environment: {sys.prefix}")
else:
    print("Not running in a virtual environment. Activate the environment, install the packages and try again.")
    sys.exit(1)

Running in a virtual environment: c:\mainrism\imp\enterprises\projects\tweet-stock-sentiment\.venv


### Import and augment Tweet dataset

In [16]:
# Import NVDA tweets CSV into a DataFrame
nvda_tweets = pd.read_csv(Path("data/nvda-tweets.csv"))

# Remove all rows where 'Datetime' is not a valid date
nvda_tweets = nvda_tweets[pd.to_datetime(nvda_tweets['Datetime'], errors='coerce').notnull()]

# Convert 'Datetime' column to 'Date' in YYYY-MM-DD format
nvda_tweets['Date'] = pd.to_datetime(nvda_tweets['Datetime']).dt.date

# Sort by Date ascending
nvda_tweets = nvda_tweets.sort_values(by='Date')

# Remove unnammed columns
nvda_tweets = nvda_tweets.loc[:, ~nvda_tweets.columns.str.contains('^Unnamed')]

# Move Date to the first column
colsToShow = nvda_tweets.columns.tolist()
colsToShow = [colsToShow[-1]] + colsToShow[:-1]
nvda_tweets = nvda_tweets[colsToShow]

# First and last dates
firstDate = nvda_tweets['Date'].min()
lastDate = nvda_tweets['Date'].max()
print(f"First date: {firstDate}, Last date: {lastDate}")

# Add sentiment analysis columns to the Tweets DataFrame
nvda_tweets['VADER_Positive'] = 0.0
nvda_tweets['VADER_Negative'] = 0.0
nvda_tweets['VADER_Neutral'] = 0.0
nvda_tweets['VADER_Compound_Sentiment'] = 0.0
nvda_tweets['VADER_Final_Sentiment'] = 0

# Iterate over each tweet and calculate sentiment scores
analyzer = SentimentIntensityAnalyzer()
for index, row in nvda_tweets.iterrows():
    tweet = row['Text']
    vs = analyzer.polarity_scores(tweet)
    nvda_tweets.at[index, 'VADER_Positive'] = vs['pos']
    nvda_tweets.at[index, 'VADER_Negative'] = vs['neg']
    nvda_tweets.at[index, 'VADER_Neutral'] = vs['neu']
    nvda_tweets.at[index, 'VADER_Compound_Sentiment'] = vs['compound']
    if vs['compound'] >= 0.05:
        nvda_tweets.at[index, 'VADER_Final_Sentiment'] = 1
    elif vs['compound'] <= -0.05:
        nvda_tweets.at[index, 'VADER_Final_Sentiment'] = -1
    else:
        nvda_tweets.at[index, 'VADER_Final_Sentiment'] = 0

# Save the updated tweets DataFrame to a new CSV
nvda_tweets.to_csv(Path("data/nvda-tweets-augmented.csv"), index=False)

# Preview
nvda_tweets.head()


First date: 2022-11-21, Last date: 2023-02-06


Unnamed: 0,Date,Datetime,Tweet Id,Text,Username,VADER_Positive,VADER_Negative,VADER_Neutral,VADER_Compound_Sentiment,VADER_Final_Sentiment
100846,2022-11-21,2022-11-21 14:30:16+00:00,1.5947e+18,$NFLX $290 to $295. Over $295 to $297-$300-$30...,Turbobob129,0.0,0.0,1.0,0.0,0
100403,2022-11-21,2022-11-21 19:33:25+00:00,1.594776e+18,"“Most winning trading community, Get next winn...",Smith28301,0.298,0.0,0.702,0.7783,1
100402,2022-11-21,2022-11-21 19:37:51+00:00,1.594777e+18,**Most profitable trading community. Get up to...,nappedonthebed,0.129,0.0,0.871,0.7264,1
100401,2022-11-21,2022-11-21 19:38:53+00:00,1.594777e+18,**Most profitable trading \nhttps://t.co/U9AUM...,Trades75699329,0.202,0.0,0.798,0.7264,1
100400,2022-11-21,2022-11-21 19:39:09+00:00,1.594777e+18,"Best place for day trading, swing trading, sto...",Smith28301,0.208,0.0,0.792,0.6369,1


### Import and augment stock prices dataset

In [None]:
# Import NVDA daily stock prices CSV into a DataFrame
nvda_prices = pd.read_csv(Path("data/nvda-daily-stock-prices.csv"))

# Convert 'Datetime' column to 'Date' in YYYY-MM-DD format
nvda_prices['Date'] = pd.to_datetime(nvda_prices['Date']).dt.date

# Sort by Date ascending
nvda_prices = nvda_prices.sort_values(by='Date')

# Add if the stock price increased compared to the previous day
nvda_prices['Price_Up'] = 0
nvda_prices['Price_Change'] = 0.0
nvda_prices['Price_Change_Pct'] = 0.0

for i in range(1, len(nvda_prices)):
    if nvda_prices.at[i, 'Close'] > nvda_prices.at[i - 1, 'Close']:
        nvda_prices.at[i, 'Price_Up'] = 1
    else:
        nvda_prices.at[i, 'Price_Up'] = 0
    nvda_prices.at[i, 'Price_Change'] = nvda_prices.at[i, 'Close'] - nvda_prices.at[i - 1, 'Close']
    if nvda_prices.at[i - 1, 'Close'] != 0:
        nvda_prices.at[i, 'Price_Change_Pct'] = (nvda_prices.at[i, 'Price_Change'] / nvda_prices.at[i - 1, 'Close']) * 100
    else:
        nvda_prices.at[i, 'Price_Change_Pct'] = 0.0

# Add the average sentiment per day to the stock prices DataFrame
daily_sentiment = nvda_tweets.groupby('Date').agg({
    'VADER_Positive': 'mean',
    'VADER_Negative': 'mean',
    'VADER_Neutral': 'mean',
    'VADER_Compound_Sentiment': 'mean'
}).reset_index()

# Merge daily sentiment with stock prices on Date
nvda_prices = pd.merge(nvda_prices, daily_sentiment, how='left', left_on='Date', right_on='Date')

# Save the updated stock prices DataFrame to a new CSV
nvda_prices.to_csv(Path("data/nvda-daily-stock-prices-augmented.csv"), index=False)

# Amount of data
print(f"NVDA stock prices data points: {len(nvda_prices)}")

# Preview
nvda_prices.tail()

### Adding new features to the Stock Prices dataset

In [18]:
# Add the average sentiment scores per day to the stock prices DataFrame
daily_sentiment = nvda_tweets.groupby('Date').agg({
    'Positive': 'mean',
    'Negative': 'mean',
    'Neutral': 'mean',
    'Compound': 'mean'
}).reset_index()

# Merge daily sentiment with stock prices on 'Date'
nvda_prices_and_sentiment = pd.merge(nvda_prices, daily_sentiment, on='Date', how='left')
nvda_prices_and_sentiment.head()

# Add a 'Sentiment' column based on the 'Compound' score
def sentiment_from_compound(compound):
    if pd.isna(compound):
        return 'No Data'
    elif compound >= 0.05:
        return 'Positive'
    elif compound <= -0.05:
        return 'Negative'
    else:
        return 'Neutral'

nvda_prices_and_sentiment['Sentiment'] = nvda_prices_and_sentiment['Compound'].apply(sentiment_from_compound)

# Sort by Date ascending
nvda_prices_and_sentiment = nvda_prices_and_sentiment.sort_values(by='Date')

# Count how many days have each sentiment
print(nvda_prices_and_sentiment['Sentiment'].value_counts())

# Get the lowest, highest, and average Compound scores
lowest_compound = nvda_prices_and_sentiment['Compound'].min()
highest_compound = nvda_prices_and_sentiment['Compound'].max()
average_compound = nvda_prices_and_sentiment['Compound'].mean()
print(f"Lowest Compound: {lowest_compound}, Highest Compound: {highest_compound}, Average Compound: {average_compound}")

# Add if the stock price went up or down that day
nvda_prices_and_sentiment['Diff_Close'] = nvda_prices_and_sentiment['Close'].diff()
nvda_prices_and_sentiment['Was_A_Buy_Day'] = nvda_prices_and_sentiment['Diff_Close'].apply(lambda x: 1 if x > 0 else 0)

nvda_prices_and_sentiment.to_csv(Path("data/nvda-prices-and-sentiment.csv"), index=False)

# Preview the entire DataFrame histogram except for Was_A_Buy_Day
colsToShow = ['Open', 'Close', 'Compound', 'Diff_Close']
nvda_prices_and_sentiment[colsToShow].hist(bins=30, figsize=(12,8))
plt.show()

# Boxplots to spot outliers
nvda_prices_and_sentiment.plot(kind='box', subplots=True, layout=(4,4), figsize=(12,8))
plt.show()

# Correlation heatmap
plt.figure(figsize=(10,6))
sns.heatmap(nvda_prices_and_sentiment.corr(numeric_only=True), annot=True, cmap="coolwarm")
plt.show()


KeyError: "Column(s) ['Compound', 'Negative', 'Neutral', 'Positive'] do not exist"