# Sentiment analysis for stock price prediction


#### Introduction

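This notebook loads tweets about NVDA and NVDA's daily stock prices and aligns the two datasets by date, as groundwork for using tweet sentiment to predict the stock price.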

In [51]:
# Ensure correct packages and settings
import os
import sys
import pathlib
import pandas as pd
from pathlib import Path

# Stop early unless the interpreter is running inside a virtual environment
# (inside one, sys.prefix differs from sys.base_prefix)
if sys.prefix == sys.base_prefix:
    print("Not running in a virtual environment. Activate the environment, install the packages and try again.")
    sys.exit(1)

print(f"Running in a virtual environment: {sys.prefix}")

Running in a virtual environment: c:\mainrism\imp\enterprises\projects\tweet-stock-sentiment\.venv


In [58]:
# Import NVDA tweets CSV into a DataFrame
# NOTE(review): the preview shows 'Tweet Id' as 1.622537e+18, i.e. it appears to be loaded
# as a float; if exact IDs are ever needed, read that column as a string instead - confirm.
nvda_tweets = pd.read_csv(Path("data/nvda-tweets.csv"))

# Parse 'Datetime' once; values that are not valid dates become NaT
tweet_times = pd.to_datetime(nvda_tweets['Datetime'], errors='coerce')
has_valid_time = tweet_times.notna()

# Remove all rows where 'Datetime' is not a valid date. The explicit .copy() makes the
# result an independent frame, so adding the 'Date' column below cannot raise
# SettingWithCopyWarning or write into a temporary view.
nvda_tweets = nvda_tweets.loc[has_valid_time].copy()

# Derive 'Date' (datetime.date objects, displayed as YYYY-MM-DD) from the parsed timestamps
nvda_tweets['Date'] = tweet_times[has_valid_time].dt.date

# Remove unnamed columns (labels that start with 'Unnamed')
nvda_tweets = nvda_tweets.loc[:, ~nvda_tweets.columns.str.startswith('Unnamed')]

# Move Date to the first column, selecting it by name rather than by position so the
# reorder stays correct even if 'Date' is not the last column
cols = ['Date'] + [col for col in nvda_tweets.columns if col != 'Date']
nvda_tweets = nvda_tweets[cols]

# First and last dates (used later to trim the price data to the tweet window)
firstDate = nvda_tweets['Date'].min()
lastDate = nvda_tweets['Date'].max()
print(f"First date: {firstDate}, Last date: {lastDate}")

# Previews
nvda_tweets.head()


First date: 2022-11-21, Last date: 2023-02-06


Unnamed: 0,Date,Datetime,Tweet Id,Text,Username
0,2023-02-06,2023-02-06 10:07:17+00:00,1.622537e+18,$nvda Top analyst price target for next week ...,RyderJohnston6
1,2023-02-06,2023-02-06 10:06:30+00:00,1.622537e+18,$nvda Top analyst price target for this week.🔗...,k_mebane
2,2023-02-06,2023-02-06 10:06:08+00:00,1.622537e+18,$nvda Top analyst price target for next week.....,RonaldBevan2
3,2023-02-06,2023-02-06 10:05:43+00:00,1.622537e+18,$NVDA Top analyst target price for next week🚀 ...,Eva077777
4,2023-02-06,2023-02-06 10:03:49+00:00,1.622537e+18,$rsls rebound 🚨🚨🚀🚀🚀\n————\n\n$EDSA\n$KODK\n$DO...,MrBlackTrading


In [66]:
# Load the NVDA daily stock prices, convert 'Date' to plain date objects and keep only
# the rows whose date lies within the tweet date range (both endpoints included)
nvda_prices = (
    pd.read_csv(Path("data/nvda-daily-stock-prices.csv"))
    .assign(Date=lambda prices: pd.to_datetime(prices['Date']).dt.date)
    .loc[lambda prices: prices['Date'].between(firstDate, lastDate)]
)

# Report how many price rows remain after trimming
print(f"NVDA stock prices data points: {len(nvda_prices)}")

# Show the first rows
nvda_prices.head()

NVDA stock prices data points: 52


Unnamed: 0,Date,Adj Close,Close,High,Low,Open,Volume
5998,2022-11-21,15.301953,15.317,15.477,15.08,15.147,404739000
5999,2022-11-22,16.022243,16.038,16.058001,15.122,15.328,472866000
6000,2022-11-23,16.502769,16.518999,16.527,16.048,16.098,427241000
6001,2022-11-25,16.254019,16.27,16.487,16.172001,16.318001,167934000
6002,2022-11-28,15.811451,15.827,16.357,15.725,16.025999,303741000


In [20]:
# Merge stock prices with tweets on the date
