Sentiment analysis

VADER

In [2]:
import pandas as pd
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [4]:
df = pd.read_csv("data/stock_tweets.csv")

# Data Preprocessing to remove noise
def clean_tweet(text):
    # Remove mentions
    text = re.sub(r'@\w+', '', text)
    # Remove hashtags
    text = re.sub(r'#\w+', '', text)
    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)
    # Remove emojis using Unicode ranges
    text = re.sub(r'[\U0001F600-\U0001F64F'
                  r'\U0001F300-\U0001F5FF'
                  r'\U0001F680-\U0001F6FF'
                  r'\U0001F700-\U0001F77F'
                  r'\U0001F780-\U0001F7FF'
                  r'\U0001F800-\U0001F8FF'
                  r'\U0001F900-\U0001F9FF'
                  r'\U0001FA00-\U0001FA6F'
                  r'\U0001FA70-\U0001FAFF'
                  r'\U00002702-\U000027B0'
                  r'\U000024C2-\U0001F251]', '', text)
    return text.strip()


# Apply the function to the "tweet" column
df['Tweet'] = df['Tweet'].apply(clean_tweet)

df.head()
#For testing purposes
df.to_csv("new_tweets.csv", index=False)

In [None]:
#Process Finance
finance_data = pd.read_csv("data/stock_yfinance_data.csv")

#Differences between previous and current day prices 
finance_data['Close Diff Prev Day'] = finance_data['Close'].diff()

# Convert both Date columns to datetime and keep only the date part
finance_data['Date'] = pd.to_datetime(finance_data['Date']).dt.date
df['Date'] = pd.to_datetime(df['Date']).dt.date

# Count tweets per day for each stock
tweet_counts = df.groupby(['Date', 'Stock Name']).size().reset_index(name='Number of Tweets')

# Add a 'Number of Tweets' column to finance_data, initialized to 0
finance_data['Number of Tweets'] = 0

# Concatenate finance_data and tweet_counts along rows
output_df = pd.concat([finance_data, tweet_counts], ignore_index=True)

# Group by Date and Stock Name, summing the 'Number of Tweets' to combine any duplicates
output_df = output_df.groupby(['Date', 'Stock Name'], as_index=False).sum()

#One hot encode Stock Names for finance data
finance_data = pd.get_dummies(finance_data, columns=['Stock Name'])

#Differences between previous and current day prices 
finance_data['Close Diff Prev Day'] = finance_data['Close'].diff()

price_columns = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Close Diff Prev Day']

# Calculate mean and standard deviation for each price column
means = finance_data[price_columns].mean()
stds = finance_data[price_columns].std()

# Standardize the prices
finance_data[price_columns] = (finance_data[price_columns] - means) / stds


#Set financial data for output df
output_df[price_columns] = finance_data[price_columns]
output_df['Average Sentiment'] = None

#For testing purposes
output_df.to_csv("output.csv", index=False)


In [28]:
sentiment = SentimentIntensityAnalyzer()

def get_sentiment_score(text):
    return sentiment.polarity_scores(text)['compound']

df['sentiment_score'] = df['Cleaned Tweet'].apply(get_sentiment_score)



In [32]:
df.sample(n=5)

Unnamed: 0,Date,Tweet,Stock Name,Company Name,Cleaned Tweet,sentiment_score
21871,2022-02-04,"Nearly 6,000 Tesla Model 3 and Model Y are rea...",TSLA,"Tesla, Inc.","Nearly 6,000 Tesla Model 3 and Model Y are rea...",0.3612
4236,2022-08-09,"So in the Chinese market in July, $TSLA sold 8...",TSLA,"Tesla, Inc.","So in the Chinese market in July, TSLA sold 84...",0.6705
3432,2022-08-16,"Sold the $926 and took the $15,000. $TSLA",TSLA,"Tesla, Inc.","Sold the 926 and took the 15,000 TSLA",0.0
46369,2022-03-31,Interesting $TSLA is now nearly 2x the size of...,META,"Meta Platforms, Inc.",Interesting TSLA is now nearly 2x the size of ...,0.4019
47450,2022-01-20,Why does every smart person you know like $FB ...,META,"Meta Platforms, Inc.",Why does every smart person you know like FB a...,0.6369
