Sentiment analysis

VADER

In [1]:
import pandas as pd
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.preprocessing import StandardScaler



In [2]:
# Read the stock-tweet CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer="data/stock_tweets.csv")

# Data Preprocessing to remove noise
def clean_tweet(text):
    """Strip mentions, hashtags, URLs and emoji from a tweet.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (for example the float NaN
        that pandas produces for a missing CSV cell) is treated as an
        empty tweet instead of raising ``TypeError`` inside ``re.sub``.

    Returns
    -------
    str
        The text with ``@mentions``, ``#hashtags``, ``http``/``www`` URLs
        and emoji removed, and leading/trailing whitespace trimmed.
    """
    if not isinstance(text, str):
        return ''

    # Remove mentions
    text = re.sub(r'@\w+', '', text)
    # Remove hashtags (the whole tag, including its word)
    text = re.sub(r'#\w+', '', text)
    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)
    # Remove emojis using Unicode ranges.
    # The ranges are listed individually on purpose: a single catch-all such
    # as U+24C2..U+1F251 also swallows every CJK, Hangul and many other
    # non-emoji characters that lie between those two code points.
    text = re.sub(r'['
                  r'\U0001F600-\U0001F64F'   # emoticons
                  r'\U0001F300-\U0001F5FF'   # symbols & pictographs
                  r'\U0001F680-\U0001F6FF'   # transport & map symbols
                  r'\U0001F700-\U0001F77F'
                  r'\U0001F780-\U0001F7FF'
                  r'\U0001F800-\U0001F8FF'
                  r'\U0001F900-\U0001F9FF'   # supplemental symbols & pictographs
                  r'\U0001FA00-\U0001FA6F'
                  r'\U0001FA70-\U0001FAFF'
                  r'\U00002702-\U000027B0'   # dingbats
                  r'\U00002600-\U000026FF'   # miscellaneous symbols
                  r'\U0000FE00-\U0000FE0F'   # variation selectors
                  r'\U0001F1E6-\U0001F1FF'   # regional indicators (flags)
                  r'\U0001F200-\U0001F251'   # enclosed ideographic supplement
                  r'\U000024C2'              # circled M
                  r']', '', text)
    return text.strip()

# Clean every entry of the "Tweet" column, then preview the first rows
df['Tweet'] = df['Tweet'].map(clean_tweet)

df.head()

Unnamed: 0,Date,Tweet,Stock Name,Company Name
0,2022-09-29 23:41:16+00:00,Mainstream media has done an amazing job at b...,TSLA,"Tesla, Inc."
1,2022-09-29 23:24:43+00:00,Tesla delivery estimates are at around 364k f...,TSLA,"Tesla, Inc."
2,2022-09-29 23:18:08+00:00,3/ Even if I include 63.0M unvested RSUs as of...,TSLA,"Tesla, Inc."
3,2022-09-29 22:40:07+00:00,Hahaha why are you still trying to stop Tesla ...,TSLA,"Tesla, Inc."
4,2022-09-29 22:27:05+00:00,"Stop trying to kill kids, you sad deranged old...",TSLA,"Tesla, Inc."


In [3]:
# One analyzer instance, reused for every tweet
sentiment = SentimentIntensityAnalyzer()

def get_sentiment_score(text):
    """Return the 'compound' entry of VADER's polarity scores for ``text``."""
    scores = sentiment.polarity_scores(text)
    return scores['compound']

# Score every tweet with the compound polarity
df['sentiment'] = df['Tweet'].map(get_sentiment_score)

# Reduce timestamps to calendar dates so tweets can be bucketed per day
df['Date'] = pd.to_datetime(df['Date']).dt.date

# Mean score per (Date, Stock Name) pair, kept as a one-column DataFrame
avg_sentiments = (
    df.groupby(['Date', 'Stock Name'])['sentiment']
    .mean()
    .to_frame(name='average_sentiment')
)

avg_sentiments

Unnamed: 0_level_0,Unnamed: 1_level_0,average_sentiment
Date,Stock Name,Unnamed: 2_level_1
2021-09-30,AAPL,0.098900
2021-09-30,AMD,0.417583
2021-09-30,AMZN,0.256980
2021-09-30,DIS,0.589300
2021-09-30,GOOG,0.440400
...,...,...
2022-09-29,NIO,0.259263
2022-09-29,PG,0.086080
2022-09-29,PYPL,0.630000
2022-09-29,TSLA,0.058996


In [None]:
# Load finance data
finance_data = pd.read_csv("data/stock_yfinance_data.csv")

# Day-over-day change in closing price, computed separately for each ticker.
# The file holds several tickers stacked one after another (see 'Stock Name'),
# so a plain .diff() over the whole column would subtract the last close of
# one stock from the first close of the next. Grouping first leaves NaN on
# each ticker's first row instead; the dropna on this column further down
# removes those rows.
# NOTE(review): assumes each ticker's rows are already in chronological order.
finance_data['Close Diff Prev Day'] = (
    finance_data.groupby('Stock Name')['Close'].diff()
)

# Normalise both Date columns to plain calendar dates so they share a join key
finance_data['Date'] = pd.to_datetime(finance_data['Date']).dt.date
df['Date'] = pd.to_datetime(df['Date']).dt.date

# Tweet volume per (Date, Stock Name)
tweet_counts = (
    df.groupby(['Date', 'Stock Name'])
    .size()
    .reset_index(name='Number of Tweets')
)

# Attach tweet volume to every price row; price rows without tweets get 0
output_df = finance_data.merge(tweet_counts, on=['Date', 'Stock Name'], how='left')
output_df['Number of Tweets'] = output_df['Number of Tweets'].fillna(0)

# Attach the mean sentiment; price rows without tweets get a sentiment of 0
merged_df = output_df.merge(avg_sentiments, on=['Date', 'Stock Name'], how='left')
merged_df['average_sentiment'] = merged_df['average_sentiment'].fillna(0)

# Discard rows still missing a value in any of these columns
merged_df = merged_df.dropna(
    subset=['average_sentiment', 'Close', 'Close Diff Prev Day']
)

# Persist the processed data and display it
merged_df.to_csv('data/cleaned_data_vader.csv', index=False)
merged_df

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Stock Name,Close Diff Prev Day,Number of Tweets,average_sentiment
1,2021-10-01,259.466675,260.260010,254.529999,258.406677,258.406677,51094200,TSLA,-0.086670,94.0,0.233605
2,2021-10-04,265.500000,268.989990,258.706665,260.510010,260.510010,91449900,TSLA,2.103333,119.0,0.134079
3,2021-10-05,261.600006,265.769989,258.066681,260.196655,260.196655,55297800,TSLA,-0.313355,88.0,0.085502
4,2021-10-06,258.733337,262.220001,257.739990,260.916656,260.916656,43898400,TSLA,0.720001,78.0,0.219463
5,2021-10-07,261.820007,268.333344,261.126679,264.536682,264.536682,57587400,TSLA,3.620026,137.0,0.193179
...,...,...,...,...,...,...,...,...,...,...,...
6043,2022-09-23,13.090000,13.892000,12.860000,13.710000,13.710000,28279600,XPEV,-0.030000,0.0,0.000000
6044,2022-09-26,14.280000,14.830000,14.070000,14.370000,14.370000,27891300,XPEV,0.660000,1.0,0.585900
6045,2022-09-27,14.580000,14.800000,13.580000,13.710000,13.710000,21160800,XPEV,-0.660000,0.0,0.000000
6046,2022-09-28,13.050000,13.421000,12.690000,13.330000,13.330000,31799400,XPEV,-0.380000,0.0,0.000000
