Sentiment analysis

VADER

In [2]:
import pandas as pd
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.preprocessing import StandardScaler

In [3]:
df = pd.read_csv("data/stock_tweets.csv")

# Data Preprocessing to remove noise
def clean_tweet(text):
    # Remove mentions
    text = re.sub(r'@\w+', '', text)
    # Remove hashtags
    text = re.sub(r'#\w+', '', text)
    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)
    # Remove emojis using Unicode ranges
    text = re.sub(r'[\U0001F600-\U0001F64F'
                  r'\U0001F300-\U0001F5FF'
                  r'\U0001F680-\U0001F6FF'
                  r'\U0001F700-\U0001F77F'
                  r'\U0001F780-\U0001F7FF'
                  r'\U0001F800-\U0001F8FF'
                  r'\U0001F900-\U0001F9FF'
                  r'\U0001FA00-\U0001FA6F'
                  r'\U0001FA70-\U0001FAFF'
                  r'\U00002702-\U000027B0'
                  r'\U000024C2-\U0001F251]', '', text)
    return text.strip()

# Apply the function to the "tweet" column
df['Tweet'] = df['Tweet'].apply(clean_tweet)

df.head()

Unnamed: 0,Date,Tweet,Stock Name,Company Name
0,2022-09-29 23:41:16+00:00,Mainstream media has done an amazing job at b...,TSLA,"Tesla, Inc."
1,2022-09-29 23:24:43+00:00,Tesla delivery estimates are at around 364k f...,TSLA,"Tesla, Inc."
2,2022-09-29 23:18:08+00:00,3/ Even if I include 63.0M unvested RSUs as of...,TSLA,"Tesla, Inc."
3,2022-09-29 22:40:07+00:00,Hahaha why are you still trying to stop Tesla ...,TSLA,"Tesla, Inc."
4,2022-09-29 22:27:05+00:00,"Stop trying to kill kids, you sad deranged old...",TSLA,"Tesla, Inc."


In [4]:
sentiment = SentimentIntensityAnalyzer()

def get_sentiment_score(text):
    return sentiment.polarity_scores(text)['compound']

df['sentiment'] = df['Tweet'].apply(get_sentiment_score)
df['Date'] = pd.to_datetime(df['Date']).dt.date

# Group by both Date and Stock Name, and calculate the average sentiment
avg_sentiments = df.groupby(['Date', 'Stock Name'])['sentiment'].mean()

# Convert the result to a DataFrame and rename the sentiment column
avg_sentiments = avg_sentiments.to_frame(name='average_sentiment')

avg_sentiments

Unnamed: 0_level_0,Unnamed: 1_level_0,average_sentiment
Date,Stock Name,Unnamed: 2_level_1
2021-09-30,AAPL,0.098900
2021-09-30,AMD,0.417583
2021-09-30,AMZN,0.256980
2021-09-30,DIS,0.589300
2021-09-30,GOOG,0.440400
...,...,...
2022-09-29,NIO,0.259263
2022-09-29,PG,0.086080
2022-09-29,PYPL,0.630000
2022-09-29,TSLA,0.058996


In [5]:
# Load finance data
finance_data = pd.read_csv("data/stock_yfinance_data.csv")

# Calculate differences between previous and current day prices
finance_data['Close Diff Prev Day'] = finance_data['Close'].diff()

# Convert Date columns to datetime and keep only the date part
finance_data['Date'] = pd.to_datetime(finance_data['Date']).dt.date
df['Date'] = pd.to_datetime(df['Date']).dt.date

# Count tweets per day for each stock
tweet_counts = df.groupby(['Date', 'Stock Name']).size().reset_index(name='Number of Tweets')

# Merge tweet_counts with finance_data on Date and Stock Name
output_df = pd.merge(finance_data, tweet_counts, on=['Date', 'Stock Name'], how='left')

# Fill NaN values in Number of Tweets column (for days with no tweets)
output_df['Number of Tweets'] = output_df['Number of Tweets'].fillna(0)


# Standardize the prices
scaler = StandardScaler()
price_columns = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Close Diff Prev Day']
output_df[price_columns] = scaler.fit_transform(output_df[price_columns])

# Merge with average sentiment data on Date and Stock Name
merged_df = pd.merge(output_df, avg_sentiments, on=['Date', 'Stock Name'], how='left')
merged_df['average_sentiment'] = merged_df['average_sentiment'].fillna(0)
# Drop rows with NaN values in specific columns (e.g., 'average_sentiment' and 'Close')
merged_df = merged_df.dropna(subset=['average_sentiment', 'Close', 'Close Diff Prev Day'])

# Display the final merged dataframe
merged_df
#Processed Data to CSV
merged_df.to_csv('output.csv', index=False) 

In [6]:
merged_df = pd.get_dummies(output_df, columns=['Stock Name'])
merged_df

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Close Diff Prev Day,Number of Tweets,Stock Name_AAPL,...,Stock Name_MSFT,Stock Name_NFLX,Stock Name_NIO,Stock Name_NOC,Stock Name_PG,Stock Name_PYPL,Stock Name_TSLA,Stock Name_VZ,Stock Name_XPEV,Stock Name_ZS
0,2021-09-30,0.603202,0.593842,0.619950,0.590597,0.598731,53868000,,90.0,False,...,False,False,False,False,False,False,True,False,False,False
1,2021-10-01,0.596871,0.573776,0.591765,0.589963,0.598096,51094200,-0.003245,94.0,False,...,False,False,False,False,False,False,True,False,False,False
2,2021-10-04,0.640943,0.636712,0.622716,0.605332,0.613508,91449900,0.151491,119.0,False,...,False,False,False,False,False,False,True,False,False,False
3,2021-10-05,0.612455,0.613498,0.617974,0.603043,0.611212,55297800,-0.019261,88.0,False,...,False,False,False,False,False,False,True,False,False,False
4,2021-10-06,0.591514,0.587906,0.615553,0.608304,0.616488,43898400,0.053751,78.0,False,...,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6043,2022-09-23,-1.202860,-1.202321,-1.199131,-1.198031,-1.194914,28279600,0.000759,0.0,False,...,False,False,False,False,False,False,False,False,True,False
6044,2022-09-26,-1.194167,-1.195559,-1.190165,-1.193208,-1.190078,27891300,0.049511,1.0,False,...,False,False,False,False,False,False,False,False,True,False
6045,2022-09-27,-1.191976,-1.195776,-1.193796,-1.198031,-1.194914,21160800,-0.043754,0.0,False,...,False,False,False,False,False,False,False,False,True,False
6046,2022-09-28,-1.203152,-1.205717,-1.200391,-1.200807,-1.197698,31799400,-0.023970,0.0,False,...,False,False,False,False,False,False,False,False,True,False
