In [1]:
import pandas as pd
from transformers import pipeline
import re

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the raw stock-tweet dataset into a DataFrame.
df = pd.read_csv(filepath_or_buffer="data/stock_tweets.csv")

# Data Preprocessing to remove noise
# Patterns are compiled once here instead of being re-looked-up on every call.
_MENTION_RE = re.compile(r'@\w+')
_HASHTAG_RE = re.compile(r'#\w+')
_URL_RE = re.compile(r'http\S+|www\S+')

# Emoji / pictograph code points. The previous character class ended with the
# single range U+24C2..U+1F251, which also spans CJK ideographs, kana, Hangul
# and many other scripts, so ordinary non-Latin text was deleted together with
# the emojis. That span is replaced by the explicit symbol blocks below; every
# code point listed here was also covered by the old class.
_EMOJI_RE = re.compile(
    r'['
    r'\U0001F600-\U0001F64F'  # Emoticons
    r'\U0001F300-\U0001F5FF'  # Miscellaneous Symbols and Pictographs
    r'\U0001F680-\U0001F6FF'  # Transport and Map Symbols
    r'\U0001F700-\U0001F77F'  # Alchemical Symbols
    r'\U0001F780-\U0001F7FF'  # Geometric Shapes Extended
    r'\U0001F800-\U0001F8FF'  # Supplemental Arrows-C
    r'\U0001F900-\U0001F9FF'  # Supplemental Symbols and Pictographs
    r'\U0001FA00-\U0001FA6F'  # Chess Symbols
    r'\U0001FA70-\U0001FAFF'  # Symbols and Pictographs Extended-A
    r'\U000024C2'             # Circled Latin capital letter M
    r'\U00002600-\U000027BF'  # Miscellaneous Symbols + Dingbats
    r'\U00002B00-\U00002BFF'  # Miscellaneous Symbols and Arrows
    r'\U0000FE0F'             # Variation Selector-16 (emoji presentation)
    r'\U0001F000-\U0001F251'  # Mahjong/domino/card tiles, enclosed alphanumerics (flags), enclosed ideographs
    r']'
)


def clean_tweet(text):
    """Strip social-media noise from a single tweet.

    Removes, in this order: @mentions, #hashtags (the whole tag, including
    its word), URLs starting with ``http`` or ``www``, and emoji/pictograph
    characters. Leading and trailing whitespace is then trimmed. Whitespace
    left *inside* the text by a removal is not collapsed.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned tweet. Text in non-Latin scripts (e.g. Chinese, Japanese,
        Korean) is preserved.

    Raises
    ------
    TypeError
        If ``text`` is not a string (for example a missing value read as NaN).
    """
    text = _MENTION_RE.sub('', text)
    text = _HASHTAG_RE.sub('', text)
    text = _URL_RE.sub('', text)
    text = _EMOJI_RE.sub('', text)
    return text.strip()


# Clean every entry of the "Tweet" column with clean_tweet.
df['Tweet'] = df['Tweet'].apply(clean_tweet)

# Export the cleaned frame (for testing purposes) so it can be inspected outside the notebook.
df.to_csv("new_tweets.csv", index=False)

# Preview the first rows. This must be the cell's last expression: Jupyter only
# renders the value of the final expression, so a bare df.head() placed before
# other statements is evaluated and silently discarded.
df.head()

In [6]:
# Take an independent copy of df (deep=True is the pandas default) so that
# later changes to copy_df do not modify df.
copy_df = df.copy(deep=True)

In [9]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Load the FinBERT tone model and its tokenizer from the same checkpoint.
FINBERT_CHECKPOINT = "yiyanghkust/finbert-tone"
tokenizer = AutoTokenizer.from_pretrained(FINBERT_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(FINBERT_CHECKPOINT)

# Sentiment-analysis pipeline built on the model/tokenizer loaded above.
nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# Keep only the first 10 rows for a quick trial. The explicit .copy() makes the
# sample its own DataFrame, so adding columns below does not write into a slice
# of the larger frame (which pandas may flag with SettingWithCopyWarning).
copy_df = copy_df.iloc[:10].copy()


def _finbert_label_and_score(tweet):
    """Classify one tweet and return its first prediction's label and score.

    The pipeline is invoked exactly once per tweet; the previous lambda called
    it twice (once for the label, once for the score), doubling inference time.
    The Series keys match the destination column names.
    """
    prediction = nlp(tweet)[0]
    return pd.Series({
        'FinBERT_Sentiment': prediction['label'],
        'sentiment_score': prediction['score'],
    })


copy_df[['FinBERT_Sentiment', 'sentiment_score']] = copy_df['Tweet'].apply(_finbert_label_and_score)

copy_df

Unnamed: 0,Date,Tweet,Stock Name,Company Name,FinBERT_Sentiment,sentiment_score
0,2022-09-29 23:41:16+00:00,Mainstream media has done an amazing job at br...,TSLA,"Tesla, Inc.",Positive,0.902924
1,2022-09-29 23:24:43+00:00,Tesla delivery estimates are at around 364k fr...,TSLA,"Tesla, Inc.",Neutral,0.999999
2,2022-09-29 23:18:08+00:00,3/ Even if I include 63.0M unvested RSUs as of...,TSLA,"Tesla, Inc.",Neutral,0.999998
3,2022-09-29 22:40:07+00:00,Hahaha why are you still trying to stop Tesla ...,TSLA,"Tesla, Inc.",Neutral,0.998252
4,2022-09-29 22:27:05+00:00,"Stop trying to kill kids, you sad deranged old...",TSLA,"Tesla, Inc.",Neutral,0.88202
5,2022-09-29 22:25:53+00:00,This is you,TSLA,"Tesla, Inc.",Neutral,0.988683
6,2022-09-29 22:24:22+00:00,For years viciously silenced critics. Failin...,TSLA,"Tesla, Inc.",Neutral,0.495067
7,2022-09-29 22:23:54+00:00,$NIO just because I'm down money doesn't mean ...,TSLA,"Tesla, Inc.",Positive,0.995688
8,2022-09-29 22:23:28+00:00,50 likes for some $SPY $TSLA charts to study!,TSLA,"Tesla, Inc.",Neutral,0.999969
9,2022-09-29 22:15:01+00:00,"The powerwalls themselves are waterproof, but ...",TSLA,"Tesla, Inc.",Negative,0.999675


In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# FinBERT tone checkpoint: tokenizer first, then the classification model.
tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")
model = AutoModelForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone")

# Wrap both in a sentiment-analysis pipeline.
nlp = pipeline(task="sentiment-analysis", model=model, tokenizer=tokenizer)


def _top_label(tweet):
    """Return the label of the first prediction the pipeline gives for ``tweet``."""
    first_prediction = nlp(tweet)[0]
    return first_prediction['label']


# Label every tweet in the frame.
df['FinBERT_Sentiment'] = df['Tweet'].apply(_top_label)

# How many tweets fall under each label
df['FinBERT_Sentiment'].value_counts()

Neutral     58917
Positive    15762
Negative     6114
Name: FinBERT_Sentiment, dtype: int64

In [5]:
# Show df as the cell's output; pandas elides the middle rows of a long frame in its repr.
df

Unnamed: 0,Date,Tweet,Stock Name,Company Name,FinBERT_Sentiment
0,2022-09-29 23:41:16+00:00,Mainstream media has done an amazing job at br...,TSLA,"Tesla, Inc.",Positive
1,2022-09-29 23:24:43+00:00,Tesla delivery estimates are at around 364k fr...,TSLA,"Tesla, Inc.",Neutral
2,2022-09-29 23:18:08+00:00,3/ Even if I include 63.0M unvested RSUs as of...,TSLA,"Tesla, Inc.",Neutral
3,2022-09-29 22:40:07+00:00,Hahaha why are you still trying to stop Tesla ...,TSLA,"Tesla, Inc.",Neutral
4,2022-09-29 22:27:05+00:00,"Stop trying to kill kids, you sad deranged old...",TSLA,"Tesla, Inc.",Neutral
...,...,...,...,...,...
80788,2021-10-07 17:11:57+00:00,Some of the fastest growing tech stocks on the...,XPEV,XPeng Inc.,Neutral
80789,2021-10-04 17:05:59+00:00,"With earnings on the horizon, here is a quick ...",XPEV,XPeng Inc.,Positive
80790,2021-10-01 04:43:41+00:00,Our record delivery results are a testimony of...,XPEV,XPeng Inc.,Positive
80791,2021-10-01 00:03:32+00:00,"We delivered 10,412 Smart EVs in Sep 2021, rea...",XPEV,XPeng Inc.,Positive
