In [75]:
import pandas as pd
from transformers import pipeline
import re

In [76]:
df = pd.read_csv("data/stock_tweets.csv")

# Data Preprocessing to remove noise
def clean_tweet(text):
    # Remove mentions
    text = re.sub(r'@\w+', '', text)
    # Remove hashtags
    text = re.sub(r'#\w+', '', text)
    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)
    # Remove emojis using Unicode ranges
    text = re.sub(r'[\U0001F600-\U0001F64F'
                  r'\U0001F300-\U0001F5FF'
                  r'\U0001F680-\U0001F6FF'
                  r'\U0001F700-\U0001F77F'
                  r'\U0001F780-\U0001F7FF'
                  r'\U0001F800-\U0001F8FF'
                  r'\U0001F900-\U0001F9FF'
                  r'\U0001FA00-\U0001FA6F'
                  r'\U0001FA70-\U0001FAFF'
                  r'\U00002702-\U000027B0'
                  r'\U000024C2-\U0001F251]', '', text)
    return text.strip()


# Apply the function to the "tweet" column
df['Tweet'] = df['Tweet'].apply(clean_tweet)

df.head()
# #For testing purposes
df.to_csv("new_tweets.csv", index=False)

In [77]:
def is_financial_tweet(tweet):
    # Financial keywords

    financial_keywords = [
        'stock', 'market', 'trading', 'shares', 'portfolio', 'revenue',
        'profit', 'loss', 'growth', 'inflation', 'interest', 'rate', 
        'dividend', 'crypto', 'bitcoin', 'Fed', 'IPO', 'earnings', 'forecast', 
        'guidance', 'EBITDA', 'margin', 'cash flow', 'assets', 'liabilities', 
        'stock price', 'valuation', 'P/E ratio', 'EPS', 'dividend', 
        'market cap', 'volatility', 'quarterly report', 'earnings call', 
        'share buyback', 'merger', 'acquisition', 'upgrade', 'downgrade', 'estimates', '$',
        'interest', 'debt', 'decline', 'net income', 'gross income', 'operating income', 'bull', 
        'bullish', 'bear', 'bearish', 'green', 'red', 'security', 'securities'
    ]

    for i in financial_keywords:
        if i.lower() in tweet.lower():
            return True
    return False


In [78]:
def general_sentiment_analysis(nlp, tweet):
    result = nlp(tweet)
    score_to_sentiment = {1: "negative", 2: "negative", 3: "neutral", 4: "positive", 5: "positive"}
    star_rating = int(result[0]['label'][0])  # Extract star rating from label (e.g., '4 stars')
    sentiment = score_to_sentiment[star_rating]

    return sentiment.upper()

In [79]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Load FinBERT model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")
model = AutoModelForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone")

# Define financial sentiment analysis pipeline
nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# Load pipeline
nlpGeneral = pipeline("sentiment-analysis", model="nlptown/bert-base-multilingual-uncased-sentiment")

# Define general sentiment analysis pipeline

df['Sentiment'] = df['Tweet'].apply(lambda x: nlp(x)[0]['label'].upper() if is_financial_tweet(x) else general_sentiment_analysis(nlpGeneral, x))

# Value Counts
df['Sentiment'].value_counts()

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Sentiment
NEUTRAL     47687
POSITIVE    22162
NEGATIVE    10944
Name: count, dtype: int64

In [82]:
df

Unnamed: 0,Date,Tweet,Stock Name,Company Name,Sentiment
0,2022-09-29 23:41:16+00:00,Mainstream media has done an amazing job at br...,TSLA,"Tesla, Inc.",POSITIVE
1,2022-09-29 23:24:43+00:00,Tesla delivery estimates are at around 364k fr...,TSLA,"Tesla, Inc.",NEUTRAL
2,2022-09-29 23:18:08+00:00,3/ Even if I include 63.0M unvested RSUs as of...,TSLA,"Tesla, Inc.",NEUTRAL
3,2022-09-29 22:40:07+00:00,Hahaha why are you still trying to stop Tesla ...,TSLA,"Tesla, Inc.",NEGATIVE
4,2022-09-29 22:27:05+00:00,"Stop trying to kill kids, you sad deranged old...",TSLA,"Tesla, Inc.",NEGATIVE
...,...,...,...,...,...
80788,2021-10-07 17:11:57+00:00,Some of the fastest growing tech stocks on the...,XPEV,XPeng Inc.,NEUTRAL
80789,2021-10-04 17:05:59+00:00,"With earnings on the horizon, here is a quick ...",XPEV,XPeng Inc.,POSITIVE
80790,2021-10-01 04:43:41+00:00,Our record delivery results are a testimony of...,XPEV,XPeng Inc.,POSITIVE
80791,2021-10-01 00:03:32+00:00,"We delivered 10,412 Smart EVs in Sep 2021, rea...",XPEV,XPeng Inc.,POSITIVE


In [83]:
df.to_csv('finbert_sentiment_analysis.csv', index=False)