In [2]:
import pandas as pd
import requests
from datetime import datetime
import pytz
from datasets import load_dataset
import os
import time
import re
from transformers import pipeline

In [8]:
# Ticker symbols of interest; tweets are later matched against this list.
stocks = [
    'GME', 'AMC', 'TLRY', 'SNDL', 'NAKD',
    'ZOM', 'AAPL', 'TSLA', 'GOEV', 'PLUG',
]

In [3]:
def clean_text(text):
    """
    Clean tweet text by removing URLs, mentions, hashtags, and excessive whitespace.

    Cashtags such as ``$AAPL`` are not touched by any of the patterns, so they
    remain available for ticker extraction afterwards.

    Args:
        text: Input text. Missing values (None/NaN) produce an empty string;
            any other non-string value is converted with ``str()`` first.
    Returns:
        str: Cleaned text with surrounding whitespace stripped.
    """
    if not isinstance(text, str):
        if pd.isna(text):
            return ""
        # e.g. a purely numeric cell parsed as int/float: re.sub() would raise
        # TypeError on it, so normalise to text first.
        text = str(text)
    # The leading \b keeps the URL patterns from eating the tail of ordinary
    # words (an unanchored 'www\S+' turns "awwwesome" into "a").
    text = re.sub(r'\bhttp\S+|\bwww\.\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'#\w+', '', text)  # Remove hashtags
    text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
    return text.strip()

def analyze_sentiment_finbert(text):
    """
    Classify the sentiment of a piece of text with the FinBERT model.

    The Hugging Face pipeline is created on the first call that needs it and
    kept in the module-level name ``finbert_classifier`` so later calls reuse it.
    Empty input, or any exception raised while loading or running the model,
    yields a neutral result with score 0.0 (the error is printed, not raised).

    Args:
        text (str): Input text.
    Returns:
        dict: ``{'label': ..., 'score': ...}`` where label is positive, negative
        or neutral and score is the model's confidence for that label.
    """
    global finbert_classifier
    neutral_fallback = {'label': 'neutral', 'score': 0.0}

    # Nothing to classify: skip the model entirely.
    if not text:
        return neutral_fallback

    try:
        # Lazily build the pipeline; inputs are truncated to 512 tokens.
        if 'finbert_classifier' not in globals():
            finbert_classifier = pipeline(
                'sentiment-analysis',
                model='ProsusAI/finbert',
                truncation=True,
                max_length=512,
            )
        top_prediction = finbert_classifier(text)[0]
        label, confidence = top_prediction['label'], top_prediction['score']
    except Exception as e:
        # Best effort: report the failure and fall back to neutral.
        print(f"FinBERT error for text '{text}': {e}")
        return neutral_fallback

    return {'label': label, 'score': confidence}
    
    

In [4]:
# Load the collected posts from disk; later cells read the 'text' and
# 'created_at' columns of this frame.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which suggests the CSV
# was written together with its index - confirm, and consider index_col=0.
data = pd.read_csv("twitter_data/stock_social_data.csv")
# Preview the first rows only.
data.head()

Unnamed: 0,created_at,text,id,source
0,2020-04-09 23:56:58+00:00,RT @TDANetwork: 📽️ #TheWatchList panel assesse...,13,Twitter
1,2020-04-09 23:56:51+00:00,$UMRX bouncing. EXTREMELY OVERSOLD #Coronaviru...,14,Twitter
2,2020-04-09 23:55:05+00:00,$AAPL 4h/1h\n\nSometimes these wedges break hi...,32,Twitter
3,2020-04-09 23:54:47+00:00,This week's Expired Signals are now published ...,33,Twitter
4,2020-04-09 23:54:28+00:00,"$SPY $QQQ $VXX $AAPL $BA $MSFT\n\nGuys, I figu...",34,Twitter


In [5]:
# Normalise every tweet, score the cleaned text with FinBERT, then unpack the
# per-row result dicts into two flat columns.
print("Applying FinBERT sentiment analysis...")
data['clean_text'] = data['text'].apply(clean_text)
sentiment_results = data['clean_text'].apply(analyze_sentiment_finbert)
data['sentiment_label'] = sentiment_results.map(lambda result: result['label'])
data['sentiment_score'] = sentiment_results.map(lambda result: result['score'])
data

Applying FinBERT sentiment analysis...


config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Unnamed: 0,created_at,text,id,source,clean_text,sentiment_label,sentiment_score
0,2020-04-09 23:56:58+00:00,RT @TDANetwork: 📽️ #TheWatchList panel assesse...,13,Twitter,RT : 📽️ panel assesses the big questions $AAPL...,neutral,0.941471
1,2020-04-09 23:56:51+00:00,$UMRX bouncing. EXTREMELY OVERSOLD #Coronaviru...,14,Twitter,$UMRX bouncing. EXTREMELY OVERSOLD 💸 $DECN $OP...,neutral,0.646453
2,2020-04-09 23:55:05+00:00,$AAPL 4h/1h\n\nSometimes these wedges break hi...,32,Twitter,$AAPL 4h/1h Sometimes these wedges break highe...,neutral,0.637201
3,2020-04-09 23:54:47+00:00,This week's Expired Signals are now published ...,33,Twitter,This week's Expired Signals are now published ...,neutral,0.867028
4,2020-04-09 23:54:28+00:00,"$SPY $QQQ $VXX $AAPL $BA $MSFT\n\nGuys, I figu...",34,Twitter,"$SPY $QQQ $VXX $AAPL $BA $MSFT Guys, I figured...",neutral,0.903577
...,...,...,...,...,...,...,...
151331,2020-07-16 00:04:02+00:00,RT @TATrades: Quick poll - how much do you (on...,938637,Twitter,RT : Quick poll - how much do you (on average)...,neutral,0.927746
151332,2020-07-16 00:03:39+00:00,RT @NukemosS: China retaliating on $AAPL,938644,Twitter,RT : China retaliating on $AAPL,negative,0.787295
151333,2020-07-16 00:02:21+00:00,lows. This is shaping up for another nice sell...,938652,Twitter,lows. This is shaping up for another nice sell...,negative,0.836584
151334,2020-07-16 00:01:48+00:00,@_SeanDavid I could make a good case for why $...,938656,Twitter,I could make a good case for why $AAPL $MSFT $...,neutral,0.902641


In [6]:
# Write the current state of `data` into the same directory the input CSV was
# read from.
output_dir = "twitter_data"
output_path = f"{output_dir}/stock_social_data_with_finbert.csv"
# index=False keeps the DataFrame index out of the written file.
data.to_csv(output_path, index=False)

In [15]:
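# NOTE: disabled draft of the per-stock daily aggregation. It needs the 'stock'
# column and a datetime 'created_at', both of which are only created further
# down in this cell, so it cannot run from this position.
# data['date'] = data['created_at'].dt.date
# agg_sentiment = data.groupby(['stock', 'date']).agg({
#     'sentiment_score': 'mean',
#     'sentiment_label': lambda x: x.mode()[0] if not x.empty else 'neutral'
# }).reset_index()
# agg_path = f"{output_dir}/stock_sentiment_aggregated_finbert.csv"
# # agg_sentiment.to_csv(agg_path, index=False)
# agg_sentiment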

def extract_stock(text, stocks):
    """
    Extract all matching stock tickers from text, return as semicolon-separated string.

    Matching is case-insensitive and accepts an optional leading ``$``. A ticker
    only counts when it stands alone as a word, so "unplug" does not match PLUG
    and "BIGME" does not match GME. Tickers are reported upper-cased, each once,
    in order of first appearance in the text.

    Args:
        text (str): Cleaned tweet text. None, NaN or any other non-string value
            is treated as "no match".
        stocks (list): List of stock tickers to match (any letter case).
    Returns:
        str or None: Semicolon-separated tickers (e.g., 'GME;AMC') or None if no match.
    """
    # NaN is truthy, so a plain `not text` check would let it reach re.findall
    # and raise TypeError; guard on the type explicitly.
    if not isinstance(text, str) or not text:
        return None

    # Upper-case and de-duplicate the candidates, dropping empty entries that
    # would otherwise turn the alternation into a match-anything pattern.
    wanted = [t for t in dict.fromkeys(str(s).upper() for s in stocks) if t]
    if not wanted:
        return None
    wanted_set = set(wanted)

    # (?<!\w) : the ticker (or its '$') must not be glued to a preceding word char.
    # \b      : the ticker must end at a word boundary.
    # The capturing group returns the bare ticker without the optional '$'.
    stock_pattern = r'(?<!\w)\$?(' + '|'.join(re.escape(t) for t in wanted) + r')\b'
    matches = re.findall(stock_pattern, text, re.IGNORECASE)

    # dict.fromkeys keeps first-seen order while removing duplicates.
    tickers = list(dict.fromkeys(m.upper() for m in matches if m.upper() in wanted_set))
    return ';'.join(tickers) if tickers else None

# Tag each tweet with the tickers it mentions (None when there are none).
data['stock'] = data['clean_text'].apply(extract_stock, args=(stocks,))

# Parse timestamps as timezone-aware UTC; unparseable values become NaT.
utc_timestamps = pd.to_datetime(data['created_at'], errors='coerce', utc=True)
data['created_at'] = utc_timestamps
# Calendar date of each tweet, taken from the UTC timestamp.
data['date'] = utc_timestamps.dt.date
data

Unnamed: 0,created_at,text,id,source,clean_text,sentiment_label,sentiment_score,stock,date
0,2020-04-09 23:56:58+00:00,RT @TDANetwork: 📽️ #TheWatchList panel assesse...,13,Twitter,RT : 📽️ panel assesses the big questions $AAPL...,neutral,0.941471,AAPL,2020-04-09
1,2020-04-09 23:56:51+00:00,$UMRX bouncing. EXTREMELY OVERSOLD #Coronaviru...,14,Twitter,$UMRX bouncing. EXTREMELY OVERSOLD 💸 $DECN $OP...,neutral,0.646453,AAPL;TSLA;TLRY;GME,2020-04-09
2,2020-04-09 23:55:05+00:00,$AAPL 4h/1h\n\nSometimes these wedges break hi...,32,Twitter,$AAPL 4h/1h Sometimes these wedges break highe...,neutral,0.637201,AAPL,2020-04-09
3,2020-04-09 23:54:47+00:00,This week's Expired Signals are now published ...,33,Twitter,This week's Expired Signals are now published ...,neutral,0.867028,AAPL,2020-04-09
4,2020-04-09 23:54:28+00:00,"$SPY $QQQ $VXX $AAPL $BA $MSFT\n\nGuys, I figu...",34,Twitter,"$SPY $QQQ $VXX $AAPL $BA $MSFT Guys, I figured...",neutral,0.903577,AAPL,2020-04-09
...,...,...,...,...,...,...,...,...,...
151331,2020-07-16 00:04:02+00:00,RT @TATrades: Quick poll - how much do you (on...,938637,Twitter,RT : Quick poll - how much do you (on average)...,neutral,0.927746,TSLA,2020-07-16
151332,2020-07-16 00:03:39+00:00,RT @NukemosS: China retaliating on $AAPL,938644,Twitter,RT : China retaliating on $AAPL,negative,0.787295,AAPL,2020-07-16
151333,2020-07-16 00:02:21+00:00,lows. This is shaping up for another nice sell...,938652,Twitter,lows. This is shaping up for another nice sell...,negative,0.836584,TSLA,2020-07-16
151334,2020-07-16 00:01:48+00:00,@_SeanDavid I could make a good case for why $...,938656,Twitter,I could make a good case for why $AAPL $MSFT $...,neutral,0.902641,AAPL,2020-07-16


In [18]:
# Columns kept in the exported dataset, in output order.
export_columns = [
    'id',
    'date',
    'created_at',
    'stock',
    'clean_text',
    'sentiment_label',
    'sentiment_score',
]
final = data[export_columns]
# Written without the DataFrame index.
final.to_csv("twitter_data/final_stock_social_sentiment_data.csv", index=False)
final

Unnamed: 0,id,date,created_at,stock,clean_text,sentiment_label,sentiment_score
0,13,2020-04-09,2020-04-09 23:56:58+00:00,AAPL,RT : 📽️ panel assesses the big questions $AAPL...,neutral,0.941471
1,14,2020-04-09,2020-04-09 23:56:51+00:00,AAPL;TSLA;TLRY;GME,$UMRX bouncing. EXTREMELY OVERSOLD 💸 $DECN $OP...,neutral,0.646453
2,32,2020-04-09,2020-04-09 23:55:05+00:00,AAPL,$AAPL 4h/1h Sometimes these wedges break highe...,neutral,0.637201
3,33,2020-04-09,2020-04-09 23:54:47+00:00,AAPL,This week's Expired Signals are now published ...,neutral,0.867028
4,34,2020-04-09,2020-04-09 23:54:28+00:00,AAPL,"$SPY $QQQ $VXX $AAPL $BA $MSFT Guys, I figured...",neutral,0.903577
...,...,...,...,...,...,...,...
151331,938637,2020-07-16,2020-07-16 00:04:02+00:00,TSLA,RT : Quick poll - how much do you (on average)...,neutral,0.927746
151332,938644,2020-07-16,2020-07-16 00:03:39+00:00,AAPL,RT : China retaliating on $AAPL,negative,0.787295
151333,938652,2020-07-16,2020-07-16 00:02:21+00:00,TSLA,lows. This is shaping up for another nice sell...,negative,0.836584
151334,938656,2020-07-16,2020-07-16 00:01:48+00:00,AAPL,I could make a good case for why $AAPL $MSFT $...,neutral,0.902641
