In [26]:
import re

import pandas as pd
import torch
from transformers import pipeline

In [27]:
# Load the raw tweet dataset; its "Tweet" column is cleaned and scored below
df = pd.read_csv("data/stock_tweets.csv")

# Data preprocessing to remove noise

# Code points treated as emoji by clean_tweet. Every range is spelled out on
# purpose: a single catch-all span such as U+24C2..U+1F251 also covers CJK,
# Hangul, kana, full-width forms and other ordinary text, which would be
# silently deleted from non-English tweets.
_EMOJI_PATTERN = re.compile(
    r'['
    r'\U0001F600-\U0001F64F'  # emoticons
    r'\U0001F300-\U0001F5FF'  # miscellaneous symbols and pictographs
    r'\U0001F680-\U0001F6FF'  # transport and map symbols
    r'\U0001F700-\U0001F77F'  # alchemical symbols
    r'\U0001F780-\U0001F7FF'  # geometric shapes extended
    r'\U0001F800-\U0001F8FF'  # supplemental arrows-C
    r'\U0001F900-\U0001F9FF'  # supplemental symbols and pictographs
    r'\U0001FA00-\U0001FA6F'  # chess symbols
    r'\U0001FA70-\U0001FAFF'  # symbols and pictographs extended-A
    r'\U0001F000-\U0001F251'  # tiles, cards, enclosed alphanumerics, flags
    r'\U00002600-\U000026FF'  # miscellaneous symbols
    r'\U00002702-\U000027B0'  # dingbats
    r'\U00002B00-\U00002BFF'  # miscellaneous symbols and arrows
    r'\U000024C2'             # circled latin capital letter M
    r'\U0000FE0F'             # variation selector-16 left behind by emoji
    r']+'
)


def clean_tweet(text):
    """Strip mentions, hashtags, URLs and emoji from a single tweet.

    Parameters
    ----------
    text : str
        Raw tweet text. Missing values (None or NaN, as pandas produces for
        empty CSV cells) are accepted and yield an empty string; any other
        non-string value is converted with ``str()`` first.

    Returns
    -------
    str
        The tweet with the noise removed and leading/trailing whitespace
        stripped. Whitespace inside the text is left as-is, so removed tokens
        can leave runs of spaces behind.
    """
    # NaN is the only value that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Remove mentions
    text = re.sub(r'@\w+', '', text)
    # Remove hashtags (the whole tag, including its word)
    text = re.sub(r'#\w+', '', text)
    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)
    # Remove emojis using the explicit Unicode ranges above
    text = _EMOJI_PATTERN.sub('', text)
    return text.strip()


# Apply clean_tweet to every entry of the "Tweet" column
df['Tweet'] = df['Tweet'].apply(clean_tweet)

# For testing purposes: write the cleaned data out for manual inspection
df.to_csv("new_tweets.csv", index=False)

# Preview the cleaned rows. This has to be the last expression of the cell:
# a bare expression in the middle of a cell is evaluated and then discarded,
# so it would never be rendered.
df.head()

# FinBERT

In [28]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [29]:
# Tokenizer and classification model are loaded from the same checkpoint id.
finbert_checkpoint = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(finbert_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(finbert_checkpoint)

# Encode the first 100 tweets as one padded, truncated batch of PyTorch tensors.
first_tweets = df['Tweet'].head(100).tolist()
inputs = tokenizer(
    first_tweets,
    padding=True,
    truncation=True,
    return_tensors='pt',
)
print(inputs)

{'input_ids': tensor([[  101,  7731,  2865,  ...,     0,     0,     0],
        [  101, 26060,  6959,  ...,     0,     0,     0],
        [  101,  1017,  1013,  ...,     0,     0,     0],
        ...,
        [  101,  1523,  1996,  ...,     0,     0,     0],
        [  101,  1042, 16150,  ...,     0,     0,     0],
        [  101,  1000,  2065,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}


In [30]:
# Forward pass over the tokenized batch. This is inference only, so autograd
# is switched off: otherwise PyTorch records a gradient graph for the whole
# batch and keeps every intermediate activation alive for no benefit.
with torch.no_grad():
    outputs = model(**inputs)

# One row of raw class scores (logits) per input tweet
print(outputs.logits.shape)


torch.Size([100, 3])


In [31]:
import torch

# Normalise the logits along the last (class) axis so each row sums to 1.
predictions = outputs.logits.softmax(dim=-1)
print(predictions)


tensor([[0.2052, 0.0749, 0.7200],
        [0.0374, 0.0221, 0.9405],
        [0.0488, 0.0202, 0.9310],
        [0.0499, 0.1172, 0.8329],
        [0.0364, 0.5094, 0.4542],
        [0.0523, 0.0337, 0.9140],
        [0.0748, 0.2867, 0.6385],
        [0.0563, 0.4408, 0.5029],
        [0.0596, 0.0172, 0.9232],
        [0.0147, 0.8964, 0.0889],
        [0.0274, 0.0257, 0.9470],
        [0.0313, 0.0938, 0.8749],
        [0.0377, 0.0486, 0.9137],
        [0.0489, 0.2886, 0.6625],
        [0.1142, 0.0201, 0.8657],
        [0.4993, 0.0198, 0.4809],
        [0.3407, 0.0143, 0.6450],
        [0.0641, 0.0226, 0.9134],
        [0.0333, 0.0218, 0.9449],
        [0.0395, 0.0208, 0.9397],
        [0.0474, 0.0138, 0.9388],
        [0.1348, 0.0117, 0.8535],
        [0.0465, 0.0384, 0.9152],
        [0.0407, 0.0172, 0.9421],
        [0.0451, 0.2299, 0.7250],
        [0.0410, 0.1210, 0.8380],
        [0.1278, 0.0134, 0.8588],
        [0.0762, 0.0133, 0.9105],
        [0.0636, 0.0314, 0.9050],
        [0.054

In [32]:
# Preview the first rows of df (head() shows 5 rows by default)
df.head()

Unnamed: 0,Date,Tweet,Stock Name,Company Name
0,2022-09-29 23:41:16+00:00,Mainstream media has done an amazing job at br...,TSLA,"Tesla, Inc."
1,2022-09-29 23:24:43+00:00,Tesla delivery estimates are at around 364k fr...,TSLA,"Tesla, Inc."
2,2022-09-29 23:18:08+00:00,3/ Even if I include 63.0M unvested RSUs as of...,TSLA,"Tesla, Inc."
3,2022-09-29 22:40:07+00:00,Hahaha why are you still trying to stop Tesla ...,TSLA,"Tesla, Inc."
4,2022-09-29 22:27:05+00:00,"Stop trying to kill kids, you sad deranged old...",TSLA,"Tesla, Inc."


In [33]:
# Split the probability matrix into one list per sentiment class.
# NOTE(review): the 0/1/2 -> positive/negative/neutral order is hard-coded;
# confirm it against model.config.id2label for the loaded checkpoint.
positive = predictions[:, 0].tolist()
negative = predictions[:, 1].tolist()
neutral = predictions[:, 2].tolist()

# Take as many tweets as there are scored rows rather than repeating the
# literal batch size used at tokenization time, so the two cannot drift apart.
n_scored = len(positive)

table = {'Tweet': df['Tweet'][:n_scored],
         "Positive": positive,
         "Negative": negative,
         "Neutral": neutral}

# NOTE: this rebinds df to the four-column score table; the other columns of
# the loaded dataset are no longer reachable through df after this cell.
df = pd.DataFrame(table, columns=["Tweet", "Positive", "Negative", "Neutral"])

# Fixed seed so the displayed sample is the same on every run
df.sample(5, random_state=42)


Unnamed: 0,Tweet,Positive,Negative,Neutral
42,Being a $TSLA investor since 2017 has made me ...,0.291987,0.0096,0.698413
72,Jobless claims coming up in 2 min. Would love ...,0.726594,0.050445,0.222961
5,This is you,0.05228,0.033737,0.913983
69,Don’t be scared. Don’t panic. \r\n\r\nThese ar...,0.151902,0.020714,0.827384
58,This supports my comments yesterday that full-...,0.909549,0.012876,0.077574
