In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange

In [2]:
# Load the raw tweet export; the path is relative, so the CSV must sit in the
# notebook's working directory.
df = pd.read_csv('stock_tweets.csv')
# Preview the first rows to sanity-check the columns.
df.head()

Unnamed: 0,Date,Tweet,Stock Name,Company Name
0,2022-09-29 23:41:16+00:00,Mainstream media has done an amazing job at br...,TSLA,"Tesla, Inc."
1,2022-09-29 23:24:43+00:00,Tesla delivery estimates are at around 364k fr...,TSLA,"Tesla, Inc."
2,2022-09-29 23:18:08+00:00,3/ Even if I include 63.0M unvested RSUs as of...,TSLA,"Tesla, Inc."
3,2022-09-29 22:40:07+00:00,@RealDanODowd @WholeMarsBlog @Tesla Hahaha why...,TSLA,"Tesla, Inc."
4,2022-09-29 22:27:05+00:00,"@RealDanODowd @Tesla Stop trying to kill kids,...",TSLA,"Tesla, Inc."


In [3]:
# Structural check: (rows, columns) on the first line, then the per-column dtypes.
print(df.shape, df.dtypes, sep='\n')

(80793, 4)
Date            object
Tweet           object
Stock Name      object
Company Name    object
dtype: object


In [4]:
# Split the raw timestamp in 'Date' into a calendar-date column ('Date') and a
# clock-time column ('time').
#
# The guard makes this cell safe to re-run: once 'Date' has been reduced to
# plain dates, parsing it again would give midnight timestamps and silently
# overwrite every value in 'time' with 00:00:00.
if 'time' not in df.columns:
    # Parse once and derive both columns from the same parsed series.
    # Date and time are taken as written in the file (the sample rows carry a
    # +00:00 offset); no timezone conversion is applied here.
    parsed_timestamps = pd.to_datetime(df['Date'])
    df['time'] = parsed_timestamps.dt.time
    df['Date'] = parsed_timestamps.dt.date

df

Unnamed: 0,Date,Tweet,Stock Name,Company Name,time
0,2022-09-29,Mainstream media has done an amazing job at br...,TSLA,"Tesla, Inc.",23:41:16
1,2022-09-29,Tesla delivery estimates are at around 364k fr...,TSLA,"Tesla, Inc.",23:24:43
2,2022-09-29,3/ Even if I include 63.0M unvested RSUs as of...,TSLA,"Tesla, Inc.",23:18:08
3,2022-09-29,@RealDanODowd @WholeMarsBlog @Tesla Hahaha why...,TSLA,"Tesla, Inc.",22:40:07
4,2022-09-29,"@RealDanODowd @Tesla Stop trying to kill kids,...",TSLA,"Tesla, Inc.",22:27:05
...,...,...,...,...,...
80788,2021-10-07,Some of the fastest growing tech stocks on the...,XPEV,XPeng Inc.,17:11:57
80789,2021-10-04,"With earnings on the horizon, here is a quick ...",XPEV,XPeng Inc.,17:05:59
80790,2021-10-01,Our record delivery results are a testimony of...,XPEV,XPeng Inc.,04:43:41
80791,2021-10-01,"We delivered 10,412 Smart EVs in Sep 2021, rea...",XPEV,XPeng Inc.,00:03:32


In [5]:
import flair
# Load flair's text classifier registered under the name 'en-sentiment'.
# The loaded model is reused for every tweet in the scoring loop below.
sentiment_model = flair.models.TextClassifier.load('en-sentiment')

2023-04-16 20:56:46.210663: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Make sure the NLTK resources are available locally (reported as
# "already up-to-date" when they have been fetched before).
for nltk_resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(nltk_resource)

# Define function for preprocessing tweets
def preprocess_tweet(tweet):
    """Normalize a raw tweet into plain lowercase text.

    Steps, in order:
      1. drop URLs (anything starting with ``http`` up to the next whitespace),
      2. drop user mentions (``@name``) and hashtags (``#tag``) entirely,
      3. delete ASCII punctuation (characters are removed, not replaced by a
         space, so ``63.0M`` becomes ``630M``),
      4. lowercase,
      5. collapse runs of whitespace and trim both ends.

    Args:
        tweet: The raw tweet text. ``None`` and NaN (how pandas represents a
            missing CSV cell) are treated as an empty tweet.

    Returns:
        The cleaned text; ``""`` when nothing is left or the input was missing.
    """
    # Missing cells arrive as None or float NaN (NaN is the only float that is
    # not equal to itself); re.sub would raise TypeError on them.
    if tweet is None or (isinstance(tweet, float) and tweet != tweet):
        return ""

    # Remove URLs, user mentions, and hashtags
    tweet = re.sub(r"http\S+", "", tweet)
    tweet = re.sub(r"@[^\s]+", "", tweet)
    tweet = re.sub(r"#([^\s]+)", "", tweet)
    # Remove punctuation
    tweet = tweet.translate(str.maketrans("", "", string.punctuation))
    # Convert to lowercase
    tweet = tweet.lower()
    # The removals above leave leading blanks and doubled spaces behind
    # (e.g. a tweet that starts with several mentions); squeeze them out.
    preprocessed_tweet = " ".join(tweet.split())
    return preprocessed_tweet

# Build the cleaned-text column by running every raw tweet through preprocess_tweet
df['preprocessed_tweet'] = df['Tweet'].map(preprocess_tweet)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ramakanthnamuduri/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ramakanthnamuduri/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ramakanthnamuduri/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
from tqdm import tqdm

# Per-tweet results, collected in row order so they can be attached to the
# dataframe as columns once the loop has finished.
probs = []
sentiments = []

for tweet in tqdm(df['preprocessed_tweet'].to_list()):
    # make prediction
    sentence = flair.data.Sentence(tweet)
    sentiment_model.predict(sentence)

    if sentence.labels:
        # extract sentiment prediction
        top_label = sentence.labels[0]
        # The score belongs to the predicted label: in the recorded output both
        # POSITIVE and NEGATIVE rows carry values close to 1.
        probs.append(top_label.score)
        sentiments.append(top_label.value)  # 'POSITIVE' or 'NEGATIVE'
    else:
        # No label came back (for instance the text was empty after
        # preprocessing). Record the row as missing instead of letting an
        # IndexError throw away an hour-long run.
        probs.append(float('nan'))
        sentiments.append(None)

# add probability and sentiment predictions to tweets dataframe
df['probability'] = probs
df['sentiment'] = sentiments

df

100%|███████████████████████████████████| 80793/80793 [1:02:49<00:00, 21.43it/s]


Unnamed: 0,Date,Tweet,Stock Name,Company Name,time,preprocessed_tweet,probability,sentiment
0,2022-09-29,Mainstream media has done an amazing job at br...,TSLA,"Tesla, Inc.",23:41:16,mainstream media has done an amazing job at br...,0.900826,POSITIVE
1,2022-09-29,Tesla delivery estimates are at around 364k fr...,TSLA,"Tesla, Inc.",23:24:43,tesla delivery estimates are at around 364k fr...,0.987166,NEGATIVE
2,2022-09-29,3/ Even if I include 63.0M unvested RSUs as of...,TSLA,"Tesla, Inc.",23:18:08,3 even if i include 630m unvested rsus as of 6...,0.995589,NEGATIVE
3,2022-09-29,@RealDanODowd @WholeMarsBlog @Tesla Hahaha why...,TSLA,"Tesla, Inc.",22:40:07,hahaha why are you still trying to stop tes...,0.999771,NEGATIVE
4,2022-09-29,"@RealDanODowd @Tesla Stop trying to kill kids,...",TSLA,"Tesla, Inc.",22:27:05,stop trying to kill kids you sad deranged ol...,0.997327,NEGATIVE
...,...,...,...,...,...,...,...,...
80788,2021-10-07,Some of the fastest growing tech stocks on the...,XPEV,XPeng Inc.,17:11:57,some of the fastest growing tech stocks on the...,0.997311,POSITIVE
80789,2021-10-04,"With earnings on the horizon, here is a quick ...",XPEV,XPeng Inc.,17:05:59,with earnings on the horizon here is a quick s...,0.915765,POSITIVE
80790,2021-10-01,Our record delivery results are a testimony of...,XPEV,XPeng Inc.,04:43:41,our record delivery results are a testimony of...,0.998329,POSITIVE
80791,2021-10-01,"We delivered 10,412 Smart EVs in Sep 2021, rea...",XPEV,XPeng Inc.,00:03:32,we delivered 10412 smart evs in sep 2021 reach...,0.998331,POSITIVE


In [8]:
# Display the scored dataframe again (same content as the previous cell's output).
df

Unnamed: 0,Date,Tweet,Stock Name,Company Name,time,preprocessed_tweet,probability,sentiment
0,2022-09-29,Mainstream media has done an amazing job at br...,TSLA,"Tesla, Inc.",23:41:16,mainstream media has done an amazing job at br...,0.900826,POSITIVE
1,2022-09-29,Tesla delivery estimates are at around 364k fr...,TSLA,"Tesla, Inc.",23:24:43,tesla delivery estimates are at around 364k fr...,0.987166,NEGATIVE
2,2022-09-29,3/ Even if I include 63.0M unvested RSUs as of...,TSLA,"Tesla, Inc.",23:18:08,3 even if i include 630m unvested rsus as of 6...,0.995589,NEGATIVE
3,2022-09-29,@RealDanODowd @WholeMarsBlog @Tesla Hahaha why...,TSLA,"Tesla, Inc.",22:40:07,hahaha why are you still trying to stop tes...,0.999771,NEGATIVE
4,2022-09-29,"@RealDanODowd @Tesla Stop trying to kill kids,...",TSLA,"Tesla, Inc.",22:27:05,stop trying to kill kids you sad deranged ol...,0.997327,NEGATIVE
...,...,...,...,...,...,...,...,...
80788,2021-10-07,Some of the fastest growing tech stocks on the...,XPEV,XPeng Inc.,17:11:57,some of the fastest growing tech stocks on the...,0.997311,POSITIVE
80789,2021-10-04,"With earnings on the horizon, here is a quick ...",XPEV,XPeng Inc.,17:05:59,with earnings on the horizon here is a quick s...,0.915765,POSITIVE
80790,2021-10-01,Our record delivery results are a testimony of...,XPEV,XPeng Inc.,04:43:41,our record delivery results are a testimony of...,0.998329,POSITIVE
80791,2021-10-01,"We delivered 10,412 Smart EVs in Sep 2021, rea...",XPEV,XPeng Inc.,00:03:32,we delivered 10412 smart evs in sep 2021 reach...,0.998331,POSITIVE


In [9]:
# Persist the scored tweets next to the notebook (relative path).
# NOTE(review): no `index` argument is passed, so pandas' default index handling
# applies — readers of this file should expect a leading index column; confirm
# that downstream loaders account for it.
df.to_csv("sentiment-flair model.csv")