# Data Preprocessing and Sentiment analysis

In [1]:
import pandas as pd
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

# Data Preprocessing

In [2]:
tweets_df = pd.read_csv("data/stock_tweets.csv")

# Text cleanup
def clean_tweet(text):
    # Remove mentions
    text = re.sub(r'@\w+', '', text)
    # Remove hashtags
    text = re.sub(r'#\w+', '', text)
    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub("\\r" ,'',text)
    text = re.sub("\\n" ,'',text)
    # Remove emojis using Unicode ranges
    text = re.sub(r'[\U0001F600-\U0001F64F'
                  r'\U0001F300-\U0001F5FF'
                  r'\U0001F680-\U0001F6FF'
                  r'\U0001F700-\U0001F77F'
                  r'\U0001F780-\U0001F7FF'
                  r'\U0001F800-\U0001F8FF'
                  r'\U0001F900-\U0001F9FF'
                  r'\U0001FA00-\U0001FA6F'
                  r'\U0001FA70-\U0001FAFF'
                  r'\U00002702-\U000027B0'
                  r'\U000024C2-\U0001F251]', '', text)
    return text.strip()

# Apply the function to the "tweet" column
tweets_df['Tweet'] = tweets_df['Tweet'].apply(clean_tweet)

tweets_df.sample(n=5)

Unnamed: 0,Date,Tweet,Stock Name,Company Name
897,2022-09-17 12:04:26+00:00,"This is a 500,000 car/year factory that had a ...",TSLA,"Tesla, Inc."
42684,2022-05-27 22:14:52+00:00,"$SPX &amp; $NDX up 10% on the week, massive gr...",PG,Procter & Gamble Company
32246,2021-11-08 06:15:22+00:00,Over 60% of you are buying $TSLA today.Will be...,TSLA,"Tesla, Inc."
31304,2021-11-14 16:11:23+00:00,Love my S Plaid more every day since purchased...,TSLA,"Tesla, Inc."
7585,2022-06-30 17:05:49+00:00,I cancelled my order for the Model 3 today.At ...,TSLA,"Tesla, Inc."


In [3]:
finance_data = pd.read_csv("data/stock_yfinance_data.csv")
tweet_data = pd.read_csv("data/stock_tweets.csv")

# Ensure 'Date' columns are in datetime format and only keep the date part
finance_data['Date'] = pd.to_datetime(finance_data['Date']).dt.date
tweet_data['Date'] = pd.to_datetime(tweet_data['Date']).dt.date

# Count tweets per day for each stock
tweet_counts = tweet_data.groupby(['Date', 'Stock Name']).size().reset_index(name='Number of Tweets')

# Merge tweet_counts with finance_data on Date and Stock Name
finance_data = pd.merge(finance_data, tweet_counts, on=['Date', 'Stock Name'], how='left')
finance_data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Stock Name,Number of Tweets
0,2021-09-30,260.333344,263.043335,258.333344,258.493347,258.493347,53868000,TSLA,90.0
1,2021-10-01,259.466675,260.260010,254.529999,258.406677,258.406677,51094200,TSLA,94.0
2,2021-10-04,265.500000,268.989990,258.706665,260.510010,260.510010,91449900,TSLA,119.0
3,2021-10-05,261.600006,265.769989,258.066681,260.196655,260.196655,55297800,TSLA,88.0
4,2021-10-06,258.733337,262.220001,257.739990,260.916656,260.916656,43898400,TSLA,78.0
...,...,...,...,...,...,...,...,...,...
6043,2022-09-23,13.090000,13.892000,12.860000,13.710000,13.710000,28279600,XPEV,
6044,2022-09-26,14.280000,14.830000,14.070000,14.370000,14.370000,27891300,XPEV,1.0
6045,2022-09-27,14.580000,14.800000,13.580000,13.710000,13.710000,21160800,XPEV,
6046,2022-09-28,13.050000,13.421000,12.690000,13.330000,13.330000,31799400,XPEV,


# Vader

In [4]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

def merge_dataframes(finance_data, sentiment_data, method):
    
    # Ensure 'Date' columns are in datetime format and only keep the date part
    finance_data['Stock Name'] = finance_data['Stock Name'].str.strip().str.upper()
    sentiment_data['Stock Name'] = sentiment_data['Stock Name'].str.strip().str.upper()
    finance_data['Date'] = pd.to_datetime(finance_data['Date']).dt.date
    sentiment_data['Date'] = pd.to_datetime(sentiment_data['Date']).dt.date

    # Group tweet data by Date and Stock Name to calculate tweet counts and average sentiment
    avg_sentiments = sentiment_data.groupby(['Date', 'Stock Name'])['sentiment'].mean().reset_index(name='Average Sentiment')
    
    # Merge finance data with tweet summary data
    merged_df = pd.merge(finance_data, avg_sentiments, on=['Date', 'Stock Name'], how='left')

    # Fill NaN values for days with no tweets
    merged_df['Number of Tweets'] = merged_df['Number of Tweets'].fillna(0)
    merged_df['Average Sentiment'] = merged_df['Average Sentiment'].fillna(0)

    # Drop rows with NaN values in critical columns
    merged_df = merged_df.dropna(subset=['Close'])
    
    merged_df.to_csv(f"data/cleaned_data_{method}.csv")

    return merged_df

In [5]:
tqdm.pandas()
sentiment = SentimentIntensityAnalyzer()

def get_sentiment_score(text):
    return sentiment.polarity_scores(text)['compound']

tweets_df_vader = tweets_df.copy()

tweets_df_vader['sentiment'] = tweets_df_vader['Tweet'].progress_apply(get_sentiment_score)

merge_dataframes(finance_data, tweets_df_vader, "vader")

100%|██████████| 69759/69759 [00:07<00:00, 9605.75it/s] 


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Stock Name,Number of Tweets,Average Sentiment
0,2021-09-30,260.333344,263.043335,258.333344,258.493347,258.493347,53868000,TSLA,90.0,0.202088
1,2021-10-01,259.466675,260.260010,254.529999,258.406677,258.406677,51094200,TSLA,94.0,0.216879
2,2021-10-04,265.500000,268.989990,258.706665,260.510010,260.510010,91449900,TSLA,119.0,0.125718
3,2021-10-05,261.600006,265.769989,258.066681,260.196655,260.196655,55297800,TSLA,88.0,0.091361
4,2021-10-06,258.733337,262.220001,257.739990,260.916656,260.916656,43898400,TSLA,78.0,0.210363
...,...,...,...,...,...,...,...,...,...,...
6043,2022-09-23,13.090000,13.892000,12.860000,13.710000,13.710000,28279600,XPEV,0.0,0.000000
6044,2022-09-26,14.280000,14.830000,14.070000,14.370000,14.370000,27891300,XPEV,1.0,0.585900
6045,2022-09-27,14.580000,14.800000,13.580000,13.710000,13.710000,21160800,XPEV,0.0,0.000000
6046,2022-09-28,13.050000,13.421000,12.690000,13.330000,13.330000,31799400,XPEV,0.0,0.000000


# Bert

In [6]:
def is_financial_tweet(tweet):
    # Financial keywords

    financial_keywords = [
        'stock', 'market', 'trading', 'shares', 'portfolio', 'revenue',
        'profit', 'loss', 'growth', 'inflation', 'interest', 'rate', 
        'dividend', 'crypto', 'bitcoin', 'Fed', 'IPO', 'earnings', 'forecast', 
        'guidance', 'EBITDA', 'margin', 'cash flow', 'assets', 'liabilities', 
        'stock price', 'valuation', 'P/E ratio', 'EPS', 'dividend', 
        'market cap', 'volatility', 'quarterly report', 'earnings call', 
        'share buyback', 'merger', 'acquisition', 'upgrade', 'downgrade', 'estimates', '$',
        'interest', 'debt', 'decline', 'net income', 'gross income', 'operating income', 'bull', 
        'bullish', 'bear', 'bearish', 'green', 'red', 'security', 'securities'
    ]

    for i in financial_keywords:
        if i.lower() in tweet.lower():
            return True
    return False

In [7]:
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import warnings
warnings.filterwarnings("ignore")

# Enable tqdm for pandas
tqdm.pandas()

# Load FinBERT model and tokenizer
tokenizer_finance = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")
model_finance = AutoModelForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone")
nlp_finance = pipeline("sentiment-analysis", model=model_finance, tokenizer=tokenizer_finance)

# Load general sentiment model
nlp_general = pipeline("sentiment-analysis", model="finiteautomata/bertweet-base-sentiment-analysis")

# Define the function for analyzing sentiment
def analyze_batch(tweets):
    results = []
    for tweet in tweets:
        if is_financial_tweet(tweet):
            result = nlp_finance(tweet)[0]
        else:
            result = nlp_general(tweet)[0]

        score, label = result["score"], result["label"]
        sentiment = {
            "Positive": score,
            "Negative": -score,
            "Neutral": round(1 - score, 4),
            "POS": score,
            "NEG": -score,
        }.get(label, 0)
        results.append(sentiment)
    return results

# Apply in batches
batch_size = 32
tweets_df_bert = tweets_df.copy()
tweets_df_bert['sentiment'] = tweets_df_bert['Tweet'].progress_apply(
    lambda x: analyze_batch([x])[0]  # Single row
)

merge_dataframes(finance_data, tweets_df_bert, "bert")

  from .autonotebook import tqdm as notebook_tqdm





emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0
100%|██████████| 69759/69759 [1:10:48<00:00, 16.42it/s]


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Stock Name,Number of Tweets,Average Sentiment
0,2021-09-30,260.333344,263.043335,258.333344,258.493347,258.493347,53868000,TSLA,90.0,0.174285
1,2021-10-01,259.466675,260.260010,254.529999,258.406677,258.406677,51094200,TSLA,94.0,0.188065
2,2021-10-04,265.500000,268.989990,258.706665,260.510010,260.510010,91449900,TSLA,119.0,0.185075
3,2021-10-05,261.600006,265.769989,258.066681,260.196655,260.196655,55297800,TSLA,88.0,0.179827
4,2021-10-06,258.733337,262.220001,257.739990,260.916656,260.916656,43898400,TSLA,78.0,0.117512
...,...,...,...,...,...,...,...,...,...,...
6043,2022-09-23,13.090000,13.892000,12.860000,13.710000,13.710000,28279600,XPEV,0.0,0.000000
6044,2022-09-26,14.280000,14.830000,14.070000,14.370000,14.370000,27891300,XPEV,1.0,-0.669545
6045,2022-09-27,14.580000,14.800000,13.580000,13.710000,13.710000,21160800,XPEV,0.0,0.000000
6046,2022-09-28,13.050000,13.421000,12.690000,13.330000,13.330000,31799400,XPEV,0.0,0.000000
