In [3]:
import torch

In [4]:
# Show the installed PyTorch version so the run can be reproduced later.
print(torch.__version__)

1.13.1


In [6]:
# Print the name of every CUDA device PyTorch can see (nothing is printed
# when no device is reported).
gpu_count = torch.cuda.device_count()
for device_idx in range(gpu_count):
    device_props = torch.cuda.get_device_properties(device_idx)
    print(device_props.name)


NVIDIA GeForce GTX 750 Ti


In [7]:
import functools
import os
import re
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [10]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline
# Load the tokenizer and the 3-label sequence classifier from the same
# 'yiyanghkust/finbert-tone' checkpoint.
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
finbert = BertForSequenceClassification.from_pretrained(
    'yiyanghkust/finbert-tone',
    num_labels=3,
)

In [11]:
def calculate_runtime(function):
    """
    Decorator that prints how long each call to ``function`` takes.

    Parameters
    ----------
    function : callable
        The function to time.

    Returns
    -------
    callable
        A wrapper that forwards all positional and keyword arguments to
        ``function``, prints the elapsed time in seconds and returns the
        original result unchanged.
    """
    # functools.wraps keeps the wrapped function's __name__ / __doc__, so
    # introspection and stacked decorators still see the real function.
    @functools.wraps(function)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the measured duration cannot be
        # skewed by system clock adjustments during a long run.
        start_time = time.perf_counter()
        result = function(*args, **kwargs)
        elapsed = time.perf_counter() - start_time
        print(f"Runtime for {function.__name__}: {elapsed} seconds")
        return result
    return wrapper

In [12]:
#   ---------------------------   Data cleaning   ---------------------------
def clean_text(text):
    """
    Strip Twitter-specific noise from a single post.

    The steps run in this order:
      1. remove retweet prefixes of the form ``RT @handle:``
      2. remove remaining ``@handle`` mentions
      3. remove URLs (the scheme is optional in the pattern, so bare
         dotted tokens such as ``example.com`` are removed as well)
      4. collapse every run of whitespace (spaces, tabs, newlines) into a
         single space
      5. remove curly double quotes (“ and ”)

    Leading/trailing whitespace is NOT stripped.

    Parameters
    ----------
    text : str
        Raw post text.

    Returns
    -------
    str
        The cleaned text.
    """
    # All patterns are raw strings: sequences like "\w" or "\s" inside a
    # normal string literal are invalid escapes and raise a SyntaxWarning
    # on recent Python versions. The patterns themselves are unchanged.

    # Remove twitter Return handles (RT @xxx:)
    text = re.sub(r"RT @[\w]*:", "", text)

    # Remove twitter handles (@xxx)
    text = re.sub(r"@[\w]*", "", text)

    # Remove URL links (httpxxx)
    url_matcher = r"((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*"
    text = re.sub(url_matcher, "", text)

    # Remove any multiple white spaces, tabs or newlines
    text = re.sub(r"\s+", " ", text)

    # Remove curly double quotes
    text = re.sub("“|”", "", text)

    return text

#   ---------------------------   Data filtering   ---------------------------

# Method 1 filters the posts based on only 1 rule, which is that the ticker of the company of ...
# ... which the sentiment is being calculated is present.
def filter_data_1(post, ticker):
    """
    Return True when ``post`` mentions the cashtag ``$<ticker>``.

    Matching is case-insensitive and requires the whole ticker: the
    cashtag must not be followed by another letter, so ticker ``F`` does
    not match ``$FB`` and ``BA`` does not match ``$BABA``.

    Parameters
    ----------
    post : str
        The (cleaned) post text.
    ticker : str
        Company ticker without the leading ``$``.

    Returns
    -------
    bool
        True if the ticker's cashtag occurs in the post, else False.
    """
    # re.escape keeps tickers with regex metacharacters (e.g. "BRK.B")
    # literal; the negative lookahead rejects longer tickers that merely
    # start with this one.
    pattern = fr"\${re.escape(ticker)}(?![a-zA-Z])"
    return re.search(pattern, post, re.IGNORECASE) is not None

# Method 2 filters the posts based on the rule that exactly 1 ticker is mentioned and ...
# ... that this ticker is the ticker of the company of which the sentiment is being calculated   
def filter_data_2(post, ticker):
    """
    Return True when ``post`` contains exactly one cashtag and that
    cashtag is ``$<ticker>``.

    A cashtag is counted as ``$`` followed by one or more ASCII letters.
    The ticker comparison is case-insensitive and requires the whole
    ticker, so a post whose only cashtag is ``$FB`` does not pass for
    ticker ``F``.

    Parameters
    ----------
    post : str
        The (cleaned) post text.
    ticker : str
        Company ticker without the leading ``$``.

    Returns
    -------
    bool
        True if exactly one cashtag is present and it is the company's.
    """
    # Count the number of tickers in the post
    cashtags = re.findall(r"\$[a-zA-Z]+", post)
    if len(cashtags) != 1:
        return False

    # re.escape keeps the ticker literal; the negative lookahead rejects
    # longer tickers that merely start with this one (e.g. $F vs $FB).
    pattern = fr"\${re.escape(ticker)}(?![a-zA-Z])"
    return re.search(pattern, post, re.IGNORECASE) is not None


In [13]:
@calculate_runtime
def calc_sent_finbert(df):
    """
    Score every row of ``df['text']`` with finBERT and append the results.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``text`` column of strings.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the columns produced by the sentiment pipeline merged
        in row by row. ``count_results`` reads these as ``label`` and
        ``score`` and compares ``label`` against "Positive", "Negative"
        and "Neutral".
        NOTE(review): the original note here read "LABEL_0: neutral;
        LABEL_1: positive; LABEL_2: negative" — confirm which label
        strings the loaded model actually emits.
    """
    # Use the first GPU when one is available and fall back to CPU (-1)
    # otherwise, instead of failing on machines without CUDA.
    device = 0 if torch.cuda.is_available() else -1

    # Initialise sentiment pipeline
    nlp = pipeline("sentiment-analysis", model=finbert, tokenizer=tokenizer, device=device)

    # Create sentence list and run finBERT. truncation=True caps over-long
    # inputs at the model's maximum sequence length instead of raising.
    sentence_list = df['text'].to_list()
    results = nlp(sentence_list, truncation=True)

    # Add results to main dataframe. The results are given df's own index
    # so rows line up even when df's index is not 0..n-1.
    results = pd.DataFrame(results, index=df.index)
    df = df.merge(results, how='left', left_index=True, right_index=True)
    return df



In [14]:
import time

def clean_data(df, ticker):
    """
    Clean, de-duplicate, filter and sentiment-score the posts of one ticker.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw posts; the columns ``lang``, ``created_at``, ``text`` and
        ``author_id`` are read.
    ticker : str
        Company ticker (without ``$``) used by both post filters and in
        the progress messages.

    Returns
    -------
    pandas.DataFrame
        English posts with cleaned ``text``, added ``date``, ``hour``,
        ``filter_1`` and ``filter_2`` columns, plus whatever columns
        ``calc_sent_finbert`` appends.
    """
    start_time = time.time()

    # Drop all non-English tweets and any unnamed columns. The explicit
    # .copy() makes the result an independent frame, so the column
    # assignments below do not trigger SettingWithCopyWarning.
    df = df.loc[df['lang'] == 'en', ~df.columns.str.contains('^Unnamed')].copy()

    # Create some datetime items
    df['created_at'] = pd.to_datetime(df['created_at'])
    df['date'] = df['created_at'].dt.date
    df['hour'] = df['created_at'].dt.hour

    #   ---------------------------   Data cleaning   ---------------------------
    # Clean text
    df['text'] = df['text'].apply(clean_text)
    print(f"[{ticker}] Done cleaning after --- {time.time() - start_time} seconds ---")

    # Drop duplicate tweets based on cleaned text (sometimes tweets include
    # the same text but different links, for example). keep=False removes
    # every copy of a duplicated (author_id, text) pair, not just the repeats.
    df = df.drop_duplicates(subset=['author_id', 'text'], keep=False).reset_index(drop=True)

    #   ---------------------------   Data filter   ---------------------------
    # Flag each post with both filters (see filter_data_1 / filter_data_2)
    df['filter_1'] = df['text'].apply(filter_data_1, ticker=ticker)
    df['filter_2'] = df['text'].apply(filter_data_2, ticker=ticker)

    print(f"[{ticker}] Done filtering after --- {time.time() - start_time} seconds ---")

    #   ---------------------------   Sentiment   ---------------------------
    df = calc_sent_finbert(df)
    return df


In [16]:
def count_results(df):
    """
    Count confident positive and negative posts per day for both filters.

    Rows with ``score`` <= 0.8 or ``label`` == "Neutral" are discarded.
    The remaining rows are counted per ``date`` twice: once for rows where
    ``filter_1`` is True and once for rows where ``filter_2`` is True.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``score``, ``label``, ``date`` and the boolean
        columns ``filter_1`` and ``filter_2``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per date that has at least one filter_1 post, with columns
        ``date``, ``[f1BERT]pos``, ``[f1BERT]neg``, ``[f1BERT]total``,
        ``[f2BERT]pos``, ``[f2BERT]neg``, ``[f2BERT]total``. The f2
        columns are NaN on dates without any filter_2 post (left merge).
    """
    # Drop scores with at most 80% certainty and all neutral observations
    confident = df[(df['score'] > 0.8) & (df['label'] != "Neutral")]

    # .assign builds a new frame, so no column is written onto a slice of
    # the caller's frame (which would raise SettingWithCopyWarning).
    scored = confident.assign(
        pos=np.where(confident['label'] == "Positive", 1, 0),
        neg=np.where(confident['label'] == "Negative", 1, 0),
    )

    def _daily_counts(filter_col, prefix):
        # Per-date pos/neg/total counts for the rows flagged by filter_col.
        daily = (
            scored.loc[scored[filter_col], ['date', 'pos', 'neg']]
            .groupby('date', as_index=False)
            .sum()
            .rename(columns={"pos": f"{prefix}pos", "neg": f"{prefix}neg"})
        )
        daily[f"{prefix}total"] = daily[f"{prefix}pos"] + daily[f"{prefix}neg"]
        return daily

    # Results for filter_1, then filter_2 merged onto them by date
    counted_results = _daily_counts('filter_1', '[f1BERT]')
    counted_results = counted_results.merge(
        _daily_counts('filter_2', '[f2BERT]'), how='left', on='date'
    )

    return counted_results


In [17]:
def calc_sent_measures(return_df):
    """
    Turn the per-day counts from ``count_results`` into sentiment ratios.

    For each filter prefix a ``<prefix>method_1`` column is added, holding
    the share of positive posts: ``<prefix>pos / <prefix>total``.

    Parameters
    ----------
    return_df : pandas.DataFrame
        Scored posts, passed straight to ``count_results``.

    Returns
    -------
    pandas.DataFrame
        The ``count_results`` frame with ``[f1BERT]method_1`` and
        ``[f2BERT]method_1`` appended.
    """
    # Per-day positive / negative / total counts for both filters
    measures = count_results(return_df)

    # Method 1 - ratio of positive posts to all counted posts.
    # Method 2, (pos - neg) / total, was discontinued by the author as being
    # the same as method 1; method 3 does not exist with finBERT.
    for prefix in ("[f1BERT]", "[f2BERT]"):
        positives = measures[f"{prefix}pos"]
        totals = measures[f"{prefix}total"]
        measures[f"{prefix}method_1"] = positives / totals

    return measures

## Loop over tickers

In [32]:
# Input folder with one merged CSV of posts per ticker, and output folder
# for the per-ticker daily sentiment measures.
filedir = r"E:\Users\Christiaan\Large_Files\Thesis\Twitter\merged"
save_dir = r"E:\Users\Christiaan\Large_Files\Thesis\Twitter\sentiment\finBERT"

# Each ticker appears once (the original list contained 'ATVI' twice).
ticker_list = ['AAPL', 'AMD', 'ATVI', 'AMZN', 'BA', 'BABA', 'BAC', 'DIS', 'F', 'GE', 'GME', 'IQ', 'LULU', 'MSFT', 'MU', 'NFLX', 'NVDA', 'SBUX', 'SHOP', 'SNAP', 'SQ', 'TLRY', 'TSLA', 'V', 'WMT']

# Create the output folder up front so a missing directory cannot make
# to_csv fail after the expensive sentiment scoring has already run.
os.makedirs(save_dir, exist_ok=True)

for ticker in ticker_list:
    csv_path = os.path.join(filedir, f"{ticker}.csv").replace('\\', '/')
    save_path = os.path.join(save_dir, f"{ticker}.csv").replace('\\', '/')

    # Check if file already exists and skip sentiment calculation if file exists
    if os.path.isfile(save_path):
        print(f"[skipping] File already exists: [{save_path}]")
        continue

    # Read csv
    df = pd.read_csv(csv_path)

    # Filter and clean data. Also perform finBERT sentiment scoring.
    return_df = clean_data(df, ticker)

    # Calculate sentiment scores for each method
    sentiment_measures = calc_sent_measures(return_df)

    # Saving the dataframe
    sentiment_measures.to_csv(save_path, encoding='utf-8', index=False)


[skipping] File already exists: [E:/Users/Christiaan/Large_Files/Thesis/Twitter/sentiment/finBERT/AAPL.csv]
[skipping] File already exists: [E:/Users/Christiaan/Large_Files/Thesis/Twitter/sentiment/finBERT/AMD.csv]
[skipping] File already exists: [E:/Users/Christiaan/Large_Files/Thesis/Twitter/sentiment/finBERT/AMZN.csv]
[skipping] File already exists: [E:/Users/Christiaan/Large_Files/Thesis/Twitter/sentiment/finBERT/ATVI.csv]
[skipping] File already exists: [E:/Users/Christiaan/Large_Files/Thesis/Twitter/sentiment/finBERT/BA.csv]
[skipping] File already exists: [E:/Users/Christiaan/Large_Files/Thesis/Twitter/sentiment/finBERT/BABA.csv]
[skipping] File already exists: [E:/Users/Christiaan/Large_Files/Thesis/Twitter/sentiment/finBERT/BAC.csv]
[skipping] File already exists: [E:/Users/Christiaan/Large_Files/Thesis/Twitter/sentiment/finBERT/DIS.csv]
[skipping] File already exists: [E:/Users/Christiaan/Large_Files/Thesis/Twitter/sentiment/finBERT/F.csv]
[skipping] File already exists: [E:/