### Imports

In [7]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import get_scheduler

### Load Dataset

In [8]:
dataset = pd.read_pickle('Data/small_test_set.pkl')
# drop NaN
dataset = dataset.dropna()
# add a column that defines sentiment. sentiment is 0 if returns is < 0.01, 1 if returns is > 0.01 and 2 if returns is 0
dataset['sentiment'] = dataset['returns'].apply(lambda x: 0 if x < -.01 else 1 if x > 0.01 else 2)
dataset

Unnamed: 0,title,date,ticker,returns,sentiment
219534,"Benzinga’s Top Upgrades (SBNY, CCL, MPW, MA, F...",2010-05-21 08:27:00,CCL,0.013952,1
912659,"Top Performing Industries For August 9, 2016",2016-08-09 10:54:00,NTL,-0.010309,0
165581,Bridgeline DIgital Reports Q3 Loss $0.26 Vs Es...,2015-08-14 08:01:00,BLIN,-0.048276,0
1355251,"Earnings Scheduled For March 24, 2015",2015-03-24 04:04:00,WSCI,0.001757,2
362331,Puts Purchased on Dick's Sporting Goods (DKS),2011-01-06 12:40:00,DKS,-0.045442,0
...,...,...,...,...,...
1236358,Standpoint Research Downgrades Tempur-pedic In...,2013-10-01 11:17:00,TPX,0.023658,1
1370405,"Sector Update: Utilities Leading, Consumer Goo...",2011-08-24 10:36:00,XLF,0.026678,1
90206,Aramark Acquires On-Demand Food Delivery Servi...,2019-08-06 06:44:00,ARMK,0.047018,1
1342675,Wheeler Real Estate Investment Trust Responds ...,2018-03-16 04:19:00,WHLR,0.064220,1


### Split Dataset into Train, Validation, and Test
- Train (80%)
- Test (10%)
- Validation (10%)

In [9]:
def split(df):
    x_train, x_test_and_val, y_train, y_test_and_val  = train_test_split(df['title'], df['returns'], random_state=42, test_size=0.2) # train set is 80%,
    x_test, x_val, y_test, y_val = train_test_split(x_test_and_val, y_test_and_val, test_size=0.5, random_state=42) # test and val are 50% of the remaining 20% = 10%. 
    return x_train, y_train, x_val, y_val, x_test, y_test

x_train, y_train, x_val, y_val, x_test, y_test = split(dataset)

### Dataset Analysis

In [10]:
def get_model_and_tokenizer(model_name, num_labels=3):
  tokenizer = AutoTokenizer.from_pretrained(model_name,num_labels=num_labels)
  model = AutoModelForSequenceClassification.from_pretrained(model_name)
  return tokenizer, model

bert_tokenizer, bert_model = get_model_and_tokenizer('bert-base-uncased')
finbert_tokenizer, finbert_model = get_model_and_tokenizer('ProsusAI/finbert')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
def analyze(titles, returns):
    
    tokenized_titles = bert_tokenizer(titles.tolist())['input_ids']

    num_samples = len(tokenized_titles)
    
    num_pos = len(returns[returns > 0])
    num_neg = len(returns[returns < 0])
    num_zero = len(returns[returns == 0])

    list_lengths = titles.apply(len)

    # Find the size of the smallest/largest list
    num_min_tokens = min(list_lengths)
    num_max_tokens = max(list_lengths)
    num_mean_tokens = list_lengths.mean()
    
    return {"Number of Samples":num_samples, 
            "Number of Samples with Positive Returns": num_pos,
            "Number of Samples with No Returns": num_zero,
            "Number of Samples with Negative Returns": num_neg,
            "Minimum Number of Tokens": num_min_tokens, 
            "Maximum Number of Tokens":num_max_tokens, 
            "Mean Number of Tokens":num_mean_tokens}

def df_for_analysis(train_analysis, test_analysis, validation_analysis):
    df = pd.DataFrame([train_analysis, test_analysis, validation_analysis], index=['Train', 'Test', 'Validation'])
    return df

analysis_df = df_for_analysis(analyze(x_train, y_train), analyze(x_test,y_test), analyze(x_val, y_val))

analysis_df

Unnamed: 0,Number of Samples,Number of Samples with Positive Returns,Number of Samples with No Returns,Number of Samples with Negative Returns,Minimum Number of Tokens,Maximum Number of Tokens,Mean Number of Tokens
Train,56,29,1,26,23,235,70.053571
Test,7,3,1,3,40,128,73.857143
Validation,8,8,0,0,45,232,98.625


### Testing

### BERT Model Testing

In [12]:
import torch
def predict_article(model, tokenizer, sample):
    inputs = tokenizer(sample, return_tensors='pt')
    output = model(**inputs)
    return output.logits

def convert_to_sentiment_int(output, modelType):
    if modelType == 'BERT':
        return int(torch.argmax(output, dim=-1))
    elif modelType == 'FinBERT':
        result = int(torch.argmax(output, dim=-1))
        # if 0, return 1, if 1, return 0, if 2, return 2. why? finbert flips 0 and 1. 0 is positive, 1 is negative. we want the opposite. 
        # 2 would be neutral. 
        return 1 if result == 0 else 0 if result == 1 else 2

# now given a dataframe of samples, we can evaluate the model on each sample and return the results
def predict_articles(model, tokenizer, df, modelType):
    numCorrect = 0
    for index, row in df.iterrows():
        
        # 0 if negative, 1 if positive, 2 if neutral
        ground_truth = row['sentiment'] 
        model_output = predict_article(model, tokenizer, row['title'])
        prediction = convert_to_sentiment_int(model_output, modelType)
        numCorrect+=1 if ground_truth == prediction else 0

    accuracy = numCorrect / len(df)
    return accuracy

bert_accuracy = predict_articles(bert_model, bert_tokenizer, dataset, 'BERT') # this only produces 2 classes. we need 3.
finbert_accuracy = predict_articles(finbert_model, finbert_tokenizer, dataset, 'FinBERT')

print(finbert_accuracy)
print(bert_accuracy)

0.43661971830985913
0.352112676056338


### Finbert Model Testing