### Imports

In [3]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import get_scheduler
from tqdm import tqdm

### Load Dataset

In [4]:
dataset = pd.read_pickle('Data/test_set.pkl')
# drop NaN
dataset = dataset.dropna()
# add a column that defines sentiment. sentiment is 0 if returns is < 0.01, 1 if returns is > 0.01 and 2 if returns is 0
dataset['sentiment'] = dataset['returns'].apply(lambda x: 0 if x < -0.01 else 1 if x > 0.01 else 2)
dataset

Unnamed: 0,title,date,ticker,returns,sentiment
821831,Bank of America Reinstates Coverage on 3M at B...,2012-09-13 16:09:00,MMM,0.020856,1
1384459,Does Yelp Need Help?,2015-04-30 14:04:00,YELP,-0.231864,0
684717,Morning Earnings Recap: The Biggest Reports Fr...,2019-01-25 06:40:00,JBLU,-0.022627,0
16283,20 Healthcare Stocks Moving In Monday's Pre-Ma...,2020-03-16 08:32:00,ACST,-0.096774,0
1226930,Start Your Engines: Global X Introduces An Aut...,2011-05-19 13:12:00,TM,-0.008478,2
...,...,...,...,...,...
365170,"Bank of America Downgrades Dollar Tree, Inc. t...",2014-07-28 14:14:00,DLTR,0.011988,1
877871,Option Alert: NCR Mar16 26.0 Puts Sweep: 1297 ...,2016-03-04 12:04:00,NCR,-0.023348,0
1005885,Protalix BioTherapeutics Trading Significantly...,2011-02-25 09:59:00,PLX,-0.184829,0
1116080,Seattle Genetics and Bristol-Myers Squibb Anno...,2015-12-23 08:01:00,SGEN,0.033404,1


### Split Dataset into Train, Validation, and Test
- Train (80%)
- Test (10%)
- Validation (10%)

In [5]:
def split(df):
    x_train, x_test_and_val, y_train, y_test_and_val  = train_test_split(df['title'], df['sentiment'], random_state=42, test_size=0.2) # train set is 80%,
    x_test, x_val, y_test, y_val = train_test_split(x_test_and_val, y_test_and_val, test_size=0.5, random_state=42) # test and val are 50% of the remaining 20% = 10%. 
    return x_train, y_train, x_val, y_val, x_test, y_test

x_train, y_train, x_val, y_val, x_test, y_test = split(dataset)

### Dataset Analysis

In [6]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def get_model_and_tokenizer(model_name, num_labels=3):
  tokenizer = AutoTokenizer.from_pretrained(model_name,num_labels=num_labels)
  model = AutoModelForSequenceClassification.from_pretrained(model_name)
  model.to(device)
  return tokenizer, model

bert_tokenizer, bert_model = get_model_and_tokenizer('bert-base-uncased')
finbert_tokenizer, finbert_model = get_model_and_tokenizer('ProsusAI/finbert')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
def analyze(titles, returns):
    
    tokenized_titles = bert_tokenizer(titles.tolist())['input_ids']

    num_samples = len(tokenized_titles)
    
    num_pos = len(returns[returns > 0])
    num_neg = len(returns[returns < 0])
    num_zero = len(returns[returns == 0])

    list_lengths = titles.apply(len)

    # Find the size of the smallest/largest list
    num_min_tokens = min(list_lengths)
    num_max_tokens = max(list_lengths)
    num_mean_tokens = list_lengths.mean()
    
    return {"Number of Samples":num_samples, 
            "Number of Samples with Positive Returns": num_pos,
            "Number of Samples with No Returns": num_zero,
            "Number of Samples with Negative Returns": num_neg,
            "Minimum Number of Tokens": num_min_tokens, 
            "Maximum Number of Tokens":num_max_tokens, 
            "Mean Number of Tokens":num_mean_tokens}

def df_for_analysis(train_analysis, test_analysis, validation_analysis):
    df = pd.DataFrame([train_analysis, test_analysis, validation_analysis], index=['Train', 'Test', 'Validation'])
    return df

analysis_df = df_for_analysis(analyze(x_train, y_train), analyze(x_test,y_test), analyze(x_val, y_val))

analysis_df

Unnamed: 0,Number of Samples,Number of Samples with Positive Returns,Number of Samples with No Returns,Number of Samples with Negative Returns,Minimum Number of Tokens,Maximum Number of Tokens,Mean Number of Tokens
Train,55148,28705,712,25731,3,458,74.550863
Test,6893,3544,76,3273,13,390,74.484404
Validation,6894,3603,70,3221,9,401,73.311285


### Testing

### BERT Model Testing

In [8]:
import torch
def predict_article(model, tokenizer, sample):
    inputs = tokenizer(sample, return_tensors='pt').to(device)
    output = model(**inputs)
    return output.logits

def convert_to_sentiment_int(output, modelType):
    if modelType == 'BERT':
        return int(torch.argmax(output, dim=-1))
    elif modelType == 'FinBERT':
        result = int(torch.argmax(output, dim=-1))
        # if 0, return 1, if 1, return 0, if 2, return 2. why? finbert flips 0 and 1. 0 is positive, 1 is negative. we want the opposite. 
        # 2 would be neutral. 
        return 1 if result == 0 else 0 if result == 1 else 2

# now given a dataframe of samples, we can evaluate the model on each sample and return the results
def predict_articles(model, tokenizer, df, modelType):
    true_labels = []
    predictions = []
    numCorrect = 0
    numPos = 0
    numNeg = 0
    numNeut = 0
    for index, row in tqdm(df.iterrows(), total=df.shape[0]):
        
        # 0 if negative, 1 if positive, 2 if neutral
        ground_truth = row['sentiment']
        true_labels.append(ground_truth)
        model_output = predict_article(model, tokenizer, row['title'])
        prediction = convert_to_sentiment_int(model_output, modelType)
        predictions.append(prediction)
        numCorrect+=1 if ground_truth == prediction else 0
        if ground_truth == prediction:
            if prediction == 0:
                numNeg+=1
            elif prediction == 1:
                numPos+=1
            else:
                numNeut+=1
        
    accuracy = numCorrect / len(df)
    acc_pos = numPos / len(df[df['sentiment'] == 1])
    acc_neg = numNeg / len(df[df['sentiment'] == 0])
    acc_neut = numNeut / len(df[df['sentiment'] == 2])
    
    
    return accuracy, acc_pos, acc_neg, acc_neut, true_labels, predictions

# bert_accuracy = predict_articles(bert_model, bert_tokenizer, dataset, 'BERT') # this only produces 2 classes. we need 3.
#combine x_test and y_test to make a dataframe
test_df = pd.concat([x_test, y_test], axis=1)
finbert_accuracy, fin_pos, fin_neg, fin_neut, true_labels, predictions = predict_articles(finbert_model, finbert_tokenizer, test_df, 'FinBERT')

print(finbert_accuracy, fin_pos, fin_neg, fin_neut)
# print(bert_accuracy)

                                                     title   returns
959717   Pandora Shares Up 7.8% Following Q1 Earnings, ...  0.198261
383482                    Danger Zone: Williams Companies  -0.004049
545370   How to Profit from Gold's Current Price Instab...  0.000862
32172                   Earnings Scheduled For May 9, 2017  0.020625
666327   UPDATE: Goldman Sachs Resumes Intrepid Potash ... -0.004842
...                                                    ...       ...
955605   OpenText Signs Definitive Agreement to Acquire...  0.090012
471970   FuelCell Energy Shares Fall Upon Earnings Repo...  0.045662
1150330  Wall Street's M&A Chatter From June 20: Calpin...  0.006003
1230404  Bank of America Maintains Underperform on Tand... -0.047782
1164438       Top Narrow Based Indexes For January 3, 2013  0.479609

[6893 rows x 2 columns]


  0%|          | 0/6893 [00:00<?, ?it/s]


KeyError: 'sentiment'

In [None]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

finbert_f1_score = f1_score(true_labels, predictions, average=None)
finbert_precision = precision_score(true_labels, predictions, average=None)
finbert_recall = recall_score(true_labels, predictions, average=None)
finbert_accuracy = accuracy_score(true_labels, predictions)
#Convert values to percentages
finbert_f1_score = [i * 100 for i in finbert_f1_score]
finbert_precision = [i * 100 for i in finbert_precision]
finbert_recall = [i * 100 for i in finbert_recall]
pd.DataFrame([finbert_f1_score, finbert_precision, finbert_recall], columns=['Negative', 'Positive', 'Neutral'], index=['F1 Score', 'Precision', 'Recall'])

Unnamed: 0,Negative,Positive,Neutral
F1 Score,15.284298,11.777342,66.071172
Precision,9.082455,6.931999,92.849714
Recall,48.190789,39.125039,51.281274


In [None]:
temp_data = dataset.sample(10)
for index, row in temp_data.iterrows():
    print(row['title'])
    print(row['returns'])
    print(row['sentiment'])
    print(convert_to_sentiment_int(predict_article(finbert_model, finbert_tokenizer, row['title']), 'FinBERT'))
    print('\n')

Digital Realty Trust, Inc. (DLR) to Buy 3 Datacenters for $375 mln
0.01114718873943259
2
2


Tandy Leather Factory, Inc. Reports May 2016 Sales Up 3% YoY, Retail SSS Up 5%, International SSS Down 5%
-0.0013441167748555053
2
1


Benchmark Maintains Buy on Coherent, Raises Price Target to $175
-0.009290979313895477
2
1


20 Biggest Mid-Day Gainers For Thursday
0.035944573528740524
2
2


7 Stocks To Watch For May 12, 2017
-0.06435013186138398
2
2


Brean Capital Reviews 3D Systems' Downward Spiral
-0.07161215161117378
2
0


UPDATE: RE/MAX Says 'Special Committee's investigation did not identify any matters requiring adjustments to the Company's previously issued financial statements'
0.08721496721188436
2
2


More Earnings And A Fed Decision In Week Four Of The WeTrader Competition
0.019520641335001022
2
2


Powell Industries Reports Q2 Adj. EPS $0.17 vs $0.09 Est., Sales $170.2M vs $156.7M Est.; Sees FY15 Sales $625M-$675M vs $650M Est.
0.07505350915820078
2
1


CRT Capital Initiates Cov

### Finbert Model Testing