### Imports

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import get_scheduler
from tqdm import tqdm
import torch
from torch.nn import CrossEntropyLoss, BCEWithLogitsLoss
from transformers import AdamW
from torch.utils.data import DataLoader, TensorDataset

from sklearn.preprocessing import OneHotEncoder
import scipy

from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
bert_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
bert_model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

bert_model.to(device)

finbert_tokenizer = AutoTokenizer.from_pretrained('ProsusAI/finbert')
finbert_model = AutoModelForSequenceClassification.from_pretrained('ProsusAI/finbert')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

### Helper Functions
These function will be used throughout the code to facilitate training/testing/analysis

In [4]:
def get_dataloader(encodings, labels, batch_size):

    input_ids  = torch.tensor(encodings['input_ids'])
    token_type_ids = torch.tensor(encodings['token_type_ids'])
    attention_masks = torch.tensor(encodings['attention_mask'])

    labels = torch.tensor(labels)

    dataset = TensorDataset(input_ids, attention_masks, labels, token_type_ids)

    dataloader = DataLoader(dataset, batch_size=batch_size)

    return dataloader

In [5]:
def evaluate_model(model, dataloader):
    model.eval()
    predictions = None
    true_labels = None
    for batch in tqdm(dataloader, total = len(dataloader)):
        input_ids, attention_masks, labels, token_type_ids = [b.to(device) for b in batch]
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_masks, token_type_ids=token_type_ids)
            output_np = outputs.logits.detach().cpu().numpy()
            true_label = labels.cpu().numpy()

            if predictions is None:
                predictions = output_np
            else:
                predictions = np.append(predictions, output_np, axis=0)
            if true_labels is None:
                true_labels = true_label
            else:
                true_labels = np.append(true_labels, true_label, axis=0)
    predictions = np.argmax(predictions, axis=1)
    true_labels = np.argmax(true_labels, axis=1)

    correct = np.sum(predictions == true_labels)
    total = len(predictions)
    accuracy = correct / total
    return predictions, true_labels, accuracy
    





In [6]:
def split(df):
    x_train, x_test_and_val, y_train, y_test_and_val  = train_test_split(df['title'], df['sentiment'], random_state=42, test_size=0.2) # train set is 80%,
    x_test, x_val, y_test, y_val = train_test_split(x_test_and_val, y_test_and_val, test_size=0.5, random_state=42) # test and val are 50% of the remaining 20% = 10%. 
    return x_train, y_train, x_val, y_val, x_test, y_test

In [7]:
def score_report(predictions, true_labels):
    model_f1_score = f1_score(true_labels, predictions, average=None)
    model_precision = precision_score(true_labels, predictions, average=None)
    model_recall = recall_score(true_labels, predictions, average=None)
    model_accuracy = accuracy_score(true_labels, predictions)
    #Convert values to percentages
    model_f1_score = [i * 100 for i in model_f1_score]
    model_precision = [i * 100 for i in model_precision]
    model_recall = [i * 100 for i in model_recall]
    return model_accuracy, pd.DataFrame([model_f1_score, model_precision, model_recall], columns=['Negative', 'Positive', 'Neutral'], index=['F1 Score', 'Precision', 'Recall'])
    

### Load Dataset

In [8]:
dataset = pd.read_pickle('Data/test_set.pkl')
# drop NaN
dataset = dataset.dropna()
# add a column that defines sentiment. sentiment is 0 if returns is < -0.01, 1 if returns is > 0.01 and 2 if returns is 0
dataset['sentiment'] = dataset['returns'].apply(lambda x: 0 if x < -.01 else 1 if x > 0.01 else 2)

# dataset = dataset.sample(5000, random_state=42)

### Split Dataset into Train, Validation, and Test
- Train (80%)
- Test (10%)
- Validation (10%)

In [9]:
batch_size = 32

In [10]:
x_train, y_train, x_val, y_val, x_test, y_test = split(dataset)

train_tokenized = bert_tokenizer(x_train.to_list(), padding=True)
val_tokenized = bert_tokenizer(x_val.to_list(), padding=True)
test_tokenized = bert_tokenizer(x_test.to_list(), padding=True)

label_encoder = OneHotEncoder()
train_labels = label_encoder.fit_transform(y_train.to_numpy().reshape(-1,1))
val_labels =  label_encoder.transform(y_val.to_numpy().reshape(-1,1))
test_labels =  label_encoder.transform(y_test.to_numpy().reshape(-1,1))

if type(train_labels) == scipy.sparse.csr_matrix:
        train_labels = train_labels.toarray()

if type(val_labels) == scipy.sparse.csr_matrix:
        val_labels = val_labels.toarray()

if type(test_labels) == scipy.sparse.csr_matrix:
        test_labels = test_labels.toarray()

train_dataloader = get_dataloader(train_tokenized, train_labels, batch_size)
val_dataloader = get_dataloader(val_tokenized, val_labels, batch_size)
test_dataloader = get_dataloader(test_tokenized, test_labels, batch_size)


### Data Analysis

In [11]:
def analyze(tokenized_input, output):

    num_samples = len(tokenized_input)
    
    num_pos = len(output[output == 1])
    num_neg = len(output[output == 0])
    num_zero =  len(output[output == 2])

    list_lengths = tokenized_input.apply(len)

    # Find the size of the smallest/largest list
    num_min_tokens = min(list_lengths)
    num_max_tokens = max(list_lengths)
    num_mean_tokens = list_lengths.mean()
    
    return {"Number of Samples":num_samples, 
            "Number of Samples with Meaningful Gain": num_pos,
            "Number of Samples with Small movement": num_zero,
            "Number of Samples with Meaningful loss": num_neg,
            "Minimum Number of Tokens": num_min_tokens, 
            "Maximum Number of Tokens":num_max_tokens, 
            "Mean Number of Tokens":num_mean_tokens}

def df_for_analysis(train_analysis, test_analysis, validation_analysis):
    df = pd.DataFrame([train_analysis, test_analysis, validation_analysis], index=['Train', 'Test', 'Validation'])
    return df

analysis_df = df_for_analysis(analyze(x_train, y_train), analyze(x_test,y_test), analyze(x_val, y_val))

analysis_df

Unnamed: 0,Number of Samples,Number of Samples with Meaningful Gain,Number of Samples with Small movement,Number of Samples with Meaningful loss,Minimum Number of Tokens,Maximum Number of Tokens,Mean Number of Tokens
Train,55148,19313,18786,17049,3,458,74.550863
Test,6893,2432,2312,2149,13,390,74.484404
Validation,6894,2419,2353,2122,9,401,73.311285


### BERT Model

Tokenization

In [12]:
n_epoch = 10
learning_rate = 1e-6

In [13]:
criterion = BCEWithLogitsLoss()
optimizer = AdamW(bert_model.parameters(), lr=learning_rate)



Train the model

In [14]:
for epoch in range(n_epoch):
    bert_model.train()
    train_loss, train_step = 0, 0

    print(f"Epoch: {epoch+1}")
    
    for batch in tqdm(train_dataloader, total=len(train_dataloader)):
        bert_model.zero_grad()
        batch_input_ids, batch_attention_mask, batch_labels, batch_token_type_ids = [b.to(device) for b in batch]

        output = bert_model(input_ids=batch_input_ids, attention_mask=batch_attention_mask, token_type_ids=batch_token_type_ids)
        logits = output[0]
        loss = criterion(logits.view(-1, 3), batch_labels.view(-1,3))

        loss.backward()
        optimizer.step()

        train_step += 1
        train_loss += loss.item()

    

    print(f"Train loss: {train_loss/train_step}")
    _, _, eval_acc = evaluate_model(bert_model, val_dataloader)
    print(f"Eval accuracy: {eval_acc}")
    




Epoch: 1


100%|██████████| 1724/1724 [07:49<00:00,  3.67it/s]


Train loss: 0.6318599841020082


100%|██████████| 216/216 [00:17<00:00, 12.04it/s]


Eval accuracy: 0.4563388453727879
Epoch: 2


100%|██████████| 1724/1724 [07:15<00:00,  3.96it/s]


Train loss: 0.6080861588817014


100%|██████████| 216/216 [00:17<00:00, 12.05it/s]


Eval accuracy: 0.47708152016246014
Epoch: 3


100%|██████████| 1724/1724 [07:23<00:00,  3.89it/s]


Train loss: 0.5954436751503809


100%|██████████| 216/216 [00:17<00:00, 12.38it/s]


Eval accuracy: 0.4823034522773426
Epoch: 4


100%|██████████| 1724/1724 [07:20<00:00,  3.92it/s]


Train loss: 0.5890996451058786


100%|██████████| 216/216 [00:17<00:00, 12.06it/s]


Eval accuracy: 0.4881055990716565
Epoch: 5


100%|██████████| 1724/1724 [07:14<00:00,  3.97it/s]


Train loss: 0.5846676963503966


100%|██████████| 216/216 [00:17<00:00, 12.05it/s]


Eval accuracy: 0.4920220481578184
Epoch: 6


100%|██████████| 1724/1724 [07:14<00:00,  3.97it/s]


Train loss: 0.5804835131355256


100%|██████████| 216/216 [00:17<00:00, 12.03it/s]


Eval accuracy: 0.4976791412822744
Epoch: 7


100%|██████████| 1724/1724 [07:12<00:00,  3.98it/s]


Train loss: 0.5776524192022864


100%|██████████| 216/216 [00:17<00:00, 12.37it/s]


Eval accuracy: 0.4976791412822744
Epoch: 8


100%|██████████| 1724/1724 [07:12<00:00,  3.99it/s]


Train loss: 0.5746270885046052


100%|██████████| 216/216 [00:17<00:00, 12.40it/s]


Eval accuracy: 0.4978241949521323
Epoch: 9


100%|██████████| 1724/1724 [07:12<00:00,  3.99it/s]


Train loss: 0.5713924634521615


100%|██████████| 216/216 [00:17<00:00, 12.37it/s]


Eval accuracy: 0.4985494633014215
Epoch: 10


100%|██████████| 1724/1724 [07:12<00:00,  3.98it/s]


Train loss: 0.5678687453078874


100%|██████████| 216/216 [00:17<00:00, 12.37it/s]

Eval accuracy: 0.4975340876124166





In [15]:
predictions, true_labels, acc = evaluate_model(bert_model, test_dataloader)
print(acc)


100%|██████████| 216/216 [00:18<00:00, 11.79it/s]

0.48324387059335555





In [16]:
accuracy, df = score_report(predictions, true_labels)
print(f"Accuracy: {accuracy}")
df

Accuracy: 0.48324387059335555


Unnamed: 0,Negative,Positive,Neutral
F1 Score,41.236529,49.66122,51.812285
Precision,52.795933,48.182521,46.348123
Recall,33.829688,51.233553,58.737024


In [17]:
if device == 'cuda':
    bert_model.to('cpu')
    torch.save(bert_model, 'Models/bert_model.pt')
    torch.cuda.empty_cache()

### Finbert Model Testing

In [18]:
finbert_model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [19]:
import torch
def predict_article(model, tokenizer, sample):
    inputs = tokenizer(sample, return_tensors='pt').to(device)
    output = model(**inputs)
    return output.logits

def convert_to_sentiment_int(output, modelType):
    if modelType == 'BERT':
        return int(torch.argmax(output, dim=-1))
    elif modelType == 'FinBERT':
        result = int(torch.argmax(output, dim=-1))
        # if 0, return 1, if 1, return 0, if 2, return 2. why? finbert flips 0 and 1. 0 is positive, 1 is negative. we want the opposite. 
        # 2 would be neutral. 
        return 1 if result == 0 else 0 if result == 1 else 2

# now given a dataframe of samples, we can evaluate the model on each sample and return the results
def predict_articles(model, tokenizer, df, modelType):
    true_labels = []
    predictions = []
    numCorrect = 0
    numPos = 0
    numNeg = 0
    numNeut = 0
    for index, row in tqdm(df.iterrows(), total=df.shape[0]):
        
        # 0 if negative, 1 if positive, 2 if neutral
        ground_truth = row['sentiment']
        true_labels.append(ground_truth)
        model_output = predict_article(model, tokenizer, row['title'])
        prediction = convert_to_sentiment_int(model_output, modelType)
        predictions.append(prediction)
        numCorrect+=1 if ground_truth == prediction else 0
        if ground_truth == prediction:
            if prediction == 0:
                numNeg+=1
            elif prediction == 1:
                numPos+=1
            else:
                numNeut+=1
        
    accuracy = numCorrect / len(df)
    acc_pos = numPos / len(df[df['sentiment'] == 1])
    acc_neg = numNeg / len(df[df['sentiment'] == 0])
    acc_neut = numNeut / len(df[df['sentiment'] == 2])
    
    
    return accuracy, acc_pos, acc_neg, acc_neut, true_labels, predictions

#combine x_test and y_test to make a dataframe
test_df = pd.concat([x_test, y_test], axis=1)
finbert_accuracy, fin_pos, fin_neg, fin_neut, true_labels, predictions = predict_articles(finbert_model, finbert_tokenizer, test_df, 'FinBERT')

print(finbert_accuracy)

100%|██████████| 6893/6893 [01:23<00:00, 82.70it/s]

0.4124474104163644





In [20]:
accuracy, df = score_report(predictions, true_labels)
print(f"Accuracy: {accuracy}")
df

Accuracy: 0.4124474104163644


Unnamed: 0,Negative,Positive,Neutral
F1 Score,40.865892,38.164706,43.771747
Precision,47.223917,44.609461,36.61234
Recall,36.016752,33.347039,54.411765


Bert model results are slightly different from report results as this was a separate run. We originally had the models in two separate files but opted to combine them for submission as it made more sense for the submission