[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/carloszan/experimental-fine-tuning/blob/main/fine-tunning-4-fiqa.ipynb)

In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Hugging Face Hub id of the pretrained sentiment classifier under evaluation.
base_model = "Seethal/sentiment_analysis_generic_dataset"

# Load the tokenizer once. `device_map` is a model-placement option and is not
# needed here, so the plain call is sufficient.
tokenizer = AutoTokenizer.from_pretrained(base_model)

# Request 8-bit weights, with float16 as the dtype for the remaining tensors.
model = AutoModelForSequenceClassification.from_pretrained(base_model, load_in_8bit=True, torch_dtype=torch.float16)

# Inference only: put the module in evaluation mode (eval() returns the module).
model = model.eval()

In [2]:
from sklearn.metrics import accuracy_score, f1_score
from datasets import load_dataset
from tqdm import tqdm
import torch
import numpy as np

def change_format(x):
    """Mirror a three-class index: 0 -> 2, 1 -> 1, 2 -> 0.

    Any other value raises ``KeyError``. Presumably this reconciles the
    classifier's label order with the dataset's label order -- verify against
    both label definitions.
    """
    mirrored = {idx: 2 - idx for idx in range(3)}
    return mirrored[x]

def test_fiqa(model, tokenizer, batch_size=8):
    """Score a sequence-classification model on the FiQA-2018 test split.

    Loads the ``test`` split of ``pauri32/fiqa-2018``, runs ``model`` over the
    ``sentence`` column in batches, converts the logits to softmax
    probabilities, takes the arg-max class, remaps it with ``change_format``
    and prints accuracy plus macro / micro / weighted F1 against ``label``.

    Args:
        model: Callable as ``model(**tokens)`` returning an object with a
            ``.logits`` tensor. If it exposes a ``.device`` attribute, the
            tokenized inputs are moved there before the forward pass.
        tokenizer: Callable tokenizer accepting a list of strings. Its
            ``pad_token`` is set to ``"[PAD]"`` as a side effect.
        batch_size: Number of sentences per forward pass.

    Returns:
        pandas.DataFrame with columns ``input``, ``target``, ``out_text``
        (per-class probabilities), ``new_out_np`` (arg-max index) and
        ``new_out`` (arg-max index after ``change_format``).
    """
    frame = load_dataset('pauri32/fiqa-2018')["test"].to_pandas()
    # rename() returns a new frame, so the column assignments further down do
    # not write into a slice of the loaded table.
    frame = frame[['sentence', 'label']].rename(
        columns={'sentence': 'input', 'label': 'target'})

    # print example
    print(f"\n\nPrompt example:\n{frame['input'].iloc[1]}\n\n")

    context = frame['input'].tolist()
    # Ceiling division: `n // batch_size + 1` yields an extra, empty batch
    # whenever n is an exact multiple of batch_size.
    total_steps = (len(context) + batch_size - 1) // batch_size
    print(
        f"Total len: {len(context)}. Batchsize: {batch_size}. Total steps: {total_steps}")

    # Loop-invariant setup, done once rather than on every batch.
    tokenizer.pad_token = "[PAD]"
    device = getattr(model, "device", None)

    batch_probs = []

    # Evaluation only: skip autograd bookkeeping for the forward passes.
    with torch.no_grad():
        for start in tqdm(range(0, len(context), batch_size)):
            batch = context[start:start + batch_size]

            # truncation=True keeps over-long sentences within the tokenizer's
            # configured maximum length.
            tokens = tokenizer(batch, return_tensors='pt', padding=True, truncation=True)
            if device is not None:
                # Inputs must sit on the same device as the model weights.
                tokens = tokens.to(device)

            logits = model(**tokens).logits
            probs = torch.nn.functional.softmax(logits.float(), dim=-1)
            # .cpu() is required before .numpy() when the model runs on a GPU.
            batch_probs.append(probs.cpu().numpy())

    torch.cuda.empty_cache()

    # Flatten the per-batch arrays into one probability vector per input row.
    frame["out_text"] = [row for probs in batch_probs for row in probs]
    frame["new_out_np"] = frame["out_text"].apply(np.argmax)
    frame["new_out"] = frame["new_out_np"].apply(change_format)

    acc = accuracy_score(frame["target"], frame["new_out"])
    f1_macro = f1_score(frame["target"], frame["new_out"], average="macro")
    f1_micro = f1_score(frame["target"], frame["new_out"], average="micro")
    f1_weighted = f1_score(
        frame["target"], frame["new_out"], average="weighted")

    print(f"Acc: {acc}. F1 macro: {f1_macro}. F1 micro: {f1_micro}. F1 weighted (BloombergGPT): {f1_weighted}. ")

    return frame


# Run the evaluation with the default batch size. The returned frame is kept
# in `dataset` and echoed as the cell's output.
dataset = test_fiqa(model, tokenizer)
dataset



Prompt example:
@gakrum nice chart shows distinctive down channel not a dip.. where do you see the bottom? $SPY ..$150? ..$130?


Total len: 150. Batchsize: 8. Total steps: 19


100%|██████████| 19/19 [00:00<00:00, 20.78it/s]

Acc: 0.42. F1 macro: 0.42057716557413344. F1 micro: 0.41999999999999993. F1 weighted (BloombergGPT): 0.4205771655741335. 





Unnamed: 0,input,target,out_text,new_out_np,new_out
0,$HCP Come to the party and buy this -gonna giv...,0,"[0.011196633, 0.074687906, 0.9141154]",2,0
1,@gakrum nice chart shows distinctive down chan...,2,"[0.008964843, 0.97913545, 0.011899706]",1,1
2,Japan's Asahi to submit bid next week for SABM...,0,"[0.0008679158, 0.9974624, 0.0016696872]",1,1
3,"Tesla Motors recalls 2,700 Model X SUVs $TSLA ...",2,"[0.45686668, 0.53308904, 0.010044334]",1,1
4,CRH's concrete bid for Holcim Lafarge assets,0,"[0.0006375049, 0.99752325, 0.0018392853]",1,1
...,...,...,...,...,...
145,Intertek swings to ÃÂ£347 mln loss on oil's s...,2,"[0.79997325, 0.16599314, 0.034033716]",0,2
146,RT @jan $ARNA Don't think buyout rumor strong ...,1,"[0.017061792, 0.09405435, 0.8888838]",2,0
147,Barclays appoints JPMorgan's Paul Compton as n...,1,"[0.0026774546, 0.9311581, 0.06616437]",1,1
148,$AAPL Now I'm glad I got stopped out of my $11...,1,"[0.001520809, 0.0064909854, 0.9919882]",2,0


In [3]:
import os

results_csv = '../../results/financial/bert-sentiment-analysis.csv'
# to_csv does not create missing directories, so make sure the target folder
# exists before writing.
os.makedirs(os.path.dirname(results_csv), exist_ok=True)
dataset.to_csv(results_csv)

In [2]:
import sys

import pandas as pd

# Make the directory two levels up importable so that `metrics` resolves
# (presumably a project-local module -- confirm). The guard keeps re-running
# the cell from stacking duplicate entries on sys.path.
if '../../' not in sys.path:
    sys.path.append('../../')
from metrics import metrics

# Read the saved predictions back; the first CSV column is the row index.
df = pd.read_csv('../../results/financial/bert-sentiment-analysis.csv', index_col=0)

true, pred = df["target"], df["new_out"]

metrics(true, pred)

Precision: 0.554, Recall: 0.333, F1: 0.217, Accuracy: 0.333


In [3]:
# NOTE(review): this cell repeats the preceding evaluation cell verbatim
# (same CSV, same columns, same metrics call); consider dropping one of them.
import sys

import pandas as pd

# Make the directory two levels up importable so that `metrics` resolves
# (presumably a project-local module -- confirm). The guard keeps re-running
# the cell from stacking duplicate entries on sys.path.
if '../../' not in sys.path:
    sys.path.append('../../')
from metrics import metrics

# Read the saved predictions back; the first CSV column is the row index.
df = pd.read_csv('../../results/financial/bert-sentiment-analysis.csv', index_col=0)

true, pred = df["target"], df["new_out"]

metrics(true, pred)

Precision: 0.554, Recall: 0.333, F1: 0.217, Accuracy: 0.333
