[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/carloszan/experimental-fine-tuning/blob/main/fine-tunning-4-fiqa.ipynb)

In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

base_model = "Seethal/sentiment_analysis_generic_dataset"

tokenizer = AutoTokenizer.from_pretrained(base_model)

tokenizer = AutoTokenizer.from_pretrained(base_model, device_map="auto")
model = AutoModelForSequenceClassification.from_pretrained(base_model, load_in_8bit=True, torch_dtype=torch.float16)

model = model.eval()

In [17]:
tokens = tokenizer(["Fuck this economy. I hate aig and their non loan given asses"], return_tensors='pt',
                           padding=True)
output = model(**tokens)

output = torch.nn.functional.softmax(output.logits.float(), dim=-1)
output.argmax()

tensor(0)

In [18]:
from sklearn.metrics import accuracy_score, f1_score
from datasets import load_dataset
from tqdm import tqdm
import torch
import numpy as np
import pandas as pd

def change_format(x):
    dic = {0:0, 1:2, 2:4}
    return dic[x]

def test_fiqa(model, tokenizer, batch_size=8):
    dataset = load_dataset('sentiment140')
    dataset = dataset["test"]
    dataset = dataset.to_pandas()

    negative_df = dataset.query("sentiment == 0")[:50]
    neutral_df = dataset.query("sentiment == 2")[:50]
    positive_df = dataset.query("sentiment == 4")[:50]

    dataset = pd.concat([negative_df, neutral_df, positive_df])

    dataset = dataset.rename(columns={"sentiment": "target"})


    dataset = dataset[['text', 'target']]

    # print example
    print(f"\n\nPrompt example:\n{dataset['text'][1]}\n\n")

    context = dataset['text'].tolist()
    total_steps = dataset.shape[0]//batch_size + 1
    print(
        f"Total len: {len(context)}. Batchsize: {batch_size}. Total steps: {total_steps}")

    out_text = []

    for i in tqdm(range(total_steps)):
        tmp_context = context[i * batch_size:(i+1) * batch_size]

        tokenizer.pad_token = "[PAD]"

        tokens = tokenizer(tmp_context, return_tensors='pt', padding=True)

        output = model(**tokens)
        output = torch.nn.functional.softmax(output.logits.float(), dim=-1)
        out_text.append(output.detach().numpy())
        torch.cuda.empty_cache()

    out_text = [item for sublist in out_text for item in sublist]
    dataset["out_text"] = out_text
    dataset["new_out"] = dataset["out_text"].apply(np.argmax).apply(change_format)

    acc = accuracy_score(dataset["target"], dataset["new_out"])
    f1_macro = f1_score(dataset["target"], dataset["new_out"], average="macro")
    f1_micro = f1_score(dataset["target"], dataset["new_out"], average="micro")
    f1_weighted = f1_score(
        dataset["target"], dataset["new_out"], average="weighted")

    print(f"Acc: {acc}. F1 macro: {f1_macro}. F1 micro: {f1_micro}. F1 weighted (BloombergGPT): {f1_weighted}. ")

    return dataset


dataset = test_fiqa(model, tokenizer)
dataset



Prompt example:
Reading my kindle2...  Love it... Lee childs is good read.


Total len: 150. Batchsize: 8. Total steps: 19


100%|██████████| 19/19 [00:00<00:00, 36.08it/s]

Acc: 0.7333333333333333. F1 macro: 0.7359278669358401. F1 micro: 0.7333333333333333. F1 weighted (BloombergGPT): 0.7359278669358402. 





Unnamed: 0,text,target,out_text,new_out
6,Fuck this economy. I hate aig and their non lo...,0,"[0.9956375, 0.003263108, 0.0010994273]",0
11,@Karoli I firmly believe that Obama/Pelosi hav...,0,"[0.9826833, 0.013786437, 0.0035302923]",0
14,"dear nike, stop with the flywire. that shit is...",0,"[0.98321956, 0.014683595, 0.0020968618]",0
16,I was talking to this guy last night and he wa...,0,"[0.99553454, 0.003466424, 0.0009989834]",0
18,"@ludajuice Lebron is a Beast, but I'm still ch...",0,"[0.035161164, 0.07469104, 0.8901478]",4
...,...,...,...,...
106,"@psychemedia I really liked @kswedberg's ""Lear...",4,"[0.0004353631, 0.0025622505, 0.9970024]",4
108,"Very Interesting Ad from Adobe by Goodby, Silv...",4,"[0.0007936559, 0.0032928262, 0.99591345]",4
109,Goodby Silverstein agency new site! http://www...,4,"[0.004666965, 0.02046132, 0.9748717]",4
110,"RT @designplay Goodby, Silverstein's new site:...",4,"[0.00101758, 0.004529408, 0.994453]",4


In [19]:
dataset

Unnamed: 0,text,target,out_text,new_out
6,Fuck this economy. I hate aig and their non lo...,0,"[0.9956375, 0.003263108, 0.0010994273]",0
11,@Karoli I firmly believe that Obama/Pelosi hav...,0,"[0.9826833, 0.013786437, 0.0035302923]",0
14,"dear nike, stop with the flywire. that shit is...",0,"[0.98321956, 0.014683595, 0.0020968618]",0
16,I was talking to this guy last night and he wa...,0,"[0.99553454, 0.003466424, 0.0009989834]",0
18,"@ludajuice Lebron is a Beast, but I'm still ch...",0,"[0.035161164, 0.07469104, 0.8901478]",4
...,...,...,...,...
106,"@psychemedia I really liked @kswedberg's ""Lear...",4,"[0.0004353631, 0.0025622505, 0.9970024]",4
108,"Very Interesting Ad from Adobe by Goodby, Silv...",4,"[0.0007936559, 0.0032928262, 0.99591345]",4
109,Goodby Silverstein agency new site! http://www...,4,"[0.004666965, 0.02046132, 0.9748717]",4
110,"RT @designplay Goodby, Silverstein's new site:...",4,"[0.00101758, 0.004529408, 0.994453]",4


In [20]:
path = '../../results/general/bert-sentiment-analysis.csv'

In [21]:
dataset.to_csv(path)

In [22]:
import pandas as pd

df = pd.read_csv(path, index_col=0)

true, pred = df["target"], df["new_out"]


import sys
sys.path.append('../../')
from metrics import metrics

metrics(true, pred)

Precision: 0.757, Recall: 0.733, F1: 0.736, Accuracy: 0.733
