In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

base_model = "nlptown/bert-base-multilingual-uncased-sentiment"

tokenizer = AutoTokenizer.from_pretrained(base_model)

tokenizer = AutoTokenizer.from_pretrained(base_model, device_map="auto")
model = AutoModelForSequenceClassification.from_pretrained(base_model, load_in_8bit=True, torch_dtype=torch.float16)

model = model.eval()

In [10]:
from sklearn.metrics import accuracy_score, f1_score
from datasets import load_dataset
from tqdm import tqdm
import torch
import numpy as np
import pandas as pd

def map_output(arg):
  # Stars 0 and 1 => Negative (which is 0)
  # Stars 2 => Neutral (which is 2)
  # Stars 3 and 4 => Positive (which is 4)
  dic = {0: 0, 1: 0, 2: 2, 3: 4, 4: 4}
  return dic[arg]


def test_sentiment140(model, tokenizer, batch_size=8):
    dataset = load_dataset('sentiment140')
    dataset = dataset["test"]
    dataset = dataset.to_pandas()

    negative_df = dataset.query("sentiment == 0")[:50]
    neutral_df = dataset.query("sentiment == 2")[:50]
    positive_df = dataset.query("sentiment == 4")[:50]

    dataset = pd.concat([negative_df, neutral_df, positive_df])

    dataset = dataset.rename(columns={"sentiment": "target"})


    dataset = dataset[['text', 'target']]

    # print example
    print(f"\n\nPrompt example:\n{dataset['text'][1]}\n\n")

    context = dataset['text'].tolist()
    total_steps = dataset.shape[0]//batch_size + 1
    print(
        f"Total len: {len(context)}. Batchsize: {batch_size}. Total steps: {total_steps}")

    out_text = []

    for i in tqdm(range(total_steps)):
        tmp_context = context[i * batch_size:(i+1) * batch_size]

        tokenizer.pad_token = "[PAD]"

        tokens = tokenizer(tmp_context, return_tensors='pt',
                           padding=True)

        output = model(**tokens)
        output = torch.nn.functional.softmax(output.logits.float(), dim=-1)
        out_text.append(output.detach().numpy())
        torch.cuda.empty_cache()

    out_text = [item for sublist in out_text for item in sublist]
    dataset["out_text"] = out_text
    dataset["new_out"] = dataset["out_text"].apply(np.argmax).apply(map_output)

    acc = accuracy_score(dataset["target"], dataset["new_out"])
    f1_macro = f1_score(dataset["target"], dataset["new_out"], average="macro")
    f1_micro = f1_score(dataset["target"], dataset["new_out"], average="micro")
    f1_weighted = f1_score(
        dataset["target"], dataset["new_out"], average="weighted")

    print(f"Acc: {acc}. F1 macro: {f1_macro}. F1 micro: {f1_micro}. F1 weighted (BloombergGPT): {f1_weighted}. ")

    return dataset

dataset = test_sentiment140(model, tokenizer)
dataset



Prompt example:
Reading my kindle2...  Love it... Lee childs is good read.


Total len: 150. Batchsize: 8. Total steps: 19


100%|██████████| 19/19 [00:00<00:00, 19.53it/s]


Acc: 0.62. F1 macro: 0.5516226138899054. F1 micro: 0.62. F1 weighted (BloombergGPT): 0.5516226138899054. 


Unnamed: 0,text,target,out_text,new_out
6,Fuck this economy. I hate aig and their non lo...,0,"[0.94383484, 0.044057608, 0.0052516637, 0.0019...",0
11,@Karoli I firmly believe that Obama/Pelosi hav...,0,"[0.94535536, 0.037818972, 0.0053796084, 0.0027...",0
14,"dear nike, stop with the flywire. that shit is...",0,"[0.9455346, 0.035257842, 0.0070245187, 0.00254...",0
16,I was talking to this guy last night and he wa...,0,"[0.7907809, 0.12542555, 0.034339704, 0.0163003...",0
18,"@ludajuice Lebron is a Beast, but I'm still ch...",0,"[0.949898, 0.033997145, 0.004770301, 0.0024917...",0
...,...,...,...,...
106,"@psychemedia I really liked @kswedberg's ""Lear...",4,"[0.4807703, 0.25910297, 0.14007127, 0.05775725...",0
108,"Very Interesting Ad from Adobe by Goodby, Silv...",4,"[0.4155449, 0.26777244, 0.20356138, 0.06113514...",0
109,Goodby Silverstein agency new site! http://www...,4,"[0.34933278, 0.25871626, 0.23764336, 0.0877662...",0
110,"RT @designplay Goodby, Silverstein's new site:...",4,"[0.409311, 0.2600469, 0.21110716, 0.06688347, ...",0


In [11]:
path = '../../results/general/bert.csv'

In [12]:
dataset.to_csv(path)

In [13]:
import pandas as pd

df = pd.read_csv(path, index_col=0)

true, pred = df["target"], df["new_out"]


import sys
sys.path.append('../../')
from metrics import metrics

metrics(true, pred)

Precision: 0.693, Recall: 0.62, F1: 0.552, Accuracy: 0.62
