In [38]:
!pip install datasets tqdm scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.3.2-cp311-cp311-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.5.0 (from scikit-learn)
  Downloading scipy-1.11.3-cp311-cp311-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.4 kB ? eta -:--:--
     ---------------------------------------- 60.4/60.4 kB 3.1 MB/s eta 0:00:00
Collecting joblib>=1.1.1 (from scikit-learn)
  Downloading joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=2.0.0 (from scikit-learn)
  Downloading threadpoolctl-3.2.0-py3-none-any.whl.metadata (10.0 kB)
Downloading scikit_learn-1.3.2-cp311-cp311-win_amd64.whl (9.2 MB)
   ---------------------------------------- 0.0/9.2 MB ? eta -:--:--
   --- ------------------------------------ 0.7/9.2 MB 23.1 MB/s eta 0:00:01
   ------------- -------------------------- 3.1/9.2 MB 39.3 MB/s eta 0:00:01
   ----------------------------- ---------- 6.8/9.2 MB 54.0 MB/s eta 0:00:01
   ------------------------------------

In [30]:
from datasets import load_dataset
import datasets

In [39]:
# Load the pauri32/fiqa-2018 dataset; it provides train, validation and test splits.
dataset = load_dataset("pauri32/fiqa-2018")

In [5]:
# concatenate_datasets returns a new Dataset and leaves its inputs untouched,
# so the merged result has to be bound to a name to be usable afterwards.
full_dataset = datasets.concatenate_datasets(
    [dataset["train"], dataset["validation"], dataset["test"]]
)
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'snippets', 'target', 'sentiment_score', 'aspects', 'format', 'label'],
        num_rows: 961
    })
    validation: Dataset({
        features: ['sentence', 'snippets', 'target', 'sentiment_score', 'aspects', 'format', 'label'],
        num_rows: 102
    })
    test: Dataset({
        features: ['sentence', 'snippets', 'target', 'sentiment_score', 'aspects', 'format', 'label'],
        num_rows: 150
    })
})

# Label values:
0: positive

1: neutral

2: negative

## Todo

Refactor the `test_fiqa` function below to use the `label` column.

In [40]:
import torch
from datasets import load_dataset
from sklearn.metrics import accuracy_score,f1_score
from tqdm import tqdm


def format_example(example: dict) -> dict:
    """Turn one record into a prompt string and its expected answer.

    The prompt always starts with an "Instruction: ..." line and ends with
    "Answer: ". An "Input: ..." line is inserted between them only when the
    record carries a truthy "input" value.

    Args:
        example: mapping with "instruction" and "output" entries and an
            optional "input" entry.

    Returns:
        dict with "context" (the prompt) and "target" (the record's "output").
    """
    pieces = [f"Instruction: {example['instruction']}\n"]
    if example.get("input"):
        pieces.append(f"Input: {example['input']}\n")
    pieces.append("Answer: ")
    return {"context": "".join(pieces), "target": example["output"]}

def change_target(x):
    """Normalise a sentiment value to 'positive', 'negative' or 'neutral'.

    Two kinds of input are accepted:

    * text (e.g. a generated answer): 'positive' / 'Positive' wins over
      'negative' / 'Negative'; anything else is treated as 'neutral';
    * an integer class id as stored in the dataset's `label` column:
      0 -> positive, 1 -> neutral, 2 -> negative. Unknown ids fall back to
      'neutral', mirroring the text fallback.

    Args:
        x: sentiment text or integer class id.

    Returns:
        One of the strings 'positive', 'negative', 'neutral'.
    """
    label_names = {0: 'positive', 1: 'neutral', 2: 'negative'}

    # Python ints and NumPy integers expose __index__; strings and other
    # containers do not, so they keep going through the substring checks below.
    if hasattr(x, '__index__'):
        return label_names.get(x.__index__(), 'neutral')

    if 'positive' in x or 'Positive' in x:
        return 'positive'
    elif 'negative' in x or 'Negative' in x:
        return 'negative'
    else:
        return 'neutral'


def test_fiqa(model, tokenizer, batch_size = 8):
    """Evaluate a generative model on the test split of pauri32/fiqa-2018.

    Every test sentence is wrapped in an instruction prompt, the text the
    model produces after "Answer: " is collected, and both the gold and the
    generated answers are reduced to positive / negative / neutral with
    `change_target` before accuracy and F1 scores are printed.

    Prompts are moved to the GPU with `.cuda()`, so a CUDA device is required.

    Args:
        model: object exposing `generate(**tokens, max_length=...)`.
        tokenizer: callable tokenizer that also provides `batch_decode`.
        batch_size: number of prompts per `generate` call; must be >= 1.

    Returns:
        pandas.DataFrame with the columns input, output, instruction,
        context, target, out_text, new_target and new_out.

    Raises:
        ValueError: if `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Integer class ids stored in the `label` column.
    label_names = {0: "positive", 1: "neutral", 2: "negative"}

    dataset = load_dataset('pauri32/fiqa-2018')
    dataset = dataset["test"]
    dataset = dataset.to_pandas()
    # Spell the gold labels out as words so they can be normalised with the
    # same text-based `change_target` that is applied to the generated text.
    dataset["output"] = dataset['label'].map(lambda v: label_names.get(v, v))
    dataset["instruction"] = "What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}."

    # rename() returns a fresh frame, so the column assignments below do not
    # act on a slice of the original frame.
    dataset = dataset[['sentence', 'output', 'instruction']].rename(columns={'sentence': 'input'})
    dataset[['context', 'target']] = dataset.apply(format_example, axis = 1, result_type="expand")

    # print example
    print(f"\n\nPrompt example:\n{dataset['context'][1]}\n\n")

    context = dataset['context'].tolist()
    # Ceiling division, so no empty trailing batch is scheduled when the row
    # count is an exact multiple of batch_size.
    total_steps = (len(context) + batch_size - 1) // batch_size
    print(f"Total len: {len(context)}. Batchsize: {batch_size}. Total steps: {total_steps}")

    out_text_list = []

    for start in tqdm(range(0, len(context), batch_size)):
        tmp_context = context[start:start + batch_size]
        tokens = tokenizer(tmp_context, return_tensors='pt', padding=True, max_length=512)
        for k in tokens.keys():
            tokens[k] = tokens[k].cuda()

        res = model.generate(**tokens, max_length=512)
        res_sentences = tokenizer.batch_decode(res)
        for decoded in res_sentences:
            # Keep the text following the prompt's "Answer: " marker; if the
            # marker is missing from the decoded text, record an empty answer
            # instead of failing with an IndexError.
            parts = decoded.split("Answer: ")
            out_text_list.append(parts[1] if len(parts) > 1 else "")
        torch.cuda.empty_cache()

    dataset["out_text"] = out_text_list
    dataset["new_target"] = dataset["target"].apply(change_target)
    dataset["new_out"] = dataset["out_text"].apply(change_target)

    acc = accuracy_score(dataset["new_target"], dataset["new_out"])
    f1_macro = f1_score(dataset["new_target"], dataset["new_out"], average = "macro")
    f1_micro = f1_score(dataset["new_target"], dataset["new_out"], average = "micro")
    f1_weighted = f1_score(dataset["new_target"], dataset["new_out"], average = "weighted")

    print(f"Acc: {acc}. F1 macro: {f1_macro}. F1 micro: {f1_micro}. F1 weighted (BloombergGPT): {f1_weighted}. ")

    return dataset

In [41]:
# NOTE(review): with model=None and tokenizer=None this call fails at the first
# batch (TypeError: 'NoneType' object is not callable, see the output below);
# pass a loaded model and tokenizer instead.
test_fiqa(None, None)



Prompt example:
Instruction: What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}.
Input: @gakrum nice chart shows distinctive down channel not a dip.. where do you see the bottom? $SPY ..$150? ..$130?
Answer: 


Total len: 150. Batchsize: 8. Total steps: 19


  0%|          | 0/19 [00:00<?, ?it/s]


TypeError: 'NoneType' object is not callable

In [18]:
# NOTE(review): test_fiqa as defined above keeps only input/output/instruction
# plus derived columns, so `label` is not a column of the frame it returns.
# The output shown below presumably comes from an earlier version of the
# function -- confirm before relying on it.
test_fiqa(None, None).query('label == 1')

Unnamed: 0,sentence,snippets,target,sentiment_score,aspects,format,label
10,Sold some $TSLA Puts this morning Closed them ...,['Sold some $TSLA Puts this morning'],TSLA,-0.043,['Stock/Price Action/Bearish/Bear Position'],post,1
12,RT @tomhend777 $MU needs to hold here -Broken ...,['Still not technically oversold so now big bo...,MU,-0.046,['Stock/Technical Analysis'],post,1
14,Valeant Names Interim Leader as CEO Remains Ho...,['Interim Leader as CEO Remains Hospitalized'],Valeant,0.0,['Corporate/Appointment'],headline,1
16,Why $TWTR is doomed in 1 chart $FB $GOOG http...,['doomed in 1 chart'],GOOG,-0.07,['Stock/Technical Analysis'],post,1
18,AB InBev to Sell SABMiller Stake in China's Sn...,['to Sell SABMiller Stake in'],SABMiller,-0.045,['Stock/Signal/Sell Signal'],headline,1
21,Whitbread boss Andy Harrison defends sales fal...,['defends sales fall as 'just a blip''],Whitbread,-0.1,['Corporate/Sales'],headline,1
23,MillerCoors Board Names Gavin Hattersley Inter...,['Board Names Gavin Hattersley Interim CEO'],MillerCoors,0.0,['Corporate/Appointment'],headline,1
27,GlaxoSmithKline starts hunt for successor to C...,['starts hunt for successor to CEO'],GlaxoSmithKline,0.0,['Corporate/Appointment/Executive Appointment'],headline,1
30,Mylan Appoints Ranjan Ray Chaudhuri as Global ...,['Appoints Ranjan Ray Chaudhuri as Global Comm...,Mylan,0.0,['Corporate/Appointment'],headline,1
36,Keith Skeoch to step up as David Nish quits as...,['David Nish quits as chief executive'],Standard Life,-0.09,['Corporate/Appointment'],headline,1
