In [1]:
import matplotlib
import os
import pickle
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline, AutoModelForSeq2SeqLM
import matplotlib.pyplot as plt
import numpy as np
from scipy.special import softmax
from datasets import Dataset
import torch
from nltk.tokenize import word_tokenize
import nltk
from scipy.spatial import distance
from scipy.special import softmax

In [2]:
emo_model_id = "SamLowe/roberta-base-go_emotions"
emo_classifier = pipeline('text-classification', model=emo_model_id,tokenizer=emo_model_id, max_length=512, truncation=True, top_k=None)

In [3]:
if(os.path.exists('../modeldata/emo_probi.p')):
    print("LOADING data emotion probability distribution...")
    with open('../modeldata/emo_probi.p', "rb") as f:
        [all_emo_probi, _] = pickle.load(f)
    f.close()
all_emo_probi = dict(all_emo_probi)

LOADING data emotion probability distribution...


In [3]:
def append_scores(labels, original, sample):
    all_emo_scores = original
    for sam in sample:
        for s in sam:
            emo = s.get('label')
            prev_score = original[emo]
            score = s.get('score')
            all_emo_scores[emo] = (prev_score + score)
    all_scores = list(zip(*all_emo_scores.items()))[1]
    probi = softmax(all_scores)
    all_emo_scores = dict(zip(labels, probi))
    return all_emo_scores

def weighted_bleu_score(target, response):
    score1 = nltk.translate.bleu_score.sentence_bleu([target], response, weights=(1, 0, 0))
    score2 = nltk.translate.bleu_score.sentence_bleu([target], response, weights=(0, 1, 0))
    score3 = nltk.translate.bleu_score.sentence_bleu([target], response, weights=(0, 0, 1))
    ngram_score_list = [score1, score2, score3]
    return (sum(ngram_score_list) / len(ngram_score_list))

In [4]:
print("Start testing.")
device = torch.cuda.current_device() if torch.cuda.is_available() else "cpu"
BLEU_score_list = []
texts = []
ppo_model = "../DEV-blenderbot-400m-emo-probi-bleu-epoch0-score0.382-bleu0.06629"
model_id = ppo_model
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, device_map={"": device}, torch_dtype=torch.float32)
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

Start testing.


Some weights of the model checkpoint at ../DEV-blenderbot-400m-emo-probi-bleu-epoch0-score0.382-bleu0.06629 were not used when initializing BlenderbotForConditionalGeneration: ['v_head.summary.weight', 'v_head.summary.bias']
- This IS expected if you are initializing BlenderbotForConditionalGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BlenderbotForConditionalGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
with open("../modeldata/test_dialogue_dataset.p", "rb") as f:
    [test_dataset] = pickle.load(f)
test_dataset = Dataset.from_dict(test_dataset)[:10]
test_dataset = Dataset.from_dict(test_dataset)

def tokenize(sample):
    prompt = sample["prompt"] # prompt
    continuation = sample["target"] # utterance

    sample["input_ids"] = tokenizer.encode(prompt)
    #sample["target_ids"] = tokenizer.encode(continuation)[: input_size()]
    sample["query"] = {"prompt": tokenizer.decode(sample["input_ids"]), "target": continuation}
    return sample

test_dataset = test_dataset.map(tokenize, batched=False)
test_dataset.set_format(type="torch")
emp_weight = 0.2
fluency_weight = 0.8

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [9]:
print("Start testing...")
print(len(test_dataset))
#try:
with open(f'DEV_test_text_log_emo_probi_score.txt', 'w') as text_log:
    counter = 0
    prompts = []
    for test_query in test_dataset:
        input_texts = test_query["prompt"]
        prompts.append(input_texts)
        target = test_query["query"]["target"]
        #print(target)
        input_ids = tokenizer(input_texts, return_tensors="pt", padding=True).to(device)
        #input_ids = test_query['input_ids']
        outputs = model.generate(**input_ids, do_sample=True, max_new_tokens=40, use_cache=True)
        generated_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        texts.append(generated_texts[0])
        #text_log.write(f"{counter} Prompt: {input_texts} \n")
        #text_log.write(f"{counter} Response: {generated_texts[0]} \n")
        #text_log.write(f"{counter} Ref: {target} \n")
        counter += 1
        print(counter)

        # Calculate bleu score
        test_response = word_tokenize(generated_texts[0])
        dev_target = word_tokenize(test_query["target"])
        dev_BLEUscore = weighted_bleu_score(dev_target, test_response)
        BLEU_score_list.append(dev_BLEUscore)
text_log.close()

mean_bleu = sum(BLEU_score_list) / len(BLEU_score_list)

# calculate emo distribution
prompt_results = emo_classifier(prompts)
emo_results = emo_classifier(texts)
labels = [s.get('label') for s in emo_results[0]]
zeros = [0] * len(labels)
list_js_distance = []
for i in range(len(prompt_results)):
    #print(prompt_results[i])
    #print(emo_results[i])
    prompt_dict = dict(zip(labels, zeros))
    response_dict = dict(zip(labels, zeros))
    for j in range(len(prompt_results[i])):
        label = prompt_results[i][j].get("label")
        prompt_dict[label] = prompt_results[i][j].get("score")
        label = emo_results[i][j].get("label")
        response_dict[label] = emo_results[i][j].get("score")

    prompt_value = dict(sorted(prompt_dict.items(), key=lambda x: x[0].lower())).values()
    prompt_value = list(prompt_value)
    #print(prompt_value)
    response_value = dict(sorted(response_dict.items(), key=lambda x: x[0].lower())).values()
    response_value = list(response_value)

    js_distance = distance.jensenshannon(prompt_value, response_value)
    list_js_distance.append(js_distance)
    #print(response_value)
mean_js_distance = sum(list_js_distance) / len(list_js_distance)

"""
zeros = [0] * len(labels)
score_dict = dict(zip(labels, zeros))
empathy_results = append_scores(labels, score_dict, emo_results)
# sort alphabetically
empathy_results = dict(sorted(empathy_results.items(), key=lambda x: x[0].lower()))
all_emo_probi_values = list(all_emo_probi.values())
empathy_results_values = list(empathy_results.values())

js_distance = distance.jensenshannon(all_emo_probi_values, empathy_results_values)
# js_distance: identical = 0, entirely different = 1, reverse this for reward
emo_score = 1 - js_distance

current_score = (emo_score * emp_weight) + (mean_bleu * fluency_weight)
with open(f'DEV_test_score_log_emo_probi_score.txt', 'wb') as score_log:
    score_log.write(f"Mean BLEU of this model: {mean_bleu}. \n")
    score_log.write(f"Emo distribution similarity of this model: {emo_score}. \n")
    score_log.write(f"Score of this model: {current_score}. \n")
    print(f"Mean BLEU of this model: {mean_bleu}. \n")
    print(f"Emo distribution similarity of this model: {emo_score}. \n")
    print(f"Score of this model: {current_score}. \n")
score_log.close()
#except Exception as err:
#    with open(f'DEV_test_error_log_emo_probi_score.txt', 'w') as err_log:
#        err_log.write(f"Unexpected {err=}, {type(err)=}")
#    err_log.close()
"""

Start testing...
10
1
2
3
4
5
6
7
8
9
10
dict_values([0.015353312715888023, 0.004193090833723545, 0.003757640952244401, 0.007810292299836874, 0.032693423330783844, 0.008610889315605164, 0.0044470117427408695, 0.00202195905148983, 0.004049321636557579, 0.017276519909501076, 0.007352734915912151, 0.011946394108235836, 0.004603976849466562, 0.005533120594918728, 0.817112147808075, 0.002720079617574811, 0.011198784224689007, 0.008417824283242226, 0.0030771277379244566, 0.040655042976140976, 0.09969515353441238, 0.010760837234556675, 0.0029458615463227034, 0.04813799634575844, 0.006577658466994762, 0.002215614775195718, 0.07298275083303452, 0.012798329815268517])
[('admiration', 0.041058462113142014), ('amusement', 0.012828965671360493), ('anger', 0.005675230175256729), ('annoyance', 0.008834452368319035), ('approval', 0.039628565311431885), ('caring', 0.15474122762680054), ('confusion', 0.15657512843608856), ('curiosity', 0.17108076810836792), ('desire', 0.006378207355737686), ('disappoint

'\nzeros = [0] * len(labels)\nscore_dict = dict(zip(labels, zeros))\nempathy_results = append_scores(labels, score_dict, emo_results)\n# sort alphabetically\nempathy_results = dict(sorted(empathy_results.items(), key=lambda x: x[0].lower()))\nall_emo_probi_values = list(all_emo_probi.values())\nempathy_results_values = list(empathy_results.values())\n\njs_distance = distance.jensenshannon(all_emo_probi_values, empathy_results_values)\n# js_distance: identical = 0, entirely different = 1, reverse this for reward\nemo_score = 1 - js_distance\n\ncurrent_score = (emo_score * emp_weight) + (mean_bleu * fluency_weight)\nwith open(f\'DEV_test_score_log_emo_probi_score.txt\', \'wb\') as score_log:\n    score_log.write(f"Mean BLEU of this model: {mean_bleu}. \n")\n    score_log.write(f"Emo distribution similarity of this model: {emo_score}. \n")\n    score_log.write(f"Score of this model: {current_score}. \n")\n    print(f"Mean BLEU of this model: {mean_bleu}. \n")\n    print(f"Emo distribution