## **Inference for emotion recognition**

In [1]:
!pip install -r '../input/test-data/requirements.txt'



In [2]:
!pip install peft==0.3.0



#### **Import Libraries**

In [3]:
from accelerate import Accelerator
import os
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    AutoConfig,
    BitsAndBytesConfig,
)
import time
import json
import matplotlib.pyplot as plt
import seaborn as sns
from torch.utils.data import Dataset, DataLoader



#### **Config**

In [4]:
# Params
class Config:
    """Container for the run parameters: a seed plus speaker/video/audio flags."""

    def __init__(self) -> None:
        # Seed value kept alongside the modality switches.
        self.seed = 42
        # Modality switches: speaker on, video and audio off.
        self.speaker, self.video, self.audio = True, False, False

In [5]:
# Testing with emotion labels or without.
# True  -> the labelled evaluation path is used (accuracy and F1 are reported).
# False -> plain inference; the "emotion" fields are filled with predictions.
with_labels = False

#### **Dataset Preprocessing and Preparation**

* We divide the real test dataset into groups of 50 conversations each (roughly 500 utterances) because of session limits.
* There are 664 conversations in total.
* This gives 13 full groups of 50 plus one final group of 14 conversations (664 = 13 × 50 + 14).

In [57]:
def split_data(data_file, group_size=50):
    """Split a JSON list of conversations into consecutive group files.

    Reads ``data_file`` (a JSON array), cuts it into consecutive slices of
    ``group_size`` items and writes each slice to
    ``../working/input/group<k>.json`` with ``k`` starting at 1.

    Args:
        data_file: Path of the JSON file holding the list of conversations.
        group_size: Maximum number of conversations per group (default 50).

    Returns:
        List of the written file paths, in group order.

    Raises:
        ValueError: If ``group_size`` is not a positive integer.
    """
    if group_size <= 0:
        raise ValueError("group_size must be a positive integer")

    # Context manager so the input handle is closed even if parsing fails.
    with open(data_file) as file:
        data = json.load(file)
    os.makedirs('../working/input', exist_ok=True)

    files = []
    # Number the groups with their own counter. Deriving the number from
    # round(count / 50) gives a trailing partial group the same number as the
    # previous full group, which overwrites that group's file.
    for group_num, start in enumerate(range(0, len(data), group_size), start=1):
        group = data[start:start + group_size]
        save_file = "../working/input/" + "group" + str(group_num) + ".json"
        files.append(save_file)
        with open(save_file, 'w') as f:
            f.write(json.dumps(group))
        print("Saved Group: {} Length: {}".format(group_num, len(group)))
    return files

In [58]:
def get_prompt(sample_data_point, speaker=False, video=False, audio=False):
    """Build the ``[INST]`` prompt asking for the emotion of one utterance.

    ``sample_data_point`` must provide ``"utterance_id"`` and ``"conversation"``.
    A sentence explaining the caption fields is added only when ``speaker`` and
    ``video`` are both set (the audio wording additionally needs ``audio``).
    """
    # Optional sentence describing the caption fields present in the conversation.
    if speaker and video and audio:
        caption_note = " The video_captions for each utterance describe the video corresponding to the text and audio captions describe the audio."
    elif speaker and video:
        caption_note = " The video_captions for each utterance describe the video corresponding to the text."
    else:
        caption_note = ""

    utt_id = sample_data_point["utterance_id"]
    task = f"Identify the emotion label of utterance {utt_id} out of the 7 emotions: joy, anger, disgust, fear, sadness, surprise, neutral."
    output_rules = "\nDon't give an explanation.\nOutput only one line in the format:\nutterance_id :: emotion_label"
    instruction = task + caption_note + output_rules

    conversation = sample_data_point['conversation']
    return f"<s>[INST]\n{conversation}{instruction}\n[/INST]\n"


In [59]:
def get_prompt_and_out(sample_data_point, speaker=False, video=False, audio=False):
    """Return ``(prompt, expected_output)`` for one labelled utterance.

    ``sample_data_point`` must provide ``"utterance_id"``, ``"conversation"``
    and ``"emotion_label"``. The prompt is the ``[INST]`` block; the expected
    output is ``utterance_<id> :: <label>`` followed by a newline and ``</s>``.
    """
    utt_id = sample_data_point["utterance_id"]

    sentences = [
        "Identify the emotion label of utterance {} out of the 7 emotions: joy, anger, disgust, fear, sadness, surprise, neutral.".format(utt_id)
    ]
    # The caption explanation is only included when captions are requested.
    if speaker and video:
        if audio:
            sentences.append(" The video_captions for each utterance describe the video corresponding to the text and audio captions describe the audio.")
        else:
            sentences.append(" The video_captions for each utterance describe the video corresponding to the text.")
    sentences.append("\nDon't give an explanation.\nOutput only one line in the format:\nutterance_id :: emotion_label")
    instruction = "".join(sentences)

    prompt = "<s>[INST]\n{}{}\n[/INST]\n".format(sample_data_point['conversation'], instruction)
    target = "utterance_{} :: {}\n</s>".format(utt_id, sample_data_point['emotion_label'])
    return prompt, target

In [60]:
def get_formatted_conversation(data_sample, speaker=False, video=False, audio=False):
    """Render ``data_sample["conversation"]`` as the text block used in prompts.

    Every utterance becomes a brace-delimited group of ``"key": value`` lines.
    ``utterance_ID`` and ``text`` are always present; ``speaker`` is added when
    ``speaker`` is set, ``video_caption`` when ``speaker and video`` are set,
    and ``audio_caption`` only when all three flags are set.
    """
    # Pick the fields once; the flag combinations mirror the prompt variants.
    fields = ["utterance_ID", "text"]
    if speaker:
        fields.append("speaker")
        if video:
            fields.append("video_caption")
            if audio:
                fields.append("audio_caption")

    pieces = ['"conversation": [\n']
    for utt in data_sample["conversation"]:
        lines = "\n".join('"{}": {}'.format(key, utt[key]) for key in fields)
        pieces.append("{\n" + lines + "\n}\n")
    return "".join(pieces)

In [61]:
def form_input(data_sample, utt_id, speaker=False, video=False, audio=False, with_labels=False):
    """Assemble the per-utterance record consumed by the prompt builders.

    ``utt_id`` is the zero-based position of the utterance inside
    ``data_sample["conversation"]``; the returned ``"utterance_id"`` is
    ``utt_id + 1``. ``"emotion_label"`` is read from the utterance's
    ``"emotion"`` field when ``with_labels`` is set and is ``""`` otherwise.
    """
    formatted_conversation = get_formatted_conversation(data_sample, speaker, video, audio)
    label = data_sample["conversation"][utt_id]["emotion"] if with_labels else ""
    return {
        "utterance_id": utt_id + 1,
        "conversation": formatted_conversation,
        "emotion_label": label,
        "conversation_id": data_sample["conversation_ID"],
    }

In [62]:
def generate_inst_for_each_utt_of_conv(data_point, speaker=False, video=False, audio=False, with_labels=False):
    """Create one instruction record per utterance of a conversation.

    Returns a list of dicts with ``instruction``, ``conversation_id``,
    ``utterance_id`` (1-based position) and ``emotion``. With ``with_labels``
    the dicts also carry ``out`` (expected completion) and ``text``
    (instruction + out), and ``emotion`` is copied from the utterance;
    without labels ``emotion`` is the empty string.
    """
    records = []
    for idx in range(len(data_point["conversation"])):
        sample = form_input(data_point, idx, speaker, video, audio, with_labels)
        if with_labels:
            prompt, target = get_prompt_and_out(sample, speaker, video, audio)
            record = {
                "instruction": prompt,
                "out": target,
                "text": prompt + target,
                "conversation_id": data_point["conversation_ID"],
                "utterance_id": idx + 1,
                "emotion": data_point["conversation"][idx]["emotion"],
            }
        else:
            # No target text is available for unlabelled data.
            prompt = get_prompt(sample, speaker, video, audio)
            record = {
                "instruction": prompt,
                "conversation_id": data_point["conversation_ID"],
                "utterance_id": idx + 1,
                "emotion": "",
            }
        records.append(record)
    return records

In [63]:
def process_dataset(data, speaker=False, video=False, audio=False, with_labels=False):
    """Flatten conversations into a column-oriented dict of per-utterance samples.

    When ``with_labels`` is false, every utterance in ``data`` first gets an
    ``"emotion"`` key set to ``""`` (the input is modified in place). The
    result maps column names to lists: ``instruction``, ``conversation_id``,
    ``utterance_id`` and ``emotion``, plus ``out`` and ``text`` when
    ``with_labels`` is set.
    """
    # add emotion key in original dict
    if not with_labels:
        for conv in data:
            for utt in conv["conversation"]:
                utt["emotion"] = ""

    samples = []
    for conv in data:
        samples.extend(generate_inst_for_each_utt_of_conv(conv, speaker, video, audio, with_labels))

    columns = ["instruction"]
    if with_labels:
        columns += ["out", "text"]
    columns += ["conversation_id", "utterance_id", "emotion"]
    return {column: [sample[column] for sample in samples] for column in columns}

In [64]:
# Instantiate the run parameters and point at the test conversations file.
config = Config()
data_file = "../input/test-data/Subtask_2_test.json"
# Splitting into groups is disabled here, so `data_file` keeps pointing at the full test file.
# files = split_data(data_file)

#### **Load according to group number**

In [67]:
# Load one group of conversations
# (the group selection below is commented out, so this reads whatever `data_file` currently points to)
# group_num = 0
# data_file = files[group_num]
with open(data_file, 'r') as f:
    data = json.load(f)

In [68]:
# Form datasets from data.
# The modality switches are taken from `config` so that editing the Config cell
# actually changes the prompts (with the current Config this equals speaker=True).
dataset = process_dataset(
    data,
    speaker=config.speaker,
    video=config.video,
    audio=config.audio,
    with_labels=with_labels,
)

In [72]:
# Total number of samples (one per utterance).
# `n` is a module-level name that later cells read directly.
n = len(dataset["instruction"])
print(n)

6301


In [69]:
print(dataset["instruction"][0])

<s>[INST]
"conversation": [
{
"utterance_ID": 1
"text": Please do not do that again . It is a horrible sound .
"speaker": Chandler
}
{
"utterance_ID": 2
"text": Uh , it is Paul .
"speaker": Paul
}
{
"utterance_ID": 3
"text": Buzz him in .
"speaker": Monica
}
{
"utterance_ID": 4
"text": Who is Paul ?
"speaker": Joey
}
{
"utterance_ID": 5
"text": Paul , the wine guy , Paul ?
"speaker": Ross
}
{
"utterance_ID": 6
"text": Maybe .
"speaker": Monica
}
{
"utterance_ID": 7
"text": Wait a minute . Your " not a real date " is with Paul , the wine guy ?
"speaker": Joey
}
{
"utterance_ID": 8
"text": He finally asked you out ?
"speaker": Ross
}
{
"utterance_ID": 9
"text": Yes .
"speaker": Monica
}
{
"utterance_ID": 10
"text": Ooh . This is a " Dear Diary " moment .
"speaker": Chandler
}
{
"utterance_ID": 11
"text": Rach , wait , I can cancel .
"speaker": Monica
}
{
"utterance_ID": 12
"text": Please , no . Go , I will be fine .
"speaker": Rachel
}
{
"utterance_ID": 13
"text": Ross , are you okay ? I

#### **Loading fine-tuned model saved on Huggingface Hub**

In [18]:
!python -c "from huggingface_hub.hf_api import HfFolder; HfFolder.save_token('hf_a')" # add your token

In [19]:
!pip install -U bitsandbytes accelerate



In [20]:
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load peft config for pre-trained checkpoint
peft_model_id = "ArefaMuzaffar/merged-llama-emotion-text"
# config = PeftConfig.from_pretrained(peft_model_id)

# 4-bit NF4 quantisation with double quantisation and float16 compute.
compute_dtype = torch.float16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    # The keyword is `bnb_4bit_compute_dtype`; the previous spelling
    # (`bnb_compute_dtype`) is not a BitsAndBytesConfig parameter, so the
    # requested float16 compute dtype was never applied.
    bnb_4bit_compute_dtype=compute_dtype,
    llm_int8_enable_fp32_cpu_offload=True,
)

# The checkpoint is loaded directly with AutoModelForCausalLM; the PeftModel
# wrapping step below is commented out.
model = AutoModelForCausalLM.from_pretrained(peft_model_id,
                                             return_dict=True,
                                             device_map="auto",
                                             offload_folder="/tmp/.offload",
                                             quantization_config= bnb_config,
                                            )
tokenizer = AutoTokenizer.from_pretrained(peft_model_id, use_fast=True)
# model = PeftModel.from_pretrained(model, peft_model_id)

model.config.use_cache = True
model.eval()

# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

print("Peft model loaded")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Peft model loaded


#### **Functions for parsing the output**

Since this is a causal LM, the generated output is free text that must be parsed to extract the emotion label.

In [21]:
# Functions for parsing the output
import string

# Character classes for label parsing: ASCII upper-case, ASCII lower-case, and both.
uppercase_set, lowercase_set = set(string.ascii_uppercase), set(string.ascii_lowercase)
chars = uppercase_set | lowercase_set

In [22]:
def extract_emotion_label(pred_label):
    """
    Return the first word that follows the first occurrence of "::".

    pred_label is the raw output string from the model,
    e.g. "utterance_id :: anger ...". The format can be inconsistent:
    "" is returned when there is no "::", and the word ends at the first
    character that is not in the module-level `chars` set.
    """
    pieces = pred_label.split("::")
    if len(pieces) < 2:
        return ""
    candidate = pieces[1].strip()
    # Length of the leading run of allowed characters.
    end = 0
    while end < len(candidate) and candidate[end] in chars:
        end += 1
    return candidate[:end]

In [23]:
import re
def extract_first_emotion(pred_str):
    """Return the earliest emotion name found in ``pred_str``, or "" if none occurs.

    Matching is a case-sensitive substring search over the seven emotion names.
    """
    emotions = ("anger", "fear", "joy", "disgust", "sadness", "surprise", "neutral")
    found = re.search("|".join(re.escape(name) for name in emotions), pred_str)
    return found.group() if found else ""

In [70]:
# Define folder for saving the output
# Labelled runs and unlabelled runs are written to separate folders.
eval_results_folder = "./results_train" if with_labels else "./results_test"
if not os.path.exists(eval_results_folder):
    os.makedirs(eval_results_folder)

In [26]:
# Dataloader for batched inference
class InferenceDataset(Dataset):
    """Dataset view over ``dataset["instruction"]`` that yields raw prompt strings."""

    def __init__(self, dataset, tokenizer):
        # The tokenizer is stored but not used by this class itself.
        self.tokenizer = tokenizer
        self.dataset = dataset

    def __len__(self):
        """Number of prompts."""
        return len(self.dataset["instruction"])

    def __getitem__(self, idx):
        """Prompt string at position ``idx``."""
        return self.dataset["instruction"][idx]


#### **4 functions for getting outputs**

* We can do inference with or without labels.
* With labels for getting metrics on the provided dataset with labels.
* Without labels, for the actual (unlabelled) evaluation data.
* We can do batched inference.

In [73]:
# Get full output for entire dataset, batched inference
def get_full_output_batched(dataset, batch_size=2):
    """Generate model outputs for every prompt in ``dataset["instruction"]`` in batches.

    Uses the module-level ``model`` and ``tokenizer``. Prompts are processed in
    their original order (no shuffling), so ``result[i]`` belongs to
    ``dataset["instruction"][i]``.

    Args:
        dataset: Mapping with an ``"instruction"`` list of prompt strings.
        batch_size: Number of prompts generated together (default 2).

    Returns:
        List of decoded strings (special tokens removed), one per prompt.
    """
    init_time = time.time()
    inference_dataset = InferenceDataset(dataset, tokenizer)

    data_loader = DataLoader(inference_dataset, batch_size=batch_size, shuffle=False)

    outputs_list = []

    # Generation continues from the last position of every row, so padding has
    # to be on the left for batched decoding. The tokenizer's previous setting
    # is restored afterwards.
    previous_padding_side = tokenizer.padding_side
    tokenizer.padding_side = "left"
    model.eval()
    try:
        with torch.no_grad():
            for i, instructions in enumerate(data_loader):
                start_time = time.time()
                print("------------------------------------")
                print(f"Iteration {i}")
                print("------------------------------------")
                tokenized_instruction = tokenizer(instructions, return_tensors="pt", padding=True, truncation=True, max_length=3000)
                input_ids = tokenized_instruction["input_ids"].cuda()
                # The mask must be passed explicitly: the pad token is the EOS
                # token, so padding cannot be told apart from real tokens otherwise.
                attention_mask = tokenized_instruction["attention_mask"].cuda()

                # max_new_tokens limits the number of tokens added to the original input;
                # only "utterance_id :: emotion_label" is expected, so 15 is used.
                outputs = model.generate(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    max_new_tokens=15,
                    pad_token_id=tokenizer.pad_token_id,
                )

                decoded_outputs = tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)
                end_time = time.time()
                # Divide by the real batch length; the last batch may be smaller than batch_size.
                print("Inference time for one sample: {:.2f} seconds".format((end_time - start_time) / len(instructions)))
                outputs_list.extend(decoded_outputs)
    finally:
        tokenizer.padding_side = previous_padding_side
    final_time = time.time()
    # Report the number of samples actually processed rather than a module-level counter.
    print("Total time for {} samples using batched inferencing = {} seconds.".format(len(outputs_list), final_time - init_time))
    return outputs_list

In [74]:
# Module-level accumulators shared by the evaluate_* functions and the cells
# after inference. They are appended to, so re-running an evaluate_* function
# without re-running this cell keeps the earlier entries.
full_output = []
pred_labels_list = []
true_labels_list = []
correct = 0

In [75]:
def evaluate_with_label_batched(dataset):
    """Run batched inference on labelled data, print accuracy and save the labels.

    Predictions and true labels are appended to the module-level
    ``pred_labels_list`` / ``true_labels_list`` and written, one label per
    line, into ``eval_results_folder``.

    Args:
        dataset: Mapping with ``"instruction"`` (prompts) and ``"out"``
            (expected completions) lists of equal length.

    Returns:
        Tuple ``(pred_labels_list, true_labels_list, full_output, correct)``.
    """
    full_output = get_full_output_batched(dataset)
    # The prompt ends with this format line; the model's answer follows it.
    marker = "utterance_id :: emotion_label"
    num_samples = len(dataset["instruction"])
    correct = 0
    # extract the label from text, !without shuffle
    for i in range(num_samples):
        print("------------------------------------")
        print(f"Iteration {i}")
        print("------------------------------------")
        output = full_output[i]
        # extract
        segments = output.split(marker)
        if len(segments) == 1:
            # Marker missing: fall back to the first emotion word anywhere.
            output = extract_first_emotion(output.lower())
        else:
            output = extract_emotion_label(segments[1].strip()).strip().lower()
        pred_labels_list.append(output)
        print("Predicted: ", output)
        # baseline
        baseline_output = dataset["out"][i]
        baseline_output = extract_emotion_label(baseline_output.strip())
        baseline_output = baseline_output.strip().lower()
        print("Baseline: ", baseline_output)
        true_labels_list.append(baseline_output)
        # Accuracy
        if output == baseline_output:
            print("Correct")
            correct += 1
        else:
            print("Incorrect")
        print("------------------------------------")

    # Accuracy is computed over the dataset passed in, not a module-level
    # counter; an empty dataset reports 0.0 instead of dividing by zero.
    accuracy = correct / num_samples if num_samples else 0.0
    print("------------------------------------")
    print("Accuracy: {}".format(accuracy))
    print("------------------------------------")

    # Save results
    with open(eval_results_folder + "/pred_labels_batched.json", 'w', encoding="utf-8") as f:
        for label in pred_labels_list:
            f.write(f'{label}\n')
    with open(eval_results_folder + "/true_labels_batched.json", 'w', encoding="utf-8") as f:
        for label in true_labels_list:
            f.write(f'{label}\n')
    return pred_labels_list, true_labels_list, full_output, correct

In [76]:
def evaluate_without_label_batched(dataset):
    """Run batched inference on unlabelled data and store the predicted labels.

    Each prediction is appended to the module-level ``pred_labels_list`` and
    written into ``dataset["emotion"]`` at the matching index. Nothing is returned.
    """
    full_output = get_full_output_batched(dataset)

    # extract the label from text, !without shuffle
    for i in range(len(dataset["instruction"])):
        print("------------------------------------")
        print(f"Iteration {i}")
        print("------------------------------------")
        raw = full_output[i]
        # extract
        segments = raw.split("utterance_id :: emotion_label")
        if len(segments) == 1:
            label = extract_first_emotion(raw.lower())
        else:
            label = extract_emotion_label(segments[1].strip()).strip().lower()
        pred_labels_list.append(label)
        print("Predicted: ", label)
        dataset["emotion"][i] = label

In [77]:
# Get output from model for single prompt
def get_full_output(prompt):
    """Generate up to 15 new tokens for a single prompt and return the decoded text.

    Uses the module-level ``model`` and ``tokenizer`` and prints the elapsed time.
    """
    started = time.time()
    with torch.inference_mode():
        encoded = tokenizer(prompt, return_tensors='pt', truncation=True)
        prompt_ids = encoded.input_ids.cuda()

        generated = model.generate(input_ids=prompt_ids, max_new_tokens=15)
        decoded = tokenizer.batch_decode(generated.detach().cpu().numpy(), skip_special_tokens=True)
        output = decoded[0]
    finished = time.time()
    print("Inference time: {:.2f} seconds".format(finished - started))
    return output

In [78]:
def evaluate_with_label(dataset):
    """Run per-sample inference on labelled data, print accuracy and save the labels.

    Raw outputs, predictions and true labels are appended to the module-level
    ``full_output``, ``pred_labels_list`` and ``true_labels_list``; the label
    lists are written, one label per line, into ``eval_results_folder``.

    Args:
        dataset: Mapping with ``"instruction"`` (prompts) and ``"out"``
            (expected completions) lists of equal length.

    Returns:
        Tuple ``(pred_labels_list, true_labels_list, full_output, correct)``.
    """
    # The prompt ends with this format line; the model's answer follows it.
    marker = "utterance_id :: emotion_label"
    num_samples = len(dataset["instruction"])
    correct = 0

    for i in range(num_samples):
        print("------------------------------------")
        print(f"Iteration {i}")
        print("------------------------------------")
        # predict
        prompt = dataset["instruction"][i]
        output = get_full_output(prompt)
        full_output.append(output)
        # extract
        segments = output.split(marker)
        if len(segments) == 1:
            # Marker missing: fall back to the first emotion word anywhere.
            output = extract_first_emotion(output.lower())
        else:
            output = extract_emotion_label(segments[1].strip()).strip().lower()
        # Record the prediction on BOTH paths. Skipping the fallback path
        # would leave pred_labels_list shorter than true_labels_list and
        # shift every later prediction by one position.
        pred_labels_list.append(output)
        print("Predicted: ", output)
        # baseline
        baseline_output = dataset["out"][i]
        baseline_output = extract_emotion_label(baseline_output.strip())
        baseline_output = baseline_output.strip().lower()
        print("Baseline: ", baseline_output)
        true_labels_list.append(baseline_output)
        # Accuracy
        if output == baseline_output:
            print("Correct")
            correct += 1
        else:
            print("Incorrect")
        print("------------------------------------")

    # Accuracy is computed over the dataset passed in, not a module-level
    # counter; an empty dataset reports 0.0 instead of dividing by zero.
    accuracy = correct / num_samples if num_samples else 0.0
    print("------------------------------------")
    print("Accuracy: {}".format(accuracy))
    print("------------------------------------")
    # Save results
    with open(eval_results_folder + "/pred_labels.json", 'w', encoding="utf-8") as f:
        for label in pred_labels_list:
            f.write(f'{label}\n')
    # Save baseline also
    with open(eval_results_folder + "/true_labels.json", 'w', encoding="utf-8") as f:
        for label in true_labels_list:
            f.write(f'{label}\n')
    return pred_labels_list, true_labels_list, full_output, correct

In [79]:
def evaluate_without_label(dataset):
    """Run per-sample inference on unlabelled data and store the predicted labels.

    Each raw output is appended to the module-level ``full_output``; each
    prediction is appended to the module-level ``pred_labels_list`` and written
    into ``dataset["emotion"]`` at the matching index.

    Args:
        dataset: Mapping with ``"instruction"`` (prompts) and a mutable
            ``"emotion"`` list of the same length.

    Returns:
        Tuple ``(pred_labels_list, true_labels_list, full_output)``;
        ``true_labels_list`` is returned as-is because no labels exist here.
    """
    # The prompt ends with this format line; the model's answer follows it.
    marker = "utterance_id :: emotion_label"
    for i in range(len(dataset["instruction"])):
        print("------------------------------------")
        print(f"Iteration {i}")
        print("------------------------------------")
        # predict
        prompt = dataset["instruction"][i]
        output = get_full_output(prompt)
        full_output.append(output)
        # extract
        segments = output.split(marker)
        if len(segments) == 1:
            # Marker missing: fall back to the first emotion word anywhere.
            output = extract_first_emotion(output.lower())
        else:
            output = extract_emotion_label(segments[1].strip()).strip().lower()
        # Record the prediction on BOTH paths so pred_labels_list keeps one
        # entry per utterance; later cells index it by utterance position.
        pred_labels_list.append(output)
        print("Predicted: ", output)
        dataset["emotion"][i] = output
    return pred_labels_list, true_labels_list, full_output

#### **Begin Inference**

**Batched Inference**

In [None]:
# Batched inference
# Time the whole batched run.
init_time = time.time()

# The return values are not captured here; later cells read the predictions
# from the module-level pred_labels_list / true_labels_list.
if with_labels:
    evaluate_with_label_batched(dataset)
else:
    evaluate_without_label_batched(dataset)

final_time = time.time()
print("Total time taken for {} samples = {:.2f} seconds.".format(n, final_time - init_time))

------------------------------------
Iteration 0
------------------------------------
Inference time for one sample: 2.71 seconds
------------------------------------
Iteration 1
------------------------------------
Inference time for one sample: 2.64 seconds
------------------------------------
Iteration 2
------------------------------------
Inference time for one sample: 2.71 seconds
------------------------------------
Iteration 3
------------------------------------
Inference time for one sample: 2.68 seconds
------------------------------------
Iteration 4
------------------------------------
Inference time for one sample: 2.96 seconds
------------------------------------
Iteration 5
------------------------------------
Inference time for one sample: 2.89 seconds
------------------------------------
Iteration 6
------------------------------------
Inference time for one sample: 2.88 seconds
------------------------------------
Iteration 7
------------------------------------
Infe

**Infer one sample at a time**

In [None]:
# Per sample inference
# init_time = time.time()

# if with_labels:
#     pred_labels_list, true_labels_list, full_output, correct = evaluate_with_label(dataset)
# else:
#     pred_labels_list, true_labels_list, full_output = evaluate_without_label(dataset)

# final_time = time.time()
# print("Total time taken for {} samples = {:.2f} seconds.".format(n, final_time - init_time))

#### **Save input to next module and all results**

**Create the input to the next module as a list of JSON objects:**
```
[
    {
        "conversation_ID": 1374,
        "conversation": [
                {
                    "utterance_ID": 1,
                    "text": "Please do not do that again . It is a horrible sound .",
                    "speaker": "Chandler",
                    "video_name": "dia1375utt1.mp4",
                    "emotion": "anger"
                },
                {
                    "utterance_ID": 2,
                    "text": "Please do not do that again . It is a horrible sound .",
                    "speaker": "Chandler",
                    "video_name": "dia1375utt1.mp4",
                    "emotion": "anger"
                }
            ]
    }
]
```

In [None]:
# Marking empty predictions as neutral
# Can use visual modality for these
# Iterate over the prediction list itself rather than range(n), so every
# stored prediction is checked even if the list length differs from `n`.
valid_emotions = {"anger", "fear", "surprise", "joy", "sadness", "disgust", "neutral"}
for i, label in enumerate(pred_labels_list):
    if label not in valid_emotions:
        pred_labels_list[i] = "neutral"
#     print("Predicted: ", pred_labels_list[i])
#     print("Baseline: ", true_labels_list[i])

In [None]:
# Write the predictions back into `data`, utterance by utterance.
# `i` walks pred_labels_list in the same conversation/utterance order in which
# the samples were generated, so the list needs one entry per utterance.
i = 0
for conv in data:
    for utt in conv["conversation"]:
        utt["emotion"]=pred_labels_list[i]
        i += 1
# Save the labelled conversations as a single JSON document.
save_file = eval_results_folder+"/emotion_labelled_data.json"
with open(save_file, 'w') as f:
    f.write(json.dumps(data))

#### **Calculate F1-scores**

In [None]:
# Metrics are only computed when true labels are available.
if with_labels:
    # Find F1-scores
    from sklearn.metrics import classification_report
    import numpy as np
    
    emotion_labels = ['anger','disgust','fear','sadness','neutral','joy','surprise']
    # NOTE(review): emotion_index and np are not used in the visible cells — confirm before removing.
    emotion_index = dict(zip(emotion_labels, range(len(emotion_labels))))
   
    # zero_division=0 is passed so classes with no samples score 0.
    rep = classification_report(true_labels_list, pred_labels_list, labels=emotion_labels, output_dict=True, zero_division=0)
    print(rep)
  

#### **For Colab: Download files and disconnect**

In [None]:
# # Download the results before you lose them
# from google.colab import files
# import os

# dir_to_zip = eval_results_folder
# output_filename = eval_results_folder+".zip"
# delete_dir_after_download = "No"

# os.system( "zip -r {} {}".format( output_filename , dir_to_zip ) )

# if delete_dir_after_download == "Yes":
#     os.system( "rm -r {}".format( dir_to_zip ) )

# files.download( output_filename )

In [None]:
# # And disconnect before runnning out of compute units
# from google.colab import runtime
# runtime.unassign()