In [1]:
# Install the packages listed in requirements.txt into the notebook environment.
!pip install -r "requirements.txt"

Collecting accelerate (from -r requirements.txt (line 4))
  Downloading accelerate-0.26.1-py3-none-any.whl (270 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft (from -r requirements.txt (line 5))
  Downloading peft-0.7.1-py3-none-any.whl (168 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m168.3/168.3 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes (from -r requirements.txt (line 6))
  Downloading bitsandbytes-0.42.0-py3-none-any.whl (105.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting trl (from -r requirements.txt (line 8))
  Downloading trl-0.7.10-py3-none-any.whl (150 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.9/150.9 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
Collecting datasets (from -r requirements.txt (line 10))

In [2]:
from accelerate import Accelerator
import os
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    AutoConfig,
    BitsAndBytesConfig,
)
import time
import json
import matplotlib.pyplot as plt
import seaborn as sns
import wandb

In [3]:
# Global mode switch read by the data-preparation and evaluation cells below.
# True  -> gold cause lists are built from "emotion-cause_pairs" and the
#          "out"/"text"/"causes_*" fields are produced alongside each prompt.
# False -> only the instruction prompts are produced (unlabelled inference).
with_labels = False

In [4]:
def get_prompt_without_output(sample_data_point, speaker=False, video=False, audio=False):
	"""Build the ``[INST]`` prompt for one target utterance, without an answer.

	Args:
		sample_data_point: dict providing "utterance_id" (inserted into the
			instruction) and "conversation" (already formatted text placed
			before the instruction).
		speaker, video: when both are truthy the instruction also mentions the
			video captions.
		audio: accepted for signature parity; it never changes the wording
			because the speaker+video+audio and speaker+video texts are the same.

	Returns:
		The prompt string, wrapped as ``<s>[INST]\\n ... \\n[/INST]\\n``.
	"""
	caption_template = "Identify the cause utterances of the emotion in utterance {}. The video_captions for each utterance describe the video corresponding to the text.\nOutput in one line the ids of the cause utterances as a list in the given format:\nutterance_ids :: [cause utterance ids]\nDon't give any explanation."
	plain_template = "Identify the cause utterances of the emotion in utterance {}. Output in one line the ids of the cause utterances as a list in the given format:\nutterance_ids :: [cause utterance ids]\nDon't give any explanation."

	template = caption_template if (speaker and video) else plain_template
	instruction = template.format(sample_data_point["utterance_id"])

	return "<s>[INST]\n{}{}\n[/INST]\n".format(sample_data_point["conversation"], instruction)

In [5]:
def get_prompt_with_output(sample_data_point, speaker=False, video=False, audio=False):
	"""Build the ``[INST]`` prompt for one target utterance plus its gold answer.

	Args:
		sample_data_point: dict providing "utterance_id", "conversation"
			(already formatted text) and "causes_label" (the answer text).
		speaker, video: when both are truthy the instruction also mentions the
			video captions.
		audio: accepted for signature parity; it never changes the wording
			because the speaker+video+audio and speaker+video texts are the same.

	Returns:
		A ``(prompt, answer)`` tuple; the answer is
		``utterance_ids :: <causes_label>`` followed by a newline and ``</s>``.
	"""
	caption_template = "Identify the cause utterances of the emotion in utterance {}. The video_captions for each utterance describe the video corresponding to the text.\nOutput in one line the ids of the cause utterances as a list in the given format:\nutterance_ids :: [cause utterance ids]\nDon't give any explanation."
	plain_template = "Identify the cause utterances of the emotion in utterance {}. Output in one line the ids of the cause utterances as a list in the given format:\nutterance_ids :: [cause utterance ids]\nDon't give any explanation."

	template = caption_template if (speaker and video) else plain_template
	instruction = template.format(sample_data_point["utterance_id"])

	prompt = "<s>[INST]\n{}{}\n[/INST]\n".format(sample_data_point["conversation"], instruction)
	answer = "utterance_ids :: {}\n</s>".format(sample_data_point["causes_label"])
	return prompt, answer

In [6]:
def get_formatted_conversation(data_sample, speaker=False, video=False, audio=False):
    """Render every utterance of a conversation as the text block used in prompts.

    Each utterance contributes a brace-delimited group listing its
    "utterance_ID", "text", "speaker" and "emotion" values, one per line.
    The opening ``"conversation": [`` header is emitted once; no closing
    bracket is written.

    `speaker`, `video` and `audio` are accepted but not consulted here.
    """
    entry_template = '{{\n"utterance_ID": {}\n"text": {}\n"speaker": {}\n"emotion": {}\n}}\n'
    pieces = ['"conversation": [\n']
    pieces.extend(
        entry_template.format(entry["utterance_ID"], entry["text"], entry["speaker"], entry["emotion"])
        for entry in data_sample["conversation"]
    )
    return "".join(pieces)

In [7]:
def get_causes_list_str(cause_list):
    """Format a list of cause ids as ``[a,b,c]`` (comma-separated, no spaces).

    An empty list yields ``[]``.
    """
    return "[" + ",".join(str(cause) for cause in cause_list) + "]"

In [8]:
def form_input(data_sample, utt_id, speaker=False, video=False, audio=False):
    """Assemble the fields needed to prompt for one utterance of a conversation.

    Args:
        data_sample: conversation dict with "conversation" (list of utterance
            dicts) and "conversation_ID".
        utt_id: zero-based index of the target utterance.
        speaker, video, audio: forwarded to `get_formatted_conversation`.

    Returns:
        dict with "utterance_id" (``utt_id + 1``), "conversation",
        "emotion_label", "causes_list" and "conversation_id"; "causes_label" is
        added only when the global `with_labels` is true.
    """
    formatted = get_formatted_conversation(data_sample, speaker, video, audio)
    target_utt = data_sample["conversation"][utt_id]

    point = {
        "utterance_id": utt_id + 1,
        "conversation": formatted,
        "emotion_label": target_utt["emotion"],
    }
    if with_labels:
        point["causes_label"] = get_causes_list_str(target_utt["cause_list"])
    point["causes_list"] = target_utt["cause_list"]
    point["conversation_id"] = data_sample["conversation_ID"]
    return point

In [9]:
def generate_inst_for_each_utt_of_conv(data_point, speaker=False, video=False, audio=False):
    """Create one prompt record per utterance of a conversation.

    For every utterance index, `form_input` gathers the fields and the matching
    prompt builder is chosen from the global `with_labels`.

    Returns:
        list of dicts in utterance order. Every record has "instruction",
        "emotion_label", "utterance_id" and "conversation_id"; in labelled mode
        "out", "text" (instruction + out), "causes_label" and "causes_list"
        are present as well.
    """
    records = []
    for utt_index in range(len(data_point["conversation"])):
        point = form_input(data_point, utt_index, speaker, video, audio)
        if with_labels:
            prompt, answer = get_prompt_with_output(point, speaker, video, audio)
            record = {
                "instruction": prompt,
                "out": answer,
                "text": prompt + answer,
                "emotion_label": point["emotion_label"],
                "utterance_id": point["utterance_id"],
                "conversation_id": point["conversation_id"],
                "causes_label": point["causes_label"],
                "causes_list": point["causes_list"],
            }
        else:
            prompt = get_prompt_without_output(point, speaker, video, audio)
            # Unlabelled records carry no answer or cause fields.
            record = {
                "instruction": prompt,
                "emotion_label": point["emotion_label"],
                "utterance_id": point["utterance_id"],
                "conversation_id": point["conversation_id"],
            }
        records.append(record)
    return records

In [10]:
def process_dataset(data, speaker=False, video=False, audio=False):
    """Flatten conversations into column lists, keeping non-neutral utterances only.

    Args:
        data: iterable of conversation dicts accepted by
            `generate_inst_for_each_utt_of_conv`.
        speaker, video, audio: forwarded to that function.

    Returns:
        dict mapping column name -> list. Always contains "instruction",
        "emotion_label", "utterance_id" and "conversation_id"; when the global
        `with_labels` is true it also contains "causes_label", "causes_list",
        "out" and "text".
    """
    # IMP: cause lists are only sought for utterances whose emotion is not "neutral".
    kept = []
    for conversation in data:
        for sample in generate_inst_for_each_utt_of_conv(conversation, speaker, video, audio):
            if sample["emotion_label"] != "neutral":
                kept.append(sample)

    columns = ["instruction", "emotion_label", "utterance_id", "conversation_id"]
    if with_labels:
        columns += ["causes_label", "causes_list", "out", "text"]
    return {column: [sample[column] for sample in kept] for column in columns}

In [11]:
from sklearn.model_selection import train_test_split

def create_json(data, save_folder, speaker=False, video=False, audio=False):
    """Attach a "cause_list" to every utterance and save the conversations as JSON.

    The utterance dicts inside `data` are modified in place.

    Args:
        data: list of conversation dicts with "conversation_ID" and
            "conversation" (utterance dicts with "utterance_ID"). When the
            global `with_labels` is true each conversation must also provide
            "emotion-cause_pairs" whose items look like
            ``["<emotion utt id>_<emotion>", "<cause utt id>"]``.
        save_folder: directory that receives ``conversations.json``; created
            if missing.
        speaker, video, audio: accepted but not used here.

    Returns:
        Path of the written ``conversations.json`` inside `save_folder`.
    """
    os.makedirs(save_folder, exist_ok=True)

    if with_labels:
        # conversation id -> {emotion utterance id -> [cause utterance ids]}
        cause_map = {}
        for conv in data:
            per_conv = cause_map[conv["conversation_ID"]] = {}
            for pair in conv["emotion-cause_pairs"]:
                emo_id = int(pair[0].split('_')[0])
                cau_id = int(pair[1])
                per_conv.setdefault(emo_id, []).append(cau_id)
        # Utterances without any recorded cause (e.g. neutral ones) get [].
        for conv in data:
            per_conv = cause_map.get(conv["conversation_ID"], {})
            for utt in conv["conversation"]:
                utt["cause_list"] = per_conv.get(utt["utterance_ID"], [])
    else:
        for conv in data:
            for utt in conv["conversation"]:
                utt["cause_list"] = []

    print("Number of conversations in dataset: {}".format(len(data)))

    # Join with a separator: plain concatenation turned "cause_result" into
    # "cause_resultconversations.json", outside the folder created above.
    conv_file = os.path.join(save_folder, "conversations.json")
    with open(conv_file, 'w', encoding="utf-8") as f:
        json.dump(data, f)
    return conv_file

def generate_input(args):
    """Load ``data/grp5.json`` and hand it to `create_json`.

    Args:
        args: object exposing `speaker`, `video` and `audio` attributes, which
            are forwarded to `create_json`.

    Returns:
        Whatever `create_json` returns (the path of the file it wrote).
    """
    text_folder = "data"
    save_dir = "cause_result"
    os.makedirs(save_dir, exist_ok=True)

    file_path = os.path.join(text_folder, "grp5.json")
    # Context manager so the handle is closed once parsing is done.
    with open(file_path, encoding="utf-8") as file:
        data = json.load(file)

    return create_json(data, save_dir, args.speaker, args.video, args.audio)

In [12]:
# Params
class Config:
    """Notebook parameters, set per instance in ``__init__``.

    `speaker`, `video` and `audio` are the modality flags handed to the
    data-preparation functions; `seed` is stored alongside them.
    """

    def __init__(self) -> None:
        self.seed, self.speaker = 42, True
        # Neither extra modality is enabled.
        self.video = self.audio = False

In [13]:
# Empty placeholder; the next cell rebinds `data` to the parsed conversations
# when `with_labels` is False.
data = []

In [14]:
config = Config()
# Import under an alias: a later cell runs `from torch.utils.data import Dataset`,
# and sharing the bare name `Dataset` makes behaviour depend on cell order.
from datasets import Dataset as HFDataset

# NOTE(review): nothing is loaded when `with_labels` is True, so `dataset` stays
# undefined in that mode and the cells below would fail — confirm intended.
if not with_labels:
    data_file = generate_input(config)
    with open(data_file, 'r') as f:
        data = json.load(f)
    # The file is fully parsed above, so it need not stay open for the rest.
    dataset_columns = process_dataset(
        data, speaker=config.speaker, video=config.video, audio=config.audio
    )
    dataset = HFDataset.from_dict(dataset_columns)
    print("Number of utterances {}".format(len(dataset["instruction"])))


Number of conversations in dataset: 165
Number of utterances 838


In [None]:
# !python -c "from huggingface_hub.hf_api import HfFolder; HfFolder.save_token('<HF_TOKEN>')"  # token redacted: never keep access tokens in the notebook, even in comments

In [15]:
# Interactive Hugging Face Hub login; prompts for an access token.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) y
Token is valid (permission: read).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store

In [16]:
# Hub id of the base checkpoint; used below for the config, model and tokenizer loads.
model_id = "meta-llama/Llama-2-13b-chat-hf"

In [17]:
import accelerate
# Fetch only the configuration for `model_id`.
model_config = AutoConfig.from_pretrained(model_id)

# Build the architecture from the config under `init_empty_weights`
# (presumably without materialising real weights — confirm against accelerate docs).
with accelerate.init_empty_weights():
    fake_model = AutoModelForCausalLM.from_config(model_config)

# Ask accelerate for a placement under a 3 GiB budget on device 0 and 6 GiB on CPU.
# NOTE(review): this map is only printed here; the model-loading cell passes its
# own `device_map` argument instead of this one.
device_map=accelerate.infer_auto_device_map(fake_model, max_memory={0: "3GiB", "cpu": "6GiB"}) #
print(json.dumps(device_map, indent=4))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/587 [00:00<?, ?B/s]

{
    "model.embed_tokens": 0,
    "model.layers.0": 0,
    "model.layers.1.self_attn": 0,
    "model.layers.1.input_layernorm": "cpu",
    "model.layers.1.post_attention_layernorm": "cpu",
    "model.layers.2": "cpu",
    "model.layers.3": "cpu",
    "model.layers.4": "cpu",
    "model.layers.5.self_attn": "cpu",
    "model.layers.5.mlp.gate_proj": "cpu",
    "model.layers.5.mlp.up_proj": "cpu",
    "model.layers.5.mlp.down_proj": "disk",
    "model.layers.5.mlp.act_fn": "disk",
    "model.layers.5.input_layernorm": "disk",
    "model.layers.5.post_attention_layernorm": "disk",
    "model.layers.6": "disk",
    "model.layers.7": "disk",
    "model.layers.8": "disk",
    "model.layers.9": "disk",
    "model.layers.10": "disk",
    "model.layers.11": "disk",
    "model.layers.12": "disk",
    "model.layers.13": "disk",
    "model.layers.14": "disk",
    "model.layers.15": "disk",
    "model.layers.16": "disk",
    "model.layers.17": "disk",
    "model.layers.18": "disk",
    "model.laye

In [18]:
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load peft config for pre-trained checkpoint
peft_model_id = "ArefaMuzaffar/merged-llama-cause-text-new"
# Bound to its own name so the `Config()` instance held in `config` is not overwritten.
peft_config = PeftConfig.from_pretrained(peft_model_id)

compute_dtype = torch.float16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4", # 4-bit quantized normal float
    # The option is spelled `bnb_4bit_compute_dtype`; `bnb_compute_dtype` is not a
    # recognised keyword, so the fp16 compute dtype was never applied.
    bnb_4bit_compute_dtype=compute_dtype,
    llm_int8_enable_fp32_cpu_offload=True,
)

# Base model, quantized on load and placed on this process's device.
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             return_dict=True,
                                             device_map={"": Accelerator().local_process_index},
                                             offload_folder="/tmp/.offload",
                                             quantization_config= bnb_config,
                                             pretraining_tp=1,
                                            )
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
# Wrap the base model with the adapter weights from `peft_model_id`.
model = PeftModel.from_pretrained(model, peft_model_id)

# The tokenizer is given the EOS token as its padding token.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

print("Peft model loaded")

adapter_config.json:   0%|          | 0.00/610 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/33.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/6.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/105M [00:00<?, ?B/s]

Peft model loaded


In [19]:
# Turn on `use_cache` in the model config (presumably the generation cache —
# confirm) and put the model in eval mode.
model.config.use_cache = True
model.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 5120)
        (layers): ModuleList(
          (0-39): 40 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=5120, out_features=5120, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=5120, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=5120, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear4bit(
                (base_layer): Linear4bit(i

In [20]:
# Sanity check: show the first prompt that will be sent to the model.
print(dataset["instruction"][0])

<s>[INST]
"conversation": [
{
"utterance_ID": 1
"text": No . No , not it . Not it . Not it . Ugh .
"speaker": Monica
"emotion": anger
}
{
"utterance_ID": 2
"text": Do not crowd me .
"speaker": Monica
"emotion": disgust
}
{
"utterance_ID": 3
"text": This is it ! This is the dress ! Oh , my God . It is perfect .
"speaker": Monica
"emotion": joy
}
{
"utterance_ID": 4
"text": I am sorry , this one taken !
"speaker": Monica
"emotion": neutral
}
{
"utterance_ID": 5
"text": Whoa !
"speaker": Monica
"emotion": surprise
}
{
"utterance_ID": 6
"text": Megan !
"speaker": Monica
"emotion": surprise
}
{
"utterance_ID": 7
"text": Monica !
"speaker": Megan
"emotion": anger
}
{
"utterance_ID": 8
"text": You came !
"speaker": Monica
"emotion": joy
}
{
"utterance_ID": 9
"text": Yeah !
"speaker": Megan
"emotion": joy
}
{
"utterance_ID": 10
"text": This is my dress !
"speaker": Monica
"emotion": anger
}
{
"utterance_ID": 11
"text": No !
"speaker": Megan
"emotion": surprise
}
{
"utterance_ID": 12
"text": Ye

In [21]:
# Output folder: "./results_train" in labelled mode, "./results_test" otherwise;
# it is created when it does not exist yet.
eval_results_folder = "./results_train" if with_labels else "./results_test"
if not os.path.exists(eval_results_folder):
    os.makedirs(eval_results_folder)

In [22]:
# Number of prompts; reported in the timing summaries printed further down.
n = len(dataset["instruction"])

In [23]:
# Show how many prompts will be processed.
print(n)

838


In [24]:
from torch.utils.data import Dataset, DataLoader

In [25]:
# Dataloader for batched inference
class InferenceDataset(Dataset):
    """Map-style dataset that serves the "instruction" column one prompt at a time."""

    def __init__(self, dataset, tokenizer):
        # `tokenizer` is stored but not used by this class's own methods.
        self.tokenizer = tokenizer
        self.dataset = dataset

    def __len__(self):
        prompts = self.dataset["instruction"]
        return len(prompts)

    def __getitem__(self, idx):
        # Items are the raw prompt values; tokenization happens in the caller.
        return self.dataset["instruction"][idx]

In [26]:
# Get full output for entire dataset, batched inference
def get_full_output_batched(dataset, batch_size=1):
    """Run `model.generate` over every prompt in ``dataset["instruction"]``.

    Uses the notebook-level `model` and `tokenizer`.

    Args:
        dataset: object whose "instruction" column holds the prompt strings.
        batch_size: number of prompts tokenized and generated together.

    Returns:
        list of decoded `model.generate` outputs (special tokens skipped), one
        per prompt, in dataset order.
    """
    init_time = time.time()
    inference_dataset = InferenceDataset(dataset, tokenizer)

    data_loader = DataLoader(inference_dataset, batch_size=batch_size, shuffle=False)

    outputs_list = []

    # A decoder-only model continues from the last position of each row, so any
    # padding has to sit on the left. The caller's setting is restored afterwards.
    previous_padding_side = tokenizer.padding_side
    tokenizer.padding_side = "left"
    model.eval()
    try:
        with torch.no_grad():
            for i, instructions in enumerate(data_loader):
                start_time = time.time()
                print("------------------------------------")
                print(f"Iteration {i}")
                print("------------------------------------")
                tokenized_instruction = tokenizer(instructions, return_tensors="pt", padding=True, truncation=True, max_length=3000)
                input_ids = tokenized_instruction["input_ids"].to(model.device)
                # The pad token is the EOS token here, so padding cannot be inferred
                # from the ids; pass the mask explicitly.
                attention_mask = tokenized_instruction["attention_mask"].to(model.device)

                # max_new_tokens caps how many tokens are appended to the prompt;
                # the answer is short ("utterance_ids :: [1,2,3,4,5]"), so 25 is used.
                outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_new_tokens=25)

                decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
                end_time = time.time()
                # Divide by the real batch length: the last batch may be shorter.
                print("Inference time for one sample: {:.2f} seconds".format((end_time - start_time)/len(instructions)))
                outputs_list.extend(decoded_outputs)
    finally:
        tokenizer.padding_side = previous_padding_side
    final_time = time.time()
    # Report the number of samples actually processed, not a notebook global.
    print("Total time for {} samples using batched inferencing = {} seconds.".format(len(outputs_list), final_time - init_time))
    return outputs_list

In [27]:
# Notebook-level accumulators used by the inference and evaluation cells below.
preds_list = []  # parsed cause-id lists appended by the evaluate_* functions
true_list = []  # gold cause lists appended by evaluate_with_label_batched
full_output = []  # rebound to the decoded generations by the inference cell

In [28]:
# Run generation over the whole dataset with batch size 1 and time the call.
init_time = time.time()
full_output = get_full_output_batched(dataset,1)
final_time = time.time()

# Wall-clock time for the call above.
print("Total time for {} samples = {} seconds.".format(n, final_time - init_time))

------------------------------------
Iteration 0
------------------------------------




Inference time for one sample: 18.03 seconds
------------------------------------
Iteration 1
------------------------------------
Inference time for one sample: 10.10 seconds
------------------------------------
Iteration 2
------------------------------------
Inference time for one sample: 10.47 seconds
------------------------------------
Iteration 3
------------------------------------
Inference time for one sample: 10.77 seconds
------------------------------------
Iteration 4
------------------------------------
Inference time for one sample: 10.93 seconds
------------------------------------
Iteration 5
------------------------------------
Inference time for one sample: 10.98 seconds
------------------------------------
Iteration 6
------------------------------------
Inference time for one sample: 10.69 seconds
------------------------------------
Iteration 7
------------------------------------
Inference time for one sample: 10.47 seconds
------------------------------------
I

In [29]:
def extract_cause_list(output):
    """Parse the predicted cause ids out of a decoded generation.

    The text is split on ``::``; with three or more pieces the third is used,
    otherwise the first. From that piece, the content after the first ``[`` (if
    any) and before the first ``]`` is split on commas.

    Args:
        output: decoded model output string.

    Returns:
        list of ints. Tokens that are not integers — the empty token produced
        by ``[]`` or by a trailing comma in a truncated answer, or stray text —
        are skipped instead of raising ``ValueError``.
    """
    parts = output.split('::')
    segment = parts[2] if len(parts) >= 3 else parts[0]

    bracketed = segment.split('[')
    segment = bracketed[1] if len(bracketed) >= 2 else bracketed[0]

    inner = segment.strip().split(']')[0].strip()

    cause_ids = []
    for token in inner.split(','):
        try:
            cause_ids.append(int(token))
        except ValueError:
            # Empty or non-numeric token: nothing to record.
            continue
    return cause_ids

In [30]:
# conversation id -> {utterance id -> predicted cause ids}
pred_causes_map = {}
def evaluate_with_label_batched(dataset):
    """Parse every generation and record it together with the gold cause list.

    Reads the notebook-level `full_output` by position (the dataset order must
    match the generation order, i.e. no shuffling) and appends to `preds_list`
    and `true_list`; `pred_causes_map` is updated per conversation/utterance.
    """
    for i in range(len(dataset["instruction"])):
        separator = "------------------------------------"
        print(separator)
        print(f"Iteration {i}")
        print(separator)

        # Expected form of the generated answer: utterance_ids :: [1,2] or []
        parsed = extract_cause_list(full_output[i].strip())
        print("Predicted: ", parsed)
        preds_list.append(parsed)

        gold = dataset["causes_list"][i]
        print("Baseline: ", gold)
        true_list.append(gold)

        conversation_key = dataset["conversation_id"][i]
        utterance_key = dataset["utterance_id"][i]
        pred_causes_map.setdefault(conversation_key, {})[utterance_key] = parsed

In [31]:
def evaluate_without_label_batched(dataset):
    """Parse every generation and record the predicted cause ids.

    Reads the notebook-level `full_output` by position (the dataset order must
    match the generation order, i.e. no shuffling), appends each parsed list to
    `preds_list` and stores it in `pred_causes_map` under its conversation and
    utterance ids.
    """
    for i in range(len(dataset["instruction"])):
        separator = "------------------------------------"
        print(separator)
        print(f"Iteration {i}")
        print(separator)

        # Expected form of the generated answer: utterance_ids :: [1,2] or []
        parsed = extract_cause_list(full_output[i].strip())
        print("Predicted: ", parsed)
        preds_list.append(parsed)

        conversation_key = dataset["conversation_id"][i]
        utterance_key = dataset["utterance_id"][i]
        pred_causes_map.setdefault(conversation_key, {})[utterance_key] = parsed

In [32]:
# Run the evaluation variant that matches the labelling mode.
(evaluate_with_label_batched if with_labels else evaluate_without_label_batched)(dataset)

------------------------------------
Iteration 0
------------------------------------
Predicted:  [1]
------------------------------------
Iteration 1
------------------------------------
Predicted:  [1]
------------------------------------
Iteration 2
------------------------------------
Predicted:  [2]
------------------------------------
Iteration 3
------------------------------------
Predicted:  [4]
------------------------------------
Iteration 4
------------------------------------
Predicted:  [6]
------------------------------------
Iteration 5
------------------------------------
Predicted:  [6]
------------------------------------
Iteration 6
------------------------------------
Predicted:  [7, 8]
------------------------------------
Iteration 7
------------------------------------
Predicted:  [8, 9]
------------------------------------
Iteration 8
------------------------------------
Predicted:  [9, 10]
------------------------------------
Iteration 9
-----------------------

In [34]:
# Save results: raw generations (each followed by a [/NEWNEW] marker line),
# one parsed prediction per line and, in labelled mode, one gold list per line.
with open(eval_results_folder + "/pred_text_batched.json", 'w', encoding="utf-8") as f:
    f.writelines(f"{generation}\n[/NEWNEW]\n" for generation in full_output)
with open(eval_results_folder + "/pred_list_batched.json", 'w', encoding="utf-8") as f:
    f.writelines(f'{prediction}\n' for prediction in preds_list)
if with_labels:
    with open(eval_results_folder + "/true_list_batched.json", 'w', encoding="utf-8") as f:
        f.writelines(f'{gold}\n' for gold in true_list)
# Save dict
with open(eval_results_folder + "/pred_causes_map.json", 'w', encoding="utf-8") as f:
    json.dump(pred_causes_map, f)

In [35]:
# Reload the input conversations and fill "emotion-cause_pairs" from the predictions.
with open("./data/grp5.json", 'r') as f:
    data = json.load(f)
for conv in data:
    conv_id = conv["conversation_ID"]
    conv["emotion-cause_pairs"] = []
    predicted = pred_causes_map.get(conv_id)
    if predicted is None:
        # No prediction was recorded for this conversation; its pair list stays empty.
        continue
    for utt_id, cause_ids in predicted.items():
        # Utterance ids are 1-based, list positions 0-based.
        emotion_key = str(utt_id) + "_" + conv["conversation"][utt_id - 1]["emotion"]
        for cause_id in cause_ids:
            pair = [emotion_key, str(cause_id)]
            conv["emotion-cause_pairs"].append(pair)
            print("Added pair: {}".format(pair))

Added pair: ['1_anger', '1']
Added pair: ['2_disgust', '1']
Added pair: ['3_joy', '2']
Added pair: ['5_surprise', '4']
Added pair: ['6_surprise', '6']
Added pair: ['7_anger', '6']
Added pair: ['8_joy', '7']
Added pair: ['8_joy', '8']
Added pair: ['9_joy', '8']
Added pair: ['9_joy', '9']
Added pair: ['10_anger', '9']
Added pair: ['10_anger', '10']
Added pair: ['11_surprise', '10']
Added pair: ['11_surprise', '11']
Added pair: ['12_anger', '11']
Added pair: ['12_anger', '12']
Added pair: ['13_anger', '13']
Added pair: ['14_anger', '14']
Added pair: ['15_anger', '14']
Added pair: ['15_anger', '15']
Added pair: ['17_anger', '17']
Added pair: ['5_surprise', '5']
Added pair: ['6_surprise', '5']
Added pair: ['6_surprise', '6']
Added pair: ['7_sadness', '6']
Added pair: ['8_joy', '8']
Added pair: ['10_joy', '10']
Added pair: ['1_joy', '1']
Added pair: ['2_joy', '2']
Added pair: ['3_surprise', '2']
Added pair: ['4_joy', '4']
Added pair: ['6_joy', '6']
Added pair: ['7_joy', '7']
Added pair: ['8_

In [36]:
# Write the conversations, now carrying the predicted "emotion-cause_pairs", to the results folder.
with open(eval_results_folder + "/Subtask_2_pred5.json", 'w', encoding="utf-8") as f:
    json.dump(data, f)

In [37]:
# If you want to download
from google.colab import files
import os
import shutil
import subprocess

dir_to_zip = eval_results_folder #@param {type: "string"}
# NOTE(review): the archive is named "grp4" while the inputs above are "grp5" — confirm.
output_filename = 'cause_res_grp4.zip' #@param {type: "string"}
delete_dir_after_download = "No"  #@param ['Yes', 'No']

# Argument-list form: the form values are not parsed by a shell, and a failing
# `zip` raises here instead of surfacing later as a confusing download error.
subprocess.run(["zip", "-r", output_filename, dir_to_zip], check=True)

if delete_dir_after_download == "Yes":
    shutil.rmtree(dir_to_zip)

files.download( output_filename )

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>