In [1]:
import datasets
from datasets import Dataset
import torch
import transformers
from transformers import AutoTokenizer
import pandas as pd
import json
from sklearn.model_selection import train_test_split
import pickle

In [2]:
max_snt_len = 256
models = [
    '''michellejieli/emotion_text_classifier''',
    '''microsoft/deberta-v3-base''',
    '''sileod/deberta-v3-base-tasksource-nli''',
    '''microsoft/deberta-v3-xsmall'''
]
model_id = 2
model_name = models[model_id]

**Emocause dataset**

In [3]:
with open("data/emocause/valid.json", 'r') as f:
    emocause_data_valid = json.load(f)
with open("data/emocause/test.json", 'r') as f:
    emocause_data_test = json.load(f)

print(emocause_data_valid[0])

{'original_situation': 'I felt betrayed when my girlfriend kissed another guy at a party. She was drunk, true. But still.', 'tokenized_situation': ['I', 'felt', 'betrayed', 'when', 'my', 'girlfriend', 'kissed', 'another', 'guy', 'at', 'a', 'party', '.', 'She', 'was', 'drunk', ',', 'true', '.', 'But', 'still', '.'], 'emotion': '__disappointed__', 'conv_id': 'hit:4449_conv:8898', 'annotation': [['girlfriend', 5], ['kissed', 6], ['another', 7], ['guy', 8]], 'labels': ['girlfriend', 'kissed', 'another', 'guy']}


In [4]:
def prepare_emocause_dataset(data, prefix=False):
    offset = 2 if prefix else 0
    dataset = []
    for obj in data:
        if len(obj['tokenized_situation']) >= max_snt_len - offset:
            continue
        x = [obj['emotion'][2].upper() + obj['emotion'][3:-2], '. '] if prefix else []
        x.extend(obj['tokenized_situation'])
        y = np.zeros(len(x), dtype=int)
        for word in obj['annotation']:
            y[word[1] + offset] = 1
        dataset.append({'text': x, 'labels': y, 'count': [len(obj['annotation'])]})
    return dataset

In [5]:
emocause_train, emocause_valid = train_test_split(emocause_data_valid, test_size=0.2, random_state=42)
emocause_train_ds = prepare_emocause_dataset(emocause_train)
emocause_val_ds = prepare_emocause_dataset(emocause_valid)
emocause_test_ds = prepare_emocause_dataset(emocause_data_test)

emocause_hg_train = datasets.Dataset.from_list(emocause_train_ds)
emocause_hg_val = datasets.Dataset.from_list(emocause_val_ds)
emocause_hg_test = datasets.Dataset.from_list(emocause_test_ds)
print(emocause_hg_train[0])

{'text': ['I', 'miss', 'being', 'in', 'high', 'school', '.', 'I', 'still', 'remember', 'my', 'old', 'girlfriend', 'fondly'], 'labels': [0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0], 'count': [4]}


In [6]:
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)


def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["text"], max_length=max_snt_len, 
                     truncation=True, padding="max_length", is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

emocause_train_dataset = emocause_hg_train.map(tokenize_and_align_labels, batched=True)
emocause_val_dataset = emocause_hg_val.map(tokenize_and_align_labels, batched=True)
emocause_test_dataset = emocause_hg_test.map(tokenize_and_align_labels, batched=True)
print(emocause_train_dataset)

Downloading (…)okenizer_config.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/8.66M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

Map:   0%|          | 0/3020 [00:00<?, ? examples/s]

Map:   0%|          | 0/755 [00:00<?, ? examples/s]

Map:   0%|          | 0/838 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'labels', 'count', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 3020
})


In [7]:
emocause_train_dataset = emocause_train_dataset.remove_columns(['text'])
emocause_val_dataset = emocause_val_dataset.remove_columns(['text'])
emocause_test_dataset = emocause_test_dataset.remove_columns(['text'])
emocause_train_dataset.set_format("torch")
emocause_val_dataset.set_format("torch")
emocause_test_dataset.set_format("torch")
print(emocause_train_dataset[0])

{'labels': tensor([-100,    0,    0,    0,    0,    1,    1,    0,    0,    0,    0,    0,
           1,    1,    0, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -10

In [8]:
data = (emocause_train_dataset, emocause_val_dataset, emocause_test_dataset)
with open('./data/emocause/data.pickle', 'wb') as f:
    pickle.dump(data, f)

**EmpatheticDialogues**

In [9]:
def prepare_empdia_dataset(path):
    ds = pd.read_csv(path, on_bad_lines='skip')
    ds2 = ds.loc[ds.utterance_idx == 1]
    ds2.drop(columns=['conv_id', 'utterance_idx', 'speaker_idx', 'utterance', 'selfeval', 'tags'], inplace=True)
    ds2['prompt'] = ds2['prompt'].apply(lambda x: x.replace("_comma_", ","))
    return ds2

In [10]:
train_data_pd = prepare_empdia_dataset("./data/empatheticdialogues/train.csv")
val_data_pd = prepare_empdia_dataset("./data/empatheticdialogues/valid.csv")
test_data_pd = prepare_empdia_dataset("./data/empatheticdialogues/test.csv")

train_data_pd.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ds2.drop(columns=['conv_id', 'utterance_idx', 'speaker_idx', 'utterance', 'selfeval', 'tags'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ds2['prompt'] = ds2['prompt'].apply(lambda x: x.replace("_comma_", ","))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ds2.drop(columns=['conv_id', 'utterance_idx', 'speaker_idx', 'utterance', 'selfeval', 'tags'], inplace=True)
A value is trying to be set 

Unnamed: 0,context,prompt
0,sentimental,I remember going to the fireworks with my best...
6,afraid,i used to scare for darkness
12,proud,I showed a guy how to run a good bead in weldi...
17,faithful,I have always been loyal to my wife.
21,terrified,A recent job interview that I had made me feel...


In [11]:
labels = train_data_pd.context.unique()
id2label = dict(zip(range(len(labels)), labels))
label2id = dict(zip(labels, range(len(labels))))
train_data_pd['context'] = train_data_pd['context'].apply(lambda x: label2id[x])
val_data_pd['context'] = val_data_pd['context'].apply(lambda x: label2id[x])
test_data_pd['context'] = test_data_pd['context'].apply(lambda x: label2id[x])

hg_train_data = Dataset.from_pandas(train_data_pd)
hg_test_data = Dataset.from_pandas(test_data_pd)
hg_val_data = Dataset.from_pandas(val_data_pd)

print(hg_train_data)

Dataset({
    features: ['context', 'prompt', '__index_level_0__'],
    num_rows: 17797
})


In [12]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
def tokenize_dataset(examples):
    return tokenizer(examples['prompt'], 
                     max_length=max_snt_len, 
                     truncation=True, 
                     padding="max_length")
train_dataset = hg_train_data.map(tokenize_dataset)
test_dataset = hg_test_data.map(tokenize_dataset)
val_dataset = hg_val_data.map(tokenize_dataset)

Map:   0%|          | 0/17797 [00:00<?, ? examples/s]

Map:   0%|          | 0/2541 [00:00<?, ? examples/s]

Map:   0%|          | 0/2761 [00:00<?, ? examples/s]

In [13]:
train_dataset = train_dataset.remove_columns(['__index_level_0__', 'prompt'])
test_dataset = test_dataset.remove_columns(['__index_level_0__', 'prompt'])
val_dataset = val_dataset.remove_columns(['__index_level_0__', 'prompt'])
train_dataset = train_dataset.rename_column("context", "labels")
test_dataset = test_dataset.rename_column("context", "labels")
val_dataset = val_dataset.rename_column("context", "labels")
train_dataset.set_format("torch")
test_dataset.set_format("torch")
val_dataset.set_format("torch")

print(train_dataset[0])

{'labels': tensor(0), 'input_ids': tensor([    1,   273,  1221,   446,   264,   262, 13111,   275,   312,   410,
         1156,   260,   443,   284,   266,   509,   265,   355,   261,   304,
          278,   364,  1199,   334,   381,   267,   262,   447,   260,     2,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0, 

In [14]:
data2 = (train_dataset, val_dataset, test_dataset)
with open('./data/empatheticdialogues/data.pickle', 'wb') as f:
    pickle.dump(data2, f)