In [1]:
import pandas as pd
import random
import re
import matplotlib.pyplot as plt
import matplotlib as mpl
from datasets import Dataset, load_dataset
import os
import torch
import transformers
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from matplotlib.ticker import MaxNLocator
random.seed(2023)

In [9]:
def count_labels(imported_dataset):
    """Tabulate how often each value of the ``emotion`` column occurs.

    Parameters
    ----------
    imported_dataset : pandas.DataFrame
        Frame holding an ``emotion`` column.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``emotion`` and ``count``, in the order produced by
        ``Series.value_counts``.
    """
    frequencies = imported_dataset['emotion'].value_counts()
    tally = frequencies.reset_index(name='count')
    # If the former index column comes back named "index" it is renamed;
    # when it is already called "emotion" the rename is a no-op.
    return tally.rename(columns={'index': 'emotion'})

In [2]:
def plot_dis(dataset, plt_title = 'Label Distribution', figsize = (14, 5), bar_width = 0.6):
    """Plot how many samples each emotion label has, split around the mean.

    Bars whose count is below the mean are dark blue, the others pink; a grey
    horizontal line marks the mean and every bar carries its count as text.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with an ``emotion`` column; tabulated with ``count_labels``.
    plt_title : str
        Bold title written in the top-left corner of the figure.
    figsize : tuple of float
        Figure size in inches, forwarded to ``plt.subplots``.
    bar_width : float
        Width of the bars in x-axis units.

    Returns
    -------
    None
        The figure is created through pyplot and is not returned.
    """
    low_colour = "#004B84"   # bars below the mean
    high_colour = "#E1ACAC"  # bars at or above the mean

    dataset = count_labels(dataset)
    fig, ax = plt.subplots(figsize=figsize, dpi=96)

    # First pass: fixes the category order on the x axis and provides the
    # container the value labels attach to. It ends up hidden behind the
    # coloured bars drawn further down (same extent, lower zorder).
    label_bars = ax.bar(dataset['emotion'], dataset['count'], width=bar_width)

    # Light grid behind the bars
    ax.grid(which="major", axis='x', color='#DAD8D7', alpha=0.5, zorder=1)
    ax.grid(which="major", axis='y', color='#DAD8D7', alpha=0.5, zorder=1)

    # x axis: categorical, so the tick positions/labels are the emotions
    # themselves (a numeric formatter/locator would be replaced by set_xticks).
    ax.set_xlabel('', fontsize=10, labelpad=11)  # No need for an axis label
    ax.xaxis.set_label_position("bottom")
    ax.xaxis.set_tick_params(pad=2, labelbottom=True, bottom=True, labelsize=12, labelrotation=0)
    ax.set_xticks(dataset['emotion'], dataset['emotion'])

    # y axis: integer ticks with thousands separators
    ax.set_ylabel('Count', fontsize=10, labelpad=11)
    ax.yaxis.set_label_position("left")
    ax.yaxis.set_major_formatter(lambda s, i : f'{s:,.0f}')
    ax.yaxis.set_major_locator(MaxNLocator(integer=True))
    ax.yaxis.set_tick_params(pad=2, labeltop=False, labelbottom=True, bottom=False, labelsize=12)

    # Count printed on top of each bar
    ax.bar_label(label_bars, labels=[f'{e:,.1f}' for e in dataset['count']], padding=3, color='black', fontsize=8)

    # Red rule and rectangle along the top edge of the figure
    ax.plot([0.12, .9], [.98, .98], transform=fig.transFigure, clip_on=False, color='#E3120B', linewidth=.6)
    ax.add_patch(plt.Rectangle((0.12,.98), 0.04, -0.02, facecolor='#E3120B', transform=fig.transFigure, clip_on=False, linewidth = 0))

    # Title
    ax.text(x=0.12, y=.93, s=plt_title, transform=fig.transFigure, ha='left', fontsize=14, weight='bold', alpha=.8)

    # Split the labels around the mean count
    average = dataset['count'].mean()
    below_average = dataset[dataset['count'] < average]
    above_average = dataset[dataset['count'] >= average]

    # Visible bars (one solid colour per group) and the mean line
    ax.bar(below_average['emotion'], below_average['count'], color=low_colour, width=bar_width, label='Below Average', zorder=2)
    ax.bar(above_average['emotion'], above_average['count'], color=high_colour, width=bar_width, label='Above Average', zorder=2)
    ax.axhline(y=average, color = 'grey', linewidth=3)

    # Place the annotation just above the mean line, in axes coordinates;
    # the lower y-limit is taken into account in case it is not zero.
    ymin, ymax = ax.get_ylim()
    y_pos = (average - ymin) / (ymax - ymin) + 0.03
    ax.text(0.88, y_pos, f'Average = {average:.1f}', ha='right', va='center', transform=ax.transAxes, size=8, zorder=3)

    # Legend above the axes, right-aligned
    ax.legend(loc="best", ncol=2, bbox_to_anchor=[1, 1.07], borderaxespad=0, frameon=False, fontsize=8)
    


In [3]:
def plot_dis_large(dataset, plt_title = 'Label Distribution', figsize = (25, 3), bar_width = 0.6):
    """Wide-format variant of the label-distribution chart for many labels.

    Bars whose count is below the mean are dark blue, the others pink; a grey
    horizontal line marks the mean and every bar carries its count as text.
    Compared with the regular chart it uses a wider, flatter figure and
    smaller x tick labels.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with an ``emotion`` column; tabulated with ``count_labels``.
    plt_title : str
        Bold title written in the top-left corner of the figure.
    figsize : tuple of float
        Figure size in inches, forwarded to ``plt.subplots``.
    bar_width : float
        Width of the visible bars in x-axis units.

    Returns
    -------
    None
        The figure is created through pyplot and is not returned.
    """
    low_colour = "#004B84"   # bars below the mean
    high_colour = "#E1ACAC"  # bars at or above the mean

    dataset = count_labels(dataset)
    fig, ax = plt.subplots(figsize=figsize, dpi=96)

    # First pass: fixes the category order on the x axis and provides the
    # container the value labels attach to. These narrow bars end up hidden
    # behind the coloured bars drawn further down.
    label_bars = ax.bar(dataset['emotion'], dataset['count'], width=min(0.2, bar_width))

    # Light grid behind the bars
    ax.grid(which="major", axis='x', color='#DAD8D7', alpha=0.5, zorder=1)
    ax.grid(which="major", axis='y', color='#DAD8D7', alpha=0.5, zorder=1)

    # x axis: categorical, so the tick positions/labels are the emotions
    # themselves (a numeric formatter/locator would be replaced by set_xticks).
    ax.set_xlabel('', fontsize=8, labelpad=12)  # No need for an axis label
    ax.xaxis.set_label_position("bottom")
    ax.xaxis.set_tick_params(pad=2, labelbottom=True, bottom=True, labelsize=8, labelrotation=0)
    ax.set_xticks(dataset['emotion'], dataset['emotion'])

    # y axis: integer ticks with thousands separators
    ax.set_ylabel('Count', fontsize=10, labelpad=11)
    ax.yaxis.set_label_position("left")
    ax.yaxis.set_major_formatter(lambda s, i : f'{s:,.0f}')
    ax.yaxis.set_major_locator(MaxNLocator(integer=True))
    ax.yaxis.set_tick_params(pad=2, labeltop=False, labelbottom=True, bottom=False, labelsize=12)

    # Count printed on top of each bar
    ax.bar_label(label_bars, labels=[f'{e:,.1f}' for e in dataset['count']], padding=3, color='black', fontsize=8)

    # Red rule and rectangle along the top edge of the figure
    ax.plot([0.12, .9], [.98, .98], transform=fig.transFigure, clip_on=False, color='#E3120B', linewidth=.6)
    ax.add_patch(plt.Rectangle((0.12,.98), 0.04, -0.02, facecolor='#E3120B', transform=fig.transFigure, clip_on=False, linewidth = 0))

    # Title
    ax.text(x=0.12, y=.93, s=plt_title, transform=fig.transFigure, ha='left', fontsize=14, weight='bold', alpha=.8)

    # Split the labels around the mean count
    average = dataset['count'].mean()
    below_average = dataset[dataset['count'] < average]
    above_average = dataset[dataset['count'] >= average]

    # Visible bars (one solid colour per group) and the mean line
    ax.bar(below_average['emotion'], below_average['count'], color=low_colour, width=bar_width, label='Below Average', zorder=2)
    ax.bar(above_average['emotion'], above_average['count'], color=high_colour, width=bar_width, label='Above Average', zorder=2)
    ax.axhline(y=average, color = 'grey', linewidth=3)

    # Place the annotation just above the mean line, in axes coordinates;
    # the lower y-limit is taken into account in case it is not zero.
    ymin, ymax = ax.get_ylim()
    y_pos = (average - ymin) / (ymax - ymin) + 0.03
    ax.text(0.88, y_pos, f'Average = {average:.1f}', ha='right', va='center', transform=ax.transAxes, size=8, zorder=3)

    # Legend above the axes, right-aligned
    ax.legend(loc="best", ncol=2, bbox_to_anchor=[1, 1.07], borderaxespad=0, frameon=False, fontsize=8)
    


In [5]:
# Source-dataset label -> unified label. Labels that are not listed here are
# left untouched by the lookup wrapper used in map_and_concat().
# (The literal previously listed 'neu' twice; it is now declared once.)
mapping = {
    # anger
    'ang': "anger", 'angry': "anger", 'annoyed': "anger", 'furious': "anger", 'fru': "anger",
    'annoyance': 'anger', "disapproval": 'anger',
    # joy
    'exc': "joy", 'joyful': "joy", 'happiness': "joy", 'hap': "joy", 'grateful': "joy",
    'impressed': "joy", 'content': "joy", 'fun': "joy", 'enthusiasm': "joy",
    'excited': "joy", 'excitement': 'joy', "pride": 'joy', 'gratitude': 'joy',
    "approval": 'joy', 'admiration': 'joy', 'proud': 'joy',
    # fear
    'fea': "fear", 'terrified': "fear", 'afraid': "fear",
    # disgust
    'disgusted': 'disgust', 'hate': 'disgust', 'boredom': 'disgust',
    # neutral
    'neu': "neutral",
    # sadness
    'sad': "sadness", 'devastated': "sadness", 'disappointed': "sadness", "grief": "sadness",
    "lonely": 'sadness', 'disappointment': 'sadness',
    # surprise
    'sur': "surprise", 'surprised': "surprise", 'sup': "surprise", 'realization': 'surprise',
    # optimism
    'hope': "optimism", 'faithful': "optimism", 'hopeful': 'optimism', 'confident': 'optimism',
    'prepared': 'optimism',
    # guilt
    'guilty': "guilt", 'shame': "guilt", 'ashamed': "guilt", 'embarrassed': "guilt",
    # love
    'caring': "love",
    # anxiety
    'anxious': "anxiety", 'worry': "anxiety", 'apprehensive': "anxiety", 'nervousness': 'anxiety',
    # anticipation
    'anticipating': 'anticipation',
    # amusement
    'amusement': 'amusement',
    # curiosity
    'confusion': 'curiosity',
}

class MyDict(dict):
    """``dict`` whose lookups fall back to the key itself when it is absent."""

    def __missing__(self, absent_key):
        # Called by dict.__getitem__ for unknown keys; nothing is stored.
        return absent_key
    
def map_and_concat(df,list_of_datasets,list_of_ds_name):
    """Normalise the labels of several datasets and append them to ``df``.

    For every ``(dataset, name)`` pair the ``label`` column is translated
    through ``mapping`` (unknown labels are kept as they are), rows whose
    label contains "trust" (case-insensitive) are dropped, and a ``ds_name``
    column holding ``name`` is added. The input datasets are not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame the new rows are appended to.
    list_of_datasets : list of pandas.DataFrame
        Each frame must provide ``sentence`` and ``label`` columns.
    list_of_ds_name : list of str
        Source name for each dataset, paired positionally.

    Returns
    -------
    pandas.DataFrame
        ``df`` followed by the filtered rows of every dataset.

    Notes
    -----
    After each dataset the running number of rows seen so far (counted
    before the "trust" filter) is printed.
    """
    label_lookup = MyDict(mapping)
    pieces = [df]
    total_rows = 0
    for d, name in zip(list_of_datasets, list_of_ds_name):
        # Build a fresh two-column frame instead of writing into the caller's
        # frame or into a slice of it.
        ds = d[['sentence', 'label']].assign(label=d['label'].map(label_lookup))
        # na=False: a missing label counts as "does not contain trust".
        is_trust = ds['label'].str.contains('trust', case=False, na=False)
        pieces.append(ds[~is_trust].assign(ds_name=name))
        total_rows = total_rows + ds.shape[0]
        print(total_rows)
    # One concat at the end instead of re-copying the accumulator every round.
    return pd.concat(pieces)

In [6]:
# Empty accumulators for the merged train / validation / test corpora; the
# generator builds three independent frames with identical columns.
train_final, val_final, test_final = (
    pd.DataFrame(columns=['sentence','label','ds_name']) for _ in range(3)
)

In [None]:
# String labels are lower-cased here, so callers may pass any casing.
def relabel_emotion(emotion):
    """Translate a dataset-specific emotion label into the shared vocabulary.

    Parameters
    ----------
    emotion : int, str or None
        An integer id in the 0-6 scheme handled below (used for DailyDialog),
        or a textual label in any casing.

    Returns
    -------
    str or None
        The unified label. Integer ids outside 0-6 and ``None`` inputs yield
        ``None``; unknown textual labels are returned lower-cased.
    """
    id_to_label = {
        0: 'neutral', 1: 'anger', 2: 'disgust', 3: 'fear',
        4: 'joy', 5: 'sadness', 6: 'surprise',
    }
    synonyms = {
        'mad': 'anger', 'angry': 'anger', 'anger': 'anger',
        'fear': 'fear', 'scared': 'fear',
        'joy': 'joy', 'happy': 'joy', 'joyful': 'joy', 'happiness': 'joy',
        'sadness': 'sadness', 'sad': 'sadness',
    }
    if emotion is None:
        # Missing annotation: propagate it instead of failing on .lower().
        return None
    if isinstance(emotion, int):
        return id_to_label.get(emotion)
    emotion = emotion.lower()
    return synonyms.get(emotion, emotion)


### Individual Import, Cleaning, Analysis

#### GoEmotions (hartmann)
https://github.com/google-research/google-research/tree/master/goemotions


In [11]:
def format_go_emo(dataset):
    """Parse the comma-separated label ids of a GoEmotions split.

    Rows without an ``emotion_label`` are dropped and the remaining values
    (e.g. ``"3,10"``) are turned into lists of ints (``[3, 10]``). The input
    frame is left untouched.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with an ``emotion_label`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame whose ``emotion_label`` column holds lists of ints.
    """
    def _parse_ids(raw):
        # Strings hold one or more comma-separated ids; a bare number (when
        # the column was read as numeric) is a single id.
        if isinstance(raw, str):
            return [int(num) for num in raw.split(",")]
        return [int(raw)]

    dataset = dataset.dropna(subset=['emotion_label']).copy()
    # Assign the whole column at once: writing to the rows yielded by
    # iterrows() is not guaranteed to reach the underlying frame.
    dataset['emotion_label'] = pd.Series(
        [_parse_ids(raw) for raw in dataset['emotion_label']],
        index=dataset.index,
        dtype=object,
    )
    return dataset

##### Fine-grained

In [12]:
# Label id (as a string) -> emotion name; an id is simply the position of the
# name in the tuple below.
orig_mapping = {
    str(position): name
    for position, name in enumerate((
        'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring',
        'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval',
        'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief',
        'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization',
        'relief', 'remorse', 'sadness', 'surprise', 'neutral',
    ))
}

def map_haru_emotion(emo_ds):
    """Collapse each row's list of label ids into one emotion name.

    The rules are tried in order and the first hit wins: id 1 -> amusement,
    6 or 7 -> curiosity, 12 or 24 -> guilt, 20 -> optimism, 5 or 18 -> love.
    When no rule applies, one id is drawn at random from the row and looked
    up in ``orig_mapping``.

    Parameters
    ----------
    emo_ds : pandas.DataFrame
        Frame whose ``emotion_label`` column holds lists of int ids.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with ``emotion_label`` overwritten in place.
    """
    priority_rules = (
        ((1,), "amusement"),
        ((6, 7), "curiosity"),
        ((12, 24), "guilt"),
        ((20,), "optimism"),
        ((5, 18), "love"),
    )
    resolved = []
    for ids in emo_ds["emotion_label"]:
        for triggers, target in priority_rules:
            if any(trigger in ids for trigger in triggers):
                resolved.append(target)
                break
        else:
            # No priority rule matched: pick one of the row's ids at random.
            pick = ids[random.randint(0,len(ids)-1)]
            resolved.append(orig_mapping[str(pick)])
    emo_ds["emotion_label"] = resolved
    return emo_ds

In [21]:
# Raw GoEmotions splits (tab-separated, no header row). The paths are relative
# to the project root, like every other GoEmotions path in this notebook,
# rather than pointing into one user's home directory.
train = pd.read_csv("dataset/GoEmotions-pytorch/data/original/train.tsv",sep='\t',names=['utterance', 'emotion_label', 'rater_id'])
test = pd.read_csv("dataset/GoEmotions-pytorch/data/original/test.tsv",sep='\t',names=['utterance', 'emotion_label', 'rater_id'])
val = pd.read_csv("dataset/GoEmotions-pytorch/data/original/dev.tsv",sep='\t',names=['utterance', 'emotion_label', 'rater_id'])

# Parse the id lists, reduce every row to a single emotion name and switch to
# the sentence/label column names used by the merged corpus.
train_fine = map_haru_emotion(format_go_emo(train)).rename(columns={'utterance':'sentence','emotion_label':'label'})
test_fine = map_haru_emotion(format_go_emo(test)).rename(columns={'utterance':'sentence','emotion_label':'label'})
val_fine = map_haru_emotion(format_go_emo(val)).rename(columns={'utterance':'sentence','emotion_label':'label'})

In [22]:
# Preview of the fine-grained test split.
# NOTE(review): the saved output below shows a `labels` column, while the
# rename in the previous cell produces `label` - the output looks stale.
test_fine

Unnamed: 0,sentence,labels,rater_id
0,I’m really sorry about your situation :( Altho...,sadness,eecwqtt
1,It's wonderful because it's awful. At not with.,admiration,ed5f85d
2,"Kings fan here, good luck to you guys! Will be...",excitement,een27c3
3,"I didn't know that, thank you for teaching me ...",gratitude,eelgwd1
4,They got bored from haunting earth for thousan...,neutral,eem5uti
...,...,...,...
5422,Thanks. I was diagnosed with BP 1 after the ho...,gratitude,efeeasc
5423,Well that makes sense.,approval,ef9c7s3
5424,Daddy issues [NAME],neutral,efbiugo
5425,So glad I discovered that subreddit a couple m...,admiration,efbvgp9


In [25]:
# Persist the fine-grained GoEmotions splits as CSV (the frame index is
# written as the first column).
train_fine.to_csv("dataset/GoEmotions-pytorch/dataset_csv/train_fine_goemo.csv")
test_fine.to_csv("dataset/GoEmotions-pytorch/dataset_csv/test_fine_goemo.csv")
val_fine.to_csv("dataset/GoEmotions-pytorch/dataset_csv/val_fine_goemo.csv")

##### Ekman (mapped all fine-grained to Ekman)

In [16]:
# Ekman label id (as a string) -> emotion name; an id is the position of the
# name in the tuple below.
ekman_mapping = {
    str(position): name
    for position, name in enumerate(
        ("anger", "disgust", "fear", "joy", "neutral", "sadness", "surprise")
    )
}
def map_ekman_emotion(emo_ds):
    """Reduce each row's list of Ekman label ids to one emotion name.

    One id is drawn at random from every row and translated through
    ``ekman_mapping``.

    Parameters
    ----------
    emo_ds : pandas.DataFrame
        Frame whose ``emotion_label`` column holds lists of int ids.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with ``emotion_label`` overwritten in place.
    """
    emo_ds["emotion_label"] = [
        ekman_mapping[str(ids[random.randint(0,len(ids)-1)])]
        for ids in emo_ds["emotion_label"]
    ]
    return emo_ds

In [19]:
# Ekman-level GoEmotions splits (tab-separated, no header row).
train_ekman = pd.read_csv("dataset/GoEmotions-pytorch/data/ekman/train.tsv",sep='\t',names=['utterance', 'emotion_label', 'rater_id'])
test_ekman = pd.read_csv("dataset/GoEmotions-pytorch/data/ekman/test.tsv",sep='\t',names=['utterance', 'emotion_label', 'rater_id'])
val_ekman = pd.read_csv("dataset/GoEmotions-pytorch/data/ekman/dev.tsv",sep='\t',names=['utterance', 'emotion_label', 'rater_id'])

# Each name is rebound from the raw frame to the processed one: id lists are
# parsed, one Ekman label is drawn per row, and the columns are renamed to
# sentence/label.
train_ekman = map_ekman_emotion(format_go_emo(train_ekman)).rename(columns={'utterance':'sentence','emotion_label':'label'})
test_ekman = map_ekman_emotion(format_go_emo(test_ekman)).rename(columns={'utterance':'sentence','emotion_label':'label'})
val_ekman = map_ekman_emotion(format_go_emo(val_ekman)).rename(columns={'utterance':'sentence','emotion_label':'label'})

In [24]:
# Persist the Ekman-level splits under their own file names. They used to be
# written to the *_fine_goemo.csv paths, overwriting the fine-grained export.
train_ekman.to_csv("dataset/GoEmotions-pytorch/dataset_csv/train_ekman_goemo.csv")
test_ekman.to_csv("dataset/GoEmotions-pytorch/dataset_csv/test_ekman_goemo.csv")
val_ekman.to_csv("dataset/GoEmotions-pytorch/dataset_csv/val_ekman_goemo.csv")

In [53]:
# Append the Ekman-level GoEmotions splits to the merged corpora.
# Each *_final name is rebound to its own extension, so running this cell
# twice appends the same rows twice.
train_final = map_and_concat(train_final,[train_ekman],['goemotion'])
val_final = map_and_concat(val_final,[val_ekman],['goemotion'])
test_final = map_and_concat(test_final,[test_ekman],['goemotion'])

43410
5426
5427


#### EmoryNLP (BM)
[EmoryNLP emotion-detection repository](https://github.com/emorynlp/emotion-detection)

In [38]:
# Fetch the three EmoryNLP JSON files. Each one is loaded under the "train"
# key, which is the key format_emonlp() reads.
emonlp_train, emonlp_val, emonlp_test = (
    load_dataset("json", data_files={"train": json_url}, field="episodes")
    for json_url in (
        "https://raw.githubusercontent.com/emorynlp/emotion-detection/master/json/emotion-detection-trn.json",
        "https://raw.githubusercontent.com/emorynlp/emotion-detection/master/json/emotion-detection-dev.json",
        "https://raw.githubusercontent.com/emorynlp/emotion-detection/master/json/emotion-detection-tst.json",
    )
)


def format_emonlp(emonlp_dataset):
    """Flatten EmoryNLP episodes into one row per utterance.

    Walks episodes -> scenes -> utterances under the ``'train'`` key, keeps
    the transcript and the emotion (normalised with ``relabel_emotion``) and
    drops every row in which either field is ``None``.

    Parameters
    ----------
    emonlp_dataset : mapping
        Object whose ``'train'`` entry iterates over episodes, each holding a
        ``'scenes'`` list with ``'utterances'``.

    Returns
    -------
    datasets.Dataset
        Columns ``sentence`` and ``labels``.
    """
    flat_utterances = [
        utterance
        for episode in emonlp_dataset['train']
        for scene in episode['scenes']
        for utterance in scene['utterances']
    ]
    columns = {
        # extract only text and emotion labels
        "sentence": [utterance['transcript'] for utterance in flat_utterances],
        "labels": [relabel_emotion(utterance['emotion']) for utterance in flat_utterances],
    }

    def _fully_populated(example):
        return all(example[field] is not None for field in example)

    return Dataset.from_dict(columns).filter(_fully_populated)

Found cached dataset json (/home/annie/.cache/huggingface/datasets/json/default-cecdd3f673368c60/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset json (/home/annie/.cache/huggingface/datasets/json/default-53f09f25d41a1e93/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset json (/home/annie/.cache/huggingface/datasets/json/default-c3a8fb9e451079f1/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/1 [00:00<?, ?it/s]

In [43]:
# Flatten every EmoryNLP split and convert it to a pandas frame.
train_emonlp, val_emonlp, test_emonlp = (
    format_emonlp(raw_split).to_pandas()
    for raw_split in (emonlp_train, emonlp_val, emonlp_test)
)

Filter:   0%|          | 0/9934 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1344 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1328 [00:00<?, ? examples/s]

In [46]:
# Persist the cleaned EmoryNLP splits as CSV, without the index column.
train_emonlp.to_csv('dataset/Emorynlp/emonlp_train_clean.csv', index = False)
val_emonlp.to_csv('dataset/Emorynlp/emonlp_val_clean.csv', index = False)
test_emonlp.to_csv('dataset/Emorynlp/emonlp_test_clean.csv', index = False)

In [54]:
# format_emonlp() names its emotion column "labels", whereas map_and_concat()
# selects "label". Rename on the fly (rename returns a new frame, so the
# *_emonlp frames keep their schema) before appending to the merged corpora.
train_final = map_and_concat(train_final,[train_emonlp.rename(columns={'labels': 'label'})],["emoryNLP"])
val_final = map_and_concat(val_final,[val_emonlp.rename(columns={'labels': 'label'})],["emoryNLP"])
test_final = map_and_concat(test_final,[test_emonlp.rename(columns={'labels': 'label'})],["emoryNLP"])

9934
1344
1328


#### Empathetic Dialogues (excluded)

#### DailyDialog (BM)

In [None]:
# Load the DailyDialog dataset; its train/validation/test splits are consumed below.
daily = load_dataset("daily_dialog")

def format_dd(daily):
    """Flatten a DailyDialog split into one row per utterance.

    Parameters
    ----------
    daily : mapping
        Split whose ``"dialog"`` entry is a sequence of utterance lists and
        whose ``"emotion"`` entry is the parallel sequence of emotion-id lists.

    Returns
    -------
    datasets.Dataset
        Columns ``utterance`` and ``emotion`` (ids translated with
        ``relabel_emotion``); rows containing ``None`` are removed.
    """
    # Flatten with comprehensions: repeated `list = list + chunk` re-copies
    # the accumulated list for every dialogue.
    utterances_daily = [utterance for dialog in daily["dialog"] for utterance in dialog]
    emotions_daily = [emotion for dialog_emotions in daily["emotion"] for emotion in dialog_emotions]
    clean_daily = Dataset.from_dict({
        "utterance": utterances_daily,
        "emotion": [relabel_emotion(emo) for emo in emotions_daily],
    })
    return clean_daily.filter(lambda e: all(e[field] is not None for field in e))

# Flatten every DailyDialog split and convert it to a pandas frame.
train_dd, val_dd, test_dd = (
    format_dd(daily[split_key]).to_pandas()
    for split_key in ('train', 'validation', 'test')
)

In [None]:
# format_dd() produces "utterance"/"emotion" columns, whereas map_and_concat()
# selects "sentence"/"label". Rename on the fly (rename returns a new frame)
# before appending to the merged corpora.
train_final = map_and_concat(train_final,[train_dd.rename(columns={'utterance': 'sentence', 'emotion': 'label'})],["daily"])
val_final = map_and_concat(val_final,[val_dd.rename(columns={'utterance': 'sentence', 'emotion': 'label'})],["daily"])
test_final = map_and_concat(test_final,[test_dd.rename(columns={'utterance': 'sentence', 'emotion': 'label'})],["daily"])

#### IEMOCAP (BM)

In [None]:
# Transcript lines that did not match the expected layout, kept for inspection.
excluded_utterance = []
def split_string_u(string):
    """Split a transcript line of the form ``<id> [<timestamps>]: <text>``.

    Parameters
    ----------
    string : str
        One line of a transcription file.

    Returns
    -------
    tuple of str or None
        ``(id, "[timestamps]", text)`` when the line matches; otherwise the
        line is appended to ``excluded_utterance`` and ``None`` is returned.
    """
    parsed = re.match(r'^(.*?)(\[[\d.-]+\]:\s*)(.*)$', string)
    if parsed is None:
        excluded_utterance.append(string)
        return None
    prefix, stamp, text = parsed.groups()
    # Trim the blank before the bracket, and the ":" plus blanks after it.
    return prefix.rstrip(), stamp.rstrip().rstrip(":"), text

def split_string_e(string):
    """Strip surrounding whitespace from ``string`` and split it on tabs."""
    trimmed = string.strip()
    return trimmed.split('\t')
def read_transcription(path):
    """Parse every line of one transcription file.

    Parameters
    ----------
    path : str
        Path of the transcription text file.

    Returns
    -------
    list of tuple
        One ``(id, timestamps, text)`` tuple per line that
        ``split_string_u`` could parse. Unparsable lines are skipped here
        (they are still recorded by ``split_string_u``) instead of being
        appended as ``None`` placeholders.
    """
    transcript_ds = load_dataset("text", data_files={"test":path})
    utterance_list = []
    for u in transcript_ds['test']['text']:
        utterance = split_string_u(u)
        if utterance is not None:
            utterance_list.append(utterance)
    return utterance_list

def read_emotion(path):
    """Collect the line that follows each blank line of an evaluation file.

    Parameters
    ----------
    path : str
        Path of the emotion-evaluation text file. It is echoed to stdout.

    Returns
    -------
    list of list of str
        For every blank line that is not the last line, the tab-separated
        fields of the line right after it.
    """
    emo_ds = load_dataset("text", data_files={"test":path})
    print(path + '\n')
    # Materialise the column once instead of on every loop iteration.
    lines = list(emo_ds['test']['text'])
    emotion_list = []
    # Pair each line with its successor; the last line has none and is skipped.
    for current, following in zip(lines, lines[1:]):
        if current.strip() == '':
            emotion_list.append(split_string_e(following))
    return emotion_list

def _read_dialog_dir(folder, reader):
    """Apply ``reader`` to every regular file of ``folder`` (sorted by name) and chain the results."""
    rows = []
    # Sorted listing: os.listdir() order is arbitrary, which would make the
    # row order - and any seeded split taken later - machine-dependent.
    for item in sorted(os.listdir(folder)):
        item_path = os.path.join(folder, item)
        if os.path.isfile(item_path):
            rows.extend(reader(item_path))
    return rows


def read_iemocap(dir_path):
    """Read the transcriptions and emotion labels of all five IEMOCAP sessions.

    Parameters
    ----------
    dir_path : str
        Directory that contains ``IEMOCAP_full_release``.

    Returns
    -------
    dict
        ``'utterance'``: frame with columns idx, labs, utterance, session.
        ``'emotion'``: frame with columns labs, idx, emotion, attribute, session.
    """
    sessions = ['Session1','Session2','Session3','Session4', 'Session5']
    # Start from empty frames so the column layout is fixed up front.
    transcription_frames = [pd.DataFrame(columns=['idx', 'labs', 'utterance', 'session'])]
    emotion_frames = [pd.DataFrame(columns=['labs', 'idx', 'emotion', 'attribute', 'session'])]
    for session in sessions:
        dialog_dir = os.path.join(dir_path, 'IEMOCAP_full_release', session, 'dialog')

        # get all transcriptions
        all_utterances = _read_dialog_dir(os.path.join(dialog_dir, 'transcriptions'), read_transcription)
        u_df = pd.DataFrame(all_utterances, columns=['idx', 'labs', 'utterance'])
        u_df['session'] = session
        transcription_frames.append(u_df)

        # get all emotions
        all_emotions = _read_dialog_dir(os.path.join(dialog_dir, 'EmoEvaluation'), read_emotion)
        e_df = pd.DataFrame(all_emotions, columns=['labs', 'idx', 'emotion', 'attribute'])
        e_df['session'] = session
        emotion_frames.append(e_df)

    # Concatenate once at the end rather than re-copying after every session.
    return {'utterance': pd.concat(transcription_frames),
            'emotion': pd.concat(emotion_frames)}

In [None]:
iemocap_dir = 'dataset/IEMOCAP_full_release_withoutVideos'
read_results = read_iemocap(iemocap_dir)
# Join transcriptions and labels on the utterance id, then drop the rows
# labelled 'xxx', 'dis' or 'oth'.
clean_imocap = pd.merge(read_results['utterance'],read_results['emotion'], on='idx')
clean_imocap = clean_imocap[~clean_imocap['emotion'].isin(['xxx', 'dis', 'oth'])]

In [None]:
# Train / test / validation split, stratified on the emotion label:
# 20% is held out first and then halved, i.e. roughly 80 / 10 / 10.
train, testval= train_test_split(clean_imocap, test_size=0.20, random_state=0, stratify=clean_imocap[['emotion']])
test, val = train_test_split(testval, test_size=0.5, random_state=0, stratify=testval[['emotion']])

# Persist the three splits as CSV (the frame index is written as the first column).
train.to_csv('dataset/IEMOCAP_full_release_withoutVideos/IEMOCAP_full_release/iemocap_train.csv')
test.to_csv('dataset/IEMOCAP_full_release_withoutVideos/IEMOCAP_full_release/iemocap_test.csv')
val.to_csv('dataset/IEMOCAP_full_release_withoutVideos/IEMOCAP_full_release/iemocap_val.csv')

In [None]:
# The merged IEMOCAP frames carry "utterance"/"emotion" columns, whereas
# map_and_concat() selects "sentence"/"label". Rename on the fly (rename
# returns a new frame) before appending to the merged corpora.
train_final = map_and_concat(train_final,[train.rename(columns={'utterance': 'sentence', 'emotion': 'label'})],["iemocap"])
val_final = map_and_concat(val_final,[val.rename(columns={'utterance': 'sentence', 'emotion': 'label'})],["iemocap"])
test_final = map_and_concat(test_final,[test.rename(columns={'utterance': 'sentence', 'emotion': 'label'})],["iemocap"])

#### MELD (BM) (hartmann)

In [None]:
# Load the three MELD sentence-level CSV files from the MELD GitHub repository
# into one dataset dict keyed "test" / "train" / "val".
meld = load_dataset("csv", data_files={"test":"https://raw.githubusercontent.com/declare-lab/MELD/master/data/MELD/test_sent_emo.csv",
                                       "train":"https://raw.githubusercontent.com/declare-lab/MELD/master/data/MELD/train_sent_emo.csv",
                                       "val": "https://raw.githubusercontent.com/declare-lab/MELD/master/data/MELD/dev_sent_emo.csv"})

def format_save_meld(meld):
    """Reduce a MELD split to its utterance text and normalised emotion.

    Despite the name, nothing is written to disk here.

    Parameters
    ----------
    meld : mapping
        Split exposing ``"Utterance"`` and ``"Emotion"`` columns.

    Returns
    -------
    datasets.Dataset
        Columns ``utterance`` and ``emotion`` (translated with
        ``relabel_emotion``); rows containing ``None`` are removed.
    """
    columns = {
        "utterance": meld["Utterance"],
        "emotion": [relabel_emotion(raw_label) for raw_label in meld["Emotion"]],
    }

    def _fully_populated(example):
        return all(example[field] is not None for field in example)

    return Dataset.from_dict(columns).filter(_fully_populated)

In [None]:
# Clean every MELD split.
meld_train, meld_val, meld_test = (
    format_save_meld(meld[split_key]) for split_key in ('train', 'val', 'test')
)

In [None]:
# Persist the cleaned MELD splits as CSV, without an index column.
meld_train.to_csv('dataset/meld_train_clean.csv', index = False)
meld_val.to_csv('dataset/meld_val_clean.csv', index = False)
meld_test.to_csv('dataset/meld_test_clean.csv', index = False)

#### SemEval-2018, EI-reg, Mohammad et al. (hartmann)

In [None]:
def prep_semeval(filename, dataset):
    """Turn a multi-label SemEval-2018 split into single-label rows and save it.

    For every tweet one of its positive emotions is drawn at random; tweets
    without any positive emotion are skipped. The result is written to
    ``dataset/SemEval-2018/<filename>`` with columns ID, utterance, emotion.

    Parameters
    ----------
    filename : str
        Name of the CSV file to create inside ``dataset/SemEval-2018/``.
    dataset : pandas.DataFrame or object with ``to_pandas()``
        Split with a ``Tweet`` column and one 0/1 (or boolean) column per
        emotion. Anything that lacks ``iterrows`` but offers ``to_pandas``
        is converted first.

    Returns
    -------
    None
    """
    emotions_col = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust']
    if not hasattr(dataset, 'iterrows') and hasattr(dataset, 'to_pandas'):
        dataset = dataset.to_pandas()

    # Collect plain records and build the frame once; growing a DataFrame
    # with concat inside the loop re-copies it for every row.
    records = []
    for index, row in dataset.iterrows():
        emotions_list = [emotion for emotion in emotions_col if row[emotion] == 1]
        if emotions_list:
            e = emotions_list[random.randint(0,len(emotions_list)-1)]
            records.append({"ID": index,  # the row index serves as the ID
                            "utterance": row['Tweet'],
                            "emotion": e})

    new_df = pd.DataFrame(records, columns=['ID', 'utterance', 'emotion'])
    new_df.to_csv("dataset/SemEval-2018/" + filename)


In [None]:
df = load_dataset('sem_eval_2018_task_1', 'subtask5.english')
# prep_semeval() walks its input with DataFrame.iterrows(), so each split is
# converted to pandas before being handed over.
prep_semeval('semeval_train_clean.csv',df['train'].to_pandas())
prep_semeval('semeval_test_clean.csv',df['test'].to_pandas())
prep_semeval('semeval_val_clean.csv', df['validation'].to_pandas())


In [None]:
# Reload the single-label SemEval CSVs with the sentence/label column names
# used by the merged corpus.
semeval_train, semeval_test, semeval_val = (
    pd.read_csv(csv_path).rename(columns={'utterance':'sentence','emotion': 'label'})
    for csv_path in (
        "dataset/SemEval-2018/semeval_train_clean.csv",
        "dataset/SemEval-2018/semeval_test_clean.csv",
        "dataset/SemEval-2018/semeval_val_clean.csv",
    )
)

#### ISEAR, Vikash (hartmann)

In [7]:
# Load the cleaned ISEAR file (first CSV column is the index) and split it,
# stratified on the emotion label: 20% is held out first and then halved,
# i.e. roughly 80 / 10 / 10.
# NOTE(review): `train`, `test` and `val` are also assigned by other dataset
# cells in this notebook, so their content depends on which cell ran last.
isear = pd.read_csv('dataset/ISEAR/isear_clean.csv', index_col=0)
train, testval= train_test_split(isear, test_size=0.20, random_state=0, stratify=isear[['emotion']])
test, val = train_test_split(testval, test_size=0.5, random_state=0, stratify=testval[['emotion']])


In [None]:
# Copies of the ISEAR splits with the sentence/label column names used by the
# merged corpus.
isear_train, isear_test, isear_val = (
    split.rename(columns={'utterance':'sentence','emotion': 'label'})
    for split in (train, test, val)
)

In [None]:
# Persist the ISEAR splits next to the cleaned source file. The paths are
# relative to the project root (like the path the data was read from) rather
# than pointing into one user's home directory.
train.to_csv('dataset/ISEAR/isear_train.csv')
test.to_csv('dataset/ISEAR/isear_test.csv')
val.to_csv('dataset/ISEAR/isear_val.csv')

#### CARER Emotion Dataset, Elvis Saravia et al. (hartmann)

In [None]:
# Load the dair-ai/emotion dataset.
dataset = load_dataset("dair-ai/emotion")
# Emotion names indexed by the integer label (used by map_to_label below).
# NOTE(review): order presumably mirrors the dataset's label encoding - confirm.
class_names = ["sadness", "joy", "love", "anger", "fear", "surprise"]
def map_to_label(dataset):
    """Translate the integer ``label`` column of a split into emotion names.

    Parameters
    ----------
    dataset : mapping
        Split whose ``'label'`` entry iterates over integer ids.

    Returns
    -------
    list of str
        ``class_names[id]`` for every id, in order.
    """
    return [class_names[label_id] for label_id in dataset['label']]

# Emotion names for each split.
test_l = map_to_label(dataset['test'])
validation_l = map_to_label(dataset['validation'])
train_l = map_to_label(dataset['train'])

# For each split: build a frame with the text, the emotion name and the
# original integer id, then persist it as CSV (index written as first column).
carer_test = pd.DataFrame({"sentence": dataset['test']['text'], 'label': test_l, "emotion_num": dataset['test']['label']})
carer_test.to_csv("dataset/Emotion_Elvis/EmoElvis_test_clean.csv")

carer_val = pd.DataFrame({"sentence": dataset['validation']['text'], 'label': validation_l, "emotion_num": dataset['validation']['label']})
carer_val.to_csv("dataset/Emotion_Elvis/EmoElvis_validation_clean.csv")

carer_train = pd.DataFrame({"sentence": dataset['train']['text'], 'label': train_l, "emotion_num": dataset['train']['label']})
carer_train.to_csv("dataset/Emotion_Elvis/EmoElvis_train_clean.csv")

#### Crowdflower (hartmann)

In [None]:
url = "https://raw.githubusercontent.com/tlkh/text-emotion-classification/master/dataset/original/text_emotion.csv"
flower = load_dataset("csv", data_files=url )

# First stratified split: 80% train, 20% held out.
X = flower['train']
y = flower['train']['sentiment']
sss = StratifiedShuffleSplit(n_splits=2, test_size=0.2, random_state=0)
train_index, test_index = next(sss.split(X, y))

# Indexing the dataset with an index array yields a plain dict of columns.
train=X[train_index]
testval = X[test_index]
testval = Dataset.from_dict(testval)

# Second stratified split: halve the held-out part into validation and test.
X = testval
y = testval['sentiment']
sss = StratifiedShuffleSplit(n_splits=2, test_size=0.5, random_state=0)
val_index, test_index = next(sss.split(X, y))
val=X[val_index]
test=X[test_index]

# Build pandas frames with the sentence/label schema of the merged corpus.
# (datasets.Dataset offers no rename(columns=...) method, so the previous
# Dataset.from_dict(...).rename(...) calls could not run.)
# NOTE(review): the tweet text is presumably in the CSV's `content` column -
# confirm against the file; the original 'utterence' key is kept as a fallback.
flower_column_names = {"sentiment":'label', 'content':'sentence', 'utterence':'sentence'}
flower_train = pd.DataFrame(train).rename(columns=flower_column_names)
flower_test = pd.DataFrame(test).rename(columns=flower_column_names)
flower_val = pd.DataFrame(val).rename(columns=flower_column_names)

In [None]:
# Persist the Crowdflower splits as CSV.
# NOTE(review): "crowdflower_trian.csv" looks like a typo for "train"; check
# whether anything reads this exact file name before renaming it.
flower_train.to_csv("dataset/crowdflower_data/crowdflower_trian.csv")
flower_test.to_csv("dataset/crowdflower_data/crowdflower_test.csv")
flower_val.to_csv("dataset/crowdflower_data/crowdflower_valid.csv")

In [None]:
def _to_sentence_label_frame(split):
    """Return ``split`` as a pandas frame that uses ``sentence``/``label`` column names."""
    # Objects offering to_pandas() (e.g. the MELD splits) are converted first;
    # pandas frames are used as they are.
    frame = split.to_pandas() if hasattr(split, 'to_pandas') else split
    # rename() ignores keys that are absent, so frames already in the target
    # schema pass through unchanged.
    return frame.rename(columns={'utterance': 'sentence', 'emotion': 'label'})


# Append the remaining corpora to the merged train / validation / test sets.
train_final = map_and_concat(train_final,[_to_sentence_label_frame(split) for split in (meld_train,semeval_train,isear_train,carer_train,flower_train)],["meld","semeval","isear","carer","flower"])
val_final = map_and_concat(val_final,[_to_sentence_label_frame(split) for split in (meld_val,semeval_val,isear_val,carer_val,flower_val)],["meld","semeval","isear","carer","flower"])
test_final = map_and_concat(test_final,[_to_sentence_label_frame(split) for split in (meld_test,semeval_test,isear_test,carer_test,flower_test)],["meld","semeval","isear","carer","flower"])

In [None]:
# NOTE(review): no path is passed, so to_json() hands the JSON back as a
# string and, as the last expression of the cell, the whole training frame is
# echoed as output; nothing is saved. Pass a file path if the intent was to
# persist the merged corpus.
train_final.to_json()

### Individual Import, Cleaning, Analysis