# Process IEMOCAP

## 1 - Extract Label Transcript

In [20]:
import os
import csv
import sys
import numpy as np
from util import *

In [21]:
# Prepare the directory that will hold the processed IEMOCAP files.
# create_folder comes from util; presumably it creates the directory when it
# is missing and only reports when it already exists — confirm against util.
# NOTE(review): this path starts with '../FY Project/My Project/', whereas the
# output files in this notebook use '../My Project/' — verify that both
# resolve to the same directory from the working directory.
create_folder('../FY Project/My Project/data/processed/IEMOCAP')


File path --> '../FY Project/My Project/data/processed/IEMOCAP' already exists


In [22]:
# Output locations for the processed transcripts.
out_file = '../My Project/data/processed/IEMOCAP/processed_tran.csv'
out_file_trans = '../My Project/data/processed/IEMOCAP/sentence_only.txt'

# The outputs are written in append mode, so a stale copy from a previous run
# has to be removed first. os.remove() is used instead of shelling out to
# `rm`: the paths contain a space ('My Project'), which an unquoted shell
# command splits into two arguments, so the old files were never deleted.
for stale_path in (out_file, out_file_trans):
    if os.path.exists(stale_path):
        os.remove(stale_path)


1

In [23]:
def extract_transcript(list_files, out_file, out_file_trans):
    '''
    Extracts the transcript of every utterance in the given transcription files.

    The lines of each input file are processed in sorted order. For every kept
    line, a CSV row [utterance name, transcript] is appended to `out_file` and
    the transcript alone is appended as one line to `out_file_trans`.
    Lines whose first token does not start with 'Ses', or whose name has 'XX'
    in the two characters before the last one, are skipped.

        Parameters:
            list_files (list): A list of files (with fullnames) to process transcript
            out_file (string): CSV file the [name, transcript] rows are appended to
            out_file_trans (string): Text file the bare transcripts are appended to

        Returns:
            None

        Raises:
            IndexError: If a kept line contains no ':' separator
    '''
    for file in list_files:  # Processes each file in file list

        with open(file, 'r') as in_file:
            file_lines = sorted(in_file.readlines())

        # newline='' leaves line endings to the csv module (otherwise blank
        # rows appear on platforms that translate '\n'). The transcript file is
        # opened once per input file instead of once per written line.
        with open(out_file, 'a', newline='') as outfile, \
                open(out_file_trans, 'a') as outfile_trans:
            csv_writer = csv.writer(outfile)

            for line in file_lines:
                # Split on the first ':' only, so colons inside the spoken
                # text (e.g. "5:30") stay part of the transcript.
                line_split = line.split(':', 1)

                # Select session name i.e. (Ses01F_impro01_F000)
                name = line_split[0].split(' ')[0].strip()

                # Unwanted case
                if not name.startswith('Ses') or name[-3:-1] == 'XX':
                    continue

                transcript = line_split[1].strip()

                csv_writer.writerow([name, transcript])
                outfile_trans.write(transcript + '\n')


In [24]:
# Gather the transcription files of Session1..Session4 into one sorted list,
# then extract their transcripts.
list_files = []

for x in range(1, 5):
    sess_title = f'Session{x}'
    path = f'./data/raw/IEMOCAP_full_release/{sess_title}/dialog/transcriptions'

    # file_search comes from util; presumably it adds the files found under
    # `path` to list_files in place — confirm against util.
    file_search(path, list_files)
    list_files.sort()

    # Running total of files collected so far
    print(f"{sess_title}, # Num of files: {len(list_files)}")

extract_transcript(list_files, out_file, out_file_trans)


Session1, # Num of files: 28
Session2, # Num of files: 58
Session3, # Num of files: 90
Session4, # Num of files: 120


## 2 - Extract Label

In [25]:
# Output location for the extracted labels.
out_file = '../My Project/data/processed/IEMOCAP/label.csv'

# The label file is written in append mode, so a stale copy from a previous
# run has to be removed first. os.remove() is used instead of shelling out to
# `rm`: the path contains a space ('My Project'), which an unquoted shell
# command splits into two arguments, so the old file was never deleted.
if os.path.exists(out_file):
    os.remove(out_file)


1

In [26]:
# Category codes accepted as labels; `category` maps each code to the index of
# its first occurrence in category_list.
category_list = ['ang', 'hap', 'sad', 'neu', 'fru', 'exc', 'fea', 'sur', 'dis', 'oth', 'xxx']
category = {}

for cat_type in category_list:
    # setdefault leaves an already-registered code untouched
    category.setdefault(cat_type, len(category))


In [27]:
def find_category(lines):
    '''
    Find ground truth category for each session recording in txt file.

    The first line, and every line that follows a blank line, is treated as a
    target line of the form
    [START_TIME - END_TIME] TURN_NAME EMOTION [V, A, D] (tab separated).
    All other lines up to the next blank line are ignored.

        Parameters:
            lines (list): Lines extracted from each sessions Emoevaluation txt file

        Returns:
            cat_emo_list (list): List contains each Session name with ground-truth emotion \
                i.e. [['Ses01F_impro01_F000', 'neu']]

        Raises:
            SystemExit: If a target line has fewer than three tab-separated
                fields, or its emotion is not a key of the module-level
                `category` dictionary (exit status 1 in both cases)
    '''
    cat_emo_list = []
    is_target_line = True

    for line in lines:

        if not is_target_line:
            # Subsequent lines are not target lines i.e. C-E2:	Neutral;	()
            # A blank line closes the current block; the next line is a target.
            if line == '\n':
                is_target_line = True
            continue

        line_split = line.split('\t')

        # Only the field access can fail here. Catching IndexError alone keeps
        # the SystemExit raised further down from being intercepted and
        # reported a second time as a generic error.
        try:
            session_id = line_split[1].strip()
            cat_label = line_split[2].strip()
        except IndexError:
            print(f'ERROR --> {line}')  # Error encountered on line
            sys.exit(1)

        if cat_label not in category:  # Confirm cat_label is in category dictionary
            print(f'Invalid key --> {cat_label}')
            sys.exit(1)  # Exit script

        cat_emo_list.append([session_id, cat_label])
        is_target_line = False

    return cat_emo_list


In [28]:
def extract_labels(list_files, out_file):
    '''
    Extracts the ground-truth emotion label of every utterance in the given
    EmoEvaluation files.

    For each input file, the [name, emotion] pairs returned by find_category
    are sorted and appended to `out_file` as CSV rows.

        Parameters:
            list_files (list): A list of files (with fullnames) to extract labels from
            out_file (string): CSV file the [name, emotion] rows are appended to

        Returns:
            None
    '''
    for file in list_files:

        with open(file, 'r') as in_file:
            # Remove header --> '% [START_TIME - END_TIME] TURN_NAME EMOTION [V, A, D]'
            # Two lines are dropped: the header and the line after it
            # (presumably a blank separator — confirm against the raw files).
            lines = in_file.readlines()[2:]

        cat_emo_list = find_category(lines)

        # newline='' leaves line endings to the csv module; without it, blank
        # rows appear on platforms that translate '\n'.
        with open(out_file, 'a', newline='') as outfile:
            csv_writer = csv.writer(outfile)
            csv_writer.writerows(sorted(cat_emo_list))


In [29]:
# Gather the EmoEvaluation files of Session1..Session4 into one sorted list,
# then extract their labels.
list_files = []
skip_dir = ['Attribute', 'Categorical', 'Self-evaluation']

for x in range(1, 5):
    sess_title = f'Session{x}'
    path = f'./data/raw/IEMOCAP_full_release/{sess_title}/dialog/EmoEvaluation/'

    # file_search comes from util; presumably it adds the files found under
    # `path` to list_files in place and ignores the sub-directories named in
    # skip_dir — confirm against util.
    file_search(path, list_files, skip_dir)
    list_files.sort()

    # Running total of files collected so far
    print(f"{sess_title}, # Num of files: {len(list_files)}")

extract_labels(list_files, out_file)


Session1, # Num of files: 28
Session2, # Num of files: 58
Session3, # Num of files: 90
Session4, # Num of files: 120


In [30]:
# Sanity check: number of files currently held in list_files
len(list_files)

120

## 2(1) - Convert Label Category

* Angry:   -->    0 'ang'(1103)
* Happy:   -->    1 'exc'(1041), 'hap'(595)
* Sad:     -->    2 'sad'(1084)
* Neutral: -->    3 'neu'(1708)

In [31]:
# Load label.csv back as a list of rows, dropping empty rows.
lines = []
with open('./data/processed/IEMOCAP/label.csv', 'r') as f:
    csv_reader = csv.reader(f)
    # Each row is a list, so filter(None, ...) discards exactly the empty ones
    lines = list(filter(None, csv_reader))

print(len(lines))


7869


In [32]:
# Write one label per row of `lines` to processed_label.txt: the label itself
# for 'ang', 'hap', 'sad' and 'neu', and '-1' for everything else.
# processed_ids.txt receives the id (first column) of the kept rows only.
# NOTE(review): 'exc' rows fall into the '-1' branch — confirm that leaving
# them out of 'hap' is intended.
with open('./data/processed/IEMOCAP/processed_label.txt', 'w') as f, \
        open('./data/processed/IEMOCAP/processed_ids.txt', 'w') as f2:

    for line in lines:
        if line[1] in ('ang', 'hap', 'sad', 'neu'):
            f.write(line[1] + '\n')
            f2.write(line[0] + '\n')
        else:
            f.write('-1\n')


In [33]:
# Reload the processed labels (one per line) and report how many utterances
# carry each of the four kept labels.
lines = []
with open('./data/processed/IEMOCAP/processed_label.txt', 'r') as f:
    lines = f.readlines()

lines = list(map(str.strip, lines))

print('Angry (0)\t-->', lines.count('ang'))
print('Happy (1)\t-->', lines.count('hap'))
print('Sad (2)\t\t-->', lines.count('sad'))
print('Neutral (3)\t-->', lines.count('neu'))


Angry (0)	--> 933
Happy (1)	--> 452
Sad (2)		--> 839
Neutral (3)	--> 1324


## Convert labels to four categories ['ang', 'hap', 'sad', 'neu']

In [34]:
# Keep only the labels that are not '-1' and write them to FC_label.txt,
# one per line and in their original order.
with open('./data/processed/IEMOCAP/FC_label.txt', 'w') as f:
    for label in lines:
        if label == '-1':
            continue
        f.write(label + '\n')


## Get sentences for four categories

In [35]:
# Load every transcript line of sentence_only.txt, stripped of surrounding
# whitespace.
sentences = []

with open('./data/processed/IEMOCAP/sentence_only.txt') as f:
    full_sentences = f.readlines()

sentences = list(map(str.strip, full_sentences))

In [36]:
# Write the sentence at the same position as every label that is not '-1'.
# NOTE(review): labels and sentences are matched purely by position, so this
# assumes both lists have the same length and order — confirm.
with open('./data/processed/IEMOCAP/FC_sentence.txt', 'w') as f:
    for index, label in enumerate(lines):
        if label == '-1':
            continue
        f.write(sentences[index] + '\n')


In [37]:
# Read FC_label.txt back as a list of stripped labels for inspection.
b = []
with open('./data/processed/IEMOCAP/FC_label.txt', 'r') as f:
    x = f.readlines()

b = list(map(str.strip, x))


In [38]:
# Show how many labels are held in b, plus the first five as a spot check
print(len(b), b[:5])

3548 ['neu', 'neu', 'neu', 'neu', 'ang']
