# Process IEMOCAP

## 1 - Extract Label Transcript

In [1]:
import os
import csv
import sys
import numpy as np
from util import *

In [2]:
create_folder('../My Project/data/processed/IEMOCAP')


File path --> '../My Project/data/processed/IEMOCAP' already exists


In [3]:
out_file = '../My Project/data/processed/IEMOCAP/processed_tran.csv'
os.system('rm ' + out_file)  # Remove out file if it exists


1

In [4]:
def extract_transcript(list_files, out_file):
    '''
    Extracts transcript for each uniques session.

        Parameters:
            list_files (list): A list of files (with fullnames) to process transcript
            out_file (string): Out file to write processed transcript

        Returns:
            None
    '''
    file_lines = []

    for file in list_files: # Processes each file in file list

        with open(file, 'r') as in_file:
            file_lines = in_file.readlines()

        with open(out_file, 'a') as outfile:
            csv_writer = csv.writer(outfile)
            file_lines = sorted(file_lines)

            for line in file_lines:
                line_split = line.split(':')

                # Select session name i.e. (Ses01F_impro01_F000)
                name = line_split[0].split(' ')[0].strip()

                # Unwanted case
                if name[:3] != 'Ses':             # noise transcription such as reply  M: sorry
                    continue
                elif name[-3:-1] == 'XX':        # we don't have matching pair in label
                    continue
                
                transcript = line_split[1].strip()

                # cnt += 1
                csv_writer.writerow([name, transcript])


In [5]:
list_files = []

for x in range(1, 6):
    sess_title = 'Session' + str(x)

    path = f'./data/raw/IEMOCAP_full_release/{sess_title}/dialog/transcriptions'

    file_search(path, list_files)
    list_files = sorted(list_files)

    print(f"{sess_title}, # Num of files: {len(list_files)}")

extract_transcript(list_files, out_file)


Session1, # Num of files: 28
Session2, # Num of files: 58
Session3, # Num of files: 90
Session4, # Num of files: 120
Session5, # Num of files: 151


## 2 - Extract Label

In [6]:
out_file = '../My Project/data/processed/IEMOCAP/label.csv'
os.system('rm ' + out_file)  # Remove out file if it exists


1

In [7]:
category_list = ['ang', 'hap', 'sad', 'neu', 'fru', 'exc', 'fea', 'sur', 'dis', 'oth', 'xxx']
category = {}

for cat_type in category_list:
    if cat_type in category:
        continue
    else:
        category[cat_type] = len(category)


In [8]:
def find_category(lines):
    '''
    Find ground truth category for each session recording in txt file.

        Parameters:
            lines (list): Lines extracted from each sessions Emoevaluation txt file

        Returns:
            cat_emo_list (list): List contains each Session name with groud-truth emotion \
                i.e. [['Ses01F_impro01_F000, 'neu']]

    '''
    cat_emo_list = []
    is_target_line = True

    for line in lines:

        # Check if line is in format --> [START_TIME - END_TIME] TURN_NAME EMOTION [V, A, D]
        if is_target_line == True:

            try:
                line_split = line.split('\t')

                session_id = line_split[1].strip()
                cat_label = line_split[2].strip()

                if cat_label not in category:  # Confirm cat_label is in category dictionary
                    print(f'Invalid key --> {cat_label}')
                    sys.exit()  # Exit script

                cat_emo_list.append([session_id, cat_label])
                is_target_line = False  # Subsequent lines are not target line i.e. C-E2:	Neutral;	()

            except:
                print(f'ERROR --> {line}')  # Error encontered on line
                sys.exit()

        else:
            if line == '\n':
                is_target_line = True

    return cat_emo_list


In [9]:
def extract_labels(list_files, out_file):
    '''
    Extracts transcript for each uniques session.

        Parameters:
            list_files (list): A list of files (with fullnames) to process transcript
            out_file (string): Out file to write processed transcript

        Returns:
            None
    '''
    lines = []
    
    for file in list_files:

        with open(file, 'r') as in_file:
            lines = in_file.readlines()

            # Remove header --> '% [START_TIME - END_TIME] TURN_NAME EMOTION [V, A, D]'
            lines = lines[2:]
            cat_emo_list = find_category(lines)

        cat_emo_list.sort()

        with open(out_file, 'a') as outfile:
            csv_writer = csv.writer(outfile)
            csv_writer.writerows(cat_emo_list)


In [10]:
list_files = []
skip_dir = ['Attribute', 'Categorical', 'Self-evaluation']

for x in range(1, 6):
    sess_title = 'Session' + str(x)

    path = f'./data/raw/IEMOCAP_full_release/{sess_title}/dialog/EmoEvaluation/'
    file_search(path, list_files, skip_dir)
    list_files.sort()

    print(f"{sess_title}, # Num of files: {len(list_files)}")

extract_labels(list_files, out_file)


Session1, # Num of files: 28
Session2, # Num of files: 58
Session3, # Num of files: 90
Session4, # Num of files: 120
Session5, # Num of files: 151
