In [2]:
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Language of the EmpatheticPersonas export to process: 'ZH' or 'EN'
lang = 'EN'

# Load the raw EmpatheticPersonas sheet for the chosen language; the first
# CSV column is used as the row index. The file is looked up relative to the
# notebook's working directory.
source_csv = 'empatheticPersonas{}.csv'.format(lang)
df = pd.read_csv(source_csv, index_col=0)
df.head()

In [16]:
# text  - the patient expressions of emotion pulled from the raw sheet
#         (1181 of them in the recorded run)
# label - integer-encoded emotion for the text extract at the same position

# Per-language emotion names and the column-name infix used in the raw sheet.
# The position of an emotion in its list is its label encoding:
# 0 - 'Sad', 1 - 'Angry', 2 - 'Happy', 3 - 'Anxious'
LANG_CONFIG = {
    'ZH': (['悲伤', '愤怒', '快乐', '焦虑'], ' - 患者反应 '),
    'EN': (['Sad', 'Angry', 'Happy', 'Anxious'], ' - Patient response '),
}

# Fail loudly on an unsupported language. Without this check a warm kernel
# would silently reuse `emotions` left over from a previous run, and a fresh
# kernel would stop with an unrelated-looking NameError.
if lang not in LANG_CONFIG:
    raise ValueError(
        "Unsupported lang {!r}; expected one of {}".format(lang, sorted(LANG_CONFIG))
    )
emotions, response_infix = LANG_CONFIG[lang]

text = []
label = []
for label_encoding, emotion in enumerate(emotions):
    # Each emotion has three response columns, numbered 1 to 3
    for col in range(1, 4):
        colname = emotion + response_infix + str(col)
        # Missing responses are dropped, so columns may contribute unequal counts
        responses = df[colname].dropna().values.tolist()
        text += responses
        # One label per kept response keeps text and label aligned
        label += [label_encoding] * len(responses)

# print(text, label)

In [17]:
# sanity check - text and label must have the same length
# (prints the length of text, then the length of label, one per line)
for collected in (text, label):
    print(len(collected))

1181
1181


In [18]:
# create the organised df: one row per text extract with its integer label
df = pd.DataFrame({'text': text, 'labels': label}, columns=['text', 'labels'])

In [19]:
# Number of rows per encoded emotion label (class balance check)
label_counts = df['labels'].value_counts()
label_counts

0    300
2    300
1    297
3    284
Name: labels, dtype: int64

In [20]:
# train-test-val split (80-10-10 approx.), stratified on the label at both stages
# Stage 1: hold out 20% of the rows; the remaining 80% is the TRAIN SET
df_train, df_holdout = train_test_split(
    df,
    test_size=0.2,
    shuffle=True,
    random_state=0,
    stratify=df['labels'],
)
# Stage 2: split the hold-out in half into the validation and test sets
df_val, df_test = train_test_split(
    df_holdout,
    test_size=0.5,
    shuffle=True,
    random_state=0,
    stratify=df_holdout['labels'],
)

In [21]:
# check distribution: approx equal
# Prints the percentage share of each label for the train, test and
# validation splits, in that order
for split in (df_train, df_test, df_val):
    print(split['labels'].value_counts(normalize=True) * 100)


2    25.423729
0    25.423729
1    25.105932
3    24.046610
Name: labels, dtype: float64
2    25.210084
1    25.210084
0    25.210084
3    24.369748
Name: labels, dtype: float64
1    25.423729
2    25.423729
0    25.423729
3    23.728814
Name: labels, dtype: float64


In [22]:
# export to csv
# Writes the full labelled set and its train/test/val splits to
# emotions/EmpatheticPersonas/<lang>/emotionlabeled_<split>.csv, without the
# DataFrame index.
out_dir = 'emotions/EmpatheticPersonas/' + lang
# to_csv does not create missing parent directories, so make sure the
# per-language folder exists before writing
os.makedirs(out_dir, exist_ok=True)
for split_name, split_df in (
    ('full', df),
    ('train', df_train),
    ('test', df_test),
    ('val', df_val),
):
    split_df.to_csv(out_dir + '/emotionlabeled_' + split_name + '.csv', index=False)

In [24]:
# translate-train-all dataset
# ZH_train + EN_train
# Needs the per-language train CSVs, i.e. the export cell must already have
# been run once with lang = 'ZH' and once with lang = 'EN'.
# NOTE(review): this cell was previously annotated "ZH_train + EN_full", but
# the file it loads is the EN *train* split. The loaded file is kept as-is and
# the variable renamed to match; confirm that the train split is the intent.

# The split files are written with index=False, so their first column is
# `text`, not an index: read them without index_col.
zh_train = pd.read_csv('emotions/EmpatheticPersonas/ZH/emotionlabeled_train.csv')
en_train = pd.read_csv('emotions/EmpatheticPersonas/EN/emotionlabeled_train.csv')

train_all = pd.concat([zh_train, en_train], ignore_index=True)
# shuffle the dataset; seeded like the train/val/test split so that re-running
# the notebook reproduces the same row order
train_all = train_all.sample(frac=1, random_state=0).reset_index(drop=True)

# to_csv does not create missing parent directories
os.makedirs('emotions/EmpatheticPersonas/EN-ZH', exist_ok=True)
# index=False keeps the same text,labels layout as the per-language files
train_all.to_csv('emotions/EmpatheticPersonas/EN-ZH/emotionlabeled_train.csv', index=False)