In [84]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [85]:
# Language of the dataset to process: 'ZH' or 'EN'
lang = 'EN'

# Load the raw EmpatheticPersonas CSV for the chosen language.
# The first CSV column is used as the row index.
raw_csv_path = ''.join(['data/empatheticPersonas', lang, '.csv'])
df = pd.read_csv(raw_csv_path, index_col=0)
df.head()

Unnamed: 0_level_0,Age,Sad - Patient response 1,Sad - Patient response 2,Sad - Patient response 3,Sad - Was this caused by a specific event/s?,Sad - Was this caused by a recent or distant event (or events)?,Sad - Have you recently attempted protocol 6 and found this reignited unmanageable emotions as a result of old events?,Sad - Have you recently attempted protocol 11 and found this reignited unmanageable emotions as a result of old events?,Sad - Thank you. Now I will ask some questions to understand your situation.,Sad - Have you strongly felt or expressed any of the following emotions towards someone:,...,Happy - Patient response 3,Happy - That's Good! Let me recommend a protocol you can attempt.,All emotions - From what you have said I believe you are feeling {}. Is this correct?,All emotions - I am sorry. Please select from the emotions below the one that best reflects what you are feeling:,All emotions - Thank you for taking part. See you soon,"All emotions - Here are my recommendations, please select the protocol that you would like to attempt","All emotions - Please try to go through this protocol now. When you finish, press 'continue'",All emotions - Do you feel better or worse after having taken this protocol?,All emotions - Would you like to attempt another protocol? (Patient feels better),All emotions - Would you like to attempt another protocol? (Patient feels worse)
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Male,30-39,I am feeling a bit down,I'm in a pretty low mood,I'm feeling rather sad at the moment,I'm sorry to hear you aren't feeling well. Did...,"I can identify with that. Now, are these feeli...",Knowing that your feelings are important to me...,Knowing that your feelings are important to me...,Thank you for being so helpful and patient thu...,"I understand it may be difficult, but I would ...",...,,,,,,,,,,
Female,50-59,I don't feel that great today. My dog is unwell.,I feel a little sad. I am worried about losing...,I'm quite down. I had a disagreement with my s...,Did something in particular happen that made y...,,Some people have found that protocol 6 doesn't...,Some people have found that protocol 11 doesn'...,Thank you for sharing that with me. I am going...,Have you had or expressed any of the following...,...,,,,,,,,,,
Male,30-39,"Not very well, my dog died yesterday.","Not great, just been feeling depressed all week.","Could be better, I'd rather the day was over a...",I'm sorry to hear that. Is there one particula...,"Ah, I see. Would you say this was a recent or ...","Thank you, I understand. Would you say that th...","Thank you, I understand. Would you say that th...",Thank you for your patience. I will now ask yo...,"Thanks, I appreciate your patience. Would you ...",...,,,,,,,,,,
Female,25-29,I'm feeling kind of down,I'm really sad because my dog ran away,"My boyfriend broke up with me, so I'm pretty s...",Thank you. Did something specific happen to ma...,"Thank you, I understand. Could you tell me if ...",Can you let me know if you were recently worki...,Can you let me know if you were recently worki...,Thanks for telling me that. I will now ask you...,Could you tell me if you've been having any of...,...,,,,,,,,,,
Male,25-29,Not so good as I have been watching the news c...,I'm kind of firm as the school year is coming ...,I'm sad as it's a long weekend but I have to w...,Did something happen to bring about this feeling?,Could you tell me if this event or events happ...,"Did you attempt protocol 6 and, if so, did loo...","Did you attempt protocol 11 and, if so, did lo...","Thank you, I appreciate that. I'm going to try...",Can you think of a time where you strongly fel...,...,,,,,,,,,,


In [86]:
# Build two parallel lists from the raw EmpatheticPersonas frame:
#   text  - every non-null patient utterance (1181 for the EN data)
#   label - integer emotion code of the utterance at the same position
# Label encoding follows the order of `emotions`:
#   0 - 'Sad', 1 - 'Angry', 2 - 'Happy', 3 - 'Anxious'

# Per-language emotion names and the infix used in the CSV column headers.
LANG_CONFIG = {
    'ZH': (['悲伤', '愤怒', '快乐', '焦虑'], ' - 患者反应 '),
    'EN': (['Sad', 'Angry', 'Happy', 'Anxious'], ' - Patient response '),
}
if lang not in LANG_CONFIG:
    # Fail with a clear message instead of a NameError on `emotions` below.
    raise ValueError("lang must be 'ZH' or 'EN', got " + repr(lang))
emotions, response_infix = LANG_CONFIG[lang]

text = []
label = []
for label_encoding, emotion in enumerate(emotions):
    # Each emotion has three "patient response" columns, numbered 1..3.
    for col in range(1, 4):
        colname = emotion + response_infix + str(col)
        # Missing answers are NaN; drop them so every kept text gets a label.
        responses = df[colname].dropna().values.tolist()
        text += responses
        label += [label_encoding] * len(responses)

In [87]:
# sanity check - text and label must be the same length (one label per text)
print(len(text))
print(len(label))
# Enforce the invariant rather than relying on eyeballing the two printed numbers.
assert len(text) == len(label), "text and label lists have different lengths"

1181
1181


In [88]:
# Assemble the organised two-column frame: one row per utterance with its emotion code.
# NOTE: this rebinds `df`, which held the raw CSV up to this point, so the
# extraction cell cannot be re-run after this one without reloading the data.
df = pd.DataFrame({'text': text, 'labels': label})

In [89]:
# Class balance: number of extracted texts per emotion label.
label_counts = df['labels'].value_counts()
label_counts

0    300
2    300
1    297
3    284
Name: labels, dtype: int64

In [90]:
# manual train-test-val split (80-10-10 approx.)
TRAIN_PER_EMOTION = 236  # rows of each emotion that go to the training set
SPLIT_SEED = 0           # one seed for the shuffle and the split, so Run All is reproducible

# Rows of each emotion, kept in their original order.
sad = df.loc[df['labels']==0]
angry = df.loc[df['labels']==1]
happy = df.loc[df['labels']==2]
anxious = df.loc[df['labels']==3]
by_emotion = [sad, angry, happy, anxious]

# TRAIN SET
# take the first TRAIN_PER_EMOTION rows of each emotion, giving a perfectly balanced train set
df_train = pd.concat([part.iloc[:TRAIN_PER_EMOTION] for part in by_emotion], ignore_index=True)
# shuffle the dataset; seeded so the exported train file is identical between runs
df_train = df_train.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)

# TEST AND VALIDATION SET
# remainder split randomly 50-50
# NOTE: this split is not stratified, so per-emotion shares in val and test can differ.
df_holdout = pd.concat([part.iloc[TRAIN_PER_EMOTION:] for part in by_emotion], ignore_index=True)
df_val, df_test = train_test_split(df_holdout, test_size=0.5, shuffle=True, random_state=SPLIT_SEED)

In [91]:
# Per-split label distribution in percent, printed in the order train, test, val.
for split_frame in (df_train, df_test, df_val):
    label_share = split_frame['labels'].value_counts(normalize=True)
    print(label_share*100)


1    25.0
3    25.0
2    25.0
0    25.0
Name: labels, dtype: float64
0    31.092437
2    28.571429
1    22.689076
3    17.647059
Name: labels, dtype: float64
1    28.813559
2    25.423729
3    22.881356
0    22.881356
Name: labels, dtype: float64


In [92]:
# Export the full frame and each split to data/<lang>/emotionlabeled_<split>.csv.
# Files are written in the order full, train, test, val; the frame index is written too.
export_prefix = 'data/' + lang + '/emotionlabeled_'
frames_to_export = [
    ('full', df),
    ('train', df_train),
    ('test', df_test),
    ('val', df_val),
]
for split_name, frame in frames_to_export:
    frame.to_csv(export_prefix + split_name + '.csv')