# EmpatheticPersonas Empathy (Labelled) Dataset (1100 Instances)

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

### Data Analysis and Cleaning

In [2]:
# Read the annotated English (EN) and Chinese (ZH) CSV files into DataFrames
EP_EN, EP_ZH = (
    pd.read_csv(csv_path)
    for csv_path in ('EN/EP_EN_annotated.csv', 'ZH/EP_ZH_annotated.csv')
)


In [3]:
# function to find overlapping (repeated) responses
def overlap_check(df):
    """Return the positions of responses that are repeated further down ``df``.

    A row is reported when an identical ``response`` value occurs again in a
    later row, so the last occurrence of each response is never reported.
    Removing the rows at the reported positions therefore leaves exactly one
    copy of every response.

    Args:
        df: DataFrame with a ``response`` column (values must be hashable).

    Returns:
        list[int]: ascending 0-based row positions (not index labels) of the
        repeated responses; empty when nothing is repeated.
    """
    responses = df['response'].tolist()

    # Walk from the bottom up while remembering every response already seen
    # below the current row. One pass with set lookups replaces re-scanning
    # (and copying) the tail of the list for every single row.
    seen_below = set()
    overlap_index = []
    for index in range(len(responses) - 1, -1, -1):
        response = responses[index]
        if response in seen_below:
            overlap_index.append(index)
        else:
            seen_below.add(response)

    # Positions were collected bottom-up; callers expect ascending order.
    overlap_index.reverse()

    return overlap_index


In [4]:
# Count the responses that occur again later in each dataset
EN_overlap, ZH_overlap = overlap_check(EP_EN), overlap_check(EP_ZH)

print(len(EN_overlap))  # 79 instances
print(len(ZH_overlap))  # 84 instances


79
84


In [5]:
# Drop the repeated rows in place. overlap_check reports row positions, which
# are handed to drop() as index labels.
EP_EN.drop(index=EN_overlap, inplace=True)
EP_ZH.drop(index=ZH_overlap, inplace=True)
print(len(EP_EN), len(EP_ZH), sep='\n')

1021
1016


In [6]:
# Sanity check - both counts should be zero once the repeats are gone
print(*(len(overlap_check(frame)) for frame in (EP_EN, EP_ZH)), sep='\n')

0
0


### Data Split for Training

In [7]:
# Percentage of responses per empathy score - the classes are slightly skewed
print(EP_EN['empathy_score'].value_counts(normalize=True).mul(100))
print(EP_ZH['empathy_score'].value_counts(normalize=True).mul(100))

1    35.063663
2    34.965720
0    29.970617
Name: empathy_score, dtype: float64
2    35.137795
1    35.137795
0    29.724409
Name: empathy_score, dtype: float64


In [8]:
# Extract an equal number of rows from each empathy score category
def balance_dataset(df):
    """Down-sample ``df`` so every empathy score has the same number of rows.

    Each score that occurs in ``df['empathy_score']`` is reduced to the size
    of the rarest score with a fixed-seed random sample (``random_state=0``),
    so repeated calls on the same data select the same rows.

    Args:
        df: DataFrame with an ``empathy_score`` column.

    Returns:
        DataFrame with the sampled rows grouped by ascending score, keeping
        their original index labels. When ``df`` has no scored rows an empty
        DataFrame with the same columns is returned.
    """
    counts = df['empathy_score'].value_counts()
    if counts.empty:
        # Nothing to balance; keep the columns so callers can still select them.
        return df.iloc[0:0].copy()

    min_len = int(counts.min())

    # Sample every score that is actually present instead of assuming the
    # labels are exactly 0, 1 and 2, and concatenate once at the end rather
    # than growing a DataFrame inside the loop.
    samples = [
        df[df['empathy_score'] == score].sample(min_len, random_state=0)
        for score in sorted(counts.index)
    ]
    return pd.concat(samples)


In [9]:
# Down-sample both languages so the empathy scores are equally represented
EN_balanced, ZH_balanced = balance_dataset(EP_EN), balance_dataset(EP_ZH)

# Sanity check - every score should now hold the same share of the rows
print(EN_balanced['empathy_score'].value_counts(normalize=True).mul(100))
print(ZH_balanced['empathy_score'].value_counts(normalize=True).mul(100))

0    33.333333
1    33.333333
2    33.333333
Name: empathy_score, dtype: float64
0    33.333333
1    33.333333
2    33.333333
Name: empathy_score, dtype: float64


In [10]:
# Drop the per-annotator scores; only the response and the empathy score are
# needed from here on
EN_balanced.drop(columns=['annotator1_score', 'annotator2_score', 'annotator3_score'], inplace=True)
ZH_balanced.drop(columns=['annotator1_score', 'annotator2_score', 'annotator3_score'], inplace=True)

# Rename the remaining columns to the text / labels headings
newlabels = {'response': 'text', 'empathy_score': 'labels'}

EN_balanced = EN_balanced.rename(columns=newlabels)
ZH_balanced = ZH_balanced.rename(columns=newlabels)

In [11]:
# Train / test splits
# EN: 90% train - 10% test, stratified on the label column
EN_train, EN_test = train_test_split(
    EN_balanced,
    test_size=0.1,
    shuffle=True,
    random_state=0,
    stratify=EN_balanced['labels'],
)

# Check the label counts of each part
print(EN_train['labels'].value_counts())
print(EN_test['labels'].value_counts())

2    276
1    275
0    275
Name: labels, dtype: int64
0    31
1    31
2    30
Name: labels, dtype: int64


In [12]:
# ZH: 80% train - 10% val - 10% test split
# Step 1: hold out 20% of the rows, stratified on the label column
ZH_train, ZH_holdout = train_test_split(
    ZH_balanced,
    test_size=0.2,
    shuffle=True,
    random_state=0,
    stratify=ZH_balanced['labels'],
)
# Step 2: cut the hold-out in half to obtain the validation and test parts
ZH_val, ZH_test = train_test_split(
    ZH_holdout,
    test_size=0.5,
    shuffle=True,
    random_state=0,
    stratify=ZH_holdout['labels'],
)

# Check the label counts of each part
print(ZH_train['labels'].value_counts())
print(ZH_val['labels'].value_counts())
print(ZH_test['labels'].value_counts())

2    242
1    241
0    241
Name: labels, dtype: int64
1    31
2    30
0    30
Name: labels, dtype: int64
0    31
2    30
1    30
Name: labels, dtype: int64


In [13]:
# Final Train Set: the EN and ZH training rows combined
EN_ZH_train = pd.concat([EN_train, ZH_train])
# Shuffle with a fixed seed (random_state=0, like the other sampling steps in
# this notebook) so the exported training file is identical on every run, then
# renumber the rows because the two sources bring overlapping index labels.
EN_ZH_train = EN_ZH_train.sample(frac=1, random_state=0).reset_index(drop=True)

# Check
print(EN_ZH_train['labels'].value_counts())

2    518
1    516
0    516
Name: labels, dtype: int64


In [14]:
# Export every split to CSV (to_csv also writes each frame's index as the
# first column)
for split_frame, csv_path in (
    (EN_train, 'EN/EP_EN_train.csv'),
    (EN_test, 'EN/EP_EN_test.csv'),
    (ZH_train, 'ZH/EP_ZH_train.csv'),
    (ZH_val, 'ZH/EP_ZH_val.csv'),
    (ZH_test, 'ZH/EP_ZH_test.csv'),
    (EN_ZH_train, 'EP_empathy_train.csv'),
):
    split_frame.to_csv(csv_path)

# Empathetic Rewritings (Model-Labelled)
## Extract high empathy sentences

In [15]:
# Load model_labelled.csv; its 'class' column is used below to pick the
# high-empathy rows
df_model = pd.read_csv('model_labelled.csv')

In [20]:
# Row positions in df_model whose class is 2 (the high empathy text)
high_emp = [
    position
    for position, emp_class in enumerate(df_model['class'])
    if emp_class == 2
]

print(len(high_emp))

666


In [22]:
# Load the ZH and EN sentence files
df_zh = pd.read_csv('EP_empathy_2144_ZH.csv')
df_en = pd.read_csv('EP_empathy_2144_EN.csv')

# Extract only high empathy sentences. high_emp holds row positions taken from
# df_model, so this assumes all three files list the sentences in the same
# order - TODO confirm.
high_emp_zh, high_emp_en = df_zh.iloc[high_emp], df_en.iloc[high_emp]

# Emotion breakdown of the selected sentences
high_emp_zh['emotion'].value_counts()
high_emp_en['emotion'].value_counts()

焦虑      229
所有情绪    178
悲伤      125
愤怒      107
快乐       27
Name: emotion, dtype: int64


Anxious         229
All emotions    178
Sad             125
Angry           107
Happy            27
Name: emotion, dtype: int64

In [23]:
# Save the high-empathy subsets; to_csv also writes each frame's index as the
# first column
high_emp_zh.to_csv('high_empathy_ZH.csv')
high_emp_en.to_csv('high_empathy_EN.csv')