# Empathy Labelled Dataset

In [2]:
import os

import pandas as pd
from sklearn.model_selection import train_test_split

### Data Analysis and Cleaning

In [26]:
# Read the EN and ZH empathy-labelled CSV files into DataFrames.
EP_EN, EP_ZH = map(pd.read_csv, ('empathy_labelled_en.csv', 'empathy_labelled_zh.csv'))


In [27]:
# function to check and remove overlap
def overlap_check(df):
    """Return the index labels of rows whose response text is repeated later.

    Every occurrence of a repeated ``response`` value except the last one is
    reported, so dropping the returned labels keeps exactly one copy of each
    response.

    Args:
        df: DataFrame with a ``response`` column.

    Returns:
        list: Index labels of ``df`` (not positions), in row order, suitable
        for ``df.drop(labels=..., axis=0)``. For a default 0..n-1 index the
        labels coincide with the row positions.
    """
    # keep='last' flags every value that still has a later duplicate. This is
    # one vectorised pandas call instead of rescanning the rest of the list
    # for each row.
    repeated_later = df['response'].duplicated(keep='last')

    # Report labels rather than positions so the result remains valid for
    # frames whose index is no longer 0..n-1 (e.g. after rows were dropped).
    overlap_index = df.index[repeated_later.values].tolist()

    return overlap_index


In [28]:
# Collect the rows flagged as duplicated responses in each dataset and report
# how many were found.
EN_overlap, ZH_overlap = overlap_check(EP_EN), overlap_check(EP_ZH)
print(len(EN_overlap), len(ZH_overlap), sep='\n')  # 79 and 84 instances


79
84


In [29]:
# Remove the flagged duplicate rows in place, then show the remaining row counts.
EP_EN.drop(index=EN_overlap, inplace=True)
EP_ZH.drop(index=ZH_overlap, inplace=True)
print(EP_EN.shape[0], EP_ZH.shape[0], sep='\n')

1021
1016


In [30]:
# Sanity check: a second pass over the cleaned frames should flag nothing.
print(*(len(overlap_check(frame)) for frame in (EP_EN, EP_ZH)), sep='\n')

0
0


In [32]:
# Write the de-duplicated frames to disk, leaving the row index out of the files.
EP_EN.to_csv(path_or_buf='empathy_EN_clean.csv', index=False)
EP_ZH.to_csv(path_or_buf='empathy_ZH_clean.csv', index=False)

### Data Split for Training

In [41]:
# Reload the cleaned files produced by the cleaning stage.
df_EN = pd.read_csv('empathy_EN_clean.csv')
df_ZH = pd.read_csv('empathy_ZH_clean.csv')

# Percentage of rows per empathy score - slightly skewed at this point.
print(df_EN['empathy_score'].value_counts(normalize=True).mul(100))
print(df_ZH['empathy_score'].value_counts(normalize=True).mul(100))

1    35.063663
2    34.965720
0    29.970617
Name: empathy_score, dtype: float64
2    35.137795
1    35.137795
0    29.724409
Name: empathy_score, dtype: float64


In [42]:
# Extract from each category
def balance_dataset(df, label_col='empathy_score', random_state=0):
    """Down-sample every label class to the size of the smallest class.

    Args:
        df: DataFrame containing the label column.
        label_col: Name of the column holding the class label.
        random_state: Seed passed to ``DataFrame.sample`` for each class, so
            the selection is reproducible.

    Returns:
        DataFrame with the same columns as ``df`` holding an equal number of
        rows for each label value present in ``df``, grouped by label in
        ascending label order. Rows keep their original index labels. An
        input without any labelled rows yields an empty frame.
    """
    counts = df[label_col].value_counts()
    if counts.empty:
        # Nothing to balance; return an empty frame with the same columns.
        return df.iloc[0:0].copy()

    min_len = int(counts.min())

    # Iterate over the labels actually present instead of assuming 0, 1, 2,
    # and concatenate once rather than growing a frame inside the loop.
    per_class = [
        df[df[label_col] == score].sample(min_len, random_state=random_state)
        for score in counts.sort_index().index
    ]

    return pd.concat(per_class)


In [51]:
# Down-sample both datasets so every empathy score is equally represented.
EN_balanced, ZH_balanced = balance_dataset(df_EN), balance_dataset(df_ZH)

# Sanity check: each score should now account for one third of the rows.
print(EN_balanced['empathy_score'].value_counts(normalize=True).mul(100))
print(ZH_balanced['empathy_score'].value_counts(normalize=True).mul(100))

0    33.333333
1    33.333333
2    33.333333
Name: empathy_score, dtype: float64
0    33.333333
1    33.333333
2    33.333333
Name: empathy_score, dtype: float64


In [45]:
# Export full balanced dataset to csv
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('balanced', exist_ok=True)
# The row index is written as the first (unnamed) column of each file.
EN_balanced.to_csv('balanced/EN_full.csv')
ZH_balanced.to_csv('balanced/ZH_full.csv')

In [52]:
# Drop the three per-annotator score columns in place (intended to leave only
# the response and the empathy score).
EN_balanced.drop(columns=['annotator1_score', 'annotator2_score', 'annotator3_score'], inplace=True)
ZH_balanced.drop(columns=['annotator1_score', 'annotator2_score', 'annotator3_score'], inplace=True)

# Mapping from the current column headings to the new ones.
newlabels = {'response': 'text', 'empathy_score': 'labels'}

EN_balanced = EN_balanced.rename(columns=newlabels)
ZH_balanced = ZH_balanced.rename(columns=newlabels)

In [56]:
# Train/test split for EN: 90% train, 10% test, stratified on the label column.
EN_train, EN_test = train_test_split(
    EN_balanced,
    stratify=EN_balanced['labels'],
    test_size=0.1,
    shuffle=True,
    random_state=0,
)

# Check: per-label row counts of the train and test partitions.
print(EN_train['labels'].value_counts(), EN_test['labels'].value_counts(), sep='\n')

2    276
1    275
0    275
Name: labels, dtype: int64
0    31
1    31
2    30
Name: labels, dtype: int64


In [55]:
# ZH split: 80% train, 10% validation, 10% test.
# Step 1: hold out 20% of the rows (kept in ZH_test for the moment).
ZH_train, ZH_test = train_test_split(
    ZH_balanced,
    stratify=ZH_balanced['labels'],
    test_size=0.2,
    shuffle=True,
    random_state=0,
)
# Step 2: halve the held-out rows into the validation and final test sets.
ZH_val, ZH_test = train_test_split(
    ZH_test,
    stratify=ZH_test['labels'],
    test_size=0.5,
    shuffle=True,
    random_state=0,
)

# Check: per-label row counts of the three partitions.
print(
    ZH_train['labels'].value_counts(),
    ZH_val['labels'].value_counts(),
    ZH_test['labels'].value_counts(),
    sep='\n',
)

2    242
1    241
0    241
Name: labels, dtype: int64
1    31
2    30
0    30
Name: labels, dtype: int64
0    31
2    30
1    30
Name: labels, dtype: int64


In [58]:
# Final Train Set
EN_ZH_train = pd.concat([EN_train, ZH_train])
# Shuffle the combined rows. The seed is fixed (0, matching the other sampling
# and splitting steps in this notebook) so that re-running the notebook
# reproduces the same row order.
EN_ZH_train = EN_ZH_train.sample(frac=1, random_state=0).reset_index(drop=True)

# Check
print(EN_ZH_train['labels'].value_counts())

2    518
1    516
0    516
Name: labels, dtype: int64


In [59]:
# Export to CSV
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('balanced', exist_ok=True)
# The row index is written as the first (unnamed) column of each file.
EN_train.to_csv('balanced/EN_train.csv')
ZH_train.to_csv('balanced/ZH_train.csv')
EN_ZH_train.to_csv('balanced/EN_ZH_train.csv')
EN_test.to_csv('balanced/EN_test.csv')
ZH_val.to_csv('balanced/ZH_val.csv')
ZH_test.to_csv('balanced/ZH_test.csv')
