In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np

# Step 1: Load the three dataset splits.
train_file = 'empatheticdialogues/train.csv'
val_file = 'empatheticdialogues/valid.csv'
test_file = 'empatheticdialogues/test.csv'

# NOTE: on_bad_lines='skip' makes pandas silently omit rows it cannot parse,
# so the loaded frames may hold fewer rows than the files on disk.
train_df, val_df, test_df = (
    pd.read_csv(csv_path, on_bad_lines='skip')
    for csv_path in (train_file, val_file, test_file)
)

# Row labels removed from the training split.
# NOTE(review): presumably rows found to be malformed on inspection — confirm.
non_standard_indices = [2355, 36628, 49433, 56957, 65019]
train_df = train_df.drop(labels=non_standard_indices, axis='index')
train_df.shape


(76663, 8)

In [2]:
# List the distinct values of the 'tags' column (missing entries show up as NaN).
train_df.loc[:, 'tags'].unique()

array([nan, '<HI>', '<HI> <UNIGRAM>', '<UNIGRAM>', '<POLITICAL>',
       '<UNIGRAM> <NUMERAL>', '<IRREGULAR_COLON_FORMAT>'], dtype=object)

In [3]:
# Replace every missing value in the training frame with 0 (all columns, text ones included).
train_df = train_df.fillna(value=0)


In [4]:

# Print the current (rows, columns) shape of the training frame.
print(f'Cleaned DataFrame shape: {train_df.shape}')

Cleaned DataFrame shape: (76663, 8)


In [5]:
# Overwrite the whole 'tags' column with the constant 0, discarding the original tag values.
train_df = train_df.assign(tags=0)

In [6]:
# Persist the training frame as CSV, leaving out the row-index column.
train_df.to_csv(path_or_buf='cleaned_empathetic_dialogues.csv', index=False)


In [7]:
# Check the storage dtype of the 'selfeval' column.
train_df.dtypes['selfeval']

dtype('O')

In [8]:
# Break each 'selfeval' string apart at the '|' separators, one piece per column.
selfeval_split = train_df['selfeval'].str.split(pat='|', expand=True)
selfeval_split

Unnamed: 0,0,1,2,3,4
0,5,5,5_2,2,5
1,5,5,5_2,2,5
2,5,5,5_2,2,5
3,5,5,5_2,2,5
4,5,5,5_2,2,5
...,...,...,...,...,...
76663,5,5,5_5,5,5
76664,5,5,5_5,5,5
76665,5,5,5_5,5,5
76666,5,5,5_5,5,5


In [9]:
# The third piece still holds two values joined by '_' (e.g. '5_2'); separate them.
CD_split = selfeval_split[2].str.split(pat='_', expand=True)
CD_split

Unnamed: 0,0,1
0,5,2
1,5,2
2,5,2
3,5,2
4,5,2
...,...,...
76663,5,5
76664,5,5
76665,5,5
76666,5,5


In [10]:
# Assemble the six individual values of 'selfeval' into one frame, in their
# original left-to-right order, under the column labels A through F.
selfeval_final = pd.DataFrame(
    data=dict(
        A=selfeval_split[0],
        B=selfeval_split[1],
        C=CD_split[0],  # part of the '_'-joined piece before the underscore
        D=CD_split[1],  # part of the '_'-joined piece after the underscore
        E=selfeval_split[3],
        F=selfeval_split[4],
    )
)

In [11]:
# Display the assembled six-column frame (values not yet converted to numeric).
selfeval_final

Unnamed: 0,A,B,C,D,E,F
0,5,5,5,2,2,5
1,5,5,5,2,2,5
2,5,5,5,2,2,5
3,5,5,5,2,2,5
4,5,5,5,2,2,5
...,...,...,...,...,...,...
76663,5,5,5,5,5,5
76664,5,5,5,5,5,5
76665,5,5,5,5,5,5
76666,5,5,5,5,5,5


In [12]:
# Cast every rating column to a numeric dtype; entries that cannot be parsed become NaN.
selfeval_final = selfeval_final.apply(
    lambda ratings: pd.to_numeric(ratings, errors='coerce')
)


In [13]:
# Attach the parsed rating columns (A-F) to the training frame.
#
# Any A-F columns left over from an earlier run of this cell are dropped first.
# Without this, a re-run would concatenate a second set of A-F columns and the
# de-duplication below would keep the *older* set, because duplicated() flags
# only the later occurrences. The freshly parsed values would be discarded.
# errors='ignore' makes this a no-op on the first run, when the columns are absent.
train_df = train_df.drop(columns=selfeval_final.columns, errors='ignore')
train_df = pd.concat([train_df, selfeval_final], axis=1)

# Guard against any other repeated column labels, keeping the first occurrence.
train_df = train_df.loc[:, ~train_df.columns.duplicated()]

# Display the original 'selfeval' string next to the columns derived from it.
print("\nDataFrame with separated 'selfeval' values (duplicates removed):")
print(train_df[['selfeval', 'A', 'B', 'C', 'D', 'E', 'F']].head())



DataFrame with separated 'selfeval' values (duplicates removed):
      selfeval    A    B    C    D    E    F
0  5|5|5_2|2|5  5.0  5.0  5.0  2.0  2.0  5.0
1  5|5|5_2|2|5  5.0  5.0  5.0  2.0  2.0  5.0
2  5|5|5_2|2|5  5.0  5.0  5.0  2.0  2.0  5.0
3  5|5|5_2|2|5  5.0  5.0  5.0  2.0  2.0  5.0
4  5|5|5_2|2|5  5.0  5.0  5.0  2.0  2.0  5.0


In [14]:
# Show the training frame, which now carries the numeric A-F columns.
train_df

Unnamed: 0,conv_id,utterance_idx,context,prompt,speaker_idx,utterance,selfeval,tags,A,B,C,D,E,F
0,hit:0_conv:1,1,sentimental,I remember going to the fireworks with my best...,1,I remember going to see the fireworks with my ...,5|5|5_2|2|5,0,5.0,5.0,5.0,2.0,2.0,5.0
1,hit:0_conv:1,2,sentimental,I remember going to the fireworks with my best...,0,Was this a friend you were in love with_comma_...,5|5|5_2|2|5,0,5.0,5.0,5.0,2.0,2.0,5.0
2,hit:0_conv:1,3,sentimental,I remember going to the fireworks with my best...,1,This was a best friend. I miss her.,5|5|5_2|2|5,0,5.0,5.0,5.0,2.0,2.0,5.0
3,hit:0_conv:1,4,sentimental,I remember going to the fireworks with my best...,0,Where has she gone?,5|5|5_2|2|5,0,5.0,5.0,5.0,2.0,2.0,5.0
4,hit:0_conv:1,5,sentimental,I remember going to the fireworks with my best...,1,We no longer talk.,5|5|5_2|2|5,0,5.0,5.0,5.0,2.0,2.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76663,hit:12424_conv:24848,5,sentimental,I found some pictures of my grandma in the att...,389,Yeah reminds me of the good old days. I miss ...,5|5|5_5|5|5,0,5.0,5.0,5.0,5.0,5.0,5.0
76664,hit:12424_conv:24849,1,surprised,I woke up this morning to my wife telling me s...,294,I woke up this morning to my wife telling me s...,5|5|5_5|5|5,0,5.0,5.0,5.0,5.0,5.0,5.0
76665,hit:12424_conv:24849,2,surprised,I woke up this morning to my wife telling me s...,389,Oh hey that's awesome! That is awesome right?,5|5|5_5|5|5,0,5.0,5.0,5.0,5.0,5.0,5.0
76666,hit:12424_conv:24849,3,surprised,I woke up this morning to my wife telling me s...,294,It is soooo awesome. We have been wanting a b...,5|5|5_5|5|5,0,5.0,5.0,5.0,5.0,5.0,5.0


In [15]:
# Optional: write the training frame, rating columns included, to a second CSV (no index column).
train_df.to_csv(path_or_buf='updated_empathetic_dialogues.csv', index=False)

In [16]:
# Sanity check: collect and print the distinct values left in the 'tags' column.
unique_tags = train_df.loc[:, 'tags'].unique()
print(unique_tags)


[0]
