In [1]:
import pandas as pd

In [None]:
from datasets import load_dataset
dataset_dict = load_dataset("cirimus/super-emotion")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 439361/439361 [00:01<00:00, 339481.26 examples/s]
Generating validation split: 100%|██████████| 54835/54835 [00:00<00:00, 359213.43 examples/s]
Generating test split: 100%|██████████| 58625/58625 [00:00<00:00, 972535.73 examples/s]


In [15]:
train_dataset = dataset_dict['train']
validation_dataset = dataset_dict['validation']
test_dataset = dataset_dict['test']

In [17]:
print(f"Training set size: {len(train_dataset)} rows")
print(f"Validation set size: {len(validation_dataset)} rows")
print(f"Test set size: {len(test_dataset)} rows")

Training set size: 439361 rows
Validation set size: 54835 rows
Test set size: 58625 rows


In [18]:
df = pd.DataFrame(train_dataset)
df.head()

Unnamed: 0,text,labels,labels_str,labels_source,source
0,also I was the point person on my companys tr...,[4],[Neutral],[neutral],MELD
1,You mustve had your hands full.,[4],[Neutral],[neutral],MELD
2,That I did. That I did.,[4],[Neutral],[neutral],MELD
3,So lets talk a little bit about your duties.,[4],[Neutral],[neutral],MELD
4,My duties? All right.,[6],[Surprise],[surprise],MELD


In [32]:
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 20000)

In [34]:
df.shape

(439361, 5)

In [35]:
# Define the emotions we want to keep
desired_emotions = ['joy', 'sadness', 'anger', 'fear', 'love', 'neutral', 'surprise']

# Convert list format to string and filter rows
df['labels_source'] = df['labels_source'].str[0]  # Convert ['emotion'] to 'emotion'
df = df[df['labels_source'].isin(desired_emotions)]

# Verify the results
print("Emotion distribution after filtering:")
print(df['labels_source'].value_counts())

Emotion distribution after filtering:
labels_source
joy         122631
sadness     107484
anger        53301
fear         41454
love         33581
neutral      24443
surprise     16273
Name: count, dtype: int64


In [36]:
df.shape

(399167, 5)

In [37]:
# Print original distribution
print("Original distribution:")
print(df['labels_source'].value_counts())
print("\nOriginal total samples:", len(df))

# Set maximum samples per class
MAX_SAMPLES = 30000

# Balance the dataset
balanced_dfs = []
for emotion in df['labels_source'].unique():
    # Get all samples for this emotion
    emotion_df = df[df['labels_source'] == emotion]
    
    # If we have more than MAX_SAMPLES, randomly sample MAX_SAMPLES
    if len(emotion_df) > MAX_SAMPLES:
        emotion_df = emotion_df.sample(n=MAX_SAMPLES, random_state=42)
    
    balanced_dfs.append(emotion_df)

# Combine all balanced emotion dataframes
df_balanced = pd.concat(balanced_dfs, ignore_index=True)

# Print new distribution
print("\nBalanced distribution:")
print(df_balanced['labels_source'].value_counts())
print("\nBalanced total samples:", len(df_balanced))

# Replace the original dataframe with balanced one
df = df_balanced

Original distribution:
labels_source
joy         122631
sadness     107484
anger        53301
fear         41454
love         33581
neutral      24443
surprise     16273
Name: count, dtype: int64

Original total samples: 399167

Balanced distribution:
labels_source
fear        30000
joy         30000
sadness     30000
anger       30000
love        30000
neutral     24443
surprise    16273
Name: count, dtype: int64

Balanced total samples: 190716

Balanced distribution:
labels_source
fear        30000
joy         30000
sadness     30000
anger       30000
love        30000
neutral     24443
surprise    16273
Name: count, dtype: int64

Balanced total samples: 190716


In [40]:
# Check final shape of the balanced dataset
df.shape

(190716, 5)

In [41]:
df.columns

Index(['text', 'labels', 'labels_str', 'labels_source', 'source'], dtype='object')

In [48]:
df['labels_source'].value_counts()

labels_source
fear        30000
joy         30000
sadness     30000
anger       30000
love        30000
neutral     24443
surprise    16273
Name: count, dtype: int64

In [52]:
df_new = df[['text', 'labels_source']]
df_new.head()

Unnamed: 0,text,labels_source
0,also I was the point person on my companys tr...,neutral
1,You mustve had your hands full.,neutral
2,That I did. That I did.,neutral
3,So lets talk a little bit about your duties.,neutral
4,"Now youll be heading a whole division, so you...",neutral


In [54]:
df_new.shape

(190716, 2)

In [57]:
df_new.rename(columns={'labels_source':'emotions'}, inplace=True)
df_new.columns

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new.rename(columns={'labels_source':'emotions'}, inplace=True)


Index(['text', 'emotions'], dtype='object')

In [59]:
df_new.shape

(190716, 2)

In [61]:
df_new.to_csv('./../data/superemotion.csv', sep=',',index=False)

In [65]:
df_new['emotions'].value_counts()

emotions
fear        30000
joy         30000
sadness     30000
anger       30000
love        30000
neutral     24443
surprise    16273
Name: count, dtype: int64