In [3]:
import os
import random
import numpy as np
import pandas as pd
import torch
import transformers
from datetime import datetime
from datasets import load_dataset

In [4]:
seed = 42

hf_data_id = 'google-research-datasets/go_emotions'
hf_data_config = 'raw'
prev_sampled_data_path = 'go_emotions_fear_single_227_comb_183_translated_cleaned_curated.csv'

selected_emotion = 'fear'
num_samples_single = 233 - 227
num_samples_comb = 196 - 183
save_path = f'go_emotions_{selected_emotion}_single_{num_samples_single}_comb_{num_samples_comb}.csv'
print("CSV save path:", save_path)

CSV save path: go_emotions_fear_single_6_comb_13.csv


In [5]:
def set_seed(seed):
    # Set random seed for NumPy
    np.random.seed(seed)

    # Set random seed for Torch
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if using multi-GPU
    torch.backends.cudnn.deterministic = True  # Ensures deterministic results
    torch.backends.cudnn.benchmark = False  # Avoids non-deterministic algorithms

    # Set random seed for Transformers
    transformers.set_seed(seed)

    # Optionally set random seed for sklearn and Python's own random module
    random.seed(seed)

    # Set random seed for os
    os.environ['PYTHONHASHSEED'] = str(seed)

    print(f"Random seed set to: {seed}")

set_seed(seed)

Random seed set to: 42


In [6]:
dataset = load_dataset(hf_data_id, hf_data_config, split='train')
print(dataset)

Dataset({
    features: ['text', 'id', 'author', 'subreddit', 'link_id', 'parent_id', 'created_utc', 'rater_id', 'example_very_unclear', 'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral'],
    num_rows: 211225
})


In [7]:
df = pd.DataFrame(dataset)
print("DF size:", len(df))
df.head()

DF size: 211225


Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381000.0,1,False,0,...,0,0,0,0,0,0,0,1,0,0
1,>sexuality shouldn’t be a grouping category I...,eemcysk,TheGreen888,unpopularopinion,t3_ai4q37,t3_ai4q37,1548084000.0,37,False,0,...,0,0,0,0,0,0,0,0,0,0
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,t3_abru74,t1_ed2m7g7,1546428000.0,37,False,0,...,0,0,0,0,0,0,0,0,0,1
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,t3_ahulml,t3_ahulml,1547965000.0,18,False,0,...,1,0,0,0,0,0,0,0,0,0
4,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,American_Fascist713,starwarsspeculation,t3_ackt2f,t1_eda65q2,1546669000.0,2,False,0,...,0,0,0,0,0,0,0,0,0,1


In [8]:
# cols =  ['text', 'id', 'author', 'subreddit', 'link_id', 'parent_id', 'created_utc', 'rater_id', 'example_very_unclear', 'admiration', 
#          'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 
#          'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 
#          'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral'],

emotions = ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 
            'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 
            'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']
valid_emotions = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']
invalid_emotions = list(set(emotions) - set(valid_emotions))

print("Valid emotions:", valid_emotions)
print("Invalid emotions:", invalid_emotions)

Valid emotions: ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']
Invalid emotions: ['desire', 'approval', 'realization', 'nervousness', 'disapproval', 'amusement', 'caring', 'admiration', 'disappointment', 'annoyance', 'embarrassment', 'pride', 'confusion', 'grief', 'neutral', 'remorse', 'optimism', 'gratitude', 'relief', 'love', 'curiosity', 'excitement']


In [9]:
# Filter data with active invalid emotions
df_filtered = df[~df[invalid_emotions].eq(1).any(axis=1)]
print("Filtered DF size:", len(df_filtered))

Filtered DF size: 25771


In [10]:
# Select data with active selected emotion
df_selected = df[df[selected_emotion] == 1][['id', 'text'] + valid_emotions]
print("Selected DF size:", len(df_selected))
df_selected.head()

Selected DF size: 3197


Unnamed: 0,id,text,anger,disgust,fear,joy,sadness,surprise
336,edt2mxr,Yes. One of her fingers is getting a sore on i...,0,0,1,0,0,0
410,ee91zfs,I watched a really bad horror movie she starre...,0,0,1,0,0,0
449,eet8ikg,oh my [NAME]. it was hurting my brain. i only ...,0,0,1,0,0,0
521,edouv2b,NEXT TIME! ON THE CURSE OF OAK ISLAND!,0,0,1,0,0,0
639,ee8otcl,Yeah it’s not like I want my cat to lick me in...,0,0,1,0,0,0


### Clean Data Before Sampling

In [11]:
df_sampled_prev = pd.read_csv(prev_sampled_data_path)
df_sampled_prev.head()

Unnamed: 0,id,curation_status,text,text_translated,anger,disgust,fear,joy,sadness,surprise,neutral,num_emotions
0,eerqv7t,1,But how's the breathing through the nose. That...,Tapi kumaha ieu engap liwat irung. Éta pertany...,0,0,1,0,0,0,0,1
1,edm6gdi,1,"Im at the game,somebody please put me out of m...","Abdi di buruan, wios hampura, tulung bébaskeun...",0,0,1,0,0,0,0,1
2,ednyk0x,1,I was afraid things would if we kept seeing ea...,Kuring sieun hal-hal bakal jadi hese lamun ura...,0,0,1,0,0,0,0,1
3,ed4bf6i,1,Yeah that’s one of the things I’m most worried...,"Heueuh, éta salah sahiji hal anu kuring paling...",0,0,1,0,0,0,0,1
4,edfr8bs,1,this off season is dark and full of terrors,Musim luar ieu poék jeung pinuh ku kasieun.,0,0,1,0,0,0,0,1


In [22]:
df_cleaned = df_selected[~df_selected['id'].isin(df_sampled_prev['id'])]
print("Cleaned DF size (after cleaning prev. sampled data):", len(df_cleaned))

Cleaned DF size (after cleaning prev. sampled data): 2531


In [23]:
print("Num. of duplicates by ID:", df_cleaned['id'].duplicated().sum())
print("Num. of duplicates by text:", df_cleaned['text'].duplicated().sum())
df_cleaned = df_cleaned.drop_duplicates(subset=['text'], keep='first')
print("Num. of duplicates by ID (after cleaning duplicates):", df_cleaned['id'].duplicated().sum())
print("Num. of duplicates by text (after cleaning duplicates):", df_cleaned['text'].duplicated().sum())
print("Cleaned DF size (after cleaning duplicates):", len(df_cleaned))

Num. of duplicates by ID: 799
Num. of duplicates by text: 803
Num. of duplicates by ID (after cleaning duplicates): 0
Num. of duplicates by text (after cleaning duplicates): 0
Cleaned DF size (after cleaning duplicates): 1728


In [25]:
cond_r = df_cleaned['text'].str.contains("r/")
cond_u = df_cleaned['text'].str.contains("/u/")
cond_name = df_cleaned['text'].str.contains("NAME]")
# cond_double = df_cleaned['text'].str.contains(r'(.)\1{2,}')

df_cleaned = df_cleaned[~(cond_r | cond_u | cond_name)]
print("Cleaned DF size:", len(df_cleaned))
# df_cleaned.head()

Cleaned DF size: 1438


### Sample Data

In [30]:
def create_num_emotions_col(row):
    return sum(row[valid_emotions].tolist())

df_cleaned.apply(create_num_emotions_col, axis=1)
df_cleaned['num_emotions'] = df_cleaned.apply(create_num_emotions_col, axis=1)

df_single = df_cleaned[(df_cleaned[selected_emotion] == 1) & (df_cleaned['num_emotions'] == 1)]
df_comb = df_cleaned[(df_cleaned[selected_emotion] == 1) & (df_cleaned['num_emotions'] > 1)]

print("Total data with single emotion:", len(df_single))
print("Total data with combined emotions:", len(df_comb))

Total data with single emotion: 1369
Total data with combined emotions: 69


In [36]:
df_sampled_single = df_single.sample(n=num_samples_single*2, random_state=seed)
df_sampled_comb = df_comb.sample(n=num_samples_comb*2, random_state=seed)
df_sampled = pd.concat([df_sampled_single, df_sampled_comb])[['id', 'text'] + valid_emotions]
df_sampled['neutral'] = 0

print("Total sampled data with single emotion:", len(df_sampled_single))
print("Total sampled data with combined emotions:", len(df_sampled_comb))
print("Total sampled data:", len(df_sampled))
df_sampled.head()

Total sampled data with single emotion: 12
Total sampled data with combined emotions: 26
Total sampled data: 38


Unnamed: 0,id,text,anger,disgust,fear,joy,sadness,surprise,neutral
49437,ef7uazr,I miss mass Rez her new ult is horrendous,0,0,1,0,0,0,0
154820,ef7vmo8,The way that ladder is attached to the buildin...,0,0,1,0,0,0,0
82973,edpcjyf,Cause emotions are difficult and people fear r...,0,0,1,0,0,0,0
67705,ed4ucik,You should have said “might even have enough f...,0,0,1,0,0,0,0
14110,eev7tjm,Now I'm worried that cat will get salmonella :(,0,0,1,0,0,0,0


In [37]:
df_sampled.to_csv(save_path, index=False)
print("Saved to:", save_path)

Saved to: go_emotions_fear_single_6_comb_13.csv
