Loading the dataset

In [2]:
#importing the necessary libraries
import pandas as pd 
from datasets import load_dataset 

# Fetch the GoEmotions dataset through the Hugging Face `datasets` library.
# NOTE: despite its name, `df` is not a pandas DataFrame; the cells below
# index it by split name (e.g. df['train'], df['validation']).
df = load_dataset(path="google-research-datasets/go_emotions")

In [3]:
# Exploring the training set: display its first example to see the record
# layout (the output below shows the fields 'text', 'labels' and 'id').
df['train'][0]

{'text': "My favourite food is anything I didn't have to cook myself.",
 'labels': [27],
 'id': 'eebbqej'}

In [6]:
# Display the first example of the validation split for comparison with the
# training example shown earlier.
df['validation'][0]

{'text': 'Is this in New Orleans?? I really feel like this is New Orleans.',
 'labels': [27],
 'id': 'edgurhb'}

In [None]:
# Mapping from label id to emotion name.
# Keys are the integer label ids written as strings, so callers must look up
# with str(label). Values follow the order of the official GoEmotions list:
# https://github.com/google-research/google-research/blob/756ae45c4880ad6a01869608250d85a8fb253799/goemotions/data/emotions.txt

label_index = {
    "0": "admiration",
    "1": "amusement",
    "2": "anger",
    "3": "annoyance",
    "4": "approval",
    "5": "caring",
    "6": "confusion",
    "7": "curiosity",
    "8": "desire",
    "9": "disappointment",
    "10": "disapproval",
    "11": "disgust",
    "12": "embarrassment",  # spelling fixed (was "embarassment") to match emotions.txt
    "13": "excitement",
    "14": "fear",
    "15": "gratitude",
    "16": "grief",
    "17": "joy",
    "18": "love",
    "19": "nervousness",
    "20": "optimism",
    "21": "pride",
    "22": "realization",
    "23": "relief",
    "24": "remorse",
    "25": "sadness",
    "26": "surprise",
    "27": "neutral"
}

In [31]:
# Inspect a multi-label example (row 998 of the training split).
# Indexes df['train'] directly so the cell also works on a fresh kernel:
# `train_subset` is only created in a later cell, and since it is the first
# 1000 training rows, row 998 is the same record either way.
df['train'][998]

{'text': 'Oh god yes. Top quality cringe. Thank you for sharing',
 'labels': [4, 15],
 'id': 'eemffjk'}

In [32]:
# Build `comments_df`: the first 1000 training comments, each paired with the
# names of its emotions instead of the numeric label ids.

# Rows 0-999 of the training split.
train_subset = df['train'].select(range(1000))

# One record per comment. Every integer label id is looked up in label_index
# (which is keyed by the id as a string) and the resulting names are joined
# into a single ", "-separated string.
data_list = [
    {
        'comment': row['text'],
        'emotion labels': ", ".join(
            label_index[str(label_id)] for label_id in row['labels']
        ),
    }
    for row in train_subset
]

# Turn the list of records into a two-column DataFrame.
comments_df = pd.DataFrame(data_list)



In [48]:
# Display 10 randomly chosen rows of the dataframe (a random sample, not the
# first 10 rows; use comments_df.head(10) for those).
# random_state is fixed so the same rows are shown on every run.
comments_df.sample(10, random_state=42)

Unnamed: 0,comment,emotion labels
329,I have dayz but haven’t played in years. Last ...,amusement
763,Is that the dude from spiderman?,curiosity
647,"Wow this was an eye-opening response, and you’...","admiration, approval, surprise"
237,I wanted to see him show out over Kemba.,"desire, neutral"
401,[NAME] looked terrible that game. I was really...,fear
409,"Same here man, terrible feeling",fear
954,"Stop breaking the rules, you're still posting ...",neutral
0,My favourite food is anything I didn't have to...,neutral
889,I've used tofu and nutritional yeast before. I...,approval
968,deliberately getting in someone's face while t...,neutral


In [None]:
# Tally how often each emotion name occurs in comments_df and report any
# emotion from label_index that never appears (a quick coverage check).

banner_rule = "=" * 60

# Start every known emotion at zero so unused ones still show up in the tally.
emotion_counts = dict.fromkeys(label_index.values(), 0)

# Each cell holds one or more emotion names separated by ", ".
for joined_names in comments_df['emotion labels']:
    for name in joined_names.split(', '):
        emotion_counts[name] = emotion_counts[name] + 1

# Report the counts in alphabetical order of emotion name.
print(banner_rule)
print("COUNT OF ALL EMOTIONS IN THE DATAFRAME:")
print(banner_rule)
for name in sorted(emotion_counts):
    print(f"{name}: {emotion_counts[name]}")

# Emotions whose tally stayed at zero.
missing_emotions = [name for name in emotion_counts if emotion_counts[name] == 0]

print("\n" + banner_rule)
print("MISSING EMOTIONS:")
print(banner_rule)
if missing_emotions:
    print("The following emotions are missing:", missing_emotions)
else:
    print("All emotions are present at least once.")
print(banner_rule)



COUNT OF ALL EMOTIONS IN THE DATAFRAME:
admiration: 99
amusement: 61
anger: 41
annoyance: 57
approval: 67
caring: 21
confusion: 33
curiosity: 50
desire: 13
disappointment: 34
disapproval: 49
disgust: 13
embarassment: 6
excitement: 13
fear: 10
gratitude: 63
grief: 6
joy: 39
love: 47
nervousness: 3
neutral: 336
optimism: 29
pride: 1
realization: 19
relief: 2
remorse: 9
sadness: 32
surprise: 35

MISSING EMOTIONS:
All emotions are present at least once.
