<a href="https://colab.research.google.com/github/advait2811/emotion_and_sentiment_evolution/blob/main/prep_goemotions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
# ===========================
# Emotion Evolution Baseline
# ===========================

# Step 0: Install deps
!pip install datasets transformers scikit-learn -q

# ---------------------------
# Step 1: Prepare Dataset
# ---------------------------
from datasets import load_dataset

print("📥 Loading GoEmotions dataset...")
ds = load_dataset("go_emotions")

# Original 27 emotion labels from Google Research GoEmotions
goem_labels = [
    "admiration","amusement","anger","annoyance","approval","caring","confusion",
    "curiosity","desire","disappointment","disapproval","disgust","embarrassment",
    "excitement","fear","gratitude","grief","joy","love","nervousness","optimism",
    "pride","relief","remorse","sadness","surprise","neutral"
]
print(f"✅ Loaded {len(goem_labels)} original labels")

# Map 27 → 5 emotions
label_map = {
    "admiration":"joy","amusement":"joy","anger":"anger","annoyance":"anger",
    "approval":"joy","caring":"neutral","confusion":"neutral","curiosity":"neutral",
    "desire":"neutral","disappointment":"sadness","disapproval":"anger","disgust":"anger",
    "embarrassment":"neutral","excitement":"joy","fear":"fear","gratitude":"joy",
    "grief":"sadness","joy":"joy","love":"joy","nervousness":"fear",
    "optimism":"joy","pride":"joy","relief":"joy","remorse":"sadness",
    "sadness":"sadness","surprise":"neutral","neutral":"neutral"
}
target_labels = ["joy","sadness","anger","fear","neutral"]

map_idx = {i: target_labels.index(label_map[goem_labels[i]]) for i in range(len(goem_labels))}

# Process dataset
def process(example):
    if example["labels"]:
        # Filter out invalid label indices (outside the range 0-26)
        valid_labels = [i for i in example["labels"] if 0 <= i < len(goem_labels)]
        if valid_labels:
            mapped = [map_idx[i] for i in valid_labels]
            example["label"] = mapped[0]   # take primary
        else:
            # Handle cases where all labels are invalid
            example["label"] = target_labels.index("neutral")
    else:
        example["label"] = target_labels.index("neutral")
    return example

print("🔄 Processing dataset...")
ds = ds.map(process)
ds = ds.remove_columns(["labels","id"])
ds = ds.rename_column("label","labels")

print(ds)

# ---------------------------
# Step 2: Train Baseline Model
# ---------------------------
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# Labels
id2label = {0:"joy",1:"sadness",2:"anger",3:"fear",4:"neutral"}
label2id = {v:k for k,v in id2label.items()}

# Tokenizer
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=128)

ds = ds.map(tokenize, batched=True)
ds = ds.remove_columns(["text"])
ds.set_format("torch")

# Model
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=5,
    id2label=id2label,
    label2id=label2id
)

# Metrics
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="macro")
    return {"accuracy":acc,"macro_f1":f1}

# Training args
training_args = TrainingArguments(
    output_dir="./baseline_model",
    eval_strategy="epoch", # Changed from evaluation_strategy to eval_strategy
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    logging_dir="./logs",
    logging_steps=50,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

print("🚀 Starting training...")
trainer.train()

print("📊 Evaluating on test set...")
results = trainer.evaluate(ds["test"])
print("✅ Test Results:", results)

📥 Loading GoEmotions dataset...
✅ Loaded 27 original labels
🔄 Processing dataset...
DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 43410
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 5426
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 5427
    })
})


Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


🚀 Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,0.6371,0.684847,0.73756,0.642385
2,0.5876,0.698967,0.730925,0.641417
3,0.5251,0.738415,0.728345,0.624624


📊 Evaluating on test set...


✅ Test Results: {'eval_loss': 0.6913541555404663, 'eval_accuracy': 0.7307904919845218, 'eval_macro_f1': 0.6347644287690642, 'eval_runtime': 19.2747, 'eval_samples_per_second': 281.56, 'eval_steps_per_second': 17.64, 'epoch': 3.0}
