## Load Libraries & Dataset

In [1]:
from datasets import load_dataset
from collections import Counter
import pandas as pd
import math

# Load GoEmotions dataset (all splits)
dataset = load_dataset("go_emotions")

# The 28 GoEmotions emotion names; a name's position in the list is its integer class id
class_names = (
    "admiration amusement anger annoyance approval "
    "caring confusion curiosity desire disappointment "
    "disapproval disgust embarrassment excitement fear "
    "gratitude grief joy love nervousness "
    "optimism pride realization relief remorse "
    "sadness surprise neutral"
).split()

# Classes whose training count falls below this value are treated as minority classes
AUGMENTATION_THRESHOLD = 500

  from .autonotebook import tqdm as notebook_tqdm


## Count Single and Multi-label Samples Per Class

In [2]:

def compute_class_stats(dataset, split, add_augment_ratio=False):
    """Tally how often each class occurs in one split of the dataset.

    Occurrences are counted separately for samples carrying exactly one label
    and for samples carrying several labels. Samples whose ``labels`` field is
    not a list, or is an empty list, are ignored.

    Args:
        dataset: Mapping from split name to an iterable of examples, each
            exposing a ``'labels'`` entry.
        split (str): Name of the split to scan (train, validation, test).
        add_augment_ratio (bool): When True, add an ``'Augment Ratio'`` column
            holding ``ceil(AUGMENTATION_THRESHOLD / total)`` for classes below
            the threshold and 0 for all others.

    Returns:
        tuple: ``(stats_df, total_counts)`` where ``stats_df`` is a DataFrame
        sorted by ``'Class ID'`` and ``total_counts`` maps each observed class
        id to its single-label plus multi-label count.
    """
    single_hits = Counter()
    multi_hits = Counter()

    # Walk every example of the requested split
    for record in dataset[split]:
        sample_labels = record['labels']
        if not isinstance(sample_labels, list):
            continue
        if len(sample_labels) == 1:
            single_hits[sample_labels[0]] += 1
        elif len(sample_labels) > 1:
            multi_hits.update(sample_labels)

    # Only classes that were actually observed get an entry
    observed_ids = set(single_hits.keys()).union(set(multi_hits.keys()))
    total_counts = {
        cid: single_hits.get(cid, 0) + multi_hits.get(cid, 0)
        for cid in observed_ids
    }

    # Column data for the summary table, all in the same class order
    ordered_ids = list(total_counts.keys())
    columns = {
        'Class ID': ordered_ids,
        'Class Name': [class_names[cid] for cid in ordered_ids],
        'Single Label Count': [single_hits.get(cid, 0) for cid in ordered_ids],
        'Multi Label Count': [multi_hits.get(cid, 0) for cid in ordered_ids],
        'Total Count': [total_counts[cid] for cid in ordered_ids],
    }

    if add_augment_ratio:
        ratios = []
        for cid in ordered_ids:
            if total_counts[cid] < AUGMENTATION_THRESHOLD:
                ratios.append(math.ceil(AUGMENTATION_THRESHOLD / total_counts[cid]))
            else:
                ratios.append(0)
        columns['Augment Ratio'] = ratios

    stats_df = pd.DataFrame(columns).sort_values("Class ID")
    return stats_df, total_counts

# Per-split statistics. Only the training split gets the augment-ratio column,
# and its per-class totals are kept for the augmentation steps further down.
train_stats, total_counts = compute_class_stats(dataset, "train", add_augment_ratio=True)
val_stats, _ = compute_class_stats(dataset, "validation")
test_stats, _ = compute_class_stats(dataset, "test")

# Print every table under its heading
for heading, stats_table in (
    ("Train Split Statistics (with Augment Ratio):", train_stats),
    ("\nValidation Split Statistics (no augmentation):", val_stats),
    ("\nTest Split Statistics (no augmentation):", test_stats),
):
    print(heading)
    print(stats_table)


Train Split Statistics (with Augment Ratio):
    Class ID      Class Name  Single Label Count  Multi Label Count  \
0          0      admiration                2710               1420   
1          1       amusement                1652                676   
2          2           anger                1025                542   
3          3       annoyance                1451               1019   
4          4        approval                1873               1066   
5          5          caring                 649                438   
6          6       confusion                 858                510   
7          7       curiosity                1389                802   
8          8          desire                 389                252   
9          9  disappointment                 709                560   
10        10     disapproval                1402                620   
11        11         disgust                 498                295   
12        12   embarrassment    

## Select Minority Classes from the Table

In [3]:
# Minority classes are the rows of the train table with a non-zero augment ratio
minority_classes = set(train_stats.loc[train_stats['Augment Ratio'] > 0, 'Class ID'])
print(f"Minority classes selected for augmentation: {minority_classes}")

def should_augment_sample(labels, minority_classes):
    """Return True when at least one of the sample's labels is a minority class.

    Args:
        labels: Iterable of class ids attached to the sample.
        minority_classes: Container of class ids selected for augmentation.

    Returns:
        bool: True if any label is in ``minority_classes``, otherwise False
        (including when ``labels`` is empty).
    """
    for label in labels:
        if label in minority_classes:
            return True
    return False

# Training split is the only one that gets augmented
train_data = dataset["train"]

# Keep every training example that carries at least one minority label
augmentation_candidates = [
    example
    for example in train_data
    if should_augment_sample(example['labels'], minority_classes)
]

print(f"Number of augmentation candidates selected: {len(augmentation_candidates)}")


Minority classes selected for augmentation: {12, 16, 19, 21, 23}


Number of augmentation candidates selected: 803


### Build the Prompt

In [4]:
from openai import OpenAI
import os
from dotenv import load_dotenv
import json
import time
import openai
import json
from random import randint
from typing import List


In [5]:
# Read the .env file, then pick the API key up from the environment
load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")

# Single OpenAI client shared by the paraphrase helpers below
client = OpenAI(api_key=api_key)

In [7]:
def build_prompt(text, upsample_ratio, labels):
    """
    Assemble the paraphrasing prompt sent to the model.

    Args:
        text: Sentence to be rephrased; inserted verbatim between double quotes.
        upsample_ratio: Number of paraphrases requested in the prompt.
        labels: Iterable of emotion labels; each one is converted with ``str``
            and the results are joined with ", ".

    Returns:
        str: The complete prompt, ending with the JSON layout the model is
        asked to follow.
    """
    joined_labels = ", ".join(map(str, labels))

    # Doubled braces are literal braces of the JSON example; single braces are fields
    template = """You are an expert at emotional writing and paraphrasing.

Your task is to generate {upsample_ratio} different rephrased versions of the following sentence.
Make sure to preserve the original emotional meaning, which is described by these emotion labels: {label_str}.
Do NOT remove or change the emotions. Only rephrase the sentence in different words, keeping the tone and emotional meaning intact.

---

Now, here is your task:
Emotion labels: {label_str}
Original sentence:
"{text}"

Generate {upsample_ratio} paraphrases in a consistent JSON format, where each paraphrase is an entry in the list:

{{
    "paraphrases": [
        {{"paraphrase_1": "<paraphrase_1_text>"}},
        {{"paraphrase_2": "<paraphrase_2_text>"}},
        ...
        {{"paraphrase_{upsample_ratio}": "<paraphrase_n_text>"}}
    ]
}}
"""
    return template.format(
        upsample_ratio=upsample_ratio,
        label_str=joined_labels,
        text=text,
    )

def generate_paraphrases_gpt(text, upsample_ratio, labels, model="gpt-4.1-nano", temperature=0.7,
                             max_tokens=1000):
    """
    Calls the OpenAI chat completions endpoint once to generate paraphrases.

    No retry or validation is performed here; the reply is returned as raw text.

    Args:
        text (str): The original sentence.
        upsample_ratio (int): Number of paraphrases to generate.
        labels (list): List of emotional labels inserted into the prompt.
        model (str): Name of the chat model to use.
        temperature (float): Controls randomness.
        max_tokens (int): Upper bound on the length of the reply. Raise it when
            requesting many paraphrases so the JSON is not cut off mid-way.

    Returns:
        str: Raw reply text with surrounding whitespace removed, or an empty
        string when the reply carries no text content.
    """
    prompt = build_prompt(text, upsample_ratio, labels)

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
        max_tokens=max_tokens,
    )

    # The message content can be None (the response object also carries a
    # separate `refusal` field), so guard before calling .strip().
    content = response.choices[0].message.content
    return (content or "").strip()



In [8]:
# Smoke test: request paraphrases for one hand-written sentence tagged with
# the emotion names "grief" and "sadness".
example_text = "I can't believe they're gone. It hurts so much."
example_labels = ["grief", "sadness"]
upsample_ratio = 3  # number of paraphrases requested from the model

# Single API call without retry or validation; prints the raw reply text
output = generate_paraphrases_gpt(example_text, upsample_ratio, example_labels)
print(output)


{
    "paraphrases": [
        {"paraphrase_1": "I still can't accept that they're no longer here. The pain is overwhelming."},
        {"paraphrase_2": "It's so hard to believe they're gone. The sadness cuts so deep."},
        {"paraphrase_3": "I struggle to grasp that they're absent. The hurt feels unbearable."}
    ]
}


### Data Augmentation

In [9]:
from datasets import load_dataset
import pandas as pd
import openai
import os
import json
import time
from random import randint
from typing import List
from dotenv import load_dotenv

# Load GoEmotions dataset
dataset = load_dataset("go_emotions")

# One handle per split (train, validation, test)
train_data, val_data, test_data = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

# Peek at the first 3 training samples for reference
print(train_data[:3])


{'text': ["My favourite food is anything I didn't have to cook myself.", 'Now if he does off himself, everyone will think hes having a laugh screwing with people instead of actually dead', 'WHY THE FUCK IS BAYLESS ISOING'], 'labels': [[27], [27], [2]], 'id': ['eebbqej', 'ed00q6i', 'eezlygj']}


In [10]:
import json
import ast

def safe_parse_json(paraphrases_json):
    """Decode the model's reply, tolerating common formatting quirks.

    Strategies are tried in order until one succeeds:
      1. ``json.loads`` on the reply, after removing a surrounding Markdown
         code fence (```` ```json ... ``` ````) if there is one.
      2. ``json.loads`` after undoing one level of backslash escaping, for
         replies that arrive double-escaped.
      3. ``ast.literal_eval``, for replies written as a Python literal
         (for example with single-quoted strings).

    Args:
        paraphrases_json (str): Raw reply text returned by the API.

    Returns:
        The decoded object (a dict for replies that follow the prompt's layout).

    Raises:
        ValueError: If the input is not a string or none of the strategies
            can decode it.
    """
    print("Raw content repr():", repr(paraphrases_json))

    if not isinstance(paraphrases_json, str):
        raise ValueError("Failed to parse paraphrases JSON from API output.")

    # Drop a Markdown code fence wrapped around the payload, if present
    candidate = paraphrases_json.strip()
    if candidate.startswith("```"):
        first_newline = candidate.find("\n")
        # The opening fence line may carry a language tag such as ```json
        candidate = candidate[first_newline + 1:] if first_newline != -1 else candidate[3:]
        candidate = candidate.rstrip()
        if candidate.endswith("```"):
            candidate = candidate[:-3]
        candidate = candidate.strip()

    # First attempt: direct parsing
    try:
        return json.loads(candidate)
    except json.JSONDecodeError:
        print("Direct json.loads() failed. Trying to unescape...")

    # Second attempt: undo double escaping. Encoding with latin-1 plus
    # 'backslashreplace' turns characters outside latin-1 into \uXXXX escapes
    # that 'unicode_escape' restores, so non-ASCII text (e.g. curly quotes)
    # survives the round trip instead of turning into mojibake.
    try:
        unescaped = candidate.encode('latin-1', 'backslashreplace').decode('unicode_escape')
        print("Unescaped content:", repr(unescaped))
        return json.loads(unescaped)
    except (json.JSONDecodeError, UnicodeDecodeError):
        print("Still failed. Trying ast.literal_eval as last resort...")

    # Last resort: the reply may be a Python literal rather than JSON
    try:
        return ast.literal_eval(candidate)
    except Exception as final_err:
        print("Parsing completely failed:", final_err)
        raise ValueError("Failed to parse paraphrases JSON from API output.") from final_err


In [11]:
def generate_paraphrases_gpt_with_retry(text: str, upsample_ratio: int, labels: List[str],
                                        model: str = "gpt-4.1-nano", temperature: float = 0.75,
                                        max_retries: int = 5, backoff_factor: float = 1.5):
    """Request paraphrases from the chat model, retrying until the reply is usable.

    A reply is accepted only if it parses, its ``"paraphrases"`` entry is a
    list of exactly ``upsample_ratio`` items, and the item at position ``i``
    (1-based) is a dict containing the key ``"paraphrase_<i>"``. Any error,
    whether from the API call, parsing or validation, counts as a failed
    attempt. Between attempts the function sleeps
    ``randint(1, 2) * backoff_factor ** attempt`` seconds.

    Args:
        text: The original sentence.
        upsample_ratio: Number of paraphrases to request and to expect back.
        labels: Emotion labels inserted into the prompt.
        model: Name of the chat model to use.
        temperature: Sampling temperature passed to the API.
        max_retries: Maximum number of attempts.
        backoff_factor: Base of the exponential back-off between attempts.

    Returns:
        list: The validated list of ``{"paraphrase_<i>": text}`` dicts, or an
        empty list when every attempt failed or ``max_retries`` is not positive.
    """
    for attempt in range(1, max_retries + 1):
        try:
            prompt = build_prompt(text, upsample_ratio, labels)

            response = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                temperature=temperature,
                max_tokens=1000
            )

            print("Full API Response:", response)
            content = response.choices[0].message.content
            if content is None:
                raise ValueError("API response contained no text content.")
            paraphrases_json = content.strip()
            print("Response Content as String:", paraphrases_json)

            # Use the safe parsing function
            parsed = safe_parse_json(paraphrases_json)
            if not isinstance(parsed, dict):
                raise ValueError(f"Expected a JSON object, got {type(parsed).__name__}.")
            paraphrases = parsed.get("paraphrases", [])

            print(f"Parsed Paraphrases: {paraphrases}")

            if not isinstance(paraphrases, list):
                raise ValueError(f"Expected a list of paraphrases, got {type(paraphrases).__name__}.")
            if len(paraphrases) != upsample_ratio:
                raise ValueError(f"Unexpected number of paraphrases. Expected {upsample_ratio}, got {len(paraphrases)}.")

            # Callers look entries up by their numbered key, so reject a reply
            # whose entries do not follow that layout and retry instead.
            for position, entry in enumerate(paraphrases, start=1):
                if not isinstance(entry, dict) or f"paraphrase_{position}" not in entry:
                    raise ValueError(f"Malformed paraphrase entry at position {position}: {entry!r}")

            return paraphrases

        except Exception as e:
            print(f"Error: {e}. Retrying {attempt}/{max_retries}...")
            if attempt >= max_retries:
                print("Max retries reached. Returning empty list.")
                return []
            time.sleep(randint(1, 2) * (backoff_factor ** attempt))

    # Reached only when max_retries <= 0: keep the documented list return type
    # instead of falling through with an implicit None.
    return []

# Usage in augment dataset function
def augment_single_sample_with_retry(example, total_counts):
    """Expand one training example into itself plus its generated paraphrases.

    The number of paraphrases requested is the largest entry of the global
    ``class_upsample_ratios`` among the example's minority labels, i.e. labels
    whose count in ``total_counts`` is below ``AUGMENTATION_THRESHOLD``.

    Args:
        example: Mapping with a ``'text'`` string and a ``'labels'`` list of
            class ids.
        total_counts: Mapping from class id to its training-set count. Labels
            missing from the mapping are treated as having a count of 0.

    Returns:
        list: ``[example]`` when no augmentation is needed; otherwise the
        original example followed by one ``{"text", "labels"}`` dict per usable
        paraphrase. The original is always kept, so a failed API call never
        removes real data from the training set.
    """
    text = example['text']
    example_labels = example['labels']

    # Calculate the upsample ratio for the current sample
    upsample_ratio = 0
    for label in example_labels:
        if total_counts.get(label, 0) < AUGMENTATION_THRESHOLD:  # Minor class
            upsample_ratio = max(upsample_ratio, class_upsample_ratios.get(label, 0))

    if upsample_ratio <= 0:
        return [example]  # No augmentation needed, return the original sample

    # The prompt is written in terms of emotion names, so translate integer
    # class ids before sending them; anything else is passed through as text.
    prompt_labels = [
        class_names[label] if isinstance(label, int) and 0 <= label < len(class_names) else str(label)
        for label in example_labels
    ]
    paraphrases = generate_paraphrases_gpt_with_retry(text, upsample_ratio, prompt_labels)

    # Original sample first, then its paraphrases (all with the same labels)
    augmented_samples = [example]
    for position, paraphrase in enumerate(paraphrases, start=1):
        if isinstance(paraphrase, dict):
            paraphrase_text = paraphrase.get(f"paraphrase_{position}")
            if paraphrase_text is None and paraphrase:
                # Tolerate a differently numbered key: take the entry's first value
                paraphrase_text = next(iter(paraphrase.values()))
        else:
            paraphrase_text = paraphrase
        if isinstance(paraphrase_text, str) and paraphrase_text.strip():
            augmented_samples.append({"text": paraphrase_text, "labels": example_labels})
    return augmented_samples


In [12]:
# Example usage of the retrying helper on one hand-written sentence
example_text = "I can't believe they're gone. It hurts so much."
example_labels = ["grief", "sadness"]
upsample_ratio = 7  # number of paraphrases requested, and the count the reply must match

# Returns the parsed list of paraphrase dicts, or an empty list once all retries fail
output = generate_paraphrases_gpt_with_retry(example_text, upsample_ratio, example_labels)
print(output)

Full API Response: ChatCompletion(id='chatcmpl-BYFWr129AfLPAy0ehHa8KXxbPYu8Q', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='{\n    "paraphrases": [\n        {"paraphrase_1": "I still can\'t accept that they\'re no longer here. The pain is overwhelming."},\n        {"paraphrase_2": "It’s hard to believe they’re gone. The sorrow is almost too much to bear."},\n        {"paraphrase_3": "I’m struggling to grasp that they’ve left us. The ache in my heart is so deep."},\n        {"paraphrase_4": "They’re gone, and I can\'t find words for how much it hurts inside."},\n        {"paraphrase_5": "It’s heartbreaking to realize they’re gone. The sadness feels endless."},\n        {"paraphrase_6": "I still can’t believe they’re gone. The pain cuts so deep into my soul."},\n        {"paraphrase_7": "They’re no longer here, and the grief is almost too much to handle."}\n    ]\n}', refusal=None, role='assistant', annotations=[], audio=None, funct

In [17]:
def augment_dataset(dataset, total_counts):
    """Build an augmented copy of a dataset by paraphrasing minority-class samples.

    Every example is copied to the output. Examples carrying at least one
    minority label (count in ``total_counts`` below ``AUGMENTATION_THRESHOLD``)
    are additionally followed by generated paraphrases; the number requested is
    the largest entry of the global ``class_upsample_ratios`` among those labels.

    Args:
        dataset: Iterable of examples, each a mapping with a ``'text'`` string
            and a ``'labels'`` list of class ids.
        total_counts: Mapping from class id to its training-set count. Labels
            missing from the mapping are treated as having a count of 0.

    Returns:
        list: Original examples in input order, each augmented example followed
        by ``{"text", "labels"}`` dicts for its usable paraphrases. Originals
        are always kept, so a failed API call never removes real data.
    """
    augmented_data = []

    # Iterate over the dataset
    for example in dataset:
        text = example['text']
        example_labels = example['labels']

        # The original sample is part of the output in every case
        augmented_data.append(example)

        # Calculate the upsample ratio for the current sample
        upsample_ratio = 0
        for label in example_labels:
            if total_counts.get(label, 0) < AUGMENTATION_THRESHOLD:  # Minor class
                upsample_ratio = max(upsample_ratio, class_upsample_ratios.get(label, 0))

        if upsample_ratio <= 0:
            continue

        # The prompt is written in terms of emotion names, so translate integer
        # class ids before sending them; anything else is passed through as text.
        prompt_labels = [
            class_names[label] if isinstance(label, int) and 0 <= label < len(class_names) else str(label)
            for label in example_labels
        ]
        paraphrases = generate_paraphrases_gpt_with_retry(text, upsample_ratio, prompt_labels)

        # enumerate gives each entry's position directly, without searching the list
        for position, paraphrase in enumerate(paraphrases, start=1):
            if isinstance(paraphrase, dict):
                paraphrase_text = paraphrase.get(f"paraphrase_{position}")
                if paraphrase_text is None and paraphrase:
                    # Tolerate a differently numbered key: take the entry's first value
                    paraphrase_text = next(iter(paraphrase.values()))
            else:
                paraphrase_text = paraphrase
            if isinstance(paraphrase_text, str) and paraphrase_text.strip():
                augmented_data.append({
                    "text": paraphrase_text,
                    "labels": example_labels  # Same labels for the augmented sample
                })

    return augmented_data



In [18]:
# Calculate upsample ratio based on class size
def calculate_upsample_ratio(class_count, threshold=500):
    """
    Calculate the upsample ratio for a class to meet the threshold.

    Args:
        class_count: Number of samples currently available for the class.
        threshold: Target minimum number of samples per class.

    Returns:
        int: ``ceil(threshold / class_count)`` for classes below the threshold.
        0 when the class already meets the threshold, and also 0 when
        ``class_count`` is zero or negative, since there is nothing to
        paraphrase from and the division would otherwise fail.
    """
    if class_count <= 0 or class_count >= threshold:
        return 0  # No augmentation possible or needed
    return math.ceil(threshold / class_count)

# Calculate the upsample ratio for each minority class in the training set.
# The same threshold drives both the filter and the ratio, so changing
# AUGMENTATION_THRESHOLD cannot leave the two out of step.
class_upsample_ratios = {
    class_id: calculate_upsample_ratio(count, threshold=AUGMENTATION_THRESHOLD)
    for class_id, count in total_counts.items()
    if count < AUGMENTATION_THRESHOLD
}

# Display the upsample ratios for minority classes
print("Upsample ratios for minority classes:", class_upsample_ratios)



Upsample ratios for minority classes: {12: 2, 16: 7, 19: 4, 21: 5, 23: 4}


In [15]:
# Example usage (same inputs as the earlier retry example)
example_text = "I can't believe they're gone. It hurts so much."
example_labels = ["grief", "sadness"]
upsample_ratio = 7  # number of paraphrases requested, and the count the reply must match

# The result is not printed here; the helper prints its own diagnostics while running
output = generate_paraphrases_gpt_with_retry(example_text, upsample_ratio, example_labels)

Full API Response: ChatCompletion(id='chatcmpl-BYFX9LqgSFL63mFjwaMeM3dTaa7MK', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='{\n    "paraphrases": [\n        {"paraphrase_1": "I still can\'t accept that they\'re no longer here. The pain is overwhelming."},\n        {"paraphrase_2": "It\'s hard to believe they\'re gone; the sorrow is almost unbearable."},\n        {"paraphrase_3": "I remain in shock that they have left; the sadness cuts so deep."},\n        {"paraphrase_4": "They’re gone, and it hurts my heart more than I can express."},\n        {"paraphrase_5": "I can\'t fathom that they are no longer with us; the grief feels endless."},\n        {"paraphrase_6": "It’s so painful to realize they’re gone; my heart is heavy with sadness."},\n        {"paraphrase_7": "The fact that they’re gone feels unreal, and the pain is so profound."}\n    ]\n}', refusal=None, role='assistant', annotations=[], audio=None, function_call=None, tool

In [None]:
# Augment the training dataset.
# This walks the whole training split; every sample carrying a minority label
# triggers OpenAI API calls, so re-running the cell repeats those requests.
# `total_counts` holds the per-class totals computed from the training split.
augmented_train_data = augment_dataset(dataset["train"], total_counts)

# Convert the list of sample dicts to a DataFrame for easy inspection and saving
augmented_train_df = pd.DataFrame(augmented_train_data)

# Display the first 3 rows of the augmented data
print(augmented_train_df.head(3))

# Save the augmented data to a JSONL file (one record per line) in the working directory
augmented_train_df.to_json("augmented_train_data_test.jsonl", orient="records", lines=True)


In [None]:
class EmotionDataset(Dataset):
    """Multi-label emotion dataset backed by a JSONL file.

    Each non-blank line of the file is a JSON object with a ``"text"`` string
    and a ``"labels"`` list of integer class ids. Items are tokenized on access
    and labels are returned as a multi-hot vector of length ``num_classes``.

    NOTE(review): ``Dataset`` is assumed to be ``torch.utils.data.Dataset``;
    its import is not visible in this part of the notebook.
    """

    def __init__(self, path: Union[Path, str], tokenizer, num_classes: int = 28, max_length: int = 256):
        """Read all samples from ``path`` into memory.

        Args:
            path: Location of the JSONL file.
            tokenizer: Callable accepting ``(text, max_length=, truncation=,
                padding=, return_tensors=)`` and returning a mapping with
                ``"input_ids"`` and ``"attention_mask"`` tensors.
            num_classes: Length of the multi-hot label vector.
            max_length: Sequence length every item is padded or truncated to.
        """
        self.samples = []
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Blank lines (e.g. a trailing newline) are not records
                    continue
                self.samples.append(json.loads(line))

        self.tokenizer  = tokenizer
        self.max_length = max_length
        self.num_classes = num_classes  # Number of classes for multi-label task
        self.labels     = [s["labels"] for s in self.samples]  # Multi-label list of labels

    def __getitem__(self, idx):
        """Return the tokenized text and multi-hot label vector of sample ``idx``.

        Raises:
            IndexError: If a label id lies outside ``[0, num_classes)``.
        """
        item = self.samples[idx]
        enc = self.tokenizer(
            item["text"],
            max_length=self.max_length,
            truncation=True,
            padding="max_length",  # Ensure padding is done to a fixed max_length
            return_tensors="pt",
        )

        # Build the multi-hot target: 1 at every class id the sample carries
        label_vector = torch.zeros(self.num_classes)
        for class_id in item["labels"]:
            # Reject negative ids explicitly; tensor indexing would otherwise
            # wrap them around and silently mark one of the last classes.
            if not 0 <= class_id < self.num_classes:
                raise IndexError(
                    f"label {class_id} is out of range for {self.num_classes} classes"
                )
            label_vector[class_id] = 1

        return {
            "input_ids": enc["input_ids"].squeeze(0),  # Remove the batch dimension
            "attention_mask": enc["attention_mask"].squeeze(0),  # Remove the batch dimension
            "labels": label_vector  # Return the multi-label as a binary vector
        }

    def __len__(self):
        """Number of samples loaded from the file."""
        return len(self.samples)

    def get_labels(self):
        """Return the raw label lists, one per sample, in file order."""
        return self.labels

# NOTE(review): this cell uses Path, AutoTokenizer and logging, none of which are
# imported in the visible part of the notebook; confirm they are imported elsewhere.

# Resolve the project root (the parent of the directory holding this file).
# `__file__` is not defined inside a Jupyter kernel, so fall back to the current
# working directory there instead of failing with NameError.
try:
    _source_dir = Path(__file__).resolve().parent
except NameError:
    _source_dir = Path.cwd()
base_dir = _source_dir.parent

# 1) Point to your local “model” folder
model_path = base_dir / "outputs" / "goemotions_transfer2" / "checkpoint-12950"
tokenizer = AutoTokenizer.from_pretrained(model_path)  # Tokenizer is loaded from the checkpoint folder

# 2) Define paths to your train, validation, and test JSONL files
dataset_dir = base_dir / "data" / "augmented_go_emotion"
train_path = dataset_dir / "train.jsonl"
val_path = dataset_dir / "validation.jsonl"
test_path = dataset_dir / "test.jsonl"

# 3) Load train/val/test datasets
train_dataset = EmotionDataset(train_path, tokenizer)
val_dataset   = EmotionDataset(val_path, tokenizer)
test_dataset  = EmotionDataset(test_path, tokenizer)

# Get labels for each dataset
train_labels = train_dataset.get_labels()
val_labels   = val_dataset.get_labels()
test_labels  = test_dataset.get_labels()

# Log the dataset sizes
logging.info(f"Train dataset loaded with {len(train_dataset)} samples.")
logging.info(f"Validation dataset loaded with {len(val_dataset)} samples.")
logging.info(f"Test dataset loaded with {len(test_dataset)} samples.")
