## PreProcessing Datasets for NLP Emotion Prediction

In [None]:
import pandas as pd
from datasets import load_dataset
from collections import Counter
import json
import random
import nltk
import torch
from sklearn.utils import resample
from collections import Counter
from transformers import pipeline
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.augmenter.char as nac
from deep_translator import GoogleTranslator
from textattack.augmentation import EasyDataAugmenter
from nltk.corpus import wordnet
import dask.dataframe as dd
from dask import delayed, compute
from tqdm import tqdm  # For progress tracking

#### 1. ISEAR Dataset

In [8]:
# Emotion mapping to target 8 emotions
TARGET_EMOTIONS = {"Neutral", "Happy", "Anger", "Sadness", "Fear", "Surprise", "Confusion", "Disgust"}

iseardf = pd.read_csv(r'D:\Data Science Projects\AI Emotion Analysis\data\eng_dataset.csv')

isear_emotion_mapping = {
    "anger": "Anger",
    "disgust": "Disgust",
    "fear": "Fear",
    "happy": "Happy",
    "sadness": "Sadness",
    "surprise": "Surprise",
    "joy" : "Happy"
}

iseardf["emotion"] = iseardf["sentiment"].map(isear_emotion_mapping)
iseardf = iseardf[iseardf["emotion"].isin(TARGET_EMOTIONS)]
iseardf = iseardf[["content", "emotion"]]
iseardf.rename(columns={"content": "text"}, inplace=True)

#### 2. GoEmotions Dataset

In [None]:
dataset = load_dataset("google-research-datasets/go_emotions")

goemotions_df = pd.concat([
    pd.DataFrame(dataset["train"]),
    pd.DataFrame(dataset["validation"]),
    pd.DataFrame(dataset["test"])
], ignore_index=True)

emotion_labels = dataset["train"].features["labels"].feature.names

goemotions_emotion_mapping = {
    "neutral": "Neutral",
    "admiration": "Happy",
    "approval": "Happy",
    "gratitude": "Happy",
    "annoyance": "Anger",
    "amusement": "Happy",
    "curiosity": "Neutral",
    "disapproval": "Anger",
    "love": "Happy",
    "optimism": "Happy",
    "anger": "Anger",
    "joy": "Happy",
    "confusion": "Confusion",
    "sadness": "Sadness",
    "disappointment": "Sadness",
    "realization": "Neutral",
    "caring": "Happy",
    "surprise": "Surprise",
    "excitement": "Happy",
    "disgust": "Disgust",
    "desire": "Neutral",
    "fear": "Fear",
    "remorse": "Sadness",
    "embarrassment": "Fear",
    "nervousness": "Fear",
    "relief": "Neutral",
    "pride": "Happy",
    "grief": "Sadness"
}

goemotions_df["emotion"] = goemotions_df["labels"].apply(lambda labels: [goemotions_emotion_mapping[emotion_labels[i]] for i in labels])
goemotions_df = goemotions_df.explode("emotion")
goemotions_df = goemotions_df[goemotions_df["emotion"].isin(TARGET_EMOTIONS)]  
goemotions_df = goemotions_df[["text", "emotion"]]


#### 3. DailyDialog Dataset

In [None]:
daily_dialog = load_dataset("daily_dialog", trust_remote_code=True)

emotion_mapping = {
    0: "Neutral",
    1: "Anger",
    2: "Disgust",
    3: "Fear",
    4: "Happy",
    5: "Sadness",
    6: "Surprise"
}

TARGET_EMOTIONS = ["Neutral", "Anger", "Disgust", "Fear", "Happy", "Sadness", "Surprise"]

def preprocess_split(split):
    data = []
    for example in split:
        dialog = example["dialog"]  
        emotions = example["emotion"]  
        
        for utterance, emotion in zip(dialog, emotions):
            utterance = utterance.strip()
            if len(utterance.split()) < 10:
                continue
            emotion_name = emotion_mapping[emotion]
            if TARGET_EMOTIONS and emotion_name not in TARGET_EMOTIONS:
                continue
            data.append({
                "text": utterance,
                "emotion": emotion_name
            })
    return pd.DataFrame(data)

train_df = preprocess_split(daily_dialog["train"])
test_df = preprocess_split(daily_dialog["test"])
val_df = preprocess_split(daily_dialog["validation"])

combined_df_dailydialog = pd.concat([train_df, test_df, val_df], ignore_index=True)
combined_df_dailydialog = combined_df_dailydialog[combined_df_dailydialog["emotion"].isin(TARGET_EMOTIONS)]

#### 4. ESConv Dataset

In [11]:
TARGET_EMOTIONS = {"Neutral", "Happy", "Anger", "Sadness", "Fear", "Surprise", "Confusion", "Disgust"}

EMOTION_LABELS = {
    "joy": "Happy",
    "happy": "Happy",
    "excited": "Happy",
    
    "anger": "Anger",
    "angry": "Anger",
    "frustrated": "Anger",
    "jealousy": "Anger",  
    
    "sad": "Sadness",
    "sadness": "Sadness",
    "depression": "Sadness",
    "guilt": "Sadness",
    "pain": "Sadness",

    "fear": "Fear",
    "anxiety": "Fear",
    "nervousness": "Fear",

    "surprise": "Surprise",
    
    "disgust": "Disgust",
    "shame": "Disgust",

    "neutral": "Neutral",

    "confused": "Confusion",
    "confusion": "Confusion"
}

def standardize_emotions(label):
    """Map dataset-specific emotion labels to a unified set, keeping only relevant categories."""
    mapped_label = EMOTION_LABELS.get(label.lower(), None)
    return mapped_label if mapped_label in TARGET_EMOTIONS else None

dataset = load_dataset("thu-coai/ESConv")

df = pd.concat([
    pd.DataFrame(dataset["train"]),
    pd.DataFrame(dataset["validation"]),
    pd.DataFrame(dataset["test"])
])

seeker_texts = []
for index, row in df.iterrows():
    conversation = json.loads(row[0])  

    if "emotion_type" in conversation and "dialog" in conversation:
        emotion = conversation["emotion_type"]  
        dialog = conversation["dialog"]  

        
        seeker_messages = " ".join(turn["text"] for turn in dialog if turn["speaker"] == "usr")

        
        seeker_texts.append({
            "conversation_id": index,  
            "text": seeker_messages,   
            "emotion": emotion        
        })


df_seeker = pd.DataFrame(seeker_texts)
df_seeker["emotion"] = df_seeker["emotion"].map(standardize_emotions)

  conversation = json.loads(row[0])


#### 5. Dair-ai Emotion Dataset

In [None]:
dataset = load_dataset("dair-ai/emotion","unsplit")

# mapping
original_labels = {
    0: "sadness",
    1: "joy",
    2: "love",
    3: "anger",
    4: "fear",
    5: "surprise"
}

# Mapping to new emotions
new_label_mapping = {
    "joy": "Happy",
    "sadness": "Sadness",
    "anger": "Anger",
    "fear": "Fear",
    "surprise": "Surprise",
    "love": "Neutral" 
}

def map_labels(example):
    example["new_label"] = new_label_mapping[original_labels[example["label"]]]
    return example

dataset = dataset.map(map_labels)
data_emotion = pd.DataFrame(dataset['train'])
data_emotion = data_emotion[['text','new_label']]
data_emotion.columns = ['text','emotion']

In [33]:
data_emotion

Unnamed: 0,text,emotion
0,i feel awful about it too because it s my job ...,Sadness
1,im alone i feel awful,Sadness
2,ive probably mentioned this before but i reall...,Happy
3,i was feeling a little low few days back,Sadness
4,i beleive that i am much more sensitive to oth...,Neutral
...,...,...
416804,that was what i felt when i was finally accept...,Happy
416805,i take every day as it comes i m just focussin...,Fear
416806,i just suddenly feel that everything was fake,Sadness
416807,im feeling more eager than ever to claw back w...,Happy


### Merging the datasets

In [None]:
final_df = pd.concat([iseardf, goemotions_df, combined_df_dailydialog,df_seeker[['text','emotion']],data_emotion], ignore_index=True)
final_df.drop_duplicates(subset=['text'], inplace=True)
final_df.reset_index(drop=True, inplace=True)
print(final_df.head())

emotion_counts = Counter(final_df["emotion"])
emotion_df = pd.DataFrame(emotion_counts.items(), columns=["Emotion", "Count"]).sort_values(by="Count", ascending=False)
print(emotion_df)

                                                text emotion
0  At the point today where if someone says somet...   Anger
1  @CorningFootball  IT'S GAME DAY!!!!      T MIN...   Anger
2  This game has pissed me off more than any othe...   Anger
3  @spamvicious I've just found out it's Candice ...   Anger
4  @moocowward @mrsajhargreaves @Melly77 @GaryBar...   Anger
     Emotion   Count
2      Happy  163167
3    Sadness  124141
4    Neutral   93354
0      Anger   63782
1       Fear   47515
5   Surprise   14211
6  Confusion    1535
7    Disgust    1010


In [22]:
final_df

Unnamed: 0,text,emotion
0,At the point today where if someone says somet...,Anger
1,@CorningFootball IT'S GAME DAY!!!! T MIN...,Anger
2,This game has pissed me off more than any othe...,Anger
3,@spamvicious I've just found out it's Candice ...,Anger
4,@moocowward @mrsajhargreaves @Melly77 @GaryBar...,Anger
...,...,...
508710,that was what i felt when i was finally accept...,Happy
508711,i take every day as it comes i m just focussin...,Fear
508712,i just suddenly feel that everything was fake,Sadness
508713,im feeling more eager than ever to claw back w...,Happy


In [19]:
final_df.to_csv(r'D:\Data Science Projects\AI Emotion Analysis\data\merged_dataset_2.csv', index=False)

### Train Test Validation Split Before UpSampling

In [None]:
from sklearn.model_selection import train_test_split
final_df = pd.read_csv(r'D:\Data Science Projects\AI Emotion Analysis\data\merged_dataset_2.csv')

In [None]:
train_df, test_df = train_test_split(
    final_df, 
    test_size=0.1,  
    random_state=42, 
    stratify=final_df["emotion"]  
)

train_df, val_df = train_test_split(
    train_df, 
    test_size=0.1,      
    random_state=42, 
    stratify=train_df["emotion"]  
)

print("Train Distribution:", Counter(train_df["emotion"]))
print("Validation Distribution:", Counter(val_df["emotion"]))
print("Test Distribution:", Counter(test_df["emotion"]))

Train Distribution: Counter({'Happy': 132165, 'Sadness': 100554, 'Neutral': 75617, 'Anger': 51663, 'Fear': 38487, 'Surprise': 11511, 'Confusion': 1243, 'Disgust': 818})
Validation Distribution: Counter({'Happy': 14685, 'Sadness': 11173, 'Neutral': 8402, 'Anger': 5741, 'Fear': 4276, 'Surprise': 1279, 'Confusion': 138, 'Disgust': 91})
Test Distribution: Counter({'Happy': 16317, 'Sadness': 12414, 'Neutral': 9335, 'Anger': 6378, 'Fear': 4752, 'Surprise': 1421, 'Confusion': 154, 'Disgust': 101})


In [21]:
train_df.to_csv(r'D:\Data Science Projects\AI Emotion Analysis\data\train.csv', index=False)
val_df.to_csv(r'D:\Data Science Projects\AI Emotion Analysis\data\val.csv', index=False)
test_df.to_csv(r'D:\Data Science Projects\AI Emotion Analysis\data\test.csv', index=False)

### Handling Class Imbalance - Augmentation Strategy for Balanced Emotion Data

In [None]:
# Checking for GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Augmentation
syn_aug = naw.SynonymAug(aug_p=0.3)

device_id = 0 if torch.cuda.is_available() else -1  # Ensure device is an integer
bert_aug = naw.ContextualWordEmbsAug(model_path="bert-base-uncased", action="substitute", device=device_id)

del_aug = naw.RandomWordAug(action="delete", aug_p=0.2)

back_trans_aug = naw.BackTranslationAug(
    from_model_name="Helsinki-NLP/opus-mt-en-fr",
    to_model_name="Helsinki-NLP/opus-mt-fr-en"
)

eda_augmenter = EasyDataAugmenter()

# Paraphrasing model using GPU
paraphrase_model = pipeline("text2text-generation", model="t5-small", device=device_id)

augmenters = [syn_aug, bert_aug, del_aug, back_trans_aug, eda_augmenter]

# Custom augmentations
def synonym_replacement(text, n=2):
    if not isinstance(text, str):
        return text
    words = text.split()
    new_words = words.copy()
    random_indices = [i for i, word in enumerate(words) if wordnet.synsets(word)]
    random.shuffle(random_indices)
    replaced = 0
    for i in random_indices:
        if replaced >= n:
            break
        synonyms = wordnet.synsets(words[i])
        if synonyms:
            lemmas = synonyms[0].lemmas()
            if lemmas:
                new_words[i] = lemmas[0].name()
                replaced += 1
    return ' '.join(new_words)

def back_translate(text, target_lang='fr'):
    if not isinstance(text, str) or not text.strip():
        return text
    try:
        translated = GoogleTranslator(source='en', target=target_lang).translate(text)
        back_translated = GoogleTranslator(source=target_lang, target='en').translate(translated)
        return back_translated if back_translated else text
    except Exception as e:
        print(f"Back-translation failed: {e}")
        return text

def paraphrase_text(text):
    try:
        inputs = f"paraphrase: {text}"
        result = paraphrase_model(inputs, max_length=100, do_sample=True)
        return result[0]['generated_text']
    except Exception as e:
        print(f"Paraphrasing failed: {e}")
        return text

def apply_augmenter(text, augmenter):
    augmented = augmenter.augment(text)
    return augmented if isinstance(augmented, str) else text

def augment_text(text):
    methods = [
        synonym_replacement,
        back_translate,
        paraphrase_text,
        lambda x: apply_augmenter(x, random.choice(augmenters))
    ]
    random.shuffle(methods)
    augmented_text = str(text)
    for method in methods[:2]:
        try:
            augmented_text = method(augmented_text)
        except Exception as e:
            print(f"Augmentation failed: {e}")
    return augmented_text

def augment_with_dask(texts, n_partitions=4):
    """
    Augments a list of texts in parallel using Dask.
    """
    ddf = dd.from_pandas(pd.DataFrame({"text": texts}), npartitions=n_partitions)
    meta = ("text", "object")  # Output will be a Series with text data

    ddf["augmented_text"] = ddf["text"].map(augment_text,meta = meta)
    
    # Compute the result with a progress bar
    with tqdm(total=len(texts), desc="Augmenting Texts") as pbar:
        augmented_df = ddf.compute()
        pbar.update(len(texts))  # Update progress bar
    
    return augmented_df["augmented_text"].tolist()

def balance_specific_emotions(train_df, target_size=10000, n_partitions=4):
    """
    Upsamples only the 'Confusion' and 'Disgust' classes using Dask-based parallel augmentation.
    Other classes are retained as-is.
    """
    balanced_data = []
    emotions_to_upsample = ['Confusion', 'Disgust']
    all_emotions = train_df["emotion"].unique()

    pbar = tqdm(all_emotions, desc="Processing Emotions")

    for emotion in pbar:
        group = train_df[train_df["emotion"] == emotion]
        
        if emotion in emotions_to_upsample:
            # Upsample if the emotion is 'Confusion' or 'Disgust'
            if len(group) < target_size:
                additional = target_size - len(group)
                resampled = resample(group["text"], n_samples=additional, random_state=42)
                
                augmented = augment_with_dask(resampled, n_partitions=n_partitions)
                augmented_df = pd.DataFrame({
                    "text": augmented,
                    "emotion": [emotion] * additional
                })
                group = pd.concat([group, augmented_df])
        balanced_data.append(group)

        pbar.set_description(f"Processing Emotion: {emotion}")
        pbar.refresh()

    pbar.close()  
    return pd.concat(balanced_data).reset_index(drop=True)

# Upsampling 'Confusion' and 'Disgust' classes because of class imbalance.

balanced_df = balance_specific_emotions(train_df, target_size=10000, n_partitions=4)
print("Class distribution after upsampling:", Counter(balanced_df["emotion"]))

Using device: cuda


[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\aliir\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Device set to use cuda:0
Processing Emotion: Anger:  50%|█████     | 4/8 [00:00<00:00, 37.45it/s]  You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Processing Emotion: Anger:  50%|█████     | 4/8 [00:15<00:00, 37.45it/s]

Back-translation failed: None --> text must be a valid text with maximum 5000 character,otherwise it cannot be translated


Augmenting Texts: 100%|██████████| 8645/8645 [53:28<00:00,  2.69it/s]
Processing Emotion: Confusion:  88%|████████▊ | 7/8 [53:28<09:13, 553.14s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (520 > 512). Running this sequence through the model will result in indexing errors
Augmenting Texts: 100%|██████████| 8933/8933 [2:10:04<00:00,  1.14it/s]
Processing Emotion: Disgust: 100%|██████████| 8/8 [3:03:32<00:00, 1376.61s/it]


Class distribution after upsampling: Counter({'Happy': 140742, 'Sadness': 103781, 'Neutral': 86949, 'Anger': 54677, 'Fear': 42027, 'Surprise': 13989, 'Confusion': 10000, 'Disgust': 10000})


In [6]:
balanced_df.to_csv(r'D:\Data Science Projects\AI Emotion Analysis\data\train_upsampled.csv',index=False)