# Emotion classification on GoEmotions - Data Augmentation

Import necessary libraries

In [3]:
import numpy as np
import pandas as pd
import pathlib
import os
import json
import seaborn as sns
import matplotlib.pyplot as plt

import torch
from torch import nn ,cuda
from torch.utils.data import DataLoader,Dataset,RandomSampler, SequentialSampler

from sklearn.metrics import precision_recall_fscore_support, classification_report
import nltk.corpus
from sklearn import metrics
from scipy.special import softmax

import transformers
from transformers import  AutoTokenizer, AutoModel
from transformers import AutoModelForSequenceClassification, DataCollatorWithPadding
from transformers import TrainingArguments, Trainer,TrainerCallback, EarlyStoppingCallback
import glob
from datasets import Dataset

from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, precision_score, recall_score
from transformers import EvalPrediction   
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc

from nlpaug.util import Action


In [2]:
with torch.no_grad():
    torch.cuda.empty_cache()

## 1. Load data

In [4]:
folder_path = 'data/clean/'
file_pattern = folder_path + '*.csv'
csv_files = glob.glob(file_pattern)

for csv_file in csv_files:
    if 'train' in csv_file:
        df_train = pd.read_csv(csv_file)
    elif 'val' in csv_file:
        df_val = pd.read_csv(csv_file)
    else:
        df_test = pd.read_csv(csv_file)

In [5]:
df_train.head()

Unnamed: 0,clean_text,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,my favourite food is anything i did not have t...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,now if he does off himself everyone will think...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,why the fuck is bayless isoing,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,to make her feel threatened,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,dirty southern wankers,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
#function to clean the generated augmented data
def clean_text(text):
    chars_to_remove = ["’", "‘", "–", "—", "~", "|", "“", "”", "…", "'", "`", "_","“"]
    rx = '[' + re.escape(''.join(chars_to_remove)) + ']'
    
    #handle emoticons and emojis
    text = handle_emoticons (text)
    text = handle_emojis (text)

    text = text.lower()# lower case
    
    #fix contraction from text
    #Fix contraction before removing punctuation is important 
    text = contractions.fix(text)

    text = re.sub(r"http\S*|\S*\.com\S*|\S*www\S*", " ", text)# eliminate urls
    text = re.sub(r"\s@\S+", " ", text)# eliminate @mentions
    
    text = re.sub(r'[^\w\s]',' ', text) #remove all punctuations
    text = re.sub(r'\n', '', text) #remove line breaks
    text = re.sub(rx, ' ', text)

    text = re.sub(r"\s+", " ", text) # replace all whitespaces with a single space
    text = text.strip() #remove leading and trailing spaces


    return text    

## 2. Performing Data Augmentation

### Bert based Augmentation

In [6]:
# Create BERT-based augmenter
aug = naw.ContextualWordEmbsAug(
    model_path='bert-base-uncased',
    action="insert"
)

In [None]:
bert_train_df = df_train.copy()

In [7]:
# Apply augmentation to the 'clean_text' column
bert_train_df['augmented_text'] = bert_train_df['clean_text'].apply(lambda x: aug.augment(x))

In [7]:
bert_train_df.drop(columns=['clean_text'], inplace=True)
bert_train_df

Unnamed: 0,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,disappointment,...,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral,augmented_text
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,['my favourite food is anything the i did requ...
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,['and now surely if tonight he does strip off ...
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,['but why the damn fuck is this bayless isoing']
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,['to forcibly make her parents feel threatened']
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,['as dirty southern bill wankers']
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43403,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,['it added you to mate well i may have also ju...
43404,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,['always i thought it that which was funny or ...
43405,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,['what are you talking talking about anything ...
43406,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,['feel more... like a sex baptism with sexy re...


In [8]:
bert_train_df.rename(columns={'augmented_text': 'text'}, inplace=True)

In [9]:
bert_train_df['text'] = bert_train_df['text'].str.strip("['']")

In [None]:
bert_train_df['clean_text'] = bert_train_df['text'].apply(clean_text)

In [None]:
bert_train_df.drop(columns=['text'], inplace=True)

In [16]:
bert_train_df.head()

Unnamed: 0,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,disappointment,...,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral,clean_text
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,my favourite food is anything the i did requir...
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,and now surely if tonight he does strip off hi...
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,but why the damn fuck is this bayless isoing
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,to forcibly make her parents feel threatened
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,as dirty southern bill wankers


In [None]:
bert_train_df.to_csv('data/clean/augmented/augmented_insert_train.csv', index=False)  

### Word2Vec based Augmentation

In [None]:
# model_type: word2vec
aug = naw.WordEmbsAug(
    model_type='word2vec', model_path='GoogleNews-vectors-negative300.bin',
    action="insert")

In [None]:
w2v_train_df = df_train.copy()

In [None]:
# Apply augmentation to the 'clean_text' column
w2v_train_df['augmented_text'] = w2v_train_df['clean_text'].apply(lambda x: aug.augment(x))

Drop necessary columns, clean

In [None]:
w2v_train_df.rename(columns={'augmented_text': 'text'}, inplace=True)

In [None]:
#df_train contained clean_text column, need to replace it so have to drop it first
w2v_train_df.drop(columns=['clean_text'], inplace=True)

In [None]:
#remove the text from brackets
w2v_train_df['text'] = w2v_train_df['text'].str.strip("['']")

In [None]:
#Apply the clean text function
w2v_train_df['clean_text'] = w2v_train_df['text'].apply(clean_text)

In [None]:
w2v_train_df.drop(columns=['text'], inplace=True)

In [None]:
#save the csv
w2v_train_df.to_csv('data/clean/augmented/augmented_w2v_train.csv', index=False)  

### WordNet based Augmentation

In [None]:
aug = naw.SynonymAug(aug_src='wordnet')

In [None]:
#make a copy of the df_train
wordnet_train_df = df_train.copy()

In [None]:
# Apply augmentation to the 'clean_text' column
wordnet_train_df['augmented_text'] = wordnet_train_df['clean_text'].apply(lambda x: aug.augment(x))

Drop necessary columns, clean

In [None]:
wordnet_train_df.rename(columns={'augmented_text': 'text'}, inplace=True)

In [None]:
#df_train contained clean_text column, need to replace it so have to drop it first
wordnet_train_df.drop(columns=['clean_text'], inplace=True)

In [None]:
#remove the text from brackets
wordnet_train_df['text'] = wordnet_train_df['text'].str.strip("['']")

In [None]:
#Apply the clean text function
wordnet_train_df['clean_text'] = wordnet_train_df['text'].apply(clean_text)

In [None]:
wordnet_train_df.drop(columns=['text'], inplace=True)

In [None]:
#save the csv
wordnet_train_df.to_csv('data/clean/augmented/augmented_wordnet_subs_train.csv', index=False)  