Dataset: https://www.kaggle.com/datasets/praveengovi/emotions-dataset-for-nlp/data?select=val.txt

Augmenter (nlpaug): https://github.com/makcedward/nlpaug/blob/master/example/textual_augmenter.ipynb

In [1]:
!pip install nlpaug



In [20]:
import numpy as np
import pandas as pd
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw

In [21]:
# Read the ';'-separated training file (it has no header row) and name the
# two columns in the same expression.
df = (
    pd.read_csv('train.txt', sep=';', header=None)
    .set_axis(['text', 'emotion'], axis='columns')
)
# A copy of the current dataset is created so the augmented sentences can be
# appended to it later.
df_augmented = df.copy(deep=True)

In [22]:
# Display the loaded DataFrame as the cell output.
df

Unnamed: 0,text,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger
...,...,...
15995,i just had a very brief time in the beanbag an...,sadness
15996,i am now turning and i feel pathetic that i am...,sadness
15997,i feel strong and good overall,joy
15998,i feel like this was such a rude comment and i...,anger


In [23]:
# Some sentences occur more than once, so rows with a repeated text are removed.
# Both frames get the same de-duplication and a renumbered 0..n-1 index.
df = df.drop_duplicates(subset=['text'], ignore_index=True)
df_augmented = df_augmented.drop_duplicates(subset=['text'], ignore_index=True)

In [24]:
# Number of sentences per emotion label.
df["emotion"].value_counts()

emotion
joy         5350
sadness     4664
anger       2155
fear        1933
love        1299
surprise     568
Name: count, dtype: int64

# Keyboard Augmenter

### ChatBot benzeri uygulamalarda oluşabilecek klavye kaynaklı yazım hatalarını (typo) taklit eden veri üretmek için kullanılır

In [25]:
# Keyboard augmentation: produce a typo-ridden variant of every sentence in df
# and append the variants, with their original labels, to df_augmented.
aug = nac.KeyboardAug()
keyboard_rows = []
# Walk the rows directly instead of deriving a row count from value_counts().
for text, emotion in zip(df["text"], df["emotion"]):
    variants = aug.augment(text)
    # NOTE(review): augment() appears to return a list in recent nlpaug
    # releases and a bare string in older ones; accept both — confirm.
    if isinstance(variants, str):
        variants = [variants]
    keyboard_rows.extend({"text": variant, "emotion": emotion} for variant in variants)

# Rows are collected in a list and concatenated once: calling pd.concat inside
# the loop copies the ever-growing frame on every iteration.
df_augmented = pd.concat(
    [df_augmented, pd.DataFrame(keyboard_rows, columns=["text", "emotion"])],
    ignore_index=True,
)

In [26]:
# Show the last five rows to inspect the keyboard-augmented sentences.
df_augmented.tail(5)

Unnamed: 0,text,emotion
31933,i i8st had a very brief ^*me in the bexHbat an...,sadness
31934,i am now 5uEnimg and i teep )a%betic that i am...,sadness
31935,i feel st#ogg and gloF overall,joy
31936,i feel like 4Mis was Wucm a rude coNjemt and i...,anger
31937,i knIS a lot but i f#eI so stupOW bDVakse i ca...,sadness


# Spelling Augmenter

### Aynı şekilde yazım hataları için kullanılır

In [27]:
# Spelling augmentation: request several misspelled variants of every sentence
# in df and append them, with their original labels, to df_augmented.
aug = naw.SpellingAug()
n_variants = 3  # number of variants requested per sentence
spelling_rows = []
# Walk the rows directly instead of deriving a row count from value_counts().
for text, emotion in zip(df["text"], df["emotion"]):
    variants = aug.augment(text, n=n_variants)
    # NOTE(review): accept a bare string as well as a list, in case the
    # installed nlpaug version returns one — confirm.
    if isinstance(variants, str):
        variants = [variants]
    # Iterate over whatever came back rather than indexing 0..2, so a shorter
    # result cannot raise IndexError.
    spelling_rows.extend({"text": variant, "emotion": emotion} for variant in variants)

# Rows are collected in a list and concatenated once: calling pd.concat inside
# the loop copies the ever-growing frame on every iteration.
df_augmented = pd.concat(
    [df_augmented, pd.DataFrame(spelling_rows, columns=["text", "emotion"])],
    ignore_index=True,
)

In [28]:
# Show the last five rows to inspect the spelling-augmented sentences.
df_augmented.tail(5)

Unnamed: 0,text,emotion
79840,i feed like this has such o rude coment [[ann ...,anger
79841,i feeld like this was cush a rude comment and'...,anger
79842,i know a lof bout ye feel so stupid because it...,sadness
79843,i meet a lot but i fill so stupid because l ca...,sadness
79844,ye know as lot but i feel so stupid because it...,sadness


# Synonym Augmenter

## WordNet ile bazı kelimeleri eş anlamlılarıyla değiştirerek veriyi çoğaltıyor.

### Not: WordNet paketi bu veri setinde iyi çalışmıyor. PPDB paketi ile yapılabilir, fakat o paketi de el ile indirmek zorunda olduğumuz için kodu bu şekilde bırakıyorum.

In [29]:
# Synonym augmentation: replace some words of every sentence in df with
# WordNet synonyms and append the variants, with their original labels, to
# df_augmented.
aug = naw.SynonymAug(aug_src='wordnet')
synonym_rows = []
# Walk the rows directly instead of deriving a row count from value_counts().
for text, emotion in zip(df["text"], df["emotion"]):
    variants = aug.augment(text)
    # NOTE(review): augment() appears to return a list in recent nlpaug
    # releases and a bare string in older ones; accept both — confirm.
    if isinstance(variants, str):
        variants = [variants]
    synonym_rows.extend({"text": variant, "emotion": emotion} for variant in variants)

# Rows are collected in a list and concatenated once: calling pd.concat inside
# the loop copies the ever-growing frame on every iteration.
df_augmented = pd.concat(
    [df_augmented, pd.DataFrame(synonym_rows, columns=["text", "emotion"])],
    ignore_index=True,
)

In [30]:
# Show the last five rows to inspect the synonym-augmented sentences.
df_augmented.tail(5)

Unnamed: 0,text,emotion
95809,single just had a rattling brief time in the b...,sadness
95810,i am now turn and i palpate pathetic that i pe...,sadness
95811,i palpate strong and serious overall,joy
95812,i feel alike this be such a rude remark and im...,anger
95813,one do it a hatful but one feel so stupid beca...,sadness
