In [1]:
from tqdm.auto import tqdm
import os

import numpy as np
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns 

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split , StratifiedKFold


import tensorflow as tf 
import tensorflow.keras.backend as K
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model, load_model, save_model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import Input,Dense, LSTM, RNN, Bidirectional, GlobalAveragePooling2D , Dropout, Conv1D, Flatten

from transformers import TFAutoModel , AutoTokenizer
# import ray
# from ray import tune

!pip install numpy requests nlpaug
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc
from nlpaug.util import Action


Collecting nlpaug
  Downloading nlpaug-1.1.10-py3-none-any.whl (410 kB)
     |████████████████████████████████| 410 kB 537 kB/s            
Installing collected packages: nlpaug
Successfully installed nlpaug-1.1.10


In [2]:
class config:
    """Notebook-wide settings, read as class attributes (never instantiated here)."""

    # CSV files for the training and validation splits.
    train_path = "../input/dravidianlangtech2022-personal/Train_Data_Combined.csv"
    val_path = "../input/dravidianlangtech2022-personal/Validation_Data_Combined.csv"
    # Directory for outputs; it is created right after this class is defined.
    save_dir = "./result"
    # Default seed picked up by seed_everything().
    seed = 55
    # Prefer tf.data.AUTOTUNE and fall back to the experimental alias when the
    # attribute does not exist. Only AttributeError is caught so that any other
    # failure (e.g. TensorFlow not imported) is not silently swallowed.
    try:
        AUTOTUNE = tf.data.AUTOTUNE
    except AttributeError:
        AUTOTUNE = tf.data.experimental.AUTOTUNE
    # Not referenced in the cells visible here; presumably consumed by later
    # tokenisation/training code -- TODO confirm.
    epochs = 50
    max_len = 64
    batch_size = 32
    # Hugging Face model id; passed as model_path to the nlpaug contextual
    # word-embedding augmenter in augment_word_level().
    hf_path = "google/muril-base-cased"
def seed_everything(seed = config.seed):
    """Seed the stdlib, NumPy and TensorFlow random number generators.

    Args:
        seed: Integer seed applied to every generator. Defaults to
            ``config.seed`` as evaluated when this function is defined.

    Side effects:
        Prints a confirmation line, sets ``PYTHONHASHSEED`` in ``os.environ``
        and reseeds ``random``, ``numpy.random`` and ``tf.random``.
    """
    # Function-local import so this cell does not depend on the import cell
    # being extended.
    import random

    print(f"seeded everything to seed {seed}")
    # NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so this only
    # influences child processes, not hashing in the already-running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    # The stdlib generator was previously left unseeded, so any code drawing
    # from `random` stayed non-deterministic between runs.
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
# Create the output directory if needed. exist_ok=True replaces the separate
# exists() check, so there is no window between checking and creating.
os.makedirs(config.save_dir, exist_ok=True)
# Seed all random number generators with config.seed before any data work.
seed_everything()


seeded everything to seed 55


In [3]:
# Read the combined training split and discard the rows whose index labels are
# 7127 and 7865.
# NOTE(review): the reason these two rows are excluded is not recorded in the
# notebook -- confirm and document it.
#df_val = pd.read_csv(config.val_path)
df_train = pd.read_csv(config.train_path).drop(index=[7127, 7865])


In [4]:
# Display the training frame as the cell output (pandas truncates the middle rows).
df_train

Unnamed: 0.1,Unnamed: 0,label,text
0,0,None-of-the-above,enaku unmaikum aluha wantu thirunangaigal thei...
1,1,Transphobic,SUPERSTAR VIJAY dai arivuketta polu ithu thapp...
2,2,None-of-the-above,Ugka smile cute a iruku😊😊...
3,3,None-of-the-above,Anna i am waiting na 🥰🥰🥰
4,4,None-of-the-above,Yanda tamilnadu la evvalavo pirachana iruku at...
...,...,...,...
8178,2235,None-of-the-above,ராஜா ஏண்டா பின்னாடி பின்னாடி பார்த்து பேசுற......
8179,2236,None-of-the-above,பழய சித்தாகதை கிளிக வேண்டும் ஆண்ணா
8180,2237,None-of-the-above,SRI NARAYANA நீ ஒரு ஆரிய இந்து சரியா
8181,2238,Counter-speech,அறியா வயதில் குழந்தைகளை அடித்து சொல்லிக்கொ...


In [5]:
# Number of rows per label; value_counts() orders the result from most to
# least frequent.
Frequency_list = df_train["label"].value_counts()

In [6]:
# Show the per-label row counts as the cell output.
df_train["label"].value_counts()

None-of-the-above    5011
Misandry             1276
Counter-speech        497
Xenophobia            392
Misogyny              336
Hope-Speech           299
Homophobia            207
Transphobic           163
Name: label, dtype: int64

In [7]:
# df_train = df_train.replace({'Counter-speech':0,
#                              'Homophobia':1, 
#                              'Hope-Speech':2, 
#                              'Misandry':3, 
#                              'Misogyny':4, 
#                              'None-of-the-above':5, 
#                              'Transphobic':6,
#                              'Xenophobia':7})
# df_val

In [8]:

# Number of augmented copies to generate per 'Misandry' row: the count of the
# most frequent label floor-divided by the 'Misandry' count.
# value_counts() sorts descending, so position 0 holds the largest count.
# .iloc makes the positional lookup explicit; a bare integer key on this
# string-labelled Series relies on a deprecated positional fallback.
aug_val = Frequency_list.iloc[0] // Frequency_list['Misandry']

In [9]:
def _single_augmented_text(augmented, fallback):
    """Collapse the return value of ``aug.augment`` into one text value.

    Depending on the installed nlpaug release, ``augment`` hands back either
    the augmented string itself or a list containing it. A list/tuple is
    reduced to its first element; an empty one yields ``fallback``.
    """
    if isinstance(augmented, (list, tuple)):
        return augmented[0] if len(augmented) > 0 else fallback
    return augmented


def augment_word_level(iterable_df,aug_val):
    """Create ``aug_val`` contextually augmented copies of every input row.

    Args:
        iterable_df: Iterable of indexable rows. Positions 0 and 1 are copied
            through unchanged and position 2 is the text handed to the
            augmenter.
            NOTE(review): the caller labels the output columns
            ('index', 'label', 'text'); verify the input rows really follow
            that order.
        aug_val: Number of augmented copies to produce per row.

    Returns:
        A list of ``[row[0], row[1], augmented_text]`` lists, with
        ``aug_val`` consecutive entries for each input row.
    """
    # Word-level augmenter driven by the model named in config.hf_path;
    # aug_p=0.3 is forwarded to nlpaug unchanged.
    aug = naw.ContextualWordEmbsAug(model_path=config.hf_path,model_type='bert',aug_p=0.3)
    list_x = []
    for row in tqdm(iterable_df):
        for _ in range(aug_val):
            # Normalise to a single value so the text column never ends up
            # holding one-element lists, whichever nlpaug version is installed.
            augmented_text = _single_augmented_text(aug.augment(row[2]), row[2])
            list_x.append([row[0], row[1], augmented_text])
    return list_x
        

In [10]:
# Pick the rows labelled 'Misandry', generate aug_val augmented copies of each,
# and write the result to a CSV in the working directory.
# NOTE(review): augment_word_level reads positions 0, 1 and 2 of every row as
# (index, label, text); confirm that matches df_train's column order.
df_trans = df_train.loc[df_train['label'] == 'Misandry']
df_trans_aug = augment_word_level(df_trans.to_numpy(), aug_val)
dataframe_trans_aug = pd.DataFrame(data=df_trans_aug, columns=['index', 'label', 'text'])
# to_csv keeps its default of also writing the frame's index as a leading column.
dataframe_trans_aug.to_csv('Misandry_aug.csv')

Downloading:   0%|          | 0.00/181 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.02M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/113 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/909M [00:00<?, ?B/s]

  0%|          | 0/1276 [00:00<?, ?it/s]