In [1]:
from tqdm.auto import tqdm
import os

import numpy as np
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns 

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split , StratifiedKFold


import tensorflow as tf 
import tensorflow.keras.backend as K
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model, load_model, save_model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import Input,Dense, LSTM, RNN, Bidirectional, GlobalAveragePooling2D , Dropout, Conv1D, Flatten

from transformers import TFAutoModel , AutoTokenizer
# import ray
# from ray import tune

!pip install numpy requests nlpaug
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc
from nlpaug.util import Action


Collecting nlpaug
  Downloading nlpaug-1.1.10-py3-none-any.whl (410 kB)
     |████████████████████████████████| 410 kB 291 kB/s            
Installing collected packages: nlpaug
Successfully installed nlpaug-1.1.10


In [2]:
class config:
    """Notebook-wide settings: dataset paths, output directory, seed and training knobs."""
    # Input CSVs (Kaggle dataset mount) and the directory created for results.
    train_path = "../input/dravidianlangtech2022-personal/Train_Data_Combined.csv"
    val_path = "../input/dravidianlangtech2022-personal/Validation_Data_Combined.csv"
    save_dir = "./result"
    # Seed consumed by `seed_everything`.
    seed = 55
    # `tf.data.AUTOTUNE` is missing on some TensorFlow versions; fall back to the
    # experimental alias. Only a missing attribute is tolerated, so unrelated
    # failures are no longer silently swallowed by a bare `except:`.
    try:
        AUTOTUNE = tf.data.AUTOTUNE
    except AttributeError:
        AUTOTUNE = tf.data.experimental.AUTOTUNE
    epochs = 50
    max_len = 64  # presumably the maximum token length per example - TODO confirm
    batch_size = 32
    # Hugging Face model identifier.
    hf_path = "google/muril-base-cased"
def seed_everything(seed = config.seed):
    """Seed the random number generators this notebook relies on.

    Seeds Python's ``random`` module, NumPy and TensorFlow, and exports
    ``PYTHONHASHSEED`` to the environment.

    Args:
        seed: Integer seed; defaults to ``config.seed``.
    """
    # Local import keeps this function self-contained (the notebook's import
    # cell does not import `random`).
    import random

    print(f"seeded everything to seed {seed}")
    # NOTE: hash randomisation is fixed when the interpreter starts, so this
    # only affects subprocesses launched afterwards, not the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    # The stdlib RNG was previously left unseeded; libraries that draw from it
    # (nlpaug may - TODO confirm) would otherwise be non-reproducible.
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
# Create the output directory if needed. `exist_ok=True` makes the cell
# idempotent and avoids the check-then-create race of `os.path.exists`.
os.makedirs(config.save_dir, exist_ok=True)
# Seed all RNGs before any data handling or augmentation runs.
seed_everything()


seeded everything to seed 55


In [3]:
# Load the combined training CSV and discard the rows labelled 7127 and 7865.
# NOTE(review): the reason these two rows are removed is not recorded here - confirm.
#df_val = pd.read_csv(config.val_path)
df_train = pd.read_csv(config.train_path).drop(index=[7127, 7865])


In [4]:
# Display the training frame as the cell output for a visual sanity check.
df_train

Unnamed: 0.1,Unnamed: 0,label,text
0,0,None-of-the-above,enaku unmaikum aluha wantu thirunangaigal thei...
1,1,Transphobic,SUPERSTAR VIJAY dai arivuketta polu ithu thapp...
2,2,None-of-the-above,Ugka smile cute a iruku😊😊...
3,3,None-of-the-above,Anna i am waiting na 🥰🥰🥰
4,4,None-of-the-above,Yanda tamilnadu la evvalavo pirachana iruku at...
...,...,...,...
8178,2235,None-of-the-above,ராஜா ஏண்டா பின்னாடி பின்னாடி பார்த்து பேசுற......
8179,2236,None-of-the-above,பழய சித்தாகதை கிளிக வேண்டும் ஆண்ணா
8180,2237,None-of-the-above,SRI NARAYANA நீ ஒரு ஆரிய இந்து சரியா
8181,2238,Counter-speech,அறியா வயதில் குழந்தைகளை அடித்து சொல்லிக்கொ...


In [5]:
# Per-label row counts; `value_counts` orders them from most to least frequent.
Frequency_list = df_train['label'].value_counts()

In [6]:
# Show the class distribution of the training labels.
df_train['label'].value_counts()

None-of-the-above    5011
Misandry             1276
Counter-speech        497
Xenophobia            392
Misogyny              336
Hope-Speech           299
Homophobia            207
Transphobic           163
Name: label, dtype: int64

In [7]:
# Augmented copies to generate per 'Xenophobia' row:
# size of the largest class // size of the 'Xenophobia' class.
# `.iloc[0]` selects the first (largest) count by position; plain `[0]` on a
# label-indexed Series relies on a deprecated positional fallback.
aug_val = Frequency_list.iloc[0]//Frequency_list['Xenophobia']

In [8]:
# df_train = df_train.replace({'Counter-speech':0,
#                              'Homophobia':1, 
#                              'Hope-Speech':2, 
#                              'Misandry':3, 
#                              'Misogyny':4, 
#                              'None-of-the-above':5, 
#                              'Transphobic':6,
#                              'Xenophobia':7})
# df_val

In [9]:
# Rows of the 'Xenophobia' class (despite the `trans` in the variable name).
df_trans = df_train.loc[df_train['label'] == 'Xenophobia']

In [10]:
# Display the selected 'Xenophobia' rows as the cell output.
df_trans

Unnamed: 0.1,Unnamed: 0,label,text
64,65,Xenophobia,kudisekiram tamilnadu china controll poidum...
78,79,Xenophobia,China eanna mo plan pannuthu
95,96,Xenophobia,Chaina ai Alithe thiravendum vera vali illa...
115,116,Xenophobia,புரட்சியாளன் சுபா yaara ivan...😂😂deiii oyaala...
117,118,Xenophobia,Dai... unakkum America kum sanda na nee avana ...
...,...,...,...
8046,2103,Xenophobia,கைபர் கணவாய் வழியாக வந்தீர்கள் நீ இந்தியன் இல்லை
8074,2131,Xenophobia,selva raja நீ ஒரு கிறிஸ்தவ தேவிடியா னா. .
8084,2141,Xenophobia,எவனுக்கு பெற்றாளோ பீகாரிலிருந்து வந்த நாய் தமி...
8142,2199,Xenophobia,திருட்டு ட்ராவிடா சாதி வெறி நாயே...தெலுங்கு பா...


In [11]:
!pip install indic-nlp-library
from indicnlp.tokenize import indic_tokenize  
def tokenizer(text):
    """Tokenize ``text`` with Indic NLP's trivial tokenizer using the Tamil (``'ta'``) language code."""
    tokens = indic_tokenize.trivial_tokenize(text, lang='ta')
    return tokens

def augment_word_level(iterable_df, aug_val, aug=None):
    """Generate ``aug_val`` word-level augmentations for every row.

    Args:
        iterable_df: Iterable of rows indexable by position. ``row[0]`` and
            ``row[1]`` are copied through unchanged and ``row[2]`` is the text
            handed to the augmenter (callers in this notebook pass
            ``DataFrame.values``).
        aug_val: Number of augmented copies to produce per row.
        aug: Optional pre-built augmenter exposing ``augment(text)``. When
            omitted, a fastText ``WordEmbsAug`` is created from the Tamil
            vectors, as before. Passing one in lets callers reuse a single
            augmenter across calls instead of rebuilding it each time.

    Returns:
        A list of ``[row[0], row[1], augmented_text]`` lists, ``aug_val``
        entries per input row, in input order.
    """
    if aug is None:
        aug = naw.WordEmbsAug(model_type='fasttext', tokenizer=tokenizer,
                              model_path='../input/fasttext-indic-nlp-tamil/indicnlp.ft.ta.300.vec')
    list_x = []
    for row in tqdm(iterable_df):
        for _ in range(aug_val):
            augmented_text = aug.augment(row[2])
            # Newer nlpaug releases return a list even for a single input;
            # unwrap it so the text column always holds plain strings.
            if isinstance(augmented_text, (list, tuple)):
                # An empty result falls back to the unmodified source text so
                # every row still yields exactly `aug_val` entries.
                augmented_text = augmented_text[0] if augmented_text else row[2]
            list_x.append([row[0], row[1], augmented_text])
    return list_x
        

Collecting indic-nlp-library
  Downloading indic_nlp_library-0.81-py3-none-any.whl (40 kB)
     |████████████████████████████████| 40 kB 494 kB/s            
Collecting sphinx-argparse
  Downloading sphinx_argparse-0.3.1-py2.py3-none-any.whl (12 kB)
Collecting morfessor
  Downloading Morfessor-2.0.6-py3-none-any.whl (35 kB)
Collecting sphinx>=1.2.0
  Downloading Sphinx-4.5.0-py3-none-any.whl (3.1 MB)
     |████████████████████████████████| 3.1 MB 520 kB/s            
Collecting alabaster<0.8,>=0.7
  Downloading alabaster-0.7.12-py2.py3-none-any.whl (14 kB)
Collecting sphinxcontrib-htmlhelp>=2.0.0
  Downloading sphinxcontrib_htmlhelp-2.0.0-py2.py3-none-any.whl (100 kB)
     |████████████████████████████████| 100 kB 6.3 MB/s            
[?25hCollecting snowballstemmer>=1.1
  Downloading snowballstemmer-2.2.0-py2.py3-none-any.whl (93 kB)
     |████████████████████████████████| 93 kB 915 kB/s             
[?25hCollecting docutils<0.18,>=0.14
  Downloading docutils-0.17

In [12]:
# Produce `aug_val` augmented variants of every 'Xenophobia' row.
df_trans_aug = augment_word_level(iterable_df=df_trans.values, aug_val=aug_val)

  0%|          | 0/392 [00:00<?, ?it/s]

In [13]:
# Wrap the augmented rows in a DataFrame; 'index' holds the first source column
# carried through by `augment_word_level`.
dataframe_trans_aug = pd.DataFrame(
    data=df_trans_aug,
    columns=['index', 'label', 'text'],
)


In [14]:
# Shuffle the augmented rows. Pinning `random_state` keeps the order identical
# when the cell is re-run, instead of depending on the global NumPy RNG state.
dataframe_trans_aug = dataframe_trans_aug.sample(frac=1, random_state=config.seed)

In [15]:
# Display the shuffled augmented 'Xenophobia' rows as the cell output.
dataframe_trans_aug

Unnamed: 0,index,label,text
2661,4420,Xenophobia,Dei nenga corona il removing pani vitutu nenga...
282,491,Xenophobia,Chiii vijayabaskar de ivlo paithiyam iruka😔😔😔
4423,1752,Xenophobia,காசுக்காக வந்தேரி மத இஸ்லாமா மாறிக்கிட்டு நினை...
3508,5828,Xenophobia,Pls china www.tamil murdered panunga
270,486,Xenophobia,Rajini mannuku endhagalarudhan dhangachi adhup...
...,...,...,...
3419,5637,Xenophobia,Indha coimbatore annan yeppadiyavadhu olithu k...
2021,3343,Xenophobia,Daiii sappa mooku karanungala... 😓ungaluku mat...
968,1745,Xenophobia,Dai appeared muthal china karana kollanum
4391,1633,Xenophobia,; கொரோன என்ற அபகரிக்கப்பட்டு நாட்டு பொருளாதரத்...


In [16]:
# Save the shuffled 'Xenophobia' augmentations as CSV.
# NOTE(review): the path is relative to the working directory (not
# `config.save_dir`) and the DataFrame index is written as well (`to_csv`
# default) - confirm both are intended.
dataframe_trans_aug.to_csv('Xenophobia_aug.csv')

In [17]:
# Augment the 'Counter-speech' class and save it to CSV.
# NOTE: the variable names say "Hope", but the rows handled here are 'Counter-speech'.
# Copies per row = size of the largest class // size of this class. `.iloc[0]`
# selects the first (largest) count by position; plain `[0]` on a label-indexed
# Series relies on a deprecated positional fallback.
aug_val_Hope = Frequency_list.iloc[0]//Frequency_list['Counter-speech']
df_Hope = df_train[df_train['label']=='Counter-speech']
df_Hope_aug = augment_word_level(df_Hope.values,aug_val_Hope)
dataframe_Hope_aug = pd.DataFrame(df_Hope_aug, columns = ['index', 'label','text'])
# Fixed `random_state` keeps the shuffle reproducible across re-runs of this cell.
dataframe_Hope_aug = dataframe_Hope_aug.sample(frac=1, random_state=config.seed)
dataframe_Hope_aug.to_csv('Counter_speech_aug.csv')

  0%|          | 0/497 [00:00<?, ?it/s]

In [18]:
# Augment the 'Misandry' class and save it to CSV.
# NOTE: the variable names say "Miso", but the rows handled here are 'Misandry'.
# Copies per row = size of the largest class // size of this class. `.iloc[0]`
# selects the first (largest) count by position; plain `[0]` on a label-indexed
# Series relies on a deprecated positional fallback.
aug_val_Miso = Frequency_list.iloc[0]//Frequency_list['Misandry']
df_Miso = df_train[df_train['label']=='Misandry']
df_Miso_aug = augment_word_level(df_Miso.values,aug_val_Miso)
dataframe_Miso_aug = pd.DataFrame(df_Miso_aug, columns = ['index', 'label','text'])
# Fixed `random_state` keeps the shuffle reproducible across re-runs of this cell.
dataframe_Miso_aug = dataframe_Miso_aug.sample(frac=1, random_state=config.seed)
dataframe_Miso_aug.to_csv('Misandry_aug.csv')

  0%|          | 0/1276 [00:00<?, ?it/s]