In [1]:
import os
import re
import gc
from collections import Counter

import pandas as pd
import numpy as np

import tensorflow as tf

from tensorflow.keras.layers import Input, Dropout, Dense, GlobalAveragePooling1D, concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.metrics import AUC
from keras.callbacks import Callback, EarlyStopping
from keras import backend as K

from transformers import TFBertModel
from transformers import AutoTokenizer

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, KFold

DATA_PATH = './data'
SOLUTION_PATH = './solutions'
WEIGHTS_PATH = './model_weights'

In [2]:
train = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'))
test = pd.read_csv(os.path.join(DATA_PATH, 'test.csv'))

In [3]:
train = pd.concat([train, test])

In [4]:
list_classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [5]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub('\W', ' ', text)
    text = re.sub('\s+', ' ', text)
    text = re.sub('\n', ' ', text)
    
    text = re.sub(r"thats", " that is ", text)
    text = re.sub(r"youre", " you are ", text)
    text = re.sub(r"cant", " can not ", text)
    text = re.sub(r"didnt", " did not ", text)
    text = re.sub(r"dont", " do not ", text)
    text = re.sub(r"doesnt", " does not ", text)
    text = re.sub(r"ive", " i have ", text)
    text = re.sub(r"youve", " you have ", text)
    text = re.sub(r"wont", " will not ", text)
    text = re.sub(r"hes", " he is ", text)
    text = re.sub(r"isnt", " is not ", text)
    text = re.sub(r"havent", " have not ", text)
    text = re.sub(r"arent", " are not ", text)
    text = re.sub(r"whats", " what is ", text)
    text = re.sub(r"wasnt", " was not ", text)
    text = re.sub(r"theres", " there is ", text)
    text = re.sub(r"youll", " you will ", text)
    text = re.sub(r"wouldnt", " would not ", text)
    text = re.sub(r"shouldnt", " should not ", text)
    text = re.sub(r"theyre", " they are ", text)

    text = text.strip(' ')
    return text

In [6]:
train['comment_text'] = train['comment_text'].map(lambda x : clean_text(x))

In [7]:
train['comment_text'] = train['comment_text'].str.lower().str.replace(r'[^\w\s]+', '', regex=True)
train['hash'] = train['comment_text'].str[:200].apply(hash)

In [8]:
train = train.drop_duplicates(subset=['hash'])
train.shape

(307958, 9)

In [9]:
train = train[train.columns[:-1]]

In [10]:
train[train.columns[2:]].sum()

toxic            15065.0
severe_toxic      1560.0
obscene           8313.0
threat             465.0
insult            7747.0
identity_hate     1380.0
dtype: float64

In [11]:
max_length = 120
model_name = 'bert-base-uncased'

auto_tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFBertModel.from_pretrained(model_name, output_hidden_states=True, return_dict=True)

2022-05-16 01:27:55.572238: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoi

In [12]:
vocab = auto_tokenizer.get_vocab()
counter = Counter()
temp = train.loc[train[train.columns[2:]].sum(axis=1) > 0, 'comment_text']

for i in range(len(temp)):
    sentence = temp.iloc[i]
    words = sentence.split()[:]
    
    for word in words:
        if word.strip() in vocab:
            pass
        else:
            counter[word.strip()] += 1

In [13]:
counter.most_common(20)

[('faggot', 1985),
 ('nigger', 1964),
 ('cunt', 1538),
 ('wiki', 1467),
 ('moron', 1143),
 ('fag', 1015),
 ('vandalism', 985),
 ('wanker', 807),
 ('dickhead', 770),
 ('admin', 753),
 ('edits', 724),
 ('faggots', 677),
 ('fucksex', 624),
 ('yourselfgo', 621),
 ('twat', 611),
 ('delete', 546),
 ('poop', 503),
 ('buttsecks', 498),
 ('bitc', 483),
 ('noobs', 460)]

In [14]:
# model dont have embedding for many common slur words
for word in ['retard', 'nigger', 'cunt', 'twat', 'moron', 'wanker', 'faggot', 'cocksucker']:
    if word in vocab:
        print(word)