In [4]:
import re
import numpy as np
import pandas as pd
import keras
import tensorflow as tf

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from keras import layers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense, Dropout, BatchNormalization, LSTM
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import Word2Vec
from tqdm.notebook import tqdm

# Shared NLP resources used by the text-cleaning helpers defined further down.
# One lemmatizer instance is created once and reused by `lemmatize`.
word_lemmatizer = WordNetLemmatizer()
# English stop words held in a set, since `remove_stopwords` performs a
# membership test for every word of every comment.
eng_stop = set(stopwords.words('english'))

In [5]:
!pip install transformers



In [3]:
from transformers import BertTokenizer

# Name of the pretrained checkpoint the tokenizer is loaded from; the encoder
# built in `create_model` is loaded from a checkpoint of the same name.
bert_model_name = 'bert-base-uncased'
# do_lower_case=True is passed explicitly, in line with the "uncased" checkpoint name.
tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=True)



HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [6]:
# constants
# Location of the competition data and the archive names inside it.
BASE_PATH = '../input/jigsaw-toxic-comment-classification-challenge/'
TRAIN_PATH = 'train.csv.zip'
TEST_PATH = 'test.csv.zip'
# Target columns of the multi-label problem, in submission column order.
LABELS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# Derived from LABELS so the two can never get out of sync.
NUM_CLASSES = len(LABELS)
MAX_WORDS = 10000
# Fixed sequence length used for tokenization, padding and the model inputs.
MAX_LEN = 128

In [7]:
# Read the train and test CSV archives from the competition data directory.
train, test = (
    pd.read_csv(BASE_PATH + file_name)
    for file_name in (TRAIN_PATH, TEST_PATH)
)

In [8]:
# Raw comment strings (missing comments replaced by a placeholder token) for
# both splits, plus the label matrix with one column per entry in LABELS.
train_text = train["comment_text"].fillna("CVxTz").tolist()
test_text = test["comment_text"].fillna("CVxTz").tolist()
train_labels = train[LABELS].to_numpy()

In [9]:
def clean_text(text):
    """Normalize a raw comment into lowercase, space-separated word tokens.

    Steps, in order: lowercase; drop apostrophes (so "it's" becomes "its");
    replace every run of non-word characters with a single space; delete all
    digits; collapse whitespace.

    Args:
        text: The raw comment as a ``str``.

    Returns:
        The cleaned string with no leading/trailing whitespace and single
        spaces between tokens (possibly empty).
    """
    text = text.lower().replace("'", "")
    text = " ".join(re.split(r'\W+', text))
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    text = re.sub(r"\d+", "", text)
    # Removing digits can leave double spaces or empty edges; split/join
    # collapses them and also trims both ends.
    return " ".join(text.split())

In [10]:
# Normalize every comment in both splits with `clean_text`.
clean_train_text = [clean_text(comment) for comment in train_text]
clean_test_text = [clean_text(comment) for comment in test_text]

In [11]:
def remove_stopwords(text):
    """Return `text` with every whitespace-separated token found in `eng_stop` removed.

    The remaining tokens keep their order and are joined by single spaces.
    """
    kept_tokens = filter(lambda token: token not in eng_stop, text.split())
    return " ".join(kept_tokens)

In [12]:
def lemmatize(text):
    """Lemmatize each whitespace-separated token of `text` with `word_lemmatizer`.

    Returns the lemmatized tokens joined by single spaces, in original order.
    """
    return " ".join(word_lemmatizer.lemmatize(token) for token in text.split())

In [13]:
# Drop stop words, then lemmatize what is left, for every comment in both splits.
clean_train_text = [
    lemmatize(remove_stopwords(comment)) for comment in clean_train_text
]
clean_test_text = [
    lemmatize(remove_stopwords(comment)) for comment in clean_test_text
]

In [14]:
# All cleaned comments (train first, then test) as a single list.
corpus = clean_train_text + clean_test_text
# Word2Vec training is disabled; `train_word2vec` is not defined in the visible cells.
#w2v_model = train_word2vec(corpus)

In [15]:
def tokenize_sentences(sentences, tokenizer, max_seq_len = 128):
    """Encode each sentence into a list of token ids, truncated to `max_seq_len`.

    Args:
        sentences: Iterable of strings to encode.
        tokenizer: Object exposing an ``encode`` method with the Hugging Face
            tokenizer signature (e.g. the ``BertTokenizer`` loaded above).
        max_seq_len: Maximum number of ids per sentence, special tokens included.

    Returns:
        A list with one (unpadded, variable-length) list of token ids per sentence.
    """
    return [
        tokenizer.encode(
            sentence,                  # Sentence to encode.
            add_special_tokens=True,   # Add '[CLS]' and '[SEP]'.
            max_length=max_seq_len,    # Upper bound on the encoded length.
            # Request truncation explicitly instead of relying on the implicit
            # fallback that is triggered when only `max_length` is given.
            truncation=True,
        )
        for sentence in tqdm(sentences)
    ]

def create_attention_masks(tokenized_and_padded_sentences):
    """Build a 0/1 attention mask for a batch of padded token-id sequences.

    A position is marked 1 when its token id is greater than 0 and 0 otherwise,
    so zero-valued padding positions are masked out.

    Args:
        tokenized_and_padded_sentences: Rectangular array-like of token ids
            (all rows the same length), e.g. the output of ``pad_sequences``.

    Returns:
        An integer ``np.ndarray`` with the same shape as the input.
    """
    token_ids = np.asarray(tokenized_and_padded_sentences)
    # One vectorized comparison instead of a Python-level loop over every token.
    return (token_ids > 0).astype(int)

In [19]:
# Encode the training comments, pad every sequence on the right to MAX_LEN,
# and derive the matching attention masks. MAX_LEN is passed to the tokenizer
# explicitly so truncation and padding always use the same length.
input_ids = tokenize_sentences(clean_train_text, tokenizer, max_seq_len=MAX_LEN)
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, padding="post")
attention_masks = create_attention_masks(input_ids)

HBox(children=(FloatProgress(value=0.0, max=159571.0), HTML(value='')))




In [21]:
from transformers import TFBertModel, BertConfig

In [28]:
def create_model():
    """Build the multi-label classifier: a BERT encoder followed by a small dense head.

    The model takes two integer inputs of length ``MAX_LEN`` (token ids and
    attention mask) and outputs ``NUM_CLASSES`` independent sigmoid scores.

    Returns:
        An uncompiled ``keras.Model``.
    """
    # BERT encoder, loaded from the same checkpoint name as the tokenizer so
    # the vocabulary and the weights cannot drift apart.
    encoder = TFBertModel.from_pretrained(bert_model_name)

    # multi-label classification model
    input_ids = layers.Input(shape=(MAX_LEN,), dtype=tf.int32)
    attention_mask = layers.Input(shape=(MAX_LEN,), dtype=tf.int32)
    # Index 1 selects the second encoder output -- presumably the pooled
    # per-sequence representation; confirm against the transformers docs.
    embedding = encoder(input_ids, attention_mask=attention_mask)[1]

    dense_1 = Dense(64, activation='relu')(embedding)
    dense_1 = Dropout(0.1)(dense_1)
    dense_1 = BatchNormalization()(dense_1)

    # One sigmoid unit per label: each class is scored independently.
    output = Dense(NUM_CLASSES, activation='sigmoid')(dense_1)

    model = keras.Model(inputs=[input_ids, attention_mask], outputs=output)
    return model

In [29]:
# Build the BERT-based classifier and print its layer summary.
model = create_model()
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_7 (InputLayer)            [(None, 128)]        0                                            
__________________________________________________________________________________________________
tf_bert_model_3 (TFBertModel)   ((None, 128, 768), ( 109482240   input_6[0][0]                    
__________________________________________________________________________________________________
dense_2 (Dense)                 (None, 64)           49216       tf_bert_model_3[0][1]            
____________________________________________________________________________________________

In [30]:
# The whole BERT encoder is trainable here, so use a small fine-tuning learning
# rate; Adam's default of 1e-3 is far above the range normally used for
# fine-tuning pretrained BERT weights.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=2e-5),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [33]:
# Train for one epoch on the first GPU, holding out 20% of the data for validation.
with tf.device('/gpu:0'):
    history = model.fit(
        x=[input_ids, attention_masks],
        y=train_labels,
        batch_size=64,
        epochs=1,
        validation_split=0.2,
    )



In [None]:
# Encode the test comments exactly like the training comments: tokenize with
# an explicit MAX_LEN, pad on the right to MAX_LEN, then build attention masks.
test_input_ids = tokenize_sentences(clean_test_text, tokenizer, max_seq_len=MAX_LEN)
test_input_ids = pad_sequences(test_input_ids, maxlen=MAX_LEN, padding="post")
test_attention_masks = create_attention_masks(test_input_ids)

HBox(children=(FloatProgress(value=0.0, max=153164.0), HTML(value='')))

In [None]:
# Score the test set: one row of per-label predictions per comment.
test_labels = model.predict([test_input_ids, test_attention_masks])

# Pair every test id with its predicted scores, one submission row each.
ids = test["id"].to_list()
res = [[idx, *label] for idx, label in zip(ids, test_labels)]

# Write the submission file with the id column followed by the label columns.
out = pd.DataFrame(res, columns=["id", *LABELS])
out.to_csv("out.csv", index=False)