In [1]:
import numpy as np
import pandas as pd
from keras.layers import GlobalMaxPool1D, Dropout
from sklearn.model_selection import train_test_split
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import LSTM, Dense
from tqdm import tqdm

np.random.seed(42)

In [2]:
from transformers import DistilBertTokenizer, DistilBertConfig, TFDistilBertModel

distil_bert = 'distilbert-base-uncased' # Pick any desired pre-trained model

# Defining DistilBERT tokonizer
tokenizer = DistilBertTokenizer.from_pretrained(distil_bert, do_lower_case=True, add_special_tokens=True,
                                                max_length=128, pad_to_max_length=True)

In [3]:
def tokenize(sentences, tokenizer):
    input_ids, input_masks, input_segments = [],[],[]
    for sentence in tqdm(sentences):
        inputs = tokenizer.encode_plus(sentence, add_special_tokens=True, max_length=128, pad_to_max_length=True,
                                             return_attention_mask=True, return_token_type_ids=True, truncation=True)
        input_ids.append(inputs['input_ids'])
        input_masks.append(inputs['attention_mask'])
        input_segments.append(inputs['token_type_ids'])

    return np.asarray(input_ids, dtype='int32'), np.asarray(input_masks, dtype='int32'), np.asarray(input_segments, dtype='int32')


In [4]:
data = pd.read_csv("./toxic_comments.csv")[:10000]

In [5]:
data

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
9995,1a790ff1007a10e3,Numbers may be either listed separately at the...,0,0,0,0,0,0
9996,1a7a4868968e2b9e,"Those two love to disagree, don't they? 206.17...",0,0,0,0,0,0
9997,1a7c3bec9a71415d,"""I have changed """"Lance Thomas"""" to """"Lance Th...",0,0,0,0,0,0
9998,1a7c9c14b0cf0fe0,states \n\nCourts: I have been putting all art...,0,0,0,0,0,0


In [6]:
y = data[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]]

In [7]:
X = data["comment_text"]

In [8]:
x_train,x_test,y_train,y_test = train_test_split(X,y, test_size=0.1, random_state=1)

In [9]:
X_train_raw = tokenize(x_train, tokenizer)

100%|██████████████████████████████████████| 9000/9000 [00:15<00:00, 573.99it/s]


In [10]:
X_train_ids = X_train_raw[0]

In [11]:
X_train_masks = X_train_raw[1]

In [12]:
X_train_segments = X_train_raw[2]

In [13]:
X_test_raw = tokenize(x_test, tokenizer)

100%|██████████████████████████████████████| 1000/1000 [00:01<00:00, 555.85it/s]


In [14]:
X_test_ids = X_test_raw[0]

In [15]:
X_test_masks = X_test_raw[1]

In [16]:
X_test_segments = X_test_raw[2]

In [17]:
config = DistilBertConfig(dropout=0.2, attention_dropout=0.2)
config.output_hidden_states = False
transformer_model = TFDistilBertModel.from_pretrained(distil_bert, config = config)

input_ids_in = Input(shape=(128,), name='input_token', dtype='int32')
input_masks_in = Input(shape=(128,), name='masked_token', dtype='int32')

embedding_layer = transformer_model(input_ids_in, attention_mask=input_masks_in)[0]
X = Bidirectional(LSTM(25, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(embedding_layer)
X = GlobalMaxPool1D()(X)
X = Dense(25, activation='relu')(X)
X = Dropout(0.2)(X)
X = Dense(6, activation='sigmoid')(X)
model = Model(inputs=[input_ids_in, input_masks_in], outputs = X)

for layer in model.layers[:3]:
  layer.trainable = False


model.summary()

2021-07-28 18:17:40.062321: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-07-28 18:17:40.143654: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.
Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['activation_13', 'vocab_transform', 'vocab_projector', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDi

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_token (InputLayer)        [(None, 128)]        0                                         

In [18]:
model.compile(optimizer="adam",loss='categorical_crossentropy',metrics=['accuracy'])
history = model.fit(
    [X_train_ids, X_train_masks],np.array(y_train),
    validation_split = 0.2,
    batch_size = 64,
    epochs = 1,
    verbose = 1
)



2021-07-28 18:17:46.052480: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)




In [19]:
model.evaluate([X_test_ids, X_test_masks], np.array(y_test))



[0.392819881439209, 0.9610000252723694]

In [40]:
i = np.random.randint(0, X_test_ids.shape[0])
y_predicted = pd.DataFrame(model.predict([X_test_ids[i], X_test_masks[i]])).mean()
y_true = y_test.iloc[i]
print("-"*30)
print(f"Predicted {y_predicted}:\n\nTrue {y_true}")

------------------------------
Predicted 0    0.000244
1    0.000038
2    0.000250
3    0.000042
4    0.000161
5    0.000095
dtype: float32:

True toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
Name: 3979, dtype: int64
