In [None]:

# Install Transformers
!pip install transformers==4.35.2



In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm


import tensorflow as tf
from keras.layers import Dense, Activation, Dropout
from keras.layers import Input
from keras.optimizers import Adam
from keras.models import Model
from keras.callbacks import EarlyStopping, ModelCheckpoint


import transformers
import tokenizers
from tqdm.notebook import tqdm
from tokenizers import BertWordPieceTokenizer

  _torch_pytree._register_pytree_node(


In [None]:
# Record the library versions this run used (one per line).
print(transformers.__version__, tokenizers.__version__, sep="\n")

4.35.2
0.15.2


In [None]:
def regular_encode(texts, tokenizer, maxlen=512):
    """Tokenize a batch of texts into fixed-length sequences of token ids.

    Every text is truncated to at most ``maxlen`` tokens and padded up to
    exactly ``maxlen`` tokens.

    Args:
        texts: An iterable of strings (list, tuple, array, ...). A single
            bare string is treated as a batch of one.
        tokenizer: Object exposing ``batch_encode_plus`` (e.g. a Hugging Face
            tokenizer).
        maxlen: Target sequence length used for both truncation and padding.

    Returns:
        The ``'input_ids'`` entry of the tokenizer output: one sequence of
        token ids per input text.
    """
    # Materialise the batch so generators / Series / arrays are accepted, and
    # avoid iterating a lone string character by character.
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    encode_dictionary = tokenizer.batch_encode_plus(
        texts,
        # `pad_to_max_length=True` is deprecated; this is its replacement.
        padding='max_length',
        max_length=maxlen,
        truncation=True,
        # Only the ids are returned below, so do not build the other outputs.
        return_attention_mask=False,
        return_token_type_ids=False)

    return encode_dictionary['input_ids']

In [None]:
from tensorflow.keras.regularizers import l2

def build_model(transformer, max_len=512):
    """Wrap a pretrained transformer encoder in a binary classifier.

    The network takes a single int32 input of token ids, runs it through
    ``transformer``, keeps the hidden state at sequence position 0, applies
    dropout and finishes with a one-unit sigmoid layer.

    Args:
        transformer: Callable Keras-compatible encoder whose first output is
            the per-token hidden-state tensor.
        max_len: Length of the token-id sequences fed to the model.

    Returns:
        A compiled Keras ``Model`` (Adam, binary cross-entropy, accuracy).
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")

    # NOTE(review): only token ids are passed, no attention mask, so padded
    # positions are presumably attended to like real tokens — confirm whether
    # a mask input should be added.
    sequence_output = transformer(input_word_ids)[0]

    # Hidden state of the first token of every sequence (the [CLS] slot for
    # BERT-style inputs) is used as the sentence representation.
    cls_token = sequence_output[:, 0, :]

    # Dropout to reduce overfitting.
    cls_token = Dropout(0.3)(cls_token)

    # Single sigmoid unit with L2 weight regularisation.
    out = Dense(1, activation='sigmoid', kernel_regularizer=l2(0.01))(cls_token)

    model = Model(inputs=input_word_ids, outputs=out)
    # `lr` is a deprecated alias that newer Keras releases warn about or
    # reject; `learning_rate` is the supported keyword.
    model.compile(Adam(learning_rate=1e-5), loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [None]:
# Detect hardware and pick the matching distribution strategy.
try:
    # TPU detection. No parameters are necessary if the TPU_NAME environment
    # variable is set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # The resolver raises ValueError when no TPU can be located.
    tpu = None

if tpu is not None:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # `tf.distribute.experimental.TPUStrategy` is deprecated; use the stable
    # `tf.distribute.TPUStrategy` instead.
    strategy = tf.distribute.TPUStrategy(tpu)
else:
    # Default distribution strategy in TensorFlow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

Running on TPU  




REPLICAS:  8


In [None]:
# Configuration — every tunable value of the run lives here.

# Name of the pretrained checkpoint used for both tokenizer and encoder.
MODEL = 'bert-base-multilingual-cased'

# Token length every comment is truncated / padded to.
MAX_LEN = 192

# Number of epochs for the first training run.
EPOCHS = 7

# Global batch size: 16 examples for each replica of the strategy.
BATCH_SIZE = strategy.num_replicas_in_sync * 16

# Let tf.data choose the prefetch buffer size.
AUTO = tf.data.experimental.AUTOTUNE

In [None]:
# Load the tokenizer that belongs to the pretrained checkpoint named by MODEL.
# (Original author's note: the BERT tokenizer's vocabulary is said to cover
# emoji — not verified here.)
#
# TFAutoModel is imported here as well; a later cell uses it to load the
# encoder weights.
from transformers import TFAutoModel, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Alternative that is currently disabled: load
# transformers.DistilBertTokenizer ('distilbert-base-multilingual-cased'),
# save it locally with tokenizer.save_pretrained('.'), and reload the
# resulting 'vocab.txt' through tokenizers.BertWordPieceTokenizer with
# lowercase=False.

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [None]:
# Read the train / validation / test splits, in that order.
train, valid, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "Toxicity Dataset/jigsaw-toxic-comment-train.csv",
        "Toxicity Dataset/validation.csv",
        "Toxicity Dataset/test.csv",
    )
)

In [None]:
%%time
# Run regular_encode over the text column of each split so every comment
# becomes a fixed-length vector of token ids. The text column is called
# "comment_text" in train / valid and "content" in test.
x_train, x_valid, x_test = (
    np.array(regular_encode(text_column.tolist(), tokenizer, maxlen=MAX_LEN))
    for text_column in (train["comment_text"], valid["comment_text"], test["content"])
)

# Targets: the "toxic" column of the two labelled splits.
y_train = train["toxic"].to_numpy()
y_valid = valid["toxic"].to_numpy()



CPU times: user 5min 11s, sys: 873 ms, total: 5min 12s
Wall time: 5min 11s


In [None]:
# Training pipeline: an endless stream of shuffled batches (the number of
# steps per epoch is given to `fit`).
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_train, y_train))
    # Shuffle BEFORE repeating: the shuffle buffer is then drained at the end
    # of each pass, so examples from different passes over the data are never
    # mixed, and every pass gets a fresh shuffle order.
    .shuffle(2048, seed=40)
    .repeat()
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

# Validation pipeline: fixed order, batched once, then cached so later
# passes reuse the already-batched tensors.
valid_dataset = tf.data.Dataset.from_tensor_slices((x_valid, y_valid))
valid_dataset = valid_dataset.batch(BATCH_SIZE).cache().prefetch(AUTO)

# Test pipeline: inputs only (no labels), batched in their original order.
test_dataset = tf.data.Dataset.from_tensor_slices(x_test)
test_dataset = test_dataset.batch(BATCH_SIZE)

In [None]:
%%time
# Create the encoder and the classifier inside the distribution strategy's
# scope, so their variables are created under that strategy (the TPU when one
# was detected).
with strategy.scope():
    # Load the pretrained encoder named by MODEL and hand it to build_model,
    # which stacks the classification head on top of its output.
    transformer_layer = TFAutoModel.from_pretrained(MODEL)
    model = build_model(transformer_layer, max_len=MAX_LEN)
# Print the layer / parameter overview of the assembled model.
model.summary()

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_word_ids (InputLayer  [(None, 192)]             0         
 )                                                               
                                                                 
 tf_bert_model (TFBertModel  TFBaseModelOutputWithPo   177853440 
 )                           olingAndCrossAttentions             
                             (last_hidden_state=(Non             
                             e, 192, 768),                       
                              pooler_output=(None, 7             
                             68),                                
                              past_key_values=None,              
                             hidden_states=None, att             
                             entions=None, cross_att             
                             entions=None)                 

In [None]:
# Stop once the validation loss has gone one epoch without improving, and
# roll the model back to the best weights seen so far.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=1,
    restore_best_weights=True,
)

# train_dataset repeats forever, so the length of an epoch is given as a
# number of full batches.
n_steps = len(x_train) // BATCH_SIZE

train_history = model.fit(
    x=train_dataset,
    validation_data=valid_dataset,
    epochs=EPOCHS,
    steps_per_epoch=n_steps,
    callbacks=[early_stopping],
)

Epoch 1/7




Epoch 2/7


In [None]:

# Second training run: continue fitting the same model on the validation
# split, for twice as many epochs as the first run.
# NOTE(review): this reuses `n_steps` (now counted in validation batches) and
# leaves no held-out data for monitoring; there is no validation_data and no
# early stopping here — confirm this is intentional.
n_steps = len(x_valid) // BATCH_SIZE

train_history_2 = model.fit(
    x=valid_dataset.repeat(),
    epochs=EPOCHS * 2,
    steps_per_epoch=n_steps,
)

Epoch 1/14
Epoch 2/14
Epoch 3/14
Epoch 4/14
Epoch 5/14
Epoch 6/14
Epoch 7/14
Epoch 8/14
Epoch 9/14
Epoch 10/14
Epoch 11/14
Epoch 12/14
Epoch 13/14
Epoch 14/14


In [None]:
# Write the trained model to disk (".h5" file name).
# NOTE(review): the file name says "seven_epochs", but by this point the model
# has also been through the second fit on the validation split (EPOCHS*2
# epochs) — confirm the name is still accurate.
# NOTE(review): the model wraps a Hugging Face encoder layer; verify that the
# saved file can actually be reloaded before relying on it.
model.save('multilingual_bert_toxic_seven_epochs.h5')