In [None]:
!pip install transformers
!pip install sentencepiece
!pip install -U scikit-learn
!pip install keras-nlp tensorflow --upgrade
!pip install seaborn


In [None]:
# Import the necessary packages
import os
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint

import transformers
from transformers import TFAutoModel, AutoTokenizer
from tqdm.notebook import tqdm
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors

from keras import backend as K


## Helper Functions

In [None]:
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512):
    '''
    Method to encode the texts chunk by chunk using a `tokenizers`-style tokenizer

    Params:
        texts  - sliceable sequence of input texts (list, numpy array or pandas Series)
        tokenizer - tokenizer exposing enable_truncation, enable_padding and encode_batch
        chunk_size - number of texts handed to each encode_batch call
        maxlen - length every encoded text is truncated / padded to

    Returns:
        numpy array of encoded texts, one row of token ids per input text
    '''
    tokenizer.enable_truncation(max_length=maxlen)
    try:
        # Current `tokenizers` releases call the target length `length`
        tokenizer.enable_padding(length=maxlen)
    except TypeError:
        # Older releases only accept the keyword `max_length`
        tokenizer.enable_padding(max_length=maxlen)
    all_ids = []

    for i in tqdm(range(0, len(texts), chunk_size)):
        text_chunk = texts[i:i+chunk_size]
        # numpy arrays / pandas Series are converted to a plain list; lists pass through as is
        if hasattr(text_chunk, "tolist"):
            text_chunk = text_chunk.tolist()
        encs = tokenizer.encode_batch(text_chunk)
        all_ids.extend(enc.ids for enc in encs)

    return np.array(all_ids)

def regular_encode(texts, tokenizer, maxlen=512):
    '''
    Method to encode the texts using a tokenizer in a normal way

    Params:
        texts  - list of input texts to be encoded
        tokenizer - tokenizer exposing batch_encode_plus
        maxlen - length every encoded text is truncated / padded to
    Returns:
        numpy array built from the 'input_ids' of the encoded texts
    '''
    enc_di = tokenizer.batch_encode_plus(
        texts,
        return_token_type_ids=False,
        # `pad_to_max_length=True` is deprecated; request the same fixed-length
        # padding explicitly and make the truncation to `maxlen` explicit too,
        # so every row has the same length and np.array gets a rectangular input
        padding='max_length',
        truncation=True,
        max_length=maxlen
    )

    return np.array(enc_di['input_ids'])

In [None]:
def recall_m(y_true, y_pred):
    '''
    Method to calculate the recall score

    Params:
        y_true - original y values
        y_pred - predicted y values
    Returns:
        recall score: rounded true positives divided by rounded actual positives,
        summed over every element of the given tensors
    '''
    # Elements that are positive in both the labels and the (rounded) predictions
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    # Elements that are positive in the labels
    actual_mask = K.round(K.clip(y_true, 0, 1))
    # epsilon keeps the division defined when there are no positives
    return K.sum(hit_mask) / (K.sum(actual_mask) + K.epsilon())

def precision_m(y_true, y_pred):
    '''
    Method to calculate the precision score

    Params:
        y_true - original y values
        y_pred - predicted y values
    Returns:
        precision score: rounded true positives divided by rounded predicted positives,
        summed over every element of the given tensors
    '''
    overlap = K.clip(y_true * y_pred, 0, 1)
    true_positive_count = K.sum(K.round(overlap))

    clipped_predictions = K.clip(y_pred, 0, 1)
    predicted_positive_count = K.sum(K.round(clipped_predictions))

    # epsilon keeps the division defined when nothing is predicted positive
    denominator = predicted_positive_count + K.epsilon()
    return true_positive_count / denominator

def f1_m(y_true, y_pred):
    '''
    Method to calculate the f1 score

    Params:
        y_true - original y values
        y_pred - predicted y values
    Returns:
        f1 score, i.e. the harmonic mean of precision_m and recall_m
    '''
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    # epsilon keeps the division defined when precision and recall are both zero
    half_f1 = (p * r) / (p + r + K.epsilon())
    return 2 * half_f1

In [None]:
def vanilla_transformer(num_heads=8, feed_forward_dim=768, attention_dropout=0.1, feed_forward_dropout=0.1):
    '''
    Method to build one Transformer-style block (self-attention + feed forward,
    each followed by dropout, a residual connection and layer normalization).

    The block is returned as a callable so that it can be applied to a tensor,
    which is how build_model uses it: vanilla_transformer()(sequence_output).
    The previous version read `sequence_output` from a scope where it does not exist.

    Params:
        num_heads - number of attention heads
        feed_forward_dim - number of neurons in the feed forward layer; it is added
                           to the attention output, so it has to match that width
        attention_dropout - dropout rate after the attention layer
        feed_forward_dropout - dropout rate after the feed forward layer
    Returns:
        callable taking the sequence tensor and returning the block output tensor
    '''
    def apply_block(sequence_output):
        # Self-attention: the sequence is used as both query and value
        attention_layer = tf.keras.layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=sequence_output.shape[-1])(sequence_output, sequence_output)
        attention_layer = Dropout(attention_dropout)(attention_layer)
        # Residual connection around the attention, then normalization
        attention_layer = tf.keras.layers.LayerNormalization(
            epsilon=1e-6)(attention_layer + sequence_output)

        feed_forward_layer = Dense(feed_forward_dim, activation='relu')(attention_layer)
        feed_forward_layer = Dropout(feed_forward_dropout)(feed_forward_layer)
        # Residual connection around the feed forward layer, then normalization
        transformer_layer = tf.keras.layers.LayerNormalization(
            epsilon=1e-6)(feed_forward_layer + attention_layer)

        return transformer_layer

    return apply_block

def build_model(transformer, max_len=512):
    '''
    Method to build and compile the classifier: the given transformer encodes the
    token ids, one vanilla_transformer block is applied on top of its sequence
    output, and a stack of Dense layers maps the first token's vector to a
    2-way softmax.

    Params:
        transformer - callable encoder; element [0] of its output is used as the sequence output
        max_len - length of the token id sequences fed to the model
    Returns:
        compiled Keras model
    '''
    # define the input layer
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")

    # encode the token ids with the supplied transformer
    sequence_output = transformer(input_word_ids)[0]

    transformer_layer = vanilla_transformer()(sequence_output)

    # keep only the vector at the first sequence position
    cls_token = transformer_layer[:, 0, :]
    x1 = Dense(256, activation = 'relu')(cls_token)
    x1 = Dense(128, activation = 'relu')(x1)
    x1 = Dense(64, activation = 'relu')(x1)
    # NOTE(review): a 4-unit layer between the 64- and 32-unit layers is a very
    # narrow bottleneck — confirm it is intentional
    x1 = Dense(4, activation = 'relu')(x1)
    x1 = Dense(32, activation = 'relu')(x1)
    x1 = Dense(16, activation = 'relu')(x1)
    out = Dense(2, activation='softmax')(x1)

    model = Model(inputs=input_word_ids, outputs=out)
    # `lr` is the deprecated spelling of this argument; `learning_rate` is the supported one
    model.compile(Adam(learning_rate=1e-5), loss='binary_crossentropy', metrics=['accuracy', recall_m, precision_m, f1_m])

    return model

## TPU Configs

In [None]:
# Detect hardware, return appropriate distribution strategy
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # The resolver raises ValueError when it finds no TPU; fall back below
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # tf.distribute.experimental.TPUStrategy is deprecated in favour of the stable alias
    strategy = tf.distribute.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
# Let tf.data pick the prefetch buffer size (passed to .prefetch() in the dataset cell)
AUTO = tf.data.experimental.AUTOTUNE

# Configuration
EPOCHS = 1  # NOTE: reassigned to 10 in the first training cell before model.fit is called
BATCH_SIZE = 16 * strategy.num_replicas_in_sync  # 16 examples per replica
MAX_LEN = 275  # passed as maxlen to regular_encode and as max_len to build_model
MODEL = 'jplu/tf-xlm-roberta-base'  # checkpoint name given to AutoTokenizer / TFAutoModel.from_pretrained

## Create fast tokenizer

In [None]:
# Load the pretrained tokenizer that belongs to the MODEL checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL)

## Load text data into memory

In [None]:
# Directory holding the competition CSV files
DATA_DIR = "/kaggle/input/jigsaw-multilingual-toxic-comment-classification"

train1 = pd.read_csv(os.path.join(DATA_DIR, "jigsaw-toxic-comment-train.csv"))
train2 = pd.read_csv(os.path.join(DATA_DIR, "jigsaw-unintended-bias-train.csv"))
# Round the toxicity values of the second file to 0/1 integer labels
train2["toxic"] = train2["toxic"].round().astype(int)

valid = pd.read_csv(os.path.join(DATA_DIR, "validation.csv"))
test = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

# All rows of the first file, every toxic row of the second file and a fixed
# sample of 100000 non-toxic rows of the second file
train = pd.concat([
    train1[['comment_text', 'toxic']],
    train2[['comment_text', 'toxic']].query('toxic==1'),
    train2[['comment_text', 'toxic']].query('toxic==0').sample(n=100000, random_state=0)
])

# Keep half of each class. GroupBy.sample keeps the 'toxic' column and the
# original index (no apply/droplevel round trip), and the fixed seed makes the
# selection reproducible like the sample above.
train = train.groupby('toxic').sample(frac=0.5, random_state=0)

print("length of train data : ",len(train.index))
print("length of valid data : ", len(valid.index))


In [None]:
# Subsample the data: draw as many non-toxic rows as there are toxic rows

nontoxic = train[train['toxic']==0]
toxic = train[train["toxic"]==1]

# Fixed seed so the balanced subset is the same on every run
new_nontoxic = nontoxic.sample(toxic.shape[0], random_state=0)
new_nontoxic.shape[0]

In [None]:
from sklearn.utils import shuffle
# Balanced training set (toxic + equally many non-toxic rows) and the full,
# unbalanced one; both are shuffled with a fixed seed so runs are reproducible
train = shuffle(pd.concat([toxic, new_nontoxic]), random_state=0)
train_unsampled = shuffle(pd.concat([toxic, nontoxic]), random_state=0)

In [None]:
# Drop the raw training frames; they are no longer needed and this frees RAM.
del train1, train2

In [None]:
# preprocess the data into x_train, x_valid and x_test
%%time 

x_train = regular_encode(train.comment_text.values.tolist(), tokenizer, maxlen=MAX_LEN)
x_valid = regular_encode(valid.comment_text.values.tolist(), tokenizer, maxlen=MAX_LEN)

x_test = regular_encode(test.content.values.tolist(), tokenizer, maxlen=MAX_LEN)

y_train = train.toxic.values.tolist()
y_valid = valid.toxic.values.tolist()

In [None]:
# preprocess the data into x_train_unsampled

%%time 

x_train_unsampled = regular_encode(train_unsampled.comment_text.values.tolist(), tokenizer, maxlen=MAX_LEN)
y_train_unsampled = train_unsampled.toxic.values.tolist()


In [None]:
# One-hot encode the three label lists into two columns (one per class id)
y_train, y_train_unsampled, y_valid = (
    tf.keras.utils.to_categorical(labels, num_classes = 2)
    for labels in (y_train, y_train_unsampled, y_valid)
)

## Build datasets objects

In [None]:
# Training pipeline: endless stream, shuffled through a 2048-element buffer,
# batched and prefetched
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.repeat().shuffle(2048).batch(BATCH_SIZE).prefetch(AUTO)

# Validation pipeline: batched once, cached, then prefetched
valid_dataset = tf.data.Dataset.from_tensor_slices((x_valid, y_valid))
valid_dataset = valid_dataset.batch(BATCH_SIZE).cache().prefetch(AUTO)

# Test pipeline: inputs only, batched
test_dataset = tf.data.Dataset.from_tensor_slices(x_test).batch(BATCH_SIZE)

# Number of full batches per pass over the training / validation arrays
n_steps, n_steps_valid = (
    x_train.shape[0] // BATCH_SIZE,
    x_valid.shape[0] // BATCH_SIZE,
)

# The arrays now live inside train_dataset; release the originals
del x_train, y_train

## Load model into the TPU

In [None]:
# Class id -> class name, and the inverse mapping derived from it
id2label = dict(enumerate(("NONTOXIC", "TOXIC")))
label2id = {label: class_id for class_id, label in id2label.items()}

In [None]:
%%time
# Load the pretrained encoder and build the classifier inside the distribution
# strategy scope, then print the layer summary.
with strategy.scope():
    transformer_layer = TFAutoModel.from_pretrained(MODEL, num_labels=2, id2label=id2label, label2id=label2id)
    
    model = build_model(transformer_layer, max_len=MAX_LEN)
model.summary()

## Train Model

First, we train on the subset of the training set, which is completely in English.

In [None]:
# EPOCHS is reassigned here, replacing the value set in the configuration cell
EPOCHS = 10
# train_dataset repeats endlessly, so steps_per_epoch decides where each epoch ends
train_history = model.fit(
    train_dataset,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
    epochs=EPOCHS
)

Now that we have pretty much saturated the learning potential of the model on English-only data, we train it for three more epochs on the `validation` set, which is significantly smaller but contains a mixture of different languages.

In [None]:
# Continue training on the validation data. repeat() makes the dataset endless,
# so steps_per_epoch bounds each epoch; no validation_data is passed, so this
# history only contains training metrics.
train_history_2 = model.fit(
    valid_dataset.repeat(),
    steps_per_epoch=n_steps_valid,
    epochs=3
)

In [None]:
import itertools
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
def plot_confusion_matrix(cm, classes, normalize=True, title='Confusion matrix', cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting normalize=True.

    Params:
        cm - confusion matrix as a 2D numpy array (rows: true labels, columns: predicted labels)
        classes - tick labels, in the order of the rows / columns of cm
        normalize - if True, divide every row by its sum before plotting
        title - title of the figure
        cmap - matplotlib colormap used for the image
    """
    # Normalize BEFORE drawing, so that the image colours, the printed cell
    # values and the text-colour threshold all refer to the same matrix.
    if normalize:
        # A row without any samples divides 0 by 0; silence the warning and
        # turn the resulting NaN into 0 below.
        with np.errstate(divide='ignore', invalid='ignore'):
            cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        cm = np.around(cm, decimals=2)
        cm[np.isnan(cm)] = 0.0
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.figure(figsize=(10,10))

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells darker than half of the maximum get white text, the others black
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
### Overall Model
# Class names in class-id order: the labels were one-hot encoded from the
# `toxic` column, so index 0 is non-toxic and index 1 is toxic, and
# confusion_matrix orders its rows / columns by ascending label.
target_names = ['Non-Toxic', 'Toxic']

Y_pred = model.predict(x_valid)
y_preds = np.argmax(Y_pred, axis=1)
print('Confusion Matrix')
rounded_labels=np.argmax(y_valid, axis=1)
cm = confusion_matrix(y_true = rounded_labels, y_pred = y_preds)
print(cm)
plot_confusion_matrix(cm, target_names, title='Confusion Matrix')

In [None]:
import seaborn as sns
def plot_confusion_matrix2(y_true, y_pred, classes):
    """
    Plot the confusion matrix of the given labels as an annotated seaborn heatmap.

    Params:
        y_true - true class labels
        y_pred - predicted class labels
        classes - tick labels for both axes
    """
    counts = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(8, 6))
    heatmap_options = dict(
        annot=True,
        cmap=plt.cm.Blues,
        xticklabels=classes,
        yticklabels=classes,
        fmt='g',
    )
    sns.heatmap(counts, **heatmap_options)
    plt.xlabel('Predicted labels')
    plt.ylabel('True labels')
    plt.title('Confusion Matrix')
    plt.show()
# Draw the heatmap for the validation labels and predictions computed in the previous cell
plot_confusion_matrix2(y_true = rounded_labels, y_pred = y_preds, classes=target_names)

In [None]:
from sklearn.metrics import precision_recall_curve
precision = dict()
recall = dict()
# Column i of the one-hot labels and of the softmax output belongs to class id i:
# 0 is non-toxic and 1 is toxic (the legend labels used to be swapped)
curve_labels = {0: 'NonToxic', 1: 'Toxic'}
for i in range(2):
    precision[i], recall[i], _ = precision_recall_curve(y_valid[:, i],
                                                        Y_pred[:, i])
    plt.plot(recall[i], precision[i], lw=2, label=curve_labels[i])

plt.xlabel("recall")
plt.ylabel("precision")
plt.legend(loc="best")
plt.title("precision vs. recall curve")
plt.show()

In [None]:
# Curves of the second training run. That fit call passed no validation data,
# so only training metrics exist; the titles say so instead of announcing
# validation curves that are not plotted.
axes = plt.gca()
acc = train_history_2.history['accuracy']
loss = train_history_2.history['loss']
epochs = range(1, len(acc) + 1)
# Training accuracy per epoch
plt.plot(epochs, acc, 'b', label='Training accuracy')
plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.title('Training accuracy')

plt.legend()

plt.figure()
# Training loss per epoch
plt.plot(epochs, loss, 'b', label='Training loss')
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.title('Training loss')
plt.legend()
plt.show()

In [None]:
# Curves of the first training run: one figure for accuracy, one for loss,
# each with the training curve in blue and the validation curve in red.
axes = plt.gca()
history = train_history.history
acc, val_acc = history['accuracy'], history['val_accuracy']
loss, val_loss = history['loss'], history['val_loss']
epochs = range(1, len(acc) + 1)

curves = (
    (acc, val_acc, 'Training accuracy', 'Validation accuracy', "Accuracy",
     'Training accuracy and Validation accuracy'),
    (loss, val_loss, 'Training loss', 'Validation loss', "Loss",
     'Training loss and Validation loss'),
)
for panel, (train_curve, valid_curve, train_label, valid_label, y_label, heading) in enumerate(curves):
    # The first panel draws on the current figure, the second one opens a new figure
    if panel > 0:
        plt.figure()
    plt.plot(epochs, train_curve, 'b', label=train_label)
    plt.plot(epochs, valid_curve, 'r', label=valid_label)
    plt.xlabel("Epochs")
    plt.ylabel(y_label)
    plt.title(heading)
    plt.legend()
plt.show()

In [None]:
# Loss and compiled metrics on the validation dataset (displayed as the cell output)
model.evaluate(valid_dataset)

In [None]:
from sklearn.metrics import classification_report
import time
import numpy as np
# Time the prediction on the validation inputs together with the report below
start_time = time.time()
# Predicted class id for every validation row (index of the highest model output)
test_predictions = np.argmax(model.predict(x_valid), axis=1)
# True class id recovered from the one-hot validation labels
rounded_labels = np.argmax(y_valid, axis=1)
print(classification_report(rounded_labels, test_predictions))
elapsed = time.time() - start_time
print("Time taken to predict the model " + str(elapsed))

In [None]:
# The training and validation pipelines are not used after this point
del train_dataset, valid_dataset

In [None]:
# Reload the test set and read the sample submission file
test = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/test.csv')
sub = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/sample_submission.csv')
# NOTE(review): this binds the pd.read_csv function itself (it is never called
# here) — the statement looks unfinished; confirm what was meant to be loaded
submission = pd.read_csv

In [None]:
# Encode every test comment and wrap the result in a batched dataset for inference
x_test = regular_encode(test.content.values.tolist(), tokenizer, maxlen=MAX_LEN)
test_dataset = tf.data.Dataset.from_tensor_slices(x_test).batch(BATCH_SIZE)