In [1]:
import os
import re
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import *
from tensorflow.keras.activations import *
from tensorflow.keras.constraints import *
from tensorflow.keras.initializers import *
from tensorflow.keras.regularizers import *
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler, CSVLogger
from kaggle_datasets import KaggleDatasets
import transformers
from transformers import TFAutoModel, AutoTokenizer
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors

from sklearn.metrics import accuracy_score, roc_auc_score

import warnings
warnings.filterwarnings('ignore')

from tqdm import tqdm_notebook
tqdm_notebook().pandas()
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

In [2]:
def seed_everything(framework, SEED):
    """Seed every random number generator used by the notebook.

    Parameters
    ----------
    framework : str
        ``'Pytorch'`` or ``'Tensorflow'``; selects which deep-learning library
        is seeded in addition to ``random`` and NumPy. Any other value seeds
        only ``random``, NumPy and the ``PYTHONHASHSEED`` variable.
    SEED : int
        Seed value applied to every generator.
    """
    random.seed(SEED)
    # Exported for child processes; hash randomisation of the interpreter that
    # is already running is decided at start-up.
    os.environ["PYTHONHASHSEED"] = str(SEED)
    np.random.seed(SEED)
    if framework == 'Pytorch':
        # torch is not imported at the top of this notebook, so import it
        # lazily here; TensorFlow-only runs never need it installed.
        import torch
        torch.manual_seed(SEED)
        torch.cuda.manual_seed(SEED)
    elif framework == 'Tensorflow':
        tf.random.set_seed(SEED)
        
# Seed all generators once, up front; SEED is a module-level constant.
framework = 'Tensorflow'
SEED = 2020
seed_everything(framework, SEED)

In [3]:
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512):
    """Encode texts to fixed-length token-id rows, one chunk at a time.

    Parameters
    ----------
    texts : sequence of str
        Any sliceable sequence with a length: NumPy array, pandas Series or a
        plain list/tuple.
    tokenizer : object
        Must provide ``enable_truncation``, ``enable_padding`` and
        ``encode_batch``; each encoding returned by ``encode_batch`` must
        expose ``.ids`` (presumably a ``tokenizers.Tokenizer`` -- confirm).
        Its truncation and padding settings are changed as a side effect.
    chunk_size : int
        Number of texts passed to ``encode_batch`` per call.
    maxlen : int
        Length every sequence is truncated and padded to.

    Returns
    -------
    numpy.ndarray
        Array built from the list of id rows, in input order.
    """
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding(max_length=maxlen)
    all_ids = []

    for start in tqdm(range(0, len(texts), chunk_size)):
        chunk = texts[start:start + chunk_size]
        # Arrays and Series expose .tolist(); plain lists and tuples do not.
        text_chunk = chunk.tolist() if hasattr(chunk, "tolist") else list(chunk)
        encs = tokenizer.encode_batch(text_chunk)
        all_ids.extend(enc.ids for enc in encs)

    return np.array(all_ids)

def regular_encode(texts, tokenizer, maxlen=512):
    """Tokenise ``texts`` in one call and return the input ids as an array.

    ``tokenizer.batch_encode_plus`` is asked for ids only (no attention masks,
    no token type ids), padded to ``maxlen``.

    NOTE(review): ``return_attention_masks`` and ``pad_to_max_length`` are the
    keyword names the notebook's transformers version was run with -- confirm
    they are still accepted before upgrading the library.
    """
    encode_options = dict(
        return_attention_masks=False,
        return_token_type_ids=False,
        pad_to_max_length=True,
        max_length=maxlen,
    )
    encoded = tokenizer.batch_encode_plus(texts, **encode_options)
    input_ids = encoded['input_ids']
    return np.array(input_ids)

def focal_loss(gamma=2., alpha=.25):
    """Build a binary focal loss for probability (sigmoid) outputs.

    Parameters
    ----------
    gamma : float
        Exponent of the modulating factor applied to each term.
    alpha : float
        Weight of the positive-class term; the negative-class term is
        weighted by ``1 - alpha``.

    Returns
    -------
    callable
        ``loss(y_true, y_pred)`` returning a scalar tensor, suitable for
        ``model.compile(loss=...)``.
    """
    def focal_loss_fixed(y_true, y_pred):
        # Keep probabilities strictly inside (0, 1): a saturated sigmoid would
        # otherwise feed 0 into K.log and make the loss infinite.
        eps = K.epsilon()
        y_pred = K.clip(y_pred, eps, 1. - eps)
        # Predictions of positive samples (1 elsewhere, so those entries add 0).
        pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred))
        # Predictions of negative samples (0 elsewhere, so those entries add 0).
        pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred))
        positive_term = K.mean(alpha * K.pow(1. - pt_1, gamma) * K.log(pt_1))
        negative_term = K.mean((1 - alpha) * K.pow(pt_0, gamma) * K.log(1. - pt_0))
        return -positive_term - negative_term
    return focal_loss_fixed

def build_model(transformer, loss='binary_crossentropy', max_len=512,
                learning_rate=3e-5, dropout_rate=0.2):
    """Wrap a transformer encoder in a compiled binary classifier.

    Parameters
    ----------
    transformer : callable Keras layer/model
        Called on the int32 id tensor; element ``[0]`` of its output is used
        as the per-token hidden states.
    loss : str or callable
        Loss passed to ``model.compile``.
    max_len : int
        Fixed input sequence length.
    learning_rate : float
        Adam learning rate (previously hard-coded to 3e-5).
    dropout_rate : float
        Dropout applied to the pooled token before the output unit
        (previously hard-coded to 0.2).

    Returns
    -------
    tensorflow.keras.Model
        Compiled model with a single sigmoid output and an AUC metric that is
        logged as ``auc`` / ``val_auc``.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    # Hidden state at position 0 of every sequence is the pooled representation.
    cls_token = sequence_output[:, 0, :]
    cls_token = tf.keras.layers.Dropout(dropout_rate)(cls_token)
    out = Dense(1, activation='sigmoid')(cls_token)

    model = Model(inputs=input_word_ids, outputs=out)
    model.compile(
        # `lr` is the deprecated spelling of `learning_rate`.
        Adam(learning_rate=learning_rate),
        loss=loss,
        # Explicit name keeps the log key stable even if the model is rebuilt
        # in the same session (Keras would otherwise number it auc_1, ...).
        metrics=[tf.keras.metrics.AUC(name='auc')],
    )

    return model

In [4]:
# Detect hardware, return appropriate distribution strategy
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # A ValueError from the resolver is treated as "no TPU available".
    tpu = None

if tpu:
    # Connect to the TPU cluster and initialise it before creating the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

# The replica count scales BATCH_SIZE and the peak learning rate further down.
print("REPLICAS: ", strategy.num_replicas_in_sync)

Running on TPU  grpc://10.0.0.2:8470
REPLICAS:  8


In [5]:
# Lets tf.data pick the prefetch buffer size.
AUTO = tf.data.experimental.AUTOTUNE

# Data access
#GCS_DS_PATH = KaggleDatasets().get_gcs_path()

# Configuration
# Global batch size: 16 samples per replica.
BATCH_SIZE = 16 * strategy.num_replicas_in_sync
# Sequence length every text is padded to by regular_encode().
MAX_LEN = 200
# Hugging Face model id used for both the tokenizer and the encoder weights.
MODEL = 'jplu/tf-xlm-roberta-large'

# First load the real tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=738.0, style=ProgressStyle(description_…





HBox(children=(FloatProgress(value=0.0, description='Downloading', max=5069051.0, style=ProgressStyle(descript…




In [6]:
# English training sets from the competition data.
train1 = pd.read_csv("/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv")
train2 = pd.read_csv("/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-unintended-bias-train.csv")
# Round train2's `toxic` values to the nearest integer so they become 0/1 labels.
train2.toxic = train2.toxic.round().astype(int)

# Translated copies of the training set, one file per language (es, fr, it, pt, ru, tr).
train_es = pd.read_csv('../input/jigsaw-train-multilingual-coments-google-api/jigsaw-toxic-comment-train-google-es-cleaned.csv')
train_fr = pd.read_csv('../input/jigsaw-train-multilingual-coments-google-api/jigsaw-toxic-comment-train-google-fr-cleaned.csv')
train_it = pd.read_csv('../input/jigsaw-train-multilingual-coments-google-api/jigsaw-toxic-comment-train-google-it-cleaned.csv')
train_pt = pd.read_csv('../input/jigsaw-train-multilingual-coments-google-api/jigsaw-toxic-comment-train-google-pt-cleaned.csv')
train_ru = pd.read_csv('../input/jigsaw-train-multilingual-coments-google-api/jigsaw-toxic-comment-train-google-ru-cleaned.csv')
train_tr = pd.read_csv('../input/jigsaw-train-multilingual-coments-google-api/jigsaw-toxic-comment-train-google-tr-cleaned.csv')

# Validation set, test set (text column is `content`) and the submission template.
valid_data = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/validation.csv')
test_data = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/test.csv')
sub = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/sample_submission.csv')

# Number of toxic rows in train2; it sizes the non-toxic train2 sample below
# (toxic + toxic // 3 rows).
toxic = len(train2[['comment_text', 'toxic']].query('toxic==1'))

# Combine train data
# Each translated set contributes a fixed-size, fixed-seed sample per label.
train_data = (
    pd.concat([
        train1[['comment_text', 'toxic']],
        train2[['comment_text', 'toxic']].query('toxic==1'),

        train_es[['comment_text', 'toxic']].query('toxic==0').sample(n=10000, random_state=1995),
        train_es[['comment_text', 'toxic']].query('toxic==1').sample(n=2000, random_state=608),
        train_fr[['comment_text', 'toxic']].query('toxic==0').sample(n=10000, random_state=84),
        train_fr[['comment_text', 'toxic']].query('toxic==1').sample(n=2000, random_state=1993),
        train_it[['comment_text', 'toxic']].query('toxic==0').sample(n=10000, random_state=609),
        train_it[['comment_text', 'toxic']].query('toxic==1').sample(n=2000, random_state=1960),
        train_pt[['comment_text', 'toxic']].query('toxic==0').sample(n=5000, random_state=857),
        train_pt[['comment_text', 'toxic']].query('toxic==1').sample(n=1000, random_state=1125),
        train_ru[['comment_text', 'toxic']].query('toxic==0').sample(n=10000, random_state=49),
        train_ru[['comment_text', 'toxic']].query('toxic==1').sample(n=2000, random_state=1964),
        train_tr[['comment_text', 'toxic']].query('toxic==0').sample(n=5000, random_state=925),
        train_tr[['comment_text', 'toxic']].query('toxic==1').sample(n=1000, random_state=53),

        train2[['comment_text', 'toxic']].query('toxic==0').sample(n=(toxic+(toxic//3)), random_state=520)
    ])
    # The pieces above are stacked source by source (several of them hold a
    # single label), so shuffle all rows once here; downstream code only
    # shuffles within a small window. Also replaces the duplicated index.
    .sample(frac=1, random_state=SEED)
    .reset_index(drop=True)
)

In [7]:
def clean_usernames_links(text):
    """Strip wiki user signatures, IPv4-like numbers and paired links.

    Parameters
    ----------
    text : pandas.Series
        Raw comments; every value is converted with ``str()`` first.

    Returns
    -------
    pandas.Series
        New Series of cleaned strings, same index as ``text``.

    The substitutions run in this order on every value:
    1. newline -> single space
    2. ``[[User`` and everything after it -> removed
    3. four dot-separated groups of 1-3 digits -> removed
    4. ``(http://... (http://...)`` link pairs -> removed
    """
    # Raw strings: the previous non-raw literals relied on invalid escape
    # sequences such as "\[" and "\d", which Python warns about.
    substitutions = (
        (re.compile(r'\n'), ' '),
        (re.compile(r"\[\[User.*"), ''),
        (re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}"), ''),
        (re.compile(r"\(http://.*?\s\(http://.*\)"), ''),
    )

    def _clean(value):
        cleaned = str(value)
        for pattern, replacement in substitutions:
            cleaned = pattern.sub(replacement, cleaned)
        return cleaned

    # One pass over the Series instead of one pass per pattern.
    return text.map(_clean)

# Lower-case contraction / misspelling -> expansion, applied by
# replace_typical_misspell(). Key order is the regex alternation order.
# Changes from the previous table:
#   * "we'll" used to expand to " will" (the subject was dropped).
#   * "i'd" and "didn't" were listed twice; only the value that actually took
#     effect is kept ("i'd" -> "I had"; the contraction is ambiguous).
mispell_dict = {"aren't" : "are not",          "can't" : "cannot",
                "couldn't" : "could not",      "couldnt" : "could not",
                "didn't" : "did not",          "doesn't" : "does not",
                "doesnt" : "does not",         "don't" : "do not",
                "hadn't" : "had not",          "hasn't" : "has not",
                "haven't" : "have not",        "havent" : "have not",
                "he'd" : "he would",           "he'll" : "he will",
                "he's" : "he is",              "i'd" : "I had",
                "i'll" : "I will",             "i'm" : "I am",
                "isn't" : "is not",            "it's" : "it is",
                "it'll" : "it will",           "i've" : "I have",
                "let's" : "let us",            "mightn't" : "might not",
                "mustn't" : "must not",        "shan't" : "shall not",
                "she'd" : "she would",         "she'll" : "she will",
                "she's" : "she is",            "shouldn't" : "should not",
                "shouldnt" : "should not",     "that's" : "that is",
                "thats" : "that is",           "there's" : "there is",
                "theres" : "there is",         "they'd" : "they would",
                "they'll" : "they will",       "they're" : "they are",
                "theyre" : "they are",         "they've" : "they have",
                "we'd" : "we would",           "we're" : "we are",
                "weren't" : "were not",        "we've" : "we have",
                "what'll" : "what will",       "what're" : "what are",
                "what's" : "what is",          "what've" : "what have",
                "where's" : "where is",        "who'd" : "who would",
                "who'll" : "who will",         "who're" : "who are",
                "who's" : "who is",            "who've" : "who have",
                "won't" : "will not",          "wouldn't" : "would not",
                "you'd" : "you would",         "you'll" : "you will",
                "you're" : "you are",          "you've" : "you have",
                "'re" : " are",                "wasn't" : "was not",
                "we'll" : "we will",           "tryin'" : "trying"}

def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re


def replace_typical_misspell(text):
    """Return ``text`` with every ``mispell_dict`` key replaced by its value.

    The lookup table and its matching pattern both come from
    ``_get_mispell(mispell_dict)``; each match is swapped for the table entry
    stored under the matched text.
    """
    lookup, pattern = _get_mispell(mispell_dict)
    return pattern.sub(lambda found: lookup[found.group(0)], text)


# Characters that clean_text() surrounds with spaces, in replacement order.
# NOTE(review): the list also contains accented letters (à, â, é, è, ï, Â, Ã, Ø);
# padding them would split words that contain them in non-English text --
# confirm this is intended for the multilingual data.
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&',
          '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£',
          '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',
          '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', '\xa0', '\t',
          '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑',
          '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', '\u3000', '\u202f',
          '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫',
          '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '«',
          '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・',
          '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

def clean_text(x):
    """Put spaces around every character listed in ``puncts``.

    Parameters
    ----------
    x : object
        Converted with ``str()`` before processing.

    Returns
    -------
    str
        The text with each newline turned into a space and each ``puncts``
        character padded with one space on both sides. Runs of spaces are not
        collapsed here.
    """
    # A newline separates words: replace it with a space rather than deleting
    # it, otherwise the last word of one line is glued to the first word of
    # the next (this also matches what clean_usernames_links does).
    x = str(x).replace("\n", " ")
    for punct in puncts:
        x = x.replace(punct, f' {punct} ')
    return x

def cln_space(x):
    """Collapse each run of whitespace to one space and trim both ends."""
    words = x.split()
    return " ".join(words)

def preprocess(x):
    """Normalise one comment: lower-case, expand contractions, pad punctuation.

    Parameters
    ----------
    x : str
        Raw comment text.

    Returns
    -------
    str
        Cleaned text with single spaces between tokens.
    """
    x = x.lower()
    # Expand contractions first: clean_text() pads the apostrophe with spaces
    # ("don't" -> "don ' t"), after which no apostrophe key of mispell_dict
    # could match any more.
    x = replace_typical_misspell(x)
    x = clean_text(x)
    x = cln_space(x)

    return x

In [8]:
%%time
#train_data["comment_text"] = clean_usernames_links(train_data["comment_text"])
#train_data['comment_text'] = train_data['comment_text'].progress_map(lambda q: preprocess(q))

#valid_data["comment_text"] = clean_usernames_links(valid_data["comment_text"])
#valid_data['comment_text'] = valid_data['comment_text'].progress_map(lambda q: preprocess(q))

#test_data["content"] = clean_usernames_links(test_data["content"])
#test_data['content'] = test_data['content'].progress_map(lambda q: preprocess(q))

CPU times: user 6 µs, sys: 1 µs, total: 7 µs
Wall time: 13.8 µs


In [9]:
# Save the tokenizer files alongside the notebook outputs. `tokenizer` is the
# instance already loaded from MODEL in the configuration cell, so it is not
# loaded a second time here.
save_path = '/kaggle/working/xlmr_large/'
# exist_ok avoids the separate existence check and makes re-running the cell safe.
os.makedirs(save_path, exist_ok=True)
tokenizer.save_pretrained(save_path)

('/kaggle/working/xlmr_large/sentencepiece.bpe.model',
 '/kaggle/working/xlmr_large/special_tokens_map.json',
 '/kaggle/working/xlmr_large/added_tokens.json')

In [10]:
%%time 

# Tokenise every split into id rows padded to MAX_LEN.
# The test set keeps its text in the `content` column, the others in `comment_text`.
x_train = regular_encode(train_data.comment_text.values, tokenizer, maxlen=MAX_LEN)
x_valid = regular_encode(valid_data.comment_text.values, tokenizer, maxlen=MAX_LEN)
x_test = regular_encode(test_data.content.values, tokenizer, maxlen=MAX_LEN)

# Binary labels aligned row by row with x_train / x_valid.
y_train = train_data.toxic.values
y_valid = valid_data.toxic.values

CPU times: user 8min 29s, sys: 3.21 s, total: 8min 32s
Wall time: 8min 32s


In [11]:
# Training pipeline: repeats indefinitely, so fit() must be given
# steps_per_epoch. shuffle() comes after repeat() and uses a 2048-element buffer.
# NOTE(review): 2048 is small next to the training set; this only mixes
# well if x_train is already in random order -- confirm.
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_train, y_train))
    .repeat()
    .shuffle(2048)
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

# Validation pipeline: fixed order, batched, then cached and prefetched.
valid_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_valid, y_valid))
    .batch(BATCH_SIZE)
    .cache()
    .prefetch(AUTO)
)

# Test pipeline: inputs only (no labels), fixed order so predictions line up
# with the rows of x_test.
test_dataset = (
    tf.data.Dataset
    .from_tensor_slices(x_test)
    .batch(BATCH_SIZE)
)

In [12]:
from tensorflow.keras.callbacks import Callback
class RocAucEvaluation(Callback):
    """Keras callback that prints ROC-AUC on a held-out set during training.

    Parameters
    ----------
    validation_data : tuple
        ``(X_val, y_val)``: inputs passed to ``model.predict`` and the true
        labels passed to ``roc_auc_score``.
    interval : int
        The score is computed on epochs whose zero-based index is a multiple
        of ``interval``.

    Raises
    ------
    ValueError
        If ``validation_data`` is not a pair.
    """

    def __init__(self, validation_data=(), interval=1):
        # Zero-argument super() runs Callback.__init__; the previous
        # super(Callback, self) started the lookup *after* Callback and
        # therefore skipped it.
        super().__init__()

        if len(validation_data) != 2:
            raise ValueError("validation_data must be a (X_val, y_val) pair")

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        # `logs` is unused; None replaces the shared mutable default {}.
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: {:d} - score: {:.6f}".format(epoch+1, score))
            
def callback():
    """Build the list of auxiliary training callbacks.

    Returns
    -------
    list
        In this order: a ``ReduceLROnPlateau`` watching ``val_loss``, a
        ``CSVLogger`` writing ``log.csv``, and a ``RocAucEvaluation`` on the
        module-level ``x_valid`` / ``y_valid`` arrays.
    """
    # ReduceLROnPlateau is not among the callbacks imported at the top of the
    # notebook; import it here so the function does not raise NameError.
    from tensorflow.keras.callbacks import ReduceLROnPlateau

    reduce_lr_on_plateau = ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.3, patience=2,
        verbose=1, mode='auto',
        # `min_delta` is the current name of the deprecated `epsilon` argument.
        min_delta=0.0001, cooldown=1, min_lr=0.000001,
    )
    csv_logger = CSVLogger('log.csv')
    roc_auc = RocAucEvaluation(validation_data=(x_valid, y_valid), interval=1)

    return [reduce_lr_on_plateau, csv_logger, roc_auc]

def build_lrfn(lr_start=0.000001, lr_max=0.000002,
               lr_min=0.0000001, lr_rampup_epochs=7,
               lr_sustain_epochs=0, lr_exp_decay=.87):
    """Create an epoch -> learning-rate function with three phases.

    The peak rate is ``lr_max`` multiplied by the replica count of the
    module-level ``strategy``. For a zero-based ``epoch``:

    * ramp-up (``epoch < lr_rampup_epochs``): linear from ``lr_start``
      towards the peak;
    * sustain (the next ``lr_sustain_epochs`` epochs): constant at the peak;
    * decay (afterwards): exponential with base ``lr_exp_decay`` from the
      peak down towards ``lr_min``.

    Returns the schedule function, suitable for ``LearningRateScheduler``.
    """
    peak_lr = lr_max * strategy.num_replicas_in_sync
    decay_begins = lr_rampup_epochs + lr_sustain_epochs

    def lrfn(epoch):
        if epoch < lr_rampup_epochs:
            return (peak_lr - lr_start) / lr_rampup_epochs * epoch + lr_start
        if epoch < decay_begins:
            return peak_lr
        return (peak_lr - lr_min) * lr_exp_decay**(epoch - lr_rampup_epochs - lr_sustain_epochs) + lr_min

    return lrfn

In [13]:
%%time
#load model into TPU
# Load the pretrained encoder and build + compile the classifier inside the
# strategy scope, so the model is created under the distribution strategy.
# The loss is the focal loss with gamma=1.5 (alpha keeps its default).
with strategy.scope():
    transformer_layer = TFAutoModel.from_pretrained(MODEL)
    model = build_model(transformer_layer, loss=focal_loss(gamma=1.5), max_len=MAX_LEN)
model.summary()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=3271420488.0, style=ProgressStyle(descr…


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 200)]             0         
_________________________________________________________________
tf_roberta_model (TFRobertaM ((None, 200, 1024), (None 559890432 
_________________________________________________________________
tf_op_layer_strided_slice (T [(None, 1024)]            0         
_________________________________________________________________
dropout_74 (Dropout)         (None, 1024)              0         
_________________________________________________________________
dense (Dense)                (None, 1)                 1025      
Total params: 559,891,457
Trainable params: 559,891,457
Non-trainable params: 0
_________________________________________________________________
CPU times: user 1min 46s, sys: 33.2 s, total: 2min 19s
Wall time: 2min 58s


In [14]:
# File the checkpoint callback writes to; later cells reload weights from it
# when it exists.
model_path = '/kaggle/working/jigsawMultilingual.h5'
# NOTE(review): build_model() compiles the model with a single AUC metric, so
# no 'val_accuracy' value appears to be logged during fit(). If that is right,
# neither callback below ever fires (nothing is saved, training never stops
# early) -- confirm, and monitor the logged AUC name instead if intended.
checkpoint = ModelCheckpoint(model_path, monitor='val_accuracy', mode='max', save_best_only=True)
es = EarlyStopping(monitor='val_accuracy', mode='max', patience=2, 
                   restore_best_weights=True, verbose=1)
# Per-epoch learning-rate schedule built from build_lrfn()'s defaults.
lrfn = build_lrfn()
lr_callback = LearningRateScheduler(lrfn, verbose=1)

# Shared by both fit() calls below.
callback_list = [checkpoint, es, lr_callback]

In [15]:
%%time
# First stage: train on the combined training set, validate on valid_dataset.
# train_dataset repeats forever, so one epoch is defined as the number of full
# batches in x_train.
N_STEPS = x_train.shape[0] // BATCH_SIZE
EPOCHS = 4
train_history = model.fit(
    train_dataset,
    steps_per_epoch=N_STEPS,
    validation_data=valid_dataset,
    callbacks=callback_list,
    epochs=EPOCHS
)

Train for 4261 steps, validate for 63 steps

Epoch 00001: LearningRateScheduler reducing learning rate to 1e-06.
Epoch 1/4

Epoch 00002: LearningRateScheduler reducing learning rate to 3.142857142857143e-06.
Epoch 2/4

Epoch 00003: LearningRateScheduler reducing learning rate to 5.285714285714285e-06.
Epoch 3/4

Epoch 00004: LearningRateScheduler reducing learning rate to 7.4285714285714275e-06.
Epoch 4/4
CPU times: user 10min 24s, sys: 37.6 s, total: 11min 1s
Wall time: 2h 23min 35s


In [16]:
# Restore the checkpointed weights from the first stage, but only if the
# checkpoint callback actually wrote a file.
if os.path.exists(model_path):
    model.load_weights(model_path)

In [17]:
%%time
# Second stage: continue training on the validation set itself.
# valid_dataset is repeated, so one epoch is the number of full batches in x_valid.
n_steps = x_valid.shape[0] // BATCH_SIZE
train_history_2 = model.fit(
    valid_dataset.repeat(),
    steps_per_epoch=n_steps,
    # Only the learning-rate schedule applies here: this fit() has no
    # validation_data, so the checkpoint / early-stopping callbacks (which
    # monitor a validation metric) could never trigger and only emit warnings.
    callbacks=[lr_callback],
    epochs=EPOCHS
)

Train for 62 steps

Epoch 00001: LearningRateScheduler reducing learning rate to 1e-06.
Epoch 1/4

Epoch 00002: LearningRateScheduler reducing learning rate to 3.142857142857143e-06.
Epoch 2/4

Epoch 00003: LearningRateScheduler reducing learning rate to 5.285714285714285e-06.
Epoch 3/4

Epoch 00004: LearningRateScheduler reducing learning rate to 7.4285714285714275e-06.
Epoch 4/4
CPU times: user 37.3 s, sys: 1.37 s, total: 38.7 s
Wall time: 5min 36s


In [18]:
# The weights are deliberately NOT reloaded from model_path here: that file can
# only hold a checkpoint from the first training stage (the second fit() has no
# validation data to checkpoint on), so loading it would throw away the
# fine-tuning on the validation set that was just performed.

# Remove the CSV training log, if one was written, so it is not kept among the
# notebook outputs.
log_dir = "/kaggle/working/log.csv"
if os.path.exists(log_dir):
    os.remove(log_dir)

In [19]:
# The model has one sigmoid output unit, so predict() yields one column per
# row; flatten it to a 1-D vector before assigning it to the column instead of
# relying on pandas to coerce a 2-D array.
sub['toxic'] = model.predict(test_dataset, verbose=1).ravel()
sub.to_csv('submission.csv', index=False)

