In [None]:
# ---- Run configuration: everything a reader might want to tune ----
MAX_LEN = 192  # token length passed to the tokenizer and used as the model input width
DROPOUT = 0.5   # dropout rate applied before the classification head (aggressive on purpose)
BATCH_SIZE = 16 # per TPU core; multiplied by the replica count to get the global batch size
# Stage 1: number of training steps and how often to validate.
TOTAL_STEPS_STAGE1 = 2000
VALIDATE_EVERY_STAGE1 = 200

# Stage 2: a much shorter run with much more frequent validation.
TOTAL_STEPS_STAGE2 = 200
VALIDATE_EVERY_STAGE2 = 10

### Different learning rate for transformer and head ###
LR_TRANSFORMER = 5e-6
LR_HEAD = 1e-3

# Tokenizer identifier and directory of the pretrained weights loaded for the model.
PRETRAINED_TOKENIZER=  'jplu/tf-xlm-roberta-large'
PRETRAINED_MODEL = '/kaggle/input/jigsaw-mlm-finetuned-xlm-r-large'
# D: directory holding validation.csv, test.csv and sample_submission.csv.
D = '/kaggle/input/jigsaw-multilingual-toxic-comment-classification/'
# D_TRANS: directory holding the per-language translated training CSVs.
D_TRANS = '/kaggle/input/jigsaw-train-multilingual-coments-google-api/'


import os
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import tensorflow as tf
print(tf.__version__)
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
import transformers
from transformers import TFRobertaModel, AutoTokenizer, AutoConfig
import logging
import random
# No extensive logging: only let errors (and worse) through the root logger.
# logging.NOTSET would do the opposite -- on the root logger it means
# "process every record", which makes the libraries as chatty as possible.
logging.getLogger().setLevel(logging.ERROR)

AUTO = tf.data.experimental.AUTOTUNE

2.2.0




In [None]:
def connect_to_TPU():
    """Pick a distribution strategy for whatever hardware is attached.

    Returns:
        A ``(tpu, strategy, global_batch_size)`` tuple: the TPU cluster
        resolver (``None`` when resolving one raised ``ValueError``), the
        matching ``tf.distribute`` strategy, and ``BATCH_SIZE`` multiplied by
        the number of replicas the strategy keeps in sync.
    """
    try:
        # TPU detection. No parameters necessary if TPU_NAME environment variable is
        # set: this is always the case on Kaggle.
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
        print('Running on TPU ', resolver.master())
    except ValueError:
        resolver = None

    if not resolver:
        # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
        replica_strategy = tf.distribute.get_strategy()
    else:
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        replica_strategy = tf.distribute.experimental.TPUStrategy(resolver)

    return (resolver,
            replica_strategy,
            BATCH_SIZE * replica_strategy.num_replicas_in_sync)


# Module-level handles used by the dataset, model and training code below.
tpu, strategy, global_batch_size = connect_to_TPU()
print("REPLICAS: ", strategy.num_replicas_in_sync)

Running on TPU  grpc://10.0.0.2:8470
REPLICAS:  8


In [None]:
def load_jigsaw_trans(langs=('tr', 'it', 'es', 'ru', 'fr', 'pt'),
                      columns=('comment_text', 'toxic')):
    """Load the class-balanced training frames.

    Reads one translated training CSV per language code from ``D_TRANS`` and
    the unintended-bias training CSV from ``D``; each frame is balanced with
    ``downsample`` before being collected.

    Args:
        langs: language codes substituted into the translated-CSV file name.
        columns: columns kept from each translated CSV (must include
            ``toxic``, which ``downsample`` filters on).

    Returns:
        A list of DataFrames: one per language, followed by the balanced
        unintended-bias frame (always ``comment_text`` and ``toxic``).
    """
    # pandas treats a tuple as a single key, so always index with a list.
    columns = list(columns)

    frames = []
    for lang in langs:
        fn = D_TRANS + 'jigsaw-toxic-comment-train-google-%s-cleaned.csv' % lang
        frames.append(downsample(pd.read_csv(fn)[columns]))

    # Only two columns of the unintended-bias file are used, so only those are
    # parsed. (The plain jigsaw-toxic-comment-train.csv used to be read here
    # too, but it was deleted again without ever being used.)
    bias_columns = ['comment_text', 'toxic']
    bias = pd.read_csv(D + 'jigsaw-unintended-bias-train.csv',
                       usecols=bias_columns)[bias_columns]
    # `toxic` is rounded to a hard 0/1 label before balancing.
    bias['toxic'] = bias['toxic'].round().astype(int)
    frames.append(downsample(bias))

    return frames

def downsample(df, random_state=None):
    """Balance ``df`` by keeping all toxic rows and sampling non-toxic ones.

    Args:
        df: DataFrame with a ``toxic`` column holding 0/1 labels.
        random_state: optional seed forwarded to ``DataFrame.sample`` so the
            draw can be reproduced; ``None`` keeps the unseeded behaviour.

    Returns:
        A DataFrame with every ``toxic == 1`` row followed by a random sample
        of ``toxic == 0`` rows. The sample size is the sum of the ``toxic``
        column, capped at the number of non-toxic rows available so that a
        frame with more toxic than non-toxic rows no longer raises.
    """
    toxic_rows = df.query('toxic==1')
    clean_rows = df.query('toxic==0')
    # Vectorised sum instead of Python's builtin sum over the Series.
    n_keep = min(int(df['toxic'].sum()), len(clean_rows))
    return pd.concat([
        toxic_rows,
        clean_rows.sample(n=n_keep, random_state=random_state),
    ])
    

# Stack the balanced training frames into one DataFrame (row indices are not reset).
train_df = pd.concat(load_jigsaw_trans()) 
# Validation, test and submission-template frames all live under D.
val_df = pd.read_csv(D+'validation.csv')
test_df = pd.read_csv(D+'test.csv')
sub_df = pd.read_csv(D+'sample_submission.csv')

In [None]:
%%time

def regular_encode(texts, tokenizer, maxlen=512):
    """Tokenize ``texts`` and return their input ids as a NumPy array.

    Args:
        texts: the texts handed to ``tokenizer.batch_encode_plus``.
        tokenizer: object exposing ``batch_encode_plus``.
        maxlen: value passed as ``max_length``; padding to that length is
            requested from the tokenizer.

    Returns:
        ``np.array`` built from the ``'input_ids'`` entry of the encoding.
    """
    # Only the ids are needed downstream, so masks and type ids are switched off.
    encode_options = dict(
        return_attention_masks=False,
        return_token_type_ids=False,
        pad_to_max_length=True,
        max_length=maxlen,
    )
    encoded = tokenizer.batch_encode_plus(texts, **encode_options)
    input_ids = encoded['input_ids']
    return np.array(input_ids)
    

# Encode every split with the same tokenizer and the same MAX_LEN.
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_TOKENIZER)
X_train = regular_encode(train_df.comment_text.values, tokenizer, maxlen=MAX_LEN)
X_val = regular_encode(val_df.comment_text.values, tokenizer, maxlen=MAX_LEN)
# Note: the test frame keeps its text in `content`, not `comment_text`.
X_test = regular_encode(test_df.content.values, tokenizer, maxlen=MAX_LEN)

# Labels as column vectors, i.e. shape (n, 1).
y_train = train_df.toxic.values.reshape(-1,1)
y_val = val_df.toxic.values.reshape(-1,1)

CPU times: user 6min 39s, sys: 1.87 s, total: 6min 41s
Wall time: 6min 42s


In [None]:
def create_dist_dataset(X, y=None, training=False):
    """Wrap arrays in a batched, prefetched dataset distributed over ``strategy``.

    Args:
        X: features to slice along the first axis.
        y: optional labels; when given, elements are ``(features, labels)``
            pairs, otherwise they are features only.
        training: when true the data is shuffled (buffer of ``len(X)``) and
            repeated indefinitely.

    Returns:
        The result of ``strategy.experimental_distribute_dataset`` on the
        dataset, batched by ``global_batch_size``.
    """
    features = tf.data.Dataset.from_tensor_slices(X)

    ### Pair features with labels when labels are supplied ###
    if y is None:
        pipeline = features
    else:
        labels = tf.data.Dataset.from_tensor_slices(y)
        pipeline = tf.data.Dataset.zip((features, labels))

    ### Training data is shuffled and never runs out ###
    if training:
        pipeline = pipeline.shuffle(len(X))
        pipeline = pipeline.repeat()

    pipeline = pipeline.batch(global_batch_size)
    pipeline = pipeline.prefetch(AUTO)

    ### Hand the pipeline to the distribution strategy ###
    return strategy.experimental_distribute_dataset(pipeline)
    
    
# Training data carries labels and is shuffled/repeated; the validation and
# test datasets are built from features only (no labels).
train_dist_dataset = create_dist_dataset(X_train, y_train, True)
val_dist_dataset   = create_dist_dataset(X_val)
test_dist_dataset  = create_dist_dataset(X_test)

In [None]:
%%time

def create_model_and_optimizer():
    """Create the model and its two Adam optimizers inside ``strategy.scope()``.

    Returns:
        ``(model, optimizer_transformer, optimizer_head)`` where the first
        optimizer uses ``LR_TRANSFORMER`` and the second ``LR_HEAD``.
    """
    with strategy.scope():
        # Load the config first so hidden states can be switched on before
        # the pretrained weights are instantiated.
        cfg = AutoConfig.from_pretrained(PRETRAINED_MODEL)
        cfg.output_hidden_states = True
        backbone = TFRobertaModel.from_pretrained(PRETRAINED_MODEL, config=cfg)

        net = build_model(backbone)

        backbone_optimizer = Adam(learning_rate=LR_TRANSFORMER)
        head_optimizer = Adam(learning_rate=LR_HEAD)

    return net, backbone_optimizer, head_optimizer


def build_model(transformer):
    """Build the classifier: transformer features -> dropout -> sigmoid unit.

    The head sees three 1024-wide vectors concatenated along the feature
    axis: the pooler output and the first-position vectors of the last two
    entries of ``hidden_states``.

    Args:
        transformer: model called on the token ids; it must return
            ``(sequence_output, pooler_output, hidden_states)``, i.e. be
            configured with ``output_hidden_states=True``.

    Returns:
        A Keras ``Model`` mapping ``(MAX_LEN,)`` int32 token ids to one
        sigmoid probability.
    """
    inp = Input(shape=(MAX_LEN,), dtype=tf.int32, name="input_word_ids")
    # Huggingface transformers have multiple outputs; the sequence output
    # (first element) is not used here.
    _, pooler_output, hidden_states = transformer(inp)
    pooler = tf.reshape(pooler_output, (-1, 1, 1024))
    # First-position vector of the last and of the second-to-last hidden state.
    # Both slices used to index hidden_states[-1], which fed the very same
    # tensor into the concatenation twice.
    last_cls = tf.reshape(hidden_states[-1][:, 0], (-1, 1, 1024))
    prev_cls = tf.reshape(hidden_states[-2][:, 0], (-1, 1, 1024))
    concat_hidden = tf.keras.layers.Concatenate(axis=2)([pooler, last_cls, prev_cls])
    # The pooled axis has length 1, so this only drops it: (-1, 1, 3072) -> (-1, 3072).
    x = tf.keras.layers.GlobalAveragePooling1D()(concat_hidden)
    x = Dropout(DROPOUT)(x)
    ### note, adding the name to later identify these weights for different LR
    out = Dense(1, activation='sigmoid', name='custom_head')(x)
    model = Model(inputs=[inp], outputs=[out])

    return model


# Module-level model and optimizers; the training and prediction steps read these globals.
model, optimizer_transformer, optimizer_head = create_model_and_optimizer()
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 192)]        0                                            
__________________________________________________________________________________________________
tf_roberta_model (TFRobertaMode ((None, 192, 1024),  559890432   input_word_ids[0][0]             
__________________________________________________________________________________________________
tf_op_layer_strided_slice (Tens [(None, 1024)]       0           tf_roberta_model[0][26]          
__________________________________________________________________________________________________
tf_op_layer_strided_slice_1 (Te [(None, 1024)]       0           tf_roberta_model[0][26]          
______________________________________________________________________________________________

In [None]:
def define_losses_and_metrics():
    """Create the loss function and the training metric under ``strategy.scope()``.

    Returns:
        ``(compute_loss, train_metric)``: ``compute_loss(labels, predictions)``
        is the un-reduced binary cross-entropy passed through
        ``tf.nn.compute_average_loss`` with ``global_batch_size``;
        ``train_metric`` is a Keras ``AUC`` metric named ``'training_AUC'``.
    """
    with strategy.scope():
        # Reduction is disabled here because the averaging is done explicitly below.
        bce = tf.keras.losses.BinaryCrossentropy(
            from_logits=False, reduction=tf.keras.losses.Reduction.NONE)

        def compute_loss(labels, predictions):
            return tf.nn.compute_average_loss(
                bce(labels, predictions),
                global_batch_size=global_batch_size)

        auc_metric = tf.keras.metrics.AUC(name='training_AUC')

    return compute_loss, auc_metric


def train(train_dist_dataset, val_dist_dataset=None, y_val=None,
          total_steps=2000, validate_every=200):
    """Run the custom training loop and keep the best-validating weights.

    Every ``validate_every`` steps the running training AUC is printed and
    reset. If a validation dataset is given, the exact ROC AUC of the model's
    predictions against ``y_val`` is printed too, and the model weights are
    snapshotted whenever that score ties or beats the best so far.

    Args:
        train_dist_dataset: distributed dataset yielding training batches.
        val_dist_dataset: optional distributed dataset passed to ``predict``.
        y_val: labels matching ``val_dist_dataset``; needed when it is given.
        total_steps: the loop stops once this many steps have run.
        validate_every: reporting / validation period, in steps.

    Returns:
        None. On exit the model holds the best snapshot when one was taken;
        otherwise its weights are left as training ended.
    """
    best_weights, history = None, []
    step = 0
    ### Training loop ###
    for tensor in train_dist_dataset:
        distributed_train_step(tensor)
        step += 1

        if step % validate_every == 0:
            ### Print train metrics ###
            train_metric = train_accuracy_metric.result().numpy()
            print("Step %d, train AUC: %.5f" % (step, train_metric))

            ### Test loop with exact AUC ###
            # Explicit None check: do not rely on the truthiness of a dataset object.
            if val_dist_dataset is not None:
                val_metric = roc_auc_score(y_val, predict(val_dist_dataset))
                print("Step %d,   val AUC: %.5f" % (step, val_metric))

                # save weights if it is the best yet
                history.append(val_metric)
                if val_metric == max(history):
                    best_weights = model.get_weights()

            ### Reset (train) metrics ###
            train_accuracy_metric.reset_states()

        # `>=` rather than `==` so a non-positive total_steps still terminates.
        if step >= total_steps:
            break

    ### Restore best weights ###
    # No snapshot exists when validation was disabled or never reached;
    # set_weights(None) would fail, so keep the final weights in that case.
    if best_weights is not None:
        model.set_weights(best_weights)



@tf.function
def distributed_train_step(data):
    """Run ``train_step`` on every replica of ``strategy`` for one batch."""
    # `experimental_run_v2` is the deprecated name of `Strategy.run`
    # (renamed in TF 2.2). Prefer the current name and fall back to the old
    # one only where `run` does not exist.
    run_on_replicas = getattr(strategy, 'run', None) or strategy.experimental_run_v2
    run_on_replicas(train_step, args=(data,))

def train_step(inputs):
    """One optimisation step: separate optimizers for transformer and head.

    Args:
        inputs: a ``(features, labels)`` pair.

    Variables whose name contains ``'custom'`` are updated by
    ``optimizer_head``; all others except those containing ``'pooler'`` are
    updated by ``optimizer_transformer``. ``'pooler'`` variables are not
    updated at all. The training metric is updated with this batch.
    """
    features, labels = inputs

    ### split the trainable variables into transformer and head groups
    # pooler variables are left out of both groups (originally: "get rid of
    # pooler head with None gradients")
    backbone_vars, head_vars = [], []
    for var in model.trainable_variables:
        if 'custom' in var.name:
            head_vars.append(var)
        elif 'pooler' not in var.name:
            backbone_vars.append(var)

    # the tape is persistent because gradient() is called twice on it;
    # it is released explicitly afterwards
    with tf.GradientTape(persistent=True) as tape:
        predictions = model(features, training=True)
        loss = compute_loss(labels, predictions)
    backbone_grads = tape.gradient(loss, backbone_vars)
    head_grads = tape.gradient(loss, head_vars)
    del tape

    ### one update per optimizer
    optimizer_transformer.apply_gradients(zip(backbone_grads, backbone_vars))
    optimizer_head.apply_gradients(zip(head_grads, head_vars))

    train_accuracy_metric.update_state(labels, predictions)



def predict(dataset):
    """Collect model predictions for every batch of ``dataset``.

    Each batch yields one array per replica; those are stacked row-wise per
    batch, and the per-batch results are stacked row-wise again.

    Returns:
        A single NumPy array with all predictions, in iteration order.
    """
    ### stack replicas within a batch, then stack the batches
    per_batch = [np.vstack(distributed_prediction_step(batch))
                 for batch in dataset]
    return np.vstack(per_batch)

@tf.function
def distributed_prediction_step(data):
    """Run ``prediction_step`` on every replica and return the local results."""
    # `experimental_run_v2` is the deprecated name of `Strategy.run`
    # (renamed in TF 2.2). Prefer the current name and fall back to the old
    # one only where `run` does not exist.
    run_on_replicas = getattr(strategy, 'run', None) or strategy.experimental_run_v2
    predictions = run_on_replicas(prediction_step, args=(data,))
    return strategy.experimental_local_results(predictions)

def prediction_step(inputs):
    """Forward pass in inference mode.

    ``inputs`` is passed to the model as-is: datasets used for prediction
    carry features only, no labels.
    """
    return model(inputs, training=False)


# Module-level loss function and training AUC metric used by train_step / train.
compute_loss, train_accuracy_metric = define_losses_and_metrics()

In [None]:
%%time
# Stage 1: train on the balanced training set, validating on the validation set.
train(train_dist_dataset, val_dist_dataset, y_val,
      TOTAL_STEPS_STAGE1, VALIDATE_EVERY_STAGE1)

  num_elements)


Step 200, train AUC: 0.84823
Step 200,   val AUC: 0.93511
Step 400, train AUC: 0.95363
Step 400,   val AUC: 0.93679
Step 600, train AUC: 0.95802
Step 600,   val AUC: 0.94154
Step 800, train AUC: 0.96180
Step 800,   val AUC: 0.94209
Step 1000, train AUC: 0.96359
Step 1000,   val AUC: 0.94165
Step 1200, train AUC: 0.96421
Step 1200,   val AUC: 0.94294
Step 1400, train AUC: 0.96633
Step 1400,   val AUC: 0.94309
Step 1600, train AUC: 0.96609
Step 1600,   val AUC: 0.94547
Step 1800, train AUC: 0.96788
Step 1800,   val AUC: 0.94201
Step 2000, train AUC: 0.96673
Step 2000,   val AUC: 0.94264
CPU times: user 3min 25s, sys: 56 s, total: 4min 21s
Wall time: 17min 48s


In [None]:
%%time

# Stage 2: fine-tune on the validation data with a lower head learning rate.
optimizer_head.learning_rate.assign(5e-5)

# Split the validation data: 90% becomes stage-2 training data, 10% stays held out.
# NOTE: this rebinds X_train / X_val / y_train / y_val, so re-running this cell
# splits the already held-out 10% again.
X_train, X_val, y_train, y_val = train_test_split(X_val, y_val, test_size = 0.1)

# Build the datasets. The validation dataset must NOT carry labels:
# predict() passes every dataset element straight to the model as features,
# and the labels are handed to train() separately as y_val.
train_dist_dataset = create_dist_dataset(X_train, y_train, training=True)
val_dist_dataset = create_dist_dataset(X_val)

# train again, validating on the held-out 10% every VALIDATE_EVERY_STAGE2 steps
train(train_dist_dataset, val_dist_dataset, y_val,
      total_steps = TOTAL_STEPS_STAGE2, 
      validate_every = VALIDATE_EVERY_STAGE2)

Step 10, train AUC: 0.90205
Step 10,   val AUC: 0.94612
Step 20, train AUC: 0.93556
Step 20,   val AUC: 0.95038
Step 30, train AUC: 0.93457
Step 30,   val AUC: 0.95192
Step 40, train AUC: 0.94269
Step 40,   val AUC: 0.95195
Step 50, train AUC: 0.94830
Step 50,   val AUC: 0.95394
Step 60, train AUC: 0.94289
Step 60,   val AUC: 0.95425
Step 70, train AUC: 0.95567
Step 70,   val AUC: 0.95519
Step 80, train AUC: 0.95110
Step 80,   val AUC: 0.95384
Step 90, train AUC: 0.94708
Step 90,   val AUC: 0.95588
Step 100, train AUC: 0.96031
Step 100,   val AUC: 0.95654
Step 110, train AUC: 0.95329
Step 110,   val AUC: 0.95485
Step 120, train AUC: 0.96566
Step 120,   val AUC: 0.95479
Step 130, train AUC: 0.97294
Step 130,   val AUC: 0.95384
Step 140, train AUC: 0.95738
Step 140,   val AUC: 0.95486
Step 150, train AUC: 0.96516
Step 150,   val AUC: 0.95561
Step 160, train AUC: 0.96447
Step 160,   val AUC: 0.95398
Step 170, train AUC: 0.95656
Step 170,   val AUC: 0.95372
Step 180, train AUC: 0.97225
Ste

In [None]:
%%time
# predict() returns one column of probabilities; take it as a flat vector
# and write the submission file.
sub_df['toxic'] = predict(test_dist_dataset)[:,0]
sub_df.to_csv('submission.csv', index=False)

CPU times: user 21.9 s, sys: 4.12 s, total: 26.1 s
Wall time: 1min 28s
