In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

/kaggle/input/jigsaw-multilingual-toxic-comment-classification/test.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/sample_submission.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/test-processed-seqlen128.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/validation-processed-seqlen128.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train-processed-seqlen128.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/validation.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-unintended-bias-train.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-unintended-bias-train-processed-seqlen128.csv


In [2]:
import tensorflow as tf
print(tf.__version__)

2.1.0


In [3]:
import pandas as pd
import os

data_path = '/kaggle/input/jigsaw-multilingual-toxic-comment-classification/'

TEST_PATH = os.path.join(data_path, "test.csv")
VAL_PATH = os.path.join(data_path, "validation.csv")
TRAIN_PATH = os.path.join(data_path, "jigsaw-toxic-comment-train.csv")

val_data = pd.read_csv(VAL_PATH)
test_data = pd.read_csv(TEST_PATH)
train_data = pd.read_csv(TRAIN_PATH)

sub = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/sample_submission.csv')

In [4]:
test_data.head()

Unnamed: 0,id,content,lang
0,0,Doctor Who adlı viki başlığına 12. doctor olar...,tr
1,1,"Вполне возможно, но я пока не вижу необходимо...",ru
2,2,"Quindi tu sei uno di quelli conservativi , ...",it
3,3,Malesef gerçekleştirilmedi ancak şöyle bir şey...,tr
4,4,:Resim:Seldabagcan.jpg resminde kaynak sorunu ...,tr


In [5]:
#Clean Text
import re

def clean(text):
    text = text.fillna("fillna").str.lower()
    #replace newline characters with space
    text = text.map(lambda x: re.sub('\\n',' ', str(x)))
    text = text.map(lambda x: re.sub('\[\[User.*', '', str(x)))
    text = text.map(lambda x: re.sub("\(http://.*?\s\(http://.*\)",'',str(x)))
    return text

val_data["comment_text"] = clean(val_data["comment_text"])
test_data["content"] = clean(test_data["content"])
train_data["comment_text"] = clean(train_data["comment_text"])                    

In [6]:
# Load DistilBERT tokenizer
import transformers

tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=995526.0, style=ProgressStyle(descripti…




In [7]:
import numpy
import tqdm

def create_bert_input(tokenizer, docs, max_seq_len):
    all_input_ids, all_mask_ids = [], []
    for doc in tqdm.tqdm(docs, desc="Converting docs to features"):
        tokens = tokenizer.tokenize(doc)
        if len(tokens) > max_seq_len - 2:
            tokens = tokens[0: (max_seq_len-2)]
        tokens = ['[CLS]']+tokens+['[SEP]']
        ids = tokenizer.convert_tokens_to_ids(tokens)
        masks = [1]*len(ids)
        while len(ids) < max_seq_len:
            ids.append(0)
            masks.append(0)
        all_input_ids.append(ids)
        all_mask_ids.append(masks)
    
    encoded = np.array([all_input_ids, all_mask_ids])
    return encoded
            

In [8]:
train_comments = train_data.comment_text.astype(str).values
val_comments = val_data.comment_text.astype(str).values
test_comments = test_data.content.astype(str).values

y_valid = val_data.toxic.values
y_train = train_data.toxic.values

In [9]:
import gc
gc.collect()

0

In [24]:
#Encode the comments in train_set
MAX_SEQ_LENGTH = 500

train_feature_ids, train_feature_masks = create_bert_input(tokenizer, train_comments, max_seq_len=MAX_SEQ_LENGTH)

val_feature_ids, val_feature_masks = create_bert_input(tokenizer, val_comments, max_seq_len=MAX_SEQ_LENGTH)

test_feature_ids, test_feature_masks = create_bert_input(tokenizer, test_comments, max_seq_len=MAX_SEQ_LENGTH)


Converting docs to features: 100%|██████████| 223549/223549 [11:06<00:00, 335.44it/s]
Converting docs to features: 100%|██████████| 8000/8000 [00:24<00:00, 332.17it/s]
Converting docs to features: 100%|██████████| 63812/63812 [03:17<00:00, 323.40it/s]


In [11]:
# Verify the shapes
print(train_feature_ids.shape, train_feature_masks.shape, y_train.shape)
print(val_feature_ids.shape, val_feature_masks.shape, y_valid.shape)

(223549, 500) (223549, 500) (223549,)
(8000, 500) (8000, 500) (8000,)


In [12]:
#Configure TPU
from kaggle_datasets import KaggleDatasets

tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
strategy = tf.distribute.experimental.TPUStrategy(tpu)

GCS_DS_PATH = KaggleDatasets().get_gcs_path('jigsaw-multilingual-toxic-comment-classification')

EPOCHS = 1
BATCH_SIZE = 32 * strategy.num_replicas_in_sync

In [13]:
# Create TensorFlow datasets for better performance
train_ds = (
    tf.data.Dataset
    .from_tensor_slices(((train_feature_ids, train_feature_masks), y_train))
    .repeat()
    .shuffle(2048)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)
    
valid_ds = (
    tf.data.Dataset
    .from_tensor_slices(((val_feature_ids, val_feature_masks), y_valid))
    .repeat()
    .batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

test_ds = (
    tf.data.Dataset
    .from_tensor_slices((test_feature_ids, test_feature_masks))
    .repeat()
    .batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

In [14]:
#Create training ready model
def get_training_model():
    inp_id = tf.keras.layers.Input(shape=(MAX_SEQ_LENGTH,), dtype=tf.int64, name="bert_input_ids")
    inp_mask = tf.keras.layers.Input(shape=(MAX_SEQ_LENGTH,), dtype=tf.int64, name="bert_input_masks")
    inputs = [inp_id, inp_mask]
    
    hidden_state = transformers.TFDistilBertModel.from_pretrained('distilbert-base-multilingual-cased')(inputs)[0]
    pooled_output = hidden_state[:, 0]
    dense1 = tf.keras.layers.Dense(128, activation='relu')(pooled_output)
    output = tf.keras.layers.Dense(1, activation='sigmoid')(dense1)
    model = tf.keras.Model(inputs=inputs, outputs=output)
    model.compile(optimizer=tf.optimizers.Adam(learning_rate=2e-5, 
                                            epsilon=1e-08), 
                loss='binary_crossentropy', metrics=['accuracy'])
    return model
    
    

In [15]:
# Authorize wandb
import wandb

wandb.login()
from wandb.keras import WandbCallback

# Initialize wandb
wandb.init(project="jigsaw-toxic", id="distilbert-tpu-kaggle-weighted")

[34m[1mwandb[0m: [32m[41mERROR[0m Not authenticated.  Copy a key from https://app.wandb.ai/authorize


API Key: ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


[34m[1mwandb[0m: Wandb version 0.8.33 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


W&B Run: https://app.wandb.ai/ahmeriq09/jigsaw-toxic/runs/distilbert-tpu-kaggle-weighted

In [16]:
# Create 32 random indices from the English only test comments
RANDOM_INDICES = np.random.choice(test_comments.shape[0], 32)

In [17]:
!pip install -q googletrans

In [18]:
# Demo examples of translations
from googletrans import Translator

sample_comment = test_comments[48649]
print("Original comment:", sample_comment)
translated_comment = Translator().translate(sample_comment)
print("\n")
print("Translated comment:", translated_comment.text)

Original comment:  ¡ah! sí, ya lo sé... pero como que no puedo sacarme ciertos argentinismos de encima a la hora de escribir. —   kved    (discusión)    pd: aunque no sé si lo correcto no es escribir  bloqueé  en lugar de  bloquee . para solucionar ese tema, es más fácil decir   bloquié   y que la rae se vaya a tomar por culo.   ;)


Translated comment: Ah! Yes, I know ... but I can not get me out certain argentinismos off when writing. - kved (discussion) pd: I do not know if right not write blocked instead of blocking. to solve this issue, it is easier to say rae bloquié and that is to take the ass. ;)


In [19]:
# Create a sample prediction logger
# A custom callback to view predictions on the above samples in real-time
class TextLogger(tf.keras.callbacks.Callback):
    def __init__(self):
        super(TextLogger, self).__init__()

    def on_epoch_end(self, logs, epoch):
        samples = []
        for index in RANDOM_INDICES:
            # Grab the comment and translate it
            comment = test_comments[index]
            translated_comment = Translator().translate(comment).text
            # Create BERT features
            comment_feature_ids, comment_features_masks = create_bert_input(tokenizer,  
                                    comment, max_seq_len=MAX_SEQ_LENGTH)
            # Employ the model to get the prediction and parse it
            predicted_label = self.model.predict([comment_feature_ids, comment_features_masks])
            predicted_label = np.argmax(predicted_label[0])
            if predicted_label==0: predicted_label="Non-Toxic"
            else: predicted_label="Toxic"
            
            sample = [comment, translated_comment, predicted_label]
            
            samples.append(sample)
        wandb.log({"text": wandb.Table(data=samples, 
                                       columns=["Comment", "Translated Comment", "Predicted Label"])})

In [20]:
# Garbage collection
gc.collect()

68

In [21]:
# Account for the class imbalance
from sklearn.utils import class_weight

class_weights = class_weight.compute_class_weight('balanced',
                                                 np.unique(y_train),
                                                 y_train)
class_weights

array([0.55288749, 5.22701553])

In [22]:
# Train the model
import time

start = time.time()

# Compile the model with TPU Strategy
with strategy.scope():
    model = get_training_model()
    
model.fit(train_ds, 
          steps_per_epoch=train_data.shape[0] // BATCH_SIZE,
          validation_data=valid_ds,
          validation_steps=val_data.shape[0] // BATCH_SIZE,
          epochs=EPOCHS,
          class_weight=class_weights,
          callbacks=[WandbCallback(), TextLogger()],
          verbose=1)
end = time.time() - start
print("Time taken ",end)
wandb.log({"training_time":end})

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=910749124.0, style=ProgressStyle(descri…




[34m[1mwandb[0m: Wandb version 0.8.33 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Train for 873 steps, validate for 31 steps

[34m[1mwandb[0m: [32m[41mERROR[0m Can't save model, h5py returned error: 
Converting docs to features: 100%|██████████| 814/814 [00:00<00:00, 3581.99it/s]
Converting docs to features: 100%|██████████| 515/515 [00:00<00:00, 3549.53it/s]
Converting docs to features: 100%|██████████| 810/810 [00:00<00:00, 3543.38it/s]
Converting docs to features: 100%|██████████| 147/147 [00:00<00:00, 2976.28it/s]
Converting docs to features: 100%|██████████| 760/760 [00:00<00:00, 3609.58it/s]
Converting docs to features: 100%|██████████| 531/531 [00:00<00:00, 3465.18it/s]
Converting docs to features: 100%|██████████| 850/850 [00:00<00:00, 3421.70it/s]
Converting docs to features: 100%|██████████| 307/307 [00:00<00:00, 3538.12it/s]
Converting docs to features: 100%|██████████| 1094/1094 [00:00<00:00, 3538.25it/s]
Converting docs to features: 100%|██████████| 678/678 [00:00<00:00, 3459.82it/s]
Converting docs to features: 100%|██████████| 142/142 [00:00<00:00, 3534.46it/s]
Converting docs to features

Time taken  632.8558239936829


In [27]:
#sub['toxic'] = model.predict(test_ds, verbose=1)
sub['toxic'] = model.predict([test_feature_ids, test_feature_masks], verbose=1)
sub.to_csv('submission.csv', index=False)

ValueError: Error when checking input: expected bert_input_ids to have shape (500,) but got array with shape (512,)