# Model<a id="1"></a>

In [1]:
import matplotlib.pyplot as plt
# The bare 'seaborn' style name was deprecated in Matplotlib 3.6 in favour of
# 'seaborn-v0_8'. Prefer the new name when this Matplotlib ships it and fall
# back to the old one on earlier releases.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
# Turn the axes grid that the seaborn style enables back off.
plt.rcParams['axes.grid'] = False
%matplotlib inline

In [2]:
import os

import numpy as np
import pandas as pd
try:
    from google.colab import drive
    drive.mount('/content/drive')
    %tensorflow_version 2.x
    IS_COLAB = True
except:
    IS_COLAB = False
    pass
import tensorflow as tf
from tensorflow.keras.layers import (Dense, Input, Embedding, 
                                     GlobalAveragePooling1D,
                                     Dropout)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import (ModelCheckpoint, ReduceLROnPlateau, 
                                        CSVLogger, Callback)
import tensorflow.keras.backend as K
# Compare the major version numerically: comparing version strings is
# lexicographic, so e.g. '10.0' >= '2.0' would evaluate to False.
assert int(tf.__version__.split('.')[0]) >= 2, "This notebook requires TensorFlow 2.x"
import warnings
warnings.filterwarnings("ignore")

In [3]:
import transformers as ppb
from tokenizers import BertWordPieceTokenizer
from transformers import TFAutoModel, AutoTokenizer
from tqdm.notebook import tqdm
# Enable tqdm's pandas integration.
# NOTE(review): no progress_* call is visible in this notebook -- presumably
# it is used inside the helpers module or is a leftover; confirm.
tqdm.pandas()

In [4]:
# Folder holding the competition CSVs: the mounted Drive copy when running on
# Colab, otherwise a path relative to this notebook.
if IS_COLAB:
    DATA_FOLDER = "/content/drive/My Drive/course/DATA 2040/DATA2040_FinalProject/data"
else:
    DATA_FOLDER = "../../data"

In [5]:
import sys
# Folder containing helpers.py: on Drive when running on Colab, otherwise
# relative to this notebook. It is added to sys.path so that the
# `from helpers import ...` line that follows can resolve.
if IS_COLAB:
    HELPER_PATH = "/content/drive/My Drive/course/DATA 2040/DATA2040_FinalProject/utils"
else:
    HELPER_PATH = "../utils"
sys.path.append(HELPER_PATH)
from helpers import fast_encode, regular_encode, roc_auc, RocAucEvaluation

In [6]:
# Sanity check: show which files are present in the data folder.
os.listdir(DATA_FOLDER)

['jigsaw-toxic-comment-train-google-pt-cleaned.csv',
 'validation.csv',
 'jigsaw-toxic-comment-train-google-it-cleaned.csv',
 'jigsaw-toxic-comment-train-google-ru-cleaned.csv',
 'jigsaw-toxic-comment-train-google-ru.csv',
 'jigsaw-unintended-bias-train.csv',
 'jigsaw-toxic-comment-train-google-es.csv',
 'jigsaw-toxic-comment-train-google-fr-cleaned.csv',
 'jigsaw-toxic-comment-train-google-es-cleaned.csv',
 'validation-processed-seqlen128.csv',
 'jigsaw-train-multilingual-coments-google-api.zip',
 'jigsaw-toxic-comment-train-processed-seqlen128.csv',
 'jigsaw-toxic-comment-train-google-fr.csv',
 'jigsaw-toxic-comment-train-google-it.csv',
 'jigsaw-unintended-bias-train-processed-seqlen128.csv',
 'jigsaw-toxic-comment-train-google-tr-cleaned.csv',
 'jigsaw-toxic-comment-train-google-tr.csv',
 'jigsaw-toxic-comment-train-google-pt.csv',
 'test.csv',
 'test-processed-seqlen128.csv',
 'jigsaw-toxic-comment-train.csv',
 'sample_submission.csv',
 'submissions']

## Load data

In [7]:
# Read the training, validation and test splits plus the sample submission
# file, all from DATA_FOLDER.
train = pd.read_csv(f"{DATA_FOLDER}/jigsaw-toxic-comment-train.csv")
valid = pd.read_csv(f"{DATA_FOLDER}/validation.csv")
test = pd.read_csv(f"{DATA_FOLDER}/test.csv")
sub = pd.read_csv(f"{DATA_FOLDER}/sample_submission.csv")

In [8]:
# We drop the subtypes of toxic comments because only the binary `toxic`
# label is used below. Re-assigning with errors='ignore' (instead of an
# inplace drop) keeps this cell safe to re-run: a second run is a no-op
# rather than a KeyError on the already-removed columns.
train = train.drop(
    columns=['severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
    errors='ignore',
)

In [9]:
## Use a small data size for demonstration
# .iloc slices by position and excludes the end point, so this keeps exactly
# the first 20000 rows. The previous label-based .loc[:20000] was inclusive
# (20001 rows) and depended on the frame still having its default index.
train = train.iloc[:20000]

## Define the tokenizer and transformer

In [10]:
# For DistilBERT: the TF model class, the matching tokenizer class, and the
# name of the pretrained checkpoint both are loaded from.
model_class = ppb.TFDistilBertModel
tokenizer_class = ppb.DistilBertTokenizer
pretrained_weights = 'distilbert-base-uncased'

In [11]:
# Load the pretrained tokenizer and write its files to a local folder so the
# vocabulary file can be read from disk afterwards.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
save_path = './tokenizer/distilbert_base_uncased'
# exist_ok=True creates the folder only when it is missing, replacing the
# separate exists() check and the window between check and creation.
os.makedirs(save_path, exist_ok=True)
tokenizer.save_pretrained(save_path)

('./tokenizer/distilbert_base_uncased/vocab.txt',
 './tokenizer/distilbert_base_uncased/special_tokens_map.json',
 './tokenizer/distilbert_base_uncased/added_tokens.json')

In [12]:
# Build the WordPiece tokenizer from the vocab.txt written to `save_path`,
# lowercasing its input.
fast_tokenizer = BertWordPieceTokenizer(f'{save_path}/vocab.txt', lowercase=True)
# transformer = model_class.from_pretrained(pretrained_weights)

In [45]:
# AUTOTUNE sentinel handed to `.prefetch()` when the datasets are built.
AUTO = tf.data.experimental.AUTOTUNE

# Configuration
EPOCHS = 2  # passed as `epochs` to both fit() calls
BATCH_SIZE = 64  # batch size of every tf.data pipeline; also used to derive steps_per_epoch
MAX_LEN = 192  # `maxlen` given to the encoder and the input length of the model

In [14]:
# %%time
# x_train = regular_encode(train.comment_text.values, tokenizer, maxlen=MAX_LEN)
# x_valid = regular_encode(valid.comment_text.values, tokenizer, maxlen=MAX_LEN)
# x_test = regular_encode(test.content.values, tokenizer, maxlen=MAX_LEN)

# y_train = train.toxic.values
# y_valid = valid.toxic.values

In [15]:
%%time
# Binary targets for the two labelled splits.
y_train = train['toxic'].values
y_valid = valid['toxic'].values

# Token-id inputs for all three splits, encoded with the fast tokenizer at
# maxlen=MAX_LEN. Note the test split stores its text in `content`.
x_train = fast_encode(train['comment_text'].values, fast_tokenizer, maxlen=MAX_LEN)
x_valid = fast_encode(valid['comment_text'].values, fast_tokenizer, maxlen=MAX_LEN)
x_test = fast_encode(test['content'].values, fast_tokenizer, maxlen=MAX_LEN)

HBox(children=(FloatProgress(value=0.0, max=79.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=250.0), HTML(value='')))


CPU times: user 30.6 s, sys: 621 ms, total: 31.2 s
Wall time: 6.23 s


### Build the Dataset objects

In [16]:
# Training pipeline: shuffle, repeat indefinitely (fit() is bounded by
# steps_per_epoch), batch and prefetch.
# shuffle() is applied before repeat() so that each pass over the data is
# shuffled on its own; with repeat() first, the shuffle buffer mixes examples
# from consecutive passes and an example can recur before others are seen.
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_train, y_train))
    .shuffle(2048)
    .repeat()
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

# Validation pipeline: fixed order, batched, with the batches cached and
# prefetched.
valid_dataset = tf.data.Dataset.from_tensor_slices((x_valid, y_valid))
valid_dataset = valid_dataset.batch(BATCH_SIZE).cache()
valid_dataset = valid_dataset.prefetch(AUTO)

# Test pipeline: inputs only (no labels), batched in file order.
test_dataset = tf.data.Dataset.from_tensor_slices(x_test).batch(BATCH_SIZE)

# Display the validation dataset's element spec as the cell output.
valid_dataset

<PrefetchDataset shapes: ((None, 192), (None,)), types: (tf.int64, tf.int64)>

## Baseline Model

In [40]:
# NOTE(review): this value is not referenced by any visible cell. The model
# below takes its embedding width from the transformer's weight matrix
# (768 in the printed summary), so 1024 here looks stale -- confirm before
# relying on it.
embedding_dim = 1024
def build_baseline_model(transformer, max_len=512):
    """Build and compile a bag-of-embeddings baseline classifier.

    Token ids are looked up in a frozen copy of the transformer's first
    weight matrix, the resulting vectors are averaged over the sequence
    axis, and the average is fed to a single sigmoid unit. The transformer's
    forward pass is not used; only ``transformer.weights[0]`` is read.

    Args:
        transformer: Pretrained model supplying the embedding matrix.
            NOTE(review): assumes ``weights[0]`` is the 2-D
            (vocab_size, hidden_dim) word-embedding table -- confirm when
            using a model class other than the one in this notebook.
        max_len: Length of the token-id sequences fed to the model.

    Returns:
        A Keras ``Model`` compiled with Adam (learning rate 1e-5),
        binary cross-entropy loss and the accuracy metric.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")

    # Copy the pretrained embedding table out of the transformer.
    embed = transformer.weights[0].numpy()
    vocab_size, hidden_dim = np.shape(embed)[0], np.shape(embed)[1]

    # Frozen lookup layer initialised with the pretrained vectors.
    out = Embedding(vocab_size, hidden_dim,
                    input_length=max_len, weights=[embed],
                    trainable=False)(input_word_ids)
    out = GlobalAveragePooling1D()(out)
    out = Dense(1, activation='sigmoid')(out)

    model = Model(inputs=input_word_ids, outputs=out)
    # `learning_rate` is the supported argument name; `lr` is a deprecated alias.
    model.compile(Adam(learning_rate=1e-5), loss='binary_crossentropy', metrics=['accuracy'])

    return model

### Define callbacks

In [41]:
def callback():
    """Assemble the list of Keras callbacks used during training.

    Returns:
        A list with, in order: a ``ReduceLROnPlateau`` monitoring
        ``val_loss``, a ``CSVLogger`` writing to ``log.csv``, and a
        ``RocAucEvaluation`` built from the module-level ``x_valid`` and
        ``y_valid`` arrays.
    """
    cb = []

    # `min_delta` is the supported name for the improvement threshold;
    # `epsilon` is its deprecated predecessor.
    reduceLROnPlat = ReduceLROnPlateau(monitor='val_loss',
                                       factor=0.3, patience=3,
                                       verbose=1, mode='auto',
                                       min_delta=0.0001, cooldown=1, min_lr=0.000001)
    cb.append(reduceLROnPlat)

    log = CSVLogger('log.csv')
    cb.append(log)

    # Reads the validation arrays from the notebook's global scope.
    RocAuc = RocAucEvaluation(validation_data=(x_valid, y_valid), interval=1)
    cb.append(RocAuc)

    return cb

In [42]:
# Load the pretrained transformer (only its embedding matrix is used by the
# baseline), build the model for MAX_LEN-token inputs, and print its layers.
transformer_layer = model_class.from_pretrained(pretrained_weights)
baseline_model = build_baseline_model(transformer=transformer_layer, max_len=MAX_LEN)
baseline_model.summary()

Model: "model_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 192)]             0         
_________________________________________________________________
embedding_5 (Embedding)      (None, 192, 768)          23440896  
_________________________________________________________________
global_average_pooling1d_5 ( (None, 768)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 769       
Total params: 23,441,665
Trainable params: 769
Non-trainable params: 23,440,896
_________________________________________________________________


### Two-stage training

In [46]:
# Stage 1: fit on the training pipeline and validate after every epoch.
# The training dataset repeats forever, so an epoch is defined as the number
# of whole batches in x_train.
n_steps = x_train.shape[0] // BATCH_SIZE
train_history = baseline_model.fit(
    train_dataset,
    epochs=EPOCHS,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
    # callbacks=callback(),  # callbacks are left disabled for this run
)

Epoch 1/2
Epoch 2/2


In [None]:
# Stage 2: continue training the same model on the validation data itself,
# so no validation_data is passed here. The dataset is repeated and each
# epoch is capped at the number of whole batches in x_valid.
n_steps = x_valid.shape[0] // BATCH_SIZE
train_history_2 = baseline_model.fit(
    valid_dataset.repeat(),
    epochs=EPOCHS,
    steps_per_epoch=n_steps,
    # callbacks=callback(),  # callbacks are left disabled for this run
)

Epoch 1/2
Epoch 2/2

## Inference and Submission

In [37]:
# The model ends in Dense(1), so predict() returns an array of shape (n, 1).
# Flatten it to 1-D explicitly before storing it as a single column instead
# of relying on pandas to accept a 2-D value for one column.
sub['toxic'] = baseline_model.predict(test_dataset, verbose=1).ravel()



In [38]:
# Write the submission file into the data folder, without the index column.
sub.to_csv(f'{DATA_FOLDER}/submission.csv', index=False)

In [39]:
# Preview the first rows of the submission frame (id and predicted `toxic`).
sub.head()

Unnamed: 0,id,toxic
0,0,0.137686
1,1,0.085052
2,2,0.135775
3,3,0.108894
4,4,0.112335
