### Postprocess

In [0]:
import matplotlib.pyplot as plt
plt.style.use('seaborn')
plt.rcParams['axes.grid'] = False
%matplotlib inline

In [1]:
import os

import numpy as np
import pandas as pd
try:
    from google.colab import drive
    drive.mount('/content/drive')
    %tensorflow_version 2.x
    IS_COLAB = True
except:
    IS_COLAB = False
    pass
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import (ModelCheckpoint, ReduceLROnPlateau, 
                                        CSVLogger, Callback)
import tensorflow.keras.backend as K
assert tf.__version__ >= '2.0'

In [2]:
import transformers
from transformers import TFAutoModel, AutoTokenizer
from tqdm.notebook import tqdm
tqdm.pandas()
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors

In [3]:
DATA_FOLDER = "/content/drive/My Drive/course/DATA 2040/DATA2040_FinalProject/data" if IS_COLAB else "../../data"

In [7]:
import sys
HELPER_PATH = "/content/drive/My Drive/course/DATA 2040/DATA2040_FinalProject/utils" if IS_COLAB else "../utils"
sys.path.append(HELPER_PATH)
from helpers import regular_encode, roc_auc, RocAucEvaluation

In [4]:
os.listdir(DATA_FOLDER)

['jigsaw-toxic-comment-train-google-pt-cleaned.csv',
 'validation.csv',
 'jigsaw-toxic-comment-train-google-it-cleaned.csv',
 'jigsaw-toxic-comment-train-google-ru-cleaned.csv',
 'jigsaw-toxic-comment-train-google-ru.csv',
 'jigsaw-unintended-bias-train.csv',
 'train_cleaned.csv',
 'jigsaw-toxic-comment-train-google-es.csv',
 'jigsaw-toxic-comment-train-google-fr-cleaned.csv',
 'jigsaw-toxic-comment-train-google-es-cleaned.csv',
 'validation-processed-seqlen128.csv',
 'jigsaw-train-multilingual-coments-google-api.zip',
 'jigsaw-toxic-comment-train-processed-seqlen128.csv',
 'jigsaw-toxic-comment-train-google-fr.csv',
 'jigsaw-toxic-comment-train-google-it.csv',
 'jigsaw-unintended-bias-train-processed-seqlen128.csv',
 'validation_cleaned.csv',
 'jigsaw-toxic-comment-train-google-tr-cleaned.csv',
 'jigsaw-toxic-comment-train-google-tr.csv',
 'jigsaw-toxic-comment-train-google-pt.csv',
 'test.csv',
 'submission.csv',
 'test-processed-seqlen128.csv',
 'jigsaw-toxic-comment-train.csv',
 's

In [21]:
# Detect hardware, return appropriate distribution strategy
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

REPLICAS:  1


## Load data

In [5]:
train1 = pd.read_csv(DATA_FOLDER + "/jigsaw-toxic-comment-train.csv")
train2 = pd.read_csv(DATA_FOLDER + "/jigsaw-unintended-bias-train.csv")
train2.toxic = train2.toxic.round().astype(int)

valid = pd.read_csv(DATA_FOLDER + '/validation.csv')

In [6]:
train = pd.concat([
    train1[['comment_text', 'toxic']],
    train2[['comment_text', 'toxic']].query('toxic==1'),
    train2[['comment_text', 'toxic']].query('toxic==0').sample(n=60000, random_state=0))
])

## Data Preprocessing

### Tokenize the dataset

In [8]:
MODEL = 'jplu/tf-xlm-roberta-base'
# First load the real tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=512.0, style=ProgressStyle(description_…




In [12]:
AUTO = tf.data.experimental.AUTOTUNE

# Configuration
EPOCHS = 2
BATCH_SIZE = 64
MAX_LEN = 192

In [None]:
%%time
x_train = regular_encode(train.comment_text.values, tokenizer, maxlen=MAX_LEN)
x_valid = regular_encode(valid.comment_text.values, tokenizer, maxlen=MAX_LEN)

In [13]:
y_train = train.toxic.values
y_valid = valid.toxic.values

### Build the `Dataset` objects

In [15]:
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_train, y_train))
    .repeat()
    .shuffle(2048)
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

valid_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_valid, y_valid))
    .batch(BATCH_SIZE)
    .cache()
    .prefetch(AUTO)
)


## Baseline Model

In [16]:
def build_model(transformer, max_len=512):
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    cls_token = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(cls_token)
    model = Model(inputs=input_word_ids, outputs=out)
    model.compile(Adam(lr=1e-5), loss='binary_crossentropy', metrics=['accuracy'])    
    return model

In [23]:
%%time
with strategy.scope():
    transformer_layer = TFAutoModel.from_pretrained(MODEL)
    model = build_model(transformer_layer, max_len=MAX_LEN)

CPU times: user 2.28 s, sys: 1.24 s, total: 3.52 s
Wall time: 4.07 s


In [24]:
SAVED_PATH = "../models/savedmodels/jigsawMultilingual.h5"

In [25]:
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 192)]             0         
_________________________________________________________________
tf_roberta_model_1 (TFRobert ((None, 192, 768), (None, 278043648 
_________________________________________________________________
tf_op_layer_strided_slice_1  [(None, 768)]             0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 769       
Total params: 278,044,417
Trainable params: 278,044,417
Non-trainable params: 0
_________________________________________________________________


In [26]:
model.load_weights(SAVED_PATH)

ValueError: Layer #0 (named "tf_roberta_model_1" in the current model) was found to correspond to layer tf_roberta_model in the save file. However the new layer tf_roberta_model_1 expects 199 weights, but the saved weights have 391 elements.

## Prediction results

In [None]:
y_pred_train = model.predict(train_dataset)

In [None]:
y_pred_val = model.predict(valid_dataset)