In [1]:
# Download the PyTorch/XLA environment setup script and run it. Per the cell
# output below, the script updates the TPU runtime and replaces the installed
# torch / torchvision with nightly wheels.
# NOTE(review): the script is fetched from the `master` branch, so what gets
# installed presumably depends on when this cell is run -- confirm whether a
# pinned revision is wanted for reproducibility.
!curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
!python pytorch-xla-env-setup.py --apt-packages libomp5 libopenblas-dev

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  4264  100  4264    0     0  99162      0 --:--:-- --:--:-- --:--:-- 99162
Updating TPU and VM. This may take around 2 minutes.
Updating TPU runtime to pytorch-dev20200515 ...
Found existing installation: torch 1.5.0
Uninstalling torch-1.5.0:
Done updating TPU runtime: <Response [200]>
  Successfully uninstalled torch-1.5.0
Found existing installation: torchvision 0.6.0a0+82fd1c8
Uninstalling torchvision-0.6.0a0+82fd1c8:
  Successfully uninstalled torchvision-0.6.0a0+82fd1c8
Copying gs://tpu-pytorch/wheels/torch-nightly+20200515-cp37-cp37m-linux_x86_64.whl...
- [1 files][ 91.0 MiB/ 91.0 MiB]                                                
Operation completed over 1 objects/91.0 MiB.                                     
Copying gs://tpu-pytorch/wheels/torch_xla-nightly+20200515-cp37-cp37m-linux_x86_64.whl...
| [1 files][119.5 M

In [2]:
import os
import numpy as np 
import pandas as pd 
from tqdm import tqdm
from sklearn import model_selection
from sklearn import metrics
import torch
import torch.nn as nn
import transformers
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

In [3]:
# To use TPUs with PyTorch, we need the PyTorch/XLA library (torch_xla)
import warnings
import torch_xla
import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_multiprocessing as xmp
import torch_xla.distributed.parallel_loader as pl

import torch_xla.debug.metrics as met
import torch_xla.distributed.data_parallel as dp
import torch_xla.utils.utils as xu
import torch_xla.test.test_utils as test_utils
import warnings

# Install a global "ignore" filter: every warning raised after this point is
# suppressed, including deprecation warnings from the libraries imported above.
warnings.filterwarnings("ignore")

In [4]:
# --- Hyper-parameters -------------------------------------------------------
MAX_LEN = 224            # max_length handed to the tokenizer for every comment
TRAIN_BATCH_SIZE = 64    # batch size of the training DataLoader
VALID_BATCH_SIZE = 8     # batch size of the validation DataLoader
EPOCHS = 2

# --- Model ------------------------------------------------------------------
PRETRAINED_BERT = "bert-base-multilingual-uncased"
MODEL_PATH = "mBERT.bin"  # where the best checkpoint is written

# --- Data files (all live in the same competition input directory) ----------
_INPUT_DIR = "../input/jigsaw-multilingual-toxic-comment-classification"
TRAINING_FILE_1 = f"{_INPUT_DIR}/jigsaw-toxic-comment-train.csv"
TRAINING_FILE_2 = f"{_INPUT_DIR}/jigsaw-unintended-bias-train.csv"
VALIDATION_FILE = f"{_INPUT_DIR}/validation.csv"

# --- Tokenizer --------------------------------------------------------------
# Load the pre-trained BERT tokenizer matching PRETRAINED_BERT (lower-casing on).
TOKENIZER = transformers.BertTokenizer.from_pretrained(PRETRAINED_BERT, do_lower_case=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=871891.0, style=ProgressStyle(descripti…




In [5]:
class BERTDatasetTraining:
  """Map-style dataset that tokenizes one comment per item.

  Args:
    comment_text: indexable collection of raw comments.
    targets: indexable collection of labels, aligned with ``comment_text``.
    tokenizer: object exposing ``encode_plus`` (e.g. a BERT tokenizer).
    max_length: ``max_length`` forwarded to ``tokenizer.encode_plus``.
  """

  def __init__(self, comment_text, targets, tokenizer, max_length):
    self.comment_text = comment_text
    self.targets = targets
    self.tokenizer = tokenizer
    self.max_length = max_length

  def __len__(self):
    """Number of comments in the dataset."""
    return len(self.comment_text)

  def __getitem__(self, item):
    """Tokenize comment ``item`` and return its tensors.

    Returns:
      dict with keys ``ids``, ``mask``, ``token_type_ids`` (long tensors)
      and ``targets`` (float tensor).
    """
    # Coerce to str (entries may be non-string) and collapse whitespace runs.
    normalized = " ".join(str(self.comment_text[item]).split())

    encoded = self.tokenizer.encode_plus(
        normalized,
        None,  # no second sequence
        add_special_tokens=True,
        max_length=self.max_length,
        pad_to_max_length=True,
    )

    input_ids = encoded["input_ids"]
    segment_ids = encoded["token_type_ids"]
    attention = encoded["attention_mask"]
    label = self.targets[item]

    sample = {}
    sample['ids'] = torch.tensor(input_ids, dtype=torch.long)
    sample['mask'] = torch.tensor(attention, dtype=torch.long)
    sample['token_type_ids'] = torch.tensor(segment_ids, dtype=torch.long)
    sample['targets'] = torch.tensor(label, dtype=torch.float)
    return sample

In [6]:
class BERTBaseUncased(nn.Module):
  """BERT encoder with a mean+max pooled single-logit classification head.

  Args:
    pretrained_bert: model name or path passed to
      ``transformers.BertModel.from_pretrained``.
  """

  def __init__(self, pretrained_bert):
    super(BERTBaseUncased, self).__init__()
    self.pretrained_bert = pretrained_bert
    # Importing pre-trained BERT model
    self.bert = transformers.BertModel.from_pretrained(self.pretrained_bert)
    self.bert_drop = nn.Dropout(0.3)
    # The head consumes [mean-pool ; max-pool], i.e. twice the encoder width.
    # Read the width from the loaded config instead of hard-coding 768 so the
    # class also works with checkpoints of a different hidden size.
    hidden_size = self.bert.config.hidden_size
    self.out = nn.Linear(hidden_size * 2, 1)

  def forward(self, ids, mask, token_type_ids):
    """Return one raw logit per example.

    Args:
      ids: token id tensor fed to the encoder.
      mask: attention mask forwarded as ``attention_mask``.
      token_type_ids: segment ids forwarded as ``token_type_ids``.

    Returns:
      Tensor produced by the final ``nn.Linear(..., 1)`` layer (no sigmoid).
    """
    encoder_outputs = self.bert(ids,
                                attention_mask=mask,
                                token_type_ids=token_type_ids)
    # Take the first output by index rather than tuple-unpacking: indexing
    # works whether the encoder returns a plain tuple or a dict-like output
    # object (whose iteration would yield keys, not tensors).
    sequence_output = encoder_outputs[0]

    # Pool over dim 1 of the per-token output.
    # NOTE(review): `mask` is not applied here, so padded positions take part
    # in both the mean and the max -- confirm this is intended.
    apool = torch.mean(sequence_output, 1)
    mpool, _ = torch.max(sequence_output, 1)
    cat = torch.cat((apool, mpool), 1)

    bo = self.bert_drop(cat)
    output = self.out(bo)

    return output

In [7]:
def loss_fn(outputs, targets):
    """Binary cross-entropy on raw logits.

    ``targets`` is reshaped to a single column so it lines up with the
    one-logit-per-row ``outputs``; the loss uses the default reduction.
    """
    column_targets = targets.view(-1, 1)
    criterion = nn.BCEWithLogitsLoss()
    return criterion(outputs, column_targets)


def train_loop_fn(data_loader, model, optimizer, device, scheduler=None):
  """Run one pass over ``data_loader``, updating ``model`` after every batch.

  Args:
    data_loader: iterable of dict batches with keys ``ids``, ``mask``,
      ``token_type_ids`` and ``targets``.
    model: module called with ``ids``, ``mask`` and ``token_type_ids``.
    optimizer: optimizer stepped through ``xm.optimizer_step``.
    device: device the batch tensors are moved to.
    scheduler: optional LR scheduler, stepped once per batch when given.
  """
  model.train()
  for step, batch in enumerate(data_loader):
    # Move the batch onto the target device with the dtypes the model expects.
    ids = batch["ids"].to(device, dtype=torch.long)
    mask = batch["mask"].to(device, dtype=torch.long)
    token_type_ids = batch["token_type_ids"].to(device, dtype=torch.long)
    targets = batch["targets"].to(device, dtype=torch.float)

    optimizer.zero_grad()
    logits = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
    loss = loss_fn(logits, targets)

    # Report progress every tenth batch.
    if not step % 10:
        xm.master_print(f'bi={step}, loss={loss}')

    loss.backward()

    # On XLA the optimizer is stepped through xm.optimizer_step instead of
    # calling optimizer.step() directly.
    xm.optimizer_step(optimizer)

    if scheduler is None:
        continue
    scheduler.step()


def eval_loop_fn(data_loader, model, device):
  """Run ``model`` over ``data_loader`` in eval mode and collect predictions.

  Args:
    data_loader: iterable of dict batches with keys ``ids``, ``mask``,
      ``token_type_ids`` and ``targets``.
    model: module called with ``ids``, ``mask`` and ``token_type_ids``.
    device: device the batch tensors are moved to.

  Returns:
    ``(fin_outputs, fin_targets)``: Python lists holding the model outputs
    (raw, as returned by ``model``) and the targets, in iteration order.
  """
  model.eval()
  fin_targets = []
  fin_outputs = []
  # Inference only: disable autograd so no graph is recorded per batch.
  with torch.no_grad():
    for d in data_loader:
      ids = d["ids"]
      mask = d["mask"]
      token_type_ids = d["token_type_ids"]
      targets = d["targets"]

      ids = ids.to(device, dtype=torch.long)
      mask = mask.to(device, dtype=torch.long)
      token_type_ids = token_type_ids.to(device, dtype=torch.long)
      targets = targets.to(device, dtype=torch.float)

      outputs = model(
          ids=ids,
          mask=mask,
          token_type_ids=token_type_ids
      )

      # Bring results back to host memory as plain Python lists.
      targets_np = targets.cpu().detach().numpy().tolist()
      outputs_np = outputs.cpu().detach().numpy().tolist()
      fin_targets.extend(targets_np)
      fin_outputs.extend(outputs_np)

  return fin_outputs, fin_targets

In [8]:
# Build the classifier at module level (this also downloads the pre-trained
# weights on first use, see the progress bars in the cell output).
model = BERTBaseUncased(pretrained_bert=PRETRAINED_BERT)

# Only the text and the binary label are needed; missing values become "none".
df1 = pd.read_csv(TRAINING_FILE_1, usecols=["comment_text", "toxic"]).fillna("none")
df2 = pd.read_csv(TRAINING_FILE_2, usecols=["comment_text", "toxic"]).fillna("none")

# Pool both training files, shuffle, and keep a 200,000-row subsample.
# random_state pins the shuffle so the subsample is the same on every run.
df_train = pd.concat([df1, df2], axis=0).reset_index(drop=True)
df_train = df_train.sample(frac=1, random_state=42).reset_index(drop=True).head(200000)

df_valid = pd.read_csv(VALIDATION_FILE, usecols=["comment_text", "toxic"])

# NOTE: the validation rows are appended to the training set here, so any
# metric later computed on df_valid is measured on data the model trained on.
df_train = pd.concat([df_train, df_valid], axis=0).reset_index(drop=True)
df_train = df_train.sample(frac=1, random_state=42).reset_index(drop=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=625.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=672271273.0, style=ProgressStyle(descri…




In [9]:
def _run():
  """Train and validate ``model`` on one TPU core (one spawned process).

  Reads the module-level ``df_train``, ``df_valid``, ``model``, ``TOKENIZER``
  and the configuration constants. After every epoch the validation AUC is
  averaged over all cores, and the model's state dict is saved to
  ``MODEL_PATH`` whenever that averaged AUC improves.
  """
  train_dataset = BERTDatasetTraining(
      comment_text=df_train.comment_text.values,
      targets=df_train.toxic.values,
      tokenizer=TOKENIZER,
      max_length=MAX_LEN
  )

  # A DistributedSampler is required on TPUs: it gives each core its own
  # slice of the data.
  train_sampler = torch.utils.data.distributed.DistributedSampler(
      train_dataset,
      num_replicas=xm.xrt_world_size(),
      rank=xm.get_ordinal(),
      shuffle=True)

  train_data_loader = torch.utils.data.DataLoader(
      train_dataset,
      batch_size=TRAIN_BATCH_SIZE,
      sampler=train_sampler,
      drop_last=True,
      num_workers=4
  )

  valid_dataset = BERTDatasetTraining(
      comment_text=df_valid.comment_text.values,
      targets=df_valid.toxic.values,
      tokenizer=TOKENIZER,
      max_length=MAX_LEN
  )

  valid_sampler = torch.utils.data.distributed.DistributedSampler(
      valid_dataset,
      num_replicas=xm.xrt_world_size(),
      rank=xm.get_ordinal(),
      shuffle=False)

  valid_data_loader = torch.utils.data.DataLoader(
      valid_dataset,
      batch_size=VALID_BATCH_SIZE,
      sampler=valid_sampler,
      drop_last=False,
      num_workers=4
  )

  # XLA-specific: obtain this process's XLA device and move the model onto it.
  device = xm.xla_device()
  model.to(device)

  # Apply weight decay to everything except biases and LayerNorm parameters.
  param_optimizer = list(model.named_parameters())
  no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
  optimizer_grouped_parameters = [
      {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
      {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]

  # The learning rate is scaled by the number of cores, and the number of
  # training steps per core is divided by the number of cores.
  lr = 0.4 * 1e-5 * xm.xrt_world_size()
  num_train_steps = int(len(train_dataset) / TRAIN_BATCH_SIZE / xm.xrt_world_size() * EPOCHS)
  xm.master_print(f'num_train_steps = {num_train_steps}, world_size={xm.xrt_world_size()}')

  optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
  scheduler = get_linear_schedule_with_warmup(
      optimizer,
      num_warmup_steps=0,
      num_training_steps=num_train_steps
  )

  best_roc_auc = 0
  for epoch in range(EPOCHS):
    # Without set_epoch the sampler reuses the same shuffle order every epoch.
    train_sampler.set_epoch(epoch)

    # XLA-specific: each DataLoader has to be wrapped in a ParallelLoader.
    para_loader = pl.ParallelLoader(train_data_loader, [device])
    train_loop_fn(para_loader.per_device_loader(device), model, optimizer, device, scheduler=scheduler)

    para_loader = pl.ParallelLoader(valid_data_loader, [device])
    o, t = eval_loop_fn(para_loader.per_device_loader(device), model, device)

    # Each core only scored its own shard of the validation set. Average the
    # per-core AUCs so every core sees the same number: the checkpoint branch
    # below must be taken by all cores or by none, because xm.save
    # synchronises across cores.
    local_roc_auc = metrics.roc_auc_score(np.array(t) >= 0.5, o)
    roc_auc = xm.mesh_reduce('auc_reduce', local_roc_auc, np.mean)
    xm.master_print(f'AUC = {roc_auc}')
    if roc_auc > best_roc_auc:
      # Use xm.save instead of torch.save when running on XLA devices.
      xm.save(model.state_dict(), MODEL_PATH)
      best_roc_auc = roc_auc

In [10]:
# Launch the training processes.
def _multiprocessing_function(rank, flags):
    """Per-process entry point handed to ``xmp.spawn``.

    ``rank`` and ``flags`` are supplied by the spawner and are not used here;
    the function sets the default tensor type and runs ``_run``.
    """
    torch.set_default_tensor_type('torch.FloatTensor')
    _run()


FLAGS = {}
xmp.spawn(
    _multiprocessing_function,
    args=(FLAGS,),
    nprocs=8,
    start_method='fork',
)

num_train_steps = 812, world_size=8
bi=0, loss=0.6500879526138306
bi=10, loss=0.38799771666526794
bi=20, loss=0.3137851357460022
bi=30, loss=0.3322542607784271
bi=40, loss=0.2917523980140686
bi=50, loss=0.26058685779571533
bi=60, loss=0.3470531404018402
bi=70, loss=0.23828867077827454
bi=80, loss=0.20857523381710052
bi=90, loss=0.2835109531879425
bi=100, loss=0.22540879249572754
bi=110, loss=0.2524469792842865
bi=120, loss=0.26607745885849
bi=130, loss=0.21546605229377747
bi=140, loss=0.20279334485530853
bi=150, loss=0.21931655704975128
bi=160, loss=0.25631076097488403
bi=170, loss=0.1354462057352066
bi=180, loss=0.3117203414440155
bi=190, loss=0.16671505570411682
bi=200, loss=0.2922610640525818
bi=210, loss=0.20948097109794617
bi=220, loss=0.30595624446868896
bi=230, loss=0.30060774087905884
bi=240, loss=0.2394719123840332
bi=250, loss=0.21053344011306763
bi=260, loss=0.19130487740039825
bi=270, loss=0.2283458113670349
bi=280, loss=0.2802594006061554
bi=290, loss=0.15899936854839325
b