## TPU-8-CORES

In [None]:
# Download the PyTorch/XLA environment setup script from the pytorch/xla GitHub repository.
# NOTE(review): the script is fetched from the master branch, so its contents may change over time.
!curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
# Run the downloaded script, also requesting the libomp5 and libopenblas-dev apt packages.
!python pytorch-xla-env-setup.py --apt-packages libomp5 libopenblas-dev

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100  4264  100  4264    0     0  56853      0 --:--:-- --:--:-- --:--:-- 56853
Updating TPU and VM. This may take around 2 minutes.
Updating TPU runtime to pytorch-dev20200515 ...
Uninstalling torch-1.5.0+cu101:
Done updating TPU runtime: <Response [200]>
  Successfully uninstalled torch-1.5.0+cu101
Uninstalling torchvision-0.6.0+cu101:
  Successfully uninstalled torchvision-0.6.0+cu101
Copying gs://tpu-pytorch/wheels/torch-nightly+20200515-cp36-cp36m-linux_x86_64.whl...
\ [1 files][ 91.0 MiB/ 91.0 MiB]                                                
Operation completed over 1 objects/91.0 MiB.                                     
Copying gs://tpu-pytorch/wheels/torch_xla-nightly+20200515-cp36-cp36m-linux_x86_64.whl...
\ [1 files][119.5 MiB/119.5 MiB] 

In [None]:
# Pin the release this notebook was written against (the one shown in the recorded output).
# The code below relies on the 2.x API: `pad_to_max_length` in `encode_plus` and a tuple
# return value from `BertModel`, so an unpinned install can break on newer releases.
!pip install transformers==2.11.0

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/48/35/ad2c5b1b8f99feaaf9d7cdadaeef261f098c6e1a6a2935d4d07662a6b780/transformers-2.11.0-py3-none-any.whl (674kB)
[K     |████████████████████████████████| 675kB 3.1MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 8.3MB/s 
Collecting tokenizers==0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/14/e5/a26eb4716523808bb0a799fcfdceb6ebf77a18169d9591b2f46a9adb87d9/tokenizers-0.7.0-cp36-cp36m-manylinux1_x86_64.whl (3.8MB)
[K     |████████████████████████████████| 3.8MB 15.0MB/s 
[?25hCollecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████

In [None]:
# Mount Google Drive at /content/gdrive/ so that the dataset, pretrained weights and
# output locations configured under DIR_INPUT are reachable as regular file paths.
# The call is interactive: it asks for an authorization code before mounting.
from google.colab import drive
drive.mount('/content/gdrive/')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive/


In [None]:
import os
import numpy as np 
import pandas as pd 
from tqdm import tqdm
from sklearn import model_selection
from sklearn import metrics
import torch
import torch.nn as nn
import transformers
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

In [None]:
# To run PyTorch on TPUs we need the PyTorch/XLA library (torch_xla)
import warnings
import torch_xla
import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_multiprocessing as xmp
import torch_xla.distributed.parallel_loader as pl

import torch_xla.debug.metrics as met
import torch_xla.distributed.data_parallel as dp
import torch_xla.utils.utils as xu
import torch_xla.test.test_utils as test_utils
import warnings

warnings.filterwarnings("ignore")

In [None]:
# Length passed to the tokenizer as `max_length`; sequences are padded to it.
MAX_LEN = 224
# Batch sizes handed to the DataLoaders. Each spawned process builds its own loaders,
# so these are per-process batch sizes.
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 8
# Number of train + validation passes performed in _run.
EPOCHS = 2

# Root folder of the project inside the mounted Google Drive.
DIR_INPUT = '/content/gdrive/My Drive/Jigsaw/Toxic Comment'
# Directory given to `from_pretrained` for both the tokenizer and the BERT model.
BERT_PATH = f'{DIR_INPUT}/input/bert_base_multilingual_uncased/' 
# File the model state dict is saved to whenever the validation AUC improves.
MODEL_PATH = f'{DIR_INPUT}/output/mBERT-Training-with-validation/model.bin' 
# CSV files read with the "comment_text" and "toxic" columns.
TRAINING_FILE_1 = f'{DIR_INPUT}/input/jigsaw-toxic-comment-train.csv' 
TRAINING_FILE_2 = f'{DIR_INPUT}/input/jigsaw-unintended-bias-train.csv' 
VALIDATION_FILE = f'{DIR_INPUT}/input/validation.csv' 

# Tokenizer is loaded once at module level and shared by the training and validation datasets.
TOKENIZER = transformers.BertTokenizer.from_pretrained(
    BERT_PATH,
    do_lower_case=True
)

In [None]:
class BERTDatasetTraining:
  """Indexable dataset that turns (comment, target) pairs into model-ready tensors.

  Args:
    comment_text: indexable collection of comments; each item is converted with ``str``.
    targets: indexable collection of target values, aligned with ``comment_text``.
    tokenizer: object exposing ``encode_plus`` that returns a mapping with the keys
      ``input_ids``, ``token_type_ids`` and ``attention_mask``.
    max_length: value forwarded to the tokenizer as ``max_length``.
  """

  # (key in the returned sample, key in the tokenizer output)
  _ENCODED_FIELDS = (
      ('ids', 'input_ids'),
      ('mask', 'attention_mask'),
      ('token_type_ids', 'token_type_ids'),
  )

  def __init__(self, comment_text, targets, tokenizer, max_length):
    self.comment_text, self.targets = comment_text, targets
    self.tokenizer, self.max_length = tokenizer, max_length

  def __len__(self):
    """Number of comments in the dataset."""
    return len(self.comment_text)

  def __getitem__(self, item):
    """Tokenize the comment at index ``item`` and return it with its target.

    Returns:
      dict with long tensors ``ids``, ``mask``, ``token_type_ids`` and a float
      tensor ``targets``.
    """
    # Collapse every run of whitespace into a single space before tokenizing.
    text = " ".join(str(self.comment_text[item]).split())

    encoded = self.tokenizer.encode_plus(
        text,
        None,
        add_special_tokens=True,
        max_length=self.max_length,
        pad_to_max_length=True,
    )

    sample = {
        out_key: torch.tensor(encoded[enc_key], dtype=torch.long)
        for out_key, enc_key in self._ENCODED_FIELDS
    }
    sample['targets'] = torch.tensor(self.targets[item], dtype=torch.float)
    return sample

In [None]:
class BERTBaseUncased(nn.Module):
  """BERT encoder followed by dropout and a single-logit linear head.

  The head consumes the concatenation of the mean and the max of the encoder's
  first output, both taken over dimension 1 (presumably the token axis - TODO confirm).
  The linear layer is sized for 768-wide encoder features.

  Args:
    bert_path: location passed to ``transformers.BertModel.from_pretrained``.
  """

  def __init__(self, bert_path):
    super().__init__()
    self.bert_path = bert_path
    self.bert = transformers.BertModel.from_pretrained(bert_path)
    self.bert_drop = nn.Dropout(0.3)
    # Twice the feature width because mean- and max-pooled vectors are concatenated.
    self.out = nn.Linear(768 * 2, 1)

  def forward(self, ids, mask, token_type_ids):
    """Return one raw logit per input row (no sigmoid applied)."""
    # The encoder is expected to return exactly two values; only the first is used.
    sequence_output, _ = self.bert(
        ids,
        attention_mask=mask,
        token_type_ids=token_type_ids,
    )

    mean_pooled = sequence_output.mean(dim=1)
    max_pooled = sequence_output.max(dim=1)[0]
    pooled = torch.cat([mean_pooled, max_pooled], dim=1)

    return self.out(self.bert_drop(pooled))

In [None]:
def loss_fn(outputs, targets):
  """Binary cross-entropy on raw logits.

  ``targets`` is reshaped to a single column so it lines up with ``outputs``.
  """
  criterion = nn.BCEWithLogitsLoss()
  column_targets = targets.view(-1, 1)
  return criterion(outputs, column_targets)

In [None]:
def train_loop_fn(data_loader, model, optimizer, device, scheduler=None):
  """Run one pass of training over ``data_loader``.

  Args:
    data_loader: iterable of batches; each batch is a mapping with the keys
      "ids", "mask", "token_type_ids" and "targets".
    model: module called with ``ids``, ``mask`` and ``token_type_ids`` keywords.
    optimizer: optimizer stepped through ``xm.optimizer_step``.
    device: device the batch tensors are moved to.
    scheduler: optional learning-rate scheduler, stepped once per batch.
  """
  model.train()
  for step, batch in enumerate(data_loader):
    # Move the batch onto the target device with the dtypes the model and loss expect.
    ids = batch["ids"].to(device, dtype=torch.long)
    mask = batch["mask"].to(device, dtype=torch.long)
    token_type_ids = batch["token_type_ids"].to(device, dtype=torch.long)
    targets = batch["targets"].to(device, dtype=torch.float)

    optimizer.zero_grad()
    logits = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
    loss = loss_fn(logits, targets)

    # Report progress every 10th batch (master_print restricts who prints).
    if step % 10 == 0:
      xm.master_print(f'bi={step}, loss={loss}')

    loss.backward()

    # The optimizer is stepped through the XLA helper rather than optimizer.step().
    xm.optimizer_step(optimizer)

    if scheduler is not None:
      scheduler.step()

In [None]:
def eval_loop_fn(data_loader, model, device):
  """Run the model over ``data_loader`` in eval mode and collect its raw outputs.

  Args:
    data_loader: iterable of batches; each batch is a mapping with the keys
      "ids", "mask", "token_type_ids" and "targets".
    model: module called with ``ids``, ``mask`` and ``token_type_ids`` keywords.
    device: device the batch tensors are moved to.

  Returns:
    ``(fin_outputs, fin_targets)``: Python lists holding the model outputs and the
    targets of every batch, in iteration order.
  """
  model.eval()
  fin_targets = []
  fin_outputs = []

  # Inference only: with autograd disabled no graph is recorded for the forward
  # passes, which saves memory and time. The previous grad mode is restored on exit.
  with torch.no_grad():
    for d in data_loader:
      ids = d["ids"].to(device, dtype=torch.long)
      mask = d["mask"].to(device, dtype=torch.long)
      token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
      targets = d["targets"].to(device, dtype=torch.float)

      outputs = model(
          ids=ids,
          mask=mask,
          token_type_ids=token_type_ids
      )

      # Bring results back to the host as plain Python lists.
      fin_targets.extend(targets.cpu().detach().numpy().tolist())
      fin_outputs.extend(outputs.cpu().detach().numpy().tolist())

  return fin_outputs, fin_targets

In [None]:
# Build the model once at module level, before the training processes are started.
model = BERTBaseUncased(bert_path=BERT_PATH)

# Fixed seed so the 200k-row subsample and both shuffles are reproducible across runs.
SHUFFLE_SEED = 42

# Only the text and the label are needed; missing values are replaced by the string "none".
df1 = pd.read_csv(TRAINING_FILE_1, usecols=["comment_text", "toxic"]).fillna("none")
df2 = pd.read_csv(TRAINING_FILE_2, usecols=["comment_text", "toxic"]).fillna("none")

# Merge both training sources, shuffle, and keep the first 200,000 rows.
df_train = pd.concat([df1, df2], axis=0).reset_index(drop=True)
df_train = df_train.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True).head(200000)

df_valid = pd.read_csv(VALIDATION_FILE, usecols=["comment_text", "toxic"])

# NOTE(review): the validation rows are appended to the training set, so the AUC that
# _run computes on df_valid is measured on examples the model was also trained on.
df_train = pd.concat([df_train, df_valid], axis=0).reset_index(drop=True)
df_train = df_train.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

In [None]:
def _run():
  """Train and validate the module-level ``model`` on this process's XLA device.

  Intended to be called once in every process started by ``xmp.spawn``. Each call
  builds sharded train/validation loaders, an AdamW optimizer with a linear
  learning-rate schedule, then runs ``EPOCHS`` rounds of training followed by
  validation. The state dict is saved to ``MODEL_PATH`` whenever the validation
  AUC (averaged over all processes) improves.
  """
  train_dataset = BERTDatasetTraining(
      comment_text=df_train.comment_text.values,
      targets=df_train.toxic.values,
      tokenizer=TOKENIZER,
      max_length=MAX_LEN
  )

  # DistributedSampler restricts data loading to a subset of the dataset, selected by
  # this process's ordinal, so the data is split across the TPU cores.
  train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset,
        num_replicas=xm.xrt_world_size(),
        rank=xm.get_ordinal(),
        shuffle=True)

  # DataLoader: combines a dataset and a sampler, and provides an iterable over the dataset.
  train_data_loader = torch.utils.data.DataLoader(
      train_dataset,
      batch_size=TRAIN_BATCH_SIZE,
      sampler=train_sampler,
      drop_last=True,
      num_workers=4
  )

  valid_dataset = BERTDatasetTraining(
      comment_text=df_valid.comment_text.values,
      targets=df_valid.toxic.values,
      tokenizer=TOKENIZER,
      max_length=MAX_LEN
  )

  valid_sampler = torch.utils.data.distributed.DistributedSampler(
        valid_dataset,
        num_replicas=xm.xrt_world_size(),
        rank=xm.get_ordinal(),
        shuffle=False)

  valid_data_loader = torch.utils.data.DataLoader(
      valid_dataset,
      batch_size=VALID_BATCH_SIZE,
      sampler=valid_sampler,
      drop_last=False,
      num_workers=4
  )

  # Place the model on the XLA device owned by this process.
  device = xm.xla_device()
  model.to(device)

  # Biases and LayerNorm parameters are excluded from weight decay.
  param_optimizer = list(model.named_parameters())
  no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
  optimizer_grouped_parameters = [
      {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
      {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]

  # The learning rate is scaled by the number of cores, while the number of optimizer
  # steps per process is divided by it because every core sees only its own shard.
  lr = 0.4 * 1e-5 * xm.xrt_world_size()
  num_train_steps = int(len(train_dataset) / TRAIN_BATCH_SIZE / xm.xrt_world_size() * EPOCHS)
  xm.master_print(f'num_train_steps = {num_train_steps}, world_size={xm.xrt_world_size()}')

  optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
  scheduler = get_linear_schedule_with_warmup(
      optimizer,
      num_warmup_steps=0,
      num_training_steps=num_train_steps
  )

  best_roc_auc = 0
  for epoch in range(EPOCHS):
    # Without set_epoch the sampler reuses the same seed and therefore yields the
    # same shuffled order in every epoch.
    train_sampler.set_epoch(epoch)

    # The data loaders have to be wrapped in a ParallelLoader for use on the XLA device.
    para_loader = pl.ParallelLoader(train_data_loader, [device])
    train_loop_fn(para_loader.per_device_loader(device), model, optimizer, device, scheduler=scheduler)

    para_loader = pl.ParallelLoader(valid_data_loader, [device])
    o, t = eval_loop_fn(para_loader.per_device_loader(device), model, device)

    # AUC of this process's validation shard only.
    roc_auc = metrics.roc_auc_score(np.array(t) >= 0.5, o)
    # Average the per-shard values so every process holds the same number. xm.save
    # synchronizes the processes, so they must all take the branch below together;
    # a per-shard decision could leave some processes waiting forever.
    # Note this is the mean of shard AUCs, not the AUC of the pooled predictions.
    roc_auc = xm.mesh_reduce('valid_roc_auc', roc_auc, np.mean)
    xm.master_print(f'AUC = {roc_auc}')
    if roc_auc > best_roc_auc:
      # Instead of using torch.save, we save using xm.save.
      xm.save(model.state_dict(), MODEL_PATH)
      best_roc_auc = roc_auc

In [None]:
# Start training processes
def _multiprocessing_function(rank, flags):
    """Per-process entry point handed to ``xmp.spawn``.

    Args:
        rank: index of the spawned process (supplied by ``xmp.spawn``; unused here).
        flags: extra arguments forwarded through ``args`` (unused here).
    """
    torch.set_default_tensor_type('torch.FloatTensor')
    # _run returns nothing, so its result is not kept.
    _run()

FLAGS={}
# Launch 8 processes, each executing the training routine.
xmp.spawn(_multiprocessing_function, args=(FLAGS,), nprocs=8, start_method='fork')

num_train_steps = 1625, world_size=8
bi=0, loss=0.7023844122886658
bi=10, loss=0.2372722029685974
bi=20, loss=0.4337332248687744
bi=30, loss=0.36015355587005615
bi=40, loss=0.2171345204114914
bi=50, loss=0.25924423336982727
bi=60, loss=0.25696200132369995
bi=70, loss=0.24473780393600464
bi=80, loss=0.36457887291908264
bi=90, loss=0.34517255425453186
bi=100, loss=0.2834688425064087
bi=110, loss=0.3369932472705841
bi=120, loss=0.3206125795841217
bi=130, loss=0.2613537907600403
bi=140, loss=0.29763197898864746
bi=150, loss=0.2566080093383789
bi=160, loss=0.21656261384487152
bi=170, loss=0.23266170918941498
bi=180, loss=0.23586498200893402
bi=190, loss=0.25252076983451843
bi=200, loss=0.23837001621723175
bi=210, loss=0.1991264522075653
bi=220, loss=0.23199643194675446
bi=230, loss=0.3661993443965912
bi=240, loss=0.14286498725414276
bi=250, loss=0.15331502258777618
bi=260, loss=0.23158037662506104
bi=270, loss=0.17085039615631104
bi=280, loss=0.21439027786254883
bi=290, loss=0.1919163018465