In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import os
os.environ['KAGGLE_CONFIG_DIR'] = "/content/drive/My Drive/Kaggle"
# /content/gdrive/My Drive/Kaggle is the path where kaggle.json is present in the Google Drive

In [0]:
#changing the working directory
%cd /content/drive/My Drive/Kaggle
#Check the present working directory using pwd command

/content/drive/My Drive/Kaggle


In [0]:
!pip install kaggle



In [0]:
!kaggle competitions download jigsaw-multilingual-toxic-comment-classification
#!kaggle datasets download -d datasnaek/youtube-new

Downloading test-processed-seqlen128.csv.zip to /content/gdrive/My Drive/Kaggle
 80% 24.0M/29.8M [00:00<00:00, 65.1MB/s]
100% 29.8M/29.8M [00:00<00:00, 85.8MB/s]
Downloading jigsaw-toxic-comment-train-processed-seqlen128.csv.zip to /content/gdrive/My Drive/Kaggle
 85% 68.0M/79.6M [00:00<00:00, 108MB/s]
100% 79.6M/79.6M [00:00<00:00, 109MB/s]
Downloading validation.csv.zip to /content/gdrive/My Drive/Kaggle
  0% 0.00/1.35M [00:00<?, ?B/s]
100% 1.35M/1.35M [00:00<00:00, 44.7MB/s]
Downloading validation-processed-seqlen128.csv.zip to /content/gdrive/My Drive/Kaggle
  0% 0.00/3.44M [00:00<?, ?B/s]
100% 3.44M/3.44M [00:00<00:00, 56.8MB/s]
Downloading sample_submission.csv to /content/gdrive/My Drive/Kaggle
  0% 0.00/612k [00:00<?, ?B/s]
100% 612k/612k [00:00<00:00, 41.0MB/s]
Downloading jigsaw-unintended-bias-train-processed-seqlen128.csv.zip to /content/gdrive/My Drive/Kaggle
 99% 644M/650M [00:08<00:00, 45.0MB/s]
100% 650M/650M [00:08<00:00, 84.1MB/s]
Downloading jigsaw-unintended-bias-tr

In [0]:
pwd

'/content/gdrive/My Drive/Kaggle'

In [0]:
!pip install transformers
!pip install tensorflow==2.1.0



In [0]:
import tensorflow
import torch, torchvision
import torch.nn as nn
import time
import random
from datetime import datetime
from tqdm import tqdm
tqdm.pandas()

import transformers
from transformers import XLMRobertaModel, XLMRobertaTokenizer, XLMRobertaConfig, AutoTokenizer, TFAutoModel, AutoModelWithLMHead
from transformers import AdamW, get_linear_schedule_with_warmup, get_constant_schedule

import re
print(torch.__version__)

1.5.0+cu101


In [0]:
import pandas as pd
import os

data_path = '/content/drive/My Drive/Kaggle/'

# TEST_PATH = os.path.join(data_path, "test.csv")
# VAL_PATH = os.path.join(data_path, "validation.csv")
# TRAIN_PATH = os.path.join(data_path, "jigsaw-toxic-comment-train.csv")

# val_data = pd.read_csv(VAL_PATH)
# test_data = pd.read_csv(TEST_PATH)
# train_data = pd.read_csv(TRAIN_PATH)

# sub = pd.read_csv('/content/gdrive/My Drive/Kaggle/sample_submission.csv')

train1 = pd.read_csv("/content/drive/My Drive/Kaggle/jigsaw-toxic-comment-train.csv")
train2 = pd.read_csv("/content/drive/My Drive/Kaggle/jigsaw-unintended-bias-train.csv")
train2.toxic = train2.toxic.round().astype(int)

valid = pd.read_csv('/content/drive/My Drive/Kaggle/validation.csv')
test = pd.read_csv('/content/drive/My Drive/Kaggle/test.csv')
sub = pd.read_csv('/content/drive/My Drive/Kaggle/sample_submission.csv')

# Combine train1 with a subset of train2
train = pd.concat([
    train1[['comment_text', 'toxic']],
    train2[['comment_text', 'toxic']].query('toxic==1'),
    train2[['comment_text', 'toxic']].query('toxic==0').sample(n=100000, random_state=0)
])

In [0]:
MAX_LEN = 512
MODEL = 'xlm-roberta-large'
TOKENIZER = AutoTokenizer.from_pretrained(MODEL, do_lower_case=True)
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 150
device = 'cuda' if torch.cuda.is_available() else 'cpu'


class BERTDataset:
    def __init__(self, comment_text, target):
        self.comment_text = comment_text
        self.target = target
        self.tokenizer = TOKENIZER
        self.max_len = MAX_LEN

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, item):
        comment_text = str(self.comment_text[item])
        comment_text = " ".join(comment_text.split())

        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True
        )

        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]        

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.target[item], dtype=torch.float)
        }

In [0]:
train_dataset = BERTDataset(comment_text=train.comment_text.values, target=train.toxic.values)
train_data_loader = torch.utils.data.DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, num_workers=4)

valid_dataset = BERTDataset(comment_text=valid.comment_text.values, target=valid.toxic.values)
valid_data_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=VALID_BATCH_SIZE, num_workers=1)

In [0]:
class BERTBaseUncased(nn.Module):
    def __init__(self):
        super(BERTBaseUncased, self).__init__()
        self.bert = AutoModelWithLMHead.from_pretrained(MODEL)
        self.bert_drop = nn.Dropout(0.3)
        self.out = nn.Linear(768*2, 1)

    def forward(self, ids, mask):
        o1, _ = self.bert(
            ids,
            attention_mask=mask            
        )

        mean_pooling = torch.mean(o1, 1)
        max_pooling, _ = torch.max(o1, 1)
        cat = torch.cat((mean_pooling, max_pooling), 1)


        bo = self.bert_drop(cat)

        output = self.out(bo)

        return output

In [0]:
model = BERTBaseUncased()
model.to(device)


BERTBaseUncased(
  (bert): XLMRobertaForMaskedLM(
    (roberta): RobertaModel(
      (embeddings): RobertaEmbeddings(
        (word_embeddings): Embedding(250002, 1024, padding_idx=1)
        (position_embeddings): Embedding(514, 1024, padding_idx=1)
        (token_type_embeddings): Embedding(1, 1024)
        (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=1024, out_features=1024, bias=True)
                (key): Linear(in_features=1024, out_features=1024, bias=True)
                (value): Linear(in_features=1024, out_features=1024, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=102

In [0]:
param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]

In [0]:
import gc; 
gc.collect()

833

In [0]:
num_train_steps = int(len(train) / TRAIN_BATCH_SIZE * EPOCHS)
optimizer = AdamW(optimizer_parameters, lr=3e-5)
scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps
    )

model = nn.DataParallel(model)

In [0]:
def loss_fn(outputs, targets):
    return nn.BCEWithLogitsLoss()(outputs, targets.view(-1, 1))


def train_fn(data_loader, model, optimizer, device, scheduler):
    model.train()

    for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):
        ids = d["ids"]
        mask = d["mask"]
        targets = d["targets"]

        ids = ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        targets = targets.to(device, dtype=torch.long)

        optimizer.zero_grad()
        outputs = model(ids=ids, mask=mask)

        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()
        scheduler.step()


def eval_fn(data_loader, model, device):
    model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):
            ids = d["ids"]
            mask = d["mask"]
            targets = d["targets"]

            ids = ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            targets = targets.to(device, dtype=torch.float)

            outputs = model(
                ids=ids,
                mask=mask
            )
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [0]:
best_accuracy = 0
#RuntimeError: CUDA out of memory
torch.cuda.empty_cache()
model.zero_grad()

for epoch in range(EPOCHS):
  train_fn(train_data_loader, model, optimizer, device, scheduler)
  outputs, targets = eval_fn(valid_data_loader, model, device)
  targets = np.array(targets) >= 0.5
  accuracy = metrics.roc_auc_score(targets, outputs)
  print(f"AUC Score = {accuracy}")
  if accuracy > best_accuracy:
    torch.save(model.state_dict(), config.MODEL_PATH)
    best_accuracy = accuracy

  0%|          | 0/108944 [00:00<?, ?it/s]


RuntimeError: ignored