# Jigsaw Unintended Bias in Toxicity Classification

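Inference only: this notebook loads a fine-tuned BERT checkpoint, scores the competition test set, and writes `submission.csv`. It is meant to run in a Kaggle GPU kernel.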

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import pandas as pd
import torch
import torch.utils.data
from tqdm import tqdm, tqdm_notebook

import os
import re
import subprocess
import sys
import time

In [2]:
import platform
# Record the interpreter and PyTorch versions this run was produced with.
print(f'Python version: {platform.python_version()}')
print(f'PyTorch version: {torch.__version__}')

Python version: 3.6.6
PyTorch version: 1.0.1.post2


## 1. Initialize Environment

In [3]:
# This notebook runs on GPU: stop immediately if CUDA is not available.
assert torch.cuda.is_available()

# Device handle used by the later cells to place the model and the input batches.
DEVICE = torch.device('cuda')
NUM_GPUS = torch.cuda.device_count()
assert NUM_GPUS > 0

In [4]:
# OPTIONAL: enable INFO-level logging to see progress messages from the libraries
# (for example which vocabulary and model files pytorch_pretrained_bert loads).
import logging
logging.basicConfig(level=logging.INFO)

**Define flags**:

In [5]:
import argparse

def define_args(str_list):
    '''
      Parse the notebook's command-line style flags.

      A lite version of args at https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_classifier.py#L565

      Args:
        str_list: list of argument strings, e.g.
          ['--data_dir', '<dir>', '--model_dir', '<dir>'].

      Returns:
        argparse.Namespace with data_dir, model_dir, max_seq_length,
        do_lower_case and eval_batch_size.

        * do_lower_case is always derived from model_dir (True when the path
          contains 'uncased'); the value of the --do_lower_case flag is overridden.
        * max_seq_length is taken from model_dir when it follows the
          '...cased-len-<N>-fp16|32/epoch-<K>' naming scheme (this wins over an
          explicit --max_seq_length); otherwise the explicit flag is used.

      Raises:
        ValueError: model_dir does not follow the naming scheme and
          --max_seq_length was not given.
        SystemExit: raised by argparse when a required flag is missing or malformed.
    '''
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--model_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The model directory.")

    ## Other parameters
    parser.add_argument("--max_seq_length",
                        default=None,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_lower_case",
                        action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")

    args = parser.parse_args(str_list)
    args.do_lower_case = 'uncased' in args.model_dir

    # Strip a trailing slash first: it would otherwise defeat the `$` anchor.
    m = re.search(r'^.+cased-len-(\d+)-fp(16|32)\/epoch-\d+$', args.model_dir.rstrip('/'))
    if m is not None:
        args.max_seq_length = int(m.group(1))
    elif args.max_seq_length is None:
        # No length encoded in the path and none given explicitly: fail with a
        # readable message instead of an AttributeError on `None.group`.
        raise ValueError(
            "Cannot infer max_seq_length from model_dir %r; expected a path like "
            "'<name>-cased-len-<N>-fp16/epoch-<K>' or an explicit --max_seq_length."
            % args.model_dir)

    return args

In [6]:
# Configure the run. The model directory name also determines do_lower_case
# ('uncased') and max_seq_length ('len-220'), as parsed by define_args.
args = define_args([
    '--data_dir', '../input/jigsaw-unintended-bias-in-toxicity-classification',
    '--model_dir', '../input/bert-fine-tuned-for-jigsaw/jigsaw-bert-large-uncased-len-220-fp16/epoch-1',
])
args

Namespace(data_dir='../input/jigsaw-unintended-bias-in-toxicity-classification', do_lower_case=True, eval_batch_size=8, max_seq_length=220, model_dir='../input/bert-fine-tuned-for-jigsaw/jigsaw-bert-large-uncased-len-220-fp16/epoch-1')

In [7]:
# Fail fast if the fine-tuned checkpoint directory is not present.
assert os.path.exists(args.model_dir)

## 2. Install requirements

### 2.1 Install pytorch-pretrained-bert

In [8]:
try:
    from pytorch_pretrained_bert import BertTokenizer, BertModel
except ModuleNotFoundError:
    print('Installing pytorch-pretrained-bert ...')
    if 'KAGGLE_URL_BASE' not in os.environ:
        # Not a Kaggle kernel: install from conda-forge and echo the installer output.
        print(
            subprocess.check_output(
                'sudo -u jupyter conda install -y -c conda-forge pytorch-pretrained-bert',
                shell=True,
            ).decode('utf-8')
        )
    else:
        # Kaggle kernel: put the library sources found under ../input on the import path.
        bert_lib = '../input/pytorchpretrainedbert/pytorch-pretrained-bert-master/pytorch-pretrained-BERT-master'
        assert os.path.exists(bert_lib)
        sys.path.insert(0, bert_lib)
    print('Installed pytorch-pretrained-bert successfully')

Installing pytorch-pretrained-bert ...
Installed pytorch-pretrained-bert successfully


In [9]:
from pytorch_pretrained_bert.modeling import BertForSequenceClassification
from pytorch_pretrained_bert.tokenization import BertTokenizer

INFO:pytorch_pretrained_bert.modeling:Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .


## 3. Loading test dataset

In [10]:
# Label columns to predict; the length of this list is passed as num_labels when the model is loaded.
y_columns=['target']

In [11]:
# Build the tokenizer from the files in the model directory, lower-casing input only for uncased models.
tokenizer = BertTokenizer.from_pretrained(args.model_dir, do_lower_case=args.do_lower_case, cache_dir=None)

INFO:pytorch_pretrained_bert.tokenization:loading vocabulary file ../input/bert-fine-tuned-for-jigsaw/jigsaw-bert-large-uncased-len-220-fp16/epoch-1/vocab.txt


In [12]:
def convert_lines(lines, max_seq_length, tokenizer):
    '''
      Encode texts as fixed-length rows of BERT token ids.

      Each text is tokenized, truncated so that it fits together with the
      [CLS] and [SEP] markers, converted to ids and right-padded with 0 up to
      max_seq_length. The number of truncated texts is printed as 'longer: N'.

      Copied from https://www.kaggle.com/httpwwwfszyc/bert-in-keras-taming

      Args:
        lines: iterable of strings.
        max_seq_length: total row length, including [CLS] and [SEP].
        tokenizer: object providing tokenize() and convert_tokens_to_ids().

      Returns:
        numpy array built from the list of id rows (one row per text).
    '''
    budget = max_seq_length - 2  # positions left once [CLS] and [SEP] are reserved
    rows = []
    truncated = 0
    for text in tqdm_notebook(lines):
        pieces = tokenizer.tokenize(text)
        if len(pieces) > budget:
            pieces = pieces[:budget]
            truncated += 1
        ids = tokenizer.convert_tokens_to_ids(["[CLS]"] + pieces + ["[SEP]"])
        padding = [0] * (budget - len(pieces))
        rows.append(ids + padding)
    print(f'longer: {truncated}')
    return np.array(rows)

In [13]:
# Load the test set (test.csv) from the data directory.
test_df = pd.read_csv(os.path.join(args.data_dir, "test.csv"))

In [14]:
# Make sure all comment_text values are strings.
# Fill missing values BEFORE casting: astype(str) turns NaN into the literal
# string 'nan', after which fillna() has nothing left to replace.
test_df['comment_text'] = test_df['comment_text'].fillna("DUMMY_VALUE").astype(str)

In [15]:
# Tokenize every comment into a fixed-length row of token ids.
X_test = convert_lines(test_df["comment_text"], args.max_seq_length, tokenizer)

HBox(children=(IntProgress(value=0, max=97320), HTML(value='')))


longer: 2191


In [16]:
# Sanity check: expect (number of comments, max_seq_length) and an integer dtype.
print(X_test.shape)
print(X_test.dtype)

(97320, 220)
int64


In [17]:
# Wrap the token ids in a TensorDataset so that predict() can batch them with a DataLoader.
test_dataset = torch.utils.data.TensorDataset(torch.tensor(X_test, dtype=torch.long))

## 4. Inference

In [18]:
def load_eval_model(model_dir):
    '''
      Load a fine-tuned sequence classifier ready for inference.

      The model is created with one output per entry of y_columns, moved to
      DEVICE, switched to eval mode, and all of its parameters are frozen.

      Args:
        model_dir: directory passed to BertForSequenceClassification.from_pretrained.

      Returns:
        The loaded model.
    '''
    classifier = BertForSequenceClassification.from_pretrained(
        model_dir,
        num_labels=len(y_columns),
        cache_dir=None,
    )
    classifier.to(DEVICE)
    classifier.eval()
    # Inference only: no parameter needs a gradient.
    for weight in classifier.parameters():
        weight.requires_grad = False
    return classifier

In [19]:
def predict(model, valid_dataset, batch_size=None):
    '''
      Run the model over a dataset and collect the first output column.

      Args:
        model: callable invoked as model(input_ids, attention_mask=..., labels=None)
          and returning a 2-D tensor with one row per input example.
        valid_dataset: dataset yielding 1-tuples of token-id tensors
          (e.g. a TensorDataset over the encoded texts).
        batch_size: examples per forward pass; defaults to args.eval_batch_size.

      Returns:
        1-D numpy array of length len(valid_dataset) holding column 0 of the
        model output for every example, in dataset order.
    '''
    if batch_size is None:
        batch_size = args.eval_batch_size
    valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
    valid_preds = np.zeros((len(valid_dataset)))

    # Inference only: disable autograd explicitly rather than relying on the
    # caller having frozen every model parameter.
    with torch.no_grad():
        for step, (x_batch, ) in tqdm_notebook(enumerate(valid_loader), total=len(valid_loader)):
            x_batch = x_batch.to(DEVICE)
            # Token id 0 is padding; mask it out of the attention computation.
            y_pred = model(x_batch, attention_mask=(x_batch > 0), labels=None)
            start = step * batch_size
            # The last batch may be shorter, so size the slice from the output itself.
            valid_preds[start:start + y_pred.shape[0]] = y_pred[:, 0].cpu().numpy()
    return valid_preds

In [20]:
# Load the fine-tuned classifier onto the GPU in eval mode.
model = load_eval_model(args.model_dir)

INFO:pytorch_pretrained_bert.modeling:loading archive file ../input/bert-fine-tuned-for-jigsaw/jigsaw-bert-large-uncased-len-220-fp16/epoch-1
INFO:pytorch_pretrained_bert.modeling:Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "type_vocab_size": 2,
  "vocab_size": 30522
}



In [21]:
# Run inference over the whole test set; y_test holds the raw model outputs (sigmoid is applied in a later cell).
y_test = predict(model, test_dataset)

HBox(children=(IntProgress(value=0, max=12165), HTML(value='')))

In [22]:
# Sanity check: one output per test comment.
y_test.shape

(97320,)

In [23]:
# Squash the raw model outputs into (0, 1) scores with a sigmoid and flatten to 1-D.
test_pred = torch.sigmoid(torch.tensor(y_test)).numpy().ravel()

In [24]:
# Sanity check: still one score per test comment.
test_pred.shape

(97320,)

In [25]:
# Pair each test id with its predicted score and save the table without the index column.
submission = pd.DataFrame({'id': test_df['id'], 'prediction': test_pred})
submission.to_csv('submission.csv', index=False)

In [26]:
# Preview the first rows of the submission.
submission.head()

Unnamed: 0,id,prediction
0,7000000,7.7e-05
1,7000001,4.9e-05
2,7000002,0.000447
3,7000003,8.3e-05
4,7000004,0.999096
