In [1]:
from sklearn.utils import shuffle
from sklearn.metrics import roc_auc_score, log_loss
from datetime import datetime
import pandas as pd
pd.set_option('display.max.columns', 500)
import numpy as np
import os
import matplotlib.pyplot as plt
import gc
from shutil import copyfile
from torch import sigmoid

from torch.utils.data import DataLoader, SubsetRandomSampler,Dataset
# Batch size handed to the inference DataLoader (the trailing 12 looks like a previously used value).
batch_size = 128#12

# Directory the validation / test / sample-submission CSVs are read from.
DATA_DIR = '/kaggle/input/jigsaw-multilingual-toxic-comment-classification/'

In [2]:
# CHECK: score the labelled validation split (so metrics can be computed) instead of the test split.
# LOCAL: use the checkpoint directory of the local workstation.
CHECK, LOCAL = True, True

# Directory holding the fine-tuned checkpoints; '?' is the unfilled non-local placeholder.
dir_chk = '/home/kb/Documents/jigsaw/checkpoints/v5_best' if LOCAL else '?'

# The two splits store their text under different column names.
test_file, text_column = (
    (os.path.join(DATA_DIR, 'validation.csv'), 'comment_text')
    if CHECK
    else (os.path.join(DATA_DIR, 'test.csv'), 'content')
)
   

In [3]:
# Read the CSV chosen via CHECK (validation or test split) and preview its first rows.
df_inference = pd.read_csv(test_file)
df_inference.head()

Unnamed: 0,id,comment_text,lang,toxic
0,0,Este usuario ni siquiera llega al rango de ...,es,0
1,1,Il testo di questa voce pare esser scopiazzato...,it,0
2,2,Vale. Sólo expongo mi pasado. Todo tiempo pasa...,es,1
3,3,Bu maddenin alt başlığı olarak uluslararası i...,tr,0
4,4,Belçika nın şehirlerinin yanında ilçe ve belde...,tr,0


In [4]:
# Row counts per language, additionally split by label when the labelled split is loaded.
print(
    df_inference
    .groupby(['lang', 'toxic'] if CHECK else ['lang'], as_index=False)['id']
    .count()
)

  lang  toxic    id
0   es      0  2078
1   es      1   422
2   it      0  2012
3   it      1   488
4   tr      0  2680
5   tr      1   320


In [5]:
# Number of rows that will be scored.
len(df_inference)

8000

In [6]:
# Load the sample submission; its 'toxic' column is overwritten with predictions when CHECK is False.
df_sub = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv'))
df_sub.head()

Unnamed: 0,id,toxic
0,0,0.5
1,1,0.5
2,2,0.5
3,3,0.5
4,4,0.5


In [7]:
# Name of the label column that QuestDataset.get_label reads from each row.
target_columns = 'toxic'

### Define dataset

In [8]:
from sklearn.model_selection import train_test_split, cross_val_score, KFold
import time
from torch.optim import lr_scheduler

import torch
from tqdm import tqdm
#import torch.utils.data as data
from torchvision import datasets, models, transforms
from transformers import *
import random
from math import floor, ceil
from sklearn.model_selection import GroupKFold

MAX_LEN = 512  # every encoded comment is truncated / zero-padded to exactly this many token ids
SEP_TOKEN_ID = 102  # NOTE(review): presumably the [SEP] id of the BERT vocabulary; unused in this cell


class QuestDataset(torch.utils.data.Dataset):
    """Dataset that turns rows of a comment DataFrame into fixed-length DistilBERT inputs.

    Each item is a dict with key ``'features'`` (int64 tensor of ``MAX_LEN`` token
    ids, 0-padded) and, when ``labeled`` is true, key ``'targets'`` (float tensor
    ``[1 - label, label]``).

    The text is read from the module-level ``text_column`` and the label from the
    module-level ``target_columns``.

    Args:
        df: DataFrame holding the comments (and a ``toxic`` column when
            ``train_mode`` or ``labeled`` is set).
        train_mode: when true, the raw ``toxic`` values are kept on ``self.labels``.
        labeled: when true, items and collated batches include ``'targets'``.
    """

    def __init__(self, df, train_mode=True, labeled=True):
        self.df = df
        if train_mode:
            # Keep the raw label array alongside the frame; requires a `toxic` column.
            self.labels = df.toxic.values

        self.train_mode = train_mode
        self.labeled = labeled
        self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')

    def __getitem__(self, index):
        """Return the encoded row at positional ``index`` as a dict of tensors."""
        row = self.df.iloc[index]
        item = {'features': self.get_token_ids(row)}
        if self.labeled:
            item['targets'] = self.get_label(row)
        return item

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.df)

    def trim_input(self, text, max_sequence_length=MAX_LEN):
        """Tokenize ``text`` and keep at most ``max_sequence_length - 2`` tokens.

        Two positions are reserved for the ``[CLS]`` / ``[SEP]`` markers added by
        :meth:`get_token_ids`.
        """
        tokens = self.tokenizer.tokenize(text)
        return tokens[:int(max_sequence_length) - 2]

    def get_token_ids(self, row):
        """Encode one row as ``[CLS] tokens [SEP]`` ids, right-padded with 0 to ``MAX_LEN``."""
        tokens = ['[CLS]'] + self.trim_input(row[text_column]) + ['[SEP]']
        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)

        # Pad id 0 matters: QuestModel derives its attention mask from `ids > 0`.
        token_ids += [0] * (MAX_LEN - len(token_ids))

        return torch.tensor(token_ids)

    def get_label(self, row):
        """Return the rounded label as a two-element float tensor ``[1 - label, label]``."""
        label = np.round(row[target_columns])
        return torch.tensor([1 - label, label]).float()

    def collate_fn(self, batch):
        """Stack a list of items produced by :meth:`__getitem__` into one batch dict.

        Items are dicts, so they are read by key. (Indexing them positionally with
        ``x[0]`` / ``x[1]`` raises ``KeyError``.)
        """
        collated = {'features': torch.stack([item['features'] for item in batch])}
        if self.labeled:
            collated['targets'] = torch.stack([item['targets'] for item in batch])
        return collated

## Build Model

In [9]:
from transformers import *
import torch
import torch.nn as nn
import torch.nn.functional as F


class QuestModel(nn.Module):
    """Multilingual DistilBERT encoder followed by a linear classification head.

    Args:
        n_classes: number of output logits produced per input sequence.
    """

    def __init__(self, n_classes=2):
        super().__init__()
        self.model_name = 'QuestModel'

        self.bert_model = DistilBertModel.from_pretrained('distilbert-base-multilingual-cased')

        # Input width must match the encoder's hidden size (768 for this checkpoint).
        self.fc = nn.Linear(768, n_classes)

    def forward(self, ids):
        """Return raw (pre-sigmoid) logits for a batch of padded token ids.

        Args:
            ids: integer tensor of token ids, one row per sequence; id 0 is
                treated as padding and masked out of attention.

        Returns:
            Tensor with one row of ``n_classes`` logits per input row.
        """
        attention_mask = (ids > 0)
        outputs = self.bert_model(input_ids=ids, attention_mask=attention_mask)

        # Element 0 is always the last hidden state. The last element is not:
        # it becomes the hidden-state / attention tuple when the encoder is
        # configured to return those.
        cls_state = outputs[0][:, 0, :]

        # Functional dropout is active only in training mode.
        out = F.dropout(cls_state, p=0.2, training=self.training)
        return self.fc(out)
    


In [10]:
# Unlabelled dataset over the frame to score. Batches are served in frame order
# (no shuffling), so prediction i belongs to row i of df_inference.
ds = QuestDataset(df_inference, train_mode=False, labeled=False)
loader = DataLoader(ds, batch_size=batch_size, shuffle=False, num_workers=8)

In [11]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [12]:
# Build one model per checkpoint file in `dir_chk`; together they form the ensemble.
# Sorted so the ensemble order is reproducible (os.listdir order is arbitrary).
checkpoints = sorted(os.listdir(dir_chk))
my_models = []

for chk in checkpoints:
    # Deserialize onto the CPU: GPU-saved checkpoints then also load on CPU-only
    # hosts, and no second copy of the weights is left on the GPU. The model is
    # moved to `device` below.
    # NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
    checkpoint = torch.load(os.path.join(dir_chk, chk), map_location='cpu')

    model = QuestModel(2)
    model.load_state_dict(checkpoint['model_state_dict'])
    del checkpoint  # weights are copied into the model; free the dict right away

    if LOCAL:
        # Wrapped only after load_state_dict so checkpoint keys need no 'module.' prefix.
        model = nn.DataParallel(model)

    model = model.to(device)
    model.eval()  # inference mode: disables dropout

    my_models.append(model)

print(len(my_models))

6


In [13]:
# Raw logits for every (model, row, class) triple, filled in batch by batch.
y_pred = torch.zeros(len(my_models), len(ds), 2).to(device)

offset = 0  # index of the first row of the current batch
with torch.no_grad():
    for batch in tqdm(loader):
        token_ids = batch['features'].to(device)
        rows = slice(offset, offset + len(token_ids))

        # Every ensemble member scores the same batch into its own plane of y_pred.
        for model_idx, member in enumerate(my_models):
            y_pred[model_idx, rows] = member(token_ids)

        offset = rows.stop
            

100%|██████████| 63/63 [02:20<00:00,  2.23s/it]


In [14]:
# Squash every logit independently into (0, 1); the two class columns are not normalised to sum to 1.
y_pred_sigmoid = sigmoid(y_pred)

In [15]:
# Average the per-model probabilities over the model axis (dim 0) to get one ensemble score per row and class.
y_pred_sigmoid_one = y_pred_sigmoid.mean(dim=0)
y_pred_sigmoid_one.shape

torch.Size([8000, 2])

In [16]:
# Copy the ensemble average to host memory as a NumPy array for the scikit-learn / pandas steps that follow.
y_pred_sigmoid_one_np = y_pred_sigmoid_one.cpu().numpy()

In [17]:
# With labels available, report ROC AUC and log loss of the ensemble's "toxic" column (index 1).
if CHECK:
    print(
        roc_auc_score(df_inference['toxic'].values, y_pred_sigmoid_one_np[:, 1]),
        log_loss(df_inference['toxic'], y_pred_sigmoid_one_np[:, 1]),
    )

0.9070654849827671 0.2649754977807388


In [18]:
# Only when the test split was scored: column 1 (the "toxic" output) becomes the submitted score.
# The assignment is positional, so it relies on df_sub rows being in the same order as test_file — TODO confirm.
if not CHECK:
    df_sub['toxic'] = y_pred_sigmoid_one_np[:,1]

In [19]:
# Write a submission only when the test split was actually scored. In CHECK mode
# df_sub still holds the untouched sample-submission placeholders, and saving
# them would produce a file that looks like a real submission.
if not CHECK:
    df_sub.to_csv('submission.csv', index=False)