In [3]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import DataLoader

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from catalyst.dl import SupervisedRunner
from catalyst.dl.callbacks import AUCCallback, F1ScoreCallback
from catalyst.contrib.schedulers import OneCycleLR

from sklearn.metrics import roc_auc_score, classification_report

import numpy as np
import pandas as pd
import re
import copy
from random import shuffle
from tqdm import tqdm

Using TensorFlow backend.
lz4 not available, disabling compression. To install lz4, run `pip install lz4`.
wandb not available, to install wandb, run `pip install wandb`.


In [0]:
r = re.compile(r'[\a-z]+')

In [0]:
# В google colab стоит pytorch версии 1.1.0, в котором ещё не были добалены трансформеры
# Поэтому используется чужой код. Ссылка в отчете

class PositionalEncoder(nn.Module):
    def __init__(self, d_model, max_seq_len = 200):
        super().__init__()
        self.d_model = d_model
        
        pe = torch.zeros(max_seq_len, d_model)
        for pos in range(max_seq_len):
            for i in range(0, d_model, 2):
                pe[pos, i] = np.sin(pos / (10000 ** ((2 * i) / d_model)))
                pe[pos, i + 1] = np.cos(pos / (10000 ** ((2 * (i + 1)) / d_model)))
                
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x * np.sqrt(self.d_model)
        seq_len = x.size(1)
        x = x + torch.autograd.Variable(self.pe[:,:seq_len], requires_grad=False).cuda()
        return x

    
def attention(q, k, v, d_model, dropout=None):
        scores = torch.matmul(q, k.transpose(-2, -1)) /  np.sqrt(d_model) 
        scores = F.softmax(scores, dim=-1)
        scores = dropout(scores)
        
        output = torch.matmul(scores, v)
        return output

    
# Изменен алгоритм подсчета score на dot
class Attention(nn.Module):
    def __init__(self, d_model, dropout = 0.1):
        super().__init__()
        
        self.d_model = d_model
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, q, k, v):
        bs = q.size(0)

        k = k.view(bs, -1, self.d_model).transpose(1,2)
        q = q.view(bs, -1, self.d_model).transpose(1,2)
        v = v.view(bs, -1, self.d_model).transpose(1,2)
        scores = attention(q, k, v, self.d_model, self.dropout)
        
        output = scores.transpose(1,2).contiguous().view(bs, -1, self.d_model)
    
        return output


class FeedForward(nn.Module):
    def __init__(self, d_model, dropout = 0.1):
        super().__init__() 
        self.linear_1 = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_model, d_model)
        
    def forward(self, x):
        x = self.dropout(F.relu(self.linear_1(x)))
        x = self.linear_2(x)
        return x
    
class Norm(nn.Module):
    def __init__(self, d_model, eps = 1e-6):
        super().__init__()
        self.size = d_model
        self.alpha = nn.Parameter(torch.ones(self.size))
        self.bias = nn.Parameter(torch.zeros(self.size))
        self.eps = eps
    def forward(self, x):
        norm = self.alpha * (x - x.mean(dim=-1, keepdim=True)) / (x.std(dim=-1, keepdim=True) + self.eps) + self.bias
        return norm

In [0]:
# Чтение параметров для embeddings.
def get_coefs(word,*arr): 
    return word, np.asarray(arr, dtype='float32')

embeddings_index = dict(get_coefs(*o.strip().split()) for o in open('glove.6B.50d.txt'))

In [0]:
train = pd.read_csv('train.csv')
X = [' '.join(r.findall(i.lower())) for i in train['comment_text'].values]
y = train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values.tolist()

# Для ускорения работы сокращаем длину до 200 токен
new_X = []
new_y = []
for x, y_i in zip(X, y):
    if len(x) > 200:
        for i in range(0, len(x), 200):
            new_X.append(x[i:min(200+i, len(x))])
            new_y.append(y_i)
    else:
        new_X.append(x)
        new_y.append(y_i)

Xy = list(zip(X, y))
shuffle(Xy)
X, y = [i[0] for i in Xy], [i[1] for i in Xy]


# Готовим embedding

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)
X = pad_sequences(X, maxlen=200)

word_index = tokenizer.word_index
nb_words = len(word_index)
embedding_matrix = np.zeros((nb_words + 1, 50))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: 
        embedding_matrix[i] = embedding_vector

In [0]:
def get_clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])

class AttentionLayer(nn.Module):
    def __init__(self, d_model, dropout = 0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.attn = Attention(d_model)
        self.ff = FeedForward(d_model)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        
    def forward(self, x):
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn(x2,x2,x2))
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.ff(x2))
        return x


class AttentionModel(nn.Module):
    def __init__(self, N, vocab_size=nb_words, d_model=50, output_shape=6):
        super().__init__()
        self.d_model = d_model
        self.N = N
        self.embed = nn.Embedding(vocab_size, d_model)
        self.embed.weight = nn.Parameter(torch.FloatTensor(embedding_matrix))
        self.pe = PositionalEncoder(d_model)
        self.layers = get_clones(AttentionLayer(d_model), N)
        self.norm = Norm(d_model)
        self.output_shape = output_shape
        self.output = nn.Linear(d_model, output_shape)
        self.sigmoid = nn.Sigmoid()

    def forward(self, src):
        x = self.embed(src)
        x = self.pe(x)
        for i in range(self.N):
            x = self.layers[i](x)
        x = torch.sum(self.norm(x), dim=1)
        x = x.view(-1, self.d_model)
        x = self.sigmoid(self.output(x))
        return x

In [9]:
runner = SupervisedRunner()


weights = 143346 / np.array([15294, 1595, 8449, 478, 7877, 1405])
weights = torch.FloatTensor(weights).cuda()

X = [torch.LongTensor(i) for i in X]
y = [torch.FloatTensor(i) for i in y]
X_train, X_test = X[:-len(X)//5], X[-len(X)//5:]
y_train, y_test = y[:-len(X)//5], y[-len(X)//5:]

loader_train = list(zip(X_train, y_train))
loader_test = list(zip(X_test, y_test))


loader_train = DataLoader(loader_train, batch_size=2**6, shuffle=True)
loader_test = DataLoader(loader_test, batch_size=2**6, shuffle=True)

model = AttentionModel(6)
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = OneCycleLR(optimizer,
    num_steps=20, 
    lr_range=(0.001, 0.0001),
    warmup_steps=2)

runner.train(model=model, criterion=nn.BCELoss(reduction='mean', weight=weights),
             optimizer=optimizer, scheduler=scheduler,
             loaders={'train': loader_train, 'valid': loader_test}, 
             num_epochs=20, verbose=True, logdir='logs', 
             callbacks=[F1ScoreCallback(activation='none'), AUCCallback()])

0/20 * Epoch (train): 100% 1995/1995 [02:00<00:00, 16.57it/s, _timers/_fps=4946.842, f1_score=0.177, loss=14.122]
0/20 * Epoch (valid): 100% 499/499 [00:09<00:00, 55.30it/s, _timers/_fps=7006.015, f1_score=5.406e-05, loss=16.073]
[2019-09-27 23:28:12,424] 
0/20 * Epoch 0 (train): _base/lr=0.0010 | _base/momentum=0.8750 | _timers/_fps=5637.6076 | _timers/batch_time=0.0115 | _timers/data_time=0.0015 | _timers/model_time=0.0100 | auc/_mean=0.5526 | auc/class_0=0.5526 | f1_score=0.0663 | loss=28.2563
0/20 * Epoch 0 (valid): _base/lr=0.0010 | _base/momentum=0.8437 | _timers/_fps=6928.5570 | _timers/batch_time=0.0094 | _timers/data_time=0.0012 | _timers/model_time=0.0082 | auc/_mean=0.8760 | auc/class_0=0.8760 | f1_score=0.1014 | loss=24.5353
1/20 * Epoch (train): 100% 1995/1995 [01:59<00:00, 16.65it/s, _timers/_fps=5213.453, f1_score=0.375, loss=18.597]
1/20 * Epoch (valid): 100% 499/499 [00:09<00:00, 55.44it/s, _timers/_fps=3962.790, f1_score=0.091, loss=13.641]
[2019-09-27 23:30:22,465] 


In [10]:
sum(p.numel() for p in model.parameters() if p.requires_grad)

9996606

In [11]:
y_pred = []

model.eval()
model.cuda()
with torch.no_grad():
    for x in tqdm(X[-len(X)//5:]):
        x = x.cuda()
        y_pred.append((model(x.view(1, -1))).cpu().numpy())

100%|██████████| 31915/31915 [03:40<00:00, 144.93it/s]


In [12]:
roc_auc_score((np.array([i.numpy() for i in y_test]) > 0) * 1, (np.round(np.array(y_pred).reshape(-1, 6))), average='macro')

0.6872252870144789

In [15]:
roc_auc_score((np.array([i.numpy() for i in y_test]) > 0) * 1, (np.round(np.array(y_pred).reshape(-1, 6))), average='micro')

0.8061638446639087

In [18]:
print(classification_report((np.array([i.numpy() for i in y_test]) > 0) * 1, (np.round(np.array(y_pred).reshape(-1, 6))), 
                           target_names=['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']))

               precision    recall  f1-score   support

        toxic       0.81      0.69      0.75      2965
 severe_toxic       0.48      0.29      0.36       300
      obscene       0.83      0.69      0.75      1592
       threat       0.00      0.00      0.00        89
       insult       0.74      0.62      0.67      1515
identity_hate       0.00      0.00      0.00       270

    micro avg       0.79      0.62      0.69      6731
    macro avg       0.48      0.38      0.42      6731
 weighted avg       0.74      0.62      0.67      6731
  samples avg       0.06      0.05      0.05      6731




Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.


Precision and F-score are ill-defined and being set to 0.0 in samples with no predicted labels.


Recall and F-score are ill-defined and being set to 0.0 in samples with no true labels.

