In [None]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from catalyst.dl import SupervisedRunner
from catalyst.dl.callbacks import AUCCallback, F1ScoreCallback
from catalyst.contrib.schedulers import OneCycleLR

from sklearn.metrics import precision_score, recall_score, f1_score

import youtokentome as yttm
import numpy as np
import pandas as pd
import re
import copy
from random import shuffle

In [None]:
# Regex matching runs of word characters; used when loading the data to strip
# everything else from a comment before it is BPE-encoded.
r = re.compile(r'[\w]+')
# Pre-trained YouTokenToMe BPE tokenizer loaded from a local file
# (presumably a 10000-token vocabulary, judging by the file name -- TODO confirm).
BPE_model = yttm.BPE('BPE_10000.model')

In [None]:
# Google Colab ships PyTorch 1.1.0, which does not yet include the built-in Transformer modules,
# so third-party code is used here instead. The link to it is in the report.

class PositionalEncoder(nn.Module):
    """Scale token embeddings and add a fixed sinusoidal position table.

    Args:
        d_model: width of the embedding vectors.
        max_seq_len: number of positions the precomputed table covers.

    The table is stored as a (non-trainable) buffer of shape
    ``(1, max_seq_len, d_model)``, so it follows the module across devices and
    is saved in the state dict.
    """
    def __init__(self, d_model, max_seq_len = 200):
        super().__init__()
        self.d_model = d_model

        # Build the table in float64 with numpy and cast to float32 at the end.
        # NOTE(review): the exponents are 2*j/d_model for dimension index j,
        # which is not the 2*floor(j/2)/d_model of "Attention Is All You Need".
        # The formula is kept as-is so already-trained weights stay compatible.
        positions = np.arange(max_seq_len, dtype=np.float64).reshape(-1, 1)
        even_dims = np.arange(0, d_model, 2, dtype=np.float64)
        odd_dims = np.arange(1, d_model, 2, dtype=np.float64)

        table = np.zeros((max_seq_len, d_model), dtype=np.float64)
        table[:, 0::2] = np.sin(positions / (10000 ** ((2 * even_dims) / d_model)))
        # For an odd d_model there is one cosine column fewer than sine columns;
        # slicing handles that without indexing past the last dimension.
        table[:, 1::2] = np.cos(positions / (10000 ** ((2 * odd_dims) / d_model)))

        pe = torch.from_numpy(table).float().unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x * sqrt(d_model) + pe[:, :seq_len]``.

        Args:
            x: embeddings of shape ``(batch, seq_len, d_model)``.

        Raises:
            ValueError: if ``seq_len`` exceeds the length of the table.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise ValueError(
                'sequence length {} exceeds max_seq_len {}'.format(seq_len, self.pe.size(1))
            )
        # Scale the embeddings up so the position signal does not dominate them.
        x = x * float(np.sqrt(self.d_model))
        # The buffer already lives on the module's device; .to() is a no-op
        # then and only copies if the input sits on a different device.
        return x + self.pe[:, :seq_len].to(x.device)

    
def attention(q, k, v, d_model, dropout=None):
    """Scaled dot-product attention.

    Args:
        q, k, v: tensors whose last two dimensions are matrix-multiplied
            (``q @ k^T`` and then ``weights @ v``).
        d_model: the scores are divided by ``sqrt(d_model)`` before softmax.
        dropout: optional callable applied to the attention weights;
            skipped when ``None``.

    Returns:
        ``softmax(q @ k^T / sqrt(d_model)) @ v``.
    """
    scores = torch.matmul(q, k.transpose(-2, -1)) / np.sqrt(d_model)
    scores = F.softmax(scores, dim=-1)
    # The signature advertises dropout=None, so the default must not be called.
    if dropout is not None:
        scores = dropout(scores)

    return torch.matmul(scores, v)

    
# The score computation has been changed to plain dot-product scoring.
class Attention(nn.Module):
    """Parameter-free dot-product attention block (no learned projections).

    Args:
        d_model: size of the last dimension of the inputs.
        dropout: dropout rate applied to the attention weights.

    NOTE(review): the inputs are transposed to (batch, d_model, seq) before
    the scores are computed, so the softmax appears to run over a
    d_model x d_model matrix (feature-to-feature) rather than over token
    positions -- confirm this is the intended design.
    """
    def __init__(self, d_model, dropout = 0.1):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.d_model = d_model

    def forward(self, q, k, v):
        """Attend over q, k, v and return a (batch, -1, d_model) tensor."""
        batch_size = q.size(0)

        def to_feature_major(tensor):
            # (batch, -1, d_model) -> (batch, d_model, -1)
            return tensor.view(batch_size, -1, self.d_model).transpose(1, 2)

        mixed = attention(
            to_feature_major(q),
            to_feature_major(k),
            to_feature_major(v),
            self.d_model,
            self.dropout,
        )

        # Restore the original layout; contiguous() is needed before view().
        return mixed.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)


class FeedForward(nn.Module):
    """Two linear layers with ReLU and dropout in between, width d_model throughout.

    Args:
        d_model: input, hidden and output feature size.
        dropout: dropout rate applied after the ReLU.
    """
    def __init__(self, d_model, dropout = 0.1):
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_model, d_model)

    def forward(self, x):
        """Apply linear_1 -> ReLU -> dropout -> linear_2 to the last dimension of x."""
        hidden = self.linear_1(x)
        hidden = F.relu(hidden)
        hidden = self.dropout(hidden)
        return self.linear_2(hidden)
    
class Norm(nn.Module):
    """Layer normalisation over the last dimension with a learned scale and shift.

    Args:
        d_model: size of the normalised (last) dimension.
        eps: added to the standard deviation to avoid division by zero.
    """
    def __init__(self, d_model, eps = 1e-6):
        super().__init__()
        self.size = d_model
        self.eps = eps
        # Learned per-feature scale (starts at 1) and shift (starts at 0).
        self.alpha = nn.Parameter(torch.ones(d_model))
        self.bias = nn.Parameter(torch.zeros(d_model))

    def forward(self, x):
        """Return alpha * (x - mean) / (std + eps) + bias along the last dimension."""
        mean = x.mean(dim=-1, keepdim=True)
        std = x.std(dim=-1, keepdim=True)
        centered = x - mean
        return self.alpha * centered / (std + self.eps) + self.bias

In [None]:
# Third-party code taken from the same article.

class AttentionLayer(nn.Module):
    """One encoder block: pre-norm attention and pre-norm feed-forward, each
    wrapped in a residual connection with dropout.

    Args:
        d_model: feature size of the input and output.
        dropout: dropout rate used for both residual branches and forwarded
            to the attention and feed-forward sub-modules.
    """
    def __init__(self, d_model, dropout = 0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        # Forward the rate so a non-default `dropout` configures the whole
        # block, not only the two residual dropouts.
        self.attn = Attention(d_model, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)

    def forward(self, x):
        """Return x after the attention and feed-forward residual updates."""
        # Self-attention: the normalised input serves as query, key and value.
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn(x2, x2, x2))
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.ff(x2))
        return x


def get_clones(module, N):
    """Return an nn.ModuleList holding N independent deep copies of `module`."""
    clones = nn.ModuleList()
    for _ in range(N):
        # deepcopy gives every clone its own parameters
        clones.append(copy.deepcopy(module))
    return clones


class AttentionModel(nn.Module):
    """Multi-label classifier: embedding -> positional encoding -> N attention
    layers -> norm -> linear -> sum over the sequence -> linear -> sigmoid.

    Args:
        vocab_size: number of rows in the embedding table.
        d_model: embedding / hidden feature size.
        N: number of stacked AttentionLayer blocks.
        output_shape: number of output units (one sigmoid score each).
    """
    def __init__(self, vocab_size, d_model, N, output_shape):
        super().__init__()
        self.d_model, self.N, self.output_shape = d_model, N, output_shape

        # Encoder
        self.embed = nn.Embedding(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model)
        self.layers = get_clones(AttentionLayer(d_model), N)
        self.norm = Norm(d_model)

        # Classification head
        self.output_1 = nn.Linear(d_model, d_model)
        self.output_2 = nn.Linear(d_model, output_shape)
        self.sigmoid = nn.Sigmoid()

    def forward(self, src):
        """Map token ids `src` (batch, seq_len) to scores in (0, 1) of shape (batch, output_shape)."""
        hidden = self.pe(self.embed(src))
        for idx in range(self.N):
            hidden = self.layers[idx](hidden)

        projected = self.output_1(self.norm(hidden))
        # Collapse the sequence dimension by summing over every position.
        pooled = projected.sum(dim=1)
        return self.sigmoid(self.output_2(pooled))

In [None]:
# Load the training set and BPE-encode every comment. Each comment is
# lower-cased and reduced to its word-character runs (regex `r`) first.
train = pd.read_csv('train.csv')
X = BPE_model.encode([' '.join(r.findall(i.lower())) for i in train['comment_text'].values])
y = train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values.tolist()

# To speed things up, cap sequences at 200 tokens: longer comments are split
# into consecutive chunks, and every chunk inherits the comment's labels.
new_X = []
new_y = []
for x, y_i in zip(X, y):
    # max(len(x), 1) keeps a comment with an empty encoding as one empty chunk.
    for i in range(0, max(len(x), 1), 200):
        new_X.append(x[i:i + 200])
        new_y.append(y_i)

X = new_X
# Build the targets from new_y, not y: y still has one row per *comment*, so
# zipping it with the chunked X would truncate the data and attach labels to
# the wrong chunks.
y = [torch.FloatTensor(y_) for y_ in new_y]
assert len(X) == len(y), 'every chunk needs exactly one label row'

# Shuffle samples and labels together so the pairs stay aligned.
Xy = list(zip(X, y))
shuffle(Xy)
X, y = [i[0] for i in Xy], [i[1] for i in Xy]

# Right-pad every sequence with zeros up to the longest one. The array is
# integer-typed from the start because it holds token ids.
max_len = max([len(i) for i in X])
new_X = np.zeros((len(X), max_len), dtype=np.int64)

for i, x in enumerate(X):
    new_X[i, :len(x)] = x

X = [torch.LongTensor(x) for x in new_X]

In [None]:
runner = SupervisedRunner()

# Per-class loss weights; the counts were obtained in the LookAtData notebook.
weights = 143346 / np.array([15294, 1595, 8449, 478, 7877, 1405])
# Use the GPU when there is one instead of hard-coding .cuda(), so the cell
# also runs on CPU-only machines.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
weights = torch.FloatTensor(weights).to(device)

# Hold out the last ~20% of the (already shuffled) samples for validation.
# Note that -len(X)//5 parses as (-len(X))//5, i.e. a negative index.
X_train, X_test = X[:-len(X)//5], X[-len(X)//5:]
y_train, y_test = y[:-len(X)//5], y[-len(X)//5:]

dataset_train = list(zip(X_train, y_train))
dataset_test = list(zip(X_test, y_test))

loader_train = DataLoader(dataset_train, batch_size=2**10, shuffle=True)
loader_test = DataLoader(dataset_test, batch_size=2**10, shuffle=True)

model = AttentionModel(10000, 16, 4, 6)
# Vocabulary size: 10000 tokens, model dimension (embedding vector size): 16,
# number of attention layers: 4, number of output labels: 6

optimizer = optim.Adam(model.parameters(), lr=0.00001)
scheduler = OneCycleLR(optimizer,
    num_steps=30,
    lr_range=(0.00001, 0.000001),
    warmup_steps=10)

# The model already ends in a sigmoid, hence BCELoss (not the with-logits
# variant) and activation='none' for the F1 callback.
runner.train(model=model, criterion=nn.BCELoss(reduction='mean', weight=weights),
             scheduler=scheduler, optimizer=optimizer,
             loaders={'train': loader_train, 'valid': loader_test},
             num_epochs=30, verbose=False, logdir='logs',
             callbacks=[F1ScoreCallback(activation='none'), AUCCallback()])

In [None]:
# Count the model's trainable parameters.
trainable = [p for p in model.parameters() if p.requires_grad]
sum(p.numel() for p in trainable)

In [None]:
# Score the held-out split (X_test, the same samples y_test belongs to) in
# batches rather than one forward pass per sample.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()

batch_size = 2**10
y_pred = []
with torch.no_grad():
    for start in range(0, len(X_test), batch_size):
        batch = torch.stack(X_test[start:start + batch_size]).to(device)
        y_pred.append(model(batch).cpu().numpy())

# One row of scores per test sample; stays well-formed for an empty split.
if y_pred:
    y_pred = np.concatenate(y_pred, axis=0)
else:
    y_pred = np.zeros((0, model.output_shape), dtype=np.float32)

In [None]:
# Micro-averaged precision on the held-out split.
y_true_bin = (np.array([label.numpy() for label in y_test]) > 0).astype(int)
# Rounding the sigmoid scores and testing > 0 turns them into 0/1 predictions.
y_pred_bin = (np.round(np.array(y_pred).reshape(-1, 6)) > 0).astype(int)
precision_score(y_true_bin, y_pred_bin, average='micro')

In [None]:
# Macro-averaged precision on the held-out split.
y_true_bin = (np.array([label.numpy() for label in y_test]) > 0).astype(int)
# Rounding the sigmoid scores and testing > 0 turns them into 0/1 predictions.
y_pred_bin = (np.round(np.array(y_pred).reshape(-1, 6)) > 0).astype(int)
precision_score(y_true_bin, y_pred_bin, average='macro')

In [None]:
# Micro-averaged recall on the held-out split.
y_true_bin = (np.array([label.numpy() for label in y_test]) > 0).astype(int)
# Rounding the sigmoid scores and testing > 0 turns them into 0/1 predictions.
y_pred_bin = (np.round(np.array(y_pred).reshape(-1, 6)) > 0).astype(int)
recall_score(y_true_bin, y_pred_bin, average='micro')

In [None]:
# Macro-averaged recall on the held-out split.
y_true_bin = (np.array([label.numpy() for label in y_test]) > 0).astype(int)
# Rounding the sigmoid scores and testing > 0 turns them into 0/1 predictions.
y_pred_bin = (np.round(np.array(y_pred).reshape(-1, 6)) > 0).astype(int)
recall_score(y_true_bin, y_pred_bin, average='macro')