## Подготовка данных

In [3]:
import os
import ast
import time
import math
import warnings
from itertools import chain
from typing import List, Tuple, Union

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from collections import Counter
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

import torch
import torchtext
import transformers

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer
from tokenizers.normalizers import Lowercase

In [2]:
# Compute settings: tensor dtype and the index of the GPU exposed to this process.
dtype = torch.float32
cuda_device_id = 0

# CUDA_VISIBLE_DEVICES receives the chosen index, or an empty string when no index is configured.
os.environ["CUDA_VISIBLE_DEVICES"] = '' if cuda_device_id is None else str(cuda_device_id)

# Fall back to the CPU when no GPU index is configured or CUDA is unavailable.
# NOTE(review): the GPU is always addressed as index 0, presumably because the
# visibility mask above leaves a single visible device - confirm.
if cuda_device_id is None or not torch.cuda.is_available():
    device = torch.device('cpu')
else:
    device = 'cuda:{0:d}'.format(0)
print(f'Using device: {device}, dtype: {dtype}')

Using device: cpu, dtype: torch.float32


In [3]:
# --- Model sizes ---
# Cipher / curve autoencoders and the LSTMs that consume their embeddings.
embedding_dim, hidden_dim = 128, 128
# User-agent branch of the classifier.
embedding_dim_ua, hidden_dim_ua = 1024, 1024
# Target vocabulary size for the BPE tokenizer trained on user-agent strings.
vocab_size = 1000

# --- Optimisation ---
num_epochs, batch_size, learning_rate = 15, 64, 1e-3

# --- Not referenced in the visible part of the notebook ---
# NOTE(review): the names suggest transformer settings - confirm before removing.
nhead, d_hid, nlayers, dropout = 4, 1024, 4, 0.1

In [4]:
# Load the training and test splits from parquet files in the working directory.
train_dataset, test_dataset = (
    pd.read_parquet(path) for path in ('train.parquet', 'test.parquet')
)

In [5]:
def f1(inputs):
    """Parse a UTF-8 encoded Python literal.

    Args:
        inputs: Object exposing ``decode`` (e.g. ``bytes``) whose decoded text
            is a Python literal such as a list or tuple.

    Returns:
        The Python object produced by ``ast.literal_eval``.
    """
    text = inputs.decode("utf-8")
    return ast.literal_eval(text)

# Convert the `curves` and `ciphers` columns of the test split with f1
# (encoded literal -> Python object), replacing the columns in place.
for column in ('curves', 'ciphers'):
    test_dataset[column] = test_dataset[column].apply(f1)

In [6]:
class TrainDataset(torch.utils.data.Dataset):
    """Labelled dataset yielding cipher / curve index sequences.

    Token-to-index mapping uses the module-level ``ciphers_vocab`` and
    ``curves_vocab`` objects, which must exist before items are requested.
    """

    def __init__(self, data):
        # NOTE: the index of the caller's DataFrame is reset in place.
        data.reset_index(drop=True, inplace=True)
        self.data = data
        # Materialise every row once so that item access is a plain list lookup.
        self.rows = [row for _, row in data.iterrows()]

    def __len__(self):
        return len(self.data)

    @staticmethod
    def _to_indices(vocab, tokens):
        """Return ``<sos>`` + tokens + ``<eos>`` as a long tensor of vocabulary indices."""
        sos = vocab.lookup_indices(['<sos>'])
        body = vocab.lookup_indices(list(tokens))
        eos = vocab.lookup_indices(['<eos>'])
        return torch.tensor(sos + body + eos, dtype=torch.long)

    def __getitem__(self, idx):
        """Return ``(ciphers, curves, ciphers_len, curves_len, label)`` for row ``idx``."""
        row = self.rows[idx]
        ciphers = self._to_indices(ciphers_vocab, row.ciphers)
        curves = self._to_indices(curves_vocab, row.curves)
        # The +2 accounts for the <sos> / <eos> markers added above.
        ciphers_len = torch.tensor(len(row.ciphers) + 2)
        curves_len = torch.tensor(len(row.curves) + 2)
        label = torch.tensor(row.label, dtype=torch.float)
        return ciphers, curves, ciphers_len, curves_len, label
    
class TestDataset(torch.utils.data.Dataset):
    """Unlabelled dataset yielding cipher / curve index sequences.

    Token-to-index mapping uses the module-level ``ciphers_vocab`` and
    ``curves_vocab`` objects, which must exist before items are requested.
    """

    def __init__(self, data):
        # NOTE: the index of the caller's DataFrame is reset in place.
        data.reset_index(drop=True, inplace=True)
        self.data = data
        # Materialise every row once so that item access is a plain list lookup.
        self.rows = [row for _, row in data.iterrows()]

    def __len__(self):
        return len(self.data)

    @staticmethod
    def _to_indices(vocab, tokens):
        """Return ``<sos>`` + tokens + ``<eos>`` as a long tensor of vocabulary indices."""
        sos = vocab.lookup_indices(['<sos>'])
        body = vocab.lookup_indices(list(tokens))
        eos = vocab.lookup_indices(['<eos>'])
        return torch.tensor(sos + body + eos, dtype=torch.long)

    def __getitem__(self, idx):
        """Return ``(ciphers, curves, ciphers_len, curves_len)`` for row ``idx``."""
        row = self.rows[idx]
        ciphers = self._to_indices(ciphers_vocab, row.ciphers)
        curves = self._to_indices(curves_vocab, row.curves)
        # The +2 accounts for the <sos> / <eos> markers added above.
        ciphers_len = torch.tensor(len(row.ciphers) + 2)
        curves_len = torch.tensor(len(row.curves) + 2)
        return ciphers, curves, ciphers_len, curves_len

def collate_to_train_batch(batch):
    """Merge ``TrainDataset`` items into one padded batch.

    Args:
        batch: Sequence of ``(ciphers, curves, ciphers_len, curves_len, label)`` items.

    Returns:
        ``(ciphers, curves, ciphers_lens, curves_lens, labels)`` where the two
        token tensors are zero-padded by ``pad_sequence`` (default layout) and
        the lengths are ``long`` / the labels ``float`` 1-D tensors.
    """
    cipher_seqs = [sample[0] for sample in batch]
    curve_seqs = [sample[1] for sample in batch]
    ciphers = torch.nn.utils.rnn.pad_sequence(cipher_seqs, padding_value=0)
    curves = torch.nn.utils.rnn.pad_sequence(curve_seqs, padding_value=0)
    ciphers_lens = torch.tensor([sample[2] for sample in batch], dtype=torch.long)
    curves_lens = torch.tensor([sample[3] for sample in batch], dtype=torch.long)
    labels = torch.tensor([sample[4] for sample in batch], dtype=torch.float)
    return ciphers, curves, ciphers_lens, curves_lens, labels

def collate_to_test_batch(batch):
    """Merge ``TestDataset`` items into one padded batch.

    Args:
        batch: Sequence of ``(ciphers, curves, ciphers_len, curves_len)`` items.

    Returns:
        ``(ciphers, curves, ciphers_lens, curves_lens)`` where the two token
        tensors are zero-padded by ``pad_sequence`` (default layout) and the
        lengths are ``long`` 1-D tensors.
    """
    cipher_seqs = [sample[0] for sample in batch]
    curve_seqs = [sample[1] for sample in batch]
    ciphers = torch.nn.utils.rnn.pad_sequence(cipher_seqs, padding_value=0)
    curves = torch.nn.utils.rnn.pad_sequence(curve_seqs, padding_value=0)
    ciphers_lens = torch.tensor([sample[2] for sample in batch], dtype=torch.long)
    curves_lens = torch.tensor([sample[3] for sample in batch], dtype=torch.long)
    return ciphers, curves, ciphers_lens, curves_lens

In [7]:
def gen_dropout_mask(num_objects, input_size, hidden_size, is_training, p, some_existing_tensor):
    """Build a pair of dropout masks for input-sized and hidden-sized features.

    Args:
        num_objects: Number of rows in each mask.
        input_size: Number of columns of the first mask.
        hidden_size: Number of columns of the second mask.
        is_training: When true the masks are sampled, otherwise they are constant.
        p: Drop probability, or ``None`` to disable dropout.
        some_existing_tensor: Tensor whose ``new_ones`` creates the masks.

    Returns:
        ``(input_mask, hidden_mask)``: all ones when ``p`` is ``None``;
        Bernoulli samples with success probability ``1 - p`` in training mode;
        otherwise constant masks filled with ``1 - p``.
    """
    input_mask = some_existing_tensor.new_ones((num_objects, input_size))
    hidden_mask = some_existing_tensor.new_ones((num_objects, hidden_size))
    if p is None:
        return input_mask, hidden_mask

    keep_prob = 1 - p
    input_mask = keep_prob * input_mask
    hidden_mask = keep_prob * hidden_mask
    if is_training:
        # Sample the input mask first, then the hidden mask (keeps RNG order stable).
        input_mask = torch.bernoulli(input_mask)
        hidden_mask = torch.bernoulli(hidden_mask)
    return input_mask, hidden_mask

class FastRNNLayer(torch.nn.Module):
    """LSTM wrapper that masks the LSTM weight matrices before every forward pass.

    The ``weight_hh_l{n}`` and ``weight_ih_l{n}`` attributes of the wrapped
    ``torch.nn.LSTM`` are removed and re-registered on this module as
    ``<name>_raw`` parameters. On each call the raw weights are multiplied by a
    mask from ``gen_dropout_mask`` and the product is set back on the LSTM.
    NOTE(review): this looks like a weight-drop style regularisation - confirm.

    Args:
        input_size: Input feature size passed to ``torch.nn.LSTM``.
        hidden_size: Hidden size passed to ``torch.nn.LSTM``.
        dropout: Probability ``p`` handed to ``gen_dropout_mask`` for the weight masks.
        layers_dropout: ``dropout`` argument of the wrapped ``torch.nn.LSTM``.
        num_layers: ``num_layers`` argument of the wrapped ``torch.nn.LSTM``.
    """

    def __init__(self, input_size, hidden_size, dropout=0.0, layers_dropout=0.0, num_layers=1):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = dropout
        self.layers_dropout = layers_dropout
        self.module = torch.nn.LSTM(input_size, hidden_size, dropout=layers_dropout, num_layers=num_layers)
        # Names of the LSTM weight attributes that get masked (both matrices of every layer).
        self.layer_names = []
        for layer_n in range(self.num_layers):
            self.layer_names += [f'weight_hh_l{layer_n}', f'weight_ih_l{layer_n}']
        for layer in self.layer_names:
            # Move the weight off the LSTM and keep it here as the trainable "<name>_raw" parameter.
            w = getattr(self.module, layer)
            delattr(self.module, layer)
            self.register_parameter(f'{layer}_raw', torch.nn.Parameter(w.data))

    def _setweights(self, x):
        """Set masked copies of the raw weights on the wrapped LSTM.

        ``x`` is only passed to ``gen_dropout_mask`` as the tensor used to create the mask.
        """
        for layer in self.layer_names:
            raw_w = getattr(self, f'{layer}_raw')
            # Mask has shape (1, n_columns); it broadcasts over rows, so every
            # row of a given column is scaled by the same mask value.
            mask, _ = gen_dropout_mask(1, raw_w.shape[1], self.hidden_size, self.training, self.dropout, x)
            masked_raw_w = raw_w * mask
            setattr(self.module, layer, masked_raw_w)

    def forward(self, x, h_c=None):
        """Refresh the masked weights and run the wrapped LSTM.

        Args:
            x: Input passed to the LSTM.
            h_c: Optional initial state forwarded to the LSTM when not ``None``.

        Returns:
            Whatever ``torch.nn.LSTM.forward`` returns.
        """
        # All warnings raised inside this block are suppressed.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            self._setweights(x)
            if h_c is not None:
                return self.module.forward(x, h_c)
            return self.module.forward(x)
            
    def reset(self):
        """Call ``reset`` on the wrapped module when it defines such a method."""
        if hasattr(self.module, 'reset'):
            self.module.reset()

## Построение TLS-эмбеддингов

Для построения TLS-эмбеддингов будем использовать автокодировщик. Идея такая: на вход подаётся последовательность шифров (или кривых) объекта, LSTM прочитывает её целиком, а затем эта же LSTM должна воспроизвести прочитанное. Для шифров и для кривых заводим по два автокодировщика: один читает последовательность слева направо, другой — справа налево. Автокодировщики обучаются на всех данных: на трейне, тесте и неразмеченном датасете. Код ниже работает достаточно долго; сохранённые веса автокодировщиков есть в архиве.

In [None]:
# Pool every split into one frame: train without id/label, test without id,
# plus the unlabelled file.
unlab = pd.read_parquet('unlabelled.snappy.parquet')
big_dataset = pd.concat(
    [
        train_dataset.drop(columns=['id', 'label']),
        test_dataset.drop(columns=['id']),
        unlab,
    ]
)

In [None]:
# Count token frequencies over the pooled dataset.
# Counter.update adds counts in place; the previous `counter = counter + Counter(...)`
# rebuilt the whole counter for every row.
# NOTE(review): tokens are stringified here, while the dataset classes look up
# the raw `row.ciphers` / `row.curves` values - verify those are already strings.
ciphers_big_dataset = Counter()
for row_ciphers in big_dataset.ciphers:
    ciphers_big_dataset.update(map(str, row_ciphers))

curves_big_dataset = Counter()
for row_curves in big_dataset.curves:
    curves_big_dataset.update(map(str, row_curves))

# Special tokens get a zero count and are also passed to the vocab factory via `specials`.
specials = ['<pad>', '<unk>', '<sos>', '<eos>']
for special in specials:
    ciphers_big_dataset[special] = 0
    curves_big_dataset[special] = 0

# Vocabularies are ordered by descending frequency; unknown tokens map to <unk>.
ciphers_vocab = torchtext.vocab.vocab(dict(sorted(ciphers_big_dataset.items(), key=lambda x: x[1], reverse=True)), specials=specials)
ciphers_vocab.set_default_index(ciphers_vocab['<unk>'])
curves_vocab = torchtext.vocab.vocab(dict(sorted(curves_big_dataset.items(), key=lambda x: x[1], reverse=True)), specials=specials)
curves_vocab.set_default_index(curves_vocab['<unk>'])

In [None]:
# Shuffled loader over the pooled, label-free dataset (used to train the autoencoders).
pretrain_dataset = TestDataset(big_dataset)
train_dataloader = torch.utils.data.DataLoader(
    dataset=pretrain_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_to_test_batch,
    pin_memory=False,
)

In [None]:
class LMCrossEntropyLoss(torch.nn.CrossEntropyLoss):
    """Cross-entropy between the output at step ``t`` and the token at step ``t + 1``.

    Constructor arguments are those of ``torch.nn.CrossEntropyLoss``.
    """

    def forward(self, outputs, tokens, tokens_lens):
        """Compute the loss over the non-padded positions of each sequence.

        Args:
            outputs: Scores indexed as ``[step, batch, class]``.
            tokens: Token ids indexed as ``[step, batch]``.
            tokens_lens: Length of every sequence in the batch.
        """
        # Each sequence of length L contributes L - 1 (output, next token) pairs.
        pair_lens = tokens_lens.cpu() - 1
        pack = torch.nn.utils.rnn.pack_padded_sequence
        logits = pack(outputs[:-1, :, :], pair_lens, batch_first=False, enforce_sorted=False).data
        targets = pack(tokens[1:, :], pair_lens, batch_first=False, enforce_sorted=False).data
        return super().forward(logits, targets)
  
class LMAccuracy(torch.nn.Module):
    """Counts positions where the arg-max output at step ``t`` equals the token at ``t + 1``."""

    def forward(self, outputs, tokens, tokens_lens):
        """Return the number of correct next-token predictions over non-padded positions.

        Args:
            outputs: Scores indexed as ``[step, batch, class]``.
            tokens: Token ids indexed as ``[step, batch]``.
            tokens_lens: Length of every sequence in the batch.
        """
        # Each sequence of length L contributes L - 1 (output, next token) pairs.
        pair_lens = tokens_lens.cpu() - 1
        pack = torch.nn.utils.rnn.pack_padded_sequence
        logits = pack(outputs[:-1, :, :], pair_lens, batch_first=False, enforce_sorted=False).data
        targets = pack(tokens[1:, :], pair_lens, batch_first=False, enforce_sorted=False).data
        predicted = logits.argmax(dim=1)
        return (predicted == targets).sum()

In [None]:
def train_epoch_cipher(dataloader, model, loss_fn, optimizer, device):
    """Run one optimisation pass over ``dataloader`` using the cipher fields of each batch.

    Args:
        dataloader: Iterable of batches; index 0 holds cipher tokens, index 2 their lengths.
        model: Module called as ``model(tokens, tokens_lens)``.
        loss_fn: Callable ``loss_fn(outputs, tokens, tokens_lens)`` returning a scalar tensor.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
    """
    model.train()
    for batch in dataloader:
        ciphers, ciphers_lens = batch[0].to(device), batch[2].to(device)
        optimizer.zero_grad()
        loss = loss_fn(model(ciphers, ciphers_lens), ciphers, ciphers_lens)
        loss.backward()
        optimizer.step()
    
def evaluate_cipher(dataloader, model, loss_fn, device):
    """Evaluate ``model`` on the cipher fields of every batch without gradients.

    Args:
        dataloader: Iterable of batches; index 0 holds cipher tokens, index 2 their lengths.
        model: Module called as ``model(tokens, tokens_lens)``.
        loss_fn: Callable ``loss_fn(outputs, tokens, tokens_lens)``.
        device: Device the batch tensors are moved to.

    Returns:
        ``(loss, accuracy)``: the per-batch loss weighted by the number of
        predicted tokens (``len - 1`` per sequence) and the ``LMAccuracy``
        count, both divided by the total number of predicted tokens.
    """
    model.eval()
    accuracy_fn = LMAccuracy()
    token_count = 0
    loss_sum = 0.0
    correct_sum = 0.0
    with torch.no_grad():
        for batch in dataloader:
            ciphers = batch[0].to(device)
            ciphers_lens = batch[2].to(device)
            logits = model(ciphers, ciphers_lens)
            batch_tokens = (ciphers_lens - 1).sum().detach().item()
            token_count += batch_tokens
            loss_sum += loss_fn(logits, ciphers, ciphers_lens) * batch_tokens
            correct_sum += accuracy_fn(logits, ciphers, ciphers_lens)

    return loss_sum / token_count, correct_sum / token_count

def train_cipher(train_loader, model, loss_fn, optimizer, device, num_epochs):
    """Train the cipher autoencoder, checkpointing and evaluating after every epoch.

    After each epoch ``saver_state`` (defined outside this block) is called with
    the model and the name ``'cipher_new_weights<epoch>'``, then the model is
    evaluated on ``train_loader`` and the metrics are printed.

    Returns:
        ``(train_losses, train_accuracies)``: one entry per epoch.
    """
    train_losses, train_accuracies = [], []
    for epoch in range(num_epochs):
        train_epoch_cipher(train_loader, model, loss_fn, optimizer, device)
        saver_state(model, 'cipher_new_weights' + str(epoch))
        epoch_loss, epoch_acc = evaluate_cipher(train_loader, model, loss_fn, device)
        train_accuracies.append(epoch_acc)
        train_losses.append(epoch_loss)
        print('Epoch: {0:d}/{1:d}. Loss (Train): {2:.3f}. Accuracy (Train): {3:.3f}'.format(epoch + 1, num_epochs, epoch_loss, epoch_acc))
    return train_losses, train_accuracies

In [None]:
def train_epoch_curves(dataloader, model, loss_fn, optimizer, device):
    """Run one optimisation pass over ``dataloader`` using the curve fields of each batch.

    Args:
        dataloader: Iterable of batches; index 1 holds curve tokens, index 3 their lengths.
        model: Module called as ``model(tokens, tokens_lens)``.
        loss_fn: Callable ``loss_fn(outputs, tokens, tokens_lens)`` returning a scalar tensor.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
    """
    model.train()
    for batch in dataloader:
        curves, curves_lens = batch[1].to(device), batch[3].to(device)
        optimizer.zero_grad()
        loss = loss_fn(model(curves, curves_lens), curves, curves_lens)
        loss.backward()
        optimizer.step()
    
def evaluate_curves(dataloader, model, loss_fn, device):
    """Evaluate ``model`` on the curve fields of every batch without gradients.

    Args:
        dataloader: Iterable of batches; index 1 holds curve tokens, index 3 their lengths.
        model: Module called as ``model(tokens, tokens_lens)``.
        loss_fn: Callable ``loss_fn(outputs, tokens, tokens_lens)``.
        device: Device the batch tensors are moved to.

    Returns:
        ``(loss, accuracy)``: the per-batch loss weighted by the number of
        predicted tokens (``len - 1`` per sequence) and the ``LMAccuracy``
        count, both divided by the total number of predicted tokens.
    """
    model.eval()
    accuracy_fn = LMAccuracy()
    token_count = 0
    loss_sum = 0.0
    correct_sum = 0.0
    with torch.no_grad():
        for batch in dataloader:
            curves = batch[1].to(device)
            curves_lens = batch[3].to(device)
            logits = model(curves, curves_lens)
            batch_tokens = (curves_lens - 1).sum().detach().item()
            token_count += batch_tokens
            loss_sum += loss_fn(logits, curves, curves_lens) * batch_tokens
            correct_sum += accuracy_fn(logits, curves, curves_lens)

    return loss_sum / token_count, correct_sum / token_count

def train_curves(train_loader, model, loss_fn, optimizer, device, num_epochs):
    """Train the curves autoencoder, checkpointing and evaluating after every epoch.

    After each epoch ``saver_state`` (defined outside this block) is called with
    the model and the name ``'curves_new_weights<epoch>'``, then the model is
    evaluated on ``train_loader`` and the metrics are printed.

    Returns:
        ``(train_losses, train_accuracies)``: one entry per epoch.
    """
    train_losses, train_accuracies = [], []
    for epoch in range(num_epochs):
        train_epoch_curves(train_loader, model, loss_fn, optimizer, device)
        saver_state(model, 'curves_new_weights' + str(epoch))
        epoch_loss, epoch_acc = evaluate_curves(train_loader, model, loss_fn, device)
        train_accuracies.append(epoch_acc)
        train_losses.append(epoch_loss)
        print('Epoch: {0:d}/{1:d}. Loss (Train): {2:.3f}. Accuracy (Train): {3:.3f}'.format(epoch + 1, num_epochs, epoch_loss, epoch_acc))
    return train_losses, train_accuracies

In [8]:
class AutoEncoderCipher(torch.nn.Module):
    """Cipher autoencoder: embedding -> weight-masked LSTM (run twice) -> linear projection.

    Args:
        embedding_dim: Size of the cipher embeddings.
        hidden_dim: Hidden size of the LSTM.
        ciphers_vocab: Vocabulary; only its length is used.
        device: Accepted for interface compatibility; not used here.
    """

    def __init__(self, embedding_dim, hidden_dim, ciphers_vocab, device):
        super().__init__()
        vocab_len = len(ciphers_vocab)
        self.ciphers_embs = torch.nn.Embedding(vocab_len, embedding_dim, padding_idx=0)
        self.lstm = FastRNNLayer(embedding_dim, hidden_dim, dropout=0.25)
        self.linear = torch.nn.Linear(hidden_dim, vocab_len)

    def forward(self, ciphers, ciphers_lens):
        """Return per-step vocabulary scores; ``ciphers_lens`` is not used."""
        embedded = self.ciphers_embs(ciphers)
        # First pass: only the final (h, c) state is kept.
        _, state = self.lstm(embedded)
        # Second pass: the same inputs are read again, starting from that state.
        decoded, _ = self.lstm(embedded, state)
        return self.linear(decoded)
    
class AutoEncoderCurves(torch.nn.Module):
    """Curves autoencoder: embedding -> weight-masked LSTM (run twice) -> linear projection.

    Args:
        embedding_dim: Size of the curve embeddings.
        hidden_dim: Hidden size of the LSTM.
        curves_vocab: Vocabulary; only its length is used.
        device: Accepted for interface compatibility; not used here.
    """

    def __init__(self, embedding_dim, hidden_dim, curves_vocab, device):
        super().__init__()
        vocab_len = len(curves_vocab)
        self.curves_embs = torch.nn.Embedding(vocab_len, embedding_dim, padding_idx=0)
        self.lstm = FastRNNLayer(embedding_dim, hidden_dim, dropout=0.25)
        self.linear = torch.nn.Linear(hidden_dim, vocab_len)

    def forward(self, curves, curves_lens):
        """Return per-step vocabulary scores; ``curves_lens`` is not used."""
        embedded = self.curves_embs(curves)
        # First pass: only the final (h, c) state is kept.
        _, state = self.lstm(embedded)
        # Second pass: the same inputs are read again, starting from that state.
        decoded, _ = self.lstm(embedded, state)
        return self.linear(decoded)

In [None]:
# Train the forward cipher autoencoder for 20 epochs and report the elapsed wall-clock time.
autoencoder_cipher = AutoEncoderCipher(embedding_dim, hidden_dim, ciphers_vocab, device).to(device)
lm_loss_fn = LMCrossEntropyLoss(reduction='mean')
lm_optimizer = torch.optim.Adam(autoencoder_cipher.parameters(), lr=learning_rate)
t = time.time()
train_losses, train_accuracies = train_cipher(
    train_loader=train_dataloader,
    model=autoencoder_cipher,
    loss_fn=lm_loss_fn,
    optimizer=lm_optimizer,
    device=device,
    num_epochs=20,
)
print('time:', time.time() - t)

In [None]:
# Train the forward curves autoencoder for 20 epochs and report the elapsed wall-clock time.
autoencoder_curves = AutoEncoderCurves(embedding_dim, hidden_dim, curves_vocab, device).to(device)
lm_loss_fn = LMCrossEntropyLoss(reduction='mean')
lm_optimizer = torch.optim.Adam(autoencoder_curves.parameters(), lr=learning_rate)
t = time.time()
train_losses, train_accuracies = train_curves(
    train_loader=train_dataloader,
    model=autoencoder_curves,
    loss_fn=lm_loss_fn,
    optimizer=lm_optimizer,
    device=device,
    num_epochs=20,
)
print('time:', time.time() - t)

In [None]:
class LMCrossEntropyLoss(torch.nn.CrossEntropyLoss):
    """Next-token cross-entropy that scores every position of the padded batch.

    Constructor arguments are those of ``torch.nn.CrossEntropyLoss``.
    """

    def forward(self, outputs, tokens, tokens_lens):
        """Compute the loss between ``outputs[t]`` and ``tokens[t + 1]`` for all steps.

        The values in ``tokens_lens`` are ignored: every sequence is treated as
        spanning ``outputs.shape[0]`` steps, so padded positions are scored too.
        """
        full_lens = torch.full_like(tokens_lens, outputs.shape[0])
        pair_lens = full_lens.cpu() - 1
        pack = torch.nn.utils.rnn.pack_padded_sequence
        logits = pack(outputs[:-1, :, :], pair_lens, batch_first=False, enforce_sorted=False).data
        targets = pack(tokens[1:, :], pair_lens, batch_first=False, enforce_sorted=False).data
        return super().forward(logits, targets)

class LMAccuracy(torch.nn.Module):
    """Counts correct next-token predictions over every position of the padded batch."""

    def forward(self, outputs, tokens, tokens_lens):
        """Return how often ``argmax(outputs[t])`` equals ``tokens[t + 1]``.

        The values in ``tokens_lens`` are ignored: every sequence is treated as
        spanning ``outputs.shape[0]`` steps, so padded positions are counted too.
        """
        full_lens = torch.full_like(tokens_lens, outputs.shape[0])
        pair_lens = full_lens.cpu() - 1
        pack = torch.nn.utils.rnn.pack_padded_sequence
        logits = pack(outputs[:-1, :, :], pair_lens, batch_first=False, enforce_sorted=False).data
        targets = pack(tokens[1:, :], pair_lens, batch_first=False, enforce_sorted=False).data
        predicted = logits.argmax(dim=1)
        return (predicted == targets).sum()

In [None]:
def train_epoch_cipher(dataloader, model, loss_fn, optimizer, scheduler, device):
    """Run one optimisation pass on the cipher fields, stepping the scheduler per batch.

    The lengths stored in the batch are replaced by the padded sequence length
    (``tokens.shape[0]``) before being passed to the model and the loss.

    Args:
        dataloader: Iterable of batches; index 0 holds cipher tokens, index 2 their lengths.
        model: Module called as ``model(tokens, tokens_lens)``.
        loss_fn: Callable ``loss_fn(outputs, tokens, tokens_lens)`` returning a scalar tensor.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device the batch tensors are moved to.
    """
    model.train()
    for batch in dataloader:
        ciphers = batch[0].to(device)
        padded_lens = torch.full_like(batch[2].to(device), ciphers.shape[0])
        optimizer.zero_grad()
        loss = loss_fn(model(ciphers, padded_lens), ciphers, padded_lens)
        loss.backward()
        optimizer.step()
        scheduler.step()
    
def evaluate_cipher(dataloader, model, loss_fn, device):
    """Evaluate ``model`` on the cipher fields, treating every sequence as fully padded length.

    The lengths stored in the batch are replaced by ``tokens.shape[0]``.

    Args:
        dataloader: Iterable of batches; index 0 holds cipher tokens, index 2 their lengths.
        model: Module called as ``model(tokens, tokens_lens)``.
        loss_fn: Callable ``loss_fn(outputs, tokens, tokens_lens)``.
        device: Device the batch tensors are moved to.

    Returns:
        ``(loss, accuracy)``: the per-batch loss weighted by the number of
        scored positions and the ``LMAccuracy`` count, both divided by the
        total number of scored positions.
    """
    model.eval()
    accuracy_fn = LMAccuracy()
    token_count = 0
    loss_sum = 0.0
    correct_sum = 0.0
    with torch.no_grad():
        for batch in dataloader:
            ciphers = batch[0].to(device)
            padded_lens = torch.full_like(batch[2].to(device), ciphers.shape[0])
            logits = model(ciphers, padded_lens)
            batch_tokens = (padded_lens - 1).sum().detach().item()
            token_count += batch_tokens
            loss_sum += loss_fn(logits, ciphers, padded_lens) * batch_tokens
            correct_sum += accuracy_fn(logits, ciphers, padded_lens)

    return loss_sum / token_count, correct_sum / token_count

def train_cipher(train_loader, test_loader, model, loss_fn, optimizer, scheduler, device, num_epochs):
    """Train the backward cipher autoencoder, checkpointing and evaluating after every epoch.

    After each epoch ``saver_state`` (defined outside this block) is called with
    the model and the name ``'cipher_back_weights<epoch>'``; the model is then
    evaluated on ``test_loader``. The printed line labels these metrics "Train".

    Returns:
        ``(losses, accuracies)`` measured on ``test_loader``, one entry per epoch.
    """
    train_losses, train_accuracies = [], []
    for epoch in range(num_epochs):
        train_epoch_cipher(train_loader, model, loss_fn, optimizer, scheduler, device)
        saver_state(model, 'cipher_back_weights' + str(epoch))
        epoch_loss, epoch_acc = evaluate_cipher(test_loader, model, loss_fn, device)
        train_accuracies.append(epoch_acc)
        train_losses.append(epoch_loss)
        print('Epoch: {0:d}/{1:d}. Loss (Train): {2:.3f}. Accuracy (Train): {3:.3f}'.format(epoch + 1, num_epochs, epoch_loss, epoch_acc))
    return train_losses, train_accuracies

In [None]:
def train_epoch_curves(dataloader, model, loss_fn, optimizer, device):
    """Run one optimisation pass on the curve fields, using the padded length for every sequence.

    The lengths stored in the batch are replaced by the padded sequence length
    (``tokens.shape[0]``) before being passed to the model and the loss.

    Args:
        dataloader: Iterable of batches; index 1 holds curve tokens, index 3 their lengths.
        model: Module called as ``model(tokens, tokens_lens)``.
        loss_fn: Callable ``loss_fn(outputs, tokens, tokens_lens)`` returning a scalar tensor.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
    """
    model.train()
    for batch in dataloader:
        curves = batch[1].to(device)
        padded_lens = torch.full_like(batch[3].to(device), curves.shape[0])
        optimizer.zero_grad()
        loss = loss_fn(model(curves, padded_lens), curves, padded_lens)
        loss.backward()
        optimizer.step()
    
def evaluate_curves(dataloader, model, loss_fn, device):
    """Evaluate ``model`` on the curve fields, treating every sequence as fully padded length.

    The lengths stored in the batch are replaced by ``tokens.shape[0]``.

    Args:
        dataloader: Iterable of batches; index 1 holds curve tokens, index 3 their lengths.
        model: Module called as ``model(tokens, tokens_lens)``.
        loss_fn: Callable ``loss_fn(outputs, tokens, tokens_lens)``.
        device: Device the batch tensors are moved to.

    Returns:
        ``(loss, accuracy)``: the per-batch loss weighted by the number of
        scored positions and the ``LMAccuracy`` count, both divided by the
        total number of scored positions.
    """
    model.eval()
    accuracy_fn = LMAccuracy()
    token_count = 0
    loss_sum = 0.0
    correct_sum = 0.0
    with torch.no_grad():
        for batch in dataloader:
            curves = batch[1].to(device)
            padded_lens = torch.full_like(batch[3].to(device), curves.shape[0])
            logits = model(curves, padded_lens)
            batch_tokens = (padded_lens - 1).sum().detach().item()
            token_count += batch_tokens
            loss_sum += loss_fn(logits, curves, padded_lens) * batch_tokens
            correct_sum += accuracy_fn(logits, curves, padded_lens)

    return loss_sum / token_count, correct_sum / token_count

def train_curves(train_loader, test_loader, model, loss_fn, optimizer, device, num_epochs):
    """Train the backward curves autoencoder, checkpointing and evaluating after every epoch.

    After each epoch ``saver_state`` (defined outside this block) is called with
    the model and the name ``'curves_back_weights<epoch>'``; the model is then
    evaluated on ``test_loader``. The printed line labels these metrics "Train".

    Returns:
        ``(losses, accuracies)`` measured on ``test_loader``, one entry per epoch.
    """
    train_losses, train_accuracies = [], []
    for epoch in range(num_epochs):
        train_epoch_curves(train_loader, model, loss_fn, optimizer, device)
        saver_state(model, 'curves_back_weights' + str(epoch))
        epoch_loss, epoch_acc = evaluate_curves(test_loader, model, loss_fn, device)
        train_accuracies.append(epoch_acc)
        train_losses.append(epoch_loss)
        print('Epoch: {0:d}/{1:d}. Loss (Train): {2:.3f}. Accuracy (Train): {3:.3f}'.format(epoch + 1, num_epochs, epoch_loss, epoch_acc))
    return train_losses, train_accuracies

In [9]:
class AutoEncoderCipherBack(torch.nn.Module):
    """Cipher autoencoder that reads the embedded sequence reversed along dim 0.

    Args:
        embedding_dim: Size of the cipher embeddings.
        hidden_dim: Hidden size of the LSTM.
        ciphers_vocab: Vocabulary; only its length is used.
        device: Accepted for interface compatibility; not used here.
    """

    def __init__(self, embedding_dim, hidden_dim, ciphers_vocab, device):
        super().__init__()
        vocab_len = len(ciphers_vocab)
        self.ciphers_embs = torch.nn.Embedding(vocab_len, embedding_dim, padding_idx=0)
        self.lstm = FastRNNLayer(embedding_dim, hidden_dim, dropout=0.25)
        self.linear = torch.nn.Linear(hidden_dim, vocab_len)

    def forward(self, ciphers, ciphers_lens):
        """Return per-step vocabulary scores; ``ciphers_lens`` is not used."""
        # Flip along dim 0 (assumed to be the step axis - padding is flipped as well).
        reversed_embs = torch.flip(self.ciphers_embs(ciphers), (0, ))
        # First pass: only the final (h, c) state is kept.
        _, state = self.lstm(reversed_embs)
        # Second pass: the same inputs are read again, starting from that state.
        decoded, _ = self.lstm(reversed_embs, state)
        return self.linear(decoded)
    
class AutoEncoderCurvesBack(torch.nn.Module):
    """Curves autoencoder that reads the embedded sequence reversed along dim 0.

    Args:
        embedding_dim: Size of the curve embeddings.
        hidden_dim: Hidden size of the LSTM.
        curves_vocab: Vocabulary; only its length is used.
        device: Accepted for interface compatibility; not used here.
    """

    def __init__(self, embedding_dim, hidden_dim, curves_vocab, device):
        super().__init__()
        vocab_len = len(curves_vocab)
        self.curves_embs = torch.nn.Embedding(vocab_len, embedding_dim, padding_idx=0)
        self.lstm = FastRNNLayer(embedding_dim, hidden_dim, dropout=0.25)
        self.linear = torch.nn.Linear(hidden_dim, vocab_len)

    def forward(self, curves, curves_lens):
        """Return per-step vocabulary scores; ``curves_lens`` is not used."""
        # Flip along dim 0 (assumed to be the step axis - padding is flipped as well).
        reversed_embs = torch.flip(self.curves_embs(curves), (0, ))
        # First pass: only the final (h, c) state is kept.
        _, state = self.lstm(reversed_embs)
        # Second pass: the same inputs are read again, starting from that state.
        decoded, _ = self.lstm(reversed_embs, state)
        return self.linear(decoded)

In [None]:
# Resume the backward cipher autoencoder from a saved checkpoint and keep training.
autoencoder_cipher_back = AutoEncoderCipherBack(embedding_dim, hidden_dim, ciphers_vocab, device).to(device=device)
# map_location remaps the stored tensors onto the current device, so a checkpoint
# written on a GPU machine also loads when only the CPU is available.
autoencoder_cipher_back.load_state_dict(torch.load('cipher_back_weights5', map_location=device))
lm_loss_fn = LMCrossEntropyLoss(reduction='mean')
lm_optimizer = torch.optim.Adam(autoencoder_cipher_back.parameters(), lr=learning_rate)
# Per-step decay factor: after 1243560 scheduler steps the learning rate is scaled by 1e-5 / 1e-3.
# NOTE(review): 1243560 is presumably the planned total number of batches - confirm.
scheduler = torch.optim.lr_scheduler.ExponentialLR(lm_optimizer, gamma = (1e-5/1e-3)**(1/(1243560)))
t = time.time()
train_losses, train_accuracies = train_cipher(
    train_dataloader, train_dataloader, autoencoder_cipher_back, lm_loss_fn, lm_optimizer, scheduler, device, 30
)
print('time:', time.time() - t)

In [None]:
# Train the backward curves autoencoder for 20 epochs; the same loader is used
# for training and for the per-epoch evaluation.
autoencoder_curves_back = AutoEncoderCurvesBack(embedding_dim, hidden_dim, curves_vocab, device).to(device)
lm_loss_fn = LMCrossEntropyLoss(reduction='mean')
lm_optimizer = torch.optim.Adam(autoencoder_curves_back.parameters(), lr=learning_rate)
t = time.time()
train_losses, train_accuracies = train_curves(
    train_loader=train_dataloader,
    test_loader=train_dataloader,
    model=autoencoder_curves_back,
    loss_fn=lm_loss_fn,
    optimizer=lm_optimizer,
    device=device,
    num_epochs=20,
)
print('time:', time.time() - t)

## Классификатор

В классификаторе LSTM проходят по построенным эмбеддингам шифров и кривых. Кроме того, в классификаторе используются эмбеддинги user agent. В конце применяется бэггинг пяти аналогичных моделей.

In [None]:
# Train a BPE tokenizer (lower-casing normalizer, whitespace pre-tokenizer)
# on the `ua` column of the pooled dataset.
trainer = BpeTrainer(special_tokens=["[PAD]", "[UNK]", "[START]", "[END]"], vocab_size=vocab_size)
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.normalizer = Lowercase()
tokenizer.pre_tokenizer = Whitespace()
ua_corpus = [f"{row.ua}" for row in big_dataset.itertuples()]
tokenizer.train_from_iterator(ua_corpus, trainer=trainer)
tokenizer.enable_padding()

In [51]:
class TrainDatasetFinal(torch.utils.data.Dataset):
    """Labelled dataset yielding user-agent, cipher and curve index sequences.

    Cipher / curve tokens are mapped with the module-level ``ciphers_vocab`` and
    ``curves_vocab`` objects; user-agent strings are encoded with the tokenizer
    passed to the constructor.

    Args:
        data: DataFrame with ``ua``, ``ciphers``, ``curves`` and ``label`` columns.
            Its index is reset in place.
        tokenizer: Object whose ``encode(text).ids`` gives the token ids of ``text``.
        max_ua_tokens: Maximum number of user-agent tokens kept (before the
            ``[START]`` / ``[END]`` markers are added).
    """

    def __init__(self, data, tokenizer, max_ua_tokens=150):
        self.data = data
        self.data.reset_index(drop=True, inplace=True)
        self.tokenizer = tokenizer
        self.max_ua_tokens = max_ua_tokens
        # Materialise every row once so that item access is a plain list lookup.
        self.rows = [row for _, row in data.iterrows()]

    def __len__(self):
        return len(self.data)

    def _encode_ua(self, ua):
        """Encode a user-agent string with the tokenizer given to this dataset.

        Returns:
            ``(ids, n_tokens)``: long tensor ``[START] + tokens + [END]`` and the
            number of user-agent tokens kept after truncation.
        """
        # Use the instance tokenizer rather than a module-level global.
        token_ids = self.tokenizer.encode(ua).ids[:self.max_ua_tokens]
        start_ids = self.tokenizer.encode("[START]").ids
        end_ids = self.tokenizer.encode("[END]").ids
        return torch.tensor(start_ids + token_ids + end_ids, dtype=torch.long), len(token_ids)

    def __getitem__(self, idx):
        """Return ``(ua, ciphers, curves, ua_len, ciphers_len, curves_len, label)`` for row ``idx``."""
        row = self.rows[idx]
        ua, n_ua_tokens = self._encode_ua(row.ua)

        ciphers = torch.tensor(ciphers_vocab.lookup_indices(['<sos>']) + ciphers_vocab.lookup_indices(list(row.ciphers)) + ciphers_vocab.lookup_indices(['<eos>']), dtype=torch.long)
        curves = torch.tensor(curves_vocab.lookup_indices(['<sos>']) + curves_vocab.lookup_indices(list(row.curves)) + curves_vocab.lookup_indices(['<eos>']), dtype=torch.long)
        # The +2 accounts for the start / end markers added to every sequence.
        return ua, ciphers, curves, torch.tensor(n_ua_tokens + 2), torch.tensor(len(row.ciphers) + 2), torch.tensor(len(row.curves) + 2), torch.tensor(row.label, dtype=torch.float)
    
class TestDatasetFinal(torch.utils.data.Dataset):
    """Unlabelled dataset yielding user-agent, cipher and curve index sequences.

    Cipher / curve tokens are mapped with the module-level ``ciphers_vocab`` and
    ``curves_vocab`` objects; user-agent strings are encoded with the tokenizer
    passed to the constructor.

    Args:
        data: DataFrame with ``ua``, ``ciphers`` and ``curves`` columns.
            Its index is reset in place.
        tokenizer: Object whose ``encode(text).ids`` gives the token ids of ``text``.
        max_ua_tokens: Maximum number of user-agent tokens kept (before the
            ``[START]`` / ``[END]`` markers are added).
    """

    def __init__(self, data, tokenizer, max_ua_tokens=150):
        self.data = data
        self.data.reset_index(drop=True, inplace=True)
        self.tokenizer = tokenizer
        self.max_ua_tokens = max_ua_tokens
        # Materialise every row once so that item access is a plain list lookup.
        self.rows = [row for _, row in data.iterrows()]

    def __len__(self):
        return len(self.data)

    def _encode_ua(self, ua):
        """Encode a user-agent string with the tokenizer given to this dataset.

        Returns:
            ``(ids, n_tokens)``: long tensor ``[START] + tokens + [END]`` and the
            number of user-agent tokens kept after truncation.
        """
        # Use the instance tokenizer rather than a module-level global.
        token_ids = self.tokenizer.encode(ua).ids[:self.max_ua_tokens]
        start_ids = self.tokenizer.encode("[START]").ids
        end_ids = self.tokenizer.encode("[END]").ids
        return torch.tensor(start_ids + token_ids + end_ids, dtype=torch.long), len(token_ids)

    def __getitem__(self, idx):
        """Return ``(ua, ciphers, curves, ua_len, ciphers_len, curves_len)`` for row ``idx``."""
        row = self.rows[idx]
        ua, n_ua_tokens = self._encode_ua(row.ua)

        ciphers = torch.tensor(ciphers_vocab.lookup_indices(['<sos>']) + ciphers_vocab.lookup_indices(list(row.ciphers)) + ciphers_vocab.lookup_indices(['<eos>']), dtype=torch.long)
        curves = torch.tensor(curves_vocab.lookup_indices(['<sos>']) + curves_vocab.lookup_indices(list(row.curves)) + curves_vocab.lookup_indices(['<eos>']), dtype=torch.long)
        # The +2 accounts for the start / end markers added to every sequence.
        return ua, ciphers, curves, torch.tensor(n_ua_tokens + 2), torch.tensor(len(row.ciphers) + 2), torch.tensor(len(row.curves) + 2)

def collate_to_train_batch_final(batch):
    """Merge ``TrainDatasetFinal`` items into one padded batch.

    Args:
        batch: Sequence of ``(ua, ciphers, curves, ua_len, ciphers_len, curves_len, label)`` items.

    Returns:
        ``(ua, ciphers, curves, ua_lens, ciphers_lens, curves_lens, labels)``:
        the three token tensors are zero-padded by ``pad_sequence`` (default
        layout); lengths are ``long`` and labels ``float`` 1-D tensors.
    """
    pad = torch.nn.utils.rnn.pad_sequence
    ua = pad([sample[0] for sample in batch], padding_value=0)
    ciphers = pad([sample[1] for sample in batch], padding_value=0)
    curves = pad([sample[2] for sample in batch], padding_value=0)
    ua_lens = torch.tensor([sample[3] for sample in batch], dtype=torch.long)
    ciphers_lens = torch.tensor([sample[4] for sample in batch], dtype=torch.long)
    curves_lens = torch.tensor([sample[5] for sample in batch], dtype=torch.long)
    labels = torch.tensor([sample[6] for sample in batch], dtype=torch.float)
    return ua, ciphers, curves, ua_lens, ciphers_lens, curves_lens, labels

def collate_to_test_batch_final(batch):
    """Merge ``TestDatasetFinal`` items into one padded batch.

    Args:
        batch: Sequence of ``(ua, ciphers, curves, ua_len, ciphers_len, curves_len)`` items.

    Returns:
        ``(ua, ciphers, curves, ua_lens, ciphers_lens, curves_lens)``: the three
        token tensors are zero-padded by ``pad_sequence`` (default layout);
        lengths are ``long`` 1-D tensors.
    """
    pad = torch.nn.utils.rnn.pad_sequence
    ua = pad([sample[0] for sample in batch], padding_value=0)
    ciphers = pad([sample[1] for sample in batch], padding_value=0)
    curves = pad([sample[2] for sample in batch], padding_value=0)
    ua_lens = torch.tensor([sample[3] for sample in batch], dtype=torch.long)
    ciphers_lens = torch.tensor([sample[4] for sample in batch], dtype=torch.long)
    curves_lens = torch.tensor([sample[5] for sample in batch], dtype=torch.long)
    return ua, ciphers, curves, ua_lens, ciphers_lens, curves_lens

In [52]:
def train_epoch_final(dataloader, model, loss_fn, optimizer, device):
    """Run one optimisation pass of the classifier over ``dataloader``.

    Args:
        dataloader: Iterable of batches laid out as
            ``(ua, ciphers, curves, ua_lens, ciphers_lens, curves_lens, labels)``.
        model: Module called with the first six batch fields.
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar tensor.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
    """
    model.train()
    for batch in dataloader:
        fields = [batch[k].to(device) for k in range(7)]
        inputs, labels = fields[:6], fields[6]
        optimizer.zero_grad()
        loss = loss_fn(model(*inputs), labels)
        loss.backward()
        optimizer.step()
        
def evaluate_final(dataloader, model, loss_fn, device):
    """Evaluate the classifier on ``dataloader`` without gradients.

    Args:
        dataloader: Loader yielding batches laid out as
            ``(ua, ciphers, curves, ua_lens, ciphers_lens, curves_lens, labels)``;
            ``len(dataloader.dataset)`` is used as the number of samples.
        model: Module called with the first six batch fields.
        loss_fn: Callable ``loss_fn(outputs, labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        ``(loss, roc_auc)``: the per-batch loss weighted by batch size and
        divided by the dataset size, and the ROC-AUC of ``sigmoid(outputs)``
        against the labels.
    """
    model.eval()
    total_loss = 0.0
    # Per-batch arrays are collected and concatenated once after the loop,
    # instead of re-concatenating the growing arrays on every batch.
    pred_chunks = []
    true_chunks = []
    with torch.no_grad():
        for data in dataloader:
            ua = data[0].to(device)
            ciphers = data[1].to(device)
            curves = data[2].to(device)
            ua_lens = data[3].to(device)
            ciphers_lens = data[4].to(device)
            curves_lens = data[5].to(device)
            labels = data[6].to(device)
            outputs = model(ua, ciphers, curves, ua_lens, ciphers_lens, curves_lens)
            total_loss += loss_fn(outputs, labels) * len(outputs)
            pred_chunks.append(torch.sigmoid(outputs).cpu().numpy())
            true_chunks.append(labels.cpu().numpy())

    y_pred = np.concatenate(pred_chunks)
    y_true = np.concatenate(true_chunks)
    return total_loss / len(dataloader.dataset), roc_auc_score(y_true, y_pred)

def train_final(train_loader, test_loader, model, loss_fn, optimizer, scheduler, device, num_epochs):
    """Train the classifier and track loss / ROC-AUC on both loaders after every epoch.

    The scheduler is stepped once per epoch, right after the training pass.

    Returns:
        ``(train_losses, train_accuracies, test_losses, test_accuracies)``, one
        entry per epoch; the "accuracies" lists hold the ROC-AUC values
        returned by ``evaluate_final``.
    """
    train_losses, train_accuracies = [], []
    test_losses, test_accuracies = [], []
    for epoch in range(num_epochs):
        train_epoch_final(train_loader, model, loss_fn, optimizer, device)
        scheduler.step()

        train_loss, train_auc = evaluate_final(train_loader, model, loss_fn, device)
        train_accuracies.append(train_auc)
        train_losses.append(train_loss)

        test_loss, test_auc = evaluate_final(test_loader, model, loss_fn, device)
        test_accuracies.append(test_auc)
        test_losses.append(test_loss)

        report = 'Epoch: {0:d}/{1:d}. Loss (Train/Test): {2:.3f}/{3:.3f}. ROC-AUC (Train/Test): {4:.3f}/{5:.3f}'.format(
            epoch + 1, num_epochs, train_loss, test_loss, train_auc, test_auc
        )
        print(report)
    return train_losses, train_accuracies, test_losses, test_accuracies

def train_submission(train_loader, test_dataloader, model, loss_fn, optimizer, scheduler, device, num_epochs):
    """Train the classifier for ``num_epochs`` and return its outputs on the test loader.

    Args:
        train_loader: Loader of labelled batches used for training and per-epoch evaluation.
        test_dataloader: Iterable of unlabelled batches laid out as
            ``(ua, ciphers, curves, ua_lens, ciphers_lens, curves_lens)``.
        model: Module called with the six batch fields.
        loss_fn: Loss passed to the training / evaluation helpers.
        optimizer: Optimizer passed to the training helper.
        scheduler: Learning-rate scheduler stepped once per epoch.
        device: Device the batch tensors are moved to.
        num_epochs: Number of training epochs.

    Returns:
        The raw model outputs (no sigmoid applied) for all test batches,
        concatenated along dim 0; an empty tensor when the loader yields nothing.
    """
    for epoch in range(num_epochs):
        train_epoch_final(train_loader, model, loss_fn, optimizer, device)
        train_loss, train_acc = evaluate_final(train_loader, model, loss_fn, device)
        scheduler.step()
        print('Epoch: {0:d}/{1:d}. Loss (Train): {2:.3f}. ROC-AUC (Train): {3:.3f}'.format(epoch + 1, num_epochs, train_loss, train_acc))

    model.eval()
    # Collect per-batch outputs and concatenate once, instead of growing the
    # result with torch.cat on every batch.
    predictions = []
    with torch.no_grad():
        for data in test_dataloader:
            ua, ciphers, curves, ua_lens, ciphers_lens, curves_lens = (
                data[k].to(device) for k in range(6)
            )
            predictions.append(model(ua, ciphers, curves, ua_lens, ciphers_lens, curves_lens))
    if not predictions:
        # An empty loader used to fail with UnboundLocalError.
        return torch.empty(0, device=device)
    return torch.cat(predictions, dim=0)

In [54]:
# Restore the four pre-trained autoencoders from their checkpoints.
# map_location remaps the stored tensors onto the current device, so checkpoints
# written on a GPU machine also load when only the CPU is available.
autoencoder_cipher = AutoEncoderCipher(embedding_dim, hidden_dim, ciphers_vocab, device).to(device=device)
autoencoder_cipher.load_state_dict(torch.load('cipher_new_weights16', map_location=device))
autoencoder_curves = AutoEncoderCurves(embedding_dim, hidden_dim, curves_vocab, device).to(device=device)
autoencoder_curves.load_state_dict(torch.load('curves_new_weights19', map_location=device))
autoencoder_cipher_back = AutoEncoderCipherBack(embedding_dim, hidden_dim, ciphers_vocab, device).to(device=device)
autoencoder_cipher_back.load_state_dict(torch.load('cipher_back_weights5', map_location=device))
autoencoder_curves_back = AutoEncoderCurvesBack(embedding_dim, hidden_dim, curves_vocab, device).to(device=device)
autoencoder_curves_back.load_state_dict(torch.load('curves_back_weights15', map_location=device))

<All keys matched successfully>

In [None]:
class RNNFinal(torch.nn.Module):
    """Classifier head over ciphers, curves and ``ua`` token sequences.

    Each of the three inputs is processed twice: once in the given order and
    once flipped along dim 0. Cipher and curve embeddings are taken from the
    supplied autoencoders (``ciphers_embs`` / ``curves_embs``); ``ua``
    embeddings are owned by this module. The six resulting feature vectors are
    concatenated and passed through a three-layer MLP producing one logit per
    sample.

    Args:
        embedding_dim: embedding size of the cipher/curve inputs.
        hidden_dim: hidden size of the cipher/curve recurrent layers and MLP.
        embedding_dim_ua: embedding size of the ``ua`` tokens.
        hidden_dim_ua: hidden size of the ``ua`` recurrent layers.
        ciphers_vocab: accepted for interface compatibility; not used here.
        curves_vocab: accepted for interface compatibility; not used here.
        autoencoder_cipher: module exposing ``ciphers_embs`` (forward pass).
        autoencoder_curves: module exposing ``curves_embs`` (forward pass).
        autoencoder_cipher_back: module exposing ``ciphers_embs`` (flipped pass).
        autoencoder_curves_back: module exposing ``curves_embs`` (flipped pass).
        ua_vocab_size: number of rows in the two ``ua`` embedding tables;
            index 0 is the padding index.
    """

    def __init__(self, embedding_dim, hidden_dim, embedding_dim_ua, hidden_dim_ua, ciphers_vocab, curves_vocab, autoencoder_cipher, autoencoder_curves, autoencoder_cipher_back, autoencoder_curves_back, ua_vocab_size=1000):
        super().__init__()
        # lstm1..3 read the sequences as given, lstm4..6 read them flipped.
        self.lstm1 = FastRNNLayer(embedding_dim, hidden_dim, 0.25)
        self.lstm2 = FastRNNLayer(embedding_dim, hidden_dim, 0.25)
        self.lstm3 = FastRNNLayer(embedding_dim_ua, hidden_dim_ua, 0.25)
        self.lstm4 = FastRNNLayer(embedding_dim, hidden_dim, 0.25)
        self.lstm5 = FastRNNLayer(embedding_dim, hidden_dim, 0.25)
        self.lstm6 = FastRNNLayer(embedding_dim_ua, hidden_dim_ua, 0.25)
        # Four cipher/curve feature vectors plus two ua feature vectors.
        self.linear1 = torch.nn.Linear(4 * hidden_dim + 2 * hidden_dim_ua, hidden_dim)
        self.linear2 = torch.nn.Linear(hidden_dim, hidden_dim)
        self.linear3 = torch.nn.Linear(hidden_dim, 1)
        self.relu1 = torch.nn.ReLU()
        self.relu2 = torch.nn.ReLU()
        self.autoencoder_cipher = autoencoder_cipher
        self.autoencoder_curves = autoencoder_curves
        self.autoencoder_cipher_back = autoencoder_cipher_back
        self.autoencoder_curves_back = autoencoder_curves_back
        self.ua_embs = torch.nn.Embedding(num_embeddings=ua_vocab_size, embedding_dim=embedding_dim_ua, padding_idx=0)
        self.ua_embs_back = torch.nn.Embedding(num_embeddings=ua_vocab_size, embedding_dim=embedding_dim_ua, padding_idx=0)

    @staticmethod
    def _last_valid_step(outputs, lens):
        """Pick ``outputs[lens[b] - 1, b, :]`` for every sample ``b`` (dim 1)."""
        sample_idx = torch.arange(outputs.shape[1], device=lens.device)
        return outputs[lens - 1, sample_idx, :]

    def forward(self, ua, ciphers, curves, ua_lens, ciphers_lens, curves_lens):
        """Return one logit per sample as a 1-D tensor.

        The token tensors are indexed as (step, sample) after embedding, and
        the ``*_lens`` tensors give, per sample, the number of steps whose
        output should be read from the forward pass.
        """
        ciphers_embeddings = self.autoencoder_cipher.ciphers_embs(ciphers)
        curves_embeddings = self.autoencoder_curves.curves_embs(curves)
        ciphers_embeddings_back = self.autoencoder_cipher_back.ciphers_embs(ciphers)
        curves_embeddings_back = self.autoencoder_curves_back.curves_embs(curves)
        ua_embeddings = self.ua_embs(ua)
        ua_embeddings_back = self.ua_embs_back(ua)

        output1 = self.lstm1(ciphers_embeddings)[0]
        output2 = self.lstm2(curves_embeddings)[0]
        output3 = self.lstm3(ua_embeddings)[0]
        # Only the final hidden state of the flipped passes is used.
        _, (h1, _) = self.lstm4(torch.flip(ciphers_embeddings_back, (0, )))
        _, (h2, _) = self.lstm5(torch.flip(curves_embeddings_back, (0, )))
        _, (h3, _) = self.lstm6(torch.flip(ua_embeddings_back, (0, )))

        batch_size = output1.shape[1]
        # reshape(batch, -1) drops the singleton dims like squeeze() did, but
        # keeps the batch axis when the batch holds a single sample.
        features = torch.cat(
            (
                self._last_valid_step(output1, ciphers_lens),
                self._last_valid_step(output2, curves_lens),
                self._last_valid_step(output3, ua_lens),
                h1.reshape(batch_size, -1),
                h2.reshape(batch_size, -1),
                h3.reshape(batch_size, -1),
            ),
            1,
        )
        logits = self.linear3(self.relu2(self.linear2(self.relu1(self.linear1(features)))))
        # Drop only the trailing size-1 feature dim so the result is always 1-D.
        return logits.squeeze(-1)

In [None]:
# Train one RNNFinal per stratified fold (on that fold's training split only)
# and sum the sigmoid test predictions across folds in total_submission.
initial_lr, final_lr = 1e-3, 5e-5
skf = StratifiedKFold(n_splits=5, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(TestDatasetFinal(test_dataset, tokenizer), batch_size=batch_size, collate_fn=collate_to_test_batch_final, pin_memory=False, shuffle=False)
total_submission = None
for i, (train_index, _holdout_index) in enumerate(skf.split(train_dataset.curves.values, train_dataset.label.values)):
    # skf.split yields positional indices, so rows are selected with iloc.
    train_dataloader = torch.utils.data.DataLoader(TrainDatasetFinal(train_dataset.iloc[train_index], tokenizer), batch_size=batch_size, collate_fn=collate_to_train_batch_final, pin_memory=False, shuffle=True)
    # Fresh pretrained autoencoders per fold so folds do not share fine-tuned
    # weights; map_location makes GPU-saved checkpoints loadable on any device.
    autoencoder_cipher = AutoEncoderCipher(embedding_dim, hidden_dim, ciphers_vocab, device).to(device=device)
    autoencoder_cipher.load_state_dict(torch.load('cipher_new_weights16', map_location=device))
    autoencoder_curves = AutoEncoderCurves(embedding_dim, hidden_dim, curves_vocab, device).to(device=device)
    autoencoder_curves.load_state_dict(torch.load('curves_new_weights19', map_location=device))
    # The backward cipher encoder keeps its own class and weights
    # (AutoEncoderCipherBack), mirroring the backward curves encoder below.
    autoencoder_cipher_back = AutoEncoderCipherBack(embedding_dim, hidden_dim, ciphers_vocab, device).to(device=device)
    autoencoder_cipher_back.load_state_dict(torch.load('cipher_back_weights5', map_location=device))
    autoencoder_curves_back = AutoEncoderCurvesBack(embedding_dim, hidden_dim, curves_vocab, device).to(device=device)
    autoencoder_curves_back.load_state_dict(torch.load('curves_back_weights15', map_location=device))
    model = RNNFinal(embedding_dim, hidden_dim, embedding_dim_ua, hidden_dim_ua, ciphers_vocab, curves_vocab, autoencoder_cipher, autoencoder_curves, autoencoder_cipher_back, autoencoder_curves_back).to(device)
    loss_fn = torch.nn.BCEWithLogitsLoss(reduction='mean')
    optimizer = torch.optim.AdamW(model.parameters(), lr=initial_lr)
    # Per-epoch decay factor chosen so the rate goes from initial_lr to
    # final_lr over num_epochs scheduler steps.
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=(final_lr / initial_lr) ** (1 / num_epochs))
    submission = train_submission(train_dataloader, test_dataloader, model, loss_fn, optimizer, scheduler, device, num_epochs)
    # The model emits logits; convert to probabilities before averaging.
    submission = torch.sigmoid(submission).detach().cpu().numpy()
    if total_submission is None:
        total_submission = submission
    else:
        total_submission += submission

In [None]:
# Average the summed per-fold probabilities and write the submission file.
my_submission = pd.read_csv('sample_submission.csv')
# Divide by the splitter's own fold count so it cannot drift from n_splits.
submission = total_submission / skf.get_n_splits()
# Item assignment always writes the column; attribute assignment would only
# set a plain Python attribute if 'is_bot' were missing from the sample file.
my_submission['is_bot'] = submission
my_submission.to_csv("submission_final.csv", index=False)