# Imports

Если версия numpy отличается, то может возникнуть ошибка при обучении. Нужные версии есть в requirements.txt

In [None]:
import os
import random
import re
from itertools import chain
from pathlib import Path
from typing import List, Tuple

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from lightning import LightningModule, Trainer
from lightning.pytorch.callbacks import ModelCheckpoint
from madgrad import MADGRAD
from sklearn.model_selection import StratifiedKFold
from tokenizers.models import BPE
from tokenizers.normalizers import Lowercase
from tokenizers.trainers import BpeTrainer
from torchmetrics import AUROC, MaxMetric, MeanMetric

from tokenizers import Tokenizer


def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    The seed is also exported through the ``PYTHONHASHSEED`` environment
    variable of the current process.

    Args:
        seed: Integer seed shared by all generators. Defaults to 42.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)


# Accelerator name handed to the Lightning ``Trainer`` in the training cell.
device = "cuda"

# Load the train and test tables from local parquet files (paths are relative
# to the working directory).
train = pd.read_parquet("data/train.parquet")
test = pd.read_parquet("data/test.parquet")


In [None]:
# Preview the first rows of the training table.
train.head()

# TOKENIZER

In [None]:
# Build one space-separated string per row for the cipher and curve lists;
# these "*_str" columns are what the tokenizers are trained on.
train["ciphers_str"] = train["ciphers"].apply(" ".join)
train["curves_str"] = train["curves"].apply(" ".join)

# In the test table the two list columns are first rebuilt from their string
# form: drop the first three and last two characters, remove double quotes,
# then split on commas.
# NOTE(review): the slice offsets presumably strip the serialized brackets and
# quotes of the stored value - confirm against the raw test file.
for list_column in ("ciphers", "curves"):
    raw_text = test[list_column].astype(str)
    unquoted = raw_text.str.slice(3, -2).str.replace('"', "")
    test[list_column] = unquoted.str.split(",")

test["ciphers_str"] = test["ciphers"].apply(" ".join)
test["curves_str"] = test["curves"].apply(" ".join)


In [None]:
def clean_ua(data):
    """
    Adds a cleaned "ua_proc" column derived from the "ua" column.

    The value is truncated at the first double quote and the characters
    ``( ) / ; ,`` are removed from what remains. Previously the character
    removal was computed and then overwritten by the truncation of the raw
    column, so it never took effect; both steps are now applied.

    Args:
    -   data: A pandas dataframe with a string column "ua". It is modified
        in place.

    Returns:
    -    The same dataframe, with the "ua_proc" column added or replaced.
    """
    # Keep only the part before the first double quote (the whole value when
    # there is no quote).
    before_quote = data["ua"].str.partition('"')[0]
    # Strip the punctuation on the truncated text so both cleaning steps
    # contribute to the final column.
    data["ua_proc"] = before_quote.str.replace(r"[()/;,]", "", regex=True)
    return data


def _train_bpe_tokenizer(texts, min_frequency, vocab_size):
    """Train a lowercasing BPE tokenizer with padding enabled on the given texts."""
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    tokenizer.normalizer = Lowercase()

    trainer = BpeTrainer(
        special_tokens=["[PAD]", "[UNK]", " "],
        min_frequency=min_frequency,
        vocab_size=vocab_size,
    )

    tokenizer.train_from_iterator(texts, trainer=trainer)
    # Padding is switched on with the library defaults so that batches encoded
    # later come out with equal length.
    tokenizer.enable_padding()

    return tokenizer


def _column_texts(train, test, column):
    """Return the values of one column from train followed by test, formatted as strings."""
    return [f"{value}" for value in chain(train[column], test[column])]


def get_tokenizer_curves(
    train: pd.DataFrame, test: pd.DataFrame, min_frequency=10, vocab_size=2048
):
    """
    Trains a Tokenizer model on the "curves_str" column of the given train and test dataframes.

    Args:
    -    train: A pandas dataframe containing the training data.
    -    test: A pandas dataframe containing the testing data.
    -    min_frequency: Minimum frequency of subwords. Defaults to 10.
    -    vocab_size: Size of the final vocabulary. Defaults to 2048.

    Returns:
    -    A trained Tokenizer model.
    """
    return _train_bpe_tokenizer(
        _column_texts(train, test, "curves_str"), min_frequency, vocab_size
    )


def get_tokenizer_ua(
    train: pd.DataFrame, test: pd.DataFrame, min_frequency=10, vocab_size=2048
):
    """
    Trains a Tokenizer model on the cleaned user-agent column of the given train and test dataframes.

    Both dataframes are first passed through ``clean_ua``, which adds the
    "ua_proc" column to them in place; the tokenizer is trained on that column.

    Args:
    -    train: A pandas dataframe containing the training data.
    -    test: A pandas dataframe containing the testing data.
    -    min_frequency: Minimum frequency of subwords. Defaults to 10.
    -    vocab_size: Size of the final vocabulary. Defaults to 2048.

    Returns:
    -    A trained Tokenizer model.
    """
    clean_ua(train)
    clean_ua(test)

    return _train_bpe_tokenizer(
        _column_texts(train, test, "ua_proc"), min_frequency, vocab_size
    )


def get_tokenizer_ciphers(
    train: pd.DataFrame, test: pd.DataFrame, min_frequency=10, vocab_size=2048
):
    """
    Trains a Tokenizer model on the "ciphers_str" column of the given train and test dataframes.

    Args:
    -    train: A pandas dataframe containing the training data.
    -    test: A pandas dataframe containing the testing data.
    -    min_frequency: Minimum frequency of subwords. Defaults to 10.
    -    vocab_size: Size of the final vocabulary. Defaults to 2048.

    Returns:
    -    A trained Tokenizer model.
    """
    return _train_bpe_tokenizer(
        _column_texts(train, test, "ciphers_str"), min_frequency, vocab_size
    )


In [None]:
# Train one BPE tokenizer per text feature on train and test combined.
# get_tokenizer_ua also adds the "ua_proc" column to both frames (via
# clean_ua); the dataset classes read that column later.
tokenizer_ciphers = get_tokenizer_ciphers(train, test)
tokenizer_curves = get_tokenizer_curves(train, test)
tokenizer_ua = get_tokenizer_ua(train, test)

# Padding id and vocabulary size of each tokenizer; these are passed to the
# model constructor in the training cell.
CIPHERS_PADDING_IDX = tokenizer_ciphers.token_to_id("[PAD]")
CIPHERS_VOCAB_SIZE = tokenizer_ciphers.get_vocab_size()

CURVES_PADDING_IDX = tokenizer_curves.token_to_id("[PAD]")
CURVES_VOCAB_SIZE = tokenizer_curves.get_vocab_size()

UA_PADDING_IDX = tokenizer_ua.token_to_id("[PAD]")
UA_VOCAB_SIZE = tokenizer_ua.get_vocab_size()













# DATASET

In [None]:
class TrainDataset(torch.utils.data.Dataset):
    """Map-style dataset returning the three text features and the label of a row.

    Args:
        data: Dataframe with the columns "ciphers_str", "curves_str",
            "ua_proc" and "label". Its index is reset in place so that the
            integer positions 0..len-1 used by a DataLoader are valid labels.
    """

    # Columns returned by __getitem__, in output order.
    _COLUMNS = ("ciphers_str", "curves_str", "ua_proc", "label")

    def __init__(self, data: pd.DataFrame) -> None:
        self.data = data
        self.data.reset_index(drop=True, inplace=True)

    def __len__(self) -> int:
        """Return the number of rows."""
        return len(self.data)

    def __getitem__(self, idx: int) -> Tuple[str, str, str, int]:
        """Return ``(ciphers_str, curves_str, ua_proc, label)`` for row ``idx``."""
        # Scalar lookups avoid materializing a whole row as a Series per item.
        return tuple(self.data.at[idx, column] for column in self._COLUMNS)


class TestDataset(torch.utils.data.Dataset):
    """Map-style dataset returning the three text features of a row (no label).

    Args:
        data: Dataframe with the columns "ciphers_str", "curves_str" and
            "ua_proc". Its index is reset in place so that the integer
            positions 0..len-1 used by a DataLoader are valid labels.
    """

    # Columns returned by __getitem__, in output order.
    _COLUMNS = ("ciphers_str", "curves_str", "ua_proc")

    def __init__(self, data: pd.DataFrame) -> None:
        self.data = data
        self.data.reset_index(drop=True, inplace=True)

    def __len__(self) -> int:
        """Return the number of rows."""
        return len(self.data)

    def __getitem__(self, idx: int) -> Tuple[str, str, str]:
        """Return ``(ciphers_str, curves_str, ua_proc)`` for row ``idx``."""
        # Scalar lookups avoid materializing a whole row as a Series per item.
        return tuple(self.data.at[idx, column] for column in self._COLUMNS)


In [None]:
def tokenize(texts: List[str], tokenizer: "Tokenizer") -> torch.Tensor:
    """
    Tokenizes a batch of texts with the given tokenizer model.

    Args:
    -    texts: A sequence of strings to tokenize (a list or a tuple).
    -    tokenizer: A trained tokenizer object exposing ``encode_batch``. It
         must produce equal-length encodings (e.g. padding enabled), otherwise
         the ids cannot be stacked into one tensor.

    Returns:
    -    An integer (``torch.long``) tensor with one row of token ids per text.
    """
    encodings = tokenizer.encode_batch(list(texts), add_special_tokens=True)
    # The dtype is explicit so that an empty batch still yields an integer
    # tensor instead of the default float one.
    return torch.tensor([encoding.ids for encoding in encodings], dtype=torch.long)


def collate_to_train_batch(
    batch: List[Tuple[str, str, str, int]]
) -> Tuple[Tuple[torch.Tensor, torch.Tensor, torch.Tensor], torch.Tensor]:
    """
    Collate a batch of training samples.

    Args:
    -    batch: a list of ``(ciphers, curves, ua, label)`` tuples.

    Returns:
    -    A pair ``(features, labels)``: ``features`` is a tuple of the three
         token-id tensors (ciphers, curves, ua) and ``labels`` is a float
         tensor with one column.
    """
    ciphers, curves, ua, labels = zip(*batch)

    features = (
        tokenize(ciphers, tokenizer=tokenizer_ciphers),
        tokenize(curves, tokenizer=tokenizer_curves),
        tokenize(ua, tokenizer=tokenizer_ua),
    )
    # Float column vector, the target layout the training loss is applied to.
    label_tensor = torch.tensor(labels, dtype=torch.float32).view(-1, 1)

    return features, label_tensor


def collate_to_test_batch(
    batch: List[Tuple[str, str, str]]
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Collate a batch of test samples (no labels).

    Args:
    -    batch: a list of ``(ciphers, curves, ua)`` tuples.

    Returns:
    -    A tuple with the token-id tensors for ciphers, curves and ua, in that
         order.
    """
    cipher_texts, curve_texts, ua_texts = zip(*batch)

    return (
        tokenize(cipher_texts, tokenizer=tokenizer_ciphers),
        tokenize(curve_texts, tokenizer=tokenizer_curves),
        tokenize(ua_texts, tokenizer=tokenizer_ua),
    )


# MODELS

In [None]:
class PositionalEncoding(nn.Module):
    r"""Add fixed sinusoidal position information to token embeddings.

    The encoding has the same width as the embeddings, so the two are summed.
    Even feature indices use a sine and odd ones a cosine of the position
    scaled by a per-feature frequency:

    .. math::
        \text{PosEncoder}(pos, 2i) = \sin(pos / 10000^{2i / d_{model}})

        \text{PosEncoder}(pos, 2i+1) = \cos(pos / 10000^{2i / d_{model}})

    where ``pos`` is the position in the sequence and ``i`` indexes feature pairs.

    Args:
        d_model: the embed dim (required).
        dropout: the dropout value (default=0.1).
        max_len: the max. length of the incoming sequence (default=5000).
    Examples:
        >>> pos_encoder = PositionalEncoding(d_model)
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        self.max_len = max_len

        positions = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        inv_freq = torch.exp(
            torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model)
        )
        angles = positions * inv_freq

        table = torch.zeros(max_len, d_model)
        table[:, 0::2] = torch.sin(angles)
        table[:, 1::2] = torch.cos(angles)
        # Stored as (max_len, 1, d_model) under the name "pe"; as a buffer it
        # follows the module across devices and is part of the state dict.
        self.register_buffer("pe", table.unsqueeze(1))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        r"""Add the positional table to ``x`` and apply dropout.

        Args:
            x: the sequence fed to the positional encoder model (required).
        Shape:
            x: [batch size, sequence length, embed dim]
            output: [batch size, sequence length, embed dim]
        Examples:
            >>> output = pos_encoder(x)
        """
        # Take the first `sequence length` rows and move the singleton axis to
        # the front so the table broadcasts over the batch dimension.
        encoding = self.pe[: x.size(1)].transpose(0, 1)
        return self.dropout(x + encoding)


class TransformerModel(nn.Module):
    """Token embedding, positional encoding, Transformer encoder stack and a linear head.

    Args:
        ntoken (int): vocabulary size (num_embeddings in Embedding)
        ninp (int): embedding size (embedding_dim in Embedding)
        nhead (int): number of heads in TransformerEncoderLayer
        nhid (int): feedforward size in TransformerEncoderLayer
        noutp (int): output size
        nlayers (int): number of TransformerEncoderLayer
        padding_idx (int): padding token
        dropout (float, optional): dropout for TransformerEncoderLayer. Defaults to 0.5.
        pos_max_len (int, optional): maximum sequence length for positional encoding. Defaults to 700.

    Raises:
        ImportError: if the Transformer encoder classes cannot be imported
            from ``torch.nn``.
    """

    def __init__(
        self,
        ntoken: int,
        ninp: int,
        nhead: int,
        nhid: int,
        noutp: int,
        nlayers: int,
        padding_idx: int,
        dropout: float = 0.5,
        pos_max_len: int = 700,
    ):
        super().__init__()
        try:
            from torch.nn import TransformerEncoder, TransformerEncoderLayer
        except ImportError as e:
            # Only a failed import is translated; anything else (for example
            # KeyboardInterrupt) propagates unchanged.
            raise ImportError(
                "TransformerEncoder module does not exist in PyTorch 1.1 or lower."
            ) from e
        self.model_type = "Transformer"
        self.src_mask = None
        # The creation order of the submodules is kept as-is so that random
        # initialization consumes the RNG in the same sequence.
        self.pos_encoder = PositionalEncoding(ninp, dropout, pos_max_len)
        encoder_layers = TransformerEncoderLayer(
            ninp, nhead, nhid, dropout, batch_first=True
        )
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, ninp, padding_idx=padding_idx)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, noutp)

        self.init_weights()

    def _generate_square_subsequent_mask(self, sz: int) -> torch.Tensor:
        """Return a ``sz x sz`` float mask: 0 on and below the diagonal, ``-inf`` above it."""
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = (
            mask.float()
            .masked_fill(mask == 0, float("-inf"))
            .masked_fill(mask == 1, float(0.0))
        )
        return mask

    def init_weights(self):
        """Re-initialize the embedding and the output layer uniformly in [-0.1, 0.1]; zero the output bias."""
        initrange = 0.1
        nn.init.uniform_(self.encoder.weight, -initrange, initrange)
        nn.init.zeros_(self.decoder.bias)
        nn.init.uniform_(self.decoder.weight, -initrange, initrange)

    def forward(self, src: torch.Tensor, has_mask: bool = False) -> torch.Tensor:
        r"""Encode a batch of token-id sequences.

        Args:
            src: the sequence for processing (token ids).
            has_mask: when True, a causal (subsequent-position) attention mask
                is applied; when False no attention mask is used.
        Shape:
            src: [batch size, sequence length]
            output: [batch size, sequence length, noutp]
        """
        if has_mask:
            # The encoder is batch-first, so the mask must be square in the
            # sequence length (dimension 1), not in the batch size.
            seq_len = src.size(1)
            if (
                self.src_mask is None
                or self.src_mask.size(0) != seq_len
                or self.src_mask.device != src.device
            ):
                self.src_mask = self._generate_square_subsequent_mask(seq_len).to(
                    src.device
                )
        else:
            self.src_mask = None

        # Scale the embeddings by sqrt(embedding size) before adding positions.
        src = self.encoder(src) * np.sqrt(self.ninp)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, self.src_mask)
        output = self.decoder(output)
        return output


In [None]:
class SubModel(nn.Module):
    """Transformer branch that turns one tokenized input field into a fixed-size vector.

    Args:
        padding_idx (int): padding token
        vocab_size (int): vocabulary size
        embed_size (int): embedding size
        hidden_size (int): output size
        pos_max_len (int): maximum sequence length for positional encoding
        dropout (float): dropout for Transformer
    """

    def __init__(
        self,
        padding_idx: int,
        vocab_size: int,
        embed_size: int,
        hidden_size: int,
        pos_max_len: int,
        dropout: float,
    ) -> None:
        super().__init__()

        # Heads, feed-forward width and depth are fixed for every branch.
        transformer_kwargs = dict(
            ntoken=vocab_size,
            ninp=embed_size,
            nhead=2,
            nhid=256,
            noutp=hidden_size,
            nlayers=3,
            padding_idx=padding_idx,
            dropout=dropout,
            pos_max_len=pos_max_len,
        )
        self.transformer = TransformerModel(**transformer_kwargs)

    def get_embed(self, tensor: torch.Tensor) -> torch.Tensor:
        """Pool the per-token transformer outputs over the sequence axis.

        Returns the element-wise maximum over positions minus the mean over
        positions, one vector per batch item.
        """
        token_states = self.transformer(tensor)
        seq_len = token_states.size(1)
        # A pooling window spanning the full sequence yields the max over
        # positions, shaped [batch size, 1, hidden_size].
        pooled_max = nn.functional.max_pool2d(token_states, kernel_size=(seq_len, 1))
        pooled_mean = token_states.mean(dim=1)
        return pooled_max.squeeze(dim=1) - pooled_mean

    def forward(self, tensor: torch.Tensor) -> torch.Tensor:
        r"""Inputs of forward function
        Args:
            tensor: the token-id sequences for processing.
        Shape:
            tensor: [batch size, sequence length]
            output: [batch size, hidden_size]
        """
        return self.get_embed(tensor)


In [None]:
class Model(nn.Module):
    """Binary classifier over three tokenized fields: ciphers, curves and user agent.

    Each field is embedded by its own ``SubModel``; the three vectors are
    concatenated and passed through a small fully connected head that outputs
    one logit per sample.
    """

    def __init__(
        self,
        vocab_ciphers: int,
        vocab_curves: int,
        vocab_ua: int,
        length_ciphers: int,
        length_curves: int,
        length_ua: int,
        hidden_size: int,
        pad_ciphers: int,
        pad_curves: int,
        pad_ua: int,
    ):
        """Main Model

        Args:
            vocab_ciphers (int): vocabulary size for ciphers
            vocab_curves (int): vocabulary size for curves
            vocab_ua (int): vocabulary size for ua
            length_ciphers (int): maximum sequence length for positional encoding for ciphers
            length_curves (int): maximum sequence length for positional encoding for curves
            length_ua (int): maximum sequence length for positional encoding for ua
            hidden_size (int): hidden size for fc
            pad_ciphers (int): padding token for ciphers
            pad_curves (int): padding token for curves
            pad_ua (int): padding token for ua
        """
        super().__init__()
        self.embed_ciphers = SubModel(
            vocab_size=vocab_ciphers,
            embed_size=128,
            hidden_size=64,
            padding_idx=pad_ciphers,
            pos_max_len=length_ciphers,
            dropout=0.1,
        )  # outputs the ciphers embedding
        self.embed_curves = SubModel(
            vocab_size=vocab_curves,
            embed_size=128,
            hidden_size=64,
            padding_idx=pad_curves,
            pos_max_len=length_curves,
            dropout=0.1,
        )  # outputs the curves embedding
        self.embed_ua = SubModel(
            vocab_size=vocab_ua,
            embed_size=128,
            hidden_size=64,
            padding_idx=pad_ua,
            pos_max_len=length_ua,
            dropout=0.1,
        )  # outputs the user-agent embedding

        # The input width is 64 * 3 because the three 64-wide branch outputs
        # are concatenated in forward().
        self.fc1 = nn.Sequential(
            nn.Linear(64 * 3, hidden_size),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(),
            nn.Dropout(p=0.1),
            nn.Linear(hidden_size, hidden_size // 2),
            nn.BatchNorm1d(hidden_size // 2),
        )  # outputs the final TLS embedding
        self.activation = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size // 2, 1)  # final classifier

    def forward(
        self, x: Tuple[torch.Tensor, torch.Tensor, torch.Tensor]
    ) -> torch.Tensor:
        r"""Inputs of forward function
        Args:
            x: the three token-id tensors, indexed as (ciphers, curves, ua).
        Shape:
            x[i]: [batch size, sequence length]
            output: [batch size, 1] (a raw logit; no sigmoid is applied here)
        """
        embed_ciphers = self.embed_ciphers(x[0])
        embed_curves = self.embed_curves(x[1])
        embed_ua = self.embed_ua(x[2])
        # Concatenate the branch vectors along the feature axis.
        embed = torch.cat((embed_ciphers, embed_curves, embed_ua), dim=1)

        outp = self.fc1(embed)
        outp = self.fc2(self.activation(outp))
        return outp


In [None]:
class LitModule(LightningModule):
    """Lightning wrapper that trains a logit-producing network with weighted BCE.

    Tracks the mean loss and binary ROC AUC for the training and validation
    phases, plus the best validation ROC AUC seen so far.

    Args:
        net: The network to train. It is called on the feature part of a
            batch and must return one logit per sample, shaped [batch, 1].
    """

    def __init__(
        self,
        net: torch.nn.Module,
    ):
        super().__init__()
        # Records the constructor arguments (here: ``net``) as hyperparameters
        # without sending them to the logger.
        # NOTE(review): load_from_checkpoint is called without arguments in the
        # submission cell, which presumably relies on this - confirm.
        self.save_hyperparameters(logger=False)

        self.net = net
        self.criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([0.5]))

        self.train_rocauc = AUROC(
            task="binary",
        )
        self.val_rocauc = AUROC(
            task="binary",
        )
        self.train_loss = MeanMetric()
        self.val_loss = MeanMetric()
        self.val_rocauc_best = MaxMetric()

    def configure_optimizers(self) -> torch.optim.Optimizer:
        """Create the MADGRAD optimizer and an exponential LR scheduler.

        Only the optimizer is returned; the scheduler is kept on ``self.sch``
        and stepped manually in ``on_train_epoch_end``.
        """
        optim = MADGRAD(self.parameters(), lr=0.0001)
        self.sch = torch.optim.lr_scheduler.ExponentialLR(optim, gamma=0.9)
        return optim

    def forward(self, x: torch.Tensor):
        """Return the wrapped network's logits for ``x``."""
        return self.net(x)

    def on_train_start(self):
        """Reset the validation metrics when training starts."""
        self.val_loss.reset()
        self.val_rocauc.reset()
        self.val_rocauc_best.reset()

    def model_step(self, batch):
        """Run one forward pass and compute the loss.

        Returns:
            ``(loss, preds, targets)`` where ``preds`` are sigmoid
            probabilities and ``targets`` are integer labels, both 1-D with
            one entry per sample.
        """
        x, y = batch
        logits = self.forward(x)
        loss = self.criterion(logits, y)
        # Drop only the trailing size-1 axis: a bare squeeze() would also
        # collapse the batch axis when a batch holds a single sample.
        preds = torch.sigmoid(logits).squeeze(dim=-1)
        # The loss needs float targets, but the binary classification metrics
        # are fed integer class labels.
        targets = y.squeeze(dim=-1).long()
        return loss, preds, targets

    def training_step(self, batch):
        """Compute the training loss and log epoch-level loss and ROC AUC."""
        loss, preds, targets = self.model_step(batch)

        self.train_loss(loss)
        self.train_rocauc(preds, targets)
        self.log(
            "train_loss", self.train_loss, on_step=False, on_epoch=True, prog_bar=True
        )
        self.log(
            "train_rocauc",
            self.train_rocauc,
            on_step=False,
            on_epoch=True,
            prog_bar=True,
        )

        return loss

    def on_train_epoch_end(self):
        """Advance the learning-rate scheduler once per training epoch."""
        self.sch.step()

    def validation_step(self, batch, batch_idx):
        """Update and log the epoch-level validation loss and ROC AUC."""
        loss, preds, targets = self.model_step(batch)

        self.val_loss(loss)
        self.val_rocauc(preds, targets)
        self.log("val_loss", self.val_loss, on_step=False, on_epoch=True, prog_bar=True)
        self.log(
            "val_rocauc", self.val_rocauc, on_step=False, on_epoch=True, prog_bar=True
        )

    def on_validation_epoch_end(self):
        """Fold this epoch's validation ROC AUC into the running best and log it."""
        rocauc = self.val_rocauc.compute()
        self.val_rocauc_best(rocauc)
        # Logged as a plain value (via compute()) rather than as a metric object.
        self.log("val_rocauc_best", self.val_rocauc_best.compute(), prog_bar=True)


# TRAIN

In [None]:
# 5-fold stratified cross-validation: one model is trained per fold and the
# checkpoint with the lowest validation loss is kept for each.
skf = StratifiedKFold(n_splits=5)
for i, (train_index, val_index) in enumerate(skf.split(train, train.label)):
    print(f"FOLD {i}", "-" * 20)
    # split() yields positional row numbers, so rows are selected by position;
    # label-based .loc is only equivalent when the index is exactly 0..n-1.
    train_fold = train.iloc[train_index]
    val_fold = train.iloc[val_index]

    train_dl = torch.utils.data.DataLoader(
        TrainDataset(train_fold),
        batch_size=128,
        collate_fn=collate_to_train_batch,
        pin_memory=False,
        shuffle=True,
    )
    val_dl = torch.utils.data.DataLoader(
        TrainDataset(val_fold),
        batch_size=128,
        collate_fn=collate_to_train_batch,
        pin_memory=False,
    )

    # A fresh model per fold; the length_* values bound the positional
    # encoding table of each branch.
    model = LitModule(
        Model(
            length_ciphers=248,
            length_curves=65,
            length_ua=156,
            hidden_size=256,
            vocab_ciphers=CIPHERS_VOCAB_SIZE,
            vocab_curves=CURVES_VOCAB_SIZE,
            vocab_ua=UA_VOCAB_SIZE,
            pad_ciphers=CIPHERS_PADDING_IDX,
            pad_curves=CURVES_PADDING_IDX,
            pad_ua=UA_PADDING_IDX,
        )
    )

    # The fold number is baked into the file name; "{val_loss:.3f}" is left
    # for the checkpoint callback to fill in.
    checkpoint_callback = ModelCheckpoint(
        monitor="val_loss",
        dirpath="models/37/",
        filename=f"model-{i}" + "-{val_loss:.3f}",
        save_top_k=1,
        mode="min",
    )

    trainer = Trainer(
        max_epochs=16,
        accelerator=device,
        callbacks=[checkpoint_callback],
        enable_model_summary=False,
    )
    trainer.fit(model=model, train_dataloaders=train_dl, val_dataloaders=val_dl)


# SUBMISSION

In [None]:
# Collect the saved fold checkpoints in a deterministic order, ignoring any
# other files that may sit in the directory.
model_paths = sorted(Path("models/37/").glob("*.ckpt"))

# The test loader does not depend on the checkpoint, so it is built once.
test_dl = torch.utils.data.DataLoader(
    TestDataset(test),
    batch_size=128,
    collate_fn=collate_to_test_batch,
    pin_memory=False,
    shuffle=False,
)

all_probs = []
for model_path in model_paths:
    cur_model = LitModule.load_from_checkpoint(model_path)

    # Predict with the checkpoint just loaded. Previously the last trained
    # fold model (`model`) was used on every iteration, so all entries were
    # identical and the fold ensemble had no effect.
    # The collected values are the network's raw outputs; the sigmoid is
    # applied when the submission frame is built.
    probs = torch.concat(trainer.predict(cur_model, dataloaders=test_dl)).numpy()
    all_probs.append(probs)
all_probs = np.array(all_probs)


In [None]:
# all_probs stacks one array of raw model outputs per checkpoint: squash each
# with a sigmoid, average across checkpoints, and drop the trailing unit axis.
fold_probabilities = torch.sigmoid(torch.tensor(all_probs))
mean_probability = fold_probabilities.mean(dim=0).squeeze(dim=1)

subm = pd.DataFrame({"id": test.id, "is_bot": mean_probability.numpy()})
subm


Unnamed: 0,id,is_bot
0,5,0.907719
1,6,0.565657
2,12,0.904699
3,20,0.618101
4,21,0.565810
...,...,...
14384,62317,0.286175
14385,62319,0.946781
14386,62322,0.874187
14387,62333,0.997575


In [None]:
# Write the submission file; the dataframe index is written as the first,
# unnamed column.
subm.to_csv("subm37.csv")
