In [None]:
import io
import os
import gc
import pickle
import random
import termcolor
import warnings
import shutil
from functools import partial
from datetime import datetime
from dataclasses import dataclass
from pathlib import Path
from typing import List

import pandas as pd
import numpy as np
from tqdm.auto import tqdm

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Sampler

In [None]:
# Remove the preinstalled torchtext package before the installs below.
# NOTE(review): the reason is not visible in this notebook — presumably a version clash with the packages installed next; confirm.
!pip uninstall -y torchtext

Uninstalling torchtext-0.9.0:
  Successfully uninstalled torchtext-0.9.0


In [None]:
# Install the third-party packages imported in the next cell.
# NOTE(review): none of these installs is version-pinned, so a fresh run may pull different releases than the ones recorded in the outputs.
!pip install git+https://github.com/facebookresearch/fastText.git
!pip install -q pytorch-lightning
!pip install -q transformers
!pip install -q datasets
!pip install -q sentencepiece
!pip install -q gensim

Collecting git+https://github.com/facebookresearch/fastText.git
  Cloning https://github.com/facebookresearch/fastText.git to /tmp/pip-req-build-e7tdrkjf
  Running command git clone -q https://github.com/facebookresearch/fastText.git /tmp/pip-req-build-e7tdrkjf
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp37-cp37m-linux_x86_64.whl size=3091552 sha256=82bf672838c43c2657318e04ec281e5fc464836745c24f90f3744e9d51964351
  Stored in directory: /tmp/pip-ephem-wheel-cache-u76z5qsn/wheels/69/f8/19/7f0ab407c078795bc9f86e1f6381349254f86fd7d229902355
Successfully built fasttext
Installing collected packages: fasttext
Successfully installed fasttext-0.9.2
[K     |████████████████████████████████| 829kB 16.0MB/s 
[K     |████████████████████████████████| 276kB 17.7MB/s 
[K     |████████████████████████████████| 829kB 30.1MB/s 
[K     |████████████████████████████████| 112kB 55

In [None]:
import pytorch_lightning as pl
from pytorch_lightning.loggers.csv_logs import CSVLogger
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning.metrics.classification import Accuracy
from pytorch_lightning.core.decorators import auto_move_data
from pytorch_lightning.loggers import NeptuneLogger

from datasets import Dataset
from transformers import AutoModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup

import sentencepiece as spm
import gensim
import fasttext

# Config

In [None]:
# Notebook-wide run constants.
DEBUG = True
SEED = 42
NOW = datetime.now().strftime("%m%d")  # month+day stamp of this run

# Directory that receives the model checkpoints written during training.
# parents=True so the cell also works when the parent directory does not exist yet.
SAVE_PATH = Path('/content/lightning-logs/')
SAVE_PATH.mkdir(parents=True, exist_ok=True)

NUM_WORKERS = os.cpu_count()
# Lightning `gpus` argument: one GPU when CUDA is available, otherwise None (CPU).
GPUS = 1 if torch.cuda.is_available() else None

print("DEBUG:\t", DEBUG)
print("SAVE_PATH:\t", SAVE_PATH)
print("NUM_WORKERS:\t", NUM_WORKERS)
print("GPUS:\t", GPUS)

DEBUG:	 True
SAVE_PATH:	 /content/lightning-logs
NUM_WORKERS:	 2
GPUS:	 1


In [None]:
@dataclass
class Config:
    """Hyper-parameters and run settings shared by the cells of this notebook.

    Every field has a default, so ``Config()`` yields the baseline setup and
    individual values can be overridden by keyword.
    """

    # Size of the SentencePiece vocabulary (also the number of embedding rows).
    vocab_size:int = 8000

    # Maximum number of token ids kept per text by `encode`.
    max_seq_len:int = 192
    # Number of cross-validation folds.
    num_fold:int = 5

    # Optimisation settings.
    # NOTE(review): the active optimizer in `Task.configure_optimizers` is built with a
    # literal learning rate, so `lr`, `weight_decay` and the `adam_*` fields are currently
    # only referenced from commented-out code.
    lr:float = 3e-5
    batch_size: int = 128
    num_epoch:int = 10
    # Declared once here; the original listed this field a second time further down.
    max_grad_norm:float = 1.0
    gradient_accumulation_steps: int = 1
    warmup_steps: int = 0
    weight_decay: float = 0.0
    adam_beta1: float = 0.9
    adam_beta2: float = 0.999
    adam_epsilon: float= 1e-8

    # Number of best checkpoints kept per fold by ModelCheckpoint.
    save_top_k:int = 1

    def __post_init__(self):
        """Hook for derived/validated fields; intentionally a no-op for now."""
        pass


cfg = Config()
cfg

Config(vocab_size=8000, max_seq_len=192, num_fold=5, lr=3e-05, batch_size=128, num_epoch=10, max_grad_norm=1.0, gradient_accumulation_steps=1, warmup_steps=0, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, save_top_k=1)

# Helper

In [None]:
def save_pkl(dir, name, obj):
    """Pickle ``obj`` to ``dir / name``, creating ``dir`` if necessary.

    Args:
        dir: ``pathlib.Path`` of the target directory.
        name: File name inside ``dir``.
        obj: Any picklable object.
    """
    # parents=True: also create missing intermediate directories instead of failing.
    dir.mkdir(parents=True, exist_ok=True)
    with open(dir / name, 'wb') as f:
        pickle.dump(obj, f)

def load_pkl(dir, name):
    """Return the object unpickled from the file ``dir / name``.

    Unpickling executes code embedded in the file, so only use this on files
    written by a trusted source.
    """
    pickle_path = dir / name
    with open(pickle_path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded

def set_seed(seed=42):
    """Seed every random number generator used in this notebook.

    Sets ``PYTHONHASHSEED`` in the environment and seeds, in this order,
    Python's ``random``, NumPy, PyTorch (CPU) and PyTorch (all CUDA devices).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed)

In [None]:
# Seed all RNGs once, before any stochastic step (tokenizer sampling, embeddings init, model init).
set_seed(SEED)

In [None]:
class SentenceItertor:
    """Re-iterable wrapper around a collection of sentences.

    Each call to ``iter()`` starts a fresh pass over ``texts``, so the same
    object can be consumed several times.
    """

    def __init__(self, texts: List[str]):
        self.texts = texts

    def __iter__(self):
        # A new generator per call, delegating to the underlying collection.
        yield from self.texts

# Load Data

In [None]:
# Load the train and test CSV files (paths are relative to the notebook's working directory).
train_df = pd.read_csv("../input/Train.csv")
test_df = pd.read_csv("../input/Test.csv")

In [None]:
# Map every distinct raw label to a contiguous integer id (ids follow the order returned by `unique()`),
# and build the inverse mapping for decoding predictions.
LABEL2ID = {label:i for i, label in enumerate(train_df['label'].unique())}
ID2LABEL = {v:k for k, v in LABEL2ID.items()}

# Adds the integer target column to train_df in place.
train_df['label_ids'] = train_df['label'].map(LABEL2ID)

In [None]:
train_df

Unnamed: 0,ID,text,label,label_ids
0,13P0QT0,3sbaaaaaaaaaaaaaaaaaaaa lek ou le seim riahi o...,-1,0
1,SKCLXCJ,cha3eb fey9elkoum menghir ta7ayoul ou kressi,-1,0
2,V1TVXIJ,bereau degage nathef ya slim walahi ya7chiw fi...,-1,0
3,U0TTYY8,ak slouma,1,1
4,68DX797,entom titmanou lina a7na 3iid moubarik a7na ch...,-1,0
...,...,...,...,...
69995,ZRSR7TZ,pff bayna beli kbira f wejhakk yakhiii rouhi r...,-1,0
69996,QNQVEIH,aman lmara jeya zidou t3am9ou fel a7deeth akth...,-1,0
69997,LJ2K9MD,winha nakhtabha hhhhh,-1,0
69998,5RZ1T7I,fachel enta w houwa,-1,0


In [None]:
# Lower-cased train + test texts: the corpus used to train SentencePiece and Word2Vec below.
# NOTE(review): TextDataset and `encode` later tokenize the raw `text` column without lower-casing —
# confirm the inputs are already lower-case, otherwise the tokenizer sees text it was not trained on.
all_texts = pd.concat([train_df['text'].str.lower(), test_df['text'].str.lower()])

# Train SentencePiece

In [None]:
def spm_train(all_texts, vocab_size=8000, model_type='unigram', pad_id=0, unk_id=1):
    """Train a SentencePiece model in memory and return a processor loaded from it.

    Args:
        all_texts: Iterable of training sentences.
        vocab_size: Target vocabulary size.
        model_type: SentencePiece model type passed through to the trainer.
        pad_id: Id reserved for the padding piece.
        unk_id: Id reserved for the unknown piece.

    Returns:
        A ``SentencePieceProcessor`` initialised from the freshly trained model.
    """
    # The trained model is written into this buffer instead of a file on disk.
    model_buffer = io.BytesIO()

    trainer_options = dict(
        sentence_iterator=iter(all_texts),
        model_writer=model_buffer,
        model_type=model_type,
        vocab_size=vocab_size,
        pad_id=pad_id,
        unk_id=unk_id,
        # -1 is SentencePiece's convention for "do not reserve this special piece".
        bos_id=-1,
        eos_id=-1,
        character_coverage=1.0,
    )
    spm.SentencePieceTrainer.train(**trainer_options)

    serialized_model = model_buffer.getvalue()
    return spm.SentencePieceProcessor(model_proto=serialized_model)

In [None]:
# Train the subword tokenizer on the combined train+test corpus with the configured vocabulary size.
sp = spm_train(all_texts, vocab_size=cfg.vocab_size)

In [None]:
for text in train_df.sample(10)['text']:
    print(sp.encode_as_pieces(text))

['▁rit', 'ha', '▁allah', '▁ila', '▁bir', 'itha', '▁waldin']
['▁baya', '▁ye', '▁9', 'ahba']
['▁gh', 'asra', '▁w', '▁ta', 'adet']
['▁abonne', 'z', '▁vous', '▁lmkach', '5', 'in']
['▁mridha', '▁rabi', '▁yechf', 'ha']
['▁ta', '7', '7', 'an', 'it', '▁in', 'nahdha', '▁bda', 'w', '▁in', 'ab', 'brou']
['▁rabi', '▁m', '3', 'ak', '▁docteur']
['▁brabi', '▁n', 'j', 'm', '▁na', 't', 's', 'l', '▁bik']
['▁slim', '▁riahi', '▁va', '▁t', '▁so', 'i', 'g', 'n', 'er', '▁es', 'p', 'è', 'ce', 's', '▁d', '▁impo', 's', 'teur']
['▁sy', 'és', 'iyin', 'e', '▁mo', 'b', 't', 'ad', '2', 'i', 'ine', '▁9', 'a', '3', 'dine', '▁yet', '3', 'almou', '▁fi', '▁sé', 'yé', 'sa', '▁fi', '▁cha', '3', 'eb', '▁tunis', '▁kol', '▁youm', '▁yet', 'e', '7', 'founa', '▁b', '▁3', 'afsa', '▁si', 'yé', 'siya', '▁j', 'diii', 'da', '▁meskina', '▁thawra', '▁w', '▁meskin', 'e', '▁cha', '3', 'eb', '▁eza', 'wé', 'li', '▁howa', '▁li', '▁9', 'a', '3', 'ed', '▁y', '3', 'ani', '▁kol', '▁youm']


In [None]:
# Deterministic (non-sampled) subword segmentation of every text in the corpus.
all_texts_pieces = [sp.encode_as_pieces(text) for text in all_texts]

# Word2Vec

In [None]:
%%time
# Train skip-gram (sg=1) 300-dimensional Word2Vec vectors over the subword pieces;
# min_count=0 keeps every piece regardless of frequency.
# NOTE(review): `size=` and `iter=` are the gensim 3.x keyword names — confirm the installed gensim version accepts them.
gensim_iter = SentenceItertor(all_texts_pieces)
gensim_model = gensim.models.Word2Vec(gensim_iter, size=300, min_count=0, seed=SEED, iter=10, sg=1)

In [None]:
# Sanity check: every subword piece of the corpus must have a Word2Vec vector.
known_tokens = gensim_model.wv.vocab  # looked up once instead of once per token
for pieces in all_texts_pieces:
    for token in pieces:
        if token not in known_tokens:
            # Name the offending piece so the failure is actionable.
            raise ValueError(f"token {token!r} is missing from the Word2Vec vocabulary")

In [None]:
print(sp.unk_id())
print(sp.pad_id())

1
0


In [None]:
# [piece, id] pairs for the whole SentencePiece vocabulary, in id order.
vocabs = [[sp.id_to_piece(id), id] for id in range(sp.get_piece_size())]

In [None]:
print(len(vocabs))

8000


In [None]:
# Scalar mean / standard deviation over all entries of the Word2Vec matrix;
# used to draw random vectors for pieces that have no trained embedding.
emb_mean = np.mean(gensim_model.wv.vectors)
emb_std = np.std(gensim_model.wv.vectors)

In [None]:
# Start from random vectors matching the Word2Vec statistics; shape is (vocab_size, embedding_dim).
# Rows of pieces known to Word2Vec are overwritten in the next cell.
embedding_matrix = np.random.normal(emb_mean, emb_std, (cfg.vocab_size, gensim_model.wv.vectors.shape[1]))

In [None]:
# Re-index the Word2Vec vectors so that row `i` holds the vector of SentencePiece id `i`.
# Pieces absent from the Word2Vec vocabulary keep their random initialisation.
# NOTE(review): the padding row is not zeroed here, so it keeps a random vector unless Word2Vec knows the pad piece.
add_tokens_count = 0
for token, index in vocabs:
    if token in gensim_model.wv.vocab:
        embedding_matrix[index] = gensim_model.wv.get_vector(token)
        add_tokens_count += 1

# Number of vocabulary pieces that received a trained vector (displayed as the cell result).
add_tokens_count

7996

In [None]:
def tokenize(sp, text, out_type=int, enable_sampling=True, alpha=0.2, nbest_size=None):
    """Encode ``text`` with the SentencePiece processor ``sp``, sampling a segmentation by default.

    Args:
        sp: Object exposing ``encode(text, out_type=..., enable_sampling=..., alpha=..., nbest_size=...)``.
        text: Text to segment.
        out_type: Element type of the result (``int`` for ids, ``str`` for pieces).
        enable_sampling: Forwarded to ``sp.encode``.
        alpha: Controls how skewed the sampling distribution is; smaller values
            tend to produce more diverse segmentations.
        nbest_size: When given, sampling is done from the n-best segmentations;
            restricting the search space keeps samples closer to the best segmentation.

    Returns:
        Whatever ``sp.encode`` returns for these arguments.
    """
    sampling_options = {
        'out_type': out_type,
        'enable_sampling': enable_sampling,
        'alpha': alpha,
        'nbest_size': nbest_size,
    }
    return sp.encode(text, **sampling_options)

In [None]:
# subword
for _ in range(5):
    tokens = tokenize(sp, train_df.iloc[1]['text'], out_type=str)
    print("|".join(tokens[:30]))

▁|c|ha|3|e|b|▁|f|e|y|9|el|kou|m|▁m|e|ng|hir|▁ta|7|ay|oul|▁o|u|▁kr|es|s|i
▁c|h|a|3|e|b|▁f|ey|9|e|l|k|ou|m|▁m|e|n|ghir|▁t|a|7|a|y|oul|▁|o|u|▁kr|ess|i
▁c|h|a|3|e|b|▁f|e|y|9|el|koum|▁|me|n|g|hir|▁t|a|7|ay|oul|▁o|u|▁kr|e|ssi
▁cha|3|eb|▁|fe|y|9|e|lk|o|u|m|▁|men|g|hir|▁t|a|7|ay|oul|▁ou|▁kr|ess|i
▁ch|a|3|eb|▁f|ey|9|el|k|oum|▁men|g|h|i|r|▁ta|7|ay|oul|▁|ou|▁k|r|es|s|i


# make cv

In [None]:
# Materialise the (train_idx, val_idx) index arrays of a stratified K-fold split on the label ids.
# StratifiedKFold is used without shuffling here, so the folds depend on the row order of train_df.
cv = list(StratifiedKFold(n_splits=cfg.num_fold).split(train_df, train_df['label_ids']))

# Make Dataset

In [None]:
def encode(tokenizer, text, max_seq_length):
    """Convert ``text`` to token ids, truncating the middle of over-long inputs.

    Sequences longer than ``max_seq_length`` keep their head and their tail and
    drop the middle. For an odd ``max_seq_length`` the head receives the extra
    token, so the result is always exactly ``max_seq_length`` ids long.

    Args:
        tokenizer: Object exposing ``encode_as_ids(text)`` returning a list of ids.
        text: Text to encode.
        max_seq_length: Maximum number of ids to keep.

    Returns:
        ``{'input_ids': [...]}`` with at most ``max_seq_length`` ids.
    """
    input_ids = tokenizer.encode_as_ids(text)

    # truncation
    total = len(input_ids)
    if total > max_seq_length:
        tail_len = max_seq_length // 2
        head_len = max_seq_length - tail_len

        head = input_ids[:head_len]
        # Slice from an explicit start index: `input_ids[-0:]` would return the whole list.
        tail = input_ids[total - tail_len:]

        input_ids = head + tail
        assert len(input_ids) == max_seq_length

    return {
        'input_ids': input_ids,
    }

# `encode` bound to the trained tokenizer and the configured maximum length; call as encode_func(text=...).
encode_func = partial(encode, tokenizer=sp, max_seq_length=cfg.max_seq_len)

In [None]:
def prepare_dataset(train_df, test_df, text_column, encode_func):
    """Wrap both DataFrames as ``datasets.Dataset`` objects and pre-encode the test split.

    The training split is returned un-encoded. The test split gets an
    ``input_ids`` column produced by ``encode_func(text=...)`` and is switched
    to numpy format exposing only that column.

    Args:
        train_df: Training DataFrame.
        test_df: Test DataFrame.
        text_column: Name of the column holding the raw text.
        encode_func: Callable accepting ``text=`` and returning a dict of new columns.

    Returns:
        ``(train_ds, test_ds)``.
    """
    def _encode_example(example):
        return encode_func(text=example[text_column])

    train_ds = Dataset.from_pandas(train_df)

    # only encode test
    test_ds = Dataset.from_pandas(test_df).map(_encode_example)
    test_ds.set_format("numpy", columns=["input_ids"])

    return train_ds, test_ds

In [None]:
# Build the raw training dataset and the pre-encoded test dataset.
train_ds, test_ds = prepare_dataset(train_df, test_df, text_column='text', encode_func=encode_func)

HBox(children=(FloatProgress(value=0.0, max=30000.0), HTML(value='')))




In [None]:
train_ds[0]

{'ID': '13P0QT0',
 'label': -1,
 'label_ids': 0,
 'text': '3sbaaaaaaaaaaaaaaaaaaaa lek ou le seim riahi ou 3sbaaaaaaaaaaaaaaaaaaaaaaaaaaa le ca'}

In [None]:
class TextDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes text on the fly.

    Training items (``is_valid=False``) go through ``tokenize``, validation
    items (``is_valid=True``) through ``sp.encode_as_ids``.

    Args:
        ds: Indexable collection whose items provide ``'text'`` and ``'label_ids'``.
        sp: SentencePiece processor.
        is_valid: Selects the validation tokenization path.
    """

    def __init__(self, ds, sp, is_valid=False):
        self.ds, self.sp, self.is_valid = ds, sp, is_valid

    def __len__(self):
        return len(self.ds)

    def __getitem__(self, index):
        example = self.ds[index]
        text = example['text']

        if self.is_valid:
            token_ids = self.sp.encode_as_ids(text)
        else:
            token_ids = tokenize(self.sp, text, out_type=int)

        return {
            'input_ids' : np.array(token_ids, dtype=np.int64),
            'label_ids' : example['label_ids'],
        }

In [None]:
# Quick check of the dataset wrapper on the full training set (training mode).
train_dataset = TextDataset(train_ds, sp)

In [None]:
train_dataset[0]

{'input_ids': array([   6,   16,   36,  916, 1973,  387,    3,    3,  387,   20,   34,
          60,   52,  352,  288,  154,   75,   39,  167,    6,   16,   83,
         387,    3,    3,    3,    3, 2683,  563,    3,   52,   51]),
 'label_ids': 0}

In [None]:
def pad_sequences(seqs, pad_id=0):
    """Right-pad variable-length id sequences into one ``LongTensor``.

    Args:
        seqs: Sequence of 1-D id sequences (lists, numpy arrays or tensors).
        pad_id: Value used for the padded positions (default 0).

    Returns:
        ``torch.LongTensor`` of shape ``(len(seqs), max_len)`` where ``max_len``
        is the length of the longest sequence; an empty ``(0, 0)`` tensor when
        ``seqs`` is empty.
    """
    if len(seqs) == 0:
        return torch.zeros(0, 0, dtype=torch.long)

    lens = [len(seq) for seq in seqs]
    max_len = max(lens)

    # Allocate the fully padded matrix first, then copy each sequence into its row prefix.
    padded_seqs = torch.full((len(seqs), max_len), pad_id, dtype=torch.long)
    for i, (seq, seq_len) in enumerate(zip(seqs, lens)):
        padded_seqs[i, :seq_len] = torch.as_tensor(seq, dtype=torch.long)
    return padded_seqs

In [None]:
def default_data_collator(features):
    """Collate a list of dict-like examples into one batch dict.

    Handling per key (decided from the first example):
        - ``label``: one scalar per example, stacked into ``batch["labels"]``.
        - ``label_ids``: used only when ``label`` is absent; tensors are stacked,
          scalars are converted, result stored in ``batch["labels"]``.
        - ``ID``: collected as a numpy array in ``batch["ID"]``.
        - any other key whose value is neither ``None`` nor a string: treated as
          a variable-length id sequence and right-padded with ``pad_sequences``.

    Integer labels (Python or numpy integers) become ``torch.long``; anything
    else becomes ``torch.float``.

    Args:
        features: Non-empty list of dict-like examples sharing the same keys.

    Returns:
        Dict mapping batch keys to tensors (or a numpy array for ``ID``).
    """
    integer_types = (int, np.integer)

    first = features[0]
    batch = {}

    # Special handling for labels: make sure the tensor gets the right dtype.
    if "label" in first and first["label"] is not None:
        label = first["label"].item() if isinstance(first["label"], torch.Tensor) else first["label"]
        dtype = torch.long if isinstance(label, integer_types) else torch.float
        batch["labels"] = torch.tensor([f["label"] for f in features], dtype=dtype)
    elif "label_ids" in first and first["label_ids"] is not None:
        if isinstance(first["label_ids"], torch.Tensor):
            batch["labels"] = torch.stack([f["label_ids"] for f in features])
        else:
            # numpy integers count as integer class ids too, not as float targets
            dtype = torch.long if isinstance(first["label_ids"], integer_types) else torch.float
            batch["labels"] = torch.tensor([f["label_ids"] for f in features], dtype=dtype)

    if "ID" in first:
        batch["ID"] = np.array([f["ID"] for f in features])

    # Every remaining non-string field is padded to the longest sequence in the batch.
    # "ID" is excluded explicitly so a non-string identifier is not overwritten by padding.
    for k, v in first.items():
        if k in ("label", "label_ids", "ID"):
            continue
        if v is None or isinstance(v, str):
            continue
        batch[k] = pad_sequences([f[k] for f in features])

    return batch

# DataModule

In [None]:
class DataModule(pl.LightningDataModule):
    """Data module serving the train/validation loaders of one cross-validation fold.

    Args:
        ds: Dataset exposing ``select(indices)``; items must provide ``text`` and ``label_ids``.
        trn_idx: Row indices of the training part of the fold.
        val_idx: Row indices of the validation part of the fold.
        batch_size: Batch size of the training loader.
        num_workers: Worker processes per DataLoader.
        collate_fn: Function turning a list of examples into a batch.
        sp: SentencePiece processor used by ``TextDataset``.
        val_batch_size: Batch size of the validation loader; defaults to ``batch_size``.
    """

    def __init__(self, ds, trn_idx, val_idx, batch_size, num_workers, collate_fn, sp, val_batch_size=None):
        super().__init__()
        self._trn_ds = ds.select(trn_idx)
        self._val_ds = ds.select(val_idx)

        self.batch_size = batch_size
        # Always define val_batch_size (previously it was only set when the argument was None).
        self.val_batch_size = batch_size if val_batch_size is None else val_batch_size

        self.num_workers = num_workers
        self.collate_fn = collate_fn
        self.sp = sp

    def train_dataloader(self) -> DataLoader:
        """Return the training loader (sampled tokenization, shuffled every epoch)."""
        text_dataset = TextDataset(self._trn_ds, self.sp)

        return DataLoader(
            text_dataset,
            batch_size=self.batch_size,
            # Shuffle so batches are not served in the fixed row order of the fold.
            shuffle=True,
            num_workers=self.num_workers,
            pin_memory=True,
            collate_fn=self.collate_fn,
        )

    def val_dataloader(self) -> DataLoader:
        """Return the validation loader (deterministic tokenization, original order)."""
        text_dataset = TextDataset(self._val_ds, self.sp, is_valid=True)

        return DataLoader(
            text_dataset,
            batch_size=self.val_batch_size,
            num_workers=self.num_workers,
            pin_memory=True,
            collate_fn=self.collate_fn,
        )

In [None]:
# Small data module on fold 0 (tiny batch, single worker) used only to smoke-test the pipeline below.
datamodule = DataModule(
    ds = train_ds,
    trn_idx = cv[0][0],
    val_idx = cv[0][1],
    batch_size = 4,
    num_workers = 1,
    collate_fn = default_data_collator,
    sp = sp
)

In [None]:
# Pull just the first training batch so it can be inspected in the next cell.
for item in datamodule.train_dataloader():
    break

In [None]:
item

{'input_ids': tensor([[ 665,  305,   24,    2,  110,   17,   21,    3,    4, 6065,  201,   75,
            22,   75,   26,  218,   36,   76,   22,    8,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0],
         [6580,    7,   75,   79,    4, 1828,  167,  178,    4,  632,  167,  490,
            89,  212,   50,    7,   26,   52,    8,    4,   63,    6,  156,  139,
            58, 1457, 1039, 1426,  262,   62,   18, 3839,   36, 2002,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0],
         [   6,   45,   11,  215,  216,    4,   75,    2,   4

# Model

In [None]:
class MulithSampleDropoutHead(nn.Module):
    """Linear head averaged over several independent dropout masks (multi-sample dropout).

    Args:
        hidden_dim: Size of the input features.
        out: Number of outputs.
        dropout: Dropout probability applied before the linear layer.
        num: Number of dropout samples averaged per forward pass.
    """

    def __init__(self, hidden_dim, out, dropout=0.5, num=5):
        super().__init__()

        self.num = num
        self.dropout = nn.Dropout(dropout)
        self.head = nn.Linear(hidden_dim, out)

        nn.init.normal_(self.head.weight, std=0.02)
        # Zero the bias. The previous `nn.init.normal_(bias, 0)` only set the mean to 0
        # and therefore drew the bias from N(0, 1).
        nn.init.zeros_(self.head.bias)

    def forward(self, x):
        """Return the mean of ``num`` head outputs, each computed with a fresh dropout mask."""
        x = torch.stack([self.head(self.dropout(x)) for _ in range(self.num)], dim=0)
        x = torch.mean(x, dim=0)

        return x

In [None]:
class LstmUnit(nn.Module):
    """Embedding → spatial dropout → bidirectional LSTM → max-pool over time.

    Args:
        embedding_matrix: 2-D array ``(vocab_size, embedding_dim)`` used to
            initialise the (trainable) embedding table; the values are copied.
        lstm_hidden_size: Hidden size per direction of the LSTM.
        gru_hidden_size: Currently unused; kept so existing callers keep working.
    """

    def __init__(self, embedding_matrix,  lstm_hidden_size=120,  gru_hidden_size=60):
        super().__init__()
        self.embedding = nn.Embedding(*embedding_matrix.shape)
        # torch.tensor copies, so training never writes back into `embedding_matrix`.
        # nn.Parameter is trainable by default (requires_grad=True).
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding_dropout = nn.Dropout2d(0.2)

        self.lstm = nn.LSTM(embedding_matrix.shape[1], lstm_hidden_size, bidirectional=True, batch_first=True)

    def apply_spatial_dropout(self, h_embedding):
        """Drop whole embedding channels: ``(B, T, E)`` in, same shape out."""
        # (B, T, E) -> (B, E, 1, T) so Dropout2d zeroes entire embedding dimensions.
        h_embedding = h_embedding.transpose(1, 2).unsqueeze(2)
        h_embedding = self.embedding_dropout(h_embedding).squeeze(2).transpose(1, 2)
        return h_embedding

    def forward(self, x):
        """Return the time-wise max of the LSTM outputs, shape ``(B, 2 * lstm_hidden_size)``.

        Args:
            x: ``LongTensor`` of token ids, shape ``(B, T)``.
        """
        h_embedding = self.embedding(x)
        h_embedding = self.apply_spatial_dropout(h_embedding)

        h_lstm, _ = self.lstm(h_embedding)

        # Max over the time dimension; padded positions take part in the pooling.
        max_pool, _ = torch.max(h_lstm, 1)
        return max_pool

In [None]:
class LSTMClassifier(nn.Module):
    """Text classifier: ``LstmUnit`` encoder followed by a two-layer MLP head.

    Args:
        embedding_matrix: Initial embedding table handed to ``LstmUnit``.
        num_classes: Number of output logits.
    """

    def __init__(self, embedding_matrix, num_classes):
        super().__init__()
        hidden_per_direction = 60

        self.lstm = LstmUnit(embedding_matrix, hidden_per_direction)

        # The encoder is bidirectional, so it emits 2 * hidden_per_direction features.
        head_layers = [
            nn.Linear(hidden_per_direction * 2, 60),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(60, num_classes),
        ]
        self.fc = nn.Sequential(*head_layers)

    def forward(self,  input_ids, **kwargs):
        """Return class logits for ``input_ids``; extra batch keys in ``kwargs`` are ignored."""
        return self.fc(self.lstm(input_ids))

In [None]:
# Throw-away model instance used for the forward-pass and fast_dev_run smoke tests below.
model =LSTMClassifier(
    embedding_matrix=embedding_matrix,
    num_classes=len(ID2LABEL),
)

In [None]:
# Run a single batch through the model without gradients to check the output shape.
for item in datamodule.train_dataloader():
    with torch.no_grad():
        out = model(**item)
        break

In [None]:
out.shape

torch.Size([4, 3])

# Lightning Task

In [None]:
class LabelSmoothing(nn.Module):
    """Cross-entropy with label smoothing, applied only in training mode.

    With ``eps <= 0`` or in eval mode this is plain ``F.cross_entropy``.
    Otherwise the true class receives weight ``1 - eps`` and the remaining
    ``eps`` is spread evenly over the other classes.

    Args:
        eps: Smoothing mass taken away from the true class.
    """

    def __init__(self, eps=0.1):
        super().__init__()
        self.eps = eps

    def forward(self, pred, gold):
        """Return the mean loss for logits ``pred`` (N, C) and class indices ``gold``."""
        gold = gold.contiguous().view(-1)

        # Plain cross-entropy whenever smoothing is disabled or the module is evaluating.
        if self.eps <= 0 or not self.training:
            return F.cross_entropy(pred, gold)

        n_class = pred.size(1)
        hot = torch.zeros_like(pred).scatter(1, gold.view(-1, 1), 1)
        cold = 1 - hot
        smoothed = hot * (1 - self.eps) + cold * self.eps / (n_class - 1)

        per_example = -(smoothed * F.log_softmax(pred, dim=1)).sum(dim=1)
        return per_example.mean()

In [None]:
def get_metrics_names_dict(metrics):
    """Return ``{class name: metric}`` for the given metric objects.

    When two metrics share a class name, the later one wins.
    """
    by_name = {}
    for metric in metrics:
        by_name[metric.__class__.__name__] = metric
    return by_name

class Task(pl.LightningModule):
    """LightningModule wrapping a classifier, its loss and its metrics.

    Args:
        model: Network called as ``model(**batch_dict)`` and returning logits.
        loss_fct: Callable ``loss_fct(logits, labels)``.
        cfg: Run configuration (read by ``get_lr_scheduler`` / ``total_steps``).
        metrics: Metric modules to track; each is logged under its class name.
            Defaults to a fresh ``[Accuracy()]`` per instance.
    """

    def __init__(
        self,
        model,
        loss_fct,
        cfg,
        metrics=None,
        ):
        super().__init__()

        self.save_hyperparameters()

        # Build the default here rather than in the signature: a list default is created
        # once and would share one stateful Accuracy object between every Task (e.g. all CV folds).
        if metrics is None:
            metrics = [Accuracy()]

        self.model = model
        self.loss_fct = loss_fct
        self.cfg = cfg
        self.metrics = nn.ModuleDict(get_metrics_names_dict(metrics))

    def get_lr_scheduler(self):
        """Build a per-step linear warmup/decay scheduler config for ``self.opt``.

        NOTE(review): ``self.opt`` is not assigned anywhere in this class at the
        moment, so this method is only usable once an optimizer is stored there.
        """
        scheduler = get_linear_schedule_with_warmup(
            self.opt,  num_warmup_steps=self.cfg.warmup_steps, num_training_steps=self.total_steps()
        )
        scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1}
        return scheduler

    def configure_optimizers(self):
        """Return a plain Adam optimizer over all parameters.

        The learning rate is the literal 1e-3; ``cfg.lr``, weight decay and the
        warmup scheduler are not applied here. An earlier AdamW variant with
        grouped weight decay plus ``get_lr_scheduler`` was disabled in favour of this.
        """
        return optim.Adam(self.parameters(), lr=1e-3)

    @auto_move_data
    def forward(self, batch_dict):
        """Run the wrapped model on a batch dict and return its logits."""
        return self.model(**batch_dict)

    def compute_loss(self, y_hat, y):
        """Apply the configured loss to logits ``y_hat`` and targets ``y``."""
        loss = self.loss_fct(y_hat, y)
        return loss

    def step(self, batch_dict, batch_idx):
        """Shared train/validation step.

        Returns:
            Dict with ``loss``, ``logs`` (loss plus the updated metric objects,
            keyed by metric class name), ``y`` (targets) and ``y_hat`` (logits).
        """
        y = batch_dict['labels']
        y_hat = self.forward(batch_dict)
        loss = self.compute_loss(y_hat, y)

        logs = {}
        logs['loss'] = loss
        for name, metric in self.metrics.items():
            # Update the metric with class probabilities; the metric object itself is logged.
            metric(torch.softmax(y_hat, dim=-1), y)
            logs[name] = metric

        output = {}
        output['loss'] = loss
        output['logs'] = logs
        output['y'] = y
        output['y_hat'] = y_hat

        return output

    def training_step(self, batch, batch_idx):
        """Log every entry of the step as ``train_<name>`` (per step and per epoch) and return the loss."""
        output = self.step(batch, batch_idx)
        self.log_dict({f"train_{k}": v for k, v in output['logs'].items()}, on_step=True, on_epoch=True, prog_bar=True)
        return output['loss']

    def validation_step(self, batch, batch_idx):
        """Log every entry of the step as ``val_<name>``, aggregated per epoch."""
        output = self.step(batch, batch_idx)
        self.log_dict({f"val_{k}": v for k, v in output["logs"].items()}, on_step=False, on_epoch=True, prog_bar=True)

    def total_steps(self) -> int:
        """Estimate the number of optimizer steps of the whole run.

        NOTE(review): reads ``self.dataset_size``, which is not set anywhere in
        this class at the moment, and the division yields a float despite the
        ``int`` annotation — set the attribute and revisit before re-enabling the scheduler.
        """
        effective_batch_size = self.cfg.batch_size * self.cfg.gradient_accumulation_steps
        return (self.dataset_size / effective_batch_size) * self.cfg.num_epoch

In [None]:
# Task instance for the fast_dev_run smoke test (default metrics, plain cross-entropy).
task = Task(
    model = model,
    loss_fct = nn.CrossEntropyLoss(),
    cfg = cfg,
)

In [None]:
# Trainer in fast_dev_run mode: a quick end-to-end check without logging or checkpointing.
trainer = pl.Trainer(
    gpus=GPUS,
    fast_dev_run=True,
    logger=None,
    checkpoint_callback=False,
    progress_bar_refresh_rate=20,
)

GPU available: True, used: True
TPU available: None, using: 0 TPU cores
Running in fast_dev_run mode: will run a full train, val and test loop using 1 batch(es).


In [None]:
# Smoke-test the full train/validation loop on the small fold-0 data module.
trainer.fit(task, datamodule)


  | Name     | Type             | Params
----------------------------------------------
0 | model    | LSTMClassifier   | 2.6 M 
1 | loss_fct | CrossEntropyLoss | 0     
2 | metrics  | ModuleDict       | 0     
----------------------------------------------
2.6 M     Trainable params
0         Non-trainable params
2.6 M     Total params
10.325    Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…




1

In [None]:
# Drop the smoke-test objects and collect garbage before the real training run.
del datamodule, model, task
gc.collect()

70

# Train

In [None]:
# Loader over the pre-encoded test set; it is reused for inference after every fold.
test_dataloder = DataLoader(
    test_ds,
    batch_size=cfg.batch_size,
    num_workers=NUM_WORKERS,
    pin_memory=True,
    collate_fn=default_data_collator,
)

In [None]:
def inferance(model, test_dataloder, device=None):
    """Run ``model`` over every batch of ``test_dataloder`` and return the stacked outputs.

    Args:
        model: Module called with one batch dict per step and returning a tensor
            of logits. It must place the batch on its own device (the ``Task``
            forward does so through ``auto_move_data``).
        test_dataloder: Iterable of batch dicts.
        device: Device to run on; defaults to ``'cuda'`` when available, else ``'cpu'``.

    Returns:
        ``numpy.ndarray`` with the raw model outputs (logits, not probabilities)
        of all batches stacked along the first axis.
    """
    if device is None:
        # Fall back to CPU instead of failing on machines without a GPU.
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    model.eval()
    model = model.to(device)

    preds = []
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        # no_grad restores the previous grad mode on exit, even when a batch raises.
        with torch.no_grad():
            for item in tqdm(test_dataloder, desc='inferance'):
                logit = model(item)
                preds.append(logit.cpu().numpy())

    return np.vstack(preds)

In [None]:
# Cross-validated training: for every fold, train a fresh model, reload its best checkpoint,
# then collect out-of-fold validation outputs and test outputs.
val_scores = []
# Out-of-fold outputs, one row per training example (filled fold by fold).
val_preds = np.zeros((len(train_df), len(ID2LABEL)), dtype="float32")
# Test outputs of every fold model: (fold, test example, class).
test_preds = np.zeros((cfg.num_fold, len(test_df), len(ID2LABEL)), dtype="float32")

for fold in range(cfg.num_fold):
    print('='*30)
    print(f'======fold: {fold} start======')

    datamodule = DataModule(
        ds = train_ds,
        trn_idx = cv[fold][0],
        val_idx = cv[fold][1],
        batch_size = cfg.batch_size,
        num_workers = NUM_WORKERS,
        collate_fn = default_data_collator,
        sp = sp,
    )

    # Checkpoint name template; the {epoch}/{val_Accuracy} placeholders are left for ModelCheckpoint to fill.
    filename = f'fold-{str(fold)}'
    filename += "-{epoch:02d}-{val_Accuracy:.3f}"
    checkpoint_callback = ModelCheckpoint(
        dirpath = SAVE_PATH,
        filename = filename,
        save_top_k = cfg.save_top_k,
        monitor="val_Accuracy",
        mode="max",
    )

    # Stop a fold once validation accuracy has not improved for 2 validation rounds.
    early_stop_callback = EarlyStopping(
        patience=2,
        verbose=False,
        monitor='val_Accuracy',
        mode='max',
    )

    # Fresh model per fold, initialised from the same embedding matrix.
    model = LSTMClassifier(
        embedding_matrix=embedding_matrix,
        num_classes=len(ID2LABEL),
    )

    task = Task(
        model = model,
        loss_fct = nn.CrossEntropyLoss(),
        cfg = cfg,
    )

    # NOTE(review): max_epochs is the literal 30 here; cfg.num_epoch is not used by this trainer.
    trainer = pl.Trainer(
        gpus=GPUS,
        max_epochs=30,
        num_sanity_val_steps=0,
        # gradient_clip_val=cfg.max_grad_norm,
        logger=None,
        progress_bar_refresh_rate=20,
        precision=16,
        callbacks=[checkpoint_callback, early_stop_callback]
    )

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        trainer.fit(task, datamodule)

    # restore the best weights of this fold and run inference with them
    print(f'load weights: {checkpoint_callback.best_model_path}')

    val_scores.append(float(checkpoint_callback.best_model_score.cpu()))

    task = Task.load_from_checkpoint(checkpoint_callback.best_model_path)
    val_pred = inferance(task, datamodule.val_dataloader())
    test_pred = inferance(task, test_dataloder)

    # Write the fold's validation outputs back to their original row positions.
    val_preds[cv[fold][1]] = val_pred
    test_preds[fold] = test_pred

    # Release the fold's objects before the next fold starts.
    del model, task, trainer, datamodule
    gc.collect()

GPU available: True, used: True
TPU available: None, using: 0 TPU cores
Using native 16bit precision.

  | Name     | Type             | Params
----------------------------------------------
0 | model    | LSTMClassifier   | 2.6 M 
1 | loss_fct | CrossEntropyLoss | 0     
2 | metrics  | ModuleDict       | 0     
----------------------------------------------
2.6 M     Trainable params
0         Non-trainable params
2.6 M     Total params
10.325    Total estimated model params size (MB)




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…


load weights: /content/lightning-logs/fold-0-epoch=02-val_Accuracy=0.804.ckpt


HBox(children=(FloatProgress(value=0.0, description='inferance', max=110.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='inferance', max=235.0, style=ProgressStyle(description_wi…




GPU available: True, used: True
TPU available: None, using: 0 TPU cores
Using native 16bit precision.

  | Name     | Type             | Params
----------------------------------------------
0 | model    | LSTMClassifier   | 2.6 M 
1 | loss_fct | CrossEntropyLoss | 0     
2 | metrics  | ModuleDict       | 0     
----------------------------------------------
2.6 M     Trainable params
0         Non-trainable params
2.6 M     Total params
10.325    Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…


load weights: /content/lightning-logs/fold-1-epoch=03-val_Accuracy=0.798.ckpt


HBox(children=(FloatProgress(value=0.0, description='inferance', max=110.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='inferance', max=235.0, style=ProgressStyle(description_wi…




GPU available: True, used: True
TPU available: None, using: 0 TPU cores
Using native 16bit precision.

  | Name     | Type             | Params
----------------------------------------------
0 | model    | LSTMClassifier   | 2.6 M 
1 | loss_fct | CrossEntropyLoss | 0     
2 | metrics  | ModuleDict       | 0     
----------------------------------------------
2.6 M     Trainable params
0         Non-trainable params
2.6 M     Total params
10.325    Total estimated model params size (MB)




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…


load weights: /content/lightning-logs/fold-2-epoch=02-val_Accuracy=0.787.ckpt


HBox(children=(FloatProgress(value=0.0, description='inferance', max=110.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='inferance', max=235.0, style=ProgressStyle(description_wi…




GPU available: True, used: True
TPU available: None, using: 0 TPU cores
Using native 16bit precision.

  | Name     | Type             | Params
----------------------------------------------
0 | model    | LSTMClassifier   | 2.6 M 
1 | loss_fct | CrossEntropyLoss | 0     
2 | metrics  | ModuleDict       | 0     
----------------------------------------------
2.6 M     Trainable params
0         Non-trainable params
2.6 M     Total params
10.325    Total estimated model params size (MB)




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…


load weights: /content/lightning-logs/fold-3-epoch=02-val_Accuracy=0.803.ckpt


HBox(children=(FloatProgress(value=0.0, description='inferance', max=110.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='inferance', max=235.0, style=ProgressStyle(description_wi…




GPU available: True, used: True
TPU available: None, using: 0 TPU cores
Using native 16bit precision.

  | Name     | Type             | Params
----------------------------------------------
0 | model    | LSTMClassifier   | 2.6 M 
1 | loss_fct | CrossEntropyLoss | 0     
2 | metrics  | ModuleDict       | 0     
----------------------------------------------
2.6 M     Trainable params
0         Non-trainable params
2.6 M     Total params
10.325    Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…


load weights: /content/lightning-logs/fold-4-epoch=02-val_Accuracy=0.804.ckpt


HBox(children=(FloatProgress(value=0.0, description='inferance', max=110.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='inferance', max=235.0, style=ProgressStyle(description_wi…




In [None]:
accuracy_score(train_df['label_ids'], np.argmax(val_preds, axis=-1))

0.7993428571428571

In [None]:
# Persist the per-fold test outputs (fold, example, class) and the out-of-fold validation outputs (example, class).
save_pkl(Path("../output"), "91-sentencepiece-lstm_test_preds.pkl", test_preds)
save_pkl(Path("../output"), "91-sentencepiece-lstm_val_preds.pkl", val_preds)