In [None]:
 ! nvidia-smi

Sat Jun 18 12:34:23 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  A100-SXM4-40GB      Off  | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P0    46W / 400W |      0MiB / 40536MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# ! gdown --id 1PUSJ56k93B42XW9oszcPDefM8HeJ2BOg
# ! pip -q uninstall -y kaggle
# ! pip -q install --upgrade pip
# ! pip -q install kaggle --upgrade
# ! mkdir ~/.kaggle
# ! cp kaggle.json ~/.kaggle/
# ! chmod 600 ~/.kaggle/kaggle.json
# ! kaggle competitions download us-patent-phrase-to-phrase-matching
# ! kaggle datasets download yasufuminakama/cpc-data

In [None]:
# ! unzip -q /content/us-patent-phrase-to-phrase-matching.zip -d data
# ! rm /content/us-patent-phrase-to-phrase-matching.zip
# ! unzip -q /content/cpc-data.zip -d cpc_data
# ! rm /content/cpc-data.zip

In [None]:
# ! pip -q install sentencepiece
# ! pip -q install transformers --upgrade
# ! pip -q install tokenizers --upgrade

In [None]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import shutil
import string
import pickle
import random
import joblib
import itertools
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
print(f"torch.__version__: {torch.__version__}")
from torch.cuda import amp
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

import tokenizers
import transformers

from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

torch.__version__: 1.11.0+cu113
env: TOKENIZERS_PARALLELISM=true


In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Notebook-wide hyperparameters and switches, read as plain class attributes."""

    # --- run / bookkeeping flags ---
    wandb = True
    competition = 'PPPM'
    debug = False       # when True, the override below the class shortens training
    train = True
    MV = 2.1
    seed = 42           # passed to seed_everything and to the fold splitter

    # --- backbone / head ---
    model = "microsoft/deberta-v3-small"   # checkpoint name for AutoTokenizer / AutoModel
    fc_dropout = 0.0                        # dropout applied before the final linear layer
    target_size = 5                         # number of output logits (one per score bucket)
    max_len = 512                           # overwritten by get_max_len() before training

    # --- optimisation ---
    apex = True                             # enables torch.cuda.amp autocast in the Trainer
    epochs = 10
    batch_size = 128
    num_workers = 4
    n_accum = 1                             # gradient-accumulation steps
    max_grad_norm = 1000
    encoder_lr = 16e-5
    decoder_lr = 16e-5
    min_lr = 1e-7
    eps = 1e-6
    betas = (0.9, 0.999)
    weight_decay = 0.01
    loss1 = "XE"
    loss2 = "NA"

    # --- learning-rate schedule ---
    scheduler = 'cosine'  # ['linear', 'cosine']
    batch_scheduler = True
    num_cycles = 1
    num_warmup_steps = 1000

    # --- cross-validation ---
    n_fold = 5
    trn_fold = [0, 1, 2, 3, 4]
    test_fold = 5

    # --- hardware ---
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Debug runs: fewer epochs, still iterating over every fold.
if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0, 1, 2, 3, 4]

In [None]:
def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Sets ``PYTHONHASHSEED``, seeds Python's ``random``, NumPy and PyTorch
    (CPU and current CUDA device), and asks cuDNN for deterministic kernels.

    Args:
        seed: integer seed shared by all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=CFG.seed)

In [None]:
def get_cpc_texts(cpc_dir='/content/cpc_data'):
    """Build a mapping from CPC context code (e.g. ``"A47"``) to a title string.

    The set of context codes is taken from the file names inside
    ``<cpc_dir>/CPCSchemeXML202105`` (every ``[A-Z]\\d+`` match).  Titles are
    read from ``<cpc_dir>/CPCTitleList202202/cpc-section-<S>_20220201.txt``,
    where a line of interest looks like ``<code>\\t\\t<title>``.

    Args:
        cpc_dir: root folder of the extracted CPC data.  Defaults to the
            location the notebook unzips it to.

    Returns:
        dict mapping each context code to ``"<section title>. <code title>"``.

    Raises:
        ValueError: if a code has no ``<code>\\t\\t<title>`` line in its
            section file.
    """
    scheme_dir = os.path.join(cpc_dir, 'CPCSchemeXML202105')
    title_dir = os.path.join(cpc_dir, 'CPCTitleList202202')

    code_pattern = re.compile(r'[A-Z]\d+')
    contexts = sorted({
        code
        for file_name in os.listdir(scheme_dir)
        for code in code_pattern.findall(file_name)
    })

    def _title(code, text):
        # Capture the title explicitly.  str.lstrip(pattern) must NOT be used
        # here: it strips a *set of characters*, so titles starting with one of
        # the code's characters were truncated ("CHEMISTRY" -> "HEMISTRY").
        match = re.search(rf'{re.escape(code)}\t\t(.+)', text)
        if match is None:
            raise ValueError(f"No title found for CPC code {code!r}")
        return match.group(1)

    results = {}
    for cpc in ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'Y']:
        with open(os.path.join(title_dir, f'cpc-section-{cpc}_20220201.txt')) as f:
            s = f.read()
        section_title = _title(cpc, s)
        for context in (c for c in contexts if c[0] == cpc):
            results[context] = section_title + ". " + _title(context, s)
    return results

In [None]:
def get_folds(df):
    """Add ``score_map``, ``anchor_map`` and ``fold`` columns to ``df`` in place.

    ``score_map`` buckets the 0/0.25/0.5/0.75/1 scores into class ids 0..4,
    ``anchor_map`` is a label encoding of ``anchor`` and ``fold`` is the
    validation fold (0..CFG.n_fold-1) each row belongs to, stratified on
    ``score_map``.

    Args:
        df: frame with at least ``score`` and ``anchor`` columns.

    Returns:
        The same ``df`` object, with the three columns added.
    """
    df['score_map'] = df['score'].map({0.00: 0, 0.25: 1, 0.50: 2, 0.75: 3, 1.00: 4})
    encoder = LabelEncoder()
    df['anchor_map'] = encoder.fit_transform(df['anchor'])
    Fold = StratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)

    # The splitter yields *positional* indices, so collect them in a positional
    # array instead of writing through label-based df.loc (which only lines up
    # when the index happens to be 0..n-1).
    fold_ids = np.full(len(df), -1, dtype=int)
    # NOTE(review): StratifiedKFold accepts `groups` only for API compatibility
    # and ignores it, so anchors are NOT kept within a single fold. Switching to
    # a group-aware splitter would change the folds; confirm before doing so.
    for n, (_, val_index) in enumerate(Fold.split(df, df['score_map'], groups=df['anchor_map'])):
        fold_ids[val_index] = n
    df['fold'] = fold_ids
    return df

In [None]:
def get_max_len(cpc_texts, train):
    """Derive ``CFG.max_len`` from the longest tokenized pieces of the input.

    Uses the module-level ``tokenizer`` to measure (without special tokens) the
    CPC title strings and the ``anchor`` / ``target`` columns, then stores
    ``max(anchor) + max(target) + max(context title) + 4`` on ``CFG.max_len``.

    Args:
        cpc_texts: dict whose values are the context title strings.
        train: DataFrame with ``anchor`` and ``target`` text columns.
    """
    def _token_lengths(texts, total):
        # One progress bar per measured collection, as before.
        return [
            len(tokenizer(text, add_special_tokens=False)['input_ids'])
            for text in tqdm(texts, total=total)
        ]

    lengths_dict = {'context_text': _token_lengths(cpc_texts.values(), len(cpc_texts))}
    for text_col in ['anchor', 'target']:
        column_texts = train[text_col].fillna("").values
        lengths_dict[text_col] = _token_lengths(column_texts, len(train))

    longest = {name: max(values) for name, values in lengths_dict.items()}
    # + 4 for CLS + SEP + SEP + SEP
    CFG.max_len = longest['anchor'] + longest['target'] + longest['context_text'] + 4

In [None]:
def prepare_input(cfg, text):
    """Tokenize one text into fixed-length ``torch.long`` tensors.

    Args:
        cfg: object exposing ``tokenizer`` (a callable tokenizer) and ``max_len``.
        text: the string to encode.

    Returns:
        The tokenizer's output mapping with every value converted to a
        ``torch.long`` tensor of length ``cfg.max_len``.
    """
    inputs = cfg.tokenizer(text,
                           add_special_tokens=True,
                           max_length=cfg.max_len,
                           padding="max_length",
                           # Without explicit truncation a text longer than
                           # max_len yields a longer tensor than its batch
                           # mates and the default collate function fails.
                           truncation=True,
                           return_offsets_mapping=False)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class usppmDataset(Dataset):
    """Map-style dataset yielding tokenized text and its class label.

    Reads the ``text`` and ``score_map`` columns of the given frame; each item
    is ``{"inputs": <tokenizer tensors>, "labels": <0-dim long tensor>}``.
    """

    def __init__(self, cfg, df):
        # cfg is forwarded untouched to prepare_input for every item.
        self.cfg = cfg
        self.texts, self.labels = (df[column].values for column in ('text', 'score_map'))

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        encoded = prepare_input(self.cfg, self.texts[item])
        target = torch.tensor(self.labels[item], dtype=torch.long)
        return dict(inputs=encoded, labels=target)

In [None]:
class CustomModel(nn.Module):
    """Transformer encoder with attention pooling and a linear classification head.

    Args:
        cfg: config object providing ``model`` (checkpoint name), ``fc_dropout``
            and ``target_size``.
        config_path: optional path to a saved model config; when ``None`` the
            config is fetched with ``AutoConfig.from_pretrained(cfg.model)``.
        pretrained: load pretrained encoder weights when True, otherwise build
            the encoder from the config only.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
        else:
            # NOTE(review): torch.load unpickles the file -- only pass config
            # files that come from a trusted source.
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, self.cfg.target_size)
        self._init_weights(self.fc)
        # Scores every position, then normalises the scores over the sequence axis.
        self.attention = nn.Sequential(
            nn.Linear(self.config.hidden_size, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1)
        )
        # `attention` is a Sequential, so _init_weights must be applied to its
        # children; calling it on the container itself matches none of the
        # isinstance branches and initialises nothing.
        self.attention.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise a single Linear / Embedding / LayerNorm module in place."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def _resize_token_embeddings(self, length):
        """Resize the encoder's token embedding matrix to ``length`` rows."""
        self.model.resize_token_embeddings(length)

    def feature(self, inputs):
        """Return the attention-pooled encoder output for a batch of inputs.

        Args:
            inputs: mapping of keyword arguments forwarded to the encoder.

        Returns:
            Tensor obtained by weighting the encoder's first output along
            dim 1 with the attention weights and summing over that dim.
        """
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        # feature = torch.mean(last_hidden_states, 1)
        # NOTE(review): the pooling weights are computed over every position and
        # no attention mask is applied here, so padded positions also receive
        # weight -- confirm this is intended.
        weights = self.attention(last_hidden_states)
        feature = torch.sum(weights * last_hidden_states, dim=1)
        return feature

    def forward(self, inputs):
        """Return the ``target_size`` logits for a batch of encoder inputs."""
        feature = self.feature(inputs)
        output = self.fc(self.fc_dropout(feature))
        return output

In [None]:
def scoring(y_true, y_pred):
    """Return the Pearson correlation coefficient between two 1-D sequences.

    Args:
        y_true: ground-truth values.
        y_pred: predicted values, same length as ``y_true``.

    Returns:
        The correlation coefficient (first element of ``pearsonr``'s result).
    """
    # Import the submodule explicitly: `import scipy as sp` alone does not
    # guarantee that `sp.stats` is loaded on every SciPy version.
    from scipy import stats
    score = stats.pearsonr(y_true, y_pred)[0]
    return score

In [None]:
class CustomLoss(nn.Module):
    """Negative Pearson correlation between outputs and targets, as a loss."""

    def __init__(self):
        super().__init__()

    def forward(self, outputs, targets):
        """Return ``-r`` where ``r`` is the correlation over all elements.

        Both tensors are mean-centred over all their elements and divided by
        their L2 norm plus ``CFG.eps``; the loss is the negated dot product.
        """
        def _unit(values):
            centred = values - values.mean()
            return centred / (torch.linalg.norm(centred) + CFG.eps)

        return -(_unit(outputs) * _unit(targets)).sum()

In [None]:
def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
    """Build the optimizer parameter groups for encoder and head.

    Args:
        model: module whose encoder lives under the ``model`` attribute; every
            other parameter is treated as part of the head.
        encoder_lr: learning rate for the encoder groups.
        decoder_lr: learning rate for the head group.
        weight_decay: decay applied to encoder parameters whose name does not
            contain one of the no-decay markers.

    Returns:
        A list of three dicts: encoder params with weight decay, encoder params
        without weight decay, and head params (no weight decay).
    """
    no_decay = ("bias", "LayerNorm.bias", "LayerNorm.weight")

    def _skip_decay(name):
        return any(marker in name for marker in no_decay)

    encoder_named = list(model.model.named_parameters())
    # Select the head by prefix, not by substring: `"model" not in name` would
    # silently leave out any head parameter whose name merely contains "model".
    head_params = [p for n, p in model.named_parameters() if not n.startswith("model.")]

    optimizer_parameters = [
        {'params': [p for n, p in encoder_named if not _skip_decay(n)],
            'lr': encoder_lr, 'weight_decay': weight_decay},
        {'params': [p for n, p in encoder_named if _skip_decay(n)],
            'lr': encoder_lr, 'weight_decay': 0.0},
        {'params': head_params,
            'lr': decoder_lr, 'weight_decay': 0.0}
    ]
    return optimizer_parameters

In [None]:
def get_scheduler(cfg, optimizer, num_train_steps):
    """Create the warmup learning-rate scheduler selected by ``cfg.scheduler``.

    Args:
        cfg: config providing ``scheduler`` (``'linear'`` or ``'cosine'``),
            ``num_warmup_steps`` and, for cosine, ``num_cycles``.
        optimizer: optimizer whose learning rate is scheduled.
        num_train_steps: total number of scheduler steps.

    Returns:
        The scheduler built by the matching transformers helper.

    Raises:
        ValueError: if ``cfg.scheduler`` is not a supported name.
    """
    if cfg.scheduler == 'linear':
        return get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
        )
    if cfg.scheduler == 'cosine':
        # Read the warmup from the `cfg` argument (like the linear branch),
        # not from the module-level CFG.
        return get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps,
            num_cycles=cfg.num_cycles, last_epoch=-1
        )
    raise ValueError(f"Unsupported scheduler {cfg.scheduler!r}; expected 'linear' or 'cosine'")

In [None]:
class CorrLoss(nn.Module):
    """
    Use 1 - correlation coefficient between the output of the network and the
    target as the loss.
    input (o, t):
        o: tensor of size (batch_size, 1), output of the network
        t: tensor of size (batch_size, 1), target value
    output (loss):
        loss: 1 - corr, computed along dim 0 (size (1) for the inputs above)
    """
    def __init__(self, eps=1e-8):
        """
        eps: small constant added to the denominator so that a constant
             output or target gives a finite loss instead of NaN.
        """
        super(CorrLoss, self).__init__()
        self.eps = eps

    def forward(self, o, t):
        assert o.size() == t.size()
        t = t.to(o.dtype)

        # centre both signals along the batch dimension
        o_c = o - o.mean(dim=0)
        t_c = t - t.mean(dim=0)

        # Population standard deviation (divide by n), so that it is consistent
        # with the mean over n used for the covariance below. Mixing the
        # unbiased (n - 1) std with a mean over n scales the coefficient by
        # (n - 1) / n and a perfect fit never reaches a loss of 0.
        o_s = o_c.pow(2).mean(dim=0).sqrt()
        t_s = t_c.pow(2).mean(dim=0).sqrt()

        # correlation between o and t
        corr = (o_c * t_c).mean(dim=0) / (o_s * t_s + self.eps)
        return 1 - corr

In [None]:
class Trainer:
    """Runs training, validation and inference loops for one fold.

    Args:
        config: config object; ``n_accum`` and ``max_grad_norm`` are read from it.
        dataloaders: ``(train_loader, valid_loader)`` pair.
        optimizer: optimizer stepped during training.
        model: the network being trained.
        criterion: loss called as ``criterion(logits, labels)``.
        scheduler: LR scheduler, stepped after every optimizer step.
        device: device batches are moved to.
        apex: enable ``torch.cuda.amp`` autocast / gradient scaling when True.
    """

    # Score value represented by each of the five output classes.
    _SCORE_VALUES = np.array([0.0, 0.25, 0.50, 0.75, 1.0])

    def __init__(self, config, dataloaders, optimizer, model, criterion, scheduler, device=CFG.device, apex=CFG.apex):
        self.train_loader, self.valid_loader = dataloaders
        self.criterion = criterion
        self.scheduler = scheduler
        self.optimizer = optimizer
        self.model = model
        self.device = device
        self.apex = apex
        self.Config = config
        # One scaler for the whole run so its loss-scale state carries over
        # between epochs; it is a pass-through when mixed precision is off.
        self.scaler = amp.GradScaler(enabled=self.apex)

    def _inputs_to_device(self, data):
        """Move every tensor in ``data["inputs"]`` to ``self.device`` (in place)."""
        for k, v in data["inputs"].items():
            data["inputs"][k] = v.to(self.device)

    def train_one_epoch(self):
        """Train for one pass over ``train_loader``.

        Returns:
            The running mean of the (accumulation-normalised) batch losses.
        """
        scaler = self.scaler

        self.model.train()
        # Drop gradients left over from an unfinished accumulation window.
        self.optimizer.zero_grad()
        train_pbar = tqdm(enumerate(self.train_loader), total=len(self.train_loader))
        dataset_size = 0
        running_loss = 0.0
        epoch_loss = 0.0

        for step, data in train_pbar:
            self._inputs_to_device(data)
            data["labels"] = data["labels"].to(self.device)
            batch_size = data["labels"].size(0)

            with amp.autocast(enabled=self.apex):
                preds = self.model(data["inputs"])

                loss = self.criterion(preds, data["labels"])
                loss = loss / self.Config.n_accum

            scaler.scale(loss).backward()

            if (step + 1) % self.Config.n_accum == 0:
                # Gradients are still multiplied by the loss scale here; they
                # must be unscaled before clipping, otherwise max_grad_norm is
                # compared against scaled norms. Clip this trainer's own model.
                scaler.unscale_(self.optimizer)
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.Config.max_grad_norm)
                scaler.step(self.optimizer)
                scaler.update()
                self.optimizer.zero_grad()
                self.scheduler.step()

            running_loss += (loss.item() * batch_size)
            dataset_size += batch_size
            epoch_loss = running_loss / dataset_size
            mem = torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0

            train_pbar.set_postfix(Train_Loss = epoch_loss, LR = self.optimizer.param_groups[0]['lr'], GPU_mem = f"{mem:.02f} GB")
        torch.cuda.empty_cache()
        gc.collect()
        return epoch_loss

    @torch.no_grad()
    def valid_one_epoch(self):
        """Evaluate on ``valid_loader``.

        Returns:
            ``(val_preds, epoch_loss)`` where ``val_preds`` is the array of
            per-class softmax probabilities for every validation row and
            ``epoch_loss`` the mean validation loss.
        """
        self.model.eval()
        valid_pbar = tqdm(enumerate(self.valid_loader), total=len(self.valid_loader))

        dataset_size = 0
        running_loss = 0.0
        epoch_loss = 0.0
        val_preds = []

        for step, data in valid_pbar:
            self._inputs_to_device(data)
            data["labels"] = data["labels"].to(self.device)
            batch_size = data["labels"].size(0)

            preds = self.model(data["inputs"])

            loss = self.criterion(preds, data["labels"])

            # Explicit class dimension; a dim-less nn.Softmax() is deprecated.
            preds = torch.softmax(preds, dim=1)
            val_preds.append(preds.to("cpu").numpy())
            running_loss += (loss.item() * batch_size)
            dataset_size += batch_size

            epoch_loss = running_loss / dataset_size

            mem = torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0
            valid_pbar.set_postfix(Valid_loss = epoch_loss,
                                   LR = self.optimizer.param_groups[0]['lr'],
                                   GPU_mem = f"{mem:.02f} GB" )

        val_preds = np.concatenate(val_preds)
        torch.cuda.empty_cache()
        gc.collect()
        return val_preds, epoch_loss

    @torch.no_grad()
    def test_one_epoch(self, test_loader=None):
        """Predict expected scores for every row of a test loader.

        Args:
            test_loader: loader yielding dicts with an ``"inputs"`` entry.
                Falls back to ``self.test_loader`` if that attribute was set.

        Returns:
            1-D array with the probability-weighted score of each row.

        Raises:
            ValueError: if no loader is given and none is set on the instance.
        """
        loader = test_loader if test_loader is not None else getattr(self, "test_loader", None)
        if loader is None:
            raise ValueError("test_one_epoch needs a test_loader (argument or self.test_loader)")

        self.model.eval()
        test_pbar = tqdm(enumerate(loader), total=len(loader))

        test_preds = []

        for step, data in test_pbar:
            # Labels are not needed for inference, so they are not required.
            self._inputs_to_device(data)

            preds = self.model(data["inputs"])
            preds = torch.softmax(preds, dim=1)
            test_preds.append(preds.to("cpu").numpy())

            mem = torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0
            test_pbar.set_postfix(GPU_mem = f"{mem:.02f} GB" )

        test_preds = np.concatenate(test_preds)
        test_preds = np.matmul(test_preds, self._SCORE_VALUES)
        torch.cuda.empty_cache()
        gc.collect()
        return test_preds

    def fit(self, valid_labels, fold: str, epochs: int = 10, output_dir: str = "/content/models/", custom_name: str = 'model.pth',):
        """
        Low-effort alternative for doing the complete training and validation process.

        Trains for ``epochs`` epochs; after each one the validation
        probabilities are turned into expected scores and compared with
        ``valid_labels`` via ``scoring``. Whenever the score improves, the
        weights are saved to ``output_dir`` and the out-of-fold predictions are
        refreshed; these are written to ``oof_<fold>.csv`` at the end.

        Args:
            valid_labels: true scores of the validation rows, in loader order.
            fold: fold identifier used in the saved file names.
            epochs: number of epochs to run.
            output_dir: folder receiving the checkpoint and the OOF csv.
            custom_name: currently ignored -- the checkpoint is always named
                ``model_<fold>.pth``.
        """
        best_score = -1e+7
        oof_df = pd.DataFrame()
        custom_name = f"model_{fold}.pth"
        for epx in range(epochs):
            print(f"{'='*20} Epoch: {epx+1} / {epochs} {'='*20}")

            train_loss = self.train_one_epoch()
            print(f"Training loss: {train_loss:.4f}")

            valid_preds, val_loss = self.valid_one_epoch()
            valid_preds = np.matmul(valid_preds, self._SCORE_VALUES)
            val_score = scoring(valid_labels, valid_preds)

            print(f'Validation Score: {val_score:.4f}')

            if val_score > best_score:
                print(f"Valid Score Improved ({best_score:0.4f} ---> {val_score:0.4f})")
                best_score = val_score
                self.save_model(output_dir, custom_name)
                print(f"Saved model with val_score: {best_score:.4f}")
                oof_df["preds"] = valid_preds
                oof_df["labels"] = valid_labels

            print(f"Best score is {best_score:0.5f}")

        # Write next to the checkpoints instead of a hard-coded relative
        # "models/" folder, which only matched output_dir for one working dir.
        os.makedirs(output_dir, exist_ok=True)
        oof_df.to_csv(os.path.join(output_dir, f"oof_{fold}.csv"), index = None)

    def save_model(self, path, name, verbose=False):
        """
        Saves the model's state_dict at ``os.path.join(path, name)``,
        creating ``path`` first if needed.
        """
        os.makedirs(path, exist_ok=True)

        torch.save(self.model.state_dict(), os.path.join(path, name))
        if verbose:
            print(f"Model Saved at: {os.path.join(path, name)}")

In [None]:
if __name__ == "__main__":
    # ---- data preparation ----
    train = pd.read_csv("/content/data/train.csv")
    cpc_texts = get_cpc_texts()
    torch.save(cpc_texts, "cpc_texts.pth")
    train['context_text'] = train['context'].map(cpc_texts)
    train["context_tags"] = '<' + train['context'] + '>'
    # The whole text is lower-cased below, so the context tags must be
    # registered in lower case too; otherwise the added special tokens
    # (e.g. "<A47>") never match the text ("<a47>").
    context_tags = [tag.lower() for tag in train['context_tags'].unique()]
    train['text'] = train['context_tags'] + '<anchor>' + train['anchor'] + '</anchor><target>' + train['target'] + '</target><context>'  + train['context_text'] + '</context>'
    train["text"] = train["text"].map(lambda x: x.lower())
    if CFG.debug:
        train = train.sample(n=1000, random_state=0).reset_index(drop=True)
    train = get_folds(train)

    # ---- tokenizer ----
    tokenizer = AutoTokenizer.from_pretrained(CFG.model)
    tokenizer.add_special_tokens({'additional_special_tokens': ['<anchor>', '</anchor>', '<target>', '</target>', '<context>', '</context>'] + context_tags})
    CFG.tokenizer = tokenizer
    get_max_len(cpc_texts, train)

    # ---- cross-validation training ----
    criterion = nn.CrossEntropyLoss()
    for fold in CFG.trn_fold:
        print("*" * 20)
        print(f"Training fold {fold}")
        print("*" * 20)
        val_df = train[train.fold == fold].reset_index(drop = True)
        train_df = train[(train.fold != fold)].reset_index(drop = True)

        train_dataset = usppmDataset(CFG, train_df)
        val_dataset = usppmDataset(CFG, val_df)

        train_loader = DataLoader(train_dataset,
                                batch_size=CFG.batch_size,
                                shuffle=True,
                                num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
        valid_loader = DataLoader(val_dataset,
                                batch_size=CFG.batch_size,
                                shuffle=False,
                                num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

        model = CustomModel(CFG, config_path=None, pretrained=True)
        # The tokenizer gained special tokens, so the embedding matrix must grow.
        model._resize_token_embeddings(len(CFG.tokenizer))
        model.to(CFG.device)

        optimizer_parameters = get_optimizer_params(model,
                                                    encoder_lr=CFG.encoder_lr,
                                                    decoder_lr=CFG.decoder_lr,
                                                    weight_decay=CFG.weight_decay)
        optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)

        # Number of optimizer steps actually taken on this fold. Deriving it
        # from the full `train` frame over-counts (validation rows included),
        # so the schedule would never reach its end.
        num_train_steps = len(train_loader) // CFG.n_accum * CFG.epochs
        scheduler = get_scheduler(CFG, optimizer, num_train_steps)

        trainer = Trainer(config = CFG,
                          dataloaders = (train_loader, valid_loader),
                          optimizer = optimizer,
                          model = model,
                          criterion= criterion,
                          scheduler = scheduler)
        trainer.fit(valid_labels = val_df["score"].values,
                    epochs = CFG.epochs,
                    fold = fold,
                    output_dir = "/content/models/")

        # Release this fold's model/optimizer before building the next one.
        del trainer, scheduler, optimizer, optimizer_parameters, model
        gc.collect()
        torch.cuda.empty_cache()
        

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/136 [00:00<?, ?it/s]

  0%|          | 0/36473 [00:00<?, ?it/s]

  0%|          | 0/36473 [00:00<?, ?it/s]

********************
Training fold 0
********************


Some weights of the model checkpoint at microsoft/deberta-v3-small were not used when initializing DebertaV2Model: ['mask_predictions.dense.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




  0%|          | 0/227 [00:00<?, ?it/s]

Training loss: 1.2362


  0%|          | 0/57 [00:00<?, ?it/s]

Validation Score: 0.7802
Valid Score Improved (-10000000.0000 ---> 0.7802)
Saved model with val_score: 0.7802
Best score is 0.78020


  0%|          | 0/227 [00:00<?, ?it/s]

Training loss: 0.8247


  0%|          | 0/57 [00:00<?, ?it/s]

Validation Score: 0.8244
Valid Score Improved (0.7802 ---> 0.8244)
Saved model with val_score: 0.8244
Best score is 0.82442


  0%|          | 0/227 [00:00<?, ?it/s]

Training loss: 0.7053


  0%|          | 0/57 [00:00<?, ?it/s]

Validation Score: 0.8341
Valid Score Improved (0.8244 ---> 0.8341)
Saved model with val_score: 0.8341
Best score is 0.83407


  0%|          | 0/227 [00:00<?, ?it/s]

Training loss: 0.6103


  0%|          | 0/57 [00:00<?, ?it/s]

Validation Score: 0.8285
Best score is 0.83407


  0%|          | 0/227 [00:00<?, ?it/s]

Training loss: 0.5190


  0%|          | 0/57 [00:00<?, ?it/s]

Validation Score: 0.8348
Valid Score Improved (0.8341 ---> 0.8348)
Saved model with val_score: 0.8348
Best score is 0.83481


  0%|          | 0/227 [00:00<?, ?it/s]

Training loss: 0.3890


  0%|          | 0/57 [00:00<?, ?it/s]

Validation Score: 0.8472
Valid Score Improved (0.8348 ---> 0.8472)
Saved model with val_score: 0.8472
Best score is 0.84719


  0%|          | 0/227 [00:00<?, ?it/s]

Training loss: 0.2772


  0%|          | 0/57 [00:00<?, ?it/s]

Validation Score: 0.8502
Valid Score Improved (0.8472 ---> 0.8502)
Saved model with val_score: 0.8502
Best score is 0.85025


  0%|          | 0/227 [00:00<?, ?it/s]

Training loss: 0.1588


  0%|          | 0/57 [00:00<?, ?it/s]

Validation Score: 0.8464
Best score is 0.85025


  0%|          | 0/227 [00:00<?, ?it/s]

Training loss: 0.1262


  0%|          | 0/57 [00:00<?, ?it/s]

Validation Score: 0.8468
Best score is 0.85025


  0%|          | 0/227 [00:00<?, ?it/s]

Training loss: 0.1315


  0%|          | 0/57 [00:00<?, ?it/s]

Validation Score: 0.8449
Best score is 0.85025
********************
Training fold 1
********************


Some weights of the model checkpoint at microsoft/deberta-v3-small were not used when initializing DebertaV2Model: ['mask_predictions.dense.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




  0%|          | 0/227 [00:00<?, ?it/s]

Training loss: 1.2689


  0%|          | 0/57 [00:00<?, ?it/s]

Validation Score: 0.7674
Valid Score Improved (-10000000.0000 ---> 0.7674)
Saved model with val_score: 0.7674
Best score is 0.76743


  0%|          | 0/227 [00:00<?, ?it/s]

Training loss: 0.8317


  0%|          | 0/57 [00:00<?, ?it/s]

Validation Score: 0.8069
Valid Score Improved (0.7674 ---> 0.8069)
Saved model with val_score: 0.8069
Best score is 0.80692


  0%|          | 0/227 [00:00<?, ?it/s]

Training loss: 0.6872


  0%|          | 0/57 [00:00<?, ?it/s]

Validation Score: 0.8289
Valid Score Improved (0.8069 ---> 0.8289)
Saved model with val_score: 0.8289
Best score is 0.82887


  0%|          | 0/227 [00:00<?, ?it/s]

Training loss: 0.6085


  0%|          | 0/57 [00:00<?, ?it/s]

Validation Score: 0.8330
Valid Score Improved (0.8289 ---> 0.8330)
Saved model with val_score: 0.8330
Best score is 0.83299


  0%|          | 0/227 [00:00<?, ?it/s]

Training loss: 0.5348


  0%|          | 0/57 [00:00<?, ?it/s]

Validation Score: 0.8314
Best score is 0.83299


  0%|          | 0/227 [00:00<?, ?it/s]

Training loss: 0.4668


  0%|          | 0/57 [00:00<?, ?it/s]

Validation Score: 0.8341
Valid Score Improved (0.8330 ---> 0.8341)
Saved model with val_score: 0.8341
Best score is 0.83407


  0%|          | 0/227 [00:00<?, ?it/s]

Training loss: 0.3263


  0%|          | 0/57 [00:00<?, ?it/s]

Validation Score: 0.8404
Valid Score Improved (0.8341 ---> 0.8404)
Saved model with val_score: 0.8404
Best score is 0.84040


  0%|          | 0/227 [00:00<?, ?it/s]

Training loss: 0.2150


  0%|          | 0/57 [00:00<?, ?it/s]

Validation Score: 0.8417
Valid Score Improved (0.8404 ---> 0.8417)
Saved model with val_score: 0.8417
Best score is 0.84165


  0%|          | 0/227 [00:00<?, ?it/s]

Training loss: 0.1755


  0%|          | 0/57 [00:00<?, ?it/s]

Validation Score: 0.8418
Valid Score Improved (0.8417 ---> 0.8418)
Saved model with val_score: 0.8418
Best score is 0.84183


  0%|          | 0/227 [00:00<?, ?it/s]

Training loss: 0.1801


  0%|          | 0/57 [00:00<?, ?it/s]

Validation Score: 0.8360
Best score is 0.84183
********************
Training fold 2
********************


Some weights of the model checkpoint at microsoft/deberta-v3-small were not used when initializing DebertaV2Model: ['mask_predictions.dense.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




  0%|          | 0/227 [00:00<?, ?it/s]

Training loss: 1.2366


  0%|          | 0/57 [00:00<?, ?it/s]

Validation Score: 0.7669
Valid Score Improved (-10000000.0000 ---> 0.7669)
Saved model with val_score: 0.7669
Best score is 0.76686


  0%|          | 0/227 [00:00<?, ?it/s]

Training loss: 0.8477


  0%|          | 0/57 [00:00<?, ?it/s]

Validation Score: 0.8129
Valid Score Improved (0.7669 ---> 0.8129)
Saved model with val_score: 0.8129
Best score is 0.81287


  0%|          | 0/227 [00:00<?, ?it/s]

Training loss: 0.7126


  0%|          | 0/57 [00:00<?, ?it/s]

Validation Score: 0.8201
Valid Score Improved (0.8129 ---> 0.8201)
Saved model with val_score: 0.8201
Best score is 0.82008


  0%|          | 0/227 [00:00<?, ?it/s]

Training loss: 0.6069


  0%|          | 0/57 [00:00<?, ?it/s]

Validation Score: 0.8304
Valid Score Improved (0.8201 ---> 0.8304)
Saved model with val_score: 0.8304
Best score is 0.83040


  0%|          | 0/227 [00:00<?, ?it/s]

Training loss: 0.5182


  0%|          | 0/57 [00:00<?, ?it/s]

Validation Score: 0.8244
Best score is 0.83040


  0%|          | 0/227 [00:00<?, ?it/s]

Training loss: 0.4293


  0%|          | 0/57 [00:00<?, ?it/s]

Validation Score: 0.8324
Valid Score Improved (0.8304 ---> 0.8324)
Saved model with val_score: 0.8324
Best score is 0.83242


  0%|          | 0/227 [00:00<?, ?it/s]

Training loss: 0.2633


  0%|          | 0/57 [00:00<?, ?it/s]

Validation Score: 0.8405
Valid Score Improved (0.8324 ---> 0.8405)
Saved model with val_score: 0.8405
Best score is 0.84046


  0%|          | 0/227 [00:00<?, ?it/s]

Training loss: 0.1751


  0%|          | 0/57 [00:00<?, ?it/s]

Validation Score: 0.8417
Valid Score Improved (0.8405 ---> 0.8417)
Saved model with val_score: 0.8417
Best score is 0.84166


  0%|          | 0/227 [00:00<?, ?it/s]

Training loss: 0.1441


  0%|          | 0/57 [00:00<?, ?it/s]

Validation Score: 0.8407
Best score is 0.84166


  0%|          | 0/227 [00:00<?, ?it/s]

Training loss: 0.1491


  0%|          | 0/57 [00:00<?, ?it/s]

Validation Score: 0.8383
Best score is 0.84166
********************
Training fold 3
********************


Some weights of the model checkpoint at microsoft/deberta-v3-small were not used when initializing DebertaV2Model: ['mask_predictions.dense.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




  0%|          | 0/227 [00:00<?, ?it/s]

Training loss: 1.2667


  0%|          | 0/57 [00:00<?, ?it/s]

Validation Score: 0.7551
Valid Score Improved (-10000000.0000 ---> 0.7551)
Saved model with val_score: 0.7551
Best score is 0.75506


  0%|          | 0/227 [00:00<?, ?it/s]

Training loss: 0.8623


  0%|          | 0/57 [00:00<?, ?it/s]

Validation Score: 0.7884
Valid Score Improved (0.7551 ---> 0.7884)
Saved model with val_score: 0.7884
Best score is 0.78835


  0%|          | 0/227 [00:00<?, ?it/s]

Training loss: 0.6957


  0%|          | 0/57 [00:00<?, ?it/s]

Validation Score: 0.8263
Valid Score Improved (0.7884 ---> 0.8263)
Saved model with val_score: 0.8263
Best score is 0.82626


  0%|          | 0/227 [00:00<?, ?it/s]

Training loss: 0.5774


  0%|          | 0/57 [00:00<?, ?it/s]

Validation Score: 0.8221
Best score is 0.82626


  0%|          | 0/227 [00:00<?, ?it/s]

Training loss: 0.4499


  0%|          | 0/57 [00:00<?, ?it/s]

Validation Score: 0.8377
Valid Score Improved (0.8263 ---> 0.8377)
Saved model with val_score: 0.8377
Best score is 0.83770


  0%|          | 0/227 [00:00<?, ?it/s]

Training loss: 0.3423


  0%|          | 0/57 [00:00<?, ?it/s]

Validation Score: 0.8354
Best score is 0.83770


  0%|          | 0/227 [00:00<?, ?it/s]

Training loss: 0.2488


  0%|          | 0/57 [00:00<?, ?it/s]

Validation Score: 0.8359
Best score is 0.83770


  0%|          | 0/227 [00:00<?, ?it/s]

Training loss: 0.1692


  0%|          | 0/57 [00:00<?, ?it/s]

Validation Score: 0.8357
Best score is 0.83770


  0%|          | 0/227 [00:00<?, ?it/s]

Training loss: 0.1354


  0%|          | 0/57 [00:00<?, ?it/s]

Validation Score: 0.8354
Best score is 0.83770


  0%|          | 0/227 [00:00<?, ?it/s]

Training loss: 0.1388


  0%|          | 0/57 [00:00<?, ?it/s]

Validation Score: 0.8336
Best score is 0.83770
********************
Training fold 4
********************


Some weights of the model checkpoint at microsoft/deberta-v3-small were not used when initializing DebertaV2Model: ['mask_predictions.dense.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




  0%|          | 0/227 [00:00<?, ?it/s]

Training loss: 1.2397


  0%|          | 0/57 [00:00<?, ?it/s]

Validation Score: 0.7557
Valid Score Improved (-10000000.0000 ---> 0.7557)
Saved model with val_score: 0.7557
Best score is 0.75569


  0%|          | 0/227 [00:00<?, ?it/s]

Training loss: 0.8465


  0%|          | 0/57 [00:00<?, ?it/s]

Validation Score: 0.8092
Valid Score Improved (0.7557 ---> 0.8092)
Saved model with val_score: 0.8092
Best score is 0.80920


  0%|          | 0/227 [00:00<?, ?it/s]

Training loss: 0.7169


  0%|          | 0/57 [00:00<?, ?it/s]

Validation Score: 0.8206
Valid Score Improved (0.8092 ---> 0.8206)
Saved model with val_score: 0.8206
Best score is 0.82061


  0%|          | 0/227 [00:00<?, ?it/s]

Training loss: 0.6069


  0%|          | 0/57 [00:00<?, ?it/s]

Validation Score: 0.8258
Valid Score Improved (0.8206 ---> 0.8258)
Saved model with val_score: 0.8258
Best score is 0.82576


  0%|          | 0/227 [00:00<?, ?it/s]

Training loss: 0.5225


  0%|          | 0/57 [00:00<?, ?it/s]

Validation Score: 0.8254
Best score is 0.82576


  0%|          | 0/227 [00:00<?, ?it/s]

Training loss: 0.3999


  0%|          | 0/57 [00:00<?, ?it/s]

Validation Score: 0.8327
Valid Score Improved (0.8258 ---> 0.8327)
Saved model with val_score: 0.8327
Best score is 0.83268


  0%|          | 0/227 [00:00<?, ?it/s]

Training loss: 0.2448


  0%|          | 0/57 [00:00<?, ?it/s]

Validation Score: 0.8432
Valid Score Improved (0.8327 ---> 0.8432)
Saved model with val_score: 0.8432
Best score is 0.84317


  0%|          | 0/227 [00:00<?, ?it/s]

Training loss: 0.1585


  0%|          | 0/57 [00:00<?, ?it/s]

Validation Score: 0.8433
Valid Score Improved (0.8432 ---> 0.8433)
Saved model with val_score: 0.8433
Best score is 0.84328


  0%|          | 0/227 [00:00<?, ?it/s]

Training loss: 0.1259


  0%|          | 0/57 [00:00<?, ?it/s]

Validation Score: 0.8415
Best score is 0.84328


  0%|          | 0/227 [00:00<?, ?it/s]

Training loss: 0.1335


  0%|          | 0/57 [00:00<?, ?it/s]

Validation Score: 0.8384
Best score is 0.84328


In [None]:
! mkdir /content/models/tokenizer
CFG.tokenizer.save_pretrained("/content/models/tokenizer/")

mkdir: cannot create directory ‘/content/models/tokenizer’: File exists


('/content/models/tokenizer/tokenizer_config.json',
 '/content/models/tokenizer/special_tokens_map.json',
 '/content/models/tokenizer/spm.model',
 '/content/models/tokenizer/added_tokens.json',
 '/content/models/tokenizer/tokenizer.json')

In [None]:
model = CFG.model.split("/")[-1]
save_file = f"/content/drive/MyDrive/USPPM-{model}-{CFG.loss1}-{CFG.loss2}-{CFG.MV}-{CFG.max_len}-tags.zip"
! zip -r $save_file  models

  adding: models/ (stored 0%)
  adding: models/model_0.pth (deflated 25%)
  adding: models/oof_4.csv (deflated 60%)
  adding: models/model_2.pth (deflated 25%)
  adding: models/oof_3.csv (deflated 59%)
  adding: models/model_3.pth (deflated 25%)
  adding: models/oof_0.csv (deflated 60%)
  adding: models/tokenizer/ (stored 0%)
  adding: models/tokenizer/vocab.json (deflated 59%)
  adding: models/tokenizer/vocab.txt (deflated 53%)
  adding: models/tokenizer/tokenizer.json (deflated 77%)
  adding: models/tokenizer/special_tokens_map.json (deflated 75%)
  adding: models/tokenizer/added_tokens.json (deflated 76%)
  adding: models/tokenizer/tokenizer_config.json (deflated 45%)
  adding: models/tokenizer/merges.txt (deflated 53%)
  adding: models/tokenizer/spm.model (deflated 50%)
  adding: models/model_4.pth (deflated 25%)
  adding: models/oof_2.csv (deflated 60%)
  adding: models/model_1.pth (deflated 25%)
  adding: models/oof_1.csv (deflated 60%)
