In [None]:
# Print the NVIDIA driver/GPU status for this runtime before doing any work.
!nvidia-smi

In [None]:
# Install the third-party packages this notebook imports.
# NOTE(review): none of these are version-pinned and transformers is built from
# the repository's default branch, so a fresh run may pick up different code —
# consider pinning versions for reproducibility.
!pip install -q wandb sentencepiece datasets
!pip install git+https://github.com/huggingface/transformers.git
# Reinstall only the kaggle package itself (dependencies are left untouched).
!pip install -q --upgrade --force-reinstall --no-deps kaggle

In [None]:
!mkdir /root/.kaggle
!cp /content/drive/MyDrive/Colab/kaggle/kaggle.json /root/.kaggle/kaggle.json

In [None]:
import os
import gc
import sys
import json
from pathlib import Path
import itertools
from tqdm.auto import tqdm
import logging
import datetime
import ast
import numpy as np
import pandas as pd
import math
import re
from sklearn import model_selection as sms
from sklearn.preprocessing import LabelEncoder
import scipy as sp

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

from transformers import AutoConfig, AutoModel, AutoTokenizer, get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup
from transformers.models.deberta_v2.tokenization_deberta_v2_fast import DebertaV2TokenizerFast

from datasets import load_dataset
import tokenizers
import transformers
from transformers import DataCollatorForLanguageModeling, AutoModelForMaskedLM, DebertaV2ForMaskedLM, Trainer, TrainingArguments
from transformers.utils import logging

import wandb

%env TOKENIZERS_PARALLELISM=true

env: TOKENIZERS_PARALLELISM=true


In [None]:
class Config:
    """Static experiment configuration, read as class attributes (never instantiated here).

    ``setup()`` attaches additional path attributes (``input_dir``, ``model_dir``, ...)
    to this class at runtime.

    NOTE(review): several sections (Loss, Optimizer, Callbacks, Split) are not read
    by the MLM training cell visible in this notebook — presumably they are shared
    with a fine-tuning notebook; confirm before removing.
    """
    # ==============================
    # Globals #
    # ==============================
    competition_name = "us-patent-phrase-to-phrase-matching"
    group = "MLM"
    job_type = "DeBERTa-v3-large"
    exp_id = "001"
    debug = False
    inference_only = False
    upload_from_colab = False
    colab_dir = "/content/drive/MyDrive/Colab/kaggle/us-patent-phrase-to-phrase-matching"
    kaggle_json_path = "/root/.kaggle/kaggle.json"
    # When set, tokenizer/config/model artifacts are looked up under this directory.
    kaggle_dataset_path = None
    gpus = 1
    seed = 2434
    num_epochs = 5
    gradient_accumulation_steps = 2
    num_fold = 5
    mlm_probability = 0.15
    param_freeze = False
    fp16 = True
    # ==============================
    # Dataloader #
    # ==============================
    batch_size = 4
    num_workers = 8
    # ==============================
    # Split #
    # ==============================
    split_name = "StratifiedGroupKFold"
    split_params = {
        "n_splits": num_fold,
        "shuffle": True,
        "random_state": seed,
    }
    # ==============================
    # Model #
    # ==============================
    model_name = "microsoft/deberta-v3-large"
    max_length = 133
    hidden_size = 1024
    use_backbone_dropout = True
    dropout = 0.2
    initializer_range = 0.02
    # ==============================
    # Loss #
    # ==============================
    loss_name = "BCEWithLogitsLoss"
    loss_params = {
        "reduction": "mean",
    }
    # ==============================
    # Optimizer #
    # ==============================
    lr = 2e-5
    optimizer_name = "AdamW"
    optimizer_params = {
        # Reuse `lr` so the two values cannot drift apart.
        "lr": lr,
        "eps": 1e-6,
        "betas": (0.9, 0.999)
    }
    encoder_lr = 2e-5
    decoder_lr = 2e-5
    weight_decay = 0.01
    # ==============================
    # Scheduler #
    # ==============================
    warmup_ratio = 0.1
    scheduler_name = "cosine-warmup"
    scheduler_warmup_ratio = 0.1
    scheduler_params = {}
    scheduler_interval = "step"
    scheduler_cycle = "one-cycle" # epoch or one-cycle
    # ==============================
    # Callbacks #
    # ==============================
    checkpoint_params = {
        "monitor": "val/pearson_corr",
        "save_top_k": 1,
        "save_weights_only": True,
        "mode": "max",
        "verbose": True,
    }
    early_stopping = False
    early_stopping_params = {
        "monitor": "val/pearson_corr",
        "min_delta": 0.0,
        "patience": 8,
        "verbose": False,
        # A correlation is better when higher; this must agree with
        # checkpoint_params["mode"], which monitors the same metric.
        "mode": "max",
    }

In [None]:
# ====================================
# Setup #
# ====================================
class Logger:
    """Timestamped logger that writes to the console and to ``<path>/Experiment.log``.

    ref) https://github.com/ghmagazine/kagglebook/blob/master/ch04-model-interface/code/util.py

    Args:
        path: Existing directory that receives ``Experiment.log``. It is also used
            as the logger name, so constructing ``Logger`` twice with the same
            path reuses the same underlying logger and its handlers.
    """
    def __init__(self, path):
        # The notebook's import cell rebinds the global name `logging` to
        # `transformers.utils.logging`, which is not a drop-in replacement for
        # the standard-library API used below. Import the stdlib module under a
        # private alias so this class works regardless of that shadowing.
        import logging as std_logging

        self.general_logger = std_logging.getLogger(path)
        # Only create handlers when they will actually be attached; building a
        # FileHandler opens the log file, so creating one unconditionally would
        # leak a file handle on every repeated construction.
        if len(self.general_logger.handlers) == 0:
            stream_handler = std_logging.StreamHandler()
            file_general_handler = std_logging.FileHandler(os.path.join(path, 'Experiment.log'))
            self.general_logger.addHandler(stream_handler)
            self.general_logger.addHandler(file_general_handler)
            self.general_logger.setLevel(std_logging.INFO)

    def info(self, message):
        """Log ``message`` at INFO level, prefixed with the current local time."""
        # display time
        self.general_logger.info('[{}] - {}'.format(self.now_string(), message))

    @staticmethod
    def now_string():
        """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
        return str(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))


def setup(cfg):
    cfg.on_colab = "google.colab" in sys.modules
    if cfg.on_colab:
        # kaggle api
        f = open(Config.kaggle_json_path, 'r')
        json_data = json.load(f)
        os.environ["KAGGLE_USERNAME"] = json_data["username"]
        # set input/output dir
        cfg.input_dir = os.path.join(cfg.colab_dir, "input")
        cfg.train_csv = os.path.join(cfg.input_dir, "train.csv")
        cfg.test_csv = os.path.join(cfg.input_dir, "test.csv")
        cfg.cpc_data = os.path.join(cfg.input_dir, "cpc-data")
        cfg.cpc_codes_csv = os.path.join(cfg.input_dir, "cpc-codes/cpc_codes.csv")
        cfg.cpc_titles_csv = os.path.join(cfg.input_dir, "cpc-codes/titles.csv")
        cfg.sample_submission = os.path.join(cfg.input_dir, "sample_submission.csv")
        cfg.output_dir = os.path.join(cfg.colab_dir, "output")
        cfg.exp_output_dir = os.path.join(cfg.output_dir, f"mlm_exp{cfg.exp_id}")
        cfg.model_dir = os.path.join(cfg.exp_output_dir, "model")

        for d in [cfg.output_dir, cfg.exp_output_dir, cfg.model_dir]:
            os.makedirs(d, exist_ok=True)
            
    else:
        cfg.input_dir = f"../input/{cfg.competition_name}"
        cfg.train_csv = os.path.join(cfg.input_dir, "train.csv")
        cfg.test_csv = os.path.join(cfg.input_dir, "test.csv")
        cfg.cpc_data = "../input/cpc-data"
        cfg.cpc_codes_csv = "../input/cpc-codes/cpc_codes.csv"
        cfg.sample_submission = os.path.join(cfg.input_dir, "sample_submission.csv")
        cfg.submission = "./"
        cfg.exp_output_dir = f"exp{cfg.exp_id}"
        cfg.model_dir = os.path.join(cfg.exp_output_dir, "model")

        if cfg.kaggle_dataset_path is not None:
            cfg.model_dir = os.path.join(cfg.kaggle_dataset_path, "model")

        for d in [cfg.exp_output_dir, cfg.model_dir]:
            os.makedirs(d, exist_ok=True)

    return cfg



# ====================================
# Preprocess #
# ====================================
def get_tokenizer(cfg):
    """Return the tokenizer for ``cfg.model_name``, using a local copy when one exists.

    The local copy lives in ``<base>/pretrain_tokenizer`` where ``<base>`` is
    ``cfg.kaggle_dataset_path`` if set, otherwise ``cfg.exp_output_dir``.

    * Directory missing: load by model name through ``AutoTokenizer`` and save
      the result into that directory.
    * Directory present: load from it. DeBERTa v2/v3 names use
      ``DebertaV2TokenizerFast``; everything else uses ``AutoTokenizer``.

    RoBERTa-family names are always loaded with ``trim_offsets=False``.
    """
    base_dir = cfg.exp_output_dir if cfg.kaggle_dataset_path is None else cfg.kaggle_dataset_path
    pretrained_dir = os.path.join(base_dir, "pretrain_tokenizer")
    is_roberta = "roberta" in cfg.model_name

    if os.path.isdir(pretrained_dir):
        # A saved tokenizer is available: pick the loader by model family.
        if any(tag in cfg.model_name for tag in ("deberta-v2", "deberta-v3")):
            return DebertaV2TokenizerFast.from_pretrained(pretrained_dir)
        if is_roberta:
            return AutoTokenizer.from_pretrained(pretrained_dir, trim_offsets=False)
        return AutoTokenizer.from_pretrained(pretrained_dir)

    # No local copy yet: fetch by name and persist it for later runs.
    extra_kwargs = {"trim_offsets": False} if is_roberta else {}
    tokenizer = AutoTokenizer.from_pretrained(cfg.model_name, **extra_kwargs)
    tokenizer.save_pretrained(pretrained_dir)
    return tokenizer


def get_backbone_config(cfg):
    """Return the backbone model config, caching it as ``model_config.pth``.

    The cache directory is ``cfg.exp_output_dir`` on Colab. Elsewhere it is
    ``cfg.kaggle_dataset_path``, falling back to ``cfg.exp_output_dir`` when no
    dataset path is configured (previously that case raised ``TypeError`` from
    ``os.path.join(None, ...)``).

    If the cache file is missing, the config is created with
    ``AutoConfig.from_pretrained(cfg.model_name, output_hidden_states=True)``
    and saved; otherwise it is loaded from the cache file.

    Args:
        cfg: Configuration object providing ``on_colab``, ``exp_output_dir``,
            ``kaggle_dataset_path`` and ``model_name``.

    Returns:
        The model config object.
    """
    filename = "model_config"
    if cfg.on_colab or cfg.kaggle_dataset_path is None:
        base_dir = cfg.exp_output_dir
    else:
        base_dir = cfg.kaggle_dataset_path
    cfg_path = os.path.join(base_dir, f"{filename}.pth")

    # Test for the exact cache file: matching on the bare stem could be
    # satisfied by an unrelated file such as "model_config.json" and then fail
    # when loading the missing ".pth".
    if not os.path.isfile(cfg_path):
        model_config = AutoConfig.from_pretrained(cfg.model_name, output_hidden_states=True)
        torch.save(model_config, cfg_path)
    else:
        # NOTE(review): torch.load unpickles the file, so only load caches this
        # notebook wrote itself. Recent PyTorch releases may also restrict which
        # objects load by default — confirm against the installed version.
        model_config = torch.load(cfg_path)

    return model_config


def get_filname_listdir(directory):
    """List the entries of ``directory`` with their final extension removed.

    Entries are returned in the order ``os.listdir`` yields them; only the last
    extension is stripped (``"a.tar.gz"`` becomes ``"a.tar"``).
    """
    stems = []
    for entry in os.listdir(directory):
        stem, _extension = os.path.splitext(entry)
        stems.append(stem)
    return stems


# ====================================
# Dataset #
# ====================================
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with the module-level ``tokenizer``.

    The global ``tokenizer`` must be assigned before this function is called.
    """
    texts = examples["text"]
    return tokenizer(texts)


In [None]:
# setup
Config = setup(Config)
# wandb.login()
# wandb.init(project=Config.competition_name, group=Config.group, job_type=Config.job_type, name=f"exp-{Config.exp_id}")

# Load the CPC titles and keep only those with at least 5 whitespace-separated words.
cpc_titles = pd.read_csv(Config.cpc_titles_csv)
cpc_titles["len_title"] = cpc_titles["title"].apply(lambda x: len(x.split()))
cpc_titles = cpc_titles[cpc_titles["len_title"] >= 5].reset_index(drop=True)
# cpc_titles = cpc_titles.iloc[:1000]

# split
# Stratify on the "section" column; the index was reset above, so the positional
# indices returned by split() are valid labels for .loc.
cv = sms.StratifiedKFold(n_splits=Config.num_fold, shuffle=True, random_state=Config.seed)
cpc_titles["fold"] = -1
for fold_id, (train_idx, valid_idx) in enumerate(cv.split(cpc_titles, cpc_titles["section"])):
    cpc_titles.loc[valid_idx, "fold"] = fold_id

# Fold 0 is held out for validation; the remaining folds are used for training.
train = cpc_titles[cpc_titles["fold"] != 0].reset_index(drop=True)
valid = cpc_titles[cpc_titles["fold"] == 0].reset_index(drop=True)
train_text_list = train["title"].to_list()
valid_text_list = valid["title"].to_list()

# dataset
# Write one {"text": ...} JSON object per line so load_dataset('json', ...) can read the files.
mlm_train_json_path = Path(Config.exp_output_dir) / "train_mlm.json"
mlm_valid_json_path = Path(Config.exp_output_dir) / "valid_mlm.json"

for json_path, list_ in zip([mlm_train_json_path, mlm_valid_json_path],[train_text_list, valid_text_list]):
    with open(str(json_path), 'w') as f:
        for sentence in list_:
            row_json = {'text': sentence}
            json.dump(row_json, f)
            f.write('\n')

datasets = load_dataset(
        'json',
        data_files={'train': str(mlm_train_json_path),
                    'valid': str(mlm_valid_json_path)},
        )

# tokenize_function reads this module-level name, so it must be bound before map().
tokenizer = get_tokenizer(Config)

tokenizerd_datasets = datasets.map(
    tokenize_function,
    batched=True,
    num_proc=1,
    remove_columns=["text"],
    batch_size=Config.batch_size
)
# The collator applies the MLM masking with probability Config.mlm_probability.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=Config.mlm_probability)

model_config = get_backbone_config(Config)

if "deberta-v3" in Config.model_name:
    model = DebertaV2ForMaskedLM.from_pretrained(Config.model_name, config=model_config)
else:
    model = AutoModelForMaskedLM.from_pretrained(Config.model_name, config=model_config)

if Config.param_freeze:
    # deberta-v3-large
    # Freeze the embeddings and the first 24 encoder layers, then print each
    # parameter's requires_grad flag for inspection.
    model.deberta.embeddings.requires_grad_(False)
    model.deberta.encoder.layer[:24].requires_grad_(False)

    for name, p in model.named_parameters():
        print(name, p.requires_grad)

training_args = TrainingArguments(
    output_dir=Config.model_dir,
    overwrite_output_dir=True,
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    learning_rate=Config.lr,
    weight_decay=Config.weight_decay,
    save_strategy="epoch",
    # metric_for_best_model="eval_loss",
    # greater_is_better=False,
    per_device_train_batch_size=Config.batch_size,
    per_device_eval_batch_size=Config.batch_size,
    num_train_epochs=Config.num_epochs,
    # report_to="wandb",
    # run_name=f"exp-{Config.exp_id}",
    lr_scheduler_type="cosine",
    warmup_ratio=Config.warmup_ratio,
    fp16=Config.fp16,
    logging_strategy="epoch",
    # logging_steps=500,
    gradient_accumulation_steps=Config.gradient_accumulation_steps,
    # Use the experiment seed for training as well; it was previously applied
    # only to the CV split, leaving the Trainer on its own default seed.
    seed=Config.seed,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenizerd_datasets["train"],
    eval_dataset=tokenizerd_datasets["valid"],
    data_collator=data_collator,
)

trainer.train()
# Save the final weights next to the per-epoch checkpoints.
trainer.model.save_pretrained(Config.model_dir)
# wandb.finish()

Using custom data configuration default-f65180a79cb60a2e


Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-f65180a79cb60a2e/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-f65180a79cb60a2e/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/31627 [00:00<?, ?ba/s]

  0%|          | 0/7907 [00:00<?, ?ba/s]

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2ForMaskedLM: ['deberta.embeddings.position_embeddings.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.bias', 'mask_predictions.classifier.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaV2ForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification mode

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
0,4.5821,
1,3.2353,
2,2.9222,
3,2.699,
4,2.6138,


***** Running Evaluation *****
  Num examples = 31627
  Batch size = 4
Saving model checkpoint to /content/drive/MyDrive/Colab/kaggle/us-patent-phrase-to-phrase-matching/output/mlm_exp001/model/checkpoint-15813
Configuration saved in /content/drive/MyDrive/Colab/kaggle/us-patent-phrase-to-phrase-matching/output/mlm_exp001/model/checkpoint-15813/config.json
Model weights saved in /content/drive/MyDrive/Colab/kaggle/us-patent-phrase-to-phrase-matching/output/mlm_exp001/model/checkpoint-15813/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 31627
  Batch size = 4
Saving model checkpoint to /content/drive/MyDrive/Colab/kaggle/us-patent-phrase-to-phrase-matching/output/mlm_exp001/model/checkpoint-31626
Configuration saved in /content/drive/MyDrive/Colab/kaggle/us-patent-phrase-to-phrase-matching/output/mlm_exp001/model/checkpoint-31626/config.json
Model weights saved in /content/drive/MyDrive/Colab/kaggle/us-patent-phrase-to-phrase-matching/output/mlm_exp001/model/checkpoin