# BERT for Patents Baseline

- [kfold strategy](https://www.kaggle.com/code/abhishek/phrase-matching-folds)
- Utilize [Cooperative Patent Classification Codes Meaning](https://www.kaggle.com/datasets/xhlulu/cpc-codes)
- Reference: [phantivia's notebook](https://www.kaggle.com/code/phantivia/uspppm-huggingface-train-inference-baseline)
- [BERT for Patents](https://www.kaggle.com/datasets/ksork6s4/bert-for-patents) from [huggingface page](https://huggingface.co/anferico/bert-for-patents)


### Please refer to [Inference Notebook](https://www.kaggle.com/code/ksork6s4/uspppm-bert-for-patents-baseline-inference/edit/run/91272728) as well.

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold
import shutil
import torch
import gc

from torch.utils.data import DataLoader, Dataset
import datasets, transformers
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import DebertaV2Config, DebertaV2Model

# Switch off the Weights & Biases logging integration used by the Trainer.
# NOTE(review): the captured run log in this notebook reports this variable as
# deprecated and suggests the `report_to` training argument instead — confirm
# before upgrading transformers.
os.environ["WANDB_DISABLED"] = "true"

# Config

In [2]:
class CFG:
    """Static settings shared by the notebook cells."""

    # Run label; used to name the per-fold models that get saved.
    name = "deberta-v3-small"

    # Filesystem locations of the competition data, the pretrained
    # checkpoint and its tokenizer.
    input_path = "../input/us-patent-phrase-to-phrase-matching/"
    model_path = "../finetune_lm/deberta-v3-small+finetuned-patent/checkpoint-100000"
    tokenizer_path = "../finetune_lm/tokenizer"

    # Cross-validation and optimisation settings.
    num_fold = 5
    epochs = 5
    batch_size = 64
    learning_rate = 5e-5
    weight_decay = 0.01

    # Forwarded to TrainingArguments(save_total_limit=...).
    save_total_limit = 1

# Preproc

In [3]:
# Read the training pairs and the CPC code table, then attach the CPC row whose
# `code` equals each pair's `context`. The merge uses pandas' default inner
# join, so pairs without a matching code are dropped.
train_df = pd.read_csv(CFG.input_path + "train.csv")
titles = pd.read_csv("../input/cpc-codes/titles.csv")
train_df = pd.merge(train_df, titles, left_on="context", right_on="code")

# https://www.kaggle.com/code/abhishek/phrase-matching-folds
def create_folds(data, num_splits):
    """Assign a stratified cross-validation fold id to every row.

    The continuous ``score`` column is cut into 5 equal-width bins and the bin
    ids (not the raw scores) are used as the stratification target, so each
    fold receives a similar score distribution.

    Fold ids are written by row position, so the result is correct for any
    index on ``data`` (not only a default ``RangeIndex``), and no temporary
    ``bins`` column is left behind on the frame.

    Args:
        data: DataFrame with a numeric ``score`` column. A ``fold`` column is
            added to (or overwritten on) this frame in place.
        num_splits: number of folds to create.

    Returns:
        ``data`` with an integer ``fold`` column holding values in
        ``0 .. num_splits - 1``.
    """
    # Bin the targets; kept as a local Series rather than a frame column.
    bins = pd.cut(data["score"], bins=5, labels=False)

    # Fixed seed keeps the fold assignment reproducible between runs.
    kf = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=42)

    # `split` yields positional row indices, so collect the fold ids in a
    # positional array instead of doing label-based `.loc` assignment.
    folds = np.full(len(data), -1, dtype=np.int64)
    for fold_id, (_, valid_pos) in enumerate(kf.split(X=data, y=bins.values)):
        folds[valid_pos] = fold_id

    data["fold"] = folds
    return data

In [4]:
# First text segment fed to the model: CPC title followed by the anchor phrase.
title_and_anchor = train_df['title'] + ' ' + train_df['anchor']
train_df['input'] = title_and_anchor

# Add the `fold` column used by the training loop to split train/validation.
train_df = create_folds(data=train_df, num_splits=CFG.num_fold)

# Tokenizer

In [5]:
# Load the tokenizer stored at CFG.tokenizer_path. It is kept as a module-level
# name because TrainDataset.__getitem__ reads `tokenizer` directly and the
# training loop also hands it to the Trainer.
tokenizer = AutoTokenizer.from_pretrained(CFG.tokenizer_path)

# Dataset

In [6]:
class TrainDataset(Dataset):
    """Dataset yielding one tokenized (input, target) pair with its score.

    Reads the ``input``, ``target`` and ``score`` columns of the frame it is
    built from. Tokenization happens lazily in ``__getitem__`` using the
    module-level ``tokenizer``.
    """

    def __init__(self, df):
        # Extract plain arrays once so item access is positional indexing.
        self.inputs = df['input'].to_numpy().astype(str)
        self.targets = df['target'].to_numpy().astype(str)
        self.label = df['score'].to_numpy()

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, item):
        text, target, score = self.inputs[item], self.targets[item], self.label[item]

        # The two strings are encoded together as a sentence pair.
        encoded = tokenizer(text, target)

        # The score is cast to float32 and stored under the 'label' key.
        return {**encoded, 'label': score.astype(np.float32)}

# Train

In [7]:
def compute_metrics(eval_pred):
    """Return the Pearson correlation between predictions and labels.

    Args:
        eval_pred: a ``(predictions, labels)`` pair. ``predictions`` is an
            array holding one value per example (shape ``(n,)`` or
            ``(n, 1)``); ``labels`` is the matching 1-D array.

    Returns:
        ``{'pearson': r}`` where ``r`` is the off-diagonal entry of the 2x2
        correlation matrix.
    """
    raw_preds, labels = eval_pred
    # Collapse a trailing singleton dimension so both inputs are 1-D.
    flat_preds = raw_preds.reshape((len(raw_preds),))
    correlation = np.corrcoef(flat_preds, labels)
    return {'pearson': correlation[0, 1]}

In [8]:
def get_model(checkpoint=None):
    """Build a sequence-classification model with a single output label.

    Args:
        checkpoint: path or identifier to load weights from. When ``None``,
            ``CFG.model_path`` is used.

    Returns:
        The model returned by
        ``AutoModelForSequenceClassification.from_pretrained`` with
        ``num_labels=1``.
    """
    source = CFG.model_path if checkpoint is None else checkpoint
    return AutoModelForSequenceClassification.from_pretrained(source, num_labels=1)

In [None]:
# Out-of-fold predictions, accumulated one validation fold at a time so that a
# partially completed run still leaves the finished folds in `oof_df`.
oof_df = pd.DataFrame()

# Scratch directory for Trainer checkpoints; reused and wiped for every fold.
checkpoint_dir = "./tmp/uspppm"

for fold in range(CFG.num_fold):

    # Rows of the current fold are held out for validation; the rest train.
    tr_data = train_df[train_df['fold'] != fold].reset_index(drop=True)
    va_data = train_df[train_df['fold'] == fold].reset_index(drop=True)
    tr_dataset = TrainDataset(tr_data)
    va_dataset = TrainDataset(va_data)

    # Evaluate and checkpoint once per epoch; at the end of training the
    # checkpoint with the best validation Pearson is loaded back.
    args = TrainingArguments(
        output_dir=checkpoint_dir,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=CFG.learning_rate,
        per_device_train_batch_size=CFG.batch_size,
        per_device_eval_batch_size=CFG.batch_size,
        num_train_epochs=CFG.epochs,
        weight_decay=CFG.weight_decay,
        metric_for_best_model="pearson",
        load_best_model_at_end=True,
        save_total_limit=CFG.save_total_limit,
    )

    # Every fold starts again from the same pretrained checkpoint.
    model = get_model(CFG.model_path)
    trainer = Trainer(
        model,
        args,
        train_dataset=tr_dataset,
        eval_dataset=va_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    trainer.train()
    # The intermediate checkpoints are removed before the final model is
    # written to its own per-fold directory.
    shutil.rmtree(checkpoint_dir)
    trainer.save_model(f"uspppm_{CFG.name}_{fold}")

    # Predict the held-out fold and keep the predictions next to their rows.
    outputs = trainer.predict(va_dataset)
    predictions = outputs.predictions.reshape(-1)
    va_data['preds'] = predictions
    oof_df = pd.concat([oof_df, va_data])

    # The trainer also references the model (and its optimizer state), so it
    # has to be dropped too; otherwise the previous fold's memory stays alive
    # while the next model is created. Collect garbage before asking the CUDA
    # allocator to release its now-unreferenced cached blocks.
    del model, trainer
    gc.collect()
    torch.cuda.empty_cache()

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Some weights of the model checkpoint at ../finetune_lm/deberta-v3-small+finetuned-patent/checkpoint-100000 were not used when initializing DebertaV2ForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a mode

Epoch,Training Loss,Validation Loss,Pearson
1,No log,0.059907,0.334273
2,0.069200,0.037875,0.658037
3,0.045600,0.034925,0.700923
4,0.031900,0.036778,0.715235
5,0.025600,0.035775,0.72184


***** Running Evaluation *****
  Num examples = 7295
  Batch size = 64
Saving model checkpoint to ./tmp/uspppm/checkpoint-456
Configuration saved in ./tmp/uspppm/checkpoint-456/config.json
Model weights saved in ./tmp/uspppm/checkpoint-456/pytorch_model.bin
tokenizer config file saved in ./tmp/uspppm/checkpoint-456/tokenizer_config.json
Special tokens file saved in ./tmp/uspppm/checkpoint-456/special_tokens_map.json
added tokens file saved in ./tmp/uspppm/checkpoint-456/added_tokens.json
***** Running Evaluation *****
  Num examples = 7295
  Batch size = 64
Saving model checkpoint to ./tmp/uspppm/checkpoint-912
Configuration saved in ./tmp/uspppm/checkpoint-912/config.json
Model weights saved in ./tmp/uspppm/checkpoint-912/pytorch_model.bin
tokenizer config file saved in ./tmp/uspppm/checkpoint-912/tokenizer_config.json
Special tokens file saved in ./tmp/uspppm/checkpoint-912/special_tokens_map.json
added tokens file saved in ./tmp/uspppm/checkpoint-912/added_tokens.json
***** Running 

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
loading configuration file ../finetune_lm/deberta-v3-small+finetuned-patent/checkpoint-100000/config.json
Model config DebertaV2Config {
  "_name_or_path": "../finetune_lm/deberta-v3-small+finetuned-patent/checkpoint-100000",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": 

Epoch,Training Loss,Validation Loss,Pearson
1,No log,0.060588,0.329549
2,0.068500,0.0388,0.653759
3,0.045400,0.036826,0.699552
4,0.032000,0.034268,0.717141
5,0.025900,0.033962,0.725771


***** Running Evaluation *****
  Num examples = 7295
  Batch size = 64
Saving model checkpoint to ./tmp/uspppm/checkpoint-456
Configuration saved in ./tmp/uspppm/checkpoint-456/config.json
Model weights saved in ./tmp/uspppm/checkpoint-456/pytorch_model.bin
tokenizer config file saved in ./tmp/uspppm/checkpoint-456/tokenizer_config.json
Special tokens file saved in ./tmp/uspppm/checkpoint-456/special_tokens_map.json
added tokens file saved in ./tmp/uspppm/checkpoint-456/added_tokens.json
***** Running Evaluation *****
  Num examples = 7295
  Batch size = 64
Saving model checkpoint to ./tmp/uspppm/checkpoint-912
Configuration saved in ./tmp/uspppm/checkpoint-912/config.json
Model weights saved in ./tmp/uspppm/checkpoint-912/pytorch_model.bin
tokenizer config file saved in ./tmp/uspppm/checkpoint-912/tokenizer_config.json
Special tokens file saved in ./tmp/uspppm/checkpoint-912/special_tokens_map.json
added tokens file saved in ./tmp/uspppm/checkpoint-912/added_tokens.json
***** Running 

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
loading configuration file ../finetune_lm/deberta-v3-small+finetuned-patent/checkpoint-100000/config.json
Model config DebertaV2Config {
  "_name_or_path": "../finetune_lm/deberta-v3-small+finetuned-patent/checkpoint-100000",
  "architectures": [
    "DebertaV2ForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": 

Epoch,Training Loss,Validation Loss


In [None]:
# Score all out-of-fold predictions at once against the true scores.
predictions, label = oof_df['preds'].values, oof_df['score'].values
eval_pred = (predictions, label)
compute_metrics(eval_pred)

In [None]:
# Write the out-of-fold predictions to oof_df.csv in the working directory.
# `index` is left at its default, so the DataFrame index is written as well.
oof_df.to_csv('oof_df.csv')