<a href="https://colab.research.google.com/github/bk-anupam/KaggleChallenges/blob/master/NLP/PatentPhraseMatching/USPPPM_train_hf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvidia-smi

Fri May 20 06:00:51 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   65C    P8    34W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
%pip install -q transformers[sentencepiece] datasets

Collecting transformers[sentencepiece]
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 8.0 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 61.0 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 55.8 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 66.9 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 7.5 MB/s 
[?25hCollecting sentencepiece!=0.1.92,>=0.1.91
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux

In [None]:
import numpy as np
import pandas as pd
import torch
import os
from sklearn.model_selection import StratifiedKFold
from datasets import Dataset
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import warnings

warnings.filterwarnings('ignore')


In [None]:
class TrainingArgs:
    weight_decay = 0.01
    learning_rate = 2e-5  
    warmup_ratio = 0.1
    gradient_accumulation_steps = 1
    fp16 = True
    lr_scheduler_type = "cosine"
    # Number of checkpoints to save for each model
    save_total_limit = 1

class Config:
    DATA_PATH = "/content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/data/"
    # location where trained model weights are saved
    OUT_DIR = "/content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/"
    RUNTIME = "COLAB"
    RANDOM_STATE = 42
    BATCH_SIZE = 128
    NUM_LABELS = 1
    NUM_FOLDS = 5
    RUN_ALL_FOLDS = True
    NUM_EPOCHS = 4
    NUM_WORKERS = 8
    TRANSFORMER_CHECKPOINT = "microsoft/deberta-v3-small"
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    SUBSET_ROWS_FRAC = 0.1
    TRAIN_ON_SUBSET = False
    RANDOM_SEED = 42

os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
df_train = pd.read_csv(Config.DATA_PATH + "train.csv")
df_test = pd.read_csv(Config.DATA_PATH + "test.csv")
df_titles = pd.read_csv(Config.DATA_PATH + "titles.csv")

In [None]:
df_train.target.value_counts()

composition                    24
data                           22
metal                          22
motor                          22
assembly                       21
                               ..
switching switch over valve     1
switching switch off valve      1
switching over valve            1
switching off valve             1
wooden substrate                1
Name: target, Length: 29340, dtype: int64

In [None]:
df_train.anchor.value_counts()

component composite coating              152
sheet supply roller                      150
source voltage                           140
perfluoroalkyl group                     136
el display                               135
                                        ... 
plug nozzle                                2
shannon                                    2
dry coating composition1                   2
peripheral nervous system stimulation      1
conduct conducting material                1
Name: anchor, Length: 733, dtype: int64

In [None]:
df_train.context.value_counts()

H01    2186
H04    2177
G01    1812
A61    1477
F16    1091
       ... 
B03      47
F17      33
B31      24
A62      23
F26      18
Name: context, Length: 106, dtype: int64

In [None]:
df_train["section"] = df_train.context.str[0]

In [None]:
df_train.section.value_counts()

B    8019
H    6195
G    6013
C    5288
A    4094
F    4054
E    1531
D    1279
Name: section, dtype: int64

In [None]:
df_train.score.value_counts()

0.50    12300
0.25    11519
0.00     7471
0.75     4029
1.00     1154
Name: score, dtype: int64

In [None]:
def strat_kfold_dataframe(df, target_col_name, num_folds=5):
    # we create a new column called kfold and fill it with -1
    df["kfold"] = -1
    # randomize of shuffle the rows of dataframe before splitting is done
    df.sample(frac=1, random_state=Config.RANDOM_STATE).reset_index(drop=True)
    y = df[target_col_name].values
    skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=Config.RANDOM_STATE)
    # stratification is done on the basis of y labels, a placeholder for X is sufficient
    for fold, (train_idx, val_idx) in enumerate(skf.split(X=df, y=y)):
        df.loc[val_idx, "kfold"] = fold
    return df

In [None]:
if Config.TRAIN_ON_SUBSET:
    print(f"Selecting {Config.SUBSET_ROWS_FRAC * 100}% training data")
    df_train = df_train.sample(frac=Config.SUBSET_ROWS_FRAC, random_state=Config.RANDOM_SEED).reset_index(drop=True)

# Since the target column (score) is continuous, we need to create bins out of the target column
df_train.loc[:, "bins"] = pd.cut(df_train.score, bins=5, labels=[0,1,2,3,4])
# Now do a stratified k fold on the bins column (which is a categorical column)
df_train = strat_kfold_dataframe(df_train, target_col_name="bins", num_folds=Config.NUM_FOLDS)            
# drop the bin column
df_train = df_train.drop(["bins"], axis=1)

In [None]:
# Let us check if the stratification has been done correctly
# The mean of score column should be similar across folds 
fold_score_mean = []
for fold in range(Config.NUM_FOLDS):
    fold_score_mean.append(np.mean(df_train[df_train.kfold == fold].score.values))
fold_score_mean

[0.36209732693625774,
 0.36209732693625774,
 0.36209732693625774,
 0.3619755963805868,
 0.3620441458733205]

In [None]:
df_titles.head()

Unnamed: 0,code,title,section,class,subclass,group,main_group
0,A,HUMAN NECESSITIES,A,,,,
1,A01,AGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTI...,A,1.0,,,
2,A01B,SOIL WORKING IN AGRICULTURE OR FORESTRY; PARTS...,A,1.0,B,,
3,A01B1/00,Hand tools (edge trimmers for lawns A01G3/06 ...,A,1.0,B,1.0,0.0
4,A01B1/02,Spades; Shovels {(hand-operated dredgers E02F3...,A,1.0,B,1.0,2.0


In [None]:
df_train = pd.merge(
    left = df_train,
    right = df_titles[["code", "title"]],
    how = "inner",
    left_on = "context",
    right_on = "code"
)

In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained(Config.TRANSFORMER_CHECKPOINT)
# DataCollatorWithPadding pads each batch to the longest sequence length
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

loading configuration file https://huggingface.co/microsoft/deberta-v3-small/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/8e0c12a7672d1d36f647c86e5fc3a911f189d8704e2bc94dde4a1ffe38f648fa.9df96bac06c2c492bc77ad040068f903c93beec14607428f25bf9081644ad0da
Model config DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-small",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att

In [None]:
sep = tokenizer.sep_token

In [None]:
df_train["inputs"] = df_train.anchor + sep + df_train.target + sep + df_train.title 
df_train.head()

Unnamed: 0,id,anchor,target,context,score,section,kfold,code,title,inputs
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,A,1,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,abatement[SEP]abatement of pollution[SEP]FURNI...
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,A,3,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,abatement[SEP]act of abating[SEP]FURNITURE; DO...
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,A,2,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,abatement[SEP]active catalyst[SEP]FURNITURE; D...
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,A,3,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,abatement[SEP]eliminating process[SEP]FURNITUR...
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,A,0,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,abatement[SEP]forest region[SEP]FURNITURE; DOM...


In [None]:
def tokenize_text(tokenizer, with_labels, row):
    encoding = tokenizer(
        text = row["inputs"],
        padding = False,
        truncation = True
    )
    if with_labels:
        encoding["labels"] = row["score"]
    return encoding

In [None]:
# test_enc = tokenize_text(tokenizer, True, df_train.loc[0, :])
# test_enc

In [None]:
# ds_train_raw = Dataset.from_pandas(df_train)
# ds_train_raw

In [None]:
from functools import partial

preprocess_train_data = partial(tokenize_text, tokenizer, True)  
preprocess_test_data = partial(tokenize_text, tokenizer, False)  

In [None]:
def get_fold_dls(fold, df):
    train_df = df[df.kfold != fold].reset_index(drop=True)
    valid_df = df[df.kfold == fold].reset_index(drop=True)
    ds_train_raw = Dataset.from_pandas(train_df)
    ds_valid_raw = Dataset.from_pandas(valid_df)
    raw_ds_col_names = ds_train_raw.column_names    
    ds_train = ds_train_raw.map(preprocess_train_data, batched=True, batch_size=1000, remove_columns=raw_ds_col_names)
    ds_valid = ds_valid_raw.map(preprocess_train_data, batched=True, batch_size=1000, remove_columns=raw_ds_col_names)    
    return train_df, valid_df, ds_train, ds_valid

In [None]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.reshape(len(predictions))
    return {
        'pearson': np.corrcoef(predictions, labels)[0][1]
    }

In [None]:
def get_oof_preds(trainer, ds_val, df_val_fold):
    oof_outputs = trainer.predict(ds_val)
    oof_predictions = oof_outputs.predictions.reshape(-1)
    df_val_fold["val_preds"] = oof_predictions
    return df_val_fold

In [None]:
df_val_preds = pd.DataFrame()
for fold in range(Config.NUM_FOLDS):
    fold_str = f"fold{fold}"
    print(f"Running training for {fold_str}")
    df_train_fold, df_val_fold, ds_train, ds_val = get_fold_dls(fold, df_train)
    training_args = TrainingArguments(
        output_dir=Config.OUT_DIR + fold_str,
        evaluation_strategy="epoch",
        save_strategy='epoch',        
        num_train_epochs=Config.NUM_EPOCHS,
        per_device_train_batch_size=Config.BATCH_SIZE,
        per_device_eval_batch_size=Config.BATCH_SIZE,
        warmup_ratio=TrainingArgs.warmup_ratio,
        weight_decay=TrainingArgs.weight_decay,
        learning_rate=TrainingArgs.learning_rate,    
        gradient_accumulation_steps=TrainingArgs.gradient_accumulation_steps,
        fp16=TrainingArgs.fp16,
        lr_scheduler_type=TrainingArgs.lr_scheduler_type,
        save_total_limit=TrainingArgs.save_total_limit
    )
    model = AutoModelForSequenceClassification.from_pretrained(Config.TRANSFORMER_CHECKPOINT, num_labels=Config.NUM_LABELS)    
    trainer = Trainer(
        model=model,                         # the instantiated Transformers model to be trained
        args=training_args,                  # training arguments, defined above
        train_dataset=ds_train,              # training dataset
        eval_dataset=ds_val,                 # evaluation dataset
        compute_metrics=compute_metrics,     # the callback that computes metrics of interest
        data_collator=data_collator,
        tokenizer=tokenizer
    )
    trainer.train()
    df_val_fold = get_oof_preds(trainer, ds_val, df_val_fold) 
    display(df_val_fold.head())
    df_val_preds = pd.concat([df_val_preds, df_val_fold], axis=0)
    # export the oof predictions to csv for later use in stacking
    if Config.RUNTIME != "KAGGLE":
        df_val_preds.to_csv(Config.DATA_PATH + "df_train_oof_preds.csv")
    else:
        df_val_preds.to_csv("/kaggle/working/df_train_oof_preds.csv")
    print(f"Saved OOF predictions for fold {fold}")    
    del model, trainer
    if not Config.RUN_ALL_FOLDS:
        break


Running training for fold0


  0%|          | 0/30 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


  0%|          | 0/8 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/microsoft/deberta-v3-small/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/8e0c12a7672d1d36f647c86e5fc3a911f189d8704e2bc94dde4a1ffe38f648fa.9df96bac06c2c492bc77ad040068f903c93beec14607428f25bf9081644ad0da
Model config DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-small",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": 

Epoch,Training Loss,Validation Loss,Pearson
1,No log,0.028728,0.779081
2,No log,0.027757,0.817237
3,0.050600,0.022291,0.828972
4,0.050600,0.023141,0.829249


***** Running Evaluation *****
  Num examples = 7295
  Batch size = 128
Saving model checkpoint to /content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/fold0/checkpoint-228
Configuration saved in /content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/fold0/checkpoint-228/config.json
Model weights saved in /content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/fold0/checkpoint-228/pytorch_model.bin
tokenizer config file saved in /content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/fold0/checkpoint-228/tokenizer_config.json
Special tokens file saved in /content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/fold0/checkpoint-228/special_tokens_map.json
added tokens file saved in /content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/fold0/checkpoint-228/added_tokens.json
***** Running Evaluation *****
  Num examples = 7295
  Batch size = 128
Saving model checkpoint to /content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/fold0/checkpoin

Unnamed: 0,id,anchor,target,context,score,section,kfold,code,title,inputs,val_preds
0,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,A,0,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,abatement[SEP]forest region[SEP]FURNITURE; DOM...,0.16687
1,ef2d4c2e6bbb208d,abatement,mixing core materials,A47,0.25,A,0,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,abatement[SEP]mixing core materials[SEP]FURNIT...,0.196289
2,cc96541d4987b399,abatement,rent abatement,A47,0.0,A,0,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,abatement[SEP]rent abatement[SEP]FURNITURE; DO...,0.190186
3,a8c9e9f37d4d836a,abatement,tax abatement,A47,0.0,A,0,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,abatement[SEP]tax abatement[SEP]FURNITURE; DOM...,0.060059
4,604210b7c7ce2f6a,adhesive mounting,adhesive,A47,0.5,A,0,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,adhesive mounting[SEP]adhesive[SEP]FURNITURE; ...,0.524414


Saved OOF predictions for fold 0
Running training for fold1


  0%|          | 0/30 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/microsoft/deberta-v3-small/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/8e0c12a7672d1d36f647c86e5fc3a911f189d8704e2bc94dde4a1ffe38f648fa.9df96bac06c2c492bc77ad040068f903c93beec14607428f25bf9081644ad0da
Model config DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-small",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": 

Epoch,Training Loss,Validation Loss,Pearson
1,No log,0.026012,0.784165
2,No log,0.024049,0.813662
3,0.038900,0.02442,0.824081
4,0.038900,0.02402,0.825055


***** Running Evaluation *****
  Num examples = 7295
  Batch size = 128
Saving model checkpoint to /content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/fold1/checkpoint-228
Configuration saved in /content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/fold1/checkpoint-228/config.json
Model weights saved in /content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/fold1/checkpoint-228/pytorch_model.bin
tokenizer config file saved in /content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/fold1/checkpoint-228/tokenizer_config.json
Special tokens file saved in /content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/fold1/checkpoint-228/special_tokens_map.json
added tokens file saved in /content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/fold1/checkpoint-228/added_tokens.json
***** Running Evaluation *****
  Num examples = 7295
  Batch size = 128
Saving model checkpoint to /content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/fold1/checkpoin

Unnamed: 0,id,anchor,target,context,score,section,kfold,code,title,inputs,val_preds
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,A,1,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,abatement[SEP]abatement of pollution[SEP]FURNI...,0.513672
1,4c3f2750e7540ab7,abatement,multi pollution abatement device,A47,0.5,A,1,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,abatement[SEP]multi pollution abatement device...,0.415771
2,bfd7270f57530991,abatement,pollution abatement,A47,0.5,A,1,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,abatement[SEP]pollution abatement[SEP]FURNITUR...,0.514648
3,9001756895ec8ca1,abatement,pollution certificate,A47,0.0,A,1,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,abatement[SEP]pollution certificate[SEP]FURNIT...,-0.022491
4,deb9204cd9783e8b,abatement,sorbent material,A47,0.25,A,1,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,abatement[SEP]sorbent material[SEP]FURNITURE; ...,0.383301


Saved OOF predictions for fold 1
Running training for fold2


  0%|          | 0/30 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/microsoft/deberta-v3-small/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/8e0c12a7672d1d36f647c86e5fc3a911f189d8704e2bc94dde4a1ffe38f648fa.9df96bac06c2c492bc77ad040068f903c93beec14607428f25bf9081644ad0da
Model config DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-small",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": 

Epoch,Training Loss,Validation Loss,Pearson
1,No log,0.028128,0.776928
2,No log,0.025106,0.809422
3,0.039000,0.024521,0.8197
4,0.039000,0.024911,0.819852


***** Running Evaluation *****
  Num examples = 7295
  Batch size = 128
Saving model checkpoint to /content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/fold2/checkpoint-228
Configuration saved in /content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/fold2/checkpoint-228/config.json
Model weights saved in /content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/fold2/checkpoint-228/pytorch_model.bin
tokenizer config file saved in /content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/fold2/checkpoint-228/tokenizer_config.json
Special tokens file saved in /content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/fold2/checkpoint-228/special_tokens_map.json
added tokens file saved in /content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/fold2/checkpoint-228/added_tokens.json
***** Running Evaluation *****
  Num examples = 7295
  Batch size = 128
Saving model checkpoint to /content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/fold2/checkpoin

Unnamed: 0,id,anchor,target,context,score,section,kfold,code,title,inputs,val_preds
0,36d72442aefd8232,abatement,active catalyst,A47,0.25,A,2,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,abatement[SEP]active catalyst[SEP]FURNITURE; D...,0.270264
1,e1f44e48399a2027,abatement,measurement level,A47,0.25,A,2,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,abatement[SEP]measurement level[SEP]FURNITURE;...,0.224487
2,84261a11e5d1b68b,abatement,noise reduction,A47,0.5,A,2,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,abatement[SEP]noise reduction[SEP]FURNITURE; D...,0.464844
3,7920ba9fb0bf4578,adhesive mounting,adhesive mount,A47,1.0,A,2,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,adhesive mounting[SEP]adhesive mount[SEP]FURNI...,1.045898
4,7f028ab59b51e8bc,adhesive mounting,cohesive mount,A47,0.5,A,2,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,adhesive mounting[SEP]cohesive mount[SEP]FURNI...,0.711426


Saved OOF predictions for fold 2
Running training for fold3


  0%|          | 0/30 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/microsoft/deberta-v3-small/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/8e0c12a7672d1d36f647c86e5fc3a911f189d8704e2bc94dde4a1ffe38f648fa.9df96bac06c2c492bc77ad040068f903c93beec14607428f25bf9081644ad0da
Model config DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-small",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": 

Epoch,Training Loss,Validation Loss,Pearson
1,No log,0.029911,0.774486
2,No log,0.024708,0.806176
3,0.038500,0.025847,0.815447
4,0.038500,0.02476,0.8169


***** Running Evaluation *****
  Num examples = 7294
  Batch size = 128
Saving model checkpoint to /content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/fold3/checkpoint-228
Configuration saved in /content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/fold3/checkpoint-228/config.json
Model weights saved in /content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/fold3/checkpoint-228/pytorch_model.bin
tokenizer config file saved in /content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/fold3/checkpoint-228/tokenizer_config.json
Special tokens file saved in /content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/fold3/checkpoint-228/special_tokens_map.json
added tokens file saved in /content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/fold3/checkpoint-228/added_tokens.json
***** Running Evaluation *****
  Num examples = 7294
  Batch size = 128
Saving model checkpoint to /content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/fold3/checkpoin

Unnamed: 0,id,anchor,target,context,score,section,kfold,code,title,inputs,val_preds
0,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,A,3,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,abatement[SEP]act of abating[SEP]FURNITURE; DO...,0.646973
1,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,A,3,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,abatement[SEP]eliminating process[SEP]FURNITUR...,0.5
2,1222e36d9a94c2a4,abatement,stone abutments,A47,0.0,A,3,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,abatement[SEP]stone abutments[SEP]FURNITURE; D...,0.253418
3,c450cc69fa315db9,abatement,water bodies,A47,0.0,A,3,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,abatement[SEP]water bodies[SEP]FURNITURE; DOME...,0.262695
4,f3186d573b97d3a6,adhesive mounting,flange mounting,A47,0.5,A,3,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,adhesive mounting[SEP]flange mounting[SEP]FURN...,0.410645


Saved OOF predictions for fold 3
Running training for fold4


  0%|          | 0/30 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/microsoft/deberta-v3-small/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/8e0c12a7672d1d36f647c86e5fc3a911f189d8704e2bc94dde4a1ffe38f648fa.9df96bac06c2c492bc77ad040068f903c93beec14607428f25bf9081644ad0da
Model config DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-small",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": 

Epoch,Training Loss,Validation Loss,Pearson
1,No log,0.028856,0.780531
2,No log,0.024593,0.810226
3,0.038800,0.025328,0.81669
4,0.038800,0.024753,0.818874


***** Running Evaluation *****
  Num examples = 7294
  Batch size = 128
Saving model checkpoint to /content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/fold4/checkpoint-228
Configuration saved in /content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/fold4/checkpoint-228/config.json
Model weights saved in /content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/fold4/checkpoint-228/pytorch_model.bin
tokenizer config file saved in /content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/fold4/checkpoint-228/tokenizer_config.json
Special tokens file saved in /content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/fold4/checkpoint-228/special_tokens_map.json
added tokens file saved in /content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/fold4/checkpoint-228/added_tokens.json
***** Running Evaluation *****
  Num examples = 7294
  Batch size = 128
Saving model checkpoint to /content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/fold4/checkpoin

Unnamed: 0,id,anchor,target,context,score,section,kfold,code,title,inputs,val_preds
0,067203128142739c,abatement,greenhouse gases,A47,0.25,A,4,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,abatement[SEP]greenhouse gases[SEP]FURNITURE; ...,0.19104
1,061d17f04be2d1cf,abatement,increased rate,A47,0.25,A,4,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,abatement[SEP]increased rate[SEP]FURNITURE; DO...,0.204834
2,0a425937a3e86d10,abatement,minimising sounds,A47,0.5,A,4,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,abatement[SEP]minimising sounds[SEP]FURNITURE;...,0.312012
3,b3832eac81b73dfd,abatement,pollution abatement incinerator,A47,0.5,A,4,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,abatement[SEP]pollution abatement incinerator[...,0.312012
4,4603e13580940257,adhesive mounting,adhering mount,A47,0.75,A,4,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,adhesive mounting[SEP]adhering mount[SEP]FURNI...,1.02832


Saved OOF predictions for fold 4


In [None]:
# Calculate the CV score
predictions = df_val_preds['val_preds'].values
labels = df_val_preds['score'].values
eval_preds = predictions, labels
cv_metric_dict = compute_metrics(eval_preds)
print(f"CV score = {cv_metric_dict}")

CV score = {'pearson': 0.8219346967600348}


Using deberta-v3-small model that has been pretrained on patent abstract data using masked language modeling (Kaggle notebook USPPPM_train_debertav3small_mlm) gives a reduced CV score of 0.82017 when fine tuned on competition data