In [1]:
!nvidia-smi

Mon Jun  6 12:02:38 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
%pip install -q transformers[sentencepiece] datasets wandb

[K     |████████████████████████████████| 4.2 MB 19.7 MB/s 
[K     |████████████████████████████████| 346 kB 46.5 MB/s 
[K     |████████████████████████████████| 1.8 MB 65.3 MB/s 
[K     |████████████████████████████████| 86 kB 7.1 MB/s 
[K     |████████████████████████████████| 212 kB 75.2 MB/s 
[K     |████████████████████████████████| 1.1 MB 62.8 MB/s 
[K     |████████████████████████████████| 86 kB 7.8 MB/s 
[K     |████████████████████████████████| 140 kB 78.9 MB/s 
[K     |████████████████████████████████| 596 kB 66.4 MB/s 
[K     |████████████████████████████████| 127 kB 72.6 MB/s 
[K     |████████████████████████████████| 145 kB 73.7 MB/s 
[K     |████████████████████████████████| 181 kB 73.3 MB/s 
[K     |████████████████████████████████| 63 kB 2.5 MB/s 
[K     |████████████████████████████████| 271 kB 75.9 MB/s 
[K     |████████████████████████████████| 144 kB 77.5 MB/s 
[K     |████████████████████████████████| 94 kB 2.3 MB/s 
[K     |███████████████████████

### In this notebook we experiment by finetuning a bert-for-patents model on competition data by adding patent section as special token to the tokenizer vocab.

In [4]:
%pip install bitsandbytes-cuda112

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bitsandbytes-cuda112
  Downloading bitsandbytes_cuda112-0.26.0-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 32.4 MB/s 
[?25hInstalling collected packages: bitsandbytes-cuda112
Successfully installed bitsandbytes-cuda112-0.26.0


In [5]:
# This tests if the installation was successful
!wget https://gist.githubusercontent.com/TimDettmers/1f5188c6ee6ed69d211b7fe4e381e713/raw/4d17c3d09ccdb57e9ab7eca0171f2ace6e4d2858/check_bnb_install.py && python check_bnb_install.py

--2022-06-06 12:03:27--  https://gist.githubusercontent.com/TimDettmers/1f5188c6ee6ed69d211b7fe4e381e713/raw/4d17c3d09ccdb57e9ab7eca0171f2ace6e4d2858/check_bnb_install.py
Resolving gist.githubusercontent.com (gist.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to gist.githubusercontent.com (gist.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 327 [text/plain]
Saving to: ‘check_bnb_install.py’


2022-06-06 12:03:28 (16.0 MB/s) - ‘check_bnb_install.py’ saved [327/327]

SUCCESS!
Installation was successful!


In [6]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
my_drive = GoogleDrive(gauth)

In [7]:
import numpy as np
import pandas as pd
import torch
import os
from sklearn.model_selection import StratifiedKFold
from datasets import Dataset
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import torch.multiprocessing as mp
import wandb
import warnings
import bitsandbytes as bnb
from transformers.trainer_pt_utils import get_parameter_names

warnings.filterwarnings('ignore')


In [13]:
class WandbConfig:
    WANDB_KEY = "c5e2877bf080e6b62fcc57231c91e3a1455f97d0"
    WANDB_RUN_NAME = "deberta-v2-xlarge-run1"
    WANDB_PROJECT = "huggingface"
    USE_WANDB = True  

class TrainingArgs:
    weight_decay = 0.01
    learning_rate = 3e-5  
    warmup_ratio = 0.1
    gradient_accumulation_steps = 1
    fp16 = True
    lr_scheduler_type = "linear"
    # Number of checkpoints to save for each model
    save_total_limit = 1
    gradient_checkpointing = False
    max_steps=-1 # set >0 to limit
    report_to="wandb"
    max_grad_norm=50.0

class Config:  
    MODEL_NAME = "deberta-v2-xlarge"
    DATA_PATH = "/content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/data/"
    VAL_PREDS_PATH = "/content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/preds/"
    # location where trained model weights are saved
    OUT_DIR = "/content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/deberta-v2-xlarge/"
    RUNTIME = "COLAB"
    RANDOM_STATE = 42
    BATCH_SIZE = 8
    NUM_LABELS = 1
    NUM_FOLDS = 5
    RUN_ALL_FOLDS = False
    NUM_EPOCHS = 4
    NUM_WORKERS = mp.cpu_count()
    TRANSFORMER_CHECKPOINT = "microsoft/deberta-v2-xlarge"
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    SUBSET_ROWS_FRAC = 0.05
    TRAIN_ON_SUBSET = True
    RANDOM_SEED = 42
    ADAM_BITS = 8

os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [14]:
def empty_gdrive_trash():
    deleted_file_name = []
    for a_file in my_drive.ListFile({'q': "trashed = true"}).GetList():
        file_name = a_file['title']
        deleted_file_name.append(file_name)
        # delete the file permanently.
        a_file.Delete()
    print("The below files were cleared from trash")
    print(deleted_file_name)

In [15]:
if WandbConfig.USE_WANDB:
    if Config.RUNTIME == "KAGGLE":
        pass
        #wandb_login()
    else:
        wandb.login(key=WandbConfig.WANDB_KEY)        

[34m[1mwandb[0m: Currently logged in as: [33mbkanupam[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [16]:
empty_gdrive_trash()

The below files were cleared from trash
['Kaggle']


In [17]:
df_train = pd.read_csv(Config.DATA_PATH + "train.csv")
df_test = pd.read_csv(Config.DATA_PATH + "test.csv")
df_titles = pd.read_csv(Config.DATA_PATH + "titles.csv")

In [18]:
df_train["section"] = df_train.context.str[0]

In [19]:
from sklearn.preprocessing import LabelEncoder

anchor_encoder = LabelEncoder()
df_train["anchor_map"] = anchor_encoder.fit_transform(df_train["anchor"])
# Score is not really a continuous value here as there are just five distinct values. But since it is float it needs to be converted
# to categorical value before we can perform stratified split on score
df_train["score_map"] = df_train["score"].map({0.00: 0, 0.25: 1, 0.50: 2, 0.75: 3, 1.00: 4})

In [20]:
from sklearn import model_selection

def strat_group_kfold_dataframe(df, target_col_name, group_col_name, num_folds=Config.NUM_FOLDS):
    # we create a new column called kfold and fill it with -1
    df["kfold"] = -1
    # randomize of shuffle the rows of dataframe before splitting is done
    df = df.sample(frac=1, random_state=Config.RANDOM_SEED).reset_index(drop=True)
    # get the target data
    y = df[target_col_name].values    
    groups = df[group_col_name].values
    # stratify data using anchor as group and score as target
    skf = model_selection.StratifiedGroupKFold(n_splits=num_folds, shuffle=True, random_state=Config.RANDOM_SEED)
    for fold, (train_index, val_index) in enumerate(skf.split(X=df, y=y, groups=groups)):
        df.loc[val_index, "kfold"] = fold        
    return df     

In [21]:
def strat_kfold_dataframe(df, target_col_name, num_folds=5):
    # we create a new column called kfold and fill it with -1
    df["kfold"] = -1
    # randomize of shuffle the rows of dataframe before splitting is done
    df.sample(frac=1, random_state=Config.RANDOM_STATE).reset_index(drop=True)
    y = df[target_col_name].values
    skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=Config.RANDOM_STATE)
    # stratification is done on the basis of y labels, a placeholder for X is sufficient
    for fold, (train_idx, val_idx) in enumerate(skf.split(X=df, y=y)):
        df.loc[val_idx, "kfold"] = fold
    return df

In [22]:
if Config.TRAIN_ON_SUBSET:
    print(f"Selecting {Config.SUBSET_ROWS_FRAC * 100}% training data")
    df_train = df_train.sample(frac=Config.SUBSET_ROWS_FRAC, random_state=Config.RANDOM_SEED).reset_index(drop=True)

# Since the target column (score) is continuous, we need to create bins out of the target column
df_train.loc[:, "bins"] = pd.cut(df_train.score, bins=5, labels=[0,1,2,3,4])
df_train = strat_kfold_dataframe(df_train, target_col_name="bins", num_folds=Config.NUM_FOLDS)

# Now do a stratified group k fold on the bins column (which is a categorical column) and anchor as groups
# df_train = strat_group_kfold_dataframe(df_train, target_col_name="score_map", group_col_name="anchor_map", num_folds=Config.NUM_FOLDS)            
# drop the bin column
df_train = df_train.drop(["bins"], axis=1)
df_train = df_train.drop(["anchor_map", "score_map"], axis=1)

Selecting 5.0% training data


In [23]:
# Let us check if the stratification has been done correctly
# The mean of score column should be similar across folds 
fold_score_mean = []
for fold in range(Config.NUM_FOLDS):
    fold_score_mean.append(np.mean(df_train[df_train.kfold == fold].score.values))
fold_score_mean

[0.36575342465753424,
 0.36506849315068496,
 0.36506849315068496,
 0.36506849315068496,
 0.3646978021978022]

In [24]:
df_titles.head()

Unnamed: 0,code,title,section,class,subclass,group,main_group
0,A,HUMAN NECESSITIES,A,,,,
1,A01,AGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTI...,A,1.0,,,
2,A01B,SOIL WORKING IN AGRICULTURE OR FORESTRY; PARTS...,A,1.0,B,,
3,A01B1/00,Hand tools (edge trimmers for lawns A01G3/06 ...,A,1.0,B,1.0,0.0
4,A01B1/02,Spades; Shovels {(hand-operated dredgers E02F3...,A,1.0,B,1.0,2.0


In [25]:
df_train = pd.merge(
    left = df_train,
    right = df_titles[["code", "title"]],
    how = "inner",
    left_on = "context",
    right_on = "code"
)

In [26]:
df_train

Unnamed: 0,id,anchor,target,context,score,section,kfold,code,title
0,ed1c4e525eb105fe,transmit alarm,display indicator,G08,0.00,G,2,G08,SIGNALLING
1,c261266d1ba87355,running tally,stationary items,G08,0.00,G,3,G08,SIGNALLING
2,499fd13434840bd3,main lane,entry lane,G08,0.25,G,1,G08,SIGNALLING
3,5889ce7d8b2da774,intruder detection,electronic information processing,G08,0.50,G,4,G08,SIGNALLING
4,9430304cd83c9637,calling card,numbers,G08,0.25,G,4,G08,SIGNALLING
...,...,...,...,...,...,...,...,...,...
1819,7516fa9e5f00a732,high gradient magnetic separators,photoelectric device,C02,0.00,C,4,C02,"TREATMENT OF WATER, WASTE WATER, SEWAGE, OR SL..."
1820,9ad5edeca151fde4,high gradient magnetic separators,separation by magnetic effect,C02,0.75,C,3,C02,"TREATMENT OF WATER, WASTE WATER, SEWAGE, OR SL..."
1821,ffd1307f72960a66,congruency,incongruence,C02,0.25,C,0,C02,"TREATMENT OF WATER, WASTE WATER, SEWAGE, OR SL..."
1822,b573b70fe84c1e23,electromagnetic radiation source,em radiation source,A22,0.75,A,2,A22,BUTCHERING; MEAT TREATMENT; PROCESSING POULTRY...


In [27]:
from transformers import AutoTokenizer, DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained(Config.TRANSFORMER_CHECKPOINT)
# DataCollatorWithPadding pads each batch to the longest sequence length
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
#model = AutoModelForSequenceClassification.from_pretrained(Config.TRANSFORMER_CHECKPOINT, num_labels=Config.NUM_LABELS)

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/633 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.33M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [28]:
deberta_vocab = tokenizer.get_vocab()
len(deberta_vocab)

128001

In [29]:
df_train['sectok'] = '[' + df_train.section + ']'
sectoks = list(df_train.sectok.unique())
sep = '[s]'
sectoks.append(sep)
tokenizer.add_special_tokens({'additional_special_tokens': sectoks})

9

In [30]:
#model.resize_token_embeddings(len(tokenizer.get_vocab()))

In [31]:
#sep = tokenizer.sep_token

In [32]:
df_train["inputs"] = df_train.sectok + sep + df_train.anchor + sep + df_train.target + sep + df_train.title 
df_train.head()

Unnamed: 0,id,anchor,target,context,score,section,kfold,code,title,sectok,inputs
0,ed1c4e525eb105fe,transmit alarm,display indicator,G08,0.0,G,2,G08,SIGNALLING,[G],[G][s]transmit alarm[s]display indicator[s]SIG...
1,c261266d1ba87355,running tally,stationary items,G08,0.0,G,3,G08,SIGNALLING,[G],[G][s]running tally[s]stationary items[s]SIGNA...
2,499fd13434840bd3,main lane,entry lane,G08,0.25,G,1,G08,SIGNALLING,[G],[G][s]main lane[s]entry lane[s]SIGNALLING
3,5889ce7d8b2da774,intruder detection,electronic information processing,G08,0.5,G,4,G08,SIGNALLING,[G],[G][s]intruder detection[s]electronic informat...
4,9430304cd83c9637,calling card,numbers,G08,0.25,G,4,G08,SIGNALLING,[G],[G][s]calling card[s]numbers[s]SIGNALLING


In [33]:
def tokenize_text(tokenizer, with_labels, row):
    encoding = tokenizer(
        text = row["inputs"],
        padding = False,
        truncation = True
    )
    if with_labels:
        encoding["labels"] = row["score"]
    return encoding

In [34]:
from functools import partial

preprocess_train_data = partial(tokenize_text, tokenizer, True)  
preprocess_test_data = partial(tokenize_text, tokenizer, False)  

In [35]:
def get_fold_dls(fold, df):
    train_df = df[df.kfold != fold].reset_index(drop=True)
    valid_df = df[df.kfold == fold].reset_index(drop=True)
    ds_train_raw = Dataset.from_pandas(train_df)
    ds_valid_raw = Dataset.from_pandas(valid_df)
    raw_ds_col_names = ds_train_raw.column_names    
    ds_train = ds_train_raw.map(preprocess_train_data, batched=True, batch_size=1000, remove_columns=raw_ds_col_names)
    ds_valid = ds_valid_raw.map(preprocess_train_data, batched=True, batch_size=1000, remove_columns=raw_ds_col_names)    
    return train_df, valid_df, ds_train, ds_valid

In [36]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.reshape(len(predictions))
    return {
        'pearson': np.corrcoef(predictions, labels)[0][1]
    }

In [37]:
def get_oof_preds(trainer, ds_val, df_val_fold):
    oof_outputs = trainer.predict(ds_val)
    oof_predictions = oof_outputs.predictions.reshape(-1)
    df_val_fold["val_preds"] = oof_predictions
    return df_val_fold

In [38]:
import math
import transformers

# Thanks to Nicolas Broad. Taken from https://www.kaggle.com/code/nbroad/8-bit-adam-optimization/notebook
def get_optimizer(model, args, train_dataset):    
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]

    # These are the only changes you need to make. The first part sets the optimizer to use 8-bits
    # The for loop sets embeddings to use 32-bits
    if Config.ADAM_BITS == 32:
        optimizer = bnb.optim.Adam32bit(optimizer_grouped_parameters, lr=args.learning_rate)
    if Config.ADAM_BITS == 8:
        optimizer = bnb.optim.Adam8bit(optimizer_grouped_parameters, lr=args.learning_rate)
        
    # Thank you @gregorlied https://www.kaggle.com/nbroad/8-bit-adam-optimization/comments#1661976
    for module in model.modules():
        if isinstance(module, torch.nn.Embedding):
            bnb.optim.GlobalOptimManager.get_instance().register_module_override(
                module, 'weight', {'optim_bits': 32}
            )            

    num_update_steps_per_epoch = len(train_dataset) // args.per_device_train_batch_size // args.gradient_accumulation_steps
    if args.max_steps == -1 or args.max_steps is None:
        args.max_steps = Config.NUM_EPOCHS * num_update_steps_per_epoch
    else:
        num_train_epochs = args.max_steps / num_update_steps_per_epoch
        #print(f"num_train_epochs = {num_train_epochs}")
        args.num_train_epochs = math.ceil(num_train_epochs)
        
    if args.warmup_ratio is not None:
        args.num_warmup_steps = int(args.warmup_ratio * args.max_steps)

    lr_scheduler = transformers.get_scheduler(
        name=args.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=args.max_steps,
    )
    return optimizer, lr_scheduler

In [39]:
import gc

df_val_preds = pd.DataFrame()
tok_vocab = tokenizer.get_vocab()
for fold in range(Config.NUM_FOLDS):
    fold_str = f"fold{fold}"
    print(f"Running training for {Config.MODEL_NAME} {fold_str}")
    df_train_fold, df_val_fold, ds_train, ds_val = get_fold_dls(fold, df_train)
    training_args = TrainingArguments(
        output_dir=Config.OUT_DIR + fold_str,
        evaluation_strategy="epoch",
        save_strategy='no',        
        num_train_epochs=Config.NUM_EPOCHS,
        per_device_train_batch_size=Config.BATCH_SIZE,
        per_device_eval_batch_size=Config.BATCH_SIZE,
        warmup_ratio=TrainingArgs.warmup_ratio,
        weight_decay=TrainingArgs.weight_decay,
        learning_rate=TrainingArgs.learning_rate,    
        gradient_accumulation_steps=TrainingArgs.gradient_accumulation_steps,
        fp16=TrainingArgs.fp16,
        lr_scheduler_type=TrainingArgs.lr_scheduler_type,
        save_total_limit=TrainingArgs.save_total_limit,
        gradient_checkpointing=TrainingArgs.gradient_checkpointing,
        max_steps=TrainingArgs.max_steps,
        report_to=TrainingArgs.report_to,
        max_grad_norm=TrainingArgs.max_grad_norm
    )
    model = AutoModelForSequenceClassification.from_pretrained(Config.TRANSFORMER_CHECKPOINT, num_labels=Config.NUM_LABELS)
    optimizer, lr_scheduler = get_optimizer(model, training_args, ds_train)
    print(f"len(tokenizer_vocab) = {len(tok_vocab)}")
    model.resize_token_embeddings(len(tok_vocab))    
    trainer = Trainer(
        model=model,                         # the instantiated Transformers model to be trained
        args=training_args,                  # training arguments, defined above
        train_dataset=ds_train,              # training dataset
        eval_dataset=ds_val,                 # evaluation dataset
        compute_metrics=compute_metrics,     # the callback that computes metrics of interest
        data_collator=data_collator,
        tokenizer=tokenizer,
        optimizers=(optimizer, lr_scheduler)
    )
    trainer.train()
    trainer.save_model(Config.OUT_DIR + fold_str)
    df_val_fold = get_oof_preds(trainer, ds_val, df_val_fold) 
    display(df_val_fold.head())
    df_val_preds = pd.concat([df_val_preds, df_val_fold], axis=0)
    # export the oof predictions to csv for later use in stacking
    if Config.RUNTIME != "KAGGLE":
        df_val_preds.to_csv(Config.VAL_PREDS_PATH + f"df_train_oof_preds_{Config.MODEL_NAME}.csv")
    else:
        df_val_preds.to_csv("/kaggle/working/df_train_oof_preds.csv")
    print(f"Saved OOF predictions for fold {fold}")    
    del model, trainer, optimizer, lr_scheduler
    gc.collect()
    torch.cuda.empty_cache()
    # Empty the trash to clear gdrive disk space
    empty_gdrive_trash()
    if not Config.RUN_ALL_FOLDS:
        break


Running training for deberta-v2-xlarge fold0


  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Downloading:   0%|          | 0.00/1.65G [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v2-xlarge were not used when initializing DebertaV2ForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v2-xlarge and are newly initialized: ['pooler.dens

len(tokenizer_vocab) = 128010


max_steps is given, it will override any value given in num_train_epochs
Using amp half precision backend
***** Running training *****
  Num examples = 1459
  Num Epochs = 4
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 728
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss,Pearson
1,No log,0.080652,0.001146
2,No log,0.069988,-0.021477
3,0.094300,0.068628,-0.106054
3,0.094300,0.06877,-0.014006


***** Running Evaluation *****
  Num examples = 365
  Batch size = 8
***** Running Evaluation *****
  Num examples = 365
  Batch size = 8
***** Running Evaluation *****
  Num examples = 365
  Batch size = 8
***** Running Evaluation *****
  Num examples = 365
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to /content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/deberta-v2-xlarge/fold0
Configuration saved in /content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/deberta-v2-xlarge/fold0/config.json
Model weights saved in /content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/deberta-v2-xlarge/fold0/pytorch_model.bin
tokenizer config file saved in /content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/deberta-v2-xlarge/fold0/tokenizer_config.json
Special tokens file saved in /content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/deberta-v2-xlarge/fold0/special_tokens_map.json

Unnamed: 0,id,anchor,target,context,score,section,kfold,code,title,sectok,inputs,val_preds
0,c880803cb7685eaf,intruder detection,electronic information detection,G08,0.5,G,0,G08,SIGNALLING,[G],[G][s]intruder detection[s]electronic informat...,0.37793
1,604be32eafe1314e,movement directions,wireless network directions,B60,0.0,B,0,B60,VEHICLES IN GENERAL,[B],[B][s]movement directions[s]wireless network d...,0.37793
2,f7625c13fe23e113,smooth outer surface,substantially smooth outer end,B60,0.5,B,0,B60,VEHICLES IN GENERAL,[B],[B][s]smooth outer surface[s]substantially smo...,0.37793
3,2baeaa93c69b6226,fall to low value,predetermined low value,B60,0.5,B,0,B60,VEHICLES IN GENERAL,[B],[B][s]fall to low value[s]predetermined low va...,0.377686
4,013e42f29329f821,pushing pin,rotating sliding member,B60,0.25,B,0,B60,VEHICLES IN GENERAL,[B],[B][s]pushing pin[s]rotating sliding member[s]...,0.37793


Saved OOF predictions for fold 0
The below files were cleared from trash
[]


In [40]:
# Calculate the CV score
predictions = df_val_preds['val_preds'].values
labels = df_val_preds['score'].values
eval_preds = predictions, labels
cv_metric_dict = compute_metrics(eval_preds)
print(f"CV score = {cv_metric_dict}")

CV score = {'pearson': -0.014005530527946916}


In [41]:
df_val_preds["score_pred_diff"] = df_val_preds.apply(lambda row: abs(row["val_preds"] - row["score"]), axis=1)
df_val_preds = df_val_preds.sort_values(by=["score_pred_diff"], ascending=False)
df_val_preds_diff = df_val_preds[df_val_preds.score_pred_diff > 0.2]
len(df_val_preds_diff)

131

In [42]:
df_val_preds_diff[["anchor", "target", "context", "score", "val_preds", "score_pred_diff"]]

Unnamed: 0,anchor,target,context,score,val_preds,score_pred_diff
142,combination function,combination functional,H04,1.00,0.37793,0.62207
362,panel frame,panel frames,C02,1.00,0.37793,0.62207
219,insulation sleeve,insulator sleeve,F28,1.00,0.37793,0.62207
50,catching surface,catch surface,B65,1.00,0.37793,0.62207
327,saturated felt,saturate felt,D06,1.00,0.37793,0.62207
...,...,...,...,...,...,...
63,coaxial cable transmission,coaxial transmission line,H03,0.75,0.37793,0.37207
353,square lattice,square array,G21,0.75,0.37793,0.37207
24,battery heater,portable heater,H01,0.75,0.37793,0.37207
57,dac system,da ic,H03,0.75,0.37793,0.37207
