In [None]:
# Report the GPU (model, driver version, memory usage) attached to this runtime.
!nvidia-smi

Sun May 29 06:45:10 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   51C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Mount Google Drive at /content/gdrive; the data and model paths in Config live under it.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Install transformers (with the sentencepiece extra) and datasets into the kernel environment.
# NOTE(review): versions are unpinned, so a re-run may pull different releases -- consider pinning.
%pip install -q transformers[sentencepiece] datasets

[K     |████████████████████████████████| 4.2 MB 8.0 MB/s 
[K     |████████████████████████████████| 346 kB 57.2 MB/s 
[K     |████████████████████████████████| 212 kB 72.7 MB/s 
[K     |████████████████████████████████| 86 kB 5.9 MB/s 
[K     |████████████████████████████████| 140 kB 74.1 MB/s 
[K     |████████████████████████████████| 1.1 MB 57.2 MB/s 
[K     |████████████████████████████████| 86 kB 6.6 MB/s 
[K     |████████████████████████████████| 596 kB 63.4 MB/s 
[K     |████████████████████████████████| 127 kB 66.7 MB/s 
[K     |████████████████████████████████| 94 kB 4.1 MB/s 
[K     |████████████████████████████████| 144 kB 70.5 MB/s 
[K     |████████████████████████████████| 271 kB 72.5 MB/s 
[K     |████████████████████████████████| 6.6 MB 48.5 MB/s 
[K     |████████████████████████████████| 1.2 MB 54.6 MB/s 
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the follo

### In this notebook we experiment with fine-tuning a deberta-v3-small model on the competition data after adding the anchor and target words that are missing from the DeBERTa tokenizer vocabulary.

In [None]:
import numpy as np
import pandas as pd
import torch
import os
from sklearn.model_selection import StratifiedKFold
from datasets import Dataset
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import torch.multiprocessing as mp
import warnings

warnings.filterwarnings('ignore')


In [None]:
class TrainingArgs:
    """Optimiser, schedule and checkpoint settings forwarded to ``TrainingArguments``."""
    # --- optimisation ---
    learning_rate = 2e-5
    weight_decay = 0.01
    gradient_accumulation_steps = 1
    fp16 = True
    # --- learning-rate schedule ---
    lr_scheduler_type = "cosine"
    warmup_ratio = 0.1
    # --- checkpointing ---
    # Number of checkpoints to save for each model
    save_total_limit = 1

class Config:
    """Notebook-wide settings: locations, data split, model and runtime options."""
    # --- runtime and locations ---
    # RUNTIME selects where the out-of-fold predictions csv is written
    RUNTIME = "COLAB"
    DATA_PATH = "/content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/data/"
    # location where trained model weights are saved
    OUT_DIR = "/content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/deberta-small"
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    NUM_WORKERS = mp.cpu_count()

    # --- reproducibility ---
    # RANDOM_STATE seeds the k-fold split, RANDOM_SEED seeds the optional subset sampling
    RANDOM_STATE = 42
    RANDOM_SEED = 42

    # --- data selection and cross-validation ---
    TRAIN_ON_SUBSET = False
    SUBSET_ROWS_FRAC = 0.1
    NUM_FOLDS = 5
    RUN_ALL_FOLDS = True

    # --- model and training ---
    TRANSFORMER_CHECKPOINT = "microsoft/deberta-v3-small"
    NUM_LABELS = 1
    BATCH_SIZE = 128
    NUM_EPOCHS = 4

# Set TOKENIZERS_PARALLELISM to "false" for this process.
# NOTE(review): presumably to silence the tokenizers fork/parallelism warning -- confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
# Load the train, test and titles csv files from Config.DATA_PATH.
df_train = pd.read_csv(Config.DATA_PATH + "train.csv")
df_test = pd.read_csv(Config.DATA_PATH + "test.csv")
df_titles = pd.read_csv(Config.DATA_PATH + "titles.csv")

In [None]:
# Number of rows in the training set.
len(df_train)

36473

In [None]:
# How often each distinct target phrase occurs.
df_train.target.value_counts()

composition                    24
data                           22
metal                          22
motor                          22
assembly                       21
                               ..
switching switch over valve     1
switching switch off valve      1
switching over valve            1
switching off valve             1
wooden substrate                1
Name: target, Length: 29340, dtype: int64

In [None]:
# How often each distinct anchor phrase occurs.
df_train.anchor.value_counts()

component composite coating              152
sheet supply roller                      150
source voltage                           140
perfluoroalkyl group                     136
el display                               135
                                        ... 
plug nozzle                                2
shannon                                    2
dry coating composition1                   2
peripheral nervous system stimulation      1
conduct conducting material                1
Name: anchor, Length: 733, dtype: int64

In [None]:
# How often each context code occurs.
df_train.context.value_counts()

H01    2186
H04    2177
G01    1812
A61    1477
F16    1091
       ... 
B03      47
F17      33
B31      24
A62      23
F26      18
Name: context, Length: 106, dtype: int64

In [None]:
# The section is the first character of the context code (e.g. "H01" -> "H").
df_train["section"] = df_train.context.str[0]

In [None]:
# Number of training rows per section.
df_train.section.value_counts()

B    8019
H    6195
G    6013
C    5288
A    4094
F    4054
E    1531
D    1279
Name: section, dtype: int64

In [None]:
# Distribution of the target score values.
df_train.score.value_counts()

0.50    12300
0.25    11519
0.00     7471
0.75     4029
1.00     1154
Name: score, dtype: int64

In [None]:
def strat_kfold_dataframe(df, target_col_name, num_folds=5):
    """Shuffle ``df`` and assign a stratified fold id to every row.

    Args:
        df: DataFrame holding the samples to split. It is not modified;
            a shuffled copy is returned instead.
        target_col_name: Name of the column whose values the folds are
            stratified on.
        num_folds: Number of folds to create.

    Returns:
        A shuffled copy of ``df`` with a fresh 0..n-1 index and a ``kfold``
        column holding, for each row, the index of the fold in which that row
        is part of the validation split.
    """
    # randomize (shuffle) the rows before the split; sample() returns a new
    # frame, so the result must be assigned for the shuffle to take effect
    df = df.sample(frac=1, random_state=Config.RANDOM_STATE).reset_index(drop=True)
    # create the kfold column filled with -1; every row is overwritten below
    df["kfold"] = -1
    y = df[target_col_name].values
    skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=Config.RANDOM_STATE)
    kfold_col = df.columns.get_loc("kfold")
    # stratification is done on the basis of y labels, a placeholder for X is sufficient
    for fold, (_, val_idx) in enumerate(skf.split(X=df, y=y)):
        # split() yields positional indices, so write through iloc rather than loc
        df.iloc[val_idx, kfold_col] = fold
    return df

In [None]:
# Optionally train on a random fraction of the rows (seeded with Config.RANDOM_SEED).
if Config.TRAIN_ON_SUBSET:
    print(f"Selecting {Config.SUBSET_ROWS_FRAC * 100}% training data")
    df_train = df_train.sample(frac=Config.SUBSET_ROWS_FRAC, random_state=Config.RANDOM_SEED).reset_index(drop=True)

# Since the target column (score) is continuous, we need to create bins out of the target column
# (5 equal-width bins labelled 0..4)
df_train.loc[:, "bins"] = pd.cut(df_train.score, bins=5, labels=[0,1,2,3,4])
# Now do a stratified k fold on the bins column (which is a categorical column)
df_train = strat_kfold_dataframe(df_train, target_col_name="bins", num_folds=Config.NUM_FOLDS)
# drop the bins column; it was only needed for the stratification
df_train = df_train.drop(["bins"], axis=1)

In [None]:
# Sanity check of the stratification: the mean score of the rows
# assigned to each fold should be close to the same value.
fold_score_mean = [
    np.mean(df_train[df_train.kfold == fold].score.values)
    for fold in range(Config.NUM_FOLDS)
]
fold_score_mean

[0.36209732693625774,
 0.36209732693625774,
 0.36209732693625774,
 0.3619755963805868,
 0.3620441458733205]

In [None]:
# Peek at the titles table (code -> title lookup).
df_titles.head()

Unnamed: 0,code,title,section,class,subclass,group,main_group
0,A,HUMAN NECESSITIES,A,,,,
1,A01,AGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTI...,A,1.0,,,
2,A01B,SOIL WORKING IN AGRICULTURE OR FORESTRY; PARTS...,A,1.0,B,,
3,A01B1/00,Hand tools (edge trimmers for lawns A01G3/06 ...,A,1.0,B,1.0,0.0
4,A01B1/02,Spades; Shovels {(hand-operated dredgers E02F3...,A,1.0,B,1.0,2.0


In [None]:
# Attach the title of each context code by joining context against df_titles.code.
# An inner join keeps only rows whose context has a matching code in df_titles.
df_train = pd.merge(
    left = df_train,
    right = df_titles[["code", "title"]],
    how = "inner",
    left_on = "context",
    right_on = "code"
)

In [None]:
# Display the merged training frame (now carrying the code and title columns).
df_train

Unnamed: 0,id,anchor,target,context,score,section,kfold,code,title
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.50,A,1,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,A,3,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,A,2,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.50,A,3,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.00,A,0,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...
...,...,...,...,...,...,...,...,...,...
36468,718f1c6953e3942f,undulation,undulatory swimmers,B31,0.00,B,0,B31,"MAKING ARTICLES OF PAPER, CARDBOARD OR MATERIA..."
36469,4dc407e6d0aa7844,undulation,voltage fluctuate,B31,0.00,B,0,B31,"MAKING ARTICLES OF PAPER, CARDBOARD OR MATERIA..."
36470,de69548ad79caccc,web transfer,transfer from web,B31,0.75,B,3,B31,"MAKING ARTICLES OF PAPER, CARDBOARD OR MATERIA..."
36471,6620317413e6e03f,web transfer,transfer to web,B31,0.25,B,2,B31,"MAKING ARTICLES OF PAPER, CARDBOARD OR MATERIA..."


In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding

# Load the tokenizer that belongs to the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(Config.TRANSFORMER_CHECKPOINT)
# DataCollatorWithPadding pads each batch to the longest sequence length
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Sequence-classification head with Config.NUM_LABELS outputs.
# The training loop further down instantiates a fresh model for every fold.
model = AutoModelForSequenceClassification.from_pretrained(Config.TRANSFORMER_CHECKPOINT, num_labels=Config.NUM_LABELS)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at microsoft/deberta-v3-small were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a B

In [None]:
# Token -> id mapping of the pretrained tokenizer, and its size.
deberta_vocab = tokenizer.get_vocab()
len(deberta_vocab)

128001

In [None]:
# Split every anchor and target phrase into its whitespace-separated words.
df_train["anchor_words"] = df_train["anchor"].apply(lambda anchor: anchor.split())
df_train["target_words"] = df_train["target"].apply(lambda target: target.split())

In [None]:
# Flatten the per-row word lists and de-duplicate them, then take the union
# of the anchor and target vocabularies.
unique_anchor_words = set(np.concatenate(df_train["anchor_words"].values))
unique_target_words = set(np.concatenate(df_train["target_words"].values))
unique_anchor_target_words = unique_anchor_words.union(unique_target_words)

In [None]:
# Number of distinct words across all anchor and target phrases.
len(unique_anchor_target_words)

8930

In [None]:
# Collect the words that the pretrained vocab contains neither as a
# word-initial piece ("▁" + word) nor as a bare piece (word).
# Iterate in sorted order: the iteration order of a set of strings can differ
# between interpreter runs, and the position of a word in this list decides
# which id (embedding row) it receives when the list is added to the tokenizer.
tokens_toadd = [
    word
    for word in sorted(unique_anchor_target_words)
    if ("▁" + word) not in deberta_vocab and word not in deberta_vocab
]

In [None]:
# Number of words that are missing from the pretrained vocab.
len(tokens_toadd)

1566

In [None]:
# Inspect the words that will be added to the tokenizer.
print(tokens_toadd)

['agvhd', 'sawtooth', 'methoxyethanamine', 'circumscribe', 'collators', 'crt', 'wprkout', 'distillates', 'ochchnh', 'propulsor', 'tensioning', 'luciae', 'clarifiers', 'caprolactone', 'alkenes', 'm1', 'pawl', 'polarize', 'systemstimulation', 'corks', 'nonconjugative', 'sinoatrial', 'coiling', 'amphiphilic', 'tetrafluoropropene', 'cladded', 'puf', 'lecticans', 'liquors', 'pendente', 'lense', 'pll', 'diuresis', 'expellant', 'diethylenimide', 'electropositive', 'crosslink', 'propanoic', 'datalink', 'truing', 'endohydrolysis', 'ferritic', 'h23', 'paperboard', 'gnd', 'phospholipids', 'electroforming', 'photostructurable', 'her2', 'earthing', 'measurment', 'waterjet', 'monostable', 'nessler', 'enqueue', 'meshing', '3as4s6ar', 'juglandaceae', 'sina', 'dimethicone', 'interframe', 'anticlockwise', 'detarget', 'offsprings', 'hydroxycarboxylicacid', 'spreaders', 'aligner', 'fcf24chch2', 'avidin', 'linkers', 'couplers', 'undepleted', 'unsubstituted', 'phytic', 'welldone', 'swatooth', 'guanidine', '

In [None]:
# Register the missing words as new tokens; the displayed value is the number of tokens added.
tokenizer.add_tokens(tokens_toadd)

1566

In [None]:
# Resize the model's token-embedding matrix to the enlarged tokenizer vocab.
model.resize_token_embeddings(len(tokenizer.get_vocab()))

Embedding(129567, 768)

In [None]:
# Sanity check: encode a single word and show the resulting tokens
# (a word that was added to the vocab should come back as one token).
encoding = tokenizer.encode_plus("hexanoyl")
tokenizer.convert_ids_to_tokens(encoding["input_ids"])

['[CLS]', 'hexanoyl', '[SEP]']

In [None]:
# Separator token string of the tokenizer, used to join the input fields.
sep = tokenizer.sep_token

In [None]:
# Model input text: anchor, target and context title joined by the separator token.
df_train["inputs"] = df_train.anchor + sep + df_train.target + sep + df_train.title 
df_train.head()

Unnamed: 0,id,anchor,target,context,score,section,kfold,code,title,anchor_words,target_words,inputs
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,A,1,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,[abatement],"[abatement, of, pollution]",abatement[SEP]abatement of pollution[SEP]FURNI...
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,A,3,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,[abatement],"[act, of, abating]",abatement[SEP]act of abating[SEP]FURNITURE; DO...
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,A,2,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,[abatement],"[active, catalyst]",abatement[SEP]active catalyst[SEP]FURNITURE; D...
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,A,3,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,[abatement],"[eliminating, process]",abatement[SEP]eliminating process[SEP]FURNITUR...
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,A,0,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,[abatement],"[forest, region]",abatement[SEP]forest region[SEP]FURNITURE; DOM...


In [None]:
def tokenize_text(tokenizer, with_labels, row):
    """Tokenize the ``inputs`` field of ``row`` and optionally attach its score.

    Args:
        tokenizer: Callable invoked with the keyword arguments ``text``,
            ``padding`` and ``truncation``.
        with_labels: When truthy, ``row["score"]`` is stored under ``"labels"``.
        row: Mapping providing ``"inputs"`` (and ``"score"`` when labels are
            requested).

    Returns:
        The object returned by ``tokenizer``, with a ``"labels"`` entry added
        when ``with_labels`` is truthy.
    """
    # No padding at this stage; truncation is requested from the tokenizer.
    tokenizer_kwargs = {
        "text": row["inputs"],
        "padding": False,
        "truncation": True,
    }
    encoded = tokenizer(**tokenizer_kwargs)
    if not with_labels:
        return encoded
    encoded["labels"] = row["score"]
    return encoded

In [None]:
# test_enc = tokenize_text(tokenizer, True, df_train.loc[0, :])
# test_enc

In [None]:
# ds_train_raw = Dataset.from_pandas(df_train)
# ds_train_raw

In [None]:
from functools import partial

# Bind the tokenizer and the with_labels flag so that the resulting callables
# take only the row/batch argument (they are passed to Dataset.map below).
preprocess_train_data = partial(tokenize_text, tokenizer, True)  
preprocess_test_data = partial(tokenize_text, tokenizer, False)  

In [None]:
def get_fold_dls(fold, df):
    """Build the train/validation frames and tokenized datasets for one fold.

    Args:
        fold: Fold id; rows whose ``kfold`` equals it form the validation split,
            all other rows form the training split.
        df: DataFrame with a ``kfold`` column plus the columns read by
            ``preprocess_train_data``.

    Returns:
        Tuple ``(train_df, valid_df, ds_train, ds_valid)``: the two re-indexed
        frames followed by their tokenized ``Dataset`` counterparts.
    """
    in_valid_fold = df.kfold == fold
    train_df = df[~in_valid_fold].reset_index(drop=True)
    valid_df = df[in_valid_fold].reset_index(drop=True)

    ds_train_raw = Dataset.from_pandas(train_df)
    ds_valid_raw = Dataset.from_pandas(valid_df)

    # Tokenize in batches of 1000 and drop all raw columns, so that only the
    # fields produced by the preprocessing function remain.
    map_kwargs = dict(
        batched=True,
        batch_size=1000,
        remove_columns=ds_train_raw.column_names,
    )
    ds_train = ds_train_raw.map(preprocess_train_data, **map_kwargs)
    ds_valid = ds_valid_raw.map(preprocess_train_data, **map_kwargs)
    return train_df, valid_df, ds_train, ds_valid

In [None]:
def compute_metrics(eval_pred):
    """Compute the Pearson correlation between predictions and labels.

    Args:
        eval_pred: Pair ``(predictions, labels)``; ``predictions`` is reshaped
            to a 1-D array with one value per entry before the comparison.

    Returns:
        Dict with the single key ``'pearson'`` holding the correlation coefficient.
    """
    raw_preds, labels = eval_pred
    flat_preds = raw_preds.reshape(len(raw_preds))
    # Off-diagonal entry of the 2x2 correlation matrix
    pearson = np.corrcoef(flat_preds, labels)[0, 1]
    return {'pearson': pearson}

In [None]:
def get_oof_preds(trainer, ds_val, df_val_fold):
    """Store the trainer's predictions for a validation fold in its frame.

    Args:
        trainer: Object whose ``predict(ds_val)`` result exposes ``predictions``.
        ds_val: Validation dataset handed to ``trainer.predict``.
        df_val_fold: Validation frame; receives a ``val_preds`` column in place.

    Returns:
        ``df_val_fold`` itself, with the flattened predictions in ``val_preds``.
    """
    df_val_fold["val_preds"] = trainer.predict(ds_val).predictions.reshape(-1)
    return df_val_fold

In [None]:
# Out-of-fold (OOF) predictions of all folds trained so far
df_val_preds = pd.DataFrame()
tok_vocab = tokenizer.get_vocab()

# The OOF csv location depends only on the runtime, so resolve it once up front
if Config.RUNTIME == "KAGGLE":
    oof_csv_path = "/kaggle/working/df_train_oof_preds.csv"
else:
    oof_csv_path = Config.DATA_PATH + "df_train_oof_preds.csv"

for fold in range(Config.NUM_FOLDS):
    fold_str = f"fold{fold}"
    print(f"Running training for {fold_str}")
    df_train_fold, df_val_fold, ds_train, ds_val = get_fold_dls(fold, df_train)

    # Evaluate and checkpoint once per epoch; each fold gets its own output directory
    hf_training_kwargs = dict(
        output_dir=Config.OUT_DIR + fold_str,
        evaluation_strategy="epoch",
        save_strategy='epoch',
        num_train_epochs=Config.NUM_EPOCHS,
        per_device_train_batch_size=Config.BATCH_SIZE,
        per_device_eval_batch_size=Config.BATCH_SIZE,
        warmup_ratio=TrainingArgs.warmup_ratio,
        weight_decay=TrainingArgs.weight_decay,
        learning_rate=TrainingArgs.learning_rate,
        gradient_accumulation_steps=TrainingArgs.gradient_accumulation_steps,
        fp16=TrainingArgs.fp16,
        lr_scheduler_type=TrainingArgs.lr_scheduler_type,
        save_total_limit=TrainingArgs.save_total_limit,
    )
    training_args = TrainingArguments(**hf_training_kwargs)

    # Start every fold from the pretrained checkpoint and make room in the
    # embedding matrix for the tokens that were added to the tokenizer
    model = AutoModelForSequenceClassification.from_pretrained(
        Config.TRANSFORMER_CHECKPOINT,
        num_labels=Config.NUM_LABELS,
    )
    print(f"len(tokenizer_vocab) = {len(tok_vocab)}")
    model.resize_token_embeddings(len(tok_vocab))

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=ds_train,
        eval_dataset=ds_val,
        compute_metrics=compute_metrics,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )
    trainer.train()

    # Predict on the held-out fold and append the result to the running OOF frame
    df_val_fold = get_oof_preds(trainer, ds_val, df_val_fold)
    display(df_val_fold.head())
    df_val_preds = pd.concat([df_val_preds, df_val_fold], axis=0)

    # export the oof predictions to csv for later use in stacking
    # (rewritten after every fold with everything collected so far)
    df_val_preds.to_csv(oof_csv_path)
    print(f"Saved OOF predictions for fold {fold}")

    del model, trainer
    if not Config.RUN_ALL_FOLDS:
        break


Running training for fold0


  0%|          | 0/30 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


  0%|          | 0/8 [00:00<?, ?ba/s]

Some weights of the model checkpoint at microsoft/deberta-v3-small were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from 

len(tokenizer_vocab) = 129567


Using amp half precision backend
***** Running training *****
  Num examples = 29178
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 912


Epoch,Training Loss,Validation Loss,Pearson
1,No log,0.035112,0.738231
2,No log,0.028157,0.773033
3,0.049800,0.027114,0.784417
4,0.049800,0.027835,0.786277


***** Running Evaluation *****
  Num examples = 7295
  Batch size = 128
Saving model checkpoint to /content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/deberta-smallfold0/checkpoint-228
Configuration saved in /content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/deberta-smallfold0/checkpoint-228/config.json
Model weights saved in /content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/deberta-smallfold0/checkpoint-228/pytorch_model.bin
tokenizer config file saved in /content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/deberta-smallfold0/checkpoint-228/tokenizer_config.json
Special tokens file saved in /content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/deberta-smallfold0/checkpoint-228/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 7295
  Batch size = 128
Saving model checkpoint to /content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/deberta-smallfold0/checkpoint-456
Configuration saved in /content/gdrive/MyDri

Unnamed: 0,id,anchor,target,context,score,section,kfold,code,title,anchor_words,target_words,inputs,val_preds
0,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,A,0,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,[abatement],"[forest, region]",abatement[SEP]forest region[SEP]FURNITURE; DOM...,0.138306
1,ef2d4c2e6bbb208d,abatement,mixing core materials,A47,0.25,A,0,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,[abatement],"[mixing, core, materials]",abatement[SEP]mixing core materials[SEP]FURNIT...,0.198853
2,cc96541d4987b399,abatement,rent abatement,A47,0.0,A,0,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,[abatement],"[rent, abatement]",abatement[SEP]rent abatement[SEP]FURNITURE; DO...,0.200317
3,a8c9e9f37d4d836a,abatement,tax abatement,A47,0.0,A,0,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,[abatement],"[tax, abatement]",abatement[SEP]tax abatement[SEP]FURNITURE; DOM...,0.038208
4,604210b7c7ce2f6a,adhesive mounting,adhesive,A47,0.5,A,0,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,"[adhesive, mounting]",[adhesive],adhesive mounting[SEP]adhesive[SEP]FURNITURE; ...,0.522461


Saved OOF predictions for fold 0
Running training for fold1


  0%|          | 0/30 [00:00<?, ?ba/s]



  0%|          | 0/8 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/microsoft/deberta-v3-small/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/8e0c12a7672d1d36f647c86e5fc3a911f189d8704e2bc94dde4a1ffe38f648fa.9df96bac06c2c492bc77ad040068f903c93beec14607428f25bf9081644ad0da
Model config DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-small",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": 

len(tokenizer_vocab) = 129567


KeyboardInterrupt: ignored

In [None]:
# Calculate the CV score: Pearson correlation between the out-of-fold
# predictions and the true scores. df_val_preds only contains the folds whose
# training finished, so the metric covers exactly those rows.
predictions = df_val_preds['val_preds'].values
labels = df_val_preds['score'].values
eval_preds = predictions, labels
cv_metric_dict = compute_metrics(eval_preds)
print(f"CV score = {cv_metric_dict}")

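Fine-tuning the deberta-v3-small model on the competition data with additional patent-specific vocabulary (words extracted from the anchor and target phrases) gives a much lower CV score of 0.7862, compared to a CV score of 0.8219 without the additional vocabulary. Note that the 0.7862 figure comes from fold 0 only, because the training run above was interrupted during fold 1.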