In [1]:
import numpy as np
import pandas as pd
import torch
import os
from sklearn.model_selection import StratifiedKFold
from datasets import Dataset
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer, DataCollatorWithPadding
from functools import partial
import torch.multiprocessing as mp
import warnings

warnings.filterwarnings('ignore')


In [2]:
class Models:
    DEBERTA_V2_XLARGE = "deberta-v2-xlarge"

class TrainingArgs:
    weight_decay = 0.01
    learning_rate = 2e-5  
    warmup_ratio = 0.1
    gradient_accumulation_steps = 8
    fp16 = True
    lr_scheduler_type = "cosine"

class Config:
    DATA_PATH = "/kaggle/input/us-patent-phrase-to-phrase-matching/"
    RANDOM_STATE = 42
    BATCH_SIZE = 16
    NUM_LABELS = 1
    NUM_FOLDS = 5
    NUM_EPOCHS = 4
    NUM_WORKERS = mp.cpu_count()
    TRANSFORMER_CHECKPOINT = "microsoft/deberta-v2-xlarge"
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')    

model_paths = {
    "deberta-v2-xlarge": f"/kaggle/input/anu-uspppm-deberta-v2-xlarge2/deberta-v2-xlarge/"
}

os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
df_train = pd.read_csv(Config.DATA_PATH + "train.csv")
df_test = pd.read_csv(Config.DATA_PATH + "test.csv")
df_titles = pd.read_csv("/kaggle/input/cpc-codes/titles.csv")

In [4]:
df_test = pd.merge(
    left = df_test,
    right = df_titles[["code", "title"]],
    how = "inner",
    left_on = "context",
    right_on = "code"
)

In [5]:
tokenizer_path = "/kaggle/input/anu-uspppm-deberta-v2-xlarge2/deberta-v2-xlarge/fold0/"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, local_files_only=True)
# DataCollatorWithPadding pads each batch to the longest sequence length
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [6]:
df_train["section"] = df_train.context.str[0]
df_test["section"] = df_test.context.str[0]
df_train['sectok'] = '[' + df_train.section + ']'
df_test['sectok'] = '[' + df_test.section + ']'
train_sectoks = set(df_train.sectok.unique())
test_sectoks = set(df_test.sectok.unique())
sep = '[s]'
addn_special_tokens = list(train_sectoks.union(test_sectoks))
addn_special_tokens.append(sep)
print(f"addn_special_tokens = {addn_special_tokens}")
tokenizer.add_special_tokens({'additional_special_tokens': addn_special_tokens})

addn_special_tokens = ['[E]', '[A]', '[F]', '[B]', '[H]', '[D]', '[C]', '[G]', '[s]']


0

In [7]:
df_test["inputs"] = df_test.sectok + sep + df_test.anchor + sep + df_test.target + sep + df_test.title 
df_test.head()

Unnamed: 0,id,anchor,target,context,code,title,section,sectok,inputs
0,4112d61851461f60,opc drum,inorganic photoconductor drum,G02,G02,OPTICS,G,[G],[G][s]opc drum[s]inorganic photoconductor drum...
1,5203a36c501f1b7c,generate in layer,generate by layer,G02,G02,OPTICS,G,[G],[G][s]generate in layer[s]generate by layer[s]...
2,7aa5908a77a7ec24,el display,illumination,G02,G02,OPTICS,G,[G],[G][s]el display[s]illumination[s]OPTICS
3,09e418c93a776564,adjust gas flow,altering gas flow,F23,F23,COMBUSTION APPARATUS; COMBUSTION PROCESSES,F,[F],[F][s]adjust gas flow[s]altering gas flow[s]CO...
4,36baf228038e314b,lower trunnion,lower locating,B60,B60,VEHICLES IN GENERAL,B,[B],[B][s]lower trunnion[s]lower locating[s]VEHICL...


In [8]:
def tokenize_text(tokenizer, with_labels, row):
    encoding = tokenizer(
        text = row["inputs"],
        padding = False,
        truncation = True
    )
    if with_labels:
        encoding["labels"] = row["score"]
    return encoding

In [9]:
preprocess_test_data = partial(tokenize_text, tokenizer, False)  
ds_test_raw = Dataset.from_pandas(df_test)
raw_ds_col_names = ds_test_raw.column_names  
ds_test = ds_test_raw.map(preprocess_test_data, batched=True, batch_size=1000, remove_columns=raw_ds_col_names)  

  0%|          | 0/1 [00:00<?, ?ba/s]

In [10]:
def get_fold_model(model_path, tokenizer):
    model = AutoModelForSequenceClassification.from_pretrained(model_path, local_files_only=True)
    tok_vocab = tokenizer.get_vocab()
    print(f"len(tokenizer_vocab) = {len(tok_vocab)}")
    model.resize_token_embeddings(len(tok_vocab))    
    model.to(Config.DEVICE)
    return model

In [11]:
import gc
from transformers import logging

logging.set_verbosity_error()
logging.set_verbosity_warning()

model_fold_preds = {}
for model_name in model_paths.keys():
    fold_preds = []
    for fold in range(Config.NUM_FOLDS):
        model_path = model_paths[model_name] + f"fold{fold}/"
        print(f"model_path = {model_path}")
        model = get_fold_model(model_path, tokenizer)
        trainer_args = TrainingArguments(
            output_dir="/kaggle/working/model/",
            per_device_eval_batch_size=Config.BATCH_SIZE
        )
        trainer = Trainer(model=model, tokenizer=tokenizer, args=trainer_args)
        outputs = trainer.predict(ds_test)
        fold_preds.append(outputs.predictions.reshape(-1))
        print(f"Completed predictions for fold {fold}")
        del model, trainer
        gc.collect()
        torch.cuda.empty_cache()
    model_fold_preds[model_name] = fold_preds

model_path = /kaggle/input/anu-uspppm-deberta-v2-xlarge2/deberta-v2-xlarge/fold0/
len(tokenizer_vocab) = 128010


***** Running Prediction *****
  Num examples = 36
  Batch size = 16


loading configuration file /kaggle/input/anu-uspppm-deberta-v2-xlarge2/deberta-v2-xlarge/fold1/config.json
Model config DebertaV2Config {
  "_name_or_path": "/kaggle/input/anu-uspppm-deberta-v2-xlarge2/deberta-v2-xlarge/fold1/",
  "architectures": [
    "DebertaV2ForSequenceClassification"
  ],
  "attention_head_size": 64,
  "attention_probs_dropout_prob": 0.1,
  "conv_act": "gelu",
  "conv_kernel_size": 3,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1536,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 6144,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 24,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1536,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_bi

Completed predictions for fold 0
model_path = /kaggle/input/anu-uspppm-deberta-v2-xlarge2/deberta-v2-xlarge/fold1/


All model checkpoint weights were used when initializing DebertaV2ForSequenceClassification.

All the weights of DebertaV2ForSequenceClassification were initialized from the model checkpoint at /kaggle/input/anu-uspppm-deberta-v2-xlarge2/deberta-v2-xlarge/fold1/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use DebertaV2ForSequenceClassification for predictions without further training.


len(tokenizer_vocab) = 128010


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running Prediction *****
  Num examples = 36
  Batch size = 16


loading configuration file /kaggle/input/anu-uspppm-deberta-v2-xlarge2/deberta-v2-xlarge/fold2/config.json
Model config DebertaV2Config {
  "_name_or_path": "/kaggle/input/anu-uspppm-deberta-v2-xlarge2/deberta-v2-xlarge/fold2/",
  "architectures": [
    "DebertaV2ForSequenceClassification"
  ],
  "attention_head_size": 64,
  "attention_probs_dropout_prob": 0.1,
  "conv_act": "gelu",
  "conv_kernel_size": 3,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1536,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 6144,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 24,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1536,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_bi

Completed predictions for fold 1
model_path = /kaggle/input/anu-uspppm-deberta-v2-xlarge2/deberta-v2-xlarge/fold2/


All model checkpoint weights were used when initializing DebertaV2ForSequenceClassification.

All the weights of DebertaV2ForSequenceClassification were initialized from the model checkpoint at /kaggle/input/anu-uspppm-deberta-v2-xlarge2/deberta-v2-xlarge/fold2/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use DebertaV2ForSequenceClassification for predictions without further training.


len(tokenizer_vocab) = 128010


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running Prediction *****
  Num examples = 36
  Batch size = 16


loading configuration file /kaggle/input/anu-uspppm-deberta-v2-xlarge2/deberta-v2-xlarge/fold3/config.json
Model config DebertaV2Config {
  "_name_or_path": "/kaggle/input/anu-uspppm-deberta-v2-xlarge2/deberta-v2-xlarge/fold3/",
  "architectures": [
    "DebertaV2ForSequenceClassification"
  ],
  "attention_head_size": 64,
  "attention_probs_dropout_prob": 0.1,
  "conv_act": "gelu",
  "conv_kernel_size": 3,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1536,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 6144,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 24,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1536,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_bi

Completed predictions for fold 2
model_path = /kaggle/input/anu-uspppm-deberta-v2-xlarge2/deberta-v2-xlarge/fold3/


All model checkpoint weights were used when initializing DebertaV2ForSequenceClassification.

All the weights of DebertaV2ForSequenceClassification were initialized from the model checkpoint at /kaggle/input/anu-uspppm-deberta-v2-xlarge2/deberta-v2-xlarge/fold3/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use DebertaV2ForSequenceClassification for predictions without further training.


len(tokenizer_vocab) = 128010


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running Prediction *****
  Num examples = 36
  Batch size = 16


Completed predictions for fold 3


loading configuration file /kaggle/input/anu-uspppm-deberta-v2-xlarge2/deberta-v2-xlarge/fold4/config.json
Model config DebertaV2Config {
  "_name_or_path": "/kaggle/input/anu-uspppm-deberta-v2-xlarge2/deberta-v2-xlarge/fold4/",
  "architectures": [
    "DebertaV2ForSequenceClassification"
  ],
  "attention_head_size": 64,
  "attention_probs_dropout_prob": 0.1,
  "conv_act": "gelu",
  "conv_kernel_size": 3,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1536,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 6144,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 24,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1536,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_bi

model_path = /kaggle/input/anu-uspppm-deberta-v2-xlarge2/deberta-v2-xlarge/fold4/


All model checkpoint weights were used when initializing DebertaV2ForSequenceClassification.

All the weights of DebertaV2ForSequenceClassification were initialized from the model checkpoint at /kaggle/input/anu-uspppm-deberta-v2-xlarge2/deberta-v2-xlarge/fold4/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use DebertaV2ForSequenceClassification for predictions without further training.


len(tokenizer_vocab) = 128010


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running Prediction *****
  Num examples = 36
  Batch size = 16


Completed predictions for fold 4


In [12]:
test_preds_arr = np.array(model_fold_preds[Models.DEBERTA_V2_XLARGE])
test_preds_avg = np.mean(test_preds_arr, axis=0)
df_submission = pd.DataFrame({"id": df_test.id, "score": test_preds_avg})
df_submission

Unnamed: 0,id,score
0,4112d61851461f60,0.602895
1,5203a36c501f1b7c,0.835709
2,7aa5908a77a7ec24,0.355864
3,09e418c93a776564,0.790647
4,36baf228038e314b,0.47334
5,b892011ab2e2cabc,0.702383
6,1f37ead645e7f0c8,0.350201
7,71a5b6ad068d531f,0.023082
8,16ae4b99d3601e60,0.282291
9,474c874d0c07bd21,0.537377


In [13]:
df_submission.to_csv("/kaggle/working/submission.csv", index=False)