In [1]:
import numpy as np
import pandas as pd
import torch
import os
from sklearn.model_selection import StratifiedKFold
from datasets import Dataset
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer, DataCollatorWithPadding
from functools import partial
import torch.multiprocessing as mp
import warnings
from transformers import logging

logging.set_verbosity_error()
logging.set_verbosity_warning()
warnings.filterwarnings('ignore')


In [2]:
class Models:
    DEBERTA_V3_LARGE = "deberta-v3-large"
    BERT_FOR_PATENTS = "bert-for-patents"
    DEBERTA_V2_XLARGE = "deberta-v2-xlarge"

class TrainingArgs:
    weight_decay = 0.01
    learning_rate = 2e-5  
    warmup_ratio = 0.1
    gradient_accumulation_steps = 8
    fp16 = True
    lr_scheduler_type = "cosine"

class Config:
    DATA_PATH = "/kaggle/input/us-patent-phrase-to-phrase-matching/"
    RANDOM_STATE = 42
    BATCH_SIZE = 64
    NUM_LABELS = 1
    NUM_FOLDS = 5
    NUM_EPOCHS = 4
    NUM_WORKERS = mp.cpu_count()
    TRANSFORMER_CHECKPOINT = "microsoft/deberta-v3-large"
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')    

In [3]:
model_paths = {
    Models.DEBERTA_V3_LARGE: f"/kaggle/input/anu-dbv3l/deberta-v3-large/",
    Models.BERT_FOR_PATENTS: f"/kaggle/input/anu-bfp/bert-for-patents/",
    Models.DEBERTA_V2_XLARGE: f"/kaggle/input/anu-uspppm-deberta-v2-xlarge2/deberta-v2-xlarge/"
}

tokenizer_paths = {
    Models.DEBERTA_V3_LARGE: f"/kaggle/input/anu-dbv3l/deberta-v3-large/fold0/checkpoint-2280/",
    Models.BERT_FOR_PATENTS: f"/kaggle/input/anu-bfp/bert-for-patents/fold0/checkpoint-2280/",
    Models.DEBERTA_V2_XLARGE: f"/kaggle/input/anu-uspppm-deberta-v2-xlarge2/deberta-v2-xlarge/fold0/"
}

oof_preds_paths = {
    Models.DEBERTA_V3_LARGE: "/kaggle/input/uspppmmodeloofpreds/uspppm-models-oof-preds2/df_train_oof_preds_deberta-v3-large.csv",
    Models.BERT_FOR_PATENTS: "/kaggle/input/uspppmmodeloofpreds/uspppm-models-oof-preds2/df_bfp_train_oof_preds.csv",
    Models.DEBERTA_V2_XLARGE: "/kaggle/input/uspppmmodeloofpreds/uspppm-models-oof-preds2/df_train_oof_preds_deberta-v2-xlarge.csv"
}

# columns to select from model OOF predictions dataframe
oof_cols_to_use = [
    "id",
    "anchor",
    "target",
    "context",
    "section",
    "kfold",
    "score",
    "val_preds"
]

os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [4]:
df_train = pd.read_csv(Config.DATA_PATH + "train.csv")
df_test = pd.read_csv(Config.DATA_PATH + "test.csv")
df_titles = pd.read_csv("/kaggle/input/cpc-codes/titles.csv")

In [5]:
df_test = pd.merge(
    left = df_test,
    right = df_titles[["code", "title"]],
    how = "inner",
    left_on = "context",
    right_on = "code"
)

In [6]:
df_train["section"] = df_train.context.str[0]
df_test["section"] = df_test.context.str[0]
df_train['sectok'] = '[' + df_train.section + ']'
df_test['sectok'] = '[' + df_test.section + ']'
train_sectoks = set(df_train.sectok.unique())
test_sectoks = set(df_test.sectok.unique())
sep = '[s]'
addn_special_tokens = list(train_sectoks.union(test_sectoks))
addn_special_tokens.append(sep)
print(f"addn_special_tokens = {addn_special_tokens}")

addn_special_tokens = ['[F]', '[B]', '[E]', '[G]', '[D]', '[C]', '[H]', '[A]', '[s]']


In [7]:
def tokenize_text(tokenizer, with_labels, row):
    encoding = tokenizer(
        text = row["inputs"],
        padding = False,
        truncation = True
    )
    if with_labels:
        encoding["labels"] = row["score"]
    return encoding

In [8]:
def get_test_dataset_for_model(model_name, df):
    tokenizer_path = tokenizer_paths[model_name]
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, local_files_only=True)
    # add section special token to the tokenizer
    tokenizer.add_special_tokens({'additional_special_tokens': addn_special_tokens})
    # DataCollatorWithPadding pads each batch to the longest sequence length
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)        
    # create input column that we will tokenize
    df["inputs"] = df.sectok + sep + df.anchor + sep + df.target + sep + df.title 
    preprocess_test_data = partial(tokenize_text, tokenizer, False)  
    ds_test_raw = Dataset.from_pandas(df)
    raw_ds_col_names = ds_test_raw.column_names  
    ds_test = ds_test_raw.map(preprocess_test_data, batched=True, batch_size=1000, remove_columns=raw_ds_col_names)
    return ds_test, tokenizer

In [9]:
def get_fold_model(model_path, tokenizer):
    model = AutoModelForSequenceClassification.from_pretrained(model_path, local_files_only=True)
    tok_vocab = tokenizer.get_vocab()
    print(f"len(tokenizer_vocab) = {len(tok_vocab)}")
    model.resize_token_embeddings(len(tok_vocab))    
    model.to(Config.DEVICE)
    return model

In [10]:
import gc

model_fold_preds = {}
for model_name in model_paths.keys():    
    fold_preds = []
    ds_test, tokenizer = get_test_dataset_for_model(model_name, df_test)    
    for fold in range(Config.NUM_FOLDS):
        print(f"Running test predictions using {model_name} fold {fold}")
        model_path = model_paths[model_name] + f"fold{fold}/"
        if model_name in [Models.DEBERTA_V3_LARGE, Models.BERT_FOR_PATENTS]:
            model_path = model_path + "checkpoint-2280/"
        print(f"model_path = {model_path}")
        model = get_fold_model(model_path, tokenizer)
        trainer_args = TrainingArguments(
            output_dir="/kaggle/working/model/",
            per_device_eval_batch_size=Config.BATCH_SIZE
        )
        trainer = Trainer(model=model, tokenizer=tokenizer, args=trainer_args)
        outputs = trainer.predict(ds_test)
        fold_preds.append(outputs.predictions.reshape(-1))
        print(f"Completed predictions for fold {fold}")
        del model, trainer
        gc.collect()
        torch.cuda.empty_cache()
    model_fold_preds[model_name] = fold_preds    

  0%|          | 0/1 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Running test predictions using deberta-v3-large fold 0
model_path = /kaggle/input/anu-dbv3l/deberta-v3-large/fold0/checkpoint-2280/
len(tokenizer_vocab) = 128010


***** Running Prediction *****
  Num examples = 36
  Batch size = 64


loading configuration file /kaggle/input/anu-dbv3l/deberta-v3-large/fold1/checkpoint-2280/config.json
Model config DebertaV2Config {
  "_name_or_path": "/kaggle/input/anu-dbv3l/deberta-v3-large/fold1/checkpoint-2280/",
  "architectures": [
    "DebertaV2ForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_

Completed predictions for fold 0
Running test predictions using deberta-v3-large fold 1
model_path = /kaggle/input/anu-dbv3l/deberta-v3-large/fold1/checkpoint-2280/


All model checkpoint weights were used when initializing DebertaV2ForSequenceClassification.

All the weights of DebertaV2ForSequenceClassification were initialized from the model checkpoint at /kaggle/input/anu-dbv3l/deberta-v3-large/fold1/checkpoint-2280/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use DebertaV2ForSequenceClassification for predictions without further training.


len(tokenizer_vocab) = 128010


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running Prediction *****
  Num examples = 36
  Batch size = 64


loading configuration file /kaggle/input/anu-dbv3l/deberta-v3-large/fold2/checkpoint-2280/config.json
Model config DebertaV2Config {
  "_name_or_path": "/kaggle/input/anu-dbv3l/deberta-v3-large/fold2/checkpoint-2280/",
  "architectures": [
    "DebertaV2ForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_

Completed predictions for fold 1
Running test predictions using deberta-v3-large fold 2
model_path = /kaggle/input/anu-dbv3l/deberta-v3-large/fold2/checkpoint-2280/


All model checkpoint weights were used when initializing DebertaV2ForSequenceClassification.

All the weights of DebertaV2ForSequenceClassification were initialized from the model checkpoint at /kaggle/input/anu-dbv3l/deberta-v3-large/fold2/checkpoint-2280/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use DebertaV2ForSequenceClassification for predictions without further training.


len(tokenizer_vocab) = 128010


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running Prediction *****
  Num examples = 36
  Batch size = 64


loading configuration file /kaggle/input/anu-dbv3l/deberta-v3-large/fold3/checkpoint-2280/config.json
Model config DebertaV2Config {
  "_name_or_path": "/kaggle/input/anu-dbv3l/deberta-v3-large/fold3/checkpoint-2280/",
  "architectures": [
    "DebertaV2ForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_

Completed predictions for fold 2
Running test predictions using deberta-v3-large fold 3
model_path = /kaggle/input/anu-dbv3l/deberta-v3-large/fold3/checkpoint-2280/


All model checkpoint weights were used when initializing DebertaV2ForSequenceClassification.

All the weights of DebertaV2ForSequenceClassification were initialized from the model checkpoint at /kaggle/input/anu-dbv3l/deberta-v3-large/fold3/checkpoint-2280/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use DebertaV2ForSequenceClassification for predictions without further training.


len(tokenizer_vocab) = 128010


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running Prediction *****
  Num examples = 36
  Batch size = 64


loading configuration file /kaggle/input/anu-dbv3l/deberta-v3-large/fold4/checkpoint-2280/config.json
Model config DebertaV2Config {
  "_name_or_path": "/kaggle/input/anu-dbv3l/deberta-v3-large/fold4/checkpoint-2280/",
  "architectures": [
    "DebertaV2ForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_

Completed predictions for fold 3
Running test predictions using deberta-v3-large fold 4
model_path = /kaggle/input/anu-dbv3l/deberta-v3-large/fold4/checkpoint-2280/


All model checkpoint weights were used when initializing DebertaV2ForSequenceClassification.

All the weights of DebertaV2ForSequenceClassification were initialized from the model checkpoint at /kaggle/input/anu-dbv3l/deberta-v3-large/fold4/checkpoint-2280/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use DebertaV2ForSequenceClassification for predictions without further training.


len(tokenizer_vocab) = 128010


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running Prediction *****
  Num examples = 36
  Batch size = 64


loading file /kaggle/input/anu-bfp/bert-for-patents/fold0/checkpoint-2280/vocab.txt
loading file /kaggle/input/anu-bfp/bert-for-patents/fold0/checkpoint-2280/tokenizer.json
loading file /kaggle/input/anu-bfp/bert-for-patents/fold0/checkpoint-2280/added_tokens.json
loading file /kaggle/input/anu-bfp/bert-for-patents/fold0/checkpoint-2280/special_tokens_map.json
loading file /kaggle/input/anu-bfp/bert-for-patents/fold0/checkpoint-2280/tokenizer_config.json
Assigning ['[F]', '[B]', '[E]', '[G]', '[D]', '[C]', '[H]', '[A]', '[s]'] to the additional_special_tokens key of the tokenizer


Completed predictions for fold 4


  0%|          | 0/1 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
loading configuration file /kaggle/input/anu-bfp/bert-for-patents/fold0/checkpoint-2280/config.json
Model config BertConfig {
  "_name_or_path": "/kaggle/input/anu-bfp/bert-for-patents/fold0/checkpoint-2280/",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "regression",
  "torch_dtype": "float32",
  "transformers_version": "4.18.0",
  "typ

Running test predictions using bert-for-patents fold 0
model_path = /kaggle/input/anu-bfp/bert-for-patents/fold0/checkpoint-2280/


All model checkpoint weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the model checkpoint at /kaggle/input/anu-bfp/bert-for-patents/fold0/checkpoint-2280/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.


len(tokenizer_vocab) = 39868


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running Prediction *****
  Num examples = 36
  Batch size = 64


loading configuration file /kaggle/input/anu-bfp/bert-for-patents/fold1/checkpoint-2280/config.json
Model config BertConfig {
  "_name_or_path": "/kaggle/input/anu-bfp/bert-for-patents/fold1/checkpoint-2280/",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "regression",
  "torch_dtype": "float32",
  "transformers_version": "4.18.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 39867
}

loading weights file /kaggle/input/anu-bfp/bert-for-patents/fold1/checkpoint-

Completed predictions for fold 0
Running test predictions using bert-for-patents fold 1
model_path = /kaggle/input/anu-bfp/bert-for-patents/fold1/checkpoint-2280/


All model checkpoint weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the model checkpoint at /kaggle/input/anu-bfp/bert-for-patents/fold1/checkpoint-2280/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.


len(tokenizer_vocab) = 39868


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running Prediction *****
  Num examples = 36
  Batch size = 64


Completed predictions for fold 1


loading configuration file /kaggle/input/anu-bfp/bert-for-patents/fold2/checkpoint-2280/config.json
Model config BertConfig {
  "_name_or_path": "/kaggle/input/anu-bfp/bert-for-patents/fold2/checkpoint-2280/",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "regression",
  "torch_dtype": "float32",
  "transformers_version": "4.18.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 39867
}

loading weights file /kaggle/input/anu-bfp/bert-for-patents/fold2/checkpoint-

Running test predictions using bert-for-patents fold 2
model_path = /kaggle/input/anu-bfp/bert-for-patents/fold2/checkpoint-2280/


All model checkpoint weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the model checkpoint at /kaggle/input/anu-bfp/bert-for-patents/fold2/checkpoint-2280/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.


len(tokenizer_vocab) = 39868


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running Prediction *****
  Num examples = 36
  Batch size = 64


Completed predictions for fold 2


loading configuration file /kaggle/input/anu-bfp/bert-for-patents/fold3/checkpoint-2280/config.json
Model config BertConfig {
  "_name_or_path": "/kaggle/input/anu-bfp/bert-for-patents/fold3/checkpoint-2280/",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "regression",
  "torch_dtype": "float32",
  "transformers_version": "4.18.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 39867
}

loading weights file /kaggle/input/anu-bfp/bert-for-patents/fold3/checkpoint-

Running test predictions using bert-for-patents fold 3
model_path = /kaggle/input/anu-bfp/bert-for-patents/fold3/checkpoint-2280/


All model checkpoint weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the model checkpoint at /kaggle/input/anu-bfp/bert-for-patents/fold3/checkpoint-2280/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.


len(tokenizer_vocab) = 39868


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running Prediction *****
  Num examples = 36
  Batch size = 64


Completed predictions for fold 3


loading configuration file /kaggle/input/anu-bfp/bert-for-patents/fold4/checkpoint-2280/config.json
Model config BertConfig {
  "_name_or_path": "/kaggle/input/anu-bfp/bert-for-patents/fold4/checkpoint-2280/",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "regression",
  "torch_dtype": "float32",
  "transformers_version": "4.18.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 39867
}

loading weights file /kaggle/input/anu-bfp/bert-for-patents/fold4/checkpoint-

Running test predictions using bert-for-patents fold 4
model_path = /kaggle/input/anu-bfp/bert-for-patents/fold4/checkpoint-2280/


All model checkpoint weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the model checkpoint at /kaggle/input/anu-bfp/bert-for-patents/fold4/checkpoint-2280/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.


len(tokenizer_vocab) = 39868


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running Prediction *****
  Num examples = 36
  Batch size = 64


loading file /kaggle/input/anu-uspppm-deberta-v2-xlarge2/deberta-v2-xlarge/fold0/spm.model
loading file /kaggle/input/anu-uspppm-deberta-v2-xlarge2/deberta-v2-xlarge/fold0/added_tokens.json
loading file /kaggle/input/anu-uspppm-deberta-v2-xlarge2/deberta-v2-xlarge/fold0/special_tokens_map.json
loading file /kaggle/input/anu-uspppm-deberta-v2-xlarge2/deberta-v2-xlarge/fold0/tokenizer_config.json


Completed predictions for fold 4


Adding [MASK] to the vocabulary
Adding [A] to the vocabulary
Adding [C] to the vocabulary
Adding [F] to the vocabulary
Adding [H] to the vocabulary
Adding [B] to the vocabulary
Adding [D] to the vocabulary
Adding [E] to the vocabulary
Adding [G] to the vocabulary
Adding [s] to the vocabulary
Assigning ['[F]', '[B]', '[E]', '[G]', '[D]', '[C]', '[H]', '[A]', '[s]'] to the additional_special_tokens key of the tokenizer


  0%|          | 0/1 [00:00<?, ?ba/s]

loading configuration file /kaggle/input/anu-uspppm-deberta-v2-xlarge2/deberta-v2-xlarge/fold0/config.json
Model config DebertaV2Config {
  "_name_or_path": "/kaggle/input/anu-uspppm-deberta-v2-xlarge2/deberta-v2-xlarge/fold0/",
  "architectures": [
    "DebertaV2ForSequenceClassification"
  ],
  "attention_head_size": 64,
  "attention_probs_dropout_prob": 0.1,
  "conv_act": "gelu",
  "conv_kernel_size": 3,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1536,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 6144,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 24,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1536,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_bi

Running test predictions using deberta-v2-xlarge fold 0
model_path = /kaggle/input/anu-uspppm-deberta-v2-xlarge2/deberta-v2-xlarge/fold0/


All model checkpoint weights were used when initializing DebertaV2ForSequenceClassification.

All the weights of DebertaV2ForSequenceClassification were initialized from the model checkpoint at /kaggle/input/anu-uspppm-deberta-v2-xlarge2/deberta-v2-xlarge/fold0/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use DebertaV2ForSequenceClassification for predictions without further training.


len(tokenizer_vocab) = 128010


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running Prediction *****
  Num examples = 36
  Batch size = 64


Completed predictions for fold 0


loading configuration file /kaggle/input/anu-uspppm-deberta-v2-xlarge2/deberta-v2-xlarge/fold1/config.json
Model config DebertaV2Config {
  "_name_or_path": "/kaggle/input/anu-uspppm-deberta-v2-xlarge2/deberta-v2-xlarge/fold1/",
  "architectures": [
    "DebertaV2ForSequenceClassification"
  ],
  "attention_head_size": 64,
  "attention_probs_dropout_prob": 0.1,
  "conv_act": "gelu",
  "conv_kernel_size": 3,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1536,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 6144,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 24,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1536,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_bi

Running test predictions using deberta-v2-xlarge fold 1
model_path = /kaggle/input/anu-uspppm-deberta-v2-xlarge2/deberta-v2-xlarge/fold1/


All model checkpoint weights were used when initializing DebertaV2ForSequenceClassification.

All the weights of DebertaV2ForSequenceClassification were initialized from the model checkpoint at /kaggle/input/anu-uspppm-deberta-v2-xlarge2/deberta-v2-xlarge/fold1/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use DebertaV2ForSequenceClassification for predictions without further training.


len(tokenizer_vocab) = 128010


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running Prediction *****
  Num examples = 36
  Batch size = 64


loading configuration file /kaggle/input/anu-uspppm-deberta-v2-xlarge2/deberta-v2-xlarge/fold2/config.json
Model config DebertaV2Config {
  "_name_or_path": "/kaggle/input/anu-uspppm-deberta-v2-xlarge2/deberta-v2-xlarge/fold2/",
  "architectures": [
    "DebertaV2ForSequenceClassification"
  ],
  "attention_head_size": 64,
  "attention_probs_dropout_prob": 0.1,
  "conv_act": "gelu",
  "conv_kernel_size": 3,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1536,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 6144,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 24,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1536,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_bi

Completed predictions for fold 1
Running test predictions using deberta-v2-xlarge fold 2
model_path = /kaggle/input/anu-uspppm-deberta-v2-xlarge2/deberta-v2-xlarge/fold2/


All model checkpoint weights were used when initializing DebertaV2ForSequenceClassification.

All the weights of DebertaV2ForSequenceClassification were initialized from the model checkpoint at /kaggle/input/anu-uspppm-deberta-v2-xlarge2/deberta-v2-xlarge/fold2/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use DebertaV2ForSequenceClassification for predictions without further training.


len(tokenizer_vocab) = 128010


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running Prediction *****
  Num examples = 36
  Batch size = 64


loading configuration file /kaggle/input/anu-uspppm-deberta-v2-xlarge2/deberta-v2-xlarge/fold3/config.json
Model config DebertaV2Config {
  "_name_or_path": "/kaggle/input/anu-uspppm-deberta-v2-xlarge2/deberta-v2-xlarge/fold3/",
  "architectures": [
    "DebertaV2ForSequenceClassification"
  ],
  "attention_head_size": 64,
  "attention_probs_dropout_prob": 0.1,
  "conv_act": "gelu",
  "conv_kernel_size": 3,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1536,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 6144,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 24,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1536,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_bi

Completed predictions for fold 2
Running test predictions using deberta-v2-xlarge fold 3
model_path = /kaggle/input/anu-uspppm-deberta-v2-xlarge2/deberta-v2-xlarge/fold3/


All model checkpoint weights were used when initializing DebertaV2ForSequenceClassification.

All the weights of DebertaV2ForSequenceClassification were initialized from the model checkpoint at /kaggle/input/anu-uspppm-deberta-v2-xlarge2/deberta-v2-xlarge/fold3/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use DebertaV2ForSequenceClassification for predictions without further training.


len(tokenizer_vocab) = 128010


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running Prediction *****
  Num examples = 36
  Batch size = 64


Completed predictions for fold 3


loading configuration file /kaggle/input/anu-uspppm-deberta-v2-xlarge2/deberta-v2-xlarge/fold4/config.json
Model config DebertaV2Config {
  "_name_or_path": "/kaggle/input/anu-uspppm-deberta-v2-xlarge2/deberta-v2-xlarge/fold4/",
  "architectures": [
    "DebertaV2ForSequenceClassification"
  ],
  "attention_head_size": 64,
  "attention_probs_dropout_prob": 0.1,
  "conv_act": "gelu",
  "conv_kernel_size": 3,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1536,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 6144,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 24,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1536,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_bi

Running test predictions using deberta-v2-xlarge fold 4
model_path = /kaggle/input/anu-uspppm-deberta-v2-xlarge2/deberta-v2-xlarge/fold4/


All model checkpoint weights were used when initializing DebertaV2ForSequenceClassification.

All the weights of DebertaV2ForSequenceClassification were initialized from the model checkpoint at /kaggle/input/anu-uspppm-deberta-v2-xlarge2/deberta-v2-xlarge/fold4/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use DebertaV2ForSequenceClassification for predictions without further training.


len(tokenizer_vocab) = 128010


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running Prediction *****
  Num examples = 36
  Batch size = 64


Completed predictions for fold 4


### On test dataset generate predictions using level 1 models

In [11]:
df_submission = pd.DataFrame({"id": df_test.id})
for model_name in model_paths.keys():
    test_preds_arr = np.array(model_fold_preds[model_name])
    test_preds_avg = np.mean(test_preds_arr, axis=0)
    df_submission.loc[:, f"{model_name}-preds"] = test_preds_avg    

In [12]:
test_pred_cols = df_submission.columns.tolist()
test_pred_cols.remove("id")
test_pred_cols
# df_submission["score"] = df_submission[pred_cols].mean(axis=1)

['deberta-v3-large-preds', 'bert-for-patents-preds', 'deberta-v2-xlarge-preds']

### Now we will first train a level 2 linear regression model using OOF predictions of different level 1 models on train dataset

In [13]:
# Load the level1 models OOF predictions on train dataset
df_train_oof_bfp = pd.read_csv(oof_preds_paths[Models.BERT_FOR_PATENTS])
# df_train_oof_bfp = df_train_oof_bfp[oof_cols_to_use]
df_train_oof_dbv3l = pd.read_csv(oof_preds_paths[Models.DEBERTA_V3_LARGE])
df_train_oof_dbl = pd.read_csv(oof_preds_paths[Models.DEBERTA_V2_XLARGE])

In [14]:
def rename_val_preds(df, cols_to_use, suffix):
    df = df[cols_to_use]
    df.rename(columns = {"val_preds": "val_preds_"+suffix}, inplace=True)
    return df

In [15]:
import functools

df_list = [(df_train_oof_dbv3l, "dbv3l"), (df_train_oof_bfp, "bfp"), (df_train_oof_dbl, "dbv2xl")]
df_list_renamed = []
for idx, (df, suffix) in enumerate(df_list):
    cols = ["id", "val_preds"]
    if idx == 0:
        cols.append("score")
        cols.append("kfold")
    df_list_renamed.append(rename_val_preds(df, cols, suffix))
df_train_all = functools.reduce(lambda df1, df2: pd.merge(left=df1, right=df2, on=["id"], how="inner"), df_list_renamed)
df_train_all.head()

Unnamed: 0,id,val_preds_dbv3l,score,kfold,val_preds_bfp,val_preds_dbv2xl
0,54c1e3b9184cb5b6,0.085144,0.0,0,-0.00927,0.076599
1,ef2d4c2e6bbb208d,0.235596,0.25,0,0.1637,0.223633
2,cc96541d4987b399,0.297607,0.0,0,0.167,0.308838
3,a8c9e9f37d4d836a,0.161621,0.0,0,0.11896,0.246094
4,604210b7c7ce2f6a,0.525391,0.5,0,0.4807,0.515625


In [16]:
# # Merge different model OOF predictions to create a single dataframe
# df_train_all = pd.merge(
#     left = df_train_oof_bfp,
#     right = df_train_oof_dbv3l[["id", "val_preds"]],
#     how = "inner",
#     left_on = "id",
#     right_on = "id",
#     suffixes = ("_bfp", "_dbv3l")
# )

# df_train_all.head()

In [17]:
def get_fold_data(fold, df, X_cols, y_col):
    df_train = df[df.kfold != fold]
    df_val = df[df.kfold == fold]
    X_train = df_train[X_cols].to_numpy()
    y_train = df_train[y_col].to_numpy()
    X_val = df_val[X_cols].to_numpy()
    y_val = df_val[y_col].to_numpy()
    return X_train, y_train, X_val, y_val

In [18]:
# Target column (continuous valued)
y_col = "score"
# Input features (OOF predictions) to be used by L2 regression model
X_cols_train = ["val_preds_dbv3l","val_preds_bfp", "val_preds_dbv2xl"]
X_cols_test = test_pred_cols

In [19]:
from sklearn.linear_model import Ridge

def run_training(train_X, train_y, val_X, val_y, params=None):    
    model = Ridge(alpha=params["alpha"])    
    model.fit(train_X, train_y.ravel())
    val_y_pred = model.predict(val_X)
    #print(f"val_y_pred.shape = {val_y_pred.shape}")
    #print(f"val_y.shape = {val_y.shape}")
    p_corr_coeff = np.corrcoef(val_y_pred, val_y)[0][1]    
    return p_corr_coeff, model, val_y_pred

In [20]:
fold_metrics_model = []
l2_test_preds = []
model_params = {'alpha': 2.0}
# test data level 1 predictions to be used as input features by L2 model
X_test = df_submission[X_cols_test].to_numpy()

for fold in range(Config.NUM_FOLDS):
    X_train, y_train, X_val, y_val = get_fold_data(fold, df_train_all, X_cols_train, y_col)
    fold_pcc, model, fold_val_preds = run_training(X_train, y_train, X_val, y_val, params=model_params)
    print(f"fold {fold } pearson corr. coeff = {fold_pcc}")
    # add the level 2 validation predictions for the fold to a new column in train data
    df_train_all.loc[df_train_all.kfold == fold, "meta_val_preds"] = fold_val_preds        
    fold_metrics_model.append((round(fold_pcc, 4), model))
    fold_test_preds = model.predict(X_test)
    l2_test_preds.append(fold_test_preds)

fold 0 pearson corr. coeff = 0.8813455538507928
fold 1 pearson corr. coeff = 0.8714793770030376
fold 2 pearson corr. coeff = 0.8723145829405757
fold 3 pearson corr. coeff = 0.8720874415180684
fold 4 pearson corr. coeff = 0.858539480612362


In [21]:
# Calculate the CV score
meta_val_preds = df_train_all['meta_val_preds'].values
labels = df_train_all['score'].values
p_corr_coeff_cv = np.corrcoef(meta_val_preds, labels)[0][1]    
print(f"L2 model CV score = {p_corr_coeff_cv}")

L2 model CV score = 0.8709813043182332


In [22]:
l2_test_preds_arr = np.array(l2_test_preds)
l2_test_preds_avg = np.mean(l2_test_preds_arr, axis=0)
df_submission["score"] = l2_test_preds_avg

In [23]:
df_submission[["id", "score"]].head()

Unnamed: 0,id,score
0,4112d61851461f60,0.518275
1,5203a36c501f1b7c,0.737798
2,7aa5908a77a7ec24,0.341652
3,09e418c93a776564,0.695098
4,36baf228038e314b,0.508804


In [24]:
df_submission[["id", "score"]].to_csv("/kaggle/working/submission.csv", index=False)