In [1]:
!nvidia-smi

Wed Jun 22 11:40:11 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    25W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
%pip install -q transformers[sentencepiece] datasets

[K     |████████████████████████████████| 4.4 MB 5.2 MB/s 
[K     |████████████████████████████████| 362 kB 57.8 MB/s 
[K     |████████████████████████████████| 1.1 MB 59.8 MB/s 
[K     |████████████████████████████████| 101 kB 12.7 MB/s 
[K     |████████████████████████████████| 140 kB 75.2 MB/s 
[K     |████████████████████████████████| 212 kB 60.3 MB/s 
[K     |████████████████████████████████| 596 kB 73.2 MB/s 
[K     |████████████████████████████████| 127 kB 53.1 MB/s 
[K     |████████████████████████████████| 94 kB 3.1 MB/s 
[K     |████████████████████████████████| 144 kB 74.8 MB/s 
[K     |████████████████████████████████| 271 kB 78.8 MB/s 
[K     |████████████████████████████████| 6.6 MB 50.3 MB/s 
[K     |████████████████████████████████| 1.2 MB 65.3 MB/s 
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datascience 0.10.6 requires fo

### In this notebook we experiment by finetuning a bert-for-patents model on competition data by adding patent section as special token to the tokenizer vocab.

In [4]:
# from pydrive.auth import GoogleAuth
# from pydrive.drive import GoogleDrive
# from google.colab import auth
# from oauth2client.client import GoogleCredentials

# auth.authenticate_user()
# gauth = GoogleAuth()
# gauth.credentials = GoogleCredentials.get_application_default()
# my_drive = GoogleDrive(gauth)

In [35]:
import numpy as np
import pandas as pd
import torch
import os
from sklearn.model_selection import StratifiedKFold
from datasets import Dataset
from torch.utils.data import DataLoader
import transformers
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import torch.multiprocessing as mp
from transformers import logging
import warnings

logging.set_verbosity_error()
logging.set_verbosity_warning()
warnings.filterwarnings('ignore')


In [36]:
class TrainingArgs:
    weight_decay = 0.01
    learning_rate = 1e-5
    warmup_ratio = 0.1
    gradient_accumulation_steps = 8
    fp16 = True
    lr_scheduler_type = "linear"
    # Number of checkpoints to save for each model
    save_total_limit = 1
    #  Whether or not to load the best model found during training at the end of training.
    load_best_model_at_end=True
    # Use in conjunction with `load_best_model_at_end` to specify the metric to use to compare two different
    # models. Must be the name of a metric returned by the evaluation with or without the prefix `"eval_"`. Will
    # default to `"loss"` if unspecified and `load_best_model_at_end=True` (to use the evaluation loss).
    # If you set this value, `greater_is_better` will default to `True`. Don't forget to set it to `False` if
    # your metric is better when lower.
    metric_for_best_model="pearson"
    greater_is_better=True
    adam_epsilon=1e-6
    #warmup_steps=1000
    log_level="warning"
    group_by_length=True

class Config:
    MODEL_NAME = "deberta-v3-large"
    DATA_PATH = "/content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/data/"
    VAL_PREDS_PATH = "/content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/preds/"
    # location where trained model weights are saved
    OUT_DIR = "/content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/deberta-v3-large/"
    RUNTIME = "COLAB"
    RANDOM_STATE = 42
    BATCH_SIZE = 4
    EVAL_BATCH_SIZE = 8
    NUM_LABELS = 1
    LABEL_COL = "score"
    NUM_FOLDS = 5
    RUN_ALL_FOLDS = False
    NUM_EPOCHS = 1
    NUM_WORKERS = mp.cpu_count()
    TRANSFORMER_CHECKPOINT = "microsoft/deberta-v3-large"
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    SUBSET_ROWS_FRAC = 0.05
    TRAIN_ON_SUBSET = False
    RANDOM_SEED = 42

os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [37]:
def empty_gdrive_trash():
    deleted_file_name = []
    for a_file in my_drive.ListFile({'q': "trashed = true"}).GetList():
        file_name = a_file['title']
        deleted_file_name.append(file_name)
        # delete the file permanently.
        a_file.Delete()
    print("The below files were cleared from trash")
    print(deleted_file_name)

In [38]:
# empty_gdrive_trash()

In [39]:
df_train = pd.read_csv(Config.DATA_PATH + "train.csv")
df_test = pd.read_csv(Config.DATA_PATH + "test.csv")
df_titles = pd.read_csv(Config.DATA_PATH + "titles.csv")

In [40]:
df_train["section"] = df_train.context.str[0]

In [41]:
from sklearn.preprocessing import LabelEncoder

anchor_encoder = LabelEncoder()
df_train["anchor_map"] = anchor_encoder.fit_transform(df_train["anchor"])
df_train["context_map"] = anchor_encoder.fit_transform(df_train["context"])
df_train["anchor_context_map"] = df_train["anchor_map"].astype(str).str.cat(df_train["context_map"].astype(str), sep="_")
# Score is not really a continuous value here as there are just five distinct values. But since it is float it needs to be converted
# to categorical value before we can perform stratified split on score
df_train["score_map"] = df_train["score"].map({0.00: 0, 0.25: 1, 0.50: 2, 0.75: 3, 1.00: 4})

In [42]:
from sklearn import model_selection

def strat_group_kfold_dataframe(df, target_col_name, group_col_name, num_folds=Config.NUM_FOLDS):
    # we create a new column called kfold and fill it with -1
    df["kfold"] = -1
    # randomize of shuffle the rows of dataframe before splitting is done
    df = df.sample(frac=1, random_state=Config.RANDOM_SEED).reset_index(drop=True)
    # get the target data
    y = df[target_col_name].values    
    groups = df[group_col_name].values
    # stratify data using anchor as group and score as target
    skf = model_selection.StratifiedGroupKFold(n_splits=num_folds, shuffle=True, random_state=Config.RANDOM_SEED)
    for fold, (train_index, val_index) in enumerate(skf.split(X=df, y=y, groups=groups)):
        df.loc[val_index, "kfold"] = fold        
    return df     

In [43]:
def strat_kfold_dataframe(df, target_col_name, num_folds=5):
    # we create a new column called kfold and fill it with -1
    df["kfold"] = -1
    # randomize of shuffle the rows of dataframe before splitting is done
    df.sample(frac=1, random_state=Config.RANDOM_STATE).reset_index(drop=True)
    y = df[target_col_name].values
    skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=Config.RANDOM_STATE)
    # stratification is done on the basis of y labels, a placeholder for X is sufficient
    for fold, (train_idx, val_idx) in enumerate(skf.split(X=df, y=y)):
        df.loc[val_idx, "kfold"] = fold
    return df

In [44]:
if Config.TRAIN_ON_SUBSET:
    print(f"Selecting {Config.SUBSET_ROWS_FRAC * 100}% training data")
    df_train = df_train.sample(frac=Config.SUBSET_ROWS_FRAC, random_state=Config.RANDOM_SEED).reset_index(drop=True)

# Since the target column (score) is continuous, we need to create bins out of the target column
# df_train.loc[:, "bins"] = pd.cut(df_train.score, bins=5, labels=[0,1,2,3,4])
# df_train = strat_kfold_dataframe(df_train, target_col_name="bins", num_folds=Config.NUM_FOLDS)

# Now do a stratified group k fold on the bins column (which is a categorical column) and anchor as groups
df_train = strat_group_kfold_dataframe(df_train, target_col_name="score_map", group_col_name="anchor_context_map", num_folds=Config.NUM_FOLDS)            
# drop the bin column
# df_train = df_train.drop(["bins"], axis=1)
# df_train = df_train.drop(["anchor_map", "score_map"], axis=1)

In [45]:
# Let us check if the stratification has been done correctly
# The mean of score column should be similar across folds 
fold_score_mean = []
fold_anchor_context_maps = []
for fold in range(Config.NUM_FOLDS):
    df_train_fold = df_train[df_train.kfold == fold]
    fold_score_mean.append(np.mean(df_train_fold.score.values))
    fold_anchor_context_maps.append(set(df_train_fold.anchor_context_map.unique()))
fold_score_mean

[0.3557356434260165,
 0.35813229056203605,
 0.36268028846153844,
 0.36315899290582837,
 0.37085976039464413]

In [46]:
# check each of the folds has no common anchor value
def check_disjoint(start, fold_anchor_context_maps):
    for i in range(start, 4):
        for j in range(i+1, 5):
            if fold_anchor_context_maps[i].isdisjoint(fold_anchor_context_maps[j]):
                print(f"anchor context map for fold {i} and {j} are disjoint")

check_disjoint(0, fold_anchor_context_maps)                

anchor context map for fold 0 and 1 are disjoint
anchor context map for fold 0 and 2 are disjoint
anchor context map for fold 0 and 3 are disjoint
anchor context map for fold 0 and 4 are disjoint
anchor context map for fold 1 and 2 are disjoint
anchor context map for fold 1 and 3 are disjoint
anchor context map for fold 1 and 4 are disjoint
anchor context map for fold 2 and 3 are disjoint
anchor context map for fold 2 and 4 are disjoint
anchor context map for fold 3 and 4 are disjoint


In [47]:
df_titles.head()

Unnamed: 0,code,title,section,class,subclass,group,main_group
0,A,HUMAN NECESSITIES,A,,,,
1,A01,AGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTI...,A,1.0,,,
2,A01B,SOIL WORKING IN AGRICULTURE OR FORESTRY; PARTS...,A,1.0,B,,
3,A01B1/00,Hand tools (edge trimmers for lawns A01G3/06 ...,A,1.0,B,1.0,0.0
4,A01B1/02,Spades; Shovels {(hand-operated dredgers E02F3...,A,1.0,B,1.0,2.0


In [48]:
df_train = pd.merge(
    left = df_train,
    right = df_titles[["code", "title"]],
    how = "inner",
    left_on = "context",
    right_on = "code"
)

In [49]:
df_train.anchor_context_map.value_counts()

555_89     84
129_47     79
324_49     79
430_104    74
129_48     73
           ..
525_62      1
649_62      1
480_11      1
485_11      1
727_101     1
Name: anchor_context_map, Length: 1699, dtype: int64

In [50]:
# For each anchor, context group (i.e. set of records having same anchor and context values), concatenate the target phrases
# key is unique anchor_context_map , value is concatenation of target phrases of all records for that unique anchor_context_map
anc_ctx_targets = {}
for anchor_context_map in df_train.anchor_context_map.unique():
    df_train_sub = df_train[df_train.anchor_context_map == anchor_context_map]
    anchor_context_target_text = ",".join(df_train_sub.target)    
    anc_ctx_targets[anchor_context_map] = anchor_context_target_text

df_train["anchor_context_targets"] = df_train.anchor_context_map.map(anc_ctx_targets)
df_train["anc_ctx_tgt_len"] = df_train["anchor_context_targets"].apply(lambda text: len(text.split()))
df_train = df_train.sort_values(by=["anc_ctx_tgt_len"], ascending=False)
# df_train = df_train.head(500)
# df_train = df_train[df_train.anchor_context_map == "555_89"]

In [51]:
import torch.nn as nn
from typing import Optional, Union, Tuple
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers import DebertaV2Model, DebertaV2PreTrainedModel
from transformers.models.deberta.modeling_deberta import ContextPooler, StableDropout

In [52]:
# base class for implementing custom heads on top of deberta-v2 backbone
class DebertaV2ForSeqClfBase(DebertaV2PreTrainedModel):    
    def __init__(self, config):
        super().__init__(config)

        num_labels = getattr(config, "num_labels", 2)
        self.num_labels = num_labels

        self.deberta = DebertaV2Model(config)
        self.pooler = ContextPooler(config)
        output_dim = self.pooler.output_dim

        self.classifier = nn.Linear(output_dim, num_labels)
        drop_out = getattr(config, "cls_dropout", None)
        drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out
        self.dropout = StableDropout(drop_out)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.deberta.get_input_embeddings()

    def set_input_embeddings(self, new_embeddings):
        self.deberta.set_input_embeddings(new_embeddings)
    
    def get_loss(self, labels, logits):
        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    # regression task
                    loss_fn = nn.MSELoss()
                    logits = logits.view(-1).to(labels.dtype)
                    loss = loss_fn(logits, labels.view(-1))
                elif labels.dim() == 1 or labels.size(-1) == 1:
                    label_index = (labels >= 0).nonzero()
                    labels = labels.long()
                    if label_index.size(0) > 0:
                        labeled_logits = torch.gather(
                            logits, 0, label_index.expand(label_index.size(0), logits.size(1))
                        )
                        labels = torch.gather(labels, 0, label_index.view(-1))
                        loss_fct = nn.CrossEntropyLoss()
                        loss = loss_fct(labeled_logits.view(-1, self.num_labels).float(), labels.view(-1))
                    else:
                        loss = torch.tensor(0).to(logits)
                else:
                    log_softmax = nn.LogSoftmax(-1)
                    loss = -((log_softmax(logits) * labels).sum(-1)).mean()
            elif self.config.problem_type == "regression":
                loss_fct = nn.MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = nn.CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = nn.BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)
        return loss

In [53]:
class DebertaV2ForSeqClfMeanPooling(DebertaV2ForSeqClfBase):    
    def __init__(self, config):
        super().__init__(config)
    
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.deberta(
            input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # hidden state from the last layer [batch_size, seq_len, hidden_size]
        last_hidden_state = outputs[0]
        # copy the 2d attention mask [batch_size, seq_len] hidden_size times in the third dimension (hidden state)
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        # [batch_size, seq_len, hidden_size ]
        # Each hidden state is a tensor of dimension [batch_size, seq_len] and we have hidden_size number of such hidden state
        # Of these seq_len columns in each hidden state only those need to be taken into account for which attention_mask = 1.  
        # Doing an element wise multiplication of the 2d attention mask [batch_size, seq_len] with the corresponding 2d hidden state 
        # [batch_size, seq_len] gives hidden state with only the non-padded columns. Sum this hidden state along dimension 1 
        # ( the dimension of sequence length) to get the sum_embeddings [batch_size, hidden_size] 
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        # The sum mask is a value between 0 to 256 signifying the number of unpadded columns for that hidden state
        sum_mask = input_mask_expanded.sum(1)
        # [batch_size, hidden_size]
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        # element wise division 
        mean_embeddings = sum_embeddings / sum_mask
        # [batch_size, hidden_size]
        logits = self.classifier(mean_embeddings)
        loss = self.get_loss(labels, logits)
        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
        )


In [54]:
class DebertaV2ForSeqClfMaxPooling(DebertaV2ForSeqClfBase):    
    def __init__(self, config):
        super().__init__(config)
    
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.deberta(
            input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # hidden state from the last layer [batch_size, seq_len, hidden_size]
        last_hidden_state = outputs[0]
        # copy the 2d attention mask [batch_size, seq_len] hidden_size times in the third dimension (hidden state)
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        # [batch_size, seq_len, hidden_size ]        
        # Set padding tokens to large negative value
        last_hidden_state[input_mask_expanded == 0] = -1e9  
        # Get the max value along dimension 1 (seq_len dimension) of last_hidden_state. The result will be [batch_size, hidden_size] tensor
        # The first output torch.max returns the max values along a dimension, the second output is the index of max value
        max_embeddings = torch.max(last_hidden_state, 1)[0]        
        # [batch_size, hidden_size]
        logits = self.classifier(max_embeddings)
        loss = self.get_loss(labels, logits)
        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
        )


In [55]:
class DebertaV2ForSeqClfMeanMaxPooling(DebertaV2ForSeqClfBase):    
    def __init__(self, config):
        super().__init__(config)
    
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.deberta(
            input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        
        last_hidden_state = outputs[0]        
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()

        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)        
        sum_mask = input_mask_expanded.sum(1)        
        sum_mask = torch.clamp(sum_mask, min=1e-9)        
        mean_pooling_embeddings = sum_embeddings / sum_mask        

        last_hidden_state[input_mask_expanded == 0] = -1e9          
        max_pooling_embeddings = torch.max(last_hidden_state, 1)[0]

        mean_max_embeddings = torch.cat((mean_pooling_embeddings, max_pooling_embeddings), 1)
        # [batch_size, 2*hidden_size]
        logits = self.classifier(mean_max_embeddings)
        loss = self.get_loss(labels, logits)
        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
        )


In [56]:
from transformers import AutoTokenizer, DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained(Config.TRANSFORMER_CHECKPOINT)
# DataCollatorWithPadding pads each batch to the longest sequence length
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [57]:
df_train['sectok'] = '[' + df_train.section + ']'
sectoks = list(df_train.sectok.unique())
print(f"Additional special tokens: {sectoks}")
tokenizer.add_special_tokens({'additional_special_tokens': sectoks})

Additional special tokens: ['[G]', '[C]', '[H]', '[B]', '[A]', '[E]', '[F]', '[D]']


8

In [58]:
sep = " " + tokenizer.sep_token + " "
# sep = tokenizer.sep_token
df_train["inputs"] = df_train.sectok + sep + df_train.anchor + sep + df_train.target + sep + df_train.title + sep + df_train.anchor_context_targets
df_train["inputs"] = df_train["inputs"].apply(lambda x: x.lower())
df_train.head()

Unnamed: 0,id,anchor,target,context,score,section,anchor_map,context_map,anchor_context_map,score_map,kfold,code,title,anchor_context_targets,anc_ctx_tgt_len,sectok,inputs
18583,426b5d4ee52dfbba,reflection type liquid crystal display,reflective mode liquid display,G02,0.5,G,555,89,555_89,2,4,G02,OPTICS,"liquid matrix type crystal,lcd displays reflec...",213,[G],[g] [sep] reflection type liquid crystal displ...
19157,d583a6c02fed7b2a,reflection type liquid crystal display,reflection mode lcd crystal,G02,0.5,G,555,89,555_89,2,4,G02,OPTICS,"liquid matrix type crystal,lcd displays reflec...",213,[G],[g] [sep] reflection type liquid crystal displ...
19227,668eb746e5b96f9e,reflection type liquid crystal display,mobile,G02,0.25,G,555,89,555_89,1,4,G02,OPTICS,"liquid matrix type crystal,lcd displays reflec...",213,[G],[g] [sep] reflection type liquid crystal displ...
19221,b0e707f934a27619,reflection type liquid crystal display,reflection type crystal display,G02,0.5,G,555,89,555_89,2,4,G02,OPTICS,"liquid matrix type crystal,lcd displays reflec...",213,[G],[g] [sep] reflection type liquid crystal displ...
19217,722fbf83a2054afa,reflection type liquid crystal display,liquid crystal device,G02,0.5,G,555,89,555_89,2,4,G02,OPTICS,"liquid matrix type crystal,lcd displays reflec...",213,[G],[g] [sep] reflection type liquid crystal displ...


In [59]:
def tokenize_text(tokenizer, with_labels, row):
    encoding = tokenizer(
        text = row["inputs"],
        padding = False,
        truncation = True,
        # maximum possible sequence length (for inputs column). Sequences exceeding this
        # length will be truncated
        max_length = 512
    )
    if with_labels:
        encoding["labels"] = row[Config.LABEL_COL]
    return encoding

In [60]:
from functools import partial

preprocess_train_data = partial(tokenize_text, tokenizer, True)  
preprocess_test_data = partial(tokenize_text, tokenizer, False)  

In [61]:
def get_fold_dls(fold, df):
    train_df = df[df.kfold != fold].reset_index(drop=True)
    valid_df = df[df.kfold == fold].reset_index(drop=True)
    ds_train_raw = Dataset.from_pandas(train_df)
    ds_valid_raw = Dataset.from_pandas(valid_df)
    raw_ds_col_names = ds_train_raw.column_names    
    ds_train = ds_train_raw.map(preprocess_train_data, batched=True, batch_size=1000, remove_columns=raw_ds_col_names)
    ds_valid = ds_valid_raw.map(preprocess_train_data, batched=True, batch_size=1000, remove_columns=raw_ds_col_names)    
    return train_df, valid_df, ds_train, ds_valid

In [62]:
# def compute_metrics(eval_pred):
#     predictions, labels = eval_pred        
#     if Config.NUM_LABELS == 1:
#         y_preds = predictions.reshape(len(predictions))
#     else:
#         y_preds = np.argmax(predictions, axis=1)
#     return {
#         'eval_pearson': np.corrcoef(y_preds, labels)[0][1]
#     }

In [63]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.reshape(len(predictions))
    return {
        'pearson': np.corrcoef(predictions, labels)[0][1]
    }

In [64]:
# def get_oof_preds(trainer, ds_val, df_val_fold):
#     oof_outputs = trainer.predict(ds_val)
#     if Config.NUM_LABELS == 1:
#         y_preds_proba = oof_outputs.predictions
#         oof_predictions = np.argmax(y_preds_proba, axis=1)
#     else:
#         oof_predictions = oof_outputs.predictions.reshape(-1)
#     df_val_fold["val_preds"] = oof_predictions
#     return df_val_fold

In [65]:
def get_oof_preds(trainer, ds_val, df_val_fold):
    oof_outputs = trainer.predict(ds_val)
    oof_predictions = oof_outputs.predictions.reshape(-1)
    df_val_fold["val_preds"] = oof_predictions
    return df_val_fold

In [66]:
def get_training_args(fold_str):
    training_args = TrainingArguments(
        output_dir=Config.OUT_DIR + fold_str,
        evaluation_strategy="epoch",
        save_strategy='epoch',        
        num_train_epochs=Config.NUM_EPOCHS,
        per_device_train_batch_size=Config.BATCH_SIZE,
        per_device_eval_batch_size=Config.EVAL_BATCH_SIZE,
        warmup_ratio=TrainingArgs.warmup_ratio,
        weight_decay=TrainingArgs.weight_decay,
        learning_rate=TrainingArgs.learning_rate,    
        gradient_accumulation_steps=TrainingArgs.gradient_accumulation_steps,
        fp16=TrainingArgs.fp16,
        lr_scheduler_type=TrainingArgs.lr_scheduler_type,
        save_total_limit=TrainingArgs.save_total_limit,
        load_best_model_at_end=TrainingArgs.load_best_model_at_end,
        metric_for_best_model=TrainingArgs.metric_for_best_model,
        greater_is_better=TrainingArgs.greater_is_better,
        adam_epsilon=TrainingArgs.adam_epsilon,
        #warmup_steps=TrainingArgs.warmup_steps,
        log_level=TrainingArgs.log_level,
        group_by_length=TrainingArgs.group_by_length
    )
    return training_args

In [None]:
import gc

df_val_preds = pd.DataFrame()
tok_vocab = tokenizer.get_vocab()
for fold in range(Config.NUM_FOLDS):
    fold_str = f"fold{fold}"
    print(f"Running training for {fold_str}")
    df_train_fold, df_val_fold, ds_train, ds_val = get_fold_dls(fold, df_train)
    training_args = get_training_args(fold_str)
    model = AutoModelForSequenceClassification.from_pretrained(Config.TRANSFORMER_CHECKPOINT, num_labels=Config.NUM_LABELS)
    print(f"len(tokenizer_vocab) = {len(tok_vocab)}")
    model.resize_token_embeddings(len(tok_vocab))    
    trainer = Trainer(
        model=model,                         # the instantiated Transformers model to be trained
        args=training_args,                  # training arguments, defined above
        train_dataset=ds_train,              # training dataset
        eval_dataset=ds_val,                 # evaluation dataset
        compute_metrics=compute_metrics,     # the callback that computes metrics of interest
        data_collator=data_collator,
        tokenizer=tokenizer
    )
    trainer.train()
    trainer.save_model(Config.OUT_DIR + fold_str)
    df_val_fold = get_oof_preds(trainer, ds_val, df_val_fold) 
    # display(df_val_fold.head())
    df_val_preds = pd.concat([df_val_preds, df_val_fold], axis=0)
    # export the oof predictions to csv for later use in stacking
    if Config.RUNTIME != "KAGGLE":
        df_val_fold.to_csv(Config.VAL_PREDS_PATH + f"df_train_oof_preds_{Config.MODEL_NAME}_{fold_str}.csv")
    else:
        df_val_preds.to_csv("/kaggle/working/df_train_oof_preds.csv")
    print(f"Saved OOF predictions for fold {fold}")    
    del model, trainer
    gc.collect()
    torch.cuda.empty_cache()
    # Empty the trash to clear gdrive disk space
    # empty_gdrive_trash()
    if not Config.RUN_ALL_FOLDS:
        break

if Config.RUN_ALL_FOLDS:
    df_val_preds.to_csv(Config.VAL_PREDS_PATH + f"df_train_oof_preds_{Config.MODEL_NAME}.csv")

Running training for fold0


  0%|          | 0/30 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


  0%|          | 0/8 [00:00<?, ?ba/s]

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.classifier.weight', 'mask_predictions.dense.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from 

len(tokenizer_vocab) = 128009


Epoch,Training Loss,Validation Loss


In [None]:
# Calculate the CV score
predictions = df_val_preds['val_preds'].values
labels = df_val_preds['score'].values
eval_preds = predictions, labels
cv_metric_dict = compute_metrics(eval_preds)
print(f"CV score = {cv_metric_dict}")