In [17]:
!nvidia-smi

Sat Jun 25 06:31:49 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P0    26W / 250W |      2MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [18]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [19]:
%pip install -q transformers[sentencepiece] datasets hydra-core

In [20]:
import sys
import os

COLAB_ROOT_PATH = "/content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/"
sys.path.append(COLAB_ROOT_PATH + "src/util_code")
os.chdir(COLAB_ROOT_PATH)

In [46]:
import hydra
from hydra import initialize, initialize_config_module, initialize_config_dir, compose
from omegaconf import DictConfig, OmegaConf

hydra.core.global_hydra.GlobalHydra.instance().clear()
initialize(version_base=None, config_path="configs")
config = compose(config_name="config")
print(OmegaConf.to_yaml(config))

train_run:
  random_state: 42
  num_labels: 1
  label_col: score
  num_folds: 5
  run_all_folds: false
  num_epochs: 2
  num_workers: 2
  device: cuda
  subset_rows_frac: 0.05
  train_on_subset: false
  transformer_checkpoint: microsoft/deberta-v3-small
training_args:
  output_dir: None
  evaluation_strategy: epoch
  save_strategy: epoch
  save_total_limit: 1
  load_best_model_at_end: true
  metric_for_best_model: pearson
  greater_is_better: true
  group_by_length: true
  num_train_epochs: 5
  per_device_train_batch_size: 32
  per_device_eval_batch_size: 64
  weight_decay: 0.01
  learning_rate: 4.0e-05
  warmup_ratio: 0.1
  gradient_accumulation_steps: 4
  fp16: true
  lr_scheduler_type: linear
  adam_epsilon: 1.0e-06
paths:
  data_path: /content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/data/
  val_preds_path: /content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/preds/
  out_dir: /content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/



In [22]:
import colab_utils
import utils
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import StratifiedKFold
from datasets import Dataset
from torch.utils.data import DataLoader
import transformers
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import torch.multiprocessing as mp
from transformers import logging
import warnings

logging.set_verbosity_error()
logging.set_verbosity_warning()
warnings.filterwarnings('ignore')
os.environ["TOKENIZERS_PARALLELISM"] = "false"


In [47]:
# This is where we configure the run parameters
# Model to run
MODEL_NAME = "deberta-v3-small"
TRANSFORMER_CHECKPOINT = "microsoft/deberta-v3-small"
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# update the config with run params
# update the output_dir to model specific directory
config.paths.out_dir += MODEL_NAME
print(f"config.paths.out_dir = {config.paths.out_dir}")
config.train_run.transformer_checkpoint = TRANSFORMER_CHECKPOINT
print(f"config.train_run.transformer_checkpoint = {config.train_run.transformer_checkpoint}")
config.train_run.num_workers = mp.cpu_count()
print(f"config.train_run.num_workers = {config.train_run.num_workers}")

config.paths.out_dir = /content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/deberta-v3-small
config.train_run.transformer_checkpoint = microsoft/deberta-v3-small
config.train_run.num_workers = 2


In [25]:
colab_utils.empty_gdrive_trash()

The below files were cleared from trash
['rng_state.pth', 'trainer_state.json', 'scaler.pt', 'scheduler.pt', 'optimizer.pt', 'training_args.bin', 'tokenizer.json', 'spm.model', 'added_tokens.json', 'special_tokens_map.json', 'tokenizer_config.json', 'pytorch_model.bin', 'config.json', 'checkpoint-458']


In [26]:
df_train = pd.read_csv(config.paths.data_path + "train.csv")
df_train["section"] = df_train.context.str[0]
df_test = pd.read_csv(config.paths.data_path + "test.csv")
df_titles = pd.read_csv(config.paths.data_path + "titles.csv")

In [27]:
from sklearn.preprocessing import LabelEncoder

anchor_encoder = LabelEncoder()
df_train["anchor_map"] = anchor_encoder.fit_transform(df_train["anchor"])
df_train["context_map"] = anchor_encoder.fit_transform(df_train["context"])
df_train["anchor_context_map"] = df_train["anchor_map"].astype(str).str.cat(df_train["context_map"].astype(str), sep="_")
# Score is not really a continuous value here as there are just five distinct values. But since it is float it needs to be converted
# to categorical value before we can perform stratified split on score
df_train["score_map"] = df_train["score"].map({0.00: 0, 0.25: 1, 0.50: 2, 0.75: 3, 1.00: 4})

In [30]:
if config.train_run.train_on_subset:
    print(f"Selecting {config.train_run.subset_rows_frac * 100}% training data")
    df_train = df_train.sample(
        frac=config.train_run.subset_rows_frac, 
        random_state=config.train_run.random_state
    ).reset_index(drop=True)

# Since the target column (score) is continuous, we need to create bins out of the target column
# df_train.loc[:, "bins"] = pd.cut(df_train.score, bins=5, labels=[0,1,2,3,4])
# df_train = utils.strat_kfold_dataframe(df_train, target_col_name="bins", random_state=config.train_run.random_state, num_folds=Config.NUM_FOLDS)

# Now do a stratified group k fold on the bins column (which is a categorical column) and anchor as groups
df_train = utils.strat_group_kfold_dataframe(
    df_train, 
    target_col_name="score_map", 
    group_col_name="anchor_context_map",
    random_state=config.train_run.random_state, 
    num_folds=config.train_run.num_folds
)            
# drop the bin column
# df_train = df_train.drop(["bins"], axis=1)
# df_train = df_train.drop(["anchor_map", "score_map"], axis=1)

In [31]:
# Let us check if the stratification has been done correctly
# The mean of score column should be similar across folds 
fold_score_mean = []
fold_anchor_context_maps = []
for fold in range(config.train_run.num_folds):
    df_train_fold = df_train[df_train.kfold == fold]
    fold_score_mean.append(np.mean(df_train_fold.score.values))
    fold_anchor_context_maps.append(set(df_train_fold.anchor_context_map.unique()))
fold_score_mean

[0.3557356434260165,
 0.35813229056203605,
 0.36268028846153844,
 0.36315899290582837,
 0.37085976039464413]

In [32]:
# check each of the folds has no common anchor value
def check_disjoint(start, fold_anchor_context_maps):
    for i in range(start, 4):
        for j in range(i+1, 5):
            if fold_anchor_context_maps[i].isdisjoint(fold_anchor_context_maps[j]):
                print(f"anchor context map for fold {i} and {j} are disjoint")

check_disjoint(0, fold_anchor_context_maps)                

anchor context map for fold 0 and 1 are disjoint
anchor context map for fold 0 and 2 are disjoint
anchor context map for fold 0 and 3 are disjoint
anchor context map for fold 0 and 4 are disjoint
anchor context map for fold 1 and 2 are disjoint
anchor context map for fold 1 and 3 are disjoint
anchor context map for fold 1 and 4 are disjoint
anchor context map for fold 2 and 3 are disjoint
anchor context map for fold 2 and 4 are disjoint
anchor context map for fold 3 and 4 are disjoint


In [33]:
df_titles.head()

Unnamed: 0,code,title,section,class,subclass,group,main_group
0,A,HUMAN NECESSITIES,A,,,,
1,A01,AGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTI...,A,1.0,,,
2,A01B,SOIL WORKING IN AGRICULTURE OR FORESTRY; PARTS...,A,1.0,B,,
3,A01B1/00,Hand tools (edge trimmers for lawns A01G3/06 ...,A,1.0,B,1.0,0.0
4,A01B1/02,Spades; Shovels {(hand-operated dredgers E02F3...,A,1.0,B,1.0,2.0


In [34]:
df_train = pd.merge(
    left = df_train,
    right = df_titles[["code", "title"]],
    how = "inner",
    left_on = "context",
    right_on = "code"
)

In [35]:
df_train.anchor_context_map.value_counts()

555_89     84
129_47     79
324_49     79
430_104    74
129_48     73
           ..
525_62      1
649_62      1
480_11      1
485_11      1
727_101     1
Name: anchor_context_map, Length: 1699, dtype: int64

In [36]:
# For each anchor, context group (i.e. set of records having same anchor and context values), concatenate the target phrases
# key is unique anchor_context_map , value is concatenation of target phrases of all records for that unique anchor_context_map
anc_ctx_targets = {}
for anchor_context_map in df_train.anchor_context_map.unique():
    df_train_sub = df_train[df_train.anchor_context_map == anchor_context_map]
    anchor_context_target_text = ",".join(df_train_sub.target)    
    anc_ctx_targets[anchor_context_map] = anchor_context_target_text

df_train["anchor_context_targets"] = df_train.anchor_context_map.map(anc_ctx_targets)
df_train["anc_ctx_tgt_len"] = df_train["anchor_context_targets"].apply(lambda text: len(text.split()))
df_train = df_train.sort_values(by=["anc_ctx_tgt_len"], ascending=False)
# df_train = df_train.head(500)
# df_train = df_train[df_train.anchor_context_map == "555_89"]

In [37]:
from transformers import AutoTokenizer, DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained(config.train_run.transformer_checkpoint)
# DataCollatorWithPadding pads each batch to the longest sequence length
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/578 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [38]:
df_train['sectok'] = '[' + df_train.section + ']'
sectoks = list(df_train.sectok.unique())
print(f"Additional special tokens: {sectoks}")
tokenizer.add_special_tokens({'additional_special_tokens': sectoks})

Additional special tokens: ['[G]', '[C]', '[H]', '[B]', '[A]', '[E]', '[F]', '[D]']


8

In [39]:
sep = " " + tokenizer.sep_token + " "
df_train["inputs"] = df_train.sectok + sep + df_train.anchor + sep + df_train.target + sep + df_train.title + sep + df_train.anchor_context_targets
df_train["inputs"] = df_train["inputs"].apply(lambda x: x.lower())
df_train.head()

Unnamed: 0,id,anchor,target,context,score,section,anchor_map,context_map,anchor_context_map,score_map,kfold,code,title,anchor_context_targets,anc_ctx_tgt_len,sectok,inputs
18583,426b5d4ee52dfbba,reflection type liquid crystal display,reflective mode liquid display,G02,0.5,G,555,89,555_89,2,4,G02,OPTICS,"liquid matrix type crystal,lcd displays reflec...",213,[G],[g] [sep] reflection type liquid crystal displ...
19157,d583a6c02fed7b2a,reflection type liquid crystal display,reflection mode lcd crystal,G02,0.5,G,555,89,555_89,2,4,G02,OPTICS,"liquid matrix type crystal,lcd displays reflec...",213,[G],[g] [sep] reflection type liquid crystal displ...
19227,668eb746e5b96f9e,reflection type liquid crystal display,mobile,G02,0.25,G,555,89,555_89,1,4,G02,OPTICS,"liquid matrix type crystal,lcd displays reflec...",213,[G],[g] [sep] reflection type liquid crystal displ...
19221,b0e707f934a27619,reflection type liquid crystal display,reflection type crystal display,G02,0.5,G,555,89,555_89,2,4,G02,OPTICS,"liquid matrix type crystal,lcd displays reflec...",213,[G],[g] [sep] reflection type liquid crystal displ...
19217,722fbf83a2054afa,reflection type liquid crystal display,liquid crystal device,G02,0.5,G,555,89,555_89,2,4,G02,OPTICS,"liquid matrix type crystal,lcd displays reflec...",213,[G],[g] [sep] reflection type liquid crystal displ...


In [40]:
def tokenize_text(tokenizer, with_labels, row):
    encoding = tokenizer(
        text = row["inputs"],
        padding = False,
        truncation = True,
        # maximum possible sequence length (for inputs column). Sequences exceeding this length will be truncated
        max_length = 512
    )
    if with_labels:
        encoding["labels"] = row[config.train_run.label_col]
    return encoding

In [41]:
from functools import partial

preprocess_train_data = partial(tokenize_text, tokenizer, True)  
preprocess_test_data = partial(tokenize_text, tokenizer, False)  

In [43]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.reshape(len(predictions))
    return {
        'pearson': np.corrcoef(predictions, labels)[0][1]
    }

In [44]:
def get_oof_preds(trainer, ds_val, df_val_fold):
    oof_outputs = trainer.predict(ds_val)
    oof_predictions = oof_outputs.predictions.reshape(-1)
    df_val_fold["val_preds"] = oof_predictions
    return df_val_fold

In [None]:
import gc
import custom_transformer_heads

df_val_preds = pd.DataFrame()
tok_vocab = tokenizer.get_vocab()
for fold in range(config.train_run.num_folds):
    fold_str = f"fold{fold}"
    print(f"Running training for {fold_str}")
    df_train_fold, df_val_fold, ds_train, ds_val = utils.get_fold_ds(fold, df_train, preprocess_train_data)
    config.training_args["output_dir"] = config.paths.out_dir + fold_str
    training_args = TrainingArguments(**config.training_args, report_to=None)
    model = custom_transformer_heads.DebertaV2ForSeqClfMeanPooling.from_pretrained(
        config.train_run.transformer_checkpoint, 
        num_labels=config.train_run.num_labels
    )
    print(f"len(tokenizer_vocab) = {len(tok_vocab)}")
    model.resize_token_embeddings(len(tok_vocab))    
    trainer = Trainer(
        model=model,                         # the instantiated Transformers model to be trained
        args=training_args,                  # training arguments, defined above
        train_dataset=ds_train,              # training dataset
        eval_dataset=ds_val,                 # evaluation dataset
        compute_metrics=compute_metrics,     # the callback that computes metrics of interest
        data_collator=data_collator,
        tokenizer=tokenizer
    )
    trainer.train()
    trainer.save_model(config.training_args["output_dir"])
    df_val_fold = get_oof_preds(trainer, ds_val, df_val_fold) 
    # display(df_val_fold.head())
    df_val_preds = pd.concat([df_val_preds, df_val_fold], axis=0)
    # export the oof predictions to csv for later use in stacking    
    df_val_fold.to_csv(config.paths.val_preds_path + f"df_train_oof_preds_{MODEL_NAME}_{fold_str}.csv")
    print(f"Saved OOF predictions for fold {fold}")    
    del model, trainer
    gc.collect()
    torch.cuda.empty_cache()
    utils.delete_checkpoints(config.training_args["output_dir"])
    print(f"deleted checkpoints as best model for {fold_str} saved already")
    # Empty the trash to clear gdrive disk space
    colab_utils.empty_gdrive_trash()
    if not config.train_run.run_all_folds:
        break

if config.train_run.run_all_folds:
    df_val_preds.to_csv(config.paths.val_preds_path + f"df_train_oof_preds_{MODEL_NAME}.csv")

Running training for fold0


  0%|          | 0/30 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

Downloading:   0%|          | 0.00/273M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-small were not used when initializing DebertaV2ForSeqClfMeanPooling: ['mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2ForSeqClfMeanPooling from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSeqClfMeanPooling from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequen

len(tokenizer_vocab) = 128009


Epoch,Training Loss,Validation Loss


In [None]:
# Calculate the CV score
predictions = df_val_preds['val_preds'].values
labels = df_val_preds['score'].values
eval_preds = predictions, labels
cv_metric_dict = compute_metrics(eval_preds)
print(f"CV score = {cv_metric_dict}")