In [1]:
!nvidia-smi

Wed Jun 22 06:26:18 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
%pip install -q transformers[sentencepiece] datasets

[K     |████████████████████████████████| 4.4 MB 5.2 MB/s 
[K     |████████████████████████████████| 362 kB 67.8 MB/s 
[K     |████████████████████████████████| 140 kB 71.3 MB/s 
[K     |████████████████████████████████| 101 kB 11.7 MB/s 
[K     |████████████████████████████████| 212 kB 63.3 MB/s 
[K     |████████████████████████████████| 1.1 MB 66.6 MB/s 
[K     |████████████████████████████████| 596 kB 56.7 MB/s 
[K     |████████████████████████████████| 127 kB 67.9 MB/s 
[K     |████████████████████████████████| 94 kB 3.2 MB/s 
[K     |████████████████████████████████| 271 kB 61.8 MB/s 
[K     |████████████████████████████████| 144 kB 73.3 MB/s 
[K     |████████████████████████████████| 6.6 MB 43.9 MB/s 
[K     |████████████████████████████████| 1.2 MB 53.2 MB/s 
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datascience 0.10.6 requires fo

### In this notebook we experiment by finetuning a bert-for-patents model on competition data by adding patent section as special token to the tokenizer vocab.

In [34]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
my_drive = GoogleDrive(gauth)

In [35]:
import numpy as np
import pandas as pd
import torch
import os
from sklearn.model_selection import StratifiedKFold
from datasets import Dataset
from torch.utils.data import DataLoader
import transformers
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import torch.multiprocessing as mp
from transformers import logging
import warnings

logging.set_verbosity_error()
logging.set_verbosity_warning()
warnings.filterwarnings('ignore')


In [36]:
class TrainingArgs:
    weight_decay = 0.01
    learning_rate = 3e-5
    warmup_ratio = 0.1
    gradient_accumulation_steps = 4
    fp16 = True
    lr_scheduler_type = "linear"
    # Number of checkpoints to save for each model
    save_total_limit = 1
    #  Whether or not to load the best model found during training at the end of training.
    load_best_model_at_end=True
    # Use in conjunction with `load_best_model_at_end` to specify the metric to use to compare two different
    # models. Must be the name of a metric returned by the evaluation with or without the prefix `"eval_"`. Will
    # default to `"loss"` if unspecified and `load_best_model_at_end=True` (to use the evaluation loss).
    # If you set this value, `greater_is_better` will default to `True`. Don't forget to set it to `False` if
    # your metric is better when lower.
    metric_for_best_model="pearson"
    adam_epsilon=1e-6
    #warmup_steps=1000
    log_level="warning"

class Config:
    MODEL_NAME = "deberta-v3-base"
    DATA_PATH = "/content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/data/"
    VAL_PREDS_PATH = "/content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/preds/"
    # location where trained model weights are saved
    OUT_DIR = "/content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/model/deberta-v3-base/"
    RUNTIME = "COLAB"
    RANDOM_STATE = 42
    BATCH_SIZE = 16
    EVAL_BATCH_SIZE = 32
    NUM_LABELS = 1
    NUM_FOLDS = 5
    RUN_ALL_FOLDS = False
    NUM_EPOCHS = 1
    NUM_WORKERS = mp.cpu_count()
    TRANSFORMER_CHECKPOINT = "microsoft/deberta-v3-base"
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    SUBSET_ROWS_FRAC = 0.05
    TRAIN_ON_SUBSET = False
    RANDOM_SEED = 42

os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [37]:
def empty_gdrive_trash():
    deleted_file_name = []
    for a_file in my_drive.ListFile({'q': "trashed = true"}).GetList():
        file_name = a_file['title']
        deleted_file_name.append(file_name)
        # delete the file permanently.
        a_file.Delete()
    print("The below files were cleared from trash")
    print(deleted_file_name)

In [38]:
#empty_gdrive_trash()

In [39]:
df_train = pd.read_csv(Config.DATA_PATH + "train.csv")
df_test = pd.read_csv(Config.DATA_PATH + "test.csv")
df_titles = pd.read_csv(Config.DATA_PATH + "titles.csv")

In [40]:
df_train["section"] = df_train.context.str[0]

In [41]:
from sklearn.preprocessing import LabelEncoder

anchor_encoder = LabelEncoder()
df_train["anchor_map"] = anchor_encoder.fit_transform(df_train["anchor"])
df_train["context_map"] = anchor_encoder.fit_transform(df_train["context"])
# we want to do a stratified group k fold on anchor and context. For this we create a new column to join
# the label encoded values of anchor and context
df_train["anchor_context_map"] = df_train["anchor_map"].astype(str).str.cat(df_train["context_map"].astype(str), sep="_")
# Score is not really a continuous value here as there are just five distinct values. But since it is float it needs to be converted
# to categorical value before we can perform stratified split on score
df_train["score_map"] = df_train["score"].map({0.00: 0, 0.25: 1, 0.50: 2, 0.75: 3, 1.00: 4})

In [42]:
from sklearn import model_selection

def strat_group_kfold_dataframe(df, target_col_name, group_col_name, num_folds=Config.NUM_FOLDS):
    # we create a new column called kfold and fill it with -1
    df["kfold"] = -1
    # randomize of shuffle the rows of dataframe before splitting is done
    df = df.sample(frac=1, random_state=Config.RANDOM_SEED).reset_index(drop=True)
    # get the target data
    y = df[target_col_name].values    
    groups = df[group_col_name].values
    # stratify data using anchor as group and score as target
    skf = model_selection.StratifiedGroupKFold(n_splits=num_folds, shuffle=True, random_state=Config.RANDOM_SEED)
    for fold, (train_index, val_index) in enumerate(skf.split(X=df, y=y, groups=groups)):
        df.loc[val_index, "kfold"] = fold        
    return df     

In [43]:
def strat_kfold_dataframe(df, target_col_name, num_folds=5):
    # we create a new column called kfold and fill it with -1
    df["kfold"] = -1
    # randomize of shuffle the rows of dataframe before splitting is done
    df.sample(frac=1, random_state=Config.RANDOM_STATE).reset_index(drop=True)
    y = df[target_col_name].values
    skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=Config.RANDOM_STATE)
    # stratification is done on the basis of y labels, a placeholder for X is sufficient
    for fold, (train_idx, val_idx) in enumerate(skf.split(X=df, y=y)):
        df.loc[val_idx, "kfold"] = fold
    return df

In [44]:
if Config.TRAIN_ON_SUBSET:
    print(f"Selecting {Config.SUBSET_ROWS_FRAC * 100}% training data")
    df_train = df_train.sample(frac=Config.SUBSET_ROWS_FRAC, random_state=Config.RANDOM_SEED).reset_index(drop=True)

# Since the target column (score) is continuous, we need to create bins out of the target column
# df_train.loc[:, "bins"] = pd.cut(df_train.score, bins=5, labels=[0,1,2,3,4])
# df_train = strat_kfold_dataframe(df_train, target_col_name="bins", num_folds=Config.NUM_FOLDS)

# Now do a stratified group k fold on the bins column (which is a categorical column) and anchor and context as groups
df_train = strat_group_kfold_dataframe(df_train, target_col_name="score_map", group_col_name="anchor_context_map", num_folds=Config.NUM_FOLDS)            
# drop the bin column
# df_train = df_train.drop(["bins"], axis=1)
# df_train = df_train.drop(["anchor_map", "score_map"], axis=1)

In [45]:
# Let us check if the stratification has been done correctly
# The mean of score column should be similar across folds 
fold_score_mean = []
fold_anchor_context_maps = []
for fold in range(Config.NUM_FOLDS):
    df_train_fold = df_train[df_train.kfold == fold]
    fold_score_mean.append(np.mean(df_train_fold.score.values))
    fold_anchor_context_maps.append(set(df_train_fold.anchor_context_map.unique()))
fold_score_mean

[0.3557356434260165,
 0.35813229056203605,
 0.36268028846153844,
 0.36315899290582837,
 0.37085976039464413]

In [46]:
# We have create StratifiedGroupKFolds on anchor and context, so the different folds will not have common values of the
# grouping columns. This is done if we do not want the same group to be present both in train and validation splits to prevent
# overfitting. 
# check each of the folds has no common anchor value
def check_disjoint(start, fold_anchor_context_maps):
    for i in range(start, 4):
        for j in range(i+1, 5):
            if fold_anchor_context_maps[i].isdisjoint(fold_anchor_context_maps[j]):
                print(f"anchor context map for fold {i} and {j} are disjoint")

check_disjoint(0, fold_anchor_context_maps)                

anchor context map for fold 0 and 1 are disjoint
anchor context map for fold 0 and 2 are disjoint
anchor context map for fold 0 and 3 are disjoint
anchor context map for fold 0 and 4 are disjoint
anchor context map for fold 1 and 2 are disjoint
anchor context map for fold 1 and 3 are disjoint
anchor context map for fold 1 and 4 are disjoint
anchor context map for fold 2 and 3 are disjoint
anchor context map for fold 2 and 4 are disjoint
anchor context map for fold 3 and 4 are disjoint


In [47]:
# For each anchor, context group (i.e. set of records having same anchor and context values), concatenate the target phrases
# key is unique anchor_context_map , value is concatenation of target phrases of all records for that unique anchor_context_map
anc_ctx_targets = {}
for anchor_context_map in df_train.anchor_context_map.unique():
    df_train_sub = df_train[df_train.anchor_context_map == anchor_context_map]
    anchor_context_target_text = ",".join(df_train_sub.target)    
    anc_ctx_targets[anchor_context_map] = anchor_context_target_text

df_train["anchor_context_targets"] = df_train.anchor_context_map.map(anc_ctx_targets)
df_train["anc_ctx_tgt_len"] = df_train["anchor_context_targets"].apply(lambda text: len(text.split()))
df_train = df_train.sort_values(by=["anc_ctx_tgt_len"], ascending=False)
# df_train = df_train[df_train.anchor_context_map == "555_89"]

In [48]:
df_titles.head()

Unnamed: 0,code,title,section,class,subclass,group,main_group
0,A,HUMAN NECESSITIES,A,,,,
1,A01,AGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTI...,A,1.0,,,
2,A01B,SOIL WORKING IN AGRICULTURE OR FORESTRY; PARTS...,A,1.0,B,,
3,A01B1/00,Hand tools (edge trimmers for lawns A01G3/06 ...,A,1.0,B,1.0,0.0
4,A01B1/02,Spades; Shovels {(hand-operated dredgers E02F3...,A,1.0,B,1.0,2.0


In [49]:
df_train = pd.merge(
    left = df_train,
    right = df_titles[["code", "title"]],
    how = "inner",
    left_on = "context",
    right_on = "code"
)

In [50]:
# df_train = df_train[(df_train.anchor_context_map == "555_89")]

In [51]:
df_train.head()

Unnamed: 0,id,anchor,target,context,score,section,anchor_map,context_map,anchor_context_map,score_map,kfold,anchor_context_targets,anc_ctx_tgt_len,code,title
0,b914e293003ae773,reflection type liquid crystal display,reflection matrix type crystal display,G02,0.5,G,555,89,555_89,2,4,"liquid matrix type crystal,lcd displays reflec...",213,G02,OPTICS
1,b0e707f934a27619,reflection type liquid crystal display,reflection type crystal display,G02,0.5,G,555,89,555_89,2,4,"liquid matrix type crystal,lcd displays reflec...",213,G02,OPTICS
2,419d60028f275a2c,reflection type liquid crystal display,liquid type crystal display,G02,0.5,G,555,89,555_89,2,4,"liquid matrix type crystal,lcd displays reflec...",213,G02,OPTICS
3,b30ea39bcb4f502d,reflection type liquid crystal display,reflective mode liquid crystal display,G02,0.75,G,555,89,555_89,3,4,"liquid matrix type crystal,lcd displays reflec...",213,G02,OPTICS
4,030854fc7b18b42c,reflection type liquid crystal display,reflection liquid display,G02,0.5,G,555,89,555_89,2,4,"liquid matrix type crystal,lcd displays reflec...",213,G02,OPTICS


In [52]:
# df_train.loc[0, "anchor_context_targets"]

In [53]:
from transformers import AutoTokenizer, DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained(Config.TRANSFORMER_CHECKPOINT)
# DataCollatorWithPadding pads each batch to the longest sequence length
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# model = AutoModelForSequenceClassification.from_pretrained(Config.TRANSFORMER_CHECKPOINT, num_labels=Config.NUM_LABELS)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [54]:
df_train['sectok'] = '[' + df_train.section + ']'
sectoks = list(df_train.sectok.unique())
# sep = '[s]'
# sectoks.append(sep)
print(f"Additional special tokens: {sectoks}")
tokenizer.add_special_tokens({'additional_special_tokens': sectoks})

Additional special tokens: ['[G]', '[C]', '[H]', '[B]', '[A]', '[E]', '[F]', '[D]']


8

In [55]:
sep = " " + tokenizer.sep_token + " "
df_train["inputs"] = df_train.sectok + sep + df_train.anchor + sep + df_train.target + sep + df_train.title + sep + df_train.anchor_context_targets
df_train["inputs"] = df_train["inputs"].apply(lambda x: x.lower())
df_train.head()

Unnamed: 0,id,anchor,target,context,score,section,anchor_map,context_map,anchor_context_map,score_map,kfold,anchor_context_targets,anc_ctx_tgt_len,code,title,sectok,inputs
0,b914e293003ae773,reflection type liquid crystal display,reflection matrix type crystal display,G02,0.5,G,555,89,555_89,2,4,"liquid matrix type crystal,lcd displays reflec...",213,G02,OPTICS,[G],[g] [sep] reflection type liquid crystal displ...
1,b0e707f934a27619,reflection type liquid crystal display,reflection type crystal display,G02,0.5,G,555,89,555_89,2,4,"liquid matrix type crystal,lcd displays reflec...",213,G02,OPTICS,[G],[g] [sep] reflection type liquid crystal displ...
2,419d60028f275a2c,reflection type liquid crystal display,liquid type crystal display,G02,0.5,G,555,89,555_89,2,4,"liquid matrix type crystal,lcd displays reflec...",213,G02,OPTICS,[G],[g] [sep] reflection type liquid crystal displ...
3,b30ea39bcb4f502d,reflection type liquid crystal display,reflective mode liquid crystal display,G02,0.75,G,555,89,555_89,3,4,"liquid matrix type crystal,lcd displays reflec...",213,G02,OPTICS,[G],[g] [sep] reflection type liquid crystal displ...
4,030854fc7b18b42c,reflection type liquid crystal display,reflection liquid display,G02,0.5,G,555,89,555_89,2,4,"liquid matrix type crystal,lcd displays reflec...",213,G02,OPTICS,[G],[g] [sep] reflection type liquid crystal displ...


In [56]:
# sep = tokenizer.sep_token
# df_train["inputs"] = df_train.sectok + sep + df_train.anchor + sep + df_train.target + sep + df_train.title 
# df_train.head()

In [57]:
def tokenize_text(tokenizer, with_labels, row):
    encoding = tokenizer(
        text = row["inputs"],
        padding = False,
        truncation = True
    )
    if with_labels:
        encoding["labels"] = row["score"]
    return encoding

In [58]:
from functools import partial

preprocess_train_data = partial(tokenize_text, tokenizer, True)  
preprocess_test_data = partial(tokenize_text, tokenizer, False)  

In [59]:
def get_fold_dls(fold, df):
    train_df = df[df.kfold != fold].reset_index(drop=True)
    valid_df = df[df.kfold == fold].reset_index(drop=True)
    ds_train_raw = Dataset.from_pandas(train_df)
    ds_valid_raw = Dataset.from_pandas(valid_df)
    raw_ds_col_names = ds_train_raw.column_names    
    ds_train = ds_train_raw.map(preprocess_train_data, batched=True, batch_size=1000, remove_columns=raw_ds_col_names)
    ds_valid = ds_valid_raw.map(preprocess_train_data, batched=True, batch_size=1000, remove_columns=raw_ds_col_names)    
    return train_df, valid_df, ds_train, ds_valid

In [60]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.reshape(len(predictions))
    return {
        'pearson': np.corrcoef(predictions, labels)[0][1]
    }

In [61]:
def get_oof_preds(trainer, ds_val, df_val_fold):
    oof_outputs = trainer.predict(ds_val)
    oof_predictions = oof_outputs.predictions.reshape(-1)
    df_val_fold["val_preds"] = oof_predictions
    return df_val_fold

In [62]:
def get_training_args(fold_str):
    training_args = TrainingArguments(
        output_dir=Config.OUT_DIR + fold_str,
        evaluation_strategy="epoch",
        save_strategy='epoch',        
        num_train_epochs=Config.NUM_EPOCHS,
        per_device_train_batch_size=Config.BATCH_SIZE,
        per_device_eval_batch_size=Config.EVAL_BATCH_SIZE,
        warmup_ratio=TrainingArgs.warmup_ratio,
        weight_decay=TrainingArgs.weight_decay,
        learning_rate=TrainingArgs.learning_rate,    
        gradient_accumulation_steps=TrainingArgs.gradient_accumulation_steps,
        fp16=TrainingArgs.fp16,
        lr_scheduler_type=TrainingArgs.lr_scheduler_type,
        save_total_limit=TrainingArgs.save_total_limit,
        load_best_model_at_end=TrainingArgs.load_best_model_at_end,
        metric_for_best_model=TrainingArgs.metric_for_best_model,
        adam_epsilon=TrainingArgs.adam_epsilon,
        #warmup_steps=TrainingArgs.warmup_steps,
        log_level=TrainingArgs.log_level
    )
    return training_args

In [63]:
import gc

df_val_preds = pd.DataFrame()
tok_vocab = tokenizer.get_vocab()
for fold in range(Config.NUM_FOLDS):
    fold_str = f"fold{fold}"
    print(f"Running training for {fold_str}")
    df_train_fold, df_val_fold, ds_train, ds_val = get_fold_dls(fold, df_train)
    training_args = get_training_args(fold_str)
    model = AutoModelForSequenceClassification.from_pretrained(Config.TRANSFORMER_CHECKPOINT, num_labels=Config.NUM_LABELS)
    print(f"len(tokenizer_vocab) = {len(tok_vocab)}")
    model.resize_token_embeddings(len(tok_vocab))    
    trainer = Trainer(
        model=model,                         # the instantiated Transformers model to be trained
        args=training_args,                  # training arguments, defined above
        train_dataset=ds_train,              # training dataset
        eval_dataset=ds_val,                 # evaluation dataset
        compute_metrics=compute_metrics,     # the callback that computes metrics of interest
        data_collator=data_collator,
        tokenizer=tokenizer
    )
    trainer.train()
    trainer.save_model(Config.OUT_DIR + fold_str)
    df_val_fold = get_oof_preds(trainer, ds_val, df_val_fold) 
    display(df_val_fold.head())
    df_val_preds = pd.concat([df_val_preds, df_val_fold], axis=0)
    # export the oof predictions to csv for later use in stacking
    if Config.RUNTIME != "KAGGLE":
        df_val_fold.to_csv(Config.VAL_PREDS_PATH + f"df_train_oof_preds_{Config.MODEL_NAME}_{fold_str}.csv")
    else:
        df_val_preds.to_csv("/kaggle/working/df_train_oof_preds.csv")
    print(f"Saved OOF predictions for fold {fold}")    
    del model, trainer
    gc.collect()
    torch.cuda.empty_cache()
    # Empty the trash to clear gdrive disk space
    empty_gdrive_trash()
    if not Config.RUN_ALL_FOLDS:
        break

if Config.RUN_ALL_FOLDS:
    df_val_preds.to_csv(Config.VAL_PREDS_PATH + f"df_train_oof_preds_{Config.MODEL_NAME}.csv")

Running training for fold0


  0%|          | 0/30 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


  0%|          | 0/8 [00:00<?, ?ba/s]

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a

len(tokenizer_vocab) = 128009


Epoch,Training Loss,Validation Loss,Pearson
0,No log,0.025726,0.805223


Unnamed: 0,id,anchor,target,context,score,section,anchor_map,context_map,anchor_context_map,score_map,kfold,anchor_context_targets,anc_ctx_tgt_len,code,title,sectok,inputs,val_preds
0,23113e2a55baefee,lamination method,tape,G02,0.5,G,367,89,367_89,2,0,"lamination process,treatment method,laminating...",41,G02,OPTICS,[G],[g] [sep] lamination method [sep] tape [sep] o...,0.327393
1,07a4affd80ea727b,lamination method,printing process,G02,0.5,G,367,89,367_89,2,0,"lamination process,treatment method,laminating...",41,G02,OPTICS,[G],[g] [sep] lamination method [sep] printing pro...,0.472656
2,f74dabf1e37560fd,lamination method,laminator,G02,0.5,G,367,89,367_89,2,0,"lamination process,treatment method,laminating...",41,G02,OPTICS,[G],[g] [sep] lamination method [sep] laminator [s...,0.45166
3,99556e130ed67261,lamination method,adhesive member,G02,0.25,G,367,89,367_89,1,0,"lamination process,treatment method,laminating...",41,G02,OPTICS,[G],[g] [sep] lamination method [sep] adhesive mem...,0.461182
4,c6a85528bad51162,lamination method,printing,G02,0.5,G,367,89,367_89,2,0,"lamination process,treatment method,laminating...",41,G02,OPTICS,[G],[g] [sep] lamination method [sep] printing [se...,0.348389


Saved OOF predictions for fold 0
The below files were cleared from trash
['scaler.pt', 'scheduler.pt', 'optimizer.pt', 'training_args.bin', 'tokenizer.json', 'spm.model', 'added_tokens.json', 'special_tokens_map.json', 'tokenizer_config.json', 'pytorch_model.bin', 'config.json', 'checkpoint-1']


In [64]:
# Calculate the CV score
predictions = df_val_preds['val_preds'].values
labels = df_val_preds['score'].values
eval_preds = predictions, labels
cv_metric_dict = compute_metrics(eval_preds)
print(f"CV score = {cv_metric_dict}")

CV score = {'pearson': 0.8052227756644419}


In [65]:
# df_val_preds["score_pred_diff"] = df_val_preds.apply(lambda row: abs(row["val_preds"] - row["score"]), axis=1)
# df_val_preds = df_val_preds.sort_values(by=["score_pred_diff"], ascending=False)
# df_val_preds_diff = df_val_preds[df_val_preds.score_pred_diff > 0.2]
# len(df_val_preds_diff)

In [66]:
# df_val_preds_diff[["anchor", "target", "context", "score", "val_preds", "score_pred_diff"]]