In [1]:
!nvidia-smi

Sat Jul 30 06:13:24 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   43C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
%pip install -q transformers[sentencepiece] datasets hydra-core wandb iterative-stratification scikit-multilearn bitsandbytes-cuda112

[K     |████████████████████████████████| 4.7 MB 7.5 MB/s 
[K     |████████████████████████████████| 365 kB 72.1 MB/s 
[K     |████████████████████████████████| 151 kB 74.3 MB/s 
[K     |████████████████████████████████| 1.8 MB 58.1 MB/s 
[K     |████████████████████████████████| 89 kB 9.1 MB/s 
[K     |████████████████████████████████| 4.2 MB 60.4 MB/s 
[K     |████████████████████████████████| 212 kB 75.2 MB/s 
[K     |████████████████████████████████| 101 kB 14.1 MB/s 
[K     |████████████████████████████████| 141 kB 76.7 MB/s 
[K     |████████████████████████████████| 596 kB 67.1 MB/s 
[K     |████████████████████████████████| 127 kB 74.2 MB/s 
[K     |████████████████████████████████| 117 kB 70.9 MB/s 
[K     |████████████████████████████████| 79 kB 8.8 MB/s 
[K     |████████████████████████████████| 181 kB 74.2 MB/s 
[K     |████████████████████████████████| 156 kB 67.7 MB/s 
[K     |████████████████████████████████| 63 kB 2.1 MB/s 
[K     |██████████████████████

In [8]:
import gc
import sys
import os

def add_sys_paths(ml_utils_root):
    # os.walk will give a list of all sub directories with a given directory. It return a tuple of 3 elements with first element
    # being the subdir path
    for sub_dir, _, _ in os.walk(ml_utils_root):
        # check if there is a file in the sub directory
        # If yes, append the directory path to sys.path as we need the code in the file
        sub_dir_files = [file for file in os.listdir(sub_dir) if os.path.isfile(os.path.join(sub_dir, file))]    
        if len(sub_dir_files) > 0:
            print(sub_dir)
            sys.path.append(sub_dir)

In [9]:
COLAB_ROOT_PATH = "/content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/"
ml_utils_root = '/content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/src/ML_UTILS/'
add_sys_paths(ml_utils_root)
os.chdir(COLAB_ROOT_PATH)

/content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/src/ML_UTILS/
/content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/src/ML_UTILS/NLP/HuggingFace
/content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/src/ML_UTILS/NLP/HuggingFace/Models/Deberta
/content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/src/ML_UTILS/NLP/HuggingFace/Models/Deberta/__pycache__
/content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/src/ML_UTILS/NLP/HuggingFace/__pycache__
/content/gdrive/MyDrive/Kaggle/NLP/PatentPhraseMatching/src/ML_UTILS/__pycache__


In [10]:
# Load the config for training run
import hydra
from hydra import initialize, initialize_config_module, initialize_config_dir, compose
from omegaconf import DictConfig, OmegaConf

hydra.core.global_hydra.GlobalHydra.instance().clear()
initialize(version_base=None, config_path="configs")
config = compose(config_name="config")

In [11]:
import loss_functions
import colab_utils
import cv_split_utils
import helper
import hf_utils
from loss_functions import LossType

In [12]:
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import StratifiedKFold
from datasets import Dataset
from torch.utils.data import DataLoader
import transformers
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import torch.multiprocessing as mp
from transformers import logging
import warnings
import wandb

logging.set_verbosity_error()
logging.set_verbosity_warning()
warnings.filterwarnings('ignore')
os.environ["TOKENIZERS_PARALLELISM"] = "false"
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [13]:
# This is where we configure the run parameters
EXP_NUM = "03"
MODEL_NAME = "deberta-v3-small"
TRAIN_RUN_NAME = "mseloss_concatlastfourhead"
TRANSFORMER_CHECKPOINT = "microsoft/deberta-v3-small"
EXPERIMENT_NAME = EXP_NUM + "_" + MODEL_NAME + "_" + TRAIN_RUN_NAME
TRAIN_ON_SUBSET = True
LOSS_TYPE = LossType.MSE
ENABLE_WANDB = False
OUTPUT_HIDDEN_STATES = True

In [14]:
# update the config with run params
# update the output_dir to model specific directory
config.paths.out_dir += MODEL_NAME
config.train_run.transformer_checkpoint = TRANSFORMER_CHECKPOINT
config.train_run.num_workers = mp.cpu_count()
config.train_run.experiment_name = EXPERIMENT_NAME
config.train_run.train_on_subset = TRAIN_ON_SUBSET
config.train_run.loss_type = LOSS_TYPE
config.train_run.output_hidden_states = OUTPUT_HIDDEN_STATES
config.wandb.enabled = ENABLE_WANDB
if config.wandb.enabled:
    # enable reporting to wandb via huggingface training arguments
    config.training_args.report_to = "wandb"
    config.wandb.key = ""    
else:
    config.training_args.report_to = "none"

In [15]:
print(OmegaConf.to_yaml(config))

train_run:
  random_state: 42
  num_labels: 1
  label_col: score
  num_folds: 5
  run_all_folds: false
  num_epochs: 2
  num_workers: 2
  device: cuda
  subset_rows_frac: 0.05
  train_on_subset: true
  transformer_checkpoint: microsoft/deberta-v3-small
  experiment_name: 03_deberta-v3-small_mseloss_concatlastfourhead
  save_artifacts: true
  loss_type: mse
  output_hidden_states: true
wandb:
  key: None
  project: USPPPM
  enabled: false
training_args:
  output_dir: None
  evaluation_strategy: epoch
  save_strategy: epoch
  save_total_limit: 1
  load_best_model_at_end: true
  metric_for_best_model: pearson
  greater_is_better: true
  group_by_length: true
  report_to: none
  num_train_epochs: 2
  per_device_train_batch_size: 40
  per_device_eval_batch_size: 80
  weight_decay: 0.01
  learning_rate: 4.0e-05
  warmup_ratio: 0.1
  gradient_accumulation_steps: 3
  fp16: true
  lr_scheduler_type: linear
  adam_epsilon: 1.0e-06
paths:
  data_path: /content/gdrive/MyDrive/Kaggle/NLP/PatentPhra

In [16]:
colab_utils.empty_gdrive_trash()

The below files were cleared from trash
['colab_utils.cpython-37.pyc', 'loss_functions.cpython-37.pyc', 'helper.cpython-37.pyc', 'cv_split_utils.cpython-37.pyc', '__pycache__', 'utils.cpython-37.pyc', 'debertav2_custom_heads.cpython-37.pyc', 'custom_transformer_heads.cpython-37.pyc', 'colab_utils.cpython-37.pyc', '__pycache__', 'colab_utils.py']


In [17]:
def initialize_wandb(fold):    
    wandb.login(key=config.wandb.key)
    wandb.init(
        config=config,
        project=config.wandb.project,
        group=config.train_run.experiment_name,
        name=f"fold_{fold}"
    )        

In [18]:
df_train = pd.read_csv(config.paths.data_path + "train.csv")
df_train["section"] = df_train.context.str[0]
df_test = pd.read_csv(config.paths.data_path + "test.csv")
df_titles = pd.read_csv(config.paths.data_path + "titles.csv")

In [19]:
from sklearn.preprocessing import LabelEncoder

anchor_encoder = LabelEncoder()
df_train["anchor_map"] = anchor_encoder.fit_transform(df_train["anchor"])
df_train["context_map"] = anchor_encoder.fit_transform(df_train["context"])
df_train["anchor_context_map"] = df_train["anchor_map"].astype(str).str.cat(df_train["context_map"].astype(str), sep="_")
# Score is not really a continuous value here as there are just five distinct values. But since it is float it needs to be converted
# to categorical value before we can perform stratified split on score
df_train["score_map"] = df_train["score"].map({0.00: 0, 0.25: 1, 0.50: 2, 0.75: 3, 1.00: 4})
df_train["score_binary_map"] = df_train["score"].map({0.00: 0, 0.25: 0, 0.50: 1, 0.75: 1, 1.00: 1})

In [20]:
if config.train_run.train_on_subset:
    print(f"Selecting {config.train_run.subset_rows_frac * 100}% training data")
    df_train = df_train.sample(
        frac=config.train_run.subset_rows_frac, 
        random_state=config.train_run.random_state
    ).reset_index(drop=True)

# Since the target column (score) is continuous, we need to create bins out of the target column
# df_train.loc[:, "bins"] = pd.cut(df_train.score, bins=5, labels=[0,1,2,3,4])
# df_train = utils.strat_kfold_dataframe(df_train, target_col_name="bins", random_state=config.train_run.random_state, num_folds=Config.NUM_FOLDS)

# Now do a stratified group k fold on the bins column (which is a categorical column) and anchor as groups
df_train = cv_split_utils.strat_group_kfold_dataframe(
    df_train, 
    target_col_name="score_map", 
    group_col_name="anchor_context_map",
    random_state=config.train_run.random_state, 
    num_folds=config.train_run.num_folds
)            
# drop the bin column
# df_train = df_train.drop(["bins"], axis=1)
# df_train = df_train.drop(["anchor_map", "score_map"], axis=1)

Selecting 5.0% training data


In [21]:
# Let us check if the stratification has been done correctly
# The mean of score column should be similar across folds 
fold_score_mean = []
fold_anchor_context_maps = []
for fold in range(config.train_run.num_folds):
    df_train_fold = df_train[df_train.kfold == fold]
    fold_score_mean.append(np.mean(df_train_fold.score.values))
    fold_anchor_context_maps.append(set(df_train_fold.anchor_context_map.unique()))
fold_score_mean

[0.37714285714285717,
 0.37532981530343007,
 0.3704896907216495,
 0.3475274725274725,
 0.35422740524781343]

In [22]:
# check each of the folds has no common anchor value
def check_disjoint(start, fold_anchor_context_maps):
    for i in range(start, 4):
        for j in range(i+1, 5):
            if fold_anchor_context_maps[i].isdisjoint(fold_anchor_context_maps[j]):
                print(f"anchor context map for fold {i} and {j} are disjoint")

check_disjoint(0, fold_anchor_context_maps)                

anchor context map for fold 0 and 1 are disjoint
anchor context map for fold 0 and 2 are disjoint
anchor context map for fold 0 and 3 are disjoint
anchor context map for fold 0 and 4 are disjoint
anchor context map for fold 1 and 2 are disjoint
anchor context map for fold 1 and 3 are disjoint
anchor context map for fold 1 and 4 are disjoint
anchor context map for fold 2 and 3 are disjoint
anchor context map for fold 2 and 4 are disjoint
anchor context map for fold 3 and 4 are disjoint


In [23]:
df_titles.head()

Unnamed: 0,code,title,section,class,subclass,group,main_group
0,A,HUMAN NECESSITIES,A,,,,
1,A01,AGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTI...,A,1.0,,,
2,A01B,SOIL WORKING IN AGRICULTURE OR FORESTRY; PARTS...,A,1.0,B,,
3,A01B1/00,Hand tools (edge trimmers for lawns A01G3/06 ...,A,1.0,B,1.0,0.0
4,A01B1/02,Spades; Shovels {(hand-operated dredgers E02F3...,A,1.0,B,1.0,2.0


In [24]:
df_train = pd.merge(
    left = df_train,
    right = df_titles[["code", "title"]],
    how = "inner",
    left_on = "context",
    right_on = "code"
)

In [25]:
df_train.anchor_context_map.value_counts()

129_47    9
129_48    6
617_88    6
328_89    6
548_68    6
         ..
389_64    1
726_64    1
155_37    1
447_37    1
356_99    1
Name: anchor_context_map, Length: 996, dtype: int64

In [26]:
# For each anchor, context group (i.e. set of records having same anchor and context values), concatenate the target phrases
# key is unique anchor_context_map , value is concatenation of target phrases of all records for that unique anchor_context_map
anc_ctx_targets = {}
for anchor_context_map in df_train.anchor_context_map.unique():
    df_train_sub = df_train[df_train.anchor_context_map == anchor_context_map]
    anchor_context_target_text = ",".join(df_train_sub.target)    
    anc_ctx_targets[anchor_context_map] = anchor_context_target_text

df_train["anchor_context_targets"] = df_train.anchor_context_map.map(anc_ctx_targets)
df_train["anc_ctx_tgt_len"] = df_train["anchor_context_targets"].apply(lambda text: len(text.split()))
df_train = df_train.sort_values(by=["anc_ctx_tgt_len"], ascending=False)
# df_train = df_train.head(500)
# df_train = df_train[df_train.anchor_context_map == "555_89"]

In [27]:
from transformers import AutoTokenizer, DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained(config.train_run.transformer_checkpoint)
# DataCollatorWithPadding pads each batch to the longest sequence length
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Downloading tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [28]:
df_train['sectok'] = '[' + df_train.section + ']'
sectoks = list(df_train.sectok.unique())
print(f"Additional special tokens: {sectoks}")
tokenizer.add_special_tokens({'additional_special_tokens': sectoks})

Additional special tokens: ['[C]', '[E]', '[G]', '[H]', '[B]', '[A]', '[F]', '[D]']


8

In [29]:
sep = " " + tokenizer.sep_token + " "
df_train["inputs"] = df_train.sectok + sep + df_train.anchor + sep + df_train.target + sep + df_train.title + sep + df_train.anchor_context_targets
df_train["inputs"] = df_train["inputs"].apply(lambda x: x.lower())
df_train.head()

Unnamed: 0,id,anchor,target,context,score,section,anchor_map,context_map,anchor_context_map,score_map,score_binary_map,kfold,code,title,anchor_context_targets,anc_ctx_tgt_len,sectok,inputs
988,04e7b8dcdc84fba0,component composite coating,polyurethane composite coating,C08,0.5,C,129,47,129_47,2,1,3,C08,ORGANIC MACROMOLECULAR COMPOUNDS; THEIR PREPAR...,"phase polymer,coat component aqueous binder,la...",17,[C],[c] [sep] component composite coating [sep] po...
986,a362b1a0c71e9480,component composite coating,component coating composition,C08,0.5,C,129,47,129_47,2,1,3,C08,ORGANIC MACROMOLECULAR COMPOUNDS; THEIR PREPAR...,"phase polymer,coat component aqueous binder,la...",17,[C],[c] [sep] component composite coating [sep] co...
981,762e7c79104fcdc5,component composite coating,protective coatings,C08,0.25,C,129,47,129_47,1,0,3,C08,ORGANIC MACROMOLECULAR COMPOUNDS; THEIR PREPAR...,"phase polymer,coat component aqueous binder,la...",17,[C],[c] [sep] component composite coating [sep] pr...
963,7a41a29fda0bbd4a,component composite coating,coat component aqueous binder,C08,0.5,C,129,47,129_47,2,1,3,C08,ORGANIC MACROMOLECULAR COMPOUNDS; THEIR PREPAR...,"phase polymer,coat component aqueous binder,la...",17,[C],[c] [sep] component composite coating [sep] co...
960,53ce3c619abe31e8,component composite coating,phase polymer,C08,0.5,C,129,47,129_47,2,1,3,C08,ORGANIC MACROMOLECULAR COMPOUNDS; THEIR PREPAR...,"phase polymer,coat component aqueous binder,la...",17,[C],[c] [sep] component composite coating [sep] ph...


In [30]:
def tokenize_text(tokenizer, with_labels, row):
    encoding = tokenizer(
        text = row["inputs"],
        padding = False,
        truncation = True,
        # maximum possible sequence length (for inputs column). Sequences exceeding this length will be truncated
        max_length = 512
    )
    if with_labels:
        encoding["labels"] = row[config.train_run.label_col]
    return encoding

In [31]:
from functools import partial

preprocess_train_data = partial(tokenize_text, tokenizer, True)  
preprocess_test_data = partial(tokenize_text, tokenizer, False)  

In [32]:
def compute_metrics(eval_pred):    
    predictions, labels = eval_pred
    # if the model has been configured to output intermediate layer hidden states, the model output will be a tuple 
    # consist of two terms, first last layer hidden state (which is what we need for computing metrics) and second
    # each of the intermediate layer hidden states ( a tuple (of length = num_layers) of tensors [batch_size, seq_len, hidden_size]) )
    if config.train_run.output_hidden_states:
        predictions = predictions[0]
    predictions = predictions.reshape(len(predictions))
    return {
        'pearson': np.corrcoef(predictions, labels)[0][1]
    }

In [33]:
def get_oof_preds(trainer, ds_val, df_val_fold):
    oof_outputs = trainer.predict(ds_val)
    if config.train_run.output_hidden_states:
        predictions = oof_outputs.predictions[0]
    else:
        predictions = oof_outputs.predictions
    oof_predictions = predictions.reshape(-1)
    df_val_fold["val_preds"] = oof_predictions
    return df_val_fold

In [34]:
def clean_up(fold_str):
    gc.collect()
    torch.cuda.empty_cache()
    helper.delete_checkpoints(config.training_args["output_dir"])
    print(f"deleted checkpoints as best model for {fold_str} saved already")
    # Empty the trash to clear gdrive disk space
    colab_utils.empty_gdrive_trash()

In [37]:
from transformers import AutoConfig
from debertav2_seqclf_attention import DebertaV2ForSeqClfAttention
from debertav2_seqclf_concatlastfour import DebertaV2ForSeqClfConcatLastFour
import time

df_val_preds = pd.DataFrame()
tok_vocab = tokenizer.get_vocab()
exp_start_time = time.time()
for fold in range(config.train_run.num_folds):
    fold_str = f"fold{fold}"    
    if config.wandb.enabled:
        initialize_wandb(fold)
    df_train_fold, df_val_fold, ds_train, ds_val = hf_utils.get_fold_ds(fold, df_train, preprocess_train_data)
    config.training_args["output_dir"] = config.paths.out_dir + fold_str
    training_args = TrainingArguments(**config.training_args) 
    model_config = AutoConfig.from_pretrained(config.train_run.transformer_checkpoint)
    model_config.output_hidden_states = config.train_run.output_hidden_states
    model_config.num_labels = config.train_run.num_labels
    model = DebertaV2ForSeqClfConcatLastFour.from_pretrained(
        config.train_run.transformer_checkpoint,
        config = model_config, 
        loss_type=config.train_run.loss_type
    )
    print(f"len(tokenizer_vocab) = {len(tok_vocab)}")
    model.resize_token_embeddings(len(tok_vocab))    
    trainer = Trainer(
        model=model,                         # the instantiated Transformers model to be trained
        args=training_args,                  # training arguments, defined above
        train_dataset=ds_train,              # training dataset
        eval_dataset=ds_val,                 # evaluation dataset
        compute_metrics=compute_metrics,     # the callback that computes metrics of interest
        data_collator=data_collator,
        tokenizer=tokenizer
    )
    print(f"Running training for {fold_str} using loss_type={config.train_run.loss_type}")
    trainer.train()
    trainer.save_model(config.training_args["output_dir"])
    df_val_fold = get_oof_preds(trainer, ds_val, df_val_fold) 
    # display(df_val_fold.head())
    df_val_preds = pd.concat([df_val_preds, df_val_fold], axis=0)
    # export the oof predictions to csv for later use in stacking    
    df_val_fold.to_csv(config.paths.val_preds_path + f"df_train_oof_preds_{MODEL_NAME}_{fold_str}.csv")
    print(f"Saved OOF predictions for fold {fold}")    
    del model, trainer
    clean_up()
    if not config.train_run.run_all_folds:
        break

exp_end_time = time.time()        

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Some weights of the model checkpoint at microsoft/deberta-v3-small were not used when initializing DebertaV2ForSeqClfConcatLastFour: ['mask_predictions.classifier.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2ForSeqClfConcatLastFour from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSeqClfConcatLastFour from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a Bert

config.output_hidden_states = True
len(tokenizer_vocab) = 128009
Running training for fold0 using loss_type=mse


Epoch,Training Loss,Validation Loss,Pearson
0,No log,0.125484,0.136944
1,No log,0.071834,0.253321


Saved OOF predictions for fold 0
deleted checkpoints as best model for fold0 saved already
The below files were cleared from trash
['rng_state.pth', 'trainer_state.json', 'scaler.pt', 'scheduler.pt', 'optimizer.pt', 'training_args.bin', 'tokenizer.json', 'spm.model', 'added_tokens.json', 'special_tokens_map.json', 'tokenizer_config.json', 'pytorch_model.bin', 'config.json', 'checkpoint-24']


In [40]:
# Calculate the CV score
def calculate_cv_score(df_oof):
    predictions = df_oof['val_preds'].values
    labels = df_oof['score'].values
    eval_preds = predictions, labels
    return np.corrcoef(predictions, labels)[0][1]

In [41]:
import json

# save config file, log file, oof file etc. related to training run
exp_run_time = helper.asHours(exp_end_time - exp_start_time)
if config.train_run.run_all_folds:
        df_val_preds.to_csv(config.paths.val_preds_path + f"df_train_oof_preds_{MODEL_NAME}.csv")

cv_score = calculate_cv_score(df_val_preds)
print(f"cv_score = {cv_score}")
run_summary_dict = {
    "experiment": config.train_run.experiment_name,
    "cv": cv_score,
    "experiment_time": exp_run_time,
    "wandb_run": wandb.run.get_url() if config.wandb.enabled else None,
}
print(run_summary_dict)
if config.train_run.save_artifacts:
    run_summary_file = config.paths.out_dir + "/run_summary.json"
    with open(run_summary_file, "w") as f:
        json.dump(run_summary_dict, f, indent=4)

    # save run config yaml file (hydra config)
    exp_config_file = config.paths.out_dir + "/exp_config.yaml"
    with open(exp_config_file, "w") as fp:
        OmegaConf.save(config, fp)

    # upload artifacts and log final metrics to wandb
    if config.wandb.enabled:
        # Log config, cv score, run time to wandb
        wandb.log({"cv_score": cv_score, "exp_run_time": exp_run_time})    
        wandb.finish()
        #wandb.save(exp_config_file)        
        #wandb.save(run_summary_file)


cv_score = 0.2533210301284113
{'experiment': '03_deberta-v3-small_mseloss_concatlastfourhead', 'cv': 0.2533210301284113, 'experiment_time': '0h:1m:13s', 'wandb_run': None}
