In [1]:
import os
# Value of KAGGLE_KERNEL_RUN_TYPE, or '' when the variable is not set; used as a
# truthy "running inside a Kaggle kernel" flag by the cells below.
iskaggle = os.getenv('KAGGLE_KERNEL_RUN_TYPE', '')

In [2]:
# Uncomment the lines below if packages from the internet are required for inference.
# if iskaggle:
#    !pip install -q datasets
#    pip download datasets --dest frozen_packages --prefer-binary

In [3]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split
import os
from sklearn.model_selection import KFold, StratifiedKFold
import shutil
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoConfig, AutoTokenizer, get_linear_schedule_with_warmup, TrainingArguments, Trainer, AutoModelForSequenceClassification
from transformers import BertModel
from tqdm import tqdm
import warnings, transformers, logging, torch

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Quiet the run: set the WANDB_DISABLED flag, ignore Python warnings and
# suppress every log record at WARNING level or below.
os.environ.update({"WANDB_DISABLED": "true"})
warnings.simplefilter(action='ignore')
logging.disable(level=logging.WARNING)

In [5]:
class CFG:
    """Experiment configuration read by the data, tokenization and training cells.

    All values are plain class attributes, so they are accessed as ``CFG.<name>``
    without instantiating the class.
    """

    # Directory holding train.csv. Inside a Kaggle kernel the dataset mount is
    # used; elsewhere the directory can be overridden through the
    # USPPPM_DATA_DIR environment variable instead of editing this cell (the
    # previous hard-coded location remains the default).
    if iskaggle:
        input_path = '../input/us-patent-phrase-to-phrase-matching'
    else:
        input_path = os.environ.get(
            'USPPPM_DATA_DIR',
            '/home/bhavik/projects/kaggle-patent-phrase-matching/data',
        )

    # Checkpoint identifier handed to the tokenizer and model `from_pretrained` calls.
    model_path = 'anferico/bert-for-patents'

    # Optimiser settings forwarded to TrainingArguments.
    learning_rate = 1e-5
    weight_decay = 0.01

    # Number of cross-validation folds and training epochs per fold.
    num_fold = 5
    epochs = 5

    # Per-device training batch size (evaluation uses twice this value).
    batch_size = 16

    # Initial value only: a later cell overwrites it with the length measured
    # from the tokenized training inputs.
    max_len = 133

In [6]:
# Read the training pairs, then the titles table (separate dataset on Kaggle,
# same directory as train.csv elsewhere), and join them on context == code.
# The merge uses pandas' default inner join.
train_df = pd.read_csv(f"{CFG.input_path}/train.csv")
titles = pd.read_csv(
    "../input/us-patents-category-titles/titles.csv"
    if iskaggle
    else f"{CFG.input_path}/titles.csv"
)
train_df = pd.merge(train_df, titles, left_on='context', right_on='code')

# https://www.kaggle.com/code/abhishek/phrase-matching-folds
def create_folds(data, num_splits, num_bins=5, seed=42):
    """Assign a stratified cross-validation fold id to every row.

    The continuous ``score`` column is cut into ``num_bins`` equal-width bins and
    the bin ids (not the raw scores) are used as the stratification target for
    ``StratifiedKFold``.

    Args:
        data: DataFrame containing a numeric ``score`` column. The caller's
            frame is left untouched; the function works on a copy.
        num_splits: number of folds to create.
        num_bins: number of equal-width score bins used for stratification.
        seed: random state of the shuffled splitter.

    Returns:
        A copy of ``data`` with an additional integer ``fold`` column whose
        values lie in ``[0, num_splits)``. The original index is preserved.
    """
    # Work on a copy so neither "fold" nor any helper data leaks into the input.
    data = data.copy()
    data["fold"] = -1

    # Bin the targets; kept as a local Series rather than a temporary column.
    bins = pd.cut(data["score"], bins=num_bins, labels=False)

    kf = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=seed)

    # split() yields *positional* row indices, so they must be written with
    # .iloc; label-based .loc is only correct for a default RangeIndex.
    fold_col = data.columns.get_loc("fold")
    for f, (_, valid_pos) in enumerate(kf.split(X=data, y=bins.values)):
        data.iloc[valid_pos, fold_col] = f

    return data

In [7]:
def prep_input_tokens(df):
    """Build the model input text for every row of ``df``.

    Each row becomes ``anchor + '[SEP]' + target + '[cpc]' + title`` with no
    extra whitespace inserted between the pieces.

    Args:
        df: DataFrame with string columns ``anchor``, ``target`` and ``title``.

    Returns:
        A Series of concatenated strings aligned with ``df``'s index.
    """
    anchor_part = df['anchor'] + '[SEP]'
    target_part = df['target'] + '[cpc]'
    return anchor_part + target_part + df['title']

In [8]:
# Attach the concatenated input text, then tag every row with its CV fold.
train_df = train_df.assign(input=prep_input_tokens(train_df))
train_df = create_folds(data=train_df, num_splits=CFG.num_fold)

In [9]:
# Tokenizer of the configured checkpoint; the bracketed section markers are
# registered as additional special tokens ('[cpc]' is the one used when the
# input text is built).
tokenizer = AutoTokenizer.from_pretrained(
    CFG.model_path,
    additional_special_tokens=[
        '[abstract]',
        '[claim]',
        '[summary]',
        '[invention]',
        '[cpc]',
    ],
)

In [10]:
# ====================================================
# Define max_len
# ====================================================
# Token count of each training input, measured without the special tokens the
# tokenizer would add on its own; the longest count plus a margin of 4 is
# stored on CFG and shown as the cell output.
lenghts = [
    len(tokenizer(text, add_special_tokens=False)['input_ids'])
    for text in train_df['input']
]

max_len = 4 + max(lenghts)  # CLS + SEP + SEP + SEP
CFG.max_len = max_len
max_len

70

In [11]:
class TrainDataset(Dataset):
    """Map-style dataset that yields one tokenized example per DataFrame row.

    Tokenization is done lazily in ``__getitem__`` with the module-level
    ``tokenizer`` called with its default arguments, so no explicit padding or
    length limit is requested here.
    """

    def __init__(self, df):
        """Cache the required columns of ``df`` as NumPy arrays.

        Args:
            df: DataFrame providing the ``input``, ``target`` and ``score`` columns.
        """
        self.inputs = df['input'].values.astype(str)
        # Not read by __getitem__; retained so the attribute stays available.
        self.targets = df['target'].values.astype(str)
        self.label = df['score'].values

    def __len__(self):
        """Return the number of examples."""
        return len(self.inputs)

    def __getitem__(self, item):
        """Tokenize example ``item`` and attach its label.

        Returns:
            A dict holding every field produced by the tokenizer plus
            ``'label'``, the row's score converted to ``np.float32``.
        """
        encoded = tokenizer(self.inputs[item])
        return {
            **encoded,
            'label': self.label[item].astype(np.float32),
        }

In [12]:
def compute_metrics(eval_pred):
    """Compute the Pearson correlation between predictions and labels.

    Args:
        eval_pred: a ``(predictions, labels)`` pair. ``predictions`` is an
            array with exactly one value per example (it is flattened to
            length ``len(predictions)``); ``labels`` is a 1-D array.

    Returns:
        ``{'pearson': r}`` where ``r`` is the off-diagonal entry of the 2x2
        correlation matrix from ``np.corrcoef``.
    """
    raw_preds, gold = eval_pred
    flat_preds = raw_preds.reshape(len(raw_preds))
    corr_matrix = np.corrcoef(flat_preds, gold)
    return {'pearson': corr_matrix[0, 1]}

In [13]:
# Cross-validated fine-tuning: train one model per fold, save it, and collect
# its predictions on the held-out fold (out-of-fold predictions).

# Scratch directory for Trainer checkpoints; identical for every fold and
# removed again at the end of each fold.
if iskaggle:
    train_output_dir = "/kaggle/working/tmp/uspppm"
else:
    train_output_dir = "/tmp/uspppm"

# Per-fold validation frames, concatenated once after the loop instead of
# growing a DataFrame with pd.concat on every iteration.
oof_parts = []
for fold in range(CFG.num_fold):

    tr_data = train_df[train_df['fold'] != fold].reset_index(drop=True)
    va_data = train_df[train_df['fold'] == fold].reset_index(drop=True)
    tr_dataset = TrainDataset(tr_data)
    va_dataset = TrainDataset(va_data)

    # Evaluate and checkpoint once per epoch; with load_best_model_at_end the
    # checkpoint with the best "pearson" metric is restored after training.
    args = TrainingArguments(
        output_dir=train_output_dir,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=CFG.learning_rate,
        lr_scheduler_type='cosine',
        per_device_train_batch_size=CFG.batch_size,
        per_device_eval_batch_size=CFG.batch_size*2,
        num_train_epochs=CFG.epochs,
        weight_decay=CFG.weight_decay,
        metric_for_best_model="pearson",
        load_best_model_at_end=True,
    )

    # num_labels=1 gives a single-output (regression) head.
    model = AutoModelForSequenceClassification.from_pretrained(CFG.model_path, num_labels=1)
    # The tokenizer was created with additional special tokens. If that made
    # its vocabulary larger than the embedding matrix, grow the matrix so the
    # new ids are valid; this is a no-op when the embedding is already large enough.
    if len(tokenizer) > model.get_input_embeddings().num_embeddings:
        model.resize_token_embeddings(len(tokenizer))

    trainer = Trainer(
        model,
        args,
        train_dataset=tr_dataset,
        eval_dataset=va_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    trainer.train()
    trainer.save_model(f"uspppm_{fold}")

    val_outputs = trainer.predict(va_dataset)
    val_predictions = val_outputs.predictions.reshape(-1)
    # Floor negative predictions at 0 in a single assignment. The previous
    # chained form (va_data['preds'][mask] = 0.0) triggers SettingWithCopy
    # warnings and silently does nothing under pandas copy-on-write.
    va_data['preds'] = np.clip(val_predictions, 0.0, None)
    oof_parts.append(va_data)

    # Drop this fold's checkpoints before the next fold starts.
    shutil.rmtree(train_output_dir)

# Same result as the incremental concat (per-fold indices are kept as-is).
oof_df = pd.concat(oof_parts) if oof_parts else pd.DataFrame()

  5%|▌         | 501/9120 [01:41<27:34,  5.21it/s]

{'loss': 0.0866, 'learning_rate': 9.926019681921196e-06, 'epoch': 0.27}


 11%|█         | 1001/9120 [03:20<25:47,  5.25it/s]

{'loss': 0.0508, 'learning_rate': 9.706267962669999e-06, 'epoch': 0.55}


 16%|█▋        | 1501/9120 [05:00<25:33,  4.97it/s]

{'loss': 0.0393, 'learning_rate': 9.347247763081834e-06, 'epoch': 0.82}


                                                   
 20%|██        | 1824/9120 [06:17<22:40,  5.36it/s]

{'eval_loss': 0.02415216527879238, 'eval_pearson': 0.8200252760329477, 'eval_runtime': 13.2349, 'eval_samples_per_second': 551.195, 'eval_steps_per_second': 17.227, 'epoch': 1.0}


 22%|██▏       | 2001/9120 [06:56<23:19,  5.09it/s]   

{'loss': 0.032, 'learning_rate': 8.859583254581604e-06, 'epoch': 1.1}


 27%|██▋       | 2501/9120 [08:35<20:34,  5.36it/s]

{'loss': 0.0283, 'learning_rate': 8.257705467351144e-06, 'epoch': 1.37}


 33%|███▎      | 3001/9120 [10:15<21:59,  4.64it/s]

{'loss': 0.0248, 'learning_rate': 7.559425245448006e-06, 'epoch': 1.64}


 38%|███▊      | 3501/9120 [11:55<18:07,  5.17it/s]

{'loss': 0.0235, 'learning_rate': 6.785406186042e-06, 'epoch': 1.92}


                                                   
 40%|████      | 3648/9120 [12:37<17:02,  5.35it/s]

{'eval_loss': 0.019371772184967995, 'eval_pearson': 0.8471462247159892, 'eval_runtime': 13.3191, 'eval_samples_per_second': 547.71, 'eval_steps_per_second': 17.118, 'epoch': 2.0}


 44%|████▍     | 4000/9120 [13:51<16:41,  5.11it/s]  

{'loss': 0.0199, 'learning_rate': 5.958553159618693e-06, 'epoch': 2.19}


 49%|████▉     | 4501/9120 [15:31<14:38,  5.26it/s]

{'loss': 0.016, 'learning_rate': 5.103334506137773e-06, 'epoch': 2.47}


 55%|█████▍    | 5001/9120 [17:10<13:34,  5.05it/s]

{'loss': 0.0156, 'learning_rate': 4.245057964803815e-06, 'epoch': 2.74}


                                                   
 60%|██████    | 5472/9120 [18:51<10:34,  5.75it/s]

{'eval_loss': 0.017898565158247948, 'eval_pearson': 0.8562505324238119, 'eval_runtime': 12.5436, 'eval_samples_per_second': 581.572, 'eval_steps_per_second': 18.177, 'epoch': 3.0}


 60%|██████    | 5501/9120 [19:01<12:32,  4.81it/s]  

{'loss': 0.0151, 'learning_rate': 3.409121764227809e-06, 'epoch': 3.02}


 66%|██████▌   | 6001/9120 [20:34<09:35,  5.42it/s]

{'loss': 0.0113, 'learning_rate': 2.6202630348146323e-06, 'epoch': 3.29}


 71%|███████▏  | 6501/9120 [22:08<07:47,  5.60it/s]

{'loss': 0.0108, 'learning_rate': 1.901825784452777e-06, 'epoch': 3.56}


 77%|███████▋  | 7001/9120 [23:41<06:32,  5.40it/s]

{'loss': 0.0109, 'learning_rate': 1.275070099662815e-06, 'epoch': 3.84}


                                                   
 80%|████████  | 7296/9120 [24:49<05:13,  5.82it/s]

{'eval_loss': 0.01971891149878502, 'eval_pearson': 0.8607799767401757, 'eval_runtime': 12.5447, 'eval_samples_per_second': 581.52, 'eval_steps_per_second': 18.175, 'epoch': 4.0}


 82%|████████▏ | 7501/9120 [25:31<04:44,  5.69it/s]  

{'loss': 0.0099, 'learning_rate': 7.585430144121319e-07, 'epoch': 4.11}


 88%|████████▊ | 8001/9120 [27:05<04:02,  4.62it/s]

{'loss': 0.0088, 'learning_rate': 3.675296639259912e-07, 'epoch': 4.39}


 93%|█████████▎| 8500/9120 [28:38<01:56,  5.33it/s]

{'loss': 0.0086, 'learning_rate': 1.1360096502120387e-07, 'epoch': 4.66}


 99%|█████████▊| 9001/9120 [30:13<00:21,  5.41it/s]

{'loss': 0.0086, 'learning_rate': 4.2712080634949024e-09, 'epoch': 4.93}


100%|██████████| 9120/9120 [30:35<00:00,  5.70it/s]
100%|██████████| 9120/9120 [30:47<00:00,  5.70it/s]

{'eval_loss': 0.01812131702899933, 'eval_pearson': 0.8613911465254375, 'eval_runtime': 12.5359, 'eval_samples_per_second': 581.926, 'eval_steps_per_second': 18.188, 'epoch': 5.0}


100%|██████████| 9120/9120 [30:51<00:00,  4.93it/s]


{'train_runtime': 1851.6617, 'train_samples_per_second': 78.789, 'train_steps_per_second': 4.925, 'train_loss': 0.02317766560274258, 'epoch': 5.0}


100%|██████████| 228/228 [00:18<00:00, 12.15it/s]
  5%|▌         | 500/9120 [01:33<26:02,  5.52it/s]

{'loss': 0.0762, 'learning_rate': 9.926019681921196e-06, 'epoch': 0.27}


 11%|█         | 1001/9120 [03:07<25:15,  5.36it/s]

{'loss': 0.0462, 'learning_rate': 9.706267962669999e-06, 'epoch': 0.55}


 16%|█▋        | 1500/9120 [04:41<24:57,  5.09it/s]

{'loss': 0.0379, 'learning_rate': 9.347247763081834e-06, 'epoch': 0.82}


                                                   
 20%|██        | 1824/9120 [05:55<21:13,  5.73it/s]

{'eval_loss': 0.028873704373836517, 'eval_pearson': 0.8026925584144187, 'eval_runtime': 12.4459, 'eval_samples_per_second': 586.137, 'eval_steps_per_second': 18.319, 'epoch': 1.0}


 22%|██▏       | 2001/9120 [06:33<22:43,  5.22it/s]   

{'loss': 0.0318, 'learning_rate': 8.859583254581604e-06, 'epoch': 1.1}


 27%|██▋       | 2501/9120 [08:06<18:53,  5.84it/s]

{'loss': 0.0264, 'learning_rate': 8.257705467351144e-06, 'epoch': 1.37}


 33%|███▎      | 3001/9120 [09:41<19:15,  5.29it/s]

{'loss': 0.0235, 'learning_rate': 7.559425245448006e-06, 'epoch': 1.64}


 38%|███▊      | 3501/9120 [11:14<16:33,  5.66it/s]

{'loss': 0.0223, 'learning_rate': 6.785406186042e-06, 'epoch': 1.92}


                                                   
 40%|████      | 3648/9120 [11:54<15:50,  5.75it/s]

{'eval_loss': 0.019625477492809296, 'eval_pearson': 0.8414542086672256, 'eval_runtime': 12.4292, 'eval_samples_per_second': 586.926, 'eval_steps_per_second': 18.344, 'epoch': 2.0}


 44%|████▍     | 4001/9120 [13:04<15:51,  5.38it/s]  

{'loss': 0.0187, 'learning_rate': 5.958553159618693e-06, 'epoch': 2.19}


 49%|████▉     | 4501/9120 [14:38<13:39,  5.63it/s]

{'loss': 0.0157, 'learning_rate': 5.103334506137773e-06, 'epoch': 2.47}


 55%|█████▍    | 5001/9120 [16:12<13:51,  4.95it/s]

{'loss': 0.0154, 'learning_rate': 4.245057964803815e-06, 'epoch': 2.74}


                                                   
 60%|██████    | 5472/9120 [17:53<10:40,  5.70it/s]

{'eval_loss': 0.01977909542620182, 'eval_pearson': 0.8525127054800109, 'eval_runtime': 12.4455, 'eval_samples_per_second': 586.156, 'eval_steps_per_second': 18.32, 'epoch': 3.0}


 60%|██████    | 5500/9120 [18:02<13:00,  4.64it/s]  

{'loss': 0.0151, 'learning_rate': 3.409121764227809e-06, 'epoch': 3.02}


 66%|██████▌   | 6001/9120 [19:36<09:50,  5.28it/s]

{'loss': 0.0114, 'learning_rate': 2.6202630348146323e-06, 'epoch': 3.29}


 71%|███████▏  | 6501/9120 [21:10<07:46,  5.61it/s]

{'loss': 0.0112, 'learning_rate': 1.901825784452777e-06, 'epoch': 3.56}


 77%|███████▋  | 7001/9120 [22:44<06:31,  5.41it/s]

{'loss': 0.0107, 'learning_rate': 1.275070099662815e-06, 'epoch': 3.84}


                                                   
 80%|████████  | 7296/9120 [23:52<05:19,  5.72it/s]

{'eval_loss': 0.018914444372057915, 'eval_pearson': 0.8569660104830724, 'eval_runtime': 12.433, 'eval_samples_per_second': 586.746, 'eval_steps_per_second': 18.338, 'epoch': 4.0}


 82%|████████▏ | 7501/9120 [24:35<04:46,  5.65it/s]  

{'loss': 0.0096, 'learning_rate': 7.585430144121319e-07, 'epoch': 4.11}


 88%|████████▊ | 8001/9120 [26:09<03:57,  4.72it/s]

{'loss': 0.0088, 'learning_rate': 3.675296639259912e-07, 'epoch': 4.39}


 93%|█████████▎| 8501/9120 [27:43<01:57,  5.27it/s]

{'loss': 0.0087, 'learning_rate': 1.1360096502120387e-07, 'epoch': 4.66}


 99%|█████████▊| 9001/9120 [29:17<00:21,  5.45it/s]

{'loss': 0.0084, 'learning_rate': 4.2712080634949024e-09, 'epoch': 4.93}


                                                   
100%|██████████| 9120/9120 [29:52<00:00,  5.81it/s]

{'eval_loss': 0.018690628930926323, 'eval_pearson': 0.8563713158287694, 'eval_runtime': 12.4475, 'eval_samples_per_second': 586.063, 'eval_steps_per_second': 18.317, 'epoch': 5.0}


100%|██████████| 9120/9120 [29:56<00:00,  5.08it/s]


{'train_runtime': 1796.6017, 'train_samples_per_second': 81.203, 'train_steps_per_second': 5.076, 'train_loss': 0.021942704850644396, 'epoch': 5.0}


100%|██████████| 228/228 [00:18<00:00, 12.60it/s]
  5%|▌         | 501/9120 [01:33<25:43,  5.58it/s]

{'loss': 0.0918, 'learning_rate': 9.926019681921196e-06, 'epoch': 0.27}


 11%|█         | 1001/9120 [03:07<25:31,  5.30it/s]

{'loss': 0.0506, 'learning_rate': 9.706267962669999e-06, 'epoch': 0.55}


 16%|█▋        | 1501/9120 [04:41<23:36,  5.38it/s]

{'loss': 0.0383, 'learning_rate': 9.347247763081834e-06, 'epoch': 0.82}


                                                   
 20%|██        | 1824/9120 [05:54<20:36,  5.90it/s]

{'eval_loss': 0.02335944212973118, 'eval_pearson': 0.8109236507452093, 'eval_runtime': 12.4257, 'eval_samples_per_second': 587.092, 'eval_steps_per_second': 18.349, 'epoch': 1.0}


 22%|██▏       | 2001/9120 [06:32<21:51,  5.43it/s]   

{'loss': 0.0319, 'learning_rate': 8.859583254581604e-06, 'epoch': 1.1}


 27%|██▋       | 2501/9120 [08:05<19:41,  5.60it/s]

{'loss': 0.0265, 'learning_rate': 8.257705467351144e-06, 'epoch': 1.37}


 33%|███▎      | 3001/9120 [09:40<20:04,  5.08it/s]

{'loss': 0.0235, 'learning_rate': 7.559425245448006e-06, 'epoch': 1.64}


 38%|███▊      | 3501/9120 [11:14<16:53,  5.54it/s]

{'loss': 0.0241, 'learning_rate': 6.785406186042e-06, 'epoch': 1.92}


 40%|████      | 3648/9120 [11:41<14:57,  6.10it/s]
 40%|████      | 3648/9120 [11:54<14:57,  6.10it/s]

{'eval_loss': 0.02434719353914261, 'eval_pearson': 0.8328887952978997, 'eval_runtime': 12.4315, 'eval_samples_per_second': 586.815, 'eval_steps_per_second': 18.34, 'epoch': 2.0}


 44%|████▍     | 4001/9120 [13:03<15:59,  5.34it/s]  

{'loss': 0.0182, 'learning_rate': 5.958553159618693e-06, 'epoch': 2.19}


 49%|████▉     | 4501/9120 [14:37<13:38,  5.64it/s]

{'loss': 0.0159, 'learning_rate': 5.103334506137773e-06, 'epoch': 2.47}


 55%|█████▍    | 5001/9120 [16:12<13:17,  5.16it/s]

{'loss': 0.016, 'learning_rate': 4.245057964803815e-06, 'epoch': 2.74}


 60%|██████    | 5472/9120 [17:40<10:38,  5.71it/s]
 60%|██████    | 5472/9120 [17:52<10:38,  5.71it/s]

{'eval_loss': 0.021041328087449074, 'eval_pearson': 0.8460662543275785, 'eval_runtime': 12.4229, 'eval_samples_per_second': 587.222, 'eval_steps_per_second': 18.353, 'epoch': 3.0}


 60%|██████    | 5501/9120 [18:02<12:38,  4.77it/s]  

{'loss': 0.0149, 'learning_rate': 3.409121764227809e-06, 'epoch': 3.02}


 66%|██████▌   | 6001/9120 [19:35<09:42,  5.35it/s]

{'loss': 0.0111, 'learning_rate': 2.6202630348146323e-06, 'epoch': 3.29}


 71%|███████▏  | 6501/9120 [21:09<07:31,  5.80it/s]

{'loss': 0.0108, 'learning_rate': 1.901825784452777e-06, 'epoch': 3.56}


 77%|███████▋  | 7001/9120 [22:43<06:48,  5.19it/s]

{'loss': 0.0103, 'learning_rate': 1.275070099662815e-06, 'epoch': 3.84}


 80%|████████  | 7296/9120 [23:39<05:09,  5.89it/s]
 80%|████████  | 7296/9120 [23:51<05:09,  5.89it/s]

{'eval_loss': 0.019392266869544983, 'eval_pearson': 0.8502206139012896, 'eval_runtime': 12.4373, 'eval_samples_per_second': 586.541, 'eval_steps_per_second': 18.332, 'epoch': 4.0}


 82%|████████▏ | 7501/9120 [24:34<04:45,  5.67it/s]  

{'loss': 0.0097, 'learning_rate': 7.585430144121319e-07, 'epoch': 4.11}


 88%|████████▊ | 8001/9120 [26:08<03:56,  4.73it/s]

{'loss': 0.0086, 'learning_rate': 3.675296639259912e-07, 'epoch': 4.39}


 93%|█████████▎| 8501/9120 [27:42<01:59,  5.19it/s]

{'loss': 0.0085, 'learning_rate': 1.1360096502120387e-07, 'epoch': 4.66}


 99%|█████████▊| 9001/9120 [29:16<00:22,  5.19it/s]

{'loss': 0.0083, 'learning_rate': 4.2712080634949024e-09, 'epoch': 4.93}


100%|██████████| 9120/9120 [29:38<00:00,  5.98it/s]
100%|██████████| 9120/9120 [29:50<00:00,  5.98it/s]

{'eval_loss': 0.019257014617323875, 'eval_pearson': 0.8506431349000566, 'eval_runtime': 12.4297, 'eval_samples_per_second': 586.903, 'eval_steps_per_second': 18.343, 'epoch': 5.0}


100%|██████████| 9120/9120 [29:54<00:00,  5.08it/s]


{'train_runtime': 1794.964, 'train_samples_per_second': 81.277, 'train_steps_per_second': 5.081, 'train_loss': 0.023088559259970984, 'epoch': 5.0}


100%|██████████| 228/228 [00:18<00:00, 12.28it/s]
  5%|▌         | 501/9120 [01:33<25:57,  5.53it/s]

{'loss': 0.0959, 'learning_rate': 9.926019681921196e-06, 'epoch': 0.27}


 11%|█         | 1001/9120 [03:07<25:13,  5.36it/s]

{'loss': 0.0773, 'learning_rate': 9.706267962669999e-06, 'epoch': 0.55}


 16%|█▋        | 1501/9120 [04:41<22:57,  5.53it/s]

{'loss': 0.0461, 'learning_rate': 9.347247763081834e-06, 'epoch': 0.82}


 20%|██        | 1824/9120 [05:41<23:10,  5.25it/s]
 20%|██        | 1824/9120 [05:54<23:10,  5.25it/s]

{'eval_loss': 0.028964407742023468, 'eval_pearson': 0.7912404991128362, 'eval_runtime': 12.4298, 'eval_samples_per_second': 586.815, 'eval_steps_per_second': 18.343, 'epoch': 1.0}


 22%|██▏       | 2001/9120 [06:30<24:13,  4.90it/s]   

{'loss': 0.0341, 'learning_rate': 8.859583254581604e-06, 'epoch': 1.1}


 27%|██▋       | 2501/9120 [08:04<21:38,  5.10it/s]

{'loss': 0.0289, 'learning_rate': 8.257705467351144e-06, 'epoch': 1.37}


 33%|███▎      | 3001/9120 [09:39<18:09,  5.61it/s]

{'loss': 0.0267, 'learning_rate': 7.559425245448006e-06, 'epoch': 1.64}


 38%|███▊      | 3501/9120 [11:12<17:11,  5.45it/s]

{'loss': 0.0261, 'learning_rate': 6.785406186042e-06, 'epoch': 1.92}


                                                   
 40%|████      | 3648/9120 [11:52<16:44,  5.45it/s]

{'eval_loss': 0.022285936400294304, 'eval_pearson': 0.8394010161760249, 'eval_runtime': 12.4387, 'eval_samples_per_second': 586.397, 'eval_steps_per_second': 18.33, 'epoch': 2.0}


 44%|████▍     | 4001/9120 [13:02<15:41,  5.44it/s]  

{'loss': 0.0204, 'learning_rate': 5.958553159618693e-06, 'epoch': 2.19}


 49%|████▉     | 4501/9120 [14:35<14:32,  5.29it/s]

{'loss': 0.0177, 'learning_rate': 5.103334506137773e-06, 'epoch': 2.47}


 55%|█████▍    | 5000/9120 [16:09<12:14,  5.61it/s]

{'loss': 0.0171, 'learning_rate': 4.245057964803815e-06, 'epoch': 2.74}


 60%|██████    | 5472/9120 [17:38<11:48,  5.15it/s]
 60%|██████    | 5472/9120 [17:50<11:48,  5.15it/s]

{'eval_loss': 0.018835337832570076, 'eval_pearson': 0.8551147082170918, 'eval_runtime': 12.4458, 'eval_samples_per_second': 586.06, 'eval_steps_per_second': 18.319, 'epoch': 3.0}


 60%|██████    | 5500/9120 [18:00<11:25,  5.28it/s]  

{'loss': 0.0162, 'learning_rate': 3.409121764227809e-06, 'epoch': 3.02}


 66%|██████▌   | 6000/9120 [19:33<09:44,  5.34it/s]

{'loss': 0.0122, 'learning_rate': 2.6202630348146323e-06, 'epoch': 3.29}


 71%|███████▏  | 6501/9120 [21:07<07:48,  5.59it/s]

{'loss': 0.0123, 'learning_rate': 1.901825784452777e-06, 'epoch': 3.56}


 77%|███████▋  | 7001/9120 [22:42<06:36,  5.35it/s]

{'loss': 0.0114, 'learning_rate': 1.275070099662815e-06, 'epoch': 3.84}


                                                   
 80%|████████  | 7296/9120 [23:49<05:12,  5.84it/s]

{'eval_loss': 0.01799064129590988, 'eval_pearson': 0.8607275323456131, 'eval_runtime': 12.4457, 'eval_samples_per_second': 586.065, 'eval_steps_per_second': 18.32, 'epoch': 4.0}


 82%|████████▏ | 7501/9120 [24:31<05:16,  5.11it/s]  

{'loss': 0.0104, 'learning_rate': 7.585430144121319e-07, 'epoch': 4.11}


 88%|████████▊ | 8001/9120 [26:04<03:31,  5.30it/s]

{'loss': 0.0095, 'learning_rate': 3.675296639259912e-07, 'epoch': 4.39}


 93%|█████████▎| 8501/9120 [27:38<01:55,  5.35it/s]

{'loss': 0.0094, 'learning_rate': 1.1360096502120387e-07, 'epoch': 4.66}


 99%|█████████▊| 9001/9120 [29:13<00:21,  5.42it/s]

{'loss': 0.0093, 'learning_rate': 4.2712080634949024e-09, 'epoch': 4.93}


                                                   
100%|██████████| 9120/9120 [29:48<00:00,  5.85it/s]

{'eval_loss': 0.018120521679520607, 'eval_pearson': 0.8606255798568142, 'eval_runtime': 12.4327, 'eval_samples_per_second': 586.678, 'eval_steps_per_second': 18.339, 'epoch': 5.0}


100%|██████████| 9120/9120 [29:52<00:00,  5.09it/s]


{'train_runtime': 1792.405, 'train_samples_per_second': 81.396, 'train_steps_per_second': 5.088, 'train_loss': 0.02649584548515186, 'epoch': 5.0}


100%|██████████| 228/228 [00:18<00:00, 12.60it/s]
  5%|▌         | 501/9120 [01:34<25:56,  5.54it/s]

{'loss': 0.0727, 'learning_rate': 9.926019681921196e-06, 'epoch': 0.27}


 11%|█         | 1001/9120 [03:07<23:42,  5.71it/s]

{'loss': 0.046, 'learning_rate': 9.706267962669999e-06, 'epoch': 0.55}


 16%|█▋        | 1501/9120 [04:40<22:59,  5.52it/s]

{'loss': 0.0368, 'learning_rate': 9.347247763081834e-06, 'epoch': 0.82}


 20%|██        | 1824/9120 [05:41<22:37,  5.38it/s]
 20%|██        | 1824/9120 [05:53<22:37,  5.38it/s]

{'eval_loss': 0.026180848479270935, 'eval_pearson': 0.8058253983400744, 'eval_runtime': 12.6447, 'eval_samples_per_second': 576.842, 'eval_steps_per_second': 18.031, 'epoch': 1.0}


 22%|██▏       | 2001/9120 [06:30<23:25,  5.07it/s]   

{'loss': 0.032, 'learning_rate': 8.859583254581604e-06, 'epoch': 1.1}


 27%|██▋       | 2501/9120 [08:04<21:11,  5.21it/s]

{'loss': 0.0261, 'learning_rate': 8.257705467351144e-06, 'epoch': 1.37}


 33%|███▎      | 3001/9120 [09:39<20:02,  5.09it/s]

{'loss': 0.0233, 'learning_rate': 7.559425245448006e-06, 'epoch': 1.64}


 38%|███▊      | 3501/9120 [11:11<16:24,  5.70it/s]

{'loss': 0.0222, 'learning_rate': 6.785406186042e-06, 'epoch': 1.92}


                                                   
 40%|████      | 3648/9120 [11:52<17:02,  5.35it/s]

{'eval_loss': 0.022924985736608505, 'eval_pearson': 0.8407268737552833, 'eval_runtime': 12.6392, 'eval_samples_per_second': 577.095, 'eval_steps_per_second': 18.039, 'epoch': 2.0}


 44%|████▍     | 4001/9120 [13:02<16:07,  5.29it/s]  

{'loss': 0.0177, 'learning_rate': 5.958553159618693e-06, 'epoch': 2.19}


 49%|████▉     | 4501/9120 [14:35<13:33,  5.68it/s]

{'loss': 0.0151, 'learning_rate': 5.103334506137773e-06, 'epoch': 2.47}


 55%|█████▍    | 5001/9120 [16:09<11:47,  5.82it/s]

{'loss': 0.0148, 'learning_rate': 4.245057964803815e-06, 'epoch': 2.74}


 60%|██████    | 5472/9120 [17:37<11:34,  5.26it/s]
 60%|██████    | 5472/9120 [17:50<11:34,  5.26it/s]

{'eval_loss': 0.020706506446003914, 'eval_pearson': 0.8470280800772504, 'eval_runtime': 12.6358, 'eval_samples_per_second': 577.25, 'eval_steps_per_second': 18.044, 'epoch': 3.0}


 60%|██████    | 5500/9120 [17:59<11:36,  5.20it/s]  

{'loss': 0.0139, 'learning_rate': 3.409121764227809e-06, 'epoch': 3.02}


 66%|██████▌   | 6001/9120 [19:32<10:02,  5.18it/s]

{'loss': 0.0115, 'learning_rate': 2.6202630348146323e-06, 'epoch': 3.29}


 71%|███████▏  | 6501/9120 [21:06<07:45,  5.62it/s]

{'loss': 0.0106, 'learning_rate': 1.901825784452777e-06, 'epoch': 3.56}


 77%|███████▋  | 7001/9120 [22:41<06:46,  5.21it/s]

{'loss': 0.0102, 'learning_rate': 1.275070099662815e-06, 'epoch': 3.84}


                                                   
 80%|████████  | 7296/9120 [23:49<05:24,  5.62it/s]

{'eval_loss': 0.018949927762150764, 'eval_pearson': 0.853708792279914, 'eval_runtime': 12.646, 'eval_samples_per_second': 576.784, 'eval_steps_per_second': 18.029, 'epoch': 4.0}


 82%|████████▏ | 7501/9120 [24:31<05:16,  5.12it/s]  

{'loss': 0.0095, 'learning_rate': 7.585430144121319e-07, 'epoch': 4.11}


 88%|████████▊ | 8000/9120 [26:05<03:25,  5.44it/s]

{'loss': 0.0082, 'learning_rate': 3.675296639259912e-07, 'epoch': 4.39}


 93%|█████████▎| 8501/9120 [27:39<01:55,  5.34it/s]

{'loss': 0.0083, 'learning_rate': 1.1360096502120387e-07, 'epoch': 4.66}


 99%|█████████▊| 9001/9120 [29:13<00:21,  5.50it/s]

{'loss': 0.0084, 'learning_rate': 4.2712080634949024e-09, 'epoch': 4.93}


                                                   
100%|██████████| 9120/9120 [29:48<00:00,  5.94it/s]

{'eval_loss': 0.018755391240119934, 'eval_pearson': 0.8543612973073892, 'eval_runtime': 12.6457, 'eval_samples_per_second': 576.798, 'eval_steps_per_second': 18.03, 'epoch': 5.0}


100%|██████████| 9120/9120 [29:52<00:00,  5.09it/s]


{'train_runtime': 1792.7076, 'train_samples_per_second': 81.382, 'train_steps_per_second': 5.087, 'train_loss': 0.02133643121311539, 'epoch': 5.0}


100%|██████████| 228/228 [00:12<00:00, 13.84it/s]

In [14]:
# Pearson correlation over the pooled out-of-fold predictions of all folds.
predictions, label = oof_df['preds'].to_numpy(), oof_df['score'].to_numpy()
eval_pred = (predictions, label)
compute_metrics(eval_pred)

{'pearson': 0.8566102166369711}

In [15]:
# Write the out-of-fold frame to CSV without the index column.
oof_df.to_csv(path_or_buf='oof_df_submission3.csv', index=False)

100%|██████████| 228/228 [00:26<00:00, 13.84it/s]