In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/roberta-base-save/rob_tok.zip
/kaggle/input/roberta-base-save/__results__.html
/kaggle/input/roberta-base-save/rob.zip
/kaggle/input/roberta-base-save/__notebook__.ipynb
/kaggle/input/roberta-base-save/__output__.json
/kaggle/input/roberta-base-save/custom.css
/kaggle/input/commonlitreadabilityprize/sample_submission.csv
/kaggle/input/commonlitreadabilityprize/train.csv
/kaggle/input/commonlitreadabilityprize/test.csv


In [2]:
import numpy as np
import pandas as pd
from transformers import RobertaTokenizer, RobertaModel, AdamW, get_linear_schedule_with_warmup
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb
import tqdm
from matplotlib import pyplot as plt
import copy
import gc
import pickle
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope as ho_scope

In [3]:
%%bash
cp ../input/roberta-base-save/rob.zip .
cp ../input/roberta-base-save/rob_tok.zip .
unzip rob.zip
unzip rob_tok.zip 
rm -r rob.zip rob_tok.zip

Archive:  rob.zip
   creating: rob/
  inflating: rob/pytorch_model.bin   
  inflating: rob/config.json         
Archive:  rob_tok.zip
   creating: rob_tok/
  inflating: rob_tok/vocab.json      
  inflating: rob_tok/special_tokens_map.json  
  inflating: rob_tok/tokenizer_config.json  
  inflating: rob_tok/merges.txt      


In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [5]:
data = pd.read_csv('/kaggle/input/commonlitreadabilityprize/train.csv')
data.head()

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845


In [6]:
class ContinuousStratifiedKFold(StratifiedKFold):
    def split(self, X, y, groups=None):
        num_bins = int(np.floor(1 + np.log2(len(y))))
        bins = pd.cut(y, bins=num_bins, labels=False)
        return super().split(X, bins, groups)

class BERTRegressorMP(torch.nn.Module): 
    def __init__(self, pretrained_src = 'rob'): 
        super().__init__()
        self.bert = RobertaModel.from_pretrained(pretrained_src)
        self.linear = torch.nn.Linear(768, 1)
        self.dropout = torch.nn.Dropout(0.1)        
        
    def forward(self, input_ids, attention_mask):           
        last_hidden_state = self.bert(input_ids, 
                           attention_mask=attention_mask)[0]   
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        output = self.linear(self.dropout(mean_embeddings))
        return output
    
class EmbedderMP(torch.nn.Module):
    def __init__(self, model): 
        super().__init__()
        self.bert = model.bert
    
    def forward(self, input_ids, attention_mask): 
        last_hidden_state = self.bert(input_ids, 
                           attention_mask=attention_mask)[0]   
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

class RMSELoss(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.mse = torch.nn.MSELoss()
        
    def forward(self,yhat,y):
        loss = torch.sqrt(self.mse(yhat,y))
        return loss

def rmse_metric(y_true, y_pred):
    return np.sqrt(mse(y_true, y_pred))


def evaluate(dataloader_val, model):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in tqdm.notebook.tqdm(dataloader_val):
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1]
        }
        target = batch[2]

        with torch.no_grad():        
            output = model(**inputs)
            
        loss = criterion(output, target.view(-1,1))
        loss_val_total += loss

        output = output.detach().cpu().numpy()
        target = target.cpu().numpy()
        predictions.append(output)
        true_vals.append(target)
    
    loss_val_avg = loss_val_total / len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals


def get_bert_embeddings(embedder, dataloader, device = device):
    embedder.eval()
    embeddings_all = []
    for batch in tqdm.notebook.tqdm(dataloader):        
        batch = tuple(b.to(device) for b in batch)        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1]
                 }
        with torch.no_grad():        
            output = embedder(**inputs)

        embeddings_batch = output.detach().cpu().numpy()
        embeddings_all.append(embeddings_batch)

    return np.vstack(embeddings_all)

In [7]:
tokenizer = RobertaTokenizer.from_pretrained(
    'rob_tok'
)

In [8]:
BATCH_SIZE = 16

warm_prop = 0.1
epochs = 8
clip = 1

In [9]:
kf = ContinuousStratifiedKFold(n_splits=5, shuffle=True, random_state=42)

print('START')
for k, (train_ids, val_ids) in enumerate(tqdm.notebook.tqdm(kf.split(X=data, y=data['target'].values))):
    print('**************')
    print('------------')
    print('**************')
    print(f'ITERATION {k} starts')

    gc.collect()

    print('------------')
    print('data slicing ...')

    data_train = data.iloc[train_ids]['excerpt'].values
    data_val = data.iloc[val_ids]['excerpt'].values
    print(f'train/val data shapes: {data_train.shape}, {data_val.shape}')
    target_train = data.iloc[train_ids]['target'].values
    target_val = data.iloc[val_ids]['target'].values


    print('------------')
    print('data preparation ...')
    encoded_data_train = tokenizer.batch_encode_plus(
        data_train,
        add_special_tokens=True,
        return_attention_mask=True,
        pad_to_max_length=True,
        max_length=512,
        return_tensors='pt',
    )

    encoded_data_val = tokenizer.batch_encode_plus(
        data_val,
        add_special_tokens=True,
        return_attention_mask=True,
        pad_to_max_length=True,
        max_length=512,
        return_tensors='pt'
    )


    input_ids_train = encoded_data_train['input_ids']
    attention_masks_train = encoded_data_train['attention_mask']
    values_train = torch.tensor(target_train, dtype=torch.float)

    input_ids_val = encoded_data_val['input_ids']
    attention_masks_val = encoded_data_val['attention_mask']
    values_val = torch.tensor(target_val, dtype=torch.float)

    dataset_train = TensorDataset(input_ids_train,
                                 attention_masks_train,
                                 values_train)
    dataset_val = TensorDataset(input_ids_val,
                                attention_masks_val,
                                values_val)

    dataloader_train = DataLoader(
        dataset_train,
        sampler=RandomSampler(dataset_train),
        batch_size=BATCH_SIZE
    )

    dataloader_val = DataLoader(
        dataset_val,
        sampler=RandomSampler(dataset_val),
        batch_size=2*BATCH_SIZE
    )

    print('------------')
    print('Roberta finetuning ...')

    model = BERTRegressorMP().to(device)
    criterion = RMSELoss()
    optimizer = AdamW(
        model.parameters(),
        lr= 3e-5,#the original paper:2e-5 -> 5e-5
        eps=1e-8
    )
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(len(dataloader_train)*epochs * warm_prop),
        num_training_steps=len(dataloader_train)*epochs
    )


    best_val_loss = float('inf')
    for epoch in tqdm.notebook.tqdm(range(epochs)):
        print(f'Epoch {epoch}')
        model.train()

        epoch_loss = 0
        for batch in tqdm.notebook.tqdm(dataloader_train):

            batch = tuple(b.to(device) for b in batch)
            inputs = {'input_ids':      batch[0],
                    'attention_mask': batch[1]
              }
            target = batch[2]

            optimizer.zero_grad()        

            output = model(**inputs)     
            loss = criterion(output, target.view(-1,1))      
            loss.backward()
            epoch_loss += loss.item()

            torch.nn.utils.clip_grad_norm_(model.parameters(), clip)      
            optimizer.step()
            scheduler.step()     

        val_loss, predictions, true_vals = evaluate(dataloader_val, model)
        if val_loss < best_val_loss:
            best_val_loss = val_loss        
            torch.save(model.state_dict(), f'roberta_base_{k}.pt')
            best_model = copy.deepcopy(model).cpu()
        train_loss = epoch_loss / len(dataloader_train)
        rmse_val = rmse_metric(true_vals, predictions)
        print('-------')
        print(f'Training loss: {train_loss}')
        print(f'Validation loss: {val_loss}')
        print(f"RMSE on validation: {rmse_val}")
    
    print('loaded best model with lm head performance...')
    dataloader_val_frozen = DataLoader(
        dataset_val,
        batch_size=2*BATCH_SIZE
    )  
    
    model = BERTRegressorMP()
    PATH = f'roberta_base_{k}.pt'
    model.load_state_dict(torch.load(PATH), strict=False)
    model.to(device)
    val_loss, predictions, true_vals = evaluate(dataloader_val_frozen, model)
    rmse_val = rmse_metric(true_vals, predictions)
    print(f"RMSE on validation: {rmse_val}")
    
    del model
    gc.collect()

    print('------------')
    print('Embeddings extraction ...')

    embedder = EmbedderMP(best_model).to(device)    
    
    dataloader_train_frozen = DataLoader(
        dataset_train,
        batch_size=2*BATCH_SIZE,
    )

    embeddings_train = get_bert_embeddings(embedder, dataloader_train_frozen)
    embeddings_val = get_bert_embeddings(embedder, dataloader_val_frozen)
    
    del embedder
    gc.collect()
    
    print('------------')
    print('XGB head HP tuning ...')
    def hyperopt_train_test(params):
        estimator = xgb.XGBRegressor(**params)  
        estimator.fit(embeddings_train, target_train)
        preds = estimator.predict(embeddings_val)
        metric = rmse_metric(target_val, preds)
        return metric

    space_xgb = {'n_estimators': ho_scope.int(hp.quniform('n_estimators', 50, 400, q=25)),
             'learning_rate': hp.loguniform('learning_rate', np.log(0.0001), np.log(0.5)) - 0.0001,
             'max_depth': ho_scope.int(hp.quniform('max_depth', 1, 11, 1)),
             'min_child_weight': ho_scope.int(hp.quniform('min_child_weight', 0, 300, 25)),
             'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
             'gamma': hp.loguniform('gamma', np.log(0.0001), np.log(5)) - 0.0001,
             'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05), 
             'colsample_bylevel': hp.quniform('colsample_bylevel', 0.5, 1, 0.05),
             'reg_alpha': hp.loguniform('reg_alpha', np.log(0.0001), np.log(1)) - 0.0001,
             'reg_lambda': hp.loguniform('reg_lambda', np.log(1), np.log(4))
             }
    def f(params):
        rmse_metric_val = hyperopt_train_test(params)
        return {'loss': rmse_metric_val, 'status': STATUS_OK}

    trials = Trials()
    best = fmin(f, space_xgb, algo = tpe.suggest, max_evals = 50, trials=trials)
    print('best', best)

    best['max_depth'] = int(best['max_depth'])
    best['min_child_weight'] = int(best['min_child_weight'])
    best['n_estimators'] = int(best['n_estimators'])
    best['random_state'] = 42
    best['n_jobs'] = -1
    best['objective'] = 'reg:squarederror'
    xgb_head = xgb.XGBRegressor(**best)    
    xgb_head.fit(embeddings_train, target_train)
    filename = f'xgb_head_{k}.pkl'
    pickle.dump(xgb_head, open(filename, 'wb'))
    xgb_head = pickle.load(open(filename, 'rb'))
    preds_xgb = xgb_head.predict(embeddings_val)
    rmse_val = rmse_metric(target_val, preds_xgb)
    print(f"RMSE on validation: {rmse_val}")    

    print('------------')
    print(f'Iteration {k} completed.')
    
print('**************')
print('------------')
print('**************')    
print('FINISH')

START


0it [00:00, ?it/s]

**************
------------
**************
ITERATION 0 starts
------------
data slicing ...


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


train/val data shapes: (2267,), (567,)
------------
data preparation ...




------------
Roberta finetuning ...


  0%|          | 0/8 [00:00<?, ?it/s]

Epoch 0


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.7665377926658576
Validation loss: 0.6470295786857605
RMSE on validation: 0.6520655155181885
Epoch 1


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.5064939425235063
Validation loss: 0.5175865888595581
RMSE on validation: 0.5197195410728455
Epoch 2


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.3966082189704331
Validation loss: 0.740657389163971
RMSE on validation: 0.744906485080719
Epoch 3


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.30536616908412584
Validation loss: 0.5697546601295471
RMSE on validation: 0.573124885559082
Epoch 4


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.23817641246066967
Validation loss: 0.5867723822593689
RMSE on validation: 0.5943396091461182
Epoch 5


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.1991411429685606
Validation loss: 0.6509798169136047
RMSE on validation: 0.6572798490524292
Epoch 6


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.16756542882239314
Validation loss: 0.6750023365020752
RMSE on validation: 0.6811125874519348
Epoch 7


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.1401867236889584
Validation loss: 0.6219977736473083
RMSE on validation: 0.6261906027793884
loaded best model with lm head performance...


  0%|          | 0/18 [00:00<?, ?it/s]

RMSE on validation: 0.5197195410728455
------------
Embeddings extraction ...


  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

------------
XGB head HP tuning ...

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?][A
  2%|▏         | 1/50 [00:31<25:32, 31.28s/trial, best loss: 1.7100854374709475][A
  4%|▍         | 2/50 [00:34<11:57, 14.95s/trial, best loss: 0.5012669479146836][A
  6%|▌         | 3/50 [00:48<11:23, 14.55s/trial, best loss: 0.5012669479146836][A
  8%|▊         | 4/50 [01:06<12:10, 15.88s/trial, best loss: 0.48197190030977344][A
 10%|█         | 5/50 [01:21<11:30, 15.35s/trial, best loss: 0.48197190030977344][A
 12%|█▏        | 6/50 [01:35<10:58, 14.97s/trial, best loss: 0.48197190030977344][A
 14%|█▍        | 7/50 [02:32<20:29, 28.59s/trial, best loss: 0.48197190030977344][A
 16%|█▌        | 8/50 [02:38<15:06, 21.59s/trial, best loss: 0.48197190030977344][A
 18%|█▊        | 9/50 [03:03<15:23, 22.52s/trial, best loss: 0.48197190030977344][A
 20%|██        | 10/50 [03:05<10:50, 16.26s/trial, best loss: 0.48197190030977344][A
 22%|██▏       | 11/50 [03:06<07:33, 11.62s/trial, best l



------------
Roberta finetuning ...


  0%|          | 0/8 [00:00<?, ?it/s]

Epoch 0


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.7625709530753149
Validation loss: 0.5857367515563965
RMSE on validation: 0.5894920229911804
Epoch 1


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.514479248876303
Validation loss: 0.649823784828186
RMSE on validation: 0.6538690328598022
Epoch 2


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.41517104754145717
Validation loss: 0.682174563407898
RMSE on validation: 0.6862257719039917
Epoch 3


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.29824873154432
Validation loss: 0.6265295147895813
RMSE on validation: 0.6298312544822693
Epoch 4


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.23695220651341156
Validation loss: 0.6400802731513977
RMSE on validation: 0.642801821231842
Epoch 5


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.1864953308667935
Validation loss: 0.6575247049331665
RMSE on validation: 0.667529284954071
Epoch 6


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.15629014456775828
Validation loss: 0.5860822796821594
RMSE on validation: 0.5901533961296082
Epoch 7


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.12709312891246569
Validation loss: 0.6125737428665161
RMSE on validation: 0.6183398962020874
loaded best model with lm head performance...


  0%|          | 0/18 [00:00<?, ?it/s]

RMSE on validation: 0.5894920229911804
------------
Embeddings extraction ...


  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

------------
XGB head HP tuning ...

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?][A
  2%|▏         | 1/50 [00:13<11:23, 13.96s/trial, best loss: 1.741923953571498][A
  4%|▍         | 2/50 [01:06<29:27, 36.82s/trial, best loss: 0.5347283366530381][A
  6%|▌         | 3/50 [01:09<16:27, 21.02s/trial, best loss: 0.5347283366530381][A
  8%|▊         | 4/50 [01:19<12:57, 16.91s/trial, best loss: 0.5347283366530381][A
 10%|█         | 5/50 [01:34<12:05, 16.12s/trial, best loss: 0.5347283366530381][A
 12%|█▏        | 6/50 [02:03<15:08, 20.64s/trial, best loss: 0.5347283366530381][A
 14%|█▍        | 7/50 [02:17<13:04, 18.24s/trial, best loss: 0.531678252869399] [A
 16%|█▌        | 8/50 [02:28<11:21, 16.23s/trial, best loss: 0.531678252869399][A
 18%|█▊        | 9/50 [02:58<13:59, 20.46s/trial, best loss: 0.531678252869399][A
 20%|██        | 10/50 [03:03<10:20, 15.52s/trial, best loss: 0.531678252869399][A
 22%|██▏       | 11/50 [03:15<09:24, 14.48s/trial, best loss: 0.5316



------------
Roberta finetuning ...


  0%|          | 0/8 [00:00<?, ?it/s]

Epoch 0


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.7648830182955298
Validation loss: 0.8257569670677185
RMSE on validation: 0.8300561904907227
Epoch 1


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.5531703156064933
Validation loss: 0.5822140574455261
RMSE on validation: 0.5854587554931641
Epoch 2


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.40533201039676936
Validation loss: 0.7170739769935608
RMSE on validation: 0.7186874151229858
Epoch 3


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.32218525453772345
Validation loss: 0.705394983291626
RMSE on validation: 0.7075778841972351
Epoch 4


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.24759698265665014
Validation loss: 0.7118643522262573
RMSE on validation: 0.7151224613189697
Epoch 5


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.1952731491814197
Validation loss: 0.638304591178894
RMSE on validation: 0.6424824595451355
Epoch 6


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.1631301153503673
Validation loss: 0.6062021255493164
RMSE on validation: 0.6108719706535339
Epoch 7


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.1386189170339158
Validation loss: 0.6186599731445312
RMSE on validation: 0.6239883899688721
loaded best model with lm head performance...


  0%|          | 0/18 [00:00<?, ?it/s]

RMSE on validation: 0.5854587554931641
------------
Embeddings extraction ...


  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

------------
XGB head HP tuning ...

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?][A
  2%|▏         | 1/50 [00:12<10:29, 12.84s/trial, best loss: 0.9413551242762128][A
  4%|▍         | 2/50 [00:36<15:16, 19.09s/trial, best loss: 0.9413551242762128][A
  6%|▌         | 3/50 [00:53<14:11, 18.12s/trial, best loss: 0.9413551242762128][A
  8%|▊         | 4/50 [01:02<11:16, 14.71s/trial, best loss: 0.5186065830487638][A
 10%|█         | 5/50 [01:08<08:31, 11.36s/trial, best loss: 0.5186065830487638][A
 12%|█▏        | 6/50 [01:15<07:13,  9.85s/trial, best loss: 0.49926950455459584][A
 14%|█▍        | 7/50 [01:22<06:29,  9.06s/trial, best loss: 0.49926950455459584][A
 16%|█▌        | 8/50 [01:52<11:01, 15.76s/trial, best loss: 0.49926950455459584][A
 18%|█▊        | 9/50 [02:13<11:45, 17.21s/trial, best loss: 0.49926950455459584][A
 20%|██        | 10/50 [02:27<10:59, 16.49s/trial, best loss: 0.49926950455459584][A
 22%|██▏       | 11/50 [02:39<09:45, 15.02s/trial, best los



------------
Roberta finetuning ...


  0%|          | 0/8 [00:00<?, ?it/s]

Epoch 0


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.7879531887215627
Validation loss: 0.7255927324295044
RMSE on validation: 0.7301052808761597
Epoch 1


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.5137398467517235
Validation loss: 0.5409338474273682
RMSE on validation: 0.5452302694320679
Epoch 2


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.40659824139635325
Validation loss: 0.6459717154502869
RMSE on validation: 0.6502128839492798
Epoch 3


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.3201520646541891
Validation loss: 0.5065088272094727
RMSE on validation: 0.5094826221466064
Epoch 4


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.256658088888081
Validation loss: 0.5837271213531494
RMSE on validation: 0.5902664661407471
Epoch 5


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.2114535552305235
Validation loss: 0.5136123895645142
RMSE on validation: 0.5159213542938232
Epoch 6


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.17272534294867178
Validation loss: 0.6274346709251404
RMSE on validation: 0.6339989304542542
Epoch 7


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.14652504672256994
Validation loss: 0.5703384280204773
RMSE on validation: 0.5735203623771667
loaded best model with lm head performance...


  0%|          | 0/18 [00:00<?, ?it/s]

RMSE on validation: 0.5094826221466064
------------
Embeddings extraction ...


  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

------------
XGB head HP tuning ...

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?][A
  2%|▏         | 1/50 [00:41<33:31, 41.06s/trial, best loss: 0.45900235820137225][A
  4%|▍         | 2/50 [01:04<24:36, 30.77s/trial, best loss: 0.45900235820137225][A
  6%|▌         | 3/50 [01:20<18:35, 23.74s/trial, best loss: 0.45900235820137225][A
  8%|▊         | 4/50 [01:49<20:01, 26.13s/trial, best loss: 0.45900235820137225][A
 10%|█         | 5/50 [02:11<18:29, 24.65s/trial, best loss: 0.45900235820137225][A
 12%|█▏        | 6/50 [02:20<14:05, 19.22s/trial, best loss: 0.45900235820137225][A
 14%|█▍        | 7/50 [02:58<18:12, 25.40s/trial, best loss: 0.45900235820137225][A
 16%|█▌        | 8/50 [03:14<15:40, 22.39s/trial, best loss: 0.45900235820137225][A
 18%|█▊        | 9/50 [03:30<13:51, 20.29s/trial, best loss: 0.45900235820137225][A
 20%|██        | 10/50 [03:33<09:59, 14.98s/trial, best loss: 0.45900235820137225][A
 22%|██▏       | 11/50 [04:06<13:20, 20.52s/trial, bes



------------
Roberta finetuning ...


  0%|          | 0/8 [00:00<?, ?it/s]

Epoch 0


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.8141935381671073
Validation loss: 0.5809589624404907
RMSE on validation: 0.5846284627914429
Epoch 1


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.5114493258822133
Validation loss: 0.524186909198761
RMSE on validation: 0.5280457735061646
Epoch 2


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.40715011020361536
Validation loss: 0.708957850933075
RMSE on validation: 0.7138335704803467
Epoch 3


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.3118243521787751
Validation loss: 0.6069409251213074
RMSE on validation: 0.6114413142204285
Epoch 4


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.2431780844926834
Validation loss: 0.5303031802177429
RMSE on validation: 0.5370751619338989
Epoch 5


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.18889158051198637
Validation loss: 0.637169599533081
RMSE on validation: 0.6419011950492859
Epoch 6


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.16585707643502196
Validation loss: 0.6045548915863037
RMSE on validation: 0.6087477803230286
Epoch 7


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.13607999955264616
Validation loss: 0.6073943972587585
RMSE on validation: 0.6090526580810547
loaded best model with lm head performance...


  0%|          | 0/18 [00:00<?, ?it/s]

RMSE on validation: 0.5280457735061646
------------
Embeddings extraction ...


  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

------------
XGB head HP tuning ...

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?][A
  2%|▏         | 1/50 [00:04<03:27,  4.23s/trial, best loss: 1.469914812011474][A
  4%|▍         | 2/50 [00:21<09:42, 12.13s/trial, best loss: 0.5276021452936488][A
  6%|▌         | 3/50 [00:39<11:30, 14.70s/trial, best loss: 0.5276021452936488][A
  8%|▊         | 4/50 [00:47<09:03, 11.82s/trial, best loss: 0.5065533886649435][A
 10%|█         | 5/50 [01:04<10:28, 13.96s/trial, best loss: 0.5065533886649435][A
 12%|█▏        | 6/50 [01:08<07:33, 10.31s/trial, best loss: 0.5065533886649435][A
 14%|█▍        | 7/50 [01:32<10:42, 14.95s/trial, best loss: 0.5065533886649435][A
 16%|█▌        | 8/50 [02:01<13:30, 19.31s/trial, best loss: 0.5065533886649435][A
 18%|█▊        | 9/50 [02:06<10:07, 14.81s/trial, best loss: 0.5065533886649435][A
 20%|██        | 10/50 [02:42<14:14, 21.36s/trial, best loss: 0.5055767056013686][A
 22%|██▏       | 11/50 [02:46<10:34, 16.26s/trial, best loss: 0.5

In [10]:
!rm -r rob rob_tok