In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/commonlitreadabilityprize/sample_submission.csv
/kaggle/input/commonlitreadabilityprize/train.csv
/kaggle/input/commonlitreadabilityprize/test.csv
/kaggle/input/roberta-base-save/rob_tok.zip
/kaggle/input/roberta-base-save/__results__.html
/kaggle/input/roberta-base-save/rob.zip
/kaggle/input/roberta-base-save/__notebook__.ipynb
/kaggle/input/roberta-base-save/__output__.json
/kaggle/input/roberta-base-save/custom.css


In [2]:
import numpy as np
import pandas as pd
from transformers import RobertaTokenizer, RobertaModel, AdamW, get_linear_schedule_with_warmup
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split, KFold
from sklearn.svm import SVR
import tqdm
from matplotlib import pyplot as plt
import copy
import gc
import pickle

In [3]:
%%bash
cp ../input/roberta-base-save/rob.zip .
cp ../input/roberta-base-save/rob_tok.zip .
unzip rob.zip
unzip rob_tok.zip 
rm -r rob.zip rob_tok.zip

Archive:  rob.zip
   creating: rob/
  inflating: rob/pytorch_model.bin   
  inflating: rob/config.json         
Archive:  rob_tok.zip
   creating: rob_tok/
  inflating: rob_tok/vocab.json      
  inflating: rob_tok/special_tokens_map.json  
  inflating: rob_tok/tokenizer_config.json  
  inflating: rob_tok/merges.txt      


In [4]:
from transformers import RobertaTokenizer, RobertaModel, AdamW, get_linear_schedule_with_warmup
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import StratifiedKFold
import tqdm
import gc
from sklearn.svm import SVR
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope as ho_scope

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [6]:
data = pd.read_csv('/kaggle/input/commonlitreadabilityprize/train.csv')
data.head()

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845


In [7]:
class ContinuousStratifiedKFold(StratifiedKFold):
    def split(self, X, y, groups=None):
        num_bins = int(np.floor(1 + np.log2(len(y))))
        bins = pd.cut(y, bins=num_bins, labels=False)
        return super().split(X, bins, groups)
    
class BERTRegressor(torch.nn.Module): 
    def __init__(self, pretrained_src = 'rob'): 
        super().__init__()
        self.bert = RobertaModel.from_pretrained(pretrained_src)
        self.linear = torch.nn.Linear(768, 1)
        self.dropout = torch.nn.Dropout(0.15)
        
    def forward(self, input_ids, attention_mask): #x - tokenized batch
        hidden = self.bert(input_ids, 
                           attention_mask=attention_mask)[0][:, 0, :]#CLS token output                                                          
        output = self.linear(self.dropout(hidden))
        return output

class RMSELoss(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.mse = torch.nn.MSELoss()
        
    def forward(self,yhat,y):
        loss = torch.sqrt(self.mse(yhat,y))
        return loss

def rmse_metric(y_true, y_pred):
    return np.sqrt(mse(y_true, y_pred))


def evaluate(dataloader_val, model):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in tqdm.notebook.tqdm(dataloader_val):
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1]
        }
        target = batch[2]

        with torch.no_grad():        
            output = model(**inputs)
            
        loss = criterion(output, target.view(-1,1))
        loss_val_total += loss

        output = output.detach().cpu().numpy()
        target = target.cpu().numpy()
        predictions.append(output)
        true_vals.append(target)
    
    loss_val_avg = loss_val_total / len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals


def get_bert_embeddings(embedder, dataloader, device = device):
    embedder.eval()
    embeddings_all = []
    for batch in tqdm.notebook.tqdm(dataloader):        
        batch = tuple(b.to(device) for b in batch)        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1]
                 }
        with torch.no_grad():        
            output = embedder(**inputs)[0][:, 0, :]#CLS token output 

        embeddings_batch = output.detach().cpu().numpy()
        embeddings_all.append(embeddings_batch)

    return np.vstack(embeddings_all)

In [8]:
tokenizer = RobertaTokenizer.from_pretrained(
    'rob_tok'
)

In [9]:
BATCH_SIZE = 16

warm_prop = 0.1
epochs = 8
clip = 1

In [10]:
kf = ContinuousStratifiedKFold(n_splits=5, shuffle=True, random_state=42)

print('START')
for k, (train_ids, val_ids) in enumerate(tqdm.notebook.tqdm(kf.split(X=data, y=data['target'].values))):
    print('**************')
    print('------------')
    print('**************')
    print(f'ITERATION {k} starts')

    gc.collect()

    print('------------')
    print('data slicing ...')

    data_train = data.iloc[train_ids]['excerpt'].values
    data_val = data.iloc[val_ids]['excerpt'].values
    print(f'train/val data shapes: {data_train.shape}, {data_val.shape}')
    target_train = data.iloc[train_ids]['target'].values
    target_val = data.iloc[val_ids]['target'].values


    print('------------')
    print('data preparation ...')
    encoded_data_train = tokenizer.batch_encode_plus(
        data_train,
        add_special_tokens=True,
        return_attention_mask=True,
        pad_to_max_length=True,
        max_length=512,
        return_tensors='pt',
    )

    encoded_data_val = tokenizer.batch_encode_plus(
        data_val,
        add_special_tokens=True,
        return_attention_mask=True,
        pad_to_max_length=True,
        max_length=512,
        return_tensors='pt'
    )


    input_ids_train = encoded_data_train['input_ids']
    attention_masks_train = encoded_data_train['attention_mask']
    values_train = torch.tensor(target_train, dtype=torch.float)

    input_ids_val = encoded_data_val['input_ids']
    attention_masks_val = encoded_data_val['attention_mask']
    values_val = torch.tensor(target_val, dtype=torch.float)

    dataset_train = TensorDataset(input_ids_train,
                                 attention_masks_train,
                                 values_train)
    dataset_val = TensorDataset(input_ids_val,
                                attention_masks_val,
                                values_val)

    dataloader_train = DataLoader(
        dataset_train,
        sampler=RandomSampler(dataset_train),
        batch_size=BATCH_SIZE
    )

    dataloader_val = DataLoader(
        dataset_val,
        sampler=RandomSampler(dataset_val),
        batch_size=2*BATCH_SIZE
    )

    print('------------')
    print('Roberta finetuning ...')

    model = BERTRegressor().to(device)
    criterion = RMSELoss()
    optimizer = AdamW(
        model.parameters(),
        lr= 3e-5,#the original paper:2e-5 -> 5e-5
        eps=1e-8
    )
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(len(dataloader_train)*epochs * warm_prop),
        num_training_steps=len(dataloader_train)*epochs
    )


    best_val_loss = float('inf')
    for epoch in tqdm.notebook.tqdm(range(epochs)):
        print(f'Epoch {epoch}')
        model.train()

        epoch_loss = 0
        for batch in tqdm.notebook.tqdm(dataloader_train):

            batch = tuple(b.to(device) for b in batch)
            inputs = {'input_ids':      batch[0],
                    'attention_mask': batch[1]
              }
            target = batch[2]

            optimizer.zero_grad()        

            output = model(**inputs)     
            loss = criterion(output, target.view(-1,1))      
            loss.backward()
            epoch_loss += loss.item()

            torch.nn.utils.clip_grad_norm_(model.parameters(), clip)      
            optimizer.step()
            scheduler.step()     

        val_loss, predictions, true_vals = evaluate(dataloader_val, model)
        if val_loss < best_val_loss:
            best_val_loss = val_loss        
            torch.save(model.state_dict(), f'roberta_base_{k}.pt')
            best_model = copy.deepcopy(model).cpu()
        train_loss = epoch_loss / len(dataloader_train)
        rmse_val = rmse_metric(true_vals, predictions)
        print('-------')
        print(f'Training loss: {train_loss}')
        print(f'Validation loss: {val_loss}')
        print(f"RMSE on validation: {rmse_val}")
    
    print('loaded best model with lm head performance...')
    dataloader_val_frozen = DataLoader(
        dataset_val,
        batch_size=2*BATCH_SIZE
    )  
    
    model = BERTRegressor()
    PATH = f'roberta_base_{k}.pt'
    model.load_state_dict(torch.load(PATH), strict=False)
    model.to(device)
    val_loss, predictions, true_vals = evaluate(dataloader_val_frozen, model)
    rmse_val = rmse_metric(true_vals, predictions)
    print(f"RMSE on validation: {rmse_val}")
    
    del model
    gc.collect()

    print('------------')
    print('Embeddings extraction ...')

    embedder = best_model.bert.to(device)    
    
    dataloader_train_frozen = DataLoader(
        dataset_train,
        batch_size=2*BATCH_SIZE,
    )

    embeddings_train = get_bert_embeddings(embedder, dataloader_train_frozen)
    embeddings_val = get_bert_embeddings(embedder, dataloader_val_frozen)
    
    del embedder
    gc.collect()
    
    print('------------')
    print('SVR head HP tuning ...')
    def hyperopt_train_test(params):
        estimator = SVR(**params)  
        estimator.fit(embeddings_train, target_train)
        preds = estimator.predict(embeddings_val)
        metric = rmse_metric(target_val, preds)
        return metric

    space_svr = {'C':  hp.loguniform('C', np.log(0.0001), np.log(1000)) - 0.0001,
                 'gamma':  hp.loguniform('gamma', np.log(0.0001), np.log(1000)) - 0.0001
                }
    def f(params):
        rmse_metric_val = hyperopt_train_test(params)
        return {'loss': rmse_metric_val, 'status': STATUS_OK}

    trials = Trials()
    best = fmin(f, space_svr, algo = tpe.suggest, max_evals = 50, trials=trials)
    print('best', best)


    svr_head = SVR(**best)    
    svr_head.fit(embeddings_train, target_train)
    filename = f'svr_head_{k}.pkl'
    pickle.dump(svr_head, open(filename, 'wb'))
    svr_head = pickle.load(open(filename, 'rb'))
    preds_svr = svr_head.predict(embeddings_val)
    rmse_val = rmse_metric(target_val, preds_svr)
    print(f"RMSE on validation: {rmse_val}")    

    print('------------')
    print(f'Iteration {k} completed.')
    
print('**************')
print('------------')
print('**************')    
print('FINISH')

START


0it [00:00, ?it/s]

**************
------------
**************
ITERATION 0 starts


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


------------
data slicing ...
train/val data shapes: (2267,), (567,)
------------
data preparation ...




------------
Roberta finetuning ...


  0%|          | 0/8 [00:00<?, ?it/s]

Epoch 0


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.8637828730361562
Validation loss: 0.7009828090667725
RMSE on validation: 0.7066155076026917
Epoch 1


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.5875214174599714
Validation loss: 0.6054416298866272
RMSE on validation: 0.6066137552261353
Epoch 2


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.4764502757871655
Validation loss: 0.5765679478645325
RMSE on validation: 0.5799669623374939
Epoch 3


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.3920715475376223
Validation loss: 0.6628894805908203
RMSE on validation: 0.6665931940078735
Epoch 4


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.3378178562286874
Validation loss: 0.6445518136024475
RMSE on validation: 0.6492509841918945
Epoch 5


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.2845879657272722
Validation loss: 0.5995919108390808
RMSE on validation: 0.6044906973838806
Epoch 6


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.2401280498630564
Validation loss: 0.6394162178039551
RMSE on validation: 0.6426202654838562
Epoch 7


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.21501628781708193
Validation loss: 0.6110585331916809
RMSE on validation: 0.6137080788612366
loaded best model with lm head performance...


  0%|          | 0/18 [00:00<?, ?it/s]

RMSE on validation: 0.5799669623374939
------------
Embeddings extraction ...


  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

------------
SVR head HP tuning ...

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?][A
  2%|▏         | 1/50 [00:05<04:47,  5.87s/trial, best loss: 0.4987192784363953][A
  4%|▍         | 2/50 [00:10<04:19,  5.41s/trial, best loss: 0.4987192784363953][A
  6%|▌         | 3/50 [00:16<04:16,  5.46s/trial, best loss: 0.4987192784363953][A
  8%|▊         | 4/50 [00:22<04:27,  5.82s/trial, best loss: 0.4987192784363953][A
 10%|█         | 5/50 [00:28<04:11,  5.60s/trial, best loss: 0.4987192784363953][A
 12%|█▏        | 6/50 [00:34<04:22,  5.96s/trial, best loss: 0.4987192784363953][A
 14%|█▍        | 7/50 [00:41<04:21,  6.08s/trial, best loss: 0.4987192784363953][A
 16%|█▌        | 8/50 [00:47<04:17,  6.12s/trial, best loss: 0.4987192784363953][A
 18%|█▊        | 9/50 [00:52<04:05,  5.99s/trial, best loss: 0.49762244668680317][A
 20%|██        | 10/50 [00:58<03:53,  5.83s/trial, best loss: 0.49730371805021567][A
 22%|██▏       | 11/50 [01:05<03:59,  6.14s/trial, best loss: 



------------
Roberta finetuning ...


  0%|          | 0/8 [00:00<?, ?it/s]

Epoch 0


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.8604462730212951
Validation loss: 0.5783791542053223
RMSE on validation: 0.5821547508239746
Epoch 1


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.5522583431341279
Validation loss: 0.5691306591033936
RMSE on validation: 0.5701982975006104
Epoch 2


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.42725971120763834
Validation loss: 0.6673387289047241
RMSE on validation: 0.6706773638725281
Epoch 3


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.3602023462594395
Validation loss: 0.6731212735176086
RMSE on validation: 0.6748705506324768
Epoch 4


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.28159619190953145
Validation loss: 0.6489837765693665
RMSE on validation: 0.6535226702690125
Epoch 5


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.25058110719415505
Validation loss: 0.6393280625343323
RMSE on validation: 0.6445451378822327
Epoch 6


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.19540103545910875
Validation loss: 0.6206755042076111
RMSE on validation: 0.6236213445663452
Epoch 7


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.1715990338526981
Validation loss: 0.6199091076850891
RMSE on validation: 0.6262379884719849
loaded best model with lm head performance...


  0%|          | 0/18 [00:00<?, ?it/s]

RMSE on validation: 0.5701982975006104
------------
Embeddings extraction ...


  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

------------
SVR head HP tuning ...

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?][A
  2%|▏         | 1/50 [00:06<05:12,  6.37s/trial, best loss: 1.0286026963684332][A
  4%|▍         | 2/50 [00:11<04:34,  5.71s/trial, best loss: 0.520674303301373] [A
  6%|▌         | 3/50 [00:18<04:46,  6.09s/trial, best loss: 0.520674303301373][A
  8%|▊         | 4/50 [00:23<04:25,  5.77s/trial, best loss: 0.520674303301373][A
 10%|█         | 5/50 [00:30<04:37,  6.16s/trial, best loss: 0.520674303301373][A
 12%|█▏        | 6/50 [00:36<04:24,  6.01s/trial, best loss: 0.5176895592331363][A
 14%|█▍        | 7/50 [00:42<04:31,  6.31s/trial, best loss: 0.5176895592331363][A
 16%|█▌        | 8/50 [00:49<04:32,  6.48s/trial, best loss: 0.5176895592331363][A
 18%|█▊        | 9/50 [00:56<04:23,  6.44s/trial, best loss: 0.5176895592331363][A
 20%|██        | 10/50 [01:01<04:05,  6.13s/trial, best loss: 0.5176895592331363][A
 22%|██▏       | 11/50 [01:08<04:03,  6.24s/trial, best loss: 0.517



------------
Roberta finetuning ...


  0%|          | 0/8 [00:00<?, ?it/s]

Epoch 0


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.9099630167786504
Validation loss: 0.7930133938789368
RMSE on validation: 0.797556459903717
Epoch 1


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.5756030366034575
Validation loss: 0.5562911033630371
RMSE on validation: 0.5606646537780762
Epoch 2


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.47545298409294073
Validation loss: 0.5188327431678772
RMSE on validation: 0.520509660243988
Epoch 3


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.3853005585326275
Validation loss: 0.5954340696334839
RMSE on validation: 0.5978075265884399
Epoch 4


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.31850494265976087
Validation loss: 0.607792317867279
RMSE on validation: 0.6096960306167603
Epoch 5


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.28835182185743896
Validation loss: 0.6432875990867615
RMSE on validation: 0.6463445425033569
Epoch 6


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.23093906936930939
Validation loss: 0.56149822473526
RMSE on validation: 0.568289041519165
Epoch 7


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.20479331008145507
Validation loss: 0.5972393751144409
RMSE on validation: 0.6007833480834961
loaded best model with lm head performance...


  0%|          | 0/18 [00:00<?, ?it/s]

RMSE on validation: 0.520509660243988
------------
Embeddings extraction ...


  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

------------
SVR head HP tuning ...

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?][A
  2%|▏         | 1/50 [00:04<04:00,  4.92s/trial, best loss: 0.5015138925318672][A
  4%|▍         | 2/50 [00:11<04:32,  5.67s/trial, best loss: 0.5015138925318672][A
  6%|▌         | 3/50 [00:17<04:42,  6.00s/trial, best loss: 0.5015138925318672][A
  8%|▊         | 4/50 [00:24<04:45,  6.20s/trial, best loss: 0.5015138925318672][A
 10%|█         | 5/50 [00:30<04:48,  6.42s/trial, best loss: 0.5015138925318672][A
 12%|█▏        | 6/50 [00:36<04:24,  6.01s/trial, best loss: 0.4962983951751165][A
 14%|█▍        | 7/50 [00:42<04:30,  6.28s/trial, best loss: 0.4962983951751165][A
 16%|█▌        | 8/50 [00:48<04:08,  5.93s/trial, best loss: 0.4962983951751165][A
 18%|█▊        | 9/50 [00:54<04:10,  6.10s/trial, best loss: 0.4962983951751165][A
 20%|██        | 10/50 [00:59<03:50,  5.76s/trial, best loss: 0.4962983951751165][A
 22%|██▏       | 11/50 [01:05<03:51,  5.94s/trial, best loss: 0.



------------
Roberta finetuning ...


  0%|          | 0/8 [00:00<?, ?it/s]

Epoch 0


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.985234871506691
Validation loss: 0.5714734196662903
RMSE on validation: 0.582249641418457
Epoch 1


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.6143566594577171
Validation loss: 0.5125060081481934
RMSE on validation: 0.5198466181755066
Epoch 2


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.4801234947123998
Validation loss: 0.6389082074165344
RMSE on validation: 0.6410710215568542
Epoch 3


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.4021280588398517
Validation loss: 0.595134437084198
RMSE on validation: 0.6008562445640564
Epoch 4


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.3200328840126454
Validation loss: 0.5354238748550415
RMSE on validation: 0.5387282967567444
Epoch 5


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.2786484851803578
Validation loss: 0.5327540636062622
RMSE on validation: 0.5356187224388123
Epoch 6


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.24003794757832944
Validation loss: 0.5979734659194946
RMSE on validation: 0.6005948185920715
Epoch 7


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.20335711339409923
Validation loss: 0.5671378374099731
RMSE on validation: 0.5698550939559937
loaded best model with lm head performance...


  0%|          | 0/18 [00:00<?, ?it/s]

RMSE on validation: 0.5198466181755066
------------
Embeddings extraction ...


  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

------------
SVR head HP tuning ...

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?][A
  2%|▏         | 1/50 [00:05<04:46,  5.84s/trial, best loss: 0.49557003523592175][A
  4%|▍         | 2/50 [00:12<05:11,  6.49s/trial, best loss: 0.49557003523592175][A
  6%|▌         | 3/50 [00:18<04:50,  6.18s/trial, best loss: 0.4755706375706529] [A
  8%|▊         | 4/50 [00:25<04:56,  6.45s/trial, best loss: 0.4755706375706529][A
 10%|█         | 5/50 [00:31<04:51,  6.48s/trial, best loss: 0.4755706375706529][A
 12%|█▏        | 6/50 [00:38<04:41,  6.40s/trial, best loss: 0.4755706375706529][A
 14%|█▍        | 7/50 [00:43<04:22,  6.10s/trial, best loss: 0.4755706375706529][A
 16%|█▌        | 8/50 [00:49<04:13,  6.05s/trial, best loss: 0.4755706375706529][A
 18%|█▊        | 9/50 [00:55<04:01,  5.89s/trial, best loss: 0.4721788929712369][A
 20%|██        | 10/50 [01:01<03:57,  5.93s/trial, best loss: 0.4721788929712369][A
 22%|██▏       | 11/50 [01:07<03:57,  6.10s/trial, best loss:



------------
Roberta finetuning ...


  0%|          | 0/8 [00:00<?, ?it/s]

Epoch 0


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.9007599798726363
Validation loss: 0.6977971196174622
RMSE on validation: 0.7105844616889954
Epoch 1


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.5683708988444905
Validation loss: 0.5107472538948059
RMSE on validation: 0.5166632533073425
Epoch 2


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.4695585384755067
Validation loss: 0.6512863636016846
RMSE on validation: 0.6525678634643555
Epoch 3


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.3965828016820088
Validation loss: 0.6408436894416809
RMSE on validation: 0.6455747485160828
Epoch 4


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.31793836451752083
Validation loss: 0.6200503706932068
RMSE on validation: 0.6234598755836487
Epoch 5


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.27360694057924645
Validation loss: 0.6620919108390808
RMSE on validation: 0.6646731495857239
Epoch 6


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.23105201870203018
Validation loss: 0.6100252270698547
RMSE on validation: 0.6145696640014648
Epoch 7


  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

-------
Training loss: 0.20556354580420844
Validation loss: 0.6182405948638916
RMSE on validation: 0.6192435026168823
loaded best model with lm head performance...


  0%|          | 0/18 [00:00<?, ?it/s]

RMSE on validation: 0.5166632533073425
------------
Embeddings extraction ...


  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

------------
SVR head HP tuning ...

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?][A
  2%|▏         | 1/50 [00:06<05:36,  6.87s/trial, best loss: 1.032663477370938][A
  4%|▍         | 2/50 [00:13<05:31,  6.90s/trial, best loss: 1.032663477370938][A
  6%|▌         | 3/50 [00:20<05:19,  6.81s/trial, best loss: 1.032663477370938][A
  8%|▊         | 4/50 [00:26<04:56,  6.45s/trial, best loss: 0.4916965691596335][A
 10%|█         | 5/50 [00:32<04:47,  6.38s/trial, best loss: 0.4916965691596335][A
 12%|█▏        | 6/50 [00:38<04:25,  6.04s/trial, best loss: 0.4916592974774053][A
 14%|█▍        | 7/50 [00:44<04:23,  6.13s/trial, best loss: 0.4916592974774053][A
 16%|█▌        | 8/50 [00:49<04:09,  5.93s/trial, best loss: 0.4916592974774053][A
 18%|█▊        | 9/50 [00:55<04:00,  5.86s/trial, best loss: 0.4916592974774053][A
 20%|██        | 10/50 [01:01<04:00,  6.02s/trial, best loss: 0.4916592974774053][A
 22%|██▏       | 11/50 [01:08<04:06,  6.31s/trial, best loss: 0.491

In [11]:
!rm -r rob rob_tok