In [1]:
!pip install optuna

[0m

In [60]:
################ NOTES #####################
#  - based on https://github.com/gilfernandes/commonlit/blob/main/53_pytorch_transformers_deberta_large.ipynb
#
#
#
#
#
#
############################################


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import gc, warnings, random, time, os

from pathlib import Path

from tqdm.notebook import tqdm

warnings.filterwarnings('ignore')

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim import Adam, lr_scheduler
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW
from transformers import AutoModel, AutoTokenizer, AutoConfig
from transformers import get_cosine_schedule_with_warmup

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

import seaborn as sns

import gc
gc.enable()

import optuna

import os

OUTPUT_DIR = './'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR
     
TOKENIZERS_PARALLELISM = False
                
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
                
'''
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



'''                

SyntaxError: invalid syntax (3309465316.py, line 49)

### Folders and Dataframes

In [3]:
DATA_PATH = "../input/feedback-prize-english-language-learning"
#assert DATA_PATH.exists()
MODELS_PATH = "../input/debertav3base"
#if not MODELS_PATH.exists():
#    os.mkdir(MODELS_PATH)
#assert MODELS_PATH.exists()

In [4]:
train_df = pd.read_csv(DATA_PATH + '/train.csv')
test_df = pd.read_csv(DATA_PATH + '/test.csv')
sample_df = pd.read_csv(DATA_PATH + '/sample_submission.csv')

In [5]:
def remove_unnecessary(df):
    df.drop(df[df['syntax'] == 0].index, inplace=True)
    df.reset_index(drop=True, inplace=True)
    
remove_unnecessary(train_df)

In [11]:
train_df

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5
...,...,...,...,...,...,...,...,...
3906,FFD29828A873,I believe using cellphones in class for educat...,2.5,3.0,3.0,3.5,2.5,2.5
3907,FFD9A83B0849,"Working alone, students do not have to argue w...",4.0,4.0,4.0,4.0,3.5,3.0
3908,FFDC4011AC9C,"""A problem is a chance for you to do your best...",2.5,3.0,3.0,3.0,3.5,3.0
3909,FFE16D704B16,Many people disagree with Albert Schweitzer's ...,4.0,4.5,4.5,4.0,4.5,4.5


### Config and Seeding

In [12]:
class Config(): 
    NUM_FOLDS = 6
    NUM_EPOCHS = 3
    BATCH_SIZE = 2
    MAX_LEN = 248  #experiment with 512
    EVAL_SCHEDULE = [(0.50, 16), (0.49, 8), (0.48, 4), (0.47, 2), (-1., 1)]
    ROBERTA_PATH = 'microsoft/deberta-v3-base'
    TOKENIZER_PATH = 'microsoft/deberta-v3-base'
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    SEED = 3
    NUM_WORKERS = 2
    MODEL_FOLDER = MODELS_PATH #MODELS_PATH = "../input/debertav3base"
    model_name = 'microsoft/deberta-v3-base'
    svm_kernels = ['rbf']
    svm_c = 5
    target_cols=['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

cfg = Config()

In [7]:
#if not cfg.MODEL_FOLDER.exists():
#    os.mkdir(cfg.MODEL_FOLDER)

In [7]:
def set_random_seed(random_seed):
    random.seed(random_seed)
    np.random.seed(random_seed)
    os.environ["PYTHONHASHSEED"] = str(random_seed)

    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)

    torch.backends.cudnn.deterministic = True

### Dataset

In [8]:
def add_bins(train_df, num_bins):
    train_df.loc[:, 'bins'] = pd.cut(train_df['syntax'], bins=num_bins, labels=False)
    return num_bins

In [9]:
add_bins(train_df, cfg.NUM_FOLDS)

6

In [13]:
train_df.groupby(['bins'])[cfg.target_cols].agg(['count', 'mean'])

Unnamed: 0_level_0,cohesion,cohesion,syntax,syntax,vocabulary,vocabulary,phraseology,phraseology,grammar,grammar,conventions,conventions
Unnamed: 0_level_1,count,mean,count,mean,count,mean,count,mean,count,mean,count,mean
bins,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
0,40,1.6125,40,1.3625,40,1.9625,40,1.7375,40,1.6875,40,1.6125
1,410,2.439024,410,2.0,410,2.693902,410,2.45,410,2.326829,410,2.389024
2,2089,2.96673,2089,2.799186,2089,3.08832,2089,2.931067,2089,2.8382,2089,2.910244
3,867,3.448097,867,3.5,867,3.491926,867,3.452134,867,3.401384,867,3.427336
4,388,3.813144,388,4.0,388,3.795103,388,3.829897,388,3.742268,388,3.725515
5,117,4.264957,117,4.57265,117,4.448718,117,4.393162,117,4.358974,117,4.354701


In [14]:
tokenizer = AutoTokenizer.from_pretrained(cfg.TOKENIZER_PATH)

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [15]:
#get target_cols
target_cols=['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
class CommonLitDataset(Dataset):
    def __init__(self, df, tokenizer, inference_only=False):
        super().__init__()
        self.df, self.inference_only = df, inference_only
        self.text = df['full_text'].tolist()
        self.bins = df['bins']
        if not inference_only:
            self.target = torch.tensor(df['syntax'].to_numpy(), dtype = torch.float32)
        
        self.encoded = tokenizer.batch_encode_plus(
            self.text,
            padding = 'max_length',
            max_length = cfg.MAX_LEN,
            truncation = True,
            return_attention_mask=True
        )
        
    def __getitem__(self, index):        
        input_ids = torch.tensor(self.encoded['input_ids'][index])
        attention_mask = torch.tensor(self.encoded['attention_mask'][index])
        
        if self.inference_only:
            return {'input_ids': input_ids, 'attention_mask': attention_mask}
        else:
            target = self.target[index]
            return {'input_ids': input_ids, 'attention_mask': attention_mask, 'target': target}
    
    def __len__(self):
        return len(self.df)

In [16]:
sample_ds = CommonLitDataset(train_df, tokenizer)

### Model

In [17]:
class AttentionHead(nn.Module):
    
    def __init__(self, in_features, hidden_dim, num_targets):
        super().__init__()
        self.in_features = in_features
        
        self.hidden_layer = nn.Linear(in_features, hidden_dim)
        self.final_layer = nn.Linear(hidden_dim, num_targets)
        self.out_features = hidden_dim
        
    def forward(self, features):
        att = torch.tanh(self.hidden_layer(features))
        score = self.final_layer(att)
        attention_weights = torch.softmax(score, dim=1)
        return attention_weights

In [18]:
class CommonLitModel(nn.Module):
    def __init__(self):
        super(CommonLitModel, self).__init__()
        config = AutoConfig.from_pretrained(cfg.ROBERTA_PATH)
        config.update({
            "output_hidden_states": True,
            "hidden_dropout_prob": 0.0,
            "layer_norm_eps": 1e-7
        })
        self.transformer_model = AutoModel.from_pretrained(cfg.ROBERTA_PATH, config=config)
        self.attention = AttentionHead(config.hidden_size, 512, 1)
        self.regressor = nn.Linear(config.hidden_size, 1)
    
    def forward(self, input_ids, attention_mask):
        last_layer_hidden_states = self.transformer_model(input_ids=input_ids, attention_mask=attention_mask)['last_hidden_state']
        weights = self.attention(last_layer_hidden_states)
        context_vector = torch.sum(weights * last_layer_hidden_states, dim=1) 
        return self.regressor(context_vector), context_vector

In [19]:
sample_model = CommonLitModel()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Downloading:   0%|          | 0.00/354M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.dense.weight', 'mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [20]:
import re

for i, (name, param) in enumerate(sample_model.named_parameters()):
    if(name.find('layer') > -1):
        layer_name = re.sub(r'.+(layer\.\d+).+', r'\1', name)

In [36]:
for i, (name, param) in enumerate(sample_model.named_parameters()):
    print(i, name, param.size())

0 transformer_model.embeddings.word_embeddings.weight torch.Size([128100, 768])
1 transformer_model.embeddings.LayerNorm.weight torch.Size([768])
2 transformer_model.embeddings.LayerNorm.bias torch.Size([768])
3 transformer_model.encoder.layer.0.attention.self.query_proj.weight torch.Size([768, 768])
4 transformer_model.encoder.layer.0.attention.self.query_proj.bias torch.Size([768])
5 transformer_model.encoder.layer.0.attention.self.key_proj.weight torch.Size([768, 768])
6 transformer_model.encoder.layer.0.attention.self.key_proj.bias torch.Size([768])
7 transformer_model.encoder.layer.0.attention.self.value_proj.weight torch.Size([768, 768])
8 transformer_model.encoder.layer.0.attention.self.value_proj.bias torch.Size([768])
9 transformer_model.encoder.layer.0.attention.output.dense.weight torch.Size([768, 768])
10 transformer_model.encoder.layer.0.attention.output.dense.bias torch.Size([768])
11 transformer_model.encoder.layer.0.attention.output.LayerNorm.weight torch.Size([768])
12

In [37]:
#experiment and change this to 6,248 when adding target_cols to commonlitdataset()

sample_input_ids = torch.randint(0, 1000, [8, 248])
sample_attention_mask = torch.randint(0, 1000, [8, 248])

In [38]:
sample_model(sample_input_ids, sample_attention_mask)[1].shape

torch.Size([8, 768])

In [39]:
torch.sum(torch.randn([8, 496, 768]), axis=1)

tensor([[-46.5890,  -3.7759,  13.3933,  ...,  -1.5699,  -1.8865, -40.4611],
        [-20.2272,  -7.4253,   3.0888,  ...,  -3.4732,   9.5524, -19.9908],
        [ 30.7539,  48.9657, -11.1575,  ..., -15.0397,  12.2902,  34.1028],
        ...,
        [ -4.2911,   0.1317,  12.5833,  ...,  -6.6508,  14.5507,   3.0757],
        [ 20.2373,  13.6206, -39.9987,  ...,  12.7297,  10.9560,  21.3071],
        [-11.3111,  -1.5464,  19.2736,  ...,  23.6864, -20.7362,  -8.4660]])

### Evaluation and Prediction

In [40]:
def eval_mse(model, data_loader):
    model.eval()
    mse_sum = 0
    mse_loss = nn.MSELoss(reduction='sum')
    
    with torch.no_grad():
        for batch_num, record in enumerate(data_loader):
            input_ids, attention_mask, target = record['input_ids'].to(cfg.DEVICE), record['attention_mask'].to(cfg.DEVICE), record['target'].to(cfg.DEVICE)
            pred, _ = model(input_ids, attention_mask)
            mse_sum += mse_loss(pred.flatten().cpu(), target.cpu())
            
    return mse_sum / len(data_loader.dataset)

In [41]:
def predict(model, data_loader):
    model.eval()
    result = []
    
    with torch.no_grad():
        for batch_num, record in tqdm(enumerate(data_loader), total=len(data_loader)):
            input_ids, attention_mask = record['input_ids'].to(cfg.DEVICE), record['attention_mask'].to(cfg.DEVICE)
            pred, _ = model(input_ids, attention_mask)
            result.extend(pred.flatten().to("cpu").tolist())
            
    return np.array(result)

In [42]:
sample_dl = DataLoader(sample_ds, shuffle=False, batch_size=16, num_workers=1)

### Optimizer and Sampler

In [43]:
5e-5 / 2.5, 5e-5 / 0.5, 5e-5

(2e-05, 0.0001, 5e-05)

In [44]:
def create_optimizer(model, base_lr=5e-5, last_lr=None):
    
    #layer wise learning. numerical arguments are from model.named_parameters, see above for list
    named_parameters = list(model.named_parameters())
    
    attention_param_start = 194 #end of last layer, 0-11 layers, layer 11 end
    regressor_param_start = 206 #named parameter regressor.weight
    roberta_parameters = named_parameters[:198] #transformer_model.pooler.dense.weight
    attention_parameters = named_parameters[202:regressor_param_start] #attention.hidden_layer_weight
    regressor_parameters = named_parameters[regressor_param_start:]
    
    attention_group = [params for (name, params) in attention_parameters]
    regressor_group = [params for (name, params) in regressor_parameters]
    
    parameters = []
    if last_lr is not None:
        parameters.append({"params": attention_group, "lr": last_lr})
        parameters.append({"params": regressor_group, "lr": last_lr})
    else:
        parameters.append({"params": attention_group})
        parameters.append({"params": regressor_group})
        
    # Change on different models
    layer_low_threshold = 99
    layer_middle_threshold = 130
    
    for layer_num, (name, params) in enumerate(roberta_parameters):
        weight_decay = 0.0 if 'bias' in name else 0.01
        
        lr = base_lr / 2.5 # 2e-05
        if layer_num >= layer_middle_threshold:
            lr = base_lr / 0.5 # 1e-4
        elif layer_num >= layer_low_threshold:        
            lr = base_lr    
            
        parameters.append({"params": params,
                           "weight_decay": weight_decay,
                           "lr": lr})
        
    return AdamW(parameters)

In [45]:
sample_optimizer = create_optimizer(sample_model)

In [46]:
from torch.utils.data import Sampler,SequentialSampler,RandomSampler,SubsetRandomSampler
from collections import Counter

class WeightedSampler(Sampler):
    
    def __init__(self, dataset):
        
        self.indices = list(range(len(dataset)))
        self.num_samples = len(dataset)
        self.label_to_count = dict(Counter(dataset.bins))
        weights = [1/self.label_to_count[i] for i in dataset.bins]
        
        self.weights = torch.tensor(weights,dtype=torch.double)
        
    def __iter__(self):
        count = 0
        index = [self.indices[i] for i in torch.multinomial(self.weights, self.num_samples, replacement=True)]
        while count < self.num_samples:
            yield index[count]
            count += 1
    
    def __len__(self):
        return self.num_samples

### Training

In [47]:
def choose_eval_period(val_rmse):
    for rmse, period in cfg.EVAL_SCHEDULE:
        if val_rmse >= rmse:
            return period

In [55]:
def serialize_best(best_val_rmse, best_epoch, val_rmse, epoch, model, model_path):
    if not best_val_rmse or val_rmse < best_val_rmse:
        best_val_rmse = val_rmse
        best_epoch = epoch
        #if not model_path.parent.exists():
        #    os.makedirs(model_path.parent)
        
        #torch.save(model.state_dict(), model_path)
        torch.save(model.state_dict(), OUTPUT_DIR + "/" + f"{cfg.model_name.replace('/', '-')}_fold{fold}_best.pt")

        print(f"New best_val_rmse: {best_val_rmse:0.4}")
    else:       
        print(f"Still best_val_rmse: {best_val_rmse:0.4}",
              f"(from epoch {best_epoch})")
    return best_epoch, best_val_rmse

In [49]:
class Trainer():
    def __init__(self, scaler, model, model_path, train_loader, val_loader, optimizer, scheduler=None, num_epochs=cfg.NUM_EPOCHS):
        self.scaler, self.model, self.model_path, self.train_loader, self.val_loader, self.optimizer, self.scheduler, self.num_epochs = (
            scaler, model, model_path, train_loader, val_loader, optimizer, scheduler, num_epochs
        )
            
    def train(self):
        self.model.train()
        
        mse_loss = nn.MSELoss(reduction='mean')
        
        best_val_rmse = None
        best_epoch = 0
        step = 0
        last_eval_step = 0
        eval_period = cfg.EVAL_SCHEDULE[0][1]    

        start = time.time()
        
        tbar = tqdm(range(self.num_epochs), total=self.num_epochs)
        for epoch in tbar:
            tbar.set_description(f'Epoch: {epoch}')
            val_rmse = None
            for batch_num, record in enumerate(self.train_loader):
                input_ids, attention_mask, target = record['input_ids'].to(cfg.DEVICE), record['attention_mask'].to(cfg.DEVICE), record['target'].to(cfg.DEVICE)
                
                self.optimizer.zero_grad()
                
                # Casts operations to mixed precision
                with torch.cuda.amp.autocast():
                    pred, _ = self.model(input_ids, attention_mask)
                    mse = mse_loss(pred.flatten(), target)
                    
                self.scaler.scale(mse).backward()
                self.scaler.step(self.optimizer)
                self.scaler.update()
                
                if self.scheduler:
                    self.scheduler.step()
                    
                if step >= last_eval_step + eval_period:
                    elapsed_seconds = time.time() - start
                    num_steps = step - last_eval_step
                    print(f"\n{num_steps} steps took {elapsed_seconds:0.3} seconds")
                    last_eval_step = step
                    
                    val_rmse = np.sqrt(eval_mse(self.model, self.val_loader))
                    print(f"Epoch: {epoch} batch_num: {batch_num}", f"val_rmse: {val_rmse:0.4} ", end='')
                    
                    eval_period = choose_eval_period(val_rmse)
                    best_epoch, best_val_rmse = serialize_best(best_val_rmse, best_epoch, val_rmse, epoch, self.model, self.model_path)
                    start = time.time()
                    
                    
                    
                # Finish early on condition
                if epoch > 0 and best_val_rmse > 0.6:
                    return best_val_rmse
                
                step += 1
        return best_val_rmse

In [50]:
kfold = KFold(n_splits=cfg.NUM_FOLDS, random_state=cfg.SEED, shuffle=True)
splits = list(kfold.split(train_df))

### Main Training

In [52]:
def train_fold(base_lr, last_lr, fold = 0):
    
    print(f'##### Using fold {fold}')
    
    model_path = cfg.MODEL_FOLDER + f"/{cfg.model_name.replace('/', '_')}_{fold + 1}/model_{fold + 1}.pth"
    
    set_random_seed(cfg.SEED + fold)
   
    
    #tokenizer = AutoTokenizer.from_pretrained(cfg.TOKENIZER_PATH)
    tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)
    
    train_indices, val_indices = splits[fold]
    train_dataset = CommonLitDataset(train_df.loc[train_indices], tokenizer)    
    val_dataset = CommonLitDataset(train_df.loc[val_indices], tokenizer)
    
    train_loader = DataLoader(train_dataset, batch_size=cfg.BATCH_SIZE,
                              drop_last=False, shuffle=True, num_workers=cfg.NUM_WORKERS)    
    val_loader = DataLoader(val_dataset, batch_size=cfg.BATCH_SIZE,
                            drop_last=False, shuffle=False, num_workers=cfg.NUM_WORKERS)
    
    set_random_seed(cfg.SEED + fold)
    
    model = CommonLitModel().to(cfg.DEVICE)
    
    optimizer = create_optimizer(model, base_lr=base_lr, last_lr=last_lr)
    
    scheduler = get_cosine_schedule_with_warmup(optimizer,
                                                num_training_steps=cfg.NUM_EPOCHS * len(train_loader), 
                                                num_warmup_steps=50)
    
    scaler = torch.cuda.amp.GradScaler()
    
    trainer = Trainer(scaler, model, model_path, train_loader, val_loader, optimizer, scheduler = scheduler)
    rmse_val = trainer.train()
    tokenizer.save_pretrained(str(model_path.parent))
    
    return rmse_val

In [49]:
# Best results
# fold 0: {'base_lr': 4.214048623230046e-05, 'last_lr': 0.00098671139242345}. Best is trial 0 with value: 0.46920305490493774.
# fold 1: {'base_lr': 3.4594372607385946e-05, 'last_lr': 0.0005479134338105077}. Best is trial 0 with value: 0.447492390871048
# fold 2: {'base_lr': 1.777623134028703e-05, 'last_lr': 0.004132549020616918}. Best is trial 0 with value: 0.46756473183631897
# fold 3: {'base_lr': 3.933402254716856e-05, 'last_lr': 0.0018473297738188957}. Best is trial 11 with value: 0.4719877541065216
# fold 4: {'base_lr': 1.845975941382356e-05, 'last_lr': 0.0006309278277674714}. Best is trial 15 with value: 0.46920618414878845
# fold 5: {'base_lr': 4.430444436442592e-05, 'last_lr': 0.000289231685619846}. Best is trial 6 with value: 0.4629150927066803

In [53]:
# 'base_lr': 6.589032198953331e-05, 'last_lr': 0.00022464473383019027,
# {'base_lr': 6.589032198953331e-05, 'last_lr':0.00022464473383019027},
lr_list = [
    
    {'base_lr': 6.589032198953331e-05, 'last_lr':0.00022464473383019027},
    {'base_lr': 6.589032198953331e-05, 'last_lr':0.00022464473383019027},
    {'base_lr': 6.589032198953331e-05, 'last_lr':0.00022464473383019027},
    {'base_lr': 6.589032198953331e-05, 'last_lr':0.00022464473383019027},
    {'base_lr': 6.589032198953331e-05, 'last_lr':0.00022464473383019027},
    {'base_lr': 6.589032198953331e-05, 'last_lr':0.00022464473383019027}
    
]

In [None]:
%%time

rmse_values = []
for i in range(len(list(splits))):
    fold = i
    lrs = lr_list[fold]
    rmse_val = train_fold(lrs['base_lr'], lrs['last_lr'], fold=fold)
    print(f'Final RMSE: {rmse_val}')
    rmse_values.append(rmse_val)

##### Using fold 0


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.dense.weight', 'mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT e

  0%|          | 0/3 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)

16 steps took 2.27 seconds
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism

In [39]:
f'mean RMSE values: {np.mean(np.array(rmse_values))}'

'mean RMSE values: 0.47969725728034973'

### Verify the model

In [40]:
from sklearn.svm import SVR
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
from tqdm.notebook import tqdm

In [41]:
cfg.model_offset = 0
cfg.model_limit = 6
cfg.n_folds = 5
cfg.svm_kernels = ['rbf']
cfg.svm_c = 5

In [42]:
num_bins = int(np.ceil(np.log2(len(train_df))))
train_df['bins'] = pd.cut(train_df['target'], bins=num_bins, labels=False)
bins = train_df['bins'].values

In [43]:
%%time

inference_models = []
for i in range(1, cfg.NUM_FOLDS + 1):
    print(f'Model {i}')
    inference_model = CommonLitModel()
    inference_model = inference_model.cuda()
    inference_model.load_state_dict(torch.load(str(MODELS_PATHf"/{cfg.model_name.replace('/', '_')}_{i}/model_{i}.pth")))
    inference_model.eval();
    inference_models.append(inference_model)

Model 1


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']

- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).

- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model 2


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']

- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).

- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model 3


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']

- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).

- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model 4


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']

- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).

- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model 5


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']

- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).

- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model 6


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']

- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).

- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


CPU times: user 21.9 s, sys: 7.68 s, total: 29.6 s

Wall time: 36.2 s


In [44]:
from transformers import RobertaTokenizer

tokenizers = []
for i in range(1, cfg.NUM_FOLDS):
    tokenizer = RobertaTokenizer.from_pretrained(MODELS_PATH + f"/{cfg.model_name.replace('/', '_')}_{i}")
    tokenizers.append(tokenizer)

In [45]:
def get_cls_embeddings(dl, transformer_model):
    cls_embeddings = []
    with torch.no_grad():
        for input_features in tqdm(dl, total=len(dl)):
            output, context_vector = transformer_model(input_features['input_ids'].cuda(), input_features['attention_mask'].cuda())
#             cls_embeddings.extend(output['last_hidden_state'][:,0,:].detach().cpu().numpy())
            embedding_out = context_vector.detach().cpu().numpy()
            cls_embeddings.extend(embedding_out)
    return np.array(cls_embeddings)

In [46]:
def rmse_score(X, y):
    return np.sqrt(mean_squared_error(X, y))

In [47]:
def convert_to_list(t):
    return t.flatten().long()

class CommonLitDataset(nn.Module):
    def __init__(self, text, test_id, tokenizer, max_len=128):
        self.excerpt = text
        self.test_id = test_id
        self.max_len = max_len
        self.tokenizer = tokenizer
    
    def __getitem__(self,idx):
        encode = self.tokenizer(self.excerpt[idx],
                                return_tensors='pt',
                                max_length=self.max_len,
                                padding='max_length',
                                truncation=True)
        return {'input_ids': convert_to_list(encode['input_ids']),
                'attention_mask': convert_to_list(encode['attention_mask']),
                'id': self.test_id[idx]}
    
    def __len__(self):
        return len(self.excerpt)

In [48]:
def create_dl(df, tokenizer):
    text = df['excerpt'].values
    ids = df['id'].values
    ds = CommonLitDataset(text, ids, tokenizer, max_len=cfg.MAX_LEN)
    return DataLoader(ds, 
                      batch_size = cfg.BATCH_SIZE,
                      shuffle=False,
                      num_workers = 1,
                      pin_memory=True,
                      drop_last=False
                     )

In [49]:
train_df = pd.read_csv(DATA_PATH/'train-orig.csv')
test_df = pd.read_csv(DATA_PATH/'test.csv')
remove_unnecessary(train_df)

In [50]:
train_target_mean = train_df['target'].mean()
train_target_std = train_df['target'].std()
train_df['normalized_target'] = (train_df['target'] - train_target_mean) / train_target_std

In [51]:
%%time

train_target = train_df['normalized_target'].values

def calc_mean(scores):
    return np.mean(np.array(scores), axis=0)

final_scores = []
final_rmse = []
kernel_rmse_score_mean = []
final_kernel_predictions_means = []
for j, (inference_model, tokenizer) in enumerate(zip(inference_models, tokenizers)):
    print('Model', j)
    test_dl = create_dl(test_df, tokenizer)
    train_dl = create_dl(train_df, tokenizer)
    transformer_model = inference_model
    transformer_model.cuda()
    X = get_cls_embeddings(train_dl, transformer_model)
    
    y = train_target
    X_test = get_cls_embeddings(test_dl, transformer_model)
    
    kfold = StratifiedKFold(n_splits=cfg.NUM_FOLDS)
    scores = []
    rmse_scores = []
    kernel_predictions_means = []
    for kernel in cfg.svm_kernels:
        print('Kernel', kernel)
        kernel_scores = []
        kernel_rmse_scores = []
        kernel_predictions = []
        for k, (train_idx, valid_idx) in enumerate(kfold.split(X, bins)):

            print('Fold', k, train_idx.shape, valid_idx.shape)
            model = SVR(C=cfg.svm_c, kernel=kernel, gamma='auto')

            X_train, y_train = X[train_idx], y[train_idx]
            X_valid, y_valid = X[valid_idx], y[valid_idx]
            model.fit(X_train, y_train)
            prediction = model.predict(X_valid)
            kernel_predictions.append(prediction)
            kernel_rmse_scores.append(rmse_score(prediction, y_valid))
            print('rmse_score', kernel_rmse_scores[k])
            kernel_scores.append(model.predict(X_test))
        kernel_predictions_means.append(np.array([np.mean(kp) for kp in kernel_predictions]).mean())
        scores.append(calc_mean(kernel_scores))
        kernel_rmse_score = calc_mean(kernel_rmse_scores)
        kernel_rmse_score_mean.append(kernel_rmse_score)
        rmse_scores.append(kernel_rmse_score)
    final_kernel_predictions_means.append(kernel_predictions_means)
    final_scores.append(calc_mean(scores))
    final_rmse.append(calc_mean(rmse_scores))
print('FINAL RMSE score', np.mean(np.array(final_rmse)))

Model 0


HBox(children=(FloatProgress(value=0.0, max=178.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))



Kernel rbf

Fold 0 (2360,) (473,)

rmse_score 0.30501553882506316

Fold 1 (2361,) (472,)

rmse_score 0.2942799843912106

Fold 2 (2361,) (472,)

rmse_score 0.2873513938158079

Fold 3 (2361,) (472,)

rmse_score 0.2724334336058498

Fold 4 (2361,) (472,)

rmse_score 0.29661669001236746

Fold 5 (2361,) (472,)

rmse_score 0.3089690509844019

Model 1


HBox(children=(FloatProgress(value=0.0, max=178.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))



Kernel rbf

Fold 0 (2360,) (473,)

rmse_score 0.26543873333068385

Fold 1 (2361,) (472,)

rmse_score 0.29878920224596667

Fold 2 (2361,) (472,)

rmse_score 0.27060644162311037

Fold 3 (2361,) (472,)

rmse_score 0.2701009547627859

Fold 4 (2361,) (472,)

rmse_score 0.2754783996412547

Fold 5 (2361,) (472,)

rmse_score 0.29005162786728933

Model 2


HBox(children=(FloatProgress(value=0.0, max=178.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))



Kernel rbf

Fold 0 (2360,) (473,)

rmse_score 0.2745643329844541

Fold 1 (2361,) (472,)

rmse_score 0.266427782722148

Fold 2 (2361,) (472,)

rmse_score 0.25850193827111845

Fold 3 (2361,) (472,)

rmse_score 0.27829180235278617

Fold 4 (2361,) (472,)

rmse_score 0.26420001498779505

Fold 5 (2361,) (472,)

rmse_score 0.2784100764656171

Model 3


HBox(children=(FloatProgress(value=0.0, max=178.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))



Kernel rbf

Fold 0 (2360,) (473,)

rmse_score 0.32314840668850564

Fold 1 (2361,) (472,)

rmse_score 0.3347420729022927

Fold 2 (2361,) (472,)

rmse_score 0.31517397796618957

Fold 3 (2361,) (472,)

rmse_score 0.3144002433272706

Fold 4 (2361,) (472,)

rmse_score 0.3395801643511162

Fold 5 (2361,) (472,)

rmse_score 0.3287832850136045

Model 4


HBox(children=(FloatProgress(value=0.0, max=178.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))



Kernel rbf

Fold 0 (2360,) (473,)

rmse_score 0.2885506731103428

Fold 1 (2361,) (472,)

rmse_score 0.2781721902685379

Fold 2 (2361,) (472,)

rmse_score 0.25838689286484945

Fold 3 (2361,) (472,)

rmse_score 0.2680443432003933

Fold 4 (2361,) (472,)

rmse_score 0.30500801292699997

Fold 5 (2361,) (472,)

rmse_score 0.29746383060394443

FINAL RMSE score 0.29023271640379195

CPU times: user 4min 53s, sys: 7.68 s, total: 5min 1s

Wall time: 4min 59s


In [52]:
final_kernel_predictions_means

[[-0.0012696901143242224],
 [-0.002859863838743065],
 [-0.0006647586921143039],
 [-0.0043345630228311316],
 [-0.00263015696570484]]

In [53]:
# (train_df['target'] - cfg.train_target_mean) / cfg.train_target_std
final_scores_normalized = np.array(final_scores) * train_target_std + train_target_mean

In [54]:
kernel_rmse_score_mean_array = np.array(kernel_rmse_score_mean)
kernel_rmse_score_mean_sum = np.sum(kernel_rmse_score_mean_array)
prop_losses = kernel_rmse_score_mean_array / kernel_rmse_score_mean_sum
prop_losses_sum = (1 - prop_losses).sum()
weights = (1 - prop_losses) / prop_losses_sum
weights

array([0.19933186, 0.20203661, 0.20347423, 0.19384311, 0.20131419])

In [55]:
def calc_mean(scores, weights=weights):
    return np.average(np.array(scores), weights=weights, axis=0)

In [56]:
target_mean = train_df['target'].mean()
final_scores_flat = calc_mean(final_scores_normalized).flatten()
final_scores_mean = final_scores_flat.mean()
target_mean, np.array(final_scores_normalized).mean()
# (-0.9579984513405823, -0.8029817438292849)

(-0.9596573929279916, -0.8947554519544649)

In [57]:
final_scores_flat

array([-0.30475986, -0.37745973, -0.59289101, -2.26781949, -1.63918861,
       -1.21112969,  0.13124965])

In [58]:
mean_diff = target_mean - final_scores_mean
mean_diff, mean_diff / len(final_scores)

(-0.06508614402176593, -0.013017228804353187)

In [59]:
sample_df['target'] = final_scores_flat + mean_diff
# sample_df['target'] = len(final_scores) / np.sum(1 / np.array(final_scores), axis=0) # harmonic mean
sample_df

Unnamed: 0,id,target
0,c0f722661,-0.369846
1,f0953f0a5,-0.442546
2,0df072751,-0.657977
3,04caf4e0c,-2.332906
4,0e63f8bea,-1.704275
5,12537fe78,-1.276216
6,965e592c0,0.066164


### Prepare Packaging

In [60]:
cfg.model_name

'roberta-large-mnli'

In [61]:
BEST_MODEL_FOLDER = MODELS_PATH/cfg.model_name/'best'
!rm -rf {BEST_MODEL_FOLDER}
!mkdir -p {BEST_MODEL_FOLDER}

In [62]:
BEST_MODEL_FOLDER

PosixPath('/home/commonlit/models/roberta-large-mnli/best')

In [63]:
cfg.NUM_FOLDS

6

In [64]:
bestmodels = [MODELS_PATH + f'/{cfg.model_name}_{i + 1}' for i in range(0, cfg.NUM_FOLDS)]

In [65]:
bestmodels

[PosixPath('/home/commonlit/models/roberta-large-mnli_1'),
 PosixPath('/home/commonlit/models/roberta-large-mnli_2'),
 PosixPath('/home/commonlit/models/roberta-large-mnli_3'),
 PosixPath('/home/commonlit/models/roberta-large-mnli_4'),
 PosixPath('/home/commonlit/models/roberta-large-mnli_5'),
 PosixPath('/home/commonlit/models/roberta-large-mnli_6')]

In [66]:
from shutil import copyfile

def normalize_name(path_name):
    return path_name.replace('', '')

for i, best_model in enumerate(bestmodels):
    print(f'Processing {i}th model')
    i = i + 1
    best_model_file = f'{best_model}/model_{i}.pth'
    if Path(best_model_file).exists():
        copyfile(best_model_file, f'{BEST_MODEL_FOLDER}/{i}_pytorch_model.bin')
        tokenizer_path = Path(BEST_MODEL_FOLDER/f'tokenizer-{i}')
        tokenizer_path.mkdir(parents=True, exist_ok=True)
        assert tokenizer_path.exists()

        tokenizer_json = Path(normalize_name(f'{MODELS_PATH/cfg.model_name}_{i}/tokenizer_config.json'))
        assert tokenizer_json.exists(), f'{tokenizer_json} does not exist'
        copyfile(tokenizer_json, tokenizer_path/'tokenizer.json')

        vocab_txt = Path(normalize_name(f'{MODELS_PATH/cfg.model_name}_{i}/vocab.json'))
        assert vocab_txt.exists(), f'{vocab_txt} does not exist'
        copyfile(vocab_txt, tokenizer_path/'vocab.json')

        merges = Path(normalize_name(f'{MODELS_PATH/cfg.model_name}_{i}/merges.txt'))
        assert merges.exists()
        copyfile(merges, tokenizer_path/'merges.txt')
    else:
        print(f'{best_model_file} is missing')

Processing 0th model

Processing 1th model

Processing 2th model

Processing 3th model

Processing 4th model

Processing 5th model


In [67]:
import shutil

shutil.make_archive(MODELS_PATH/cfg.model_name/'best_models', 'zip', BEST_MODEL_FOLDER)

'/home/commonlit/models/roberta-large-mnli/best_models.zip'

In [68]:
!ls {MODELS_PATH/cfg.model_name}

best  best_models.zip


In [69]:
!mv {MODELS_PATH}/{cfg.model_name}.yaml {MODELS_PATH/cfg.model_name}

mv: cannot stat '/home/commonlit/models/roberta-large-mnli.yaml': No such file or directory


In [70]:
transformer_model.transformer_model.save_pretrained(save_directory=f'{MODELS_PATH/cfg.model_name}/lm')

In [71]:
!du -h {MODELS_PATH/cfg.model_name}/*

1.3M	/home/commonlit/models/roberta-large-mnli/best/tokenizer-1

1.3M	/home/commonlit/models/roberta-large-mnli/best/tokenizer-2

1.3M	/home/commonlit/models/roberta-large-mnli/best/tokenizer-3

1.3M	/home/commonlit/models/roberta-large-mnli/best/tokenizer-4

1.3M	/home/commonlit/models/roberta-large-mnli/best/tokenizer-5

1.3M	/home/commonlit/models/roberta-large-mnli/best/tokenizer-6

8.0G	/home/commonlit/models/roberta-large-mnli/best

7.3G	/home/commonlit/models/roberta-large-mnli/best_models.zip

1.4G	/home/commonlit/models/roberta-large-mnli/lm


In [72]:
shutil.make_archive(MODELS_PATH/cfg.model_name/'lm', 'zip', f'{MODELS_PATH/cfg.model_name}/lm')

'/home/commonlit/models/roberta-large-mnli/lm.zip'

In [73]:
!kaggle datasets init -p {MODELS_PATH/cfg.model_name}

Data package template written to: /home/commonlit/models/roberta-large-mnli/dataset-metadata.json


In [74]:
dataset_json_path = Path(MODELS_PATH/cfg.model_name/'dataset-metadata.json')
assert dataset_json_path.exists()

In [75]:
!cat {str(dataset_json_path)}

{

  "title": "INSERT_TITLE_HERE",

  "id": "gilfernandes/INSERT_SLUG_HERE",

  "licenses": [

    {

      "name": "CC0-1.0"

    }

  ]

}

In [76]:
with open(dataset_json_path, 'r') as f:
    dataset_json = f.read()
    dataset_json = dataset_json.replace('INSERT_TITLE_HERE', f'commonlit-{cfg.model_name}').replace('INSERT_SLUG_HERE', f'commonlit-{cfg.model_name}')
    print(dataset_json)
with(open(dataset_json_path, 'w')) as f:
    f.write(dataset_json)

{

  "title": "commonlit-roberta-large-mnli",

  "id": "gilfernandes/commonlit-roberta-large-mnli",

  "licenses": [

    {

      "name": "CC0-1.0"

    }

  ]

}


In [77]:
!rm -rf {MODELS_PATH/cfg.model_name}/best
!rm -rf {MODELS_PATH/cfg.model_name}/lm

In [78]:
!kaggle datasets create -p {MODELS_PATH/cfg.model_name}

Starting upload for file best_models.zip

100%|██████████████████████████████████████| 7.24G/7.24G [12:28<00:00, 10.4MB/s]

Upload successful: best_models.zip (7GB)

Starting upload for file lm.zip

100%|██████████████████████████████████████| 1.20G/1.20G [02:09<00:00, 10.0MB/s]

Upload successful: lm.zip (1GB)

Your private Dataset is being created. Please check progress at /api/v1/datasets/status//gilfernandes/commonlit-roberta-large-mnli


In [None]:
!kaggle datasets version -p {MODELS_PATH/cfg.model_name} -m "Version with merges.txt" -d

In [None]:
state_dict = torch.load(str(MODELS_PATH/f'distilroberta-0/checkpoint-105/pytorch_model.bin'))

In [None]:
loaded_model = CommonLitModel()

In [None]:
loaded_model.load_state_dict(state_dict)