In [1]:
import string
import copy
import time
%matplotlib inline
import os
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, TensorDataset, DataLoader
from transformers import RobertaTokenizer, RobertaModel, AdamW, RobertaConfig
import random
from transformers import get_linear_schedule_with_warmup
from torch.cuda.amp import GradScaler, autocast
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
from torch.utils.tensorboard import SummaryWriter

# ---- Run configuration ----
SEED = 42                              # single seed shared by every RNG seeded below
BATCH_SIZE = 16                        # batch size handed to the inference DataLoader
MAX_LENGTH = 256                       # length each text is padded / truncated to by TokenDataset
ROBERTA_FP = '../input/roberta-base'   # path given to the RoBERTa from_pretrained() calls

# ---- Hardware ----
n_gpu = torch.cuda.device_count()
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# ---- Reproducibility: seed Python, NumPy and PyTorch (plus all GPUs, if any) ----
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if n_gpu > 0:
    torch.cuda.manual_seed_all(SEED)

In [2]:
import datetime

def format_time(elapsed):
    '''
    Convert a duration in seconds into the string form of a
    ``datetime.timedelta`` (e.g. ``'1:01:01'``).

    The input is first rounded to the nearest whole second with the
    built-in ``round``.
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [3]:
class RobertaForSequenceRegression(nn.Module):
    """RoBERTa encoder followed by attention pooling and a one-output linear head.

    The encoder is loaded from ``ROBERTA_FP`` with ``output_hidden_states``
    enabled. The last hidden layer is pooled over the sequence dimension with
    learned softmax weights, layer-normalised, and projected to one value per
    input sequence.

    NOTE: ``create_optimizer`` slices ``model.named_parameters()`` by position,
    so the declaration order of the submodules below (roberta, attention,
    layer_norm, linear) must not change. Attribute names must also stay as they
    are so that saved state dicts keep loading.
    """

    def __init__(self):
        super().__init__()
        self.config = RobertaConfig.from_pretrained(ROBERTA_FP)
        self.config.update({"output_hidden_states": True,
                            "layer_norm_eps": 1e-7})
        self.roberta = RobertaModel.from_pretrained(ROBERTA_FP, config=self.config)

        # Take every layer width from the encoder config instead of a literal
        # 768, so all heads agree with `layer_norm` and with the checkpoint.
        hidden_size = self.config.hidden_size

        # Scores each position, then normalises the scores across dim=1.
        self.attention = nn.Sequential(
            nn.Linear(hidden_size, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1)
        )
        self.layer_norm = nn.LayerNorm(hidden_size)
        self.linear = nn.Linear(hidden_size, 1)

        # Only the layer norm and the final projection are re-initialised here;
        # the attention block keeps PyTorch's default initialisation.
        self._init_weights(self.layer_norm)
        self._init_weights(self.linear)

    def _init_weights(self, module):
        """Initialise a Linear or LayerNorm module in place.

        Linear: weights ~ N(0, config.initializer_range), bias zeroed.
        LayerNorm: weight set to 1, bias zeroed. Other module types are ignored.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, input_ids, attention_mask):
        """Return one regression value per input sequence as a flat tensor.

        Args:
            input_ids: token id tensor passed to the encoder.
            attention_mask: mask tensor passed to the encoder.

        Returns:
            Tensor flattened with ``view(-1)``.
        """
        encoder_outputs = self.roberta(input_ids, attention_mask)
        # assumes hidden states are (batch, seq_len, hidden) - TODO confirm
        last_layer_hidden_states = encoder_outputs.hidden_states[-1]

        # NOTE(review): attention_mask is only forwarded to the encoder; the
        # pooling softmax below runs over every position along dim=1, padding
        # included. Left unchanged so that already-trained checkpoints keep
        # producing the same outputs.
        weights = self.attention(last_layer_hidden_states)
        context_vector = torch.sum(weights * last_layer_hidden_states, dim=1)

        normed_context = self.layer_norm(context_vector)
        outputs = self.linear(normed_context)
        return outputs.view(-1)

In [4]:
def RMSELoss(outputs, targets):
    """Root-mean-squared error between ``outputs`` and ``targets`` (mean reduction)."""
    mean_squared_error = nn.functional.mse_loss(outputs, targets)
    return torch.sqrt(mean_squared_error)

In [5]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import KBinsDiscretizer

class regressor_stratified_cv:
    """Repeated stratified K-fold splitter for a continuous target.

    The target is discretised into ``group_count`` ordinal bins with
    ``KBinsDiscretizer`` and the bin labels are used as the strata for
    ``RepeatedStratifiedKFold``.

    Args:
        n_splits: number of folds per repetition.
        n_repeats: number of repetitions.
        group_count: number of bins the target is cut into.
        random_state: seed forwarded to ``RepeatedStratifiedKFold``.
        strategy: binning strategy forwarded to ``KBinsDiscretizer``.
    """

    def __init__(self, n_splits = 10, n_repeats = 2, group_count = 10,
                 random_state = 0, strategy = 'quantile'):
        self.group_count = group_count
        self.strategy = strategy
        self.cvkwargs = dict(n_splits = n_splits, n_repeats = n_repeats,
                             random_state = random_state)
        self.cv = RepeatedStratifiedKFold(**self.cvkwargs)
        self.discretizer = KBinsDiscretizer(n_bins = self.group_count, encode = 'ordinal',
                                            strategy = self.strategy)

    def split(self, X, y, groups = None):
        """Return the split generator of the underlying CV, stratified on binned ``y``.

        ``y`` may be any 1-D array-like (ndarray, list, pandas Series); it is
        converted with ``np.asarray`` before being reshaped to a column, so
        inputs that do not support ``y[:, None]`` directly are accepted too.
        """
        y_column = np.asarray(y)[:, None]
        kgroups = self.discretizer.fit_transform(y_column)[:, 0]
        return self.cv.split(X, kgroups, groups)

    def get_n_splits(self, X, y, groups = None):
        """Delegate to the underlying CV object's ``get_n_splits``."""
        return self.cv.get_n_splits(X, y, groups)

In [6]:
def create_optimizer(model):
    """Build an AdamW optimizer with position-dependent learning rates.

    Parameters are selected by their position in ``model.named_parameters()``:

    * positions 0-196 - one param group each, with an explicit ``lr`` and
      ``weight_decay`` (presumably the RoBERTa embeddings and encoder layers -
      verify against the model definition);
    * positions 197-198 - not handed to the optimizer at all (presumably the
      RoBERTa pooler - TODO confirm);
    * positions 199-202 - the "attention" group;
    * positions 203+ - the "regressor" group.

    The attention and regressor groups carry no explicit ``lr`` or
    ``weight_decay`` and therefore use the AdamW constructor defaults.
    """
    all_named = list(model.named_parameters())

    backbone_named = all_named[:197]
    attention_named = all_named[199:203]
    head_named = all_named[203:]

    # Group order matters for the optimizer's state: attention, regressor,
    # then one group per backbone parameter.
    param_groups = [
        {"params": [tensor for _, tensor in attention_named]},
        {"params": [tensor for _, tensor in head_named]},
    ]

    for position, (param_name, tensor) in enumerate(backbone_named):
        # Later parameters get a larger learning rate.
        if position >= 133:
            learning_rate = 7e-5
        elif position >= 69:
            learning_rate = 2e-5
        else:
            learning_rate = 1e-5

        # Parameters whose name contains "bias" are exempt from weight decay.
        decay = 0.0 if "bias" in param_name else 0.01

        param_groups.append({"params": tensor,
                             "weight_decay": decay,
                             "lr": learning_rate})

    return AdamW(param_groups)

In [7]:
class TokenDataset(Dataset):
    """Dataset that tokenizes one text per item, lazily, on access.

    Args:
        tokenizer: object exposing ``encode_plus``.
        text: array-like exposing ``.tolist()``; one entry per sample.
        target: indexable targets, read only when ``is_test`` is False.
        is_test: when True, items carry no target.

    Each item is ``(input_ids, attention_mask)`` in test mode, otherwise
    ``(input_ids, attention_mask, targets)`` with ``targets`` a float tensor.
    """

    def __init__(self, tokenizer, text, target = None, is_test=False):
        self.tokenizer = tokenizer
        self.text = text.tolist()
        self.target = target
        self.is_test = is_test
        self.max_len = MAX_LENGTH

    def __len__(self):
        return len(self.text)

    def __getitem__(self, idx):
        # Pad / truncate every sample to exactly `max_len` tokens.
        encoded = self.tokenizer.encode_plus(
            self.text[idx],
            padding='max_length',
            max_length=self.max_len,
            truncation=True,
            return_attention_mask=True,
        )
        input_ids = torch.tensor(encoded['input_ids'])
        attention_mask = torch.tensor(encoded['attention_mask'])

        if not self.is_test:
            targets = torch.tensor(self.target[idx], dtype=torch.float)
            return (input_ids, attention_mask, targets)
        return (input_ids, attention_mask)

In [8]:
# Load the test set; later cells read its 'excerpt' and 'id' columns.
test = pd.read_csv('../input/commonlitreadabilityprize/test.csv')

In [9]:
# Remove newline characters from every excerpt (they are deleted, not replaced
# by a space). NOTE(review): presumably mirrors the training-time preprocessing
# - confirm before changing.
test["text"] = test["excerpt"].map(lambda excerpt: excerpt.replace('\n', ''))

In [10]:
def predicting(test, states):
    """Score the test texts with every checkpoint and collect the predictions.

    Args:
        test: DataFrame with a 'text' column holding the inputs to score.
        states: iterable of state dicts loadable into
            ``RobertaForSequenceRegression``.

    Returns:
        A list with one 1-D numpy array per entry of ``states`` (same order).
        Each array follows the row order of ``test`` (the loader neither
        shuffles nor drops the last batch).
    """
    # The tokenizer, dataset and loader do not depend on the checkpoint, so
    # they are built once and reused for every fold.
    tokenizer = RobertaTokenizer.from_pretrained(ROBERTA_FP)
    test_set = TokenDataset(tokenizer,
                            text = test['text'].values,
                            is_test = True)
    test_dataloader = DataLoader(test_set,
                                 batch_size = BATCH_SIZE,
                                 drop_last = False,
                                 shuffle = False,
                                 num_workers = 2)

    all_preds = []

    for state in states:
        model = RobertaForSequenceRegression()
        model.load_state_dict(state)
        model.to(device)
        model.eval()

        batch_preds = []
        with torch.no_grad():
            for input_ids, attention_mask in test_dataloader:
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)
                output = model(input_ids, attention_mask)
                batch_preds.append(output.cpu().numpy())

        # np.concatenate rejects an empty list, so an empty test set yields an
        # empty prediction array instead of raising.
        if batch_preds:
            all_preds.append(np.concatenate(batch_preds))
        else:
            all_preds.append(np.empty(0, dtype=np.float32))

        # Release this fold's model before the next one is built so that two
        # models never sit on the device at once.
        del model
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    return all_preds

In [11]:
# Checkpoint files for folds 1 through 5, in fold order.
pathes = [f'../input/models-batch/model_fold_{fold}.bin' for fold in range(1, 6)]

In [12]:
# Load every fold's state dict onto the CPU. predicting() copies each one into
# a freshly constructed model *before* moving that model to `device`, so the
# tensors never need to live on the GPU; mapping to CPU also lets checkpoints
# saved from a GPU run load on a CPU-only machine.
# NOTE: torch.load unpickles the file - only load checkpoints you trust.
states = [torch.load(checkpoint_path, map_location="cpu") for checkpoint_path in pathes]

In [13]:
# Run inference with every loaded checkpoint; yields one prediction array per entry of `states`.
all_preds = predicting(test, states)

Some weights of the model checkpoint at ../input/roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at ../input/roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight']
- This IS expected if you are initi

In [14]:
# One row per entry of each prediction array, one column per model:
# the list of per-model arrays is framed row-wise and then transposed.
predictions = pd.DataFrame(all_preds).T

In [15]:
# Ensemble: average across the per-model columns of each row.
# NOTE(review): this rebinds `predictions` from a DataFrame to a Series, so
# re-running this cell without re-running the previous one would presumably fail.
predictions = predictions.mean(axis=1)

In [16]:
# Pair each test id with its averaged prediction (aligned on the shared default index - assumes `test` keeps its RangeIndex).
submission = pd.DataFrame({'id':test['id'],'target':predictions})

In [17]:
# Write the submission file to the working directory, without the index column.
submission.to_csv('submission.csv',index=False)

In [18]:
# Display the submission frame as a final visual check.
submission

Unnamed: 0,id,target
0,c0f722661,-0.439601
1,f0953f0a5,-0.560899
2,0df072751,-0.463929
3,04caf4e0c,-2.367995
4,0e63f8bea,-1.82655
5,12537fe78,-0.968265
6,965e592c0,0.100761


In [19]:
# Ask PyTorch to release its cached, currently unused GPU memory now that inference is finished.
torch.cuda.empty_cache()