# Overview

This notebook combines three models.

In [1]:
import os
import math
import random
import time

import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import AutoConfig

from sklearn.model_selection import KFold
from sklearn.svm import SVR

import gc
gc.enable()

In [2]:
# Batch size used by the Model 1 test DataLoaders built further down.
BATCH_SIZE = 32
# Token length LitDataset pads/truncates every excerpt to.
MAX_LEN = 248
# NOTE(review): EVAL_SCHEDULE is not referenced by any inference code visible in this
# notebook — presumably a leftover from the training notebook; confirm before removing.
EVAL_SCHEDULE = [(0.5, 16), (0.49, 8), (0.48, 4), (0.47, 2), (-1, 1)]
# Backbone / tokenizer locations read by LitModel and by the tokenizer cell.
# Both names are reassigned by a later cell before the second group of checkpoints is run.
ROBERTA_PATH = "../input/roberta-transformers-pytorch/roberta-base"
TOKENIZER_PATH = "../input/roberta-transformers-pytorch/roberta-base"
# String device name here; a later cell rebinds DEVICE to a torch.device object.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

DEVICE

'cuda'

In [3]:
# Competition test excerpts; every model in the notebook predicts on this frame.
test_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/test.csv")
# Sample submission file. It is not read again in the visible part of the notebook.
submission_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/sample_submission.csv")

In [4]:
# Module-level tokenizer that LitDataset picks up implicitly (it is not passed as an argument).
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)

# Dataset

In [5]:
class LitDataset(Dataset):
    """Dataset that tokenizes every excerpt of a DataFrame once, at construction.

    The module-level ``tokenizer`` and ``MAX_LEN`` are read when the dataset is
    built, so whichever tokenizer is bound to that name at that moment is used.

    Args:
        df: DataFrame with an ``excerpt`` column and, unless ``inference_only``
            is set, a ``target`` column.
        inference_only: when True no targets are stored and items are
            ``(input_ids, attention_mask)`` pairs; otherwise items are
            ``(input_ids, attention_mask, target)`` triples.
    """

    # Fields of the tokenizer output that make up one sample, in output order.
    _FIELDS = ('input_ids', 'attention_mask')

    def __init__(self, df, inference_only=False):
        super().__init__()

        self.df = df
        self.inference_only = inference_only
        self.text = df.excerpt.tolist()

        if not inference_only:
            self.target = torch.tensor(df.target.values, dtype=torch.float32)

        # Every excerpt is padded / truncated to exactly MAX_LEN tokens.
        self.encoded = tokenizer.batch_encode_plus(
            self.text,
            padding='max_length',
            max_length=MAX_LEN,
            truncation=True,
            return_attention_mask=True,
        )

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        sample = tuple(
            torch.tensor(self.encoded[field][index]) for field in self._FIELDS
        )
        if not self.inference_only:
            sample = sample + (self.target[index],)
        return sample

# Model 1
Inspired by: https://www.kaggle.com/maunish/clrp-roberta-svm

In [6]:
class LitModel(nn.Module):
    """Transformer backbone + learned attention pooling + linear regression head.

    The backbone and its config are loaded from the module-level ``ROBERTA_PATH``
    at construction time. Sub-module names (``roberta``, ``attention``,
    ``regressor``) and the layer order inside the ``nn.Sequential`` containers
    are kept as-is because saved state dicts are loaded into this class.
    """

    def __init__(self):
        super().__init__()

        config_overrides = {
            "output_hidden_states": True,
            "hidden_dropout_prob": 0.25,
            "layer_norm_eps": 1e-7,
        }
        config = AutoConfig.from_pretrained(ROBERTA_PATH)
        config.update(config_overrides)

        self.roberta = AutoModel.from_pretrained(ROBERTA_PATH, config=config)

        # Scores every token, then normalises the scores across the sequence
        # axis (dim=1) so they can be used as pooling weights.
        scoring_layers = [
            nn.Linear(768, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1),
        ]
        self.attention = nn.Sequential(*scoring_layers)

        self.regressor = nn.Sequential(nn.Linear(768, 1))

    def forward(self, input_ids, attention_mask):
        """Return one regression score per row of the batch.

        Args:
            input_ids: token ids, passed to the backbone unchanged.
            attention_mask: attention mask, passed to the backbone unchanged.
                It is not applied to the pooling weights computed here.

        Returns:
            The output of ``self.regressor`` applied to the pooled vector.
        """
        backbone_out = self.roberta(input_ids=input_ids,
                                    attention_mask=attention_mask)

        # hidden_states holds the embedding output followed by one entry per
        # transformer layer; only the final layer is pooled.
        token_states = backbone_out.hidden_states[-1]

        # One weight per token (trailing dim of size 1, broadcast over features).
        token_weights = self.attention(token_states)

        # Weighted sum over the sequence axis collapses tokens into one vector.
        pooled = (token_weights * token_states).sum(dim=1)

        return self.regressor(pooled)

In [7]:
def predict(model, data_loader):
    """Returns an np.array with predictions of the |model| on |data_loader|.

    The model is switched to eval mode and run without gradient tracking.
    Batches must unpack to ``(input_ids, attention_mask)``; both are moved to
    the module-level ``DEVICE``. Predictions are written in loader order into
    an array of length ``len(data_loader.dataset)``.
    """
    model.eval()

    total_rows = len(data_loader.dataset)
    result = np.zeros(total_rows)
    cursor = 0

    with torch.no_grad():
        for input_ids, attention_mask in data_loader:
            batch_pred = model(input_ids.to(DEVICE), attention_mask.to(DEVICE))

            # The last batch may be shorter, so advance by its actual size.
            stop = cursor + batch_pred.shape[0]
            result[cursor:stop] = batch_pred.flatten().to("cpu")
            cursor = stop

    return result

# Inference

In [8]:
# Number of checkpoints averaged for this group (files model_1.pth .. model_5.pth).
NUM_MODELS = 5

# One row of predictions per checkpoint, one column per test excerpt.
all_predictions = np.zeros((NUM_MODELS, len(test_df)))

# Tokenization happens here, using the tokenizer currently bound at module level.
test_dataset = LitDataset(test_df, inference_only=True)

# shuffle=False / drop_last=False keep predictions aligned with test_df rows.
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                         drop_last=False, shuffle=False, num_workers=2)

for model_index in tqdm(range(NUM_MODELS)):            
    model_path = f"../input/commonlit-roberta-0467/model_{model_index + 1}.pth"
    print(f"\nUsing {model_path}")
                        
    # LitModel() builds the backbone from the current ROBERTA_PATH; the
    # checkpoint weights are then loaded on top of it.
    model = LitModel()
    model.load_state_dict(torch.load(model_path, map_location=DEVICE))    
    model.to(DEVICE)
        
    all_predictions[model_index] = predict(model, test_loader)
            
    # Drop the reference before the next checkpoint is instantiated.
    del model
    gc.collect()

  0%|          | 0/5 [00:00<?, ?it/s]


Using ../input/commonlit-roberta-0467/model_1.pth


 20%|██        | 1/5 [00:18<01:12, 18.01s/it]


Using ../input/commonlit-roberta-0467/model_2.pth


 40%|████      | 2/5 [00:26<00:38, 12.67s/it]


Using ../input/commonlit-roberta-0467/model_3.pth


 60%|██████    | 3/5 [00:34<00:21, 10.54s/it]


Using ../input/commonlit-roberta-0467/model_4.pth


 80%|████████  | 4/5 [00:42<00:09,  9.44s/it]


Using ../input/commonlit-roberta-0467/model_5.pth


100%|██████████| 5/5 [00:50<00:00, 10.09s/it]


In [9]:
# Average over the checkpoint axis: one prediction per test excerpt.
model1_predictions = all_predictions.mean(axis=0)

In [10]:
# NOTE: this cell rebinds the module-level ROBERTA_PATH, TOKENIZER_PATH and tokenizer.
# LitModel and LitDataset read those globals when they are constructed, so every
# LitModel()/LitDataset() created from here on uses this directory. Re-running
# the earlier inference cell after this one would therefore pick up these values too.
ROBERTA_PATH = "../input/pre-trained-roberta-solution-in-pytorch"
TOKENIZER_PATH = "../input/pre-trained-roberta-solution-in-pytorch"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)
NUM_MODELS = 5

# Overwrites the prediction buffer of the previous checkpoint group
# (its average was already stored in model1_predictions).
all_predictions = np.zeros((NUM_MODELS, len(test_df)))

# Rebuilt so the excerpts are re-tokenized with the tokenizer loaded above.
test_dataset = LitDataset(test_df, inference_only=True)

test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                         drop_last=False, shuffle=False, num_workers=2)

for model_index in tqdm(range(NUM_MODELS)):            
    model_path = f"../input/pre-trained-roberta-solution-in-pytorch/model_{model_index + 1}.pth"
    print(f"\nUsing {model_path}")
                        
    model = LitModel()
    model.load_state_dict(torch.load(model_path, map_location=DEVICE))    
    model.to(DEVICE)
        
    all_predictions[model_index] = predict(model, test_loader)
            
    del model
    gc.collect()

  0%|          | 0/5 [00:00<?, ?it/s]


Using ../input/pre-trained-roberta-solution-in-pytorch/model_1.pth


 20%|██        | 1/5 [00:10<00:43, 10.94s/it]


Using ../input/pre-trained-roberta-solution-in-pytorch/model_2.pth


 40%|████      | 2/5 [00:17<00:25,  8.64s/it]


Using ../input/pre-trained-roberta-solution-in-pytorch/model_3.pth


 60%|██████    | 3/5 [00:24<00:15,  7.85s/it]


Using ../input/pre-trained-roberta-solution-in-pytorch/model_4.pth


 80%|████████  | 4/5 [00:31<00:07,  7.55s/it]


Using ../input/pre-trained-roberta-solution-in-pytorch/model_5.pth


100%|██████████| 5/5 [00:39<00:00,  7.89s/it]


In [11]:
# Per-excerpt average over the five checkpoints run in the previous cell.
# NOTE(review): `mdp` is not read anywhere in the visible part of the notebook —
# presumably it is consumed by the final blend further down; confirm.
mdp = all_predictions.mean(axis=0)

# Model 2
Inspired by: [https://www.kaggle.com/rhtsingh/commonlit-readability-prize-roberta-torch-infer-3](https://www.kaggle.com/rhtsingh/commonlit-readability-prize-roberta-torch-infer-3)

In [12]:
# Alias for the test frame: config() below reads the module-level name `test`
# when it builds the Model 2 DataLoader.
test = test_df

from glob import glob
import os
import matplotlib.pyplot as plt
import json
from collections import defaultdict

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.optimizer import Optimizer
import torch.optim.lr_scheduler as lr_scheduler
from torch.utils.data import (
    Dataset, DataLoader, 
    SequentialSampler, RandomSampler
)
from transformers import RobertaConfig
from transformers import (
    get_cosine_schedule_with_warmup, 
    get_cosine_with_hard_restarts_schedule_with_warmup
)
from transformers import RobertaTokenizer
from transformers import RobertaModel
from IPython.display import clear_output

# Dataset

In [13]:
def convert_examples_to_features(data, tokenizer, max_len, is_test=False):
    """Tokenize one excerpt and right-pad every feature list to ``max_len``.

    Newline characters are removed from the text (not replaced by a space)
    before tokenization. Padding uses the value 0 for all three feature lists.

    Args:
        data: the excerpt text.
        tokenizer: object exposing ``encode_plus``.
        max_len: truncation length and target length after padding.
        is_test: accepted for signature compatibility; not used.

    Returns:
        dict with ``input_ids``, ``token_type_ids`` and ``attention_mask`` lists.
    """
    text = data.replace('\n', '')
    encoded = tokenizer.encode_plus(
        text,
        max_length=max_len,
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=True
    )

    filler = [0] * (max_len - len(encoded['input_ids']))
    return {
        field: encoded[field] + filler
        for field in ('input_ids', 'token_type_ids', 'attention_mask')
    }

class DatasetRetriever(Dataset):
    """Dataset that tokenizes one excerpt per ``__getitem__`` call.

    Args:
        data: DataFrame with an ``excerpt`` column and, unless ``is_test`` is
            True, a ``target`` column.
        tokenizer: forwarded to ``convert_examples_to_features``.
        max_len: forwarded to ``convert_examples_to_features``.
        is_test: when True, items carry no ``label`` entry.

    Items are dicts of ``torch.long`` tensors (``input_ids``,
    ``token_type_ids``, ``attention_mask``) plus, in non-test mode, a
    ``torch.double`` scalar ``label``.
    """

    def __init__(self, data, tokenizer, max_len, is_test=False):
        self.data = data
        self.excerpts = self.data.excerpt.values.tolist()
        # Fix: targets were read in __getitem__ but never stored, so the
        # non-test path raised AttributeError on the first item.
        if not is_test:
            self.targets = self.data.target.values.tolist()
        self.tokenizer = tokenizer
        self.is_test = is_test
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item):
        # Tokenization is identical for both modes; only the label differs.
        features = convert_examples_to_features(
            self.excerpts[item], self.tokenizer,
            self.max_len, self.is_test
        )
        sample = {
            key: torch.tensor(features[key], dtype=torch.long)
            for key in ('input_ids', 'token_type_ids', 'attention_mask')
        }
        if not self.is_test:
            sample['label'] = torch.tensor(self.targets[item], dtype=torch.double)
        return sample

In [14]:
class CommonLitModel(nn.Module):
    """RoBERTa backbone with LayerNorm, (multi-sample) dropout and a linear head.

    Args:
        model_name: passed to ``RobertaModel.from_pretrained``.
        config: object providing ``hidden_size`` and ``initializer_range``.
        multisample_dropout: five ``Dropout(0.5)`` branches when True,
            a single ``Dropout(0.3)`` otherwise.
        output_hidden_states: forwarded to ``RobertaModel.from_pretrained``.
    """

    def __init__(
        self,
        model_name,
        config,
        multisample_dropout=False,
        output_hidden_states=False
    ):
        super(CommonLitModel, self).__init__()
        self.config = config
        self.roberta = RobertaModel.from_pretrained(
            model_name,
            output_hidden_states=output_hidden_states
        )

        self.layer_norm = nn.LayerNorm(config.hidden_size)

        branch_count, drop_prob = (5, 0.5) if multisample_dropout else (1, 0.3)
        self.dropouts = nn.ModuleList(
            [nn.Dropout(drop_prob) for _ in range(branch_count)]
        )

        self.regressor = nn.Linear(config.hidden_size, 1)
        for fresh_module in (self.layer_norm, self.regressor):
            self._init_weights(fresh_module)

    def _init_weights(self, module):
        """Initialise Linear / Embedding / LayerNorm modules in place."""
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
            return

        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        labels=None
    ):
        """Run the backbone and the regression head.

        Returns:
            ``(logits, *backbone_outputs[1:])`` when ``labels`` is None,
            otherwise ``(loss, logits, *backbone_outputs[1:])`` where ``loss``
            is the root of the mean squared error and ``logits`` is flattened
            and cast to the dtype of ``labels``.
        """
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # Index 1 of the backbone output is what gets normalised and regressed.
        normed = self.layer_norm(outputs[1])

        # Multi-sample dropout: run the head once per dropout branch, in order,
        # and average the results.
        branch_logits = [self.regressor(drop(normed)) for drop in self.dropouts]
        logits = branch_logits[0]
        for extra in branch_logits[1:]:
            logits = logits + extra
        logits = logits / len(self.dropouts)

        if labels is None:
            return (logits,) + outputs[1:]

        logits = logits.view(-1).to(labels.dtype)
        mse = torch.nn.MSELoss()(logits, labels.view(-1))
        loss = torch.sqrt(mse)
        return (loss, logits) + outputs[1:]

In [15]:
def make_model(model_name, num_labels=1):
    """Build the tokenizer and a CommonLitModel for ``model_name``.

    The tokenizer is loaded first, then the config (updated with
    ``num_labels``), then the model.

    Returns:
        ``(model, tokenizer)``.
    """
    roberta_tokenizer = RobertaTokenizer.from_pretrained(model_name)

    roberta_config = RobertaConfig.from_pretrained(model_name)
    roberta_config.update(dict(num_labels=num_labels))

    return CommonLitModel(model_name, config=roberta_config), roberta_tokenizer

def make_loader(
    data,
    tokenizer,
    max_len,
    batch_size,
):
    """Build a sequential, non-dropping test DataLoader over ``data``.

    Args:
        data: frame handed to ``DatasetRetriever`` (test mode).
        tokenizer: forwarded to ``DatasetRetriever``.
        max_len: forwarded to ``DatasetRetriever``.
        batch_size: nominal batch size; the loader uses half of it,
            but never less than 1.

    Returns:
        A ``DataLoader`` that yields the rows in their original order.
    """
    test_dataset = DatasetRetriever(data, tokenizer, max_len, is_test=True)
    test_sampler = SequentialSampler(test_dataset)

    # Fix: `batch_size // 2` is 0 for batch_size=1, which DataLoader rejects.
    # Clamping to 1 leaves every previously valid call unchanged.
    effective_batch_size = max(1, batch_size // 2)

    test_loader = DataLoader(
        test_dataset,
        batch_size=effective_batch_size,
        sampler=test_sampler,
        pin_memory=False,
        drop_last=False,
        num_workers=0
    )

    return test_loader

In [16]:
class Evaluator:
    """Runs a model over a DataLoader and collects its predictions.

    Args:
        model: module called with ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` keyword arguments; element 0 of its output is
            taken as the prediction tensor.
        scalar: when not None, the forward pass runs under
            ``torch.cuda.amp.autocast()``.
    """

    def __init__(self, model, scalar=None):
        self.model = model
        self.scalar = scalar

    def _input_device(self):
        """Device the batches are moved to: wherever the model's parameters live."""
        try:
            return next(self.model.parameters()).device
        except StopIteration:
            # Parameter-free model: fall back to CUDA when it is available.
            return torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def evaluate(self, data_loader, tokenizer):
        """Return a flat list of float predictions, in loader order.

        Args:
            data_loader: iterable of dict batches with ``input_ids``,
                ``attention_mask`` and ``token_type_ids`` tensors.
            tokenizer: unused; kept for interface compatibility.
        """
        preds = []
        self.model.eval()
        # Follow the model instead of hard-coding .cuda(), so inputs and
        # weights always share a device.
        device = self._input_device()

        with torch.no_grad():
            for batch_data in data_loader:
                inputs = {
                    key: batch_data[key].to(device)
                    for key in ('input_ids', 'attention_mask', 'token_type_ids')
                }

                if self.scalar is not None:
                    with torch.cuda.amp.autocast():
                        outputs = self.model(**inputs)
                else:
                    outputs = self.model(**inputs)

                # Fix: squeeze() collapsed a single-row batch to a 0-d array whose
                # .tolist() is a bare float, making `preds += ...` raise TypeError.
                # reshape(-1) always yields a 1-D array.
                preds += outputs[0].detach().cpu().numpy().reshape(-1).tolist()
        return preds

def config(fold, model_name, load_model_path):
    """Prepare everything needed to run one fold of Model 2 on the test set.

    Seeds torch, builds the model and tokenizer, loads the fold checkpoint
    ``{load_model_path}/model{fold}.bin``, builds the test loader over the
    module-level ``test`` frame and moves the model to the GPU.

    Args:
        fold: fold index used in the checkpoint file name.
        model_name: backbone directory forwarded to ``make_model``.
        load_model_path: directory holding the fold checkpoints.

    Returns:
        ``(model, tokenizer, test_loader, scaler)``; ``scaler`` is always None.

    Raises:
        ValueError: when no CUDA device is available.
    """
    torch.manual_seed(2021)
    torch.cuda.manual_seed(2021)
    torch.cuda.manual_seed_all(2021)

    max_len = 250
    batch_size = 8

    # Fail fast: the original only checked for a GPU after the backbone and
    # checkpoint had already been loaded. Same exception type and message.
    if torch.cuda.device_count() < 1:
        raise ValueError('CPU training is not supported')

    model, tokenizer = make_model(
        model_name=model_name,
        num_labels=1
    )

    # map_location='cpu' keeps checkpoint tensors off the GPU until the
    # single .cuda() below; load_state_dict copies them into the model.
    model.load_state_dict(
        torch.load(f'{load_model_path}/model{fold}.bin', map_location='cpu')
    )

    test_loader = make_loader(
        test, tokenizer, max_len=max_len,
        batch_size=batch_size
    )

    print('Model pushed to {} GPU(s), type {}.'.format(
        torch.cuda.device_count(),
        torch.cuda.get_device_name(0))
    )
    model = model.cuda()

    # Mixed precision is disabled: Evaluator skips autocast when this is None.
    scaler = None
    return (
        model, tokenizer,
        test_loader, scaler
    )

# Inference

In [17]:
def run(fold=0, model_name=None, load_model_path=None):
    """Predict the test set with one fold checkpoint of Model 2.

    Args:
        fold: fold index forwarded to ``config``.
        model_name: backbone directory forwarded to ``config``.
        load_model_path: checkpoint directory forwarded to ``config``.

    Returns:
        The list of predictions produced by ``Evaluator.evaluate``.
    """
    # Removed from the original: an unused filler string, a redundant local
    # `import time`, and timing values (with their synchronize() calls) that
    # were computed but never read.
    model, tokenizer, \
        test_loader, scaler = config(fold, model_name, load_model_path)

    evaluator = Evaluator(model, scaler)
    preds = evaluator.evaluate(test_loader, tokenizer)

    # Release this fold's model before the caller loads the next one.
    del evaluator, model, tokenizer, test_loader, scaler
    gc.collect()
    torch.cuda.empty_cache()

    return preds

In [18]:
# One column of test predictions per fold:
#   pred_df1 <- roberta-base checkpoints, pred_df3 <- roberta-large checkpoints.
# The pred_df2 variant is disabled (commented out) throughout this notebook.
pred_df1 = pd.DataFrame()
#pred_df2 = pd.DataFrame()
pred_df3 = pd.DataFrame()

for fold in tqdm(range(5)):
    # `fold % 5` equals `fold` for range(5). Each run() call loads a checkpoint,
    # predicts the whole test set and frees the model again.
    pred_df1[f'fold{fold}'] = run(fold%5, '../input/roberta-transformers-pytorch/roberta-base/', '../input/commonlit-roberta-base-i/')
    #pred_df2[f'fold{fold+5}'] = run(fold%5, '../input/roberta-transformers-pytorch/roberta-large', '../input/roberta-large-itptfit/')
    pred_df3[f'fold{fold+10}'] = run(fold%5, '../input/roberta-transformers-pytorch/roberta-large', '../input/commonlit-roberta-large-ii/')

  0%|          | 0/5 [00:00<?, ?it/s]

Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.
Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.


 20%|██        | 1/5 [00:42<02:50, 42.55s/it]

Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.
Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.


 40%|████      | 2/5 [01:12<01:44, 34.95s/it]

Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.
Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.


 60%|██████    | 3/5 [01:42<01:05, 32.89s/it]

Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.
Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.


 80%|████████  | 4/5 [02:12<00:31, 31.75s/it]

Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.
Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.


100%|██████████| 5/5 [02:41<00:00, 32.34s/it]


In [19]:
# NOTE: rebinds the DataFrame names to plain arrays (rows = test excerpts, columns = folds).
pred_df1 = np.array(pred_df1)
#pred_df2 = np.array(pred_df2)
pred_df3 = np.array(pred_df3)

# Average over folds (axis=1), then blend: 0.6 * roberta-base + 0.4 * roberta-large.
model2_predictions = (pred_df1.mean(axis=1) * 0.6) + (pred_df3.mean(axis=1) * 0.4)

## Model 3 

Inspired by: https://www.kaggle.com/jcesquiveld/best-transformer-representations

In [20]:
import os
import numpy as np
import pandas as pd
import random

from transformers import AutoConfig, AutoModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup, logging

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, TensorDataset, SequentialSampler, RandomSampler, DataLoader

from tqdm.notebook import tqdm

import gc; gc.enable()
from IPython.display import clear_output

from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style('whitegrid')
logging.set_verbosity_error()

In [21]:
# Locations used by the mean-pooling model below.
INPUT_DIR = '../input/commonlitreadabilityprize'
MODEL_DIR = '../input/roberta-transformers-pytorch/roberta-large'
CHECKPOINT_DIR = '../input/clrp-mean-pooling/'

# NOTE: rebinds DEVICE from the string defined near the top of the notebook
# to a torch.device object.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

MAX_LENGTH = 248
# The inference loop below calls .item() on each batch output, which only
# works with one excerpt per batch.
TEST_BATCH_SIZE = 1
# Input width of MeanPoolingModel's linear head.
HIDDEN_SIZE = 1024

NUM_FOLDS = 5
# Checkpoint file names are built from `seed + 1` in the inference loop below.
SEEDS = [113]

# NOTE: rebinds `test` (previously an alias of test_df) to a fresh read of the same CSV.
test = pd.read_csv(os.path.join(INPUT_DIR, 'test.csv'))

In [22]:
class MeanPoolingModel(nn.Module):
    """Backbone + attention-mask-weighted mean pooling + linear regression head.

    The head's input width is the module-level ``HIDDEN_SIZE``. Sub-module
    names (``model``, ``linear``, ``loss``) are kept because state dicts are
    loaded into this class.
    """

    def __init__(self, model_name):
        super().__init__()

        backbone_config = AutoConfig.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name, config=backbone_config)
        self.linear = nn.Linear(HIDDEN_SIZE, 1)
        self.loss = nn.MSELoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """Return predictions, or the MSE loss when ``labels`` is given.

        Predictions have their trailing size-1 dimensions squeezed, so a batch
        of one row yields a 0-d tensor.
        """
        token_states = self.model(input_ids, attention_mask)[0]

        # Broadcast the mask over the feature axis so masked-out tokens add
        # nothing to the sum and are not counted in the divisor.
        mask = attention_mask.unsqueeze(-1).expand(token_states.size()).float()
        masked_sum = (token_states * mask).sum(1)
        token_count = mask.sum(1).clamp(min=1e-9)  # guards against division by zero
        pooled = masked_sum / token_count

        preds = self.linear(pooled).squeeze(-1).squeeze(-1)

        if labels is None:
            return preds
        return self.loss(preds.view(-1).float(), labels.view(-1).float())

In [23]:
def get_test_loader(data):
    """Tokenize all excerpts of ``data`` up front and wrap them in a DataLoader.

    Uses the module-level ``MODEL_DIR`` (tokenizer), ``MAX_LENGTH`` (padding /
    truncation length) and ``TEST_BATCH_SIZE``. Batches are
    ``(input_ids, attention_mask)`` pairs in the original row order.
    """
    excerpts = data.excerpt.tolist()

    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
    encoded = tokenizer.batch_encode_plus(
        excerpts,
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors='pt'
    )

    tensors = TensorDataset(encoded['input_ids'], encoded['attention_mask'])

    return DataLoader(
        tensors,
        sampler=SequentialSampler(tensors),
        batch_size=TEST_BATCH_SIZE
    )

# Built once and reused for every fold checkpoint in the inference loop below.
test_dataloader = get_test_loader(test)

In [24]:
# Model 3 inference: average the fold checkpoints of every seed, then average over seeds.
all_predictions = []
for seed in SEEDS:

    fold_predictions = []

    for fold in tqdm(range(NUM_FOLDS)):
        # Checkpoint files are named with `seed + 1` and a 1-based fold index.
        model_file = f"model_{seed + 1}_{fold + 1}.pth"

        print(f"\nUsing {model_file}")

        model_path = CHECKPOINT_DIR + model_file
        model = MeanPoolingModel(MODEL_DIR)
        model.load_state_dict(torch.load(model_path, map_location=DEVICE))
        model.to(DEVICE)
        model.eval()

        predictions = []
        # Fix: the original ran the forward passes with autograd enabled, building
        # a graph per batch that was never used. Inference needs no gradients.
        with torch.no_grad():
            for batch in test_dataloader:

                batch = tuple(b.to(DEVICE) for b in batch)

                inputs = {'input_ids':      batch[0],
                          'attention_mask': batch[1],
                          'labels':         None,
                         }

                # reshape(-1) handles both the 0-d output of a one-row batch and
                # larger batches, so TEST_BATCH_SIZE is no longer forced to 1.
                preds = model(**inputs)
                predictions.extend(preds.reshape(-1).tolist())

        del model
        gc.collect()
        torch.cuda.empty_cache()

        fold_predictions.append(predictions)
    all_predictions.append(np.mean(fold_predictions, axis=0).tolist())

model3_predictions = np.mean(all_predictions,axis=0)

  0%|          | 0/5 [00:00<?, ?it/s]


Using model_114_1.pth

Using model_114_2.pth

Using model_114_3.pth

Using model_114_4.pth

Using model_114_5.pth


In [25]:
import os
from pathlib import Path
from torch.utils.data import RandomSampler, SequentialSampler, Sampler
from torch.utils.data import Dataset, DataLoader

def convert_examples_to_features(text, tokenizer, max_len):
    """Tokenize ``text``, truncated and padded to exactly ``max_len`` tokens.

    NOTE: this definition replaces the earlier four-argument function of the
    same name from this point of the notebook onwards; unlike that one it does
    not strip newlines and lets the tokenizer do the padding.

    Returns:
        Whatever ``tokenizer.encode_plus`` returns.
    """
    return tokenizer.encode_plus(
        text,
        truncation=True,
        padding='max_length',
        max_length=max_len,
    )

class CLRPDataset(Dataset):
    """Dataset that tokenizes one excerpt per lookup.

    Args:
        data: DataFrame with an ``excerpt`` column and, unless ``is_test`` is
            True, a ``target`` column.
        tokenizer: forwarded to ``convert_examples_to_features``.
        max_len: forwarded to ``convert_examples_to_features``.
        is_test: when True, items carry no ``label`` entry.

    Items are dicts with ``torch.long`` tensors ``input_ids`` and
    ``attention_mask`` plus, in non-test mode, a ``torch.float`` ``label``.
    """

    def __init__(self, data, tokenizer, max_len, is_test=False):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.is_test = is_test

        self.excerpts = data.excerpt.tolist()
        if not is_test:
            self.targets = data.target.tolist()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item):
        features = convert_examples_to_features(
            self.excerpts[item], self.tokenizer, self.max_len
        )
        sample = {
            name: torch.tensor(features[name], dtype=torch.long)
            for name in ('input_ids', 'attention_mask')
        }
        if not self.is_test:
            sample['label'] = torch.tensor(self.targets[item], dtype=torch.float)
        return sample

# NOTE: re-reads the same test CSV that was already loaded into test_df near the top of the notebook.
test_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/test.csv")

class AttentionHead(nn.Module):
    """Additive attention pooling over dim 1 of the input.

    Args:
        h_size: size of the last dimension of the input features.
        hidden_dim: width of the intermediate scoring layer.
    """

    def __init__(self, h_size, hidden_dim=512):
        super().__init__()
        self.W = nn.Linear(h_size, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)

    def forward(self, features):
        """Return the attention-weighted sum of ``features`` along dim 1."""
        # One scalar score per position, normalised across positions (dim=1).
        scores = self.V(torch.tanh(self.W(features)))
        weights = torch.softmax(scores, dim=1)
        return (weights * features).sum(dim=1)

class CLRPModel(nn.Module):
    """Regression head over the concatenated last four hidden layers.

    The attention-pooled vector and the first-token vector of the concatenated
    layers are joined and mapped to a single score by ``linear_out``.
    ``self.linear`` is created but not used by ``forward``.

    Args:
        transformer: backbone whose output exposes ``hidden_states``.
        config: object providing ``hidden_size``.
    """

    def __init__(self,transformer,config):
        super(CLRPModel,self).__init__()
        self.h_size = config.hidden_size
        self.transformer = transformer
        self.head = AttentionHead(self.h_size*4)
        self.linear = nn.Linear(self.h_size*2, 1)
        self.linear_out = nn.Linear(self.h_size*8, 1)

    def forward(self, input_ids, attention_mask):
        """Return one score per row of the batch."""
        backbone_out = self.transformer(input_ids, attention_mask)

        layers = torch.stack(backbone_out.hidden_states)
        # Last layer first, then the three layers before it, joined on the feature axis.
        last_four = [layers[-depth] for depth in (1, 2, 3, 4)]
        stacked_features = torch.cat(last_four, -1)

        first_token = stacked_features[:, 0]
        pooled = self.head(stacked_features)

        return self.linear_out(torch.cat([pooled, first_token], -1))

def predic(data, in_folder_path, num_models=5, max_len=256, batch_size=4):
    """Average the test predictions of the models saved under ``in_folder_path``.

    The tokenizer is loaded from ``in_folder_path`` itself and the models from
    ``in_folder_path / 'models' / best_model_{i}.pt`` for ``i`` in
    ``range(num_models)``.

    Args:
        data: frame handed to ``CLRPDataset`` (test mode).
        in_folder_path: directory holding the tokenizer files and ``models/``.
        num_models: number of saved models to average (default 5).
        max_len: token length passed to ``CLRPDataset`` (default 256).
        batch_size: inference batch size (default 4).

    Returns:
        np.ndarray with one averaged prediction per row of ``data``.
    """
    # Removed from the original: a large string literal that was never used.
    in_folder_path = Path(in_folder_path)
    models_folder_path = Path(in_folder_path / 'models')

    # Same device as before on a GPU machine; falls back to CPU instead of
    # crashing when CUDA is unavailable.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Tokenizer, dataset and loader do not depend on the model index, so they
    # are built once instead of once per model.
    tokenizer = AutoTokenizer.from_pretrained(in_folder_path)
    test_ds = CLRPDataset(data=data, tokenizer=tokenizer, max_len=max_len, is_test=True)
    test_dataloader = DataLoader(
        test_ds, sampler=SequentialSampler(test_ds), batch_size=batch_size
    )

    models_preds = []

    for model_num in range(num_models):
        print(f'Inference#{model_num+1}/{num_models}')

        # NOTE(security): the result of torch.load is used directly as a model,
        # i.e. the file holds a pickled module; unpickling can execute arbitrary
        # code, so only load files from a trusted source. The class definitions
        # the pickle refers to must be importable when this runs.
        model = torch.load(
            models_folder_path / f'best_model_{model_num}.pt',
            map_location=device,
        ).to(device)
        print(f'Model {models_folder_path} / best_model_{model_num}.pt is pushed to Device')

        model.eval()
        all_preds = []

        with torch.no_grad():
            for batch in test_dataloader:
                sent_id = batch['input_ids'].to(device)
                mask = batch['attention_mask'].to(device)
                all_preds += model(sent_id, mask).flatten().cpu().tolist()

        models_preds.append(all_preds)

        # Free this model before the next one is loaded.
        del model
        gc.collect()
        torch.cuda.empty_cache()

    return np.array(models_preds).mean(axis = 0)

In [26]:
# Averaged predictions of the five models stored under ../input/comdbli/models.
pd1 = predic(test_df, '../input/comdbli')

Inference#1/5
Model ../input/comdbli/models / best_model_0.pt is pushed to Device
Inference#2/5
Model ../input/comdbli/models / best_model_1.pt is pushed to Device
Inference#3/5
Model ../input/comdbli/models / best_model_2.pt is pushed to Device
Inference#4/5
Model ../input/comdbli/models / best_model_3.pt is pushed to Device
Inference#5/5
Model ../input/comdbli/models / best_model_4.pt is pushed to Device


In [27]:
# Display the averaged predictions (one value per test excerpt).
pd1

array([-0.29079663, -0.46820574, -0.38955538, -2.14445047, -1.80839045,
       -1.17013342,  0.25433525])

In [28]:
class CLRPModel(nn.Module):
    """Transformer regressor: attention pooling + first-token pooling, tanh bottleneck.

    The last four hidden-state layers returned by ``transformer`` are
    concatenated along the feature axis. That tensor is pooled twice, once by
    ``AttentionHead`` (defined elsewhere in this notebook) and once by taking
    the token at position 0. The two pooled vectors are concatenated, passed
    through ``linear`` + ``tanh`` and projected to a single value by
    ``linear_out``.

    Attribute names (``transformer``, ``head``, ``linear``, ``linear_out``,
    ``tanh``) are kept unchanged; the fold checkpoints in this notebook are
    loaded as whole pickled modules via ``torch.load`` and are presumably
    instances of this class, so renaming attributes would break them
    (TODO confirm).

    Args:
        transformer: backbone called as ``transformer(input_ids, attention_mask)``;
            its output must expose ``hidden_states`` (with Hugging Face models
            this presumably requires ``output_hidden_states=True`` - confirm).
        config: object providing ``hidden_size``.
    """

    def __init__(self, transformer, config):
        super().__init__()
        self.h_size = config.hidden_size
        self.transformer = transformer
        # Pools the concatenation of four layers, hence 4 * hidden_size.
        self.head = AttentionHead(self.h_size * 4)
        # Input is [attention-pooled | first-token] -> 8 * hidden_size features.
        self.linear = nn.Linear(self.h_size * 8, self.h_size // 2)
        self.linear_out = nn.Linear(self.h_size // 2, 1)
        self.tanh = nn.Tanh()

    def forward(self, input_ids, attention_mask):
        """Return one prediction per input sequence.

        Args:
            input_ids: token id tensor, forwarded to the transformer as-is.
            attention_mask: attention mask tensor, forwarded as-is.

        Returns:
            The output of ``linear_out``: last dimension of size 1
            (assumes hidden states are (batch, seq_len, hidden_size), giving
            a (batch, 1) result - TODO confirm against the backbone).
        """
        transformer_out = self.transformer(input_ids, attention_mask)

        # Read the last four layers straight from the per-layer sequence.
        # Stacking *all* layers into one tensor first would copy every hidden
        # state only to index four of them; the values are identical.
        hidden_states = transformer_out.hidden_states
        cat_over_last_layers = torch.cat(
            (hidden_states[-1], hidden_states[-2], hidden_states[-3], hidden_states[-4]), -1
        )

        # Representation of the first token (position 0 along the sequence axis).
        cls_pooling = cat_over_last_layers[:, 0]
        head_logits = self.head(cat_over_last_layers)
        logits = self.tanh(self.linear(torch.cat([head_logits, cls_pooling], -1)))
        y_hat = self.linear_out(logits)

        return y_hat


In [29]:
# Ensemble member pd3: same predic routine, now reading the checkpoints under
# ../input/dbx777/models. Presumably these checkpoints need the CLRPModel
# definition from the previous cell to be in scope when unpickled - confirm.
pd3 = predic(test_df, '../input/dbx777')

Inference#1/5
Model ../input/dbx777/models / best_model_0.pt is pushed to Device
Inference#2/5
Model ../input/dbx777/models / best_model_1.pt is pushed to Device
Inference#3/5
Model ../input/dbx777/models / best_model_2.pt is pushed to Device
Inference#4/5
Model ../input/dbx777/models / best_model_3.pt is pushed to Device
Inference#5/5
Model ../input/dbx777/models / best_model_4.pt is pushed to Device


In [30]:
# Display the fold-averaged predictions as a quick sanity check.
pd3

array([-0.29928167, -0.33505895, -0.41167788, -2.06349804, -1.85605731,
       -1.22650595,  0.26501839])

In [31]:
# Ensemble member pd4: mean prediction of the five fold checkpoints stored in
# ../input/notebook6921fb919e/models, tokenized with the tokenizer shipped in
# ../input/comdbli.
models_folder_path = Path('../input/notebook6921fb919e/models')
models_preds = []

# Tokenizer, dataset and loader are identical for every fold, so build them once.
# A dedicated name is used on purpose: this is top-level notebook code, and
# rebinding (and later deleting) `tokenizer` here would remove the global that
# LitDataset reads, breaking any re-run of earlier cells.
pd4_tokenizer = AutoTokenizer.from_pretrained('../input/comdbli')
test_ds = CLRPDataset(data=test_df, tokenizer=pd4_tokenizer, max_len=256, is_test=True)
test_sampler = SequentialSampler(test_ds)
test_dataloader = DataLoader(test_ds, sampler=test_sampler, batch_size=4)

for model_num in range(5):
    print(f'Inference#{model_num+1}/5')
    # NOTE(security): torch.load unpickles a complete model object, which can
    # execute arbitrary code - only point this at trusted checkpoint files.
    model = torch.load(models_folder_path / f'best_model_{model_num}.pt').to('cuda')
    print(f'Model {models_folder_path} / best_model_{model_num}.pt is pushed to Device')

    all_preds = []
    model.eval()

    with torch.no_grad():
        for batch in test_dataloader:
            sent_id, mask = batch['input_ids'].to('cuda'), batch['attention_mask'].to('cuda')
            preds = model(sent_id, mask)
            all_preds += preds.flatten().cpu().tolist()

    models_preds.append(all_preds)
    # Drop the fold's model before loading the next one to keep GPU memory flat.
    del model
    gc.collect()
    torch.cuda.empty_cache()

del pd4_tokenizer, test_dataloader, test_sampler, test_ds
# Rows are folds, columns are test samples: average over the folds.
pd4 = np.array(models_preds).mean(axis = 0)

Inference#1/5
Model ../input/notebook6921fb919e/models / best_model_0.pt is pushed to Device
Inference#2/5
Model ../input/notebook6921fb919e/models / best_model_1.pt is pushed to Device
Inference#3/5
Model ../input/notebook6921fb919e/models / best_model_2.pt is pushed to Device
Inference#4/5
Model ../input/notebook6921fb919e/models / best_model_3.pt is pushed to Device
Inference#5/5
Model ../input/notebook6921fb919e/models / best_model_4.pt is pushed to Device


In [32]:
# Display the fold-averaged predictions as a quick sanity check.
pd4

array([-0.23531469, -0.44356807, -0.47543548, -2.14265709, -1.72869146,
       -1.15228729,  0.35644643])

In [33]:
class CLRPModel(nn.Module):
    """Transformer regressor: attention pooling + first-token pooling, direct linear output.

    This redefinition replaces the earlier ``CLRPModel`` in the notebook
    namespace. It differs from it by having no tanh bottleneck: the
    concatenated pooled features go straight into ``linear_out``.

    The last four hidden-state layers returned by ``transformer`` are
    concatenated along the feature axis, pooled by ``AttentionHead`` (defined
    elsewhere in this notebook) and by taking the token at position 0; both
    pooled vectors are concatenated and projected to a single value.

    Args:
        transformer: backbone called as ``transformer(input_ids, attention_mask)``;
            its output must expose ``hidden_states`` (with Hugging Face models
            this presumably requires ``output_hidden_states=True`` - confirm).
        config: object providing ``hidden_size``.
    """

    def __init__(self, transformer, config):
        super().__init__()
        self.h_size = config.hidden_size
        self.transformer = transformer
        # Pools the concatenation of four layers, hence 4 * hidden_size.
        self.head = AttentionHead(self.h_size * 4)
        # Not used by forward(). Kept because removing it would change the
        # module's parameter set; presumably the saved checkpoints contain it
        # (TODO confirm before deleting).
        self.linear = nn.Linear(self.h_size * 2, 1)
        # Input is [attention-pooled | first-token] -> 8 * hidden_size features.
        self.linear_out = nn.Linear(self.h_size * 8, 1)

    def forward(self, input_ids, attention_mask):
        """Return one prediction per input sequence.

        Args:
            input_ids: token id tensor, forwarded to the transformer as-is.
            attention_mask: attention mask tensor, forwarded as-is.

        Returns:
            The output of ``linear_out``: last dimension of size 1
            (assumes hidden states are (batch, seq_len, hidden_size), giving
            a (batch, 1) result - TODO confirm against the backbone).
        """
        transformer_out = self.transformer(input_ids, attention_mask)

        # Read the last four layers straight from the per-layer sequence.
        # Stacking *all* layers into one tensor first would copy every hidden
        # state only to index four of them; the values are identical.
        hidden_states = transformer_out.hidden_states
        cat_over_last_layers = torch.cat(
            (hidden_states[-1], hidden_states[-2], hidden_states[-3], hidden_states[-4]), -1
        )

        # Representation of the first token (position 0 along the sequence axis).
        cls_pooling = cat_over_last_layers[:, 0]
        head_logits = self.head(cat_over_last_layers)
        y_hat = self.linear_out(torch.cat([head_logits, cls_pooling], -1))

        return y_hat
def predic2(data, in_folder_path):
    """Average the predictions of the five fold checkpoints in ``in_folder_path/models``.

    Unlike the directory-based tokenizer loading used elsewhere in this
    notebook, the tokenizer here is a pickled object read from
    ``../input/tokenizers/roberta-tokenizer.pt``.

    Args:
        data: forwarded to ``CLRPDataset`` as its ``data`` argument
            (this notebook passes ``test_df``).
        in_folder_path: str or ``Path``; checkpoints are read from
            ``<in_folder_path>/models/best_model_{0..4}.pt``.

    Returns:
        numpy.ndarray: element-wise mean over the five models of their
        flattened predictions, in dataset order.
    """
    in_folder_path = Path(in_folder_path)
    models_folder_path = Path(in_folder_path / 'models')
    models_preds = []

    # Tokenizer, dataset and loader do not depend on the fold: build them once.
    # NOTE(security): torch.load unpickles arbitrary objects (here and for the
    # models below) - only use it on trusted files.
    tokenizer = torch.load('../input/tokenizers/roberta-tokenizer.pt')
    test_ds = CLRPDataset(data=data, tokenizer=tokenizer, max_len=256, is_test=True)
    test_sampler = SequentialSampler(test_ds)
    test_dataloader = DataLoader(test_ds, sampler=test_sampler, batch_size=4)

    for model_num in range(5):
        print(f'Inference#{model_num+1}/5')
        model = torch.load(models_folder_path / f'best_model_{model_num}.pt').to('cuda')
        print(f'Model {models_folder_path} / best_model_{model_num}.pt is pushed to Device')

        all_preds = []
        model.eval()

        with torch.no_grad():
            for batch in test_dataloader:
                sent_id, mask = batch['input_ids'].to('cuda'), batch['attention_mask'].to('cuda')
                preds = model(sent_id, mask)
                all_preds += preds.flatten().cpu().tolist()

        models_preds.append(all_preds)
        # Drop the fold's model before loading the next one to keep GPU memory flat.
        del model
        gc.collect()
        torch.cuda.empty_cache()

    # Rows are folds, columns are samples: average over the folds.
    return np.array(models_preds).mean(axis = 0)

In [34]:
# Ensemble member pd2: predic2 averages the five checkpoints under
# ../input/0463-robertalarge/models on test_df.
pd2 = predic2(test_df, '../input/0463-robertalarge')

Inference#1/5
Model ../input/0463-robertalarge/models / best_model_0.pt is pushed to Device
Inference#2/5
Model ../input/0463-robertalarge/models / best_model_1.pt is pushed to Device
Inference#3/5
Model ../input/0463-robertalarge/models / best_model_2.pt is pushed to Device
Inference#4/5
Model ../input/0463-robertalarge/models / best_model_3.pt is pushed to Device
Inference#5/5
Model ../input/0463-robertalarge/models / best_model_4.pt is pushed to Device


In [35]:
# Display the fold-averaged predictions as a quick sanity check.
pd2

array([-0.34481332, -0.48203456, -0.48146576, -2.17766552, -1.91580055,
       -1.30661547,  0.17578244])

In [36]:
# Final blend = plain average of two groups.
# Group 1: mean of five prediction vectors (effective weight 1/10 each).
first_group_mean = (
    model1_predictions
    + mdp
    + pred_df1.mean(axis=1)
    + pred_df3.mean(axis=1)
    + model3_predictions
) / 5

# Group 2: pd1/pd4/pd3 are averaged first (1/12 each overall), then that mean
# is averaged with pd2 (1/4 overall).
clrp_trio_mean = (pd1 + pd4 + pd3) / 3
second_group_mean = (clrp_trio_mean + pd2) / 2

predictions = (first_group_mean + second_group_mean) / 2
predictions

array([-0.37558921, -0.487815  , -0.45136422, -2.27267265, -1.81203254,
       -1.20362545,  0.19535162])

In [37]:
# Side-by-side view of three individual model outputs and the final blend,
# one row per test sample.
result_columns = ['model1', 'model2', 'model3', 'ensemble']
stacked_predictions = np.vstack(
    (model1_predictions, model2_predictions, model3_predictions, predictions)
)
results = pd.DataFrame(stacked_predictions.T, columns=result_columns)

results

Unnamed: 0,model1,model2,model3,ensemble
0,-0.437155,-0.466744,-0.403613,-0.375589
1,-0.656492,-0.601198,-0.390972,-0.487815
2,-0.354968,-0.472127,-0.432696,-0.451364
3,-2.515151,-2.403332,-2.249735,-2.272673
4,-1.678813,-1.791369,-1.838367,-1.812033
5,-1.427003,-1.100678,-1.113217,-1.203625
6,0.050763,0.160442,0.106686,0.195352


In [38]:
# Write the blended predictions into the submission frame.
# Bracket assignment always targets the column; attribute-style assignment
# (`submission_df.target = ...`) would silently set a plain Python attribute
# instead if the `target` column were ever missing from the sample file.
submission_df["target"] = predictions
submission_df

Unnamed: 0,id,target
0,c0f722661,-0.375589
1,f0953f0a5,-0.487815
2,0df072751,-0.451364
3,04caf4e0c,-2.272673
4,0e63f8bea,-1.812033
5,12537fe78,-1.203625
6,965e592c0,0.195352


In [39]:
# Save the submission to submission.csv in the working directory;
# index=False keeps the DataFrame index out of the file.
submission_df.to_csv("submission.csv", index=False)