In [1]:
import os
import numpy as np
import pandas as pd
import random

from transformers import AutoConfig, AutoModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup, logging

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, TensorDataset, SequentialSampler, RandomSampler, DataLoader

from tqdm.notebook import tqdm

import gc; gc.enable()
from IPython.display import clear_output

from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's white-grid theme for any figure drawn later in the notebook.
sns.set_style('whitegrid')
# `logging` is the transformers logging module imported above; restrict it to errors
# so model/tokenizer loading does not flood the cell outputs with warnings.
logging.set_verbosity_error()

In [2]:
# --- Input locations (Kaggle datasets mounted under ../input) ----------------
INPUT_DIR = '../input/commonlitreadabilityprize'
MODEL_DIR = '../input/roberta-transformers-pytorch/roberta-large'
CHECKPOINT_DIR1 = '../input/clrp-mean-pooling/'
CHECKPOINT_DIR2 = '../input/clrp-mean-pooling-seeds-17-43/'

# --- Ensemble layout ----------------------------------------------------------
NUM_FOLDS = 5
SEEDS = [17, 43]

# --- Model / tokenisation sizes ----------------------------------------------
# HIDDEN_SIZE is the input width of MeanPoolingModel's linear head.
HIDDEN_SIZE = 1024
# MAX_LENGTH is read by get_test_loader, MAX_LEN by CLRPDataset; both are 300 here.
# NOTE: MAX_LEN and BATCH_SIZE are re-assigned further down for the second model,
# so the cells of this notebook must be executed top to bottom.
MAX_LENGTH = 300
MAX_LEN = 300

# --- Batching -----------------------------------------------------------------
TEST_BATCH_SIZE = 1
BATCH_SIZE = 8

# --- Runtime objects ----------------------------------------------------------
# Prefer the GPU when one is visible, otherwise fall back to the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Tokenizer shared by CLRPDataset / predict(df, model).
TOKENIZER = AutoTokenizer.from_pretrained(MODEL_DIR)

In [3]:
# Template for the final submission file; its `target` column is overwritten at the
# end of the notebook with the blended predictions.
submission = pd.read_csv(os.path.join(INPUT_DIR, 'sample_submission.csv'))

In [4]:
# Test excerpts scored by the roberta-large mean-pooling checkpoints below
# (the second model re-reads the same CSV into `test_df` later on).
test = pd.read_csv(os.path.join(INPUT_DIR, 'test.csv'))
test.head(2)

Unnamed: 0,id,url_legal,license,excerpt
0,c0f722661,,,My hope lay in Jack's promise that he would ke...
1,f0953f0a5,,,Dotty continued to go to Mrs. Gray's every nig...


In [5]:
class MeanPoolingModel(nn.Module):
    """Transformer encoder followed by masked mean pooling and a 1-unit linear head.

    The sub-module names (`model`, `linear`, `loss`) are part of the checkpoint
    format: `load_state_dict` matches on them, so they must not be renamed.
    """

    def __init__(self, model_name):
        """Build the encoder from `model_name` and attach the regression head.

        Args:
            model_name: path or identifier passed to `AutoConfig.from_pretrained`
                and `AutoModel.from_pretrained`.
        """
        super().__init__()

        config = AutoConfig.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name, config=config)

        # Size the head from the encoder that was actually loaded, so the layer
        # cannot silently disagree with the backbone. The notebook-level
        # HIDDEN_SIZE constant is only consulted if the config does not expose
        # `hidden_size`.
        hidden_size = getattr(config, 'hidden_size', None)
        if hidden_size is None:
            hidden_size = HIDDEN_SIZE
        self.linear = nn.Linear(hidden_size, 1)
        self.loss = nn.MSELoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """Score a batch of tokenised texts.

        Args:
            input_ids: token ids for the batch (assumed `(batch, seq_len)`).
            attention_mask: mask aligned with `input_ids`; positions with 0 are
                excluded from the pooled average.
            labels: optional regression targets. When given, the MSE loss is
                returned instead of the predictions.

        Returns:
            The MSE loss tensor if `labels` is not None, otherwise the
            predictions with their trailing singleton axes squeezed away.
        """
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs[0]

        # Broadcast the mask over the hidden axis so padded positions contribute
        # nothing to the sum.
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)

        # Clamp guards against division by zero for an all-zero mask.
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask

        logits = self.linear(mean_embeddings)

        # The second squeeze only has an effect for a batch of one, where the
        # result collapses to a 0-dim tensor (callers use `.item()` / `.ravel()`).
        preds = logits.squeeze(-1).squeeze(-1)

        if labels is not None:
            return self.loss(preds.view(-1).float(), labels.view(-1).float())
        return preds

In [6]:

class CLRPDataset(Dataset):
    """Dataset that tokenises one text per item, lazily, on access.

    Args:
        texts: indexable collection of strings.
        tokenizer: callable tokenizer invoked with a single text and the
            keyword options listed in `__getitem__`.
    """

    def __init__(self, texts, tokenizer):
        self.texts, self.tokenizer = texts, tokenizer

    def __len__(self):
        """Number of texts held by the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenise and return the text at position `idx`.

        Each text is padded/truncated to the notebook-level MAX_LEN and returned
        as PyTorch tensors. Because a single text is encoded with
        `return_tensors='pt'`, each tensor presumably keeps a leading length-1
        axis; `predict(df, model)` reshapes batches accordingly.
        """
        tokenizer_options = dict(
            padding='max_length',
            max_length=MAX_LEN,
            truncation=True,
            add_special_tokens=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return self.tokenizer(self.texts[idx], **tokenizer_options)

In [7]:
def get_test_loader(data):
    """Tokenise every excerpt up front and wrap the tensors in a sequential DataLoader.

    Args:
        data: DataFrame with an `excerpt` column of texts.

    Returns:
        A DataLoader yielding `(input_ids, attention_mask)` batches of
        TEST_BATCH_SIZE rows, in the original row order.
    """
    excerpts = data.excerpt.tolist()

    # A fresh tokenizer is loaded from MODEL_DIR on every call.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)

    # Pad/truncate every excerpt to MAX_LENGTH so the tensors stack cleanly.
    encoded = tokenizer.batch_encode_plus(
        excerpts,
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors='pt',
    )

    tensors = TensorDataset(encoded['input_ids'], encoded['attention_mask'])

    # SequentialSampler keeps predictions aligned with the rows of `data`.
    return DataLoader(
        tensors,
        sampler=SequentialSampler(tensors),
        batch_size=TEST_BATCH_SIZE,
    )

# Build the TEST_BATCH_SIZE (= 1) loader eagerly. In the visible part of this
# notebook it is only consumed by the commented-out ensemble loop in the next cell.
test_dataloader = get_test_loader(test)

In [8]:

# NOTE: this whole cell is disabled (every statement is commented out). It is an
# alternative per-seed / per-fold inference loop over `test_dataloader`; the fold
# averaging that actually runs is the later cell that loads the `model_72_*` and
# `model_114_*` checkpoints.
# CHECKPOINT_DIR1 = '../input/clrp-mean-pooling/'
# CHECKPOINT_DIR2 = '../input/clrp-mean-pooling-seeds-17-43/'

# all_predictions = [] 
# for seed in SEEDS:
    
#     fold_predictions = []
    
#     for fold in tqdm(range(NUM_FOLDS)):
        
#         model_path = f"model_{seed + 1}_{fold + 1}.pth" 
#         print(f"\nUsing {model_path}")
        
#         if seed in [113, 71]:
#             model_path = CHECKPOINT_DIR1 + f"model_{seed + 1}_{fold + 1}.pth"
            
#         if seed in [17, 43]:
#             model_path = CHECKPOINT_DIR2 + f"model_{seed + 1}_{fold + 1}.pth"            
            
#         model = MeanPoolingModel(MODEL_DIR)
#         model.load_state_dict(torch.load(model_path)) 
#         model.to(DEVICE)
#         model.eval()

#         predictions = []
#         for batch in test_dataloader:

#             batch = tuple(b.to(DEVICE) for b in batch)

#             inputs = {'input_ids':      batch[0],
#                       'attention_mask': batch[1],
#                       'labels':         None,
#                      }

     
#             preds = model(**inputs).item()
#             predictions.append(preds)
            
#         del model 
#         gc.collect()
            
#         fold_predictions.append(predictions)
#     all_predictions.append(np.mean(fold_predictions, axis=0).tolist())
    
# model_predictions = np.mean(all_predictions,axis=0)

In [9]:
def predict(df, model):
    """Run `model` over the excerpts in `df` and return one prediction per row.

    NOTE: a later cell defines another `predict(model, data_loader)` with a
    different signature, which replaces this one once executed. Run the cells
    in order.

    Args:
        df: DataFrame with an `excerpt` column.
        model: module called as `model(**inputs)` with the tokenizer's output
            keys; it is moved to DEVICE and switched to eval mode here.

    Returns:
        A list of scalar predictions in the row order of `df`.
    """
    ds = CLRPDataset(df.excerpt.tolist(), TOKENIZER)
    dl = DataLoader(
        ds,
        batch_size=BATCH_SIZE,
        shuffle=False,  # keep predictions aligned with the rows of `df`
        pin_memory=False
    )

    model.to(DEVICE)
    model.eval()
    model.zero_grad()

    predictions = []
    # Inference only: disable autograd so no graph (and no activation memory)
    # is kept for each batch.
    with torch.no_grad():
        for batch in tqdm(dl):
            # Flatten everything after the batch axis before moving to DEVICE
            # (CLRPDataset items appear to carry an extra length-1 axis).
            inputs = {key: val.reshape(val.shape[0], -1).to(DEVICE) for key, val in batch.items()}
            outputs = model(**inputs)
            predictions.extend(outputs.detach().cpu().numpy().ravel())

    return predictions

In [10]:
# Calculate predictions of each fold and average them
# One checkpoint per fold: folds 1-3 come from the `model_72_*` files and folds 4-5
# from the `model_114_*` files (presumably seeds 71 and 113, following the
# `model_{seed + 1}_{fold + 1}` naming used in the disabled cell above).
s = [
    '../input/clrp-mean-pooling/model_72_1.pth',
    '../input/clrp-mean-pooling/model_72_2.pth',
    '../input/clrp-mean-pooling/model_72_3.pth',
    '../input/clrp-mean-pooling/model_114_4.pth',
    '../input/clrp-mean-pooling/model_114_5.pth',
]
fold_predictions = []
for path in s:
    model = MeanPoolingModel(MODEL_DIR)
    # map_location lets checkpoints saved on a GPU load when DEVICE falls back to
    # the CPU (same pattern as the second model's loading loop further down).
    model.load_state_dict(torch.load(path, map_location=DEVICE))
    print(f'*** fold : {path} ***')
    y_pred = predict(test, model)
    fold_predictions.append(y_pred)

    # Free memory before the next roberta-large instance is created
    del model
    gc.collect()

# Mean over the fold axis -> one prediction per test excerpt
model_predictions = np.mean(fold_predictions, axis=0)

*** fold : ../input/clrp-mean-pooling/model_72_1.pth ***


  0%|          | 0/1 [00:00<?, ?it/s]

*** fold : ../input/clrp-mean-pooling/model_72_2.pth ***


  0%|          | 0/1 [00:00<?, ?it/s]

*** fold : ../input/clrp-mean-pooling/model_72_3.pth ***


  0%|          | 0/1 [00:00<?, ?it/s]

*** fold : ../input/clrp-mean-pooling/model_114_4.pth ***


  0%|          | 0/1 [00:00<?, ?it/s]

*** fold : ../input/clrp-mean-pooling/model_114_5.pth ***


  0%|          | 0/1 [00:00<?, ?it/s]

In [11]:
# Configuration for the second (roberta-base, attention-pooling) model.
# NOTE: BATCH_SIZE, MAX_LEN and DEVICE re-bind names assigned in the first config
# cell, so every cell of the mean-pooling model must have run before this one.

# Locations of the backbone weights and its tokenizer
ROBERTA_PATH = "/kaggle/input/roberta-base"
TOKENIZER_PATH = "/kaggle/input/roberta-base"

# Device is stored as a plain string here (the first config cell used torch.device)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Batching / tokenisation
BATCH_SIZE = 32
MAX_LEN = 248

# Not read anywhere in the visible inference code; presumably left over from training.
EVAL_SCHEDULE = [(0.50, 16), (0.49, 8), (0.48, 4), (0.47, 2), (-1., 1)]

In [12]:
class LitDataset(Dataset):
    """Dataset that tokenises all excerpts of a DataFrame once, at construction.

    Relies on two notebook-level names being defined before instantiation: the
    global `tokenizer` (created in a later cell) and `MAX_LEN`.

    Args:
        df: DataFrame with an `excerpt` column and, unless `inference_only`,
            a `target` column.
        inference_only: when True no targets are read and items are
            `(input_ids, attention_mask)`; otherwise items also carry the target.
    """

    def __init__(self, df, inference_only=False):
        super().__init__()

        self.df = df
        self.inference_only = inference_only
        self.text = df.excerpt.tolist()

        # Targets exist only for labelled (training / validation) frames.
        if not inference_only:
            self.target = torch.tensor(df.target.values, dtype=torch.float32)

        # Encode everything eagerly; rows are padded/truncated to MAX_LEN and kept
        # as plain Python lists until an item is requested.
        self.encoded = tokenizer.batch_encode_plus(
            self.text,
            padding='max_length',
            max_length=MAX_LEN,
            truncation=True,
            return_attention_mask=True,
        )

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return the tensors for row `index` (with the target unless inference-only)."""
        sample = (
            torch.tensor(self.encoded['input_ids'][index]),
            torch.tensor(self.encoded['attention_mask'][index]),
        )
        if not self.inference_only:
            sample = sample + (self.target[index],)
        return sample

In [13]:
class LitModel(nn.Module):
    """RoBERTa encoder with a learned attention pooling layer and a linear regressor.

    The sub-module names (`roberta`, `attention`, `regressor`) and the layout of
    the two `nn.Sequential` containers are part of the checkpoint format used by
    `load_state_dict`; they must stay exactly as they are.
    """

    def __init__(self):
        super().__init__()

        config = AutoConfig.from_pretrained(ROBERTA_PATH)
        config.update({"output_hidden_states": True,
                       "hidden_dropout_prob": 0.0,
                       "layer_norm_eps": 1e-7})

        self.roberta = AutoModel.from_pretrained(ROBERTA_PATH, config=config)

        # Take the width from the loaded config instead of hard-coding it
        # (768 for roberta-base), so the heads always match the backbone.
        hidden_size = config.hidden_size

        # Scores every token position, then normalises the scores across the
        # sequence axis (dim=1) so they can be used as averaging weights.
        self.attention = nn.Sequential(
            nn.Linear(hidden_size, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1)
        )

        self.regressor = nn.Sequential(
            nn.Linear(hidden_size, 1)
        )

    def forward(self, input_ids, attention_mask):
        """Return one regression score per input row.

        Args:
            input_ids: token ids for the batch.
            attention_mask: mask aligned with `input_ids`, forwarded to the encoder.

        Returns:
            Tensor with a trailing axis of size 1 (output of the final Linear).
        """
        roberta_output = self.roberta(input_ids=input_ids,
                                      attention_mask=attention_mask)

        # `hidden_states` holds the embedding output plus one entry per encoder
        # layer (13 entries for roberta-base); the last entry is the final layer.
        last_layer_hidden_states = roberta_output.hidden_states[-1]

        # weights: BATCH_SIZE x MAX_LEN x 1
        # NOTE: the softmax runs over every position, padding included; the
        # attention mask is not applied to the weights. Left as is so behaviour
        # stays identical for the checkpoints loaded in this notebook.
        weights = self.attention(last_layer_hidden_states)

        # Weighted sum over the sequence axis -> BATCH_SIZE x hidden_size
        context_vector = torch.sum(weights * last_layer_hidden_states, dim=1)

        # Reduce the context vector to the prediction score.
        return self.regressor(context_vector)

In [14]:
def predict(model, data_loader):
    """Returns an np.array with predictions of the |model| on |data_loader|

    NOTE: this definition replaces the earlier `predict(df, model)` from the
    mean-pooling section; after this cell runs, only this signature is available.

    Args:
        model: module called as `model(input_ids, attention_mask)`.
        data_loader: loader yielding `(input_ids, attention_mask)` batches; its
            `.dataset` length fixes the size of the returned array.
    """
    model.eval()

    result = np.zeros(len(data_loader.dataset))
    start = 0

    with torch.no_grad():
        for input_ids, attention_mask in data_loader:
            batch_pred = model(input_ids.to(DEVICE), attention_mask.to(DEVICE))

            # Write this batch into its slot; the last batch may be shorter.
            stop = start + batch_pred.shape[0]
            result[start:stop] = batch_pred.flatten().to("cpu")
            start = stop

    return result

In [15]:
# Second model: re-read the test set and submission template via absolute Kaggle
# paths (same file names that were loaded into `test` / `submission` above).
test_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/test.csv")

submission_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/sample_submission.csv")

In [16]:
# Module-level tokenizer that LitDataset reads as a global; must exist before any
# LitDataset is instantiated.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)

In [17]:
# Tokenise the test set for the attention model.
# NOTE(review): the next cell builds the same dataset again, so this cell looks redundant.
test_dataset = LitDataset(test_df, inference_only=True)

In [18]:
NUM_MODELS = 5

# Tokenise once and reuse the same ordered loader for every checkpoint.
test_dataset = LitDataset(test_df, inference_only=True)
test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    drop_last=False,
    num_workers=2,
)

# One row of predictions per checkpoint, one column per test excerpt.
all_predictions = np.zeros((NUM_MODELS, len(test_df)))

for model_index in range(NUM_MODELS):
    # Checkpoint files are numbered from 1.
    model_path = f"../input/commonlit-roberta-0467/model_{model_index + 1}.pth"
    print(f"\nUsing {model_path}")

    model = LitModel()
    model.load_state_dict(torch.load(model_path, map_location=DEVICE))
    model.to(DEVICE)

    all_predictions[model_index] = predict(model, test_loader)

    # Release this checkpoint before the next one is loaded.
    del model
    gc.collect()


Using ../input/commonlit-roberta-0467/model_1.pth

Using ../input/commonlit-roberta-0467/model_2.pth

Using ../input/commonlit-roberta-0467/model_3.pth

Using ../input/commonlit-roberta-0467/model_4.pth

Using ../input/commonlit-roberta-0467/model_5.pth


In [19]:
# Average over the checkpoint axis: one attention-model prediction per test excerpt.
model1_predictions = all_predictions.mean(axis=0)

In [20]:
# Equal-weight blend of the two ensembles: `model1_predictions` (roberta-base
# attention model) and `model_predictions` (roberta-large mean-pooling model).
predictions = model1_predictions * 0.5 + model_predictions * 0.5

In [21]:
# Write the blended scores into the template and save it.
# NOTE(review): assignment is positional, so this assumes the rows of
# sample_submission.csv are in the same order as test.csv. Confirm.
submission['target'] = predictions
submission.to_csv('submission.csv', index=False)
submission.head()

Unnamed: 0,id,target
0,c0f722661,-0.374109
1,f0953f0a5,-0.520715
2,0df072751,-0.382832
3,04caf4e0c,-2.407074
4,0e63f8bea,-1.799853
