In [1]:
import string
import copy
import time
%matplotlib inline
import os
# os.listdir("../input/")
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
# NOTE(review): this installs `pytorch_pretrained_bert` from a local dataset folder, but every
# BERT import visible in this notebook comes from `transformers` — confirm the install is still needed.
# os.system() returns the exit status, which is discarded here, so a failed install goes unnoticed.
os.system('pip install pytorch_pretrained_bert --no-index --find-links="../input/pytorch-pretrained-bert/pytorch_pretrained_bert" ')
from torch.utils.data import Dataset, TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers.models.bert.modeling_bert import BertPreTrainedModel, BertModel
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
import random
from transformers import get_linear_schedule_with_warmup
from torch.cuda.amp import GradScaler, autocast
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
from torch.utils.tensorboard import SummaryWriter

# ---- Run configuration -------------------------------------------------
SEED = 42                               # single seed shared by every RNG below
BATCH_SIZE = 32                         # batch size used by the test DataLoader
MAX_LENGTH = 356                        # token length each excerpt is padded / truncated to
BERT_FP = '../input/bert-base-uncased'  # local directory holding the BERT weights and vocab

# ---- Hardware ----------------------------------------------------------
n_gpu = torch.cuda.device_count()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ---- Reproducibility: seed Python, NumPy and PyTorch -------------------
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if n_gpu > 0:
    # Only touch the CUDA generators when at least one GPU is visible.
    torch.cuda.manual_seed_all(SEED)

In [2]:
import datetime

def format_time(elapsed):
    """Render a duration given in seconds as an ``h:mm:ss`` string.

    The value is rounded to whole seconds first. Durations of a day or more
    come out in ``datetime.timedelta``'s own form (e.g. ``1 day, 1:00:00``).
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [3]:
def children(m):
    """Return the direct children of ``m``.

    A list or tuple is returned unchanged (the same object); anything else is
    expected to expose ``.children()``, whose result is materialised as a list.
    """
    if isinstance(m, (list, tuple)):
        return m
    return list(m.children())


def set_trainable_attr(m, b):
    """Mark module ``m`` as trainable (or frozen) according to the boolean ``b``.

    Stores the flag on the module as ``m.trainable`` and copies it onto
    ``requires_grad`` of every parameter yielded by ``m.parameters()``.
    """
    m.trainable = b
    for param in m.parameters():
        param.requires_grad = b


def apply_leaf(m, f):
    """Walk ``m`` depth-first and call ``f`` on every ``nn.Module`` encountered.

    ``m`` may be a module or a list/tuple of modules. Containers themselves are
    not passed to ``f``; only ``nn.Module`` instances are. A module is visited
    before its children.
    """
    sub_modules = children(m)
    if isinstance(m, nn.Module):
        f(m)
    # Iterating an empty collection is a no-op, so no explicit length check is needed.
    for child in sub_modules:
        apply_leaf(child, f)


def set_trainable(l, b):
    """Recursively set the trainable flag ``b`` on ``l`` and everything nested in it.

    ``l`` is a module or a list/tuple of modules; each module reached by
    ``apply_leaf`` is passed to ``set_trainable_attr`` with ``b``.
    """
    def _toggle(module):
        set_trainable_attr(module, b)

    apply_leaf(l, _toggle)

In [4]:
class BertForSequenceRegression(nn.Module):
    """BERT encoder followed by a small MLP head that emits one scalar per sequence."""

    def __init__(self):
        super().__init__()
        # The attribute names below become state_dict keys, so they must stay
        # as they are for previously saved checkpoints to load.
        self.bert = BertModel.from_pretrained(BERT_FP)
        self.dropout = nn.Dropout(0.1)
        # 768 presumably matches the encoder's hidden size — confirm if BERT_FP changes.
        self.linear1 = nn.Linear(768, 256)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(256, 1)

    def forward(self, ids,  token_type_ids, attention_mask):
        """Return a flat tensor of predictions, one value per input sequence.

        NOTE(review): the three tensors are forwarded to ``self.bert``
        positionally, in the order received. BertModel's positional order is
        believed to be (input_ids, attention_mask, token_type_ids), so the
        SECOND argument here acts as the attention mask despite its name.
        The inference loop in this notebook passes the mask second, which
        lines up. Confirm against the installed transformers version before
        switching any caller to keyword arguments.
        """
        _sequence_output, pooled = self.bert(ids, token_type_ids, attention_mask, return_dict=False)
        hidden = self.relu(self.linear1(self.dropout(pooled)))
        return self.linear2(hidden).view(-1)

In [5]:
def RMSELoss(outputs, targets):
    """Root-mean-squared error between ``outputs`` and ``targets`` as a scalar tensor."""
    criterion = nn.MSELoss()
    return criterion(outputs, targets).sqrt()

In [6]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import KBinsDiscretizer

class regressor_stratified_cv:
    """Repeated stratified K-fold splitter for a continuous target.

    The target is discretised into ``group_count`` ordinal bins with
    ``KBinsDiscretizer`` (binning ``strategy`` configurable), and the bin
    labels are what ``RepeatedStratifiedKFold`` stratifies on.

    Parameters
    ----------
    n_splits, n_repeats, random_state
        Forwarded to ``RepeatedStratifiedKFold``.
    group_count
        Number of bins the target is cut into.
    strategy
        Binning strategy forwarded to ``KBinsDiscretizer``.
    """

    def __init__(self, n_splits = 10, n_repeats = 2, group_count = 10,
                 random_state = 0, strategy = 'quantile'):
        self.group_count = group_count
        self.strategy = strategy
        self.cvkwargs = dict(n_splits = n_splits, n_repeats = n_repeats,
                             random_state = random_state)
        self.cv = RepeatedStratifiedKFold(**self.cvkwargs)
        self.discretizer = KBinsDiscretizer(n_bins = self.group_count, encode = 'ordinal',
                                            strategy = self.strategy)

    def split(self, X, y, groups = None):
        """Return the train/test index splits of ``X``, stratified on the binned ``y``.

        ``y`` may be any 1-D array-like (NumPy array, list, pandas Series).
        """
        # Coerce to an (n_samples, 1) column. Indexing with `y[:, None]` only
        # works for NumPy arrays and fails for lists and recent pandas Series.
        target_column = np.asarray(y).reshape(-1, 1)
        kgroups = self.discretizer.fit_transform(target_column)[:, 0]
        return self.cv.split(X, kgroups, groups)

    def get_n_splits(self, X = None, y = None, groups = None):
        """Return the total number of splits reported by the wrapped splitter."""
        return self.cv.get_n_splits(X, y, groups)

In [7]:
def text_preprocessing(excerpt):
    """Normalise an excerpt: lower-case it, then strip ASCII punctuation.

    Every character in ``string.punctuation`` is deleted (not replaced by a
    space). Stop-word removal and lemmatisation are intentionally not applied.
    """
    drop_punctuation = str.maketrans('', '', string.punctuation)
    return excerpt.lower().translate(drop_punctuation)

In [8]:
# Build the BERT tokenizer from the local model directory, with lower-casing enabled.
tokenizer = BertTokenizer.from_pretrained(
    BERT_FP,
    do_lower_case=True,
)

In [9]:
class TokenDataset(Dataset):
    """Dataset that tokenises one text per item for the BERT regressor.

    Parameters
    ----------
    tokenizer
        Object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
    text
        Integer-indexable sequence of texts (e.g. a NumPy array).
    target
        Integer-indexable sequence of float targets. Required unless
        ``is_test`` is true.
    is_test
        When true, items carry no ``'targets'`` entry.
    max_len
        Length passed to the tokenizer as ``max_length`` for padding and
        truncation. Defaults to the module-level ``MAX_LENGTH``.

    Raises
    ------
    ValueError
        If ``target`` is missing while ``is_test`` is false.
    """

    def __init__(self, tokenizer, text, target = None, is_test=False, max_len=None):
        # Fail here with a clear message, not with an obscure TypeError on the
        # first item access.
        if not is_test and target is None:
            raise ValueError("target is required unless is_test=True")
        self.text = text
        self.target = target
        self.is_test = is_test
        self.max_len = MAX_LENGTH if max_len is None else max_len
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.text)

    def __getitem__(self, idx):
        """Return a dict of long tensors (``ids``, ``mask``, ``token_type_ids``).

        A float ``targets`` tensor is added when the dataset is not in test mode.
        """
        # Collapse every run of whitespace (including newlines) into a single space.
        text = ' '.join(str(self.text[idx]).split())
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            truncation=True,
            add_special_tokens=True,
            max_length = self.max_len,
            padding='max_length',
            return_token_type_ids=True
        )
        item = {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs['token_type_ids'], dtype=torch.long),
        }
        if not self.is_test:
            item['targets'] = torch.tensor(self.target[idx], dtype=torch.float)
        return item

In [10]:
# Test split of the competition data; later cells read its `excerpt` and `id` columns.
test = pd.read_csv(
    '../input/commonlitreadabilityprize/test.csv'
)

In [11]:
# Add a cleaned `text` column by running every excerpt through text_preprocessing.
test["text"] = test["excerpt"].apply(text_preprocessing)

In [12]:
# Cleaned excerpts as a plain array (not referenced again in the visible cells).
excerpts = test["text"].values

In [13]:
# One-column frame holding only the cleaned text that feeds the dataset.
test_data = test.loc[:, ['text']]

In [14]:
# Wrap the cleaned test texts; is_test=True means items carry no targets.
test_set = TokenDataset(
    tokenizer,
    text=test_data['text'].values,
    is_test=True,
)

In [15]:
# No sampler or shuffle is given, so batches follow the row order of `test`;
# the submission cell relies on that order to line predictions up with ids.
test_dataloader = DataLoader(
    test_set,
    batch_size=BATCH_SIZE,
    num_workers=8,
)

In [16]:
# One model instance is reused for every checkpoint; its weights are overwritten per fold.
model = BertForSequenceRegression()
model = model.to(device)

In [17]:
def predicting(test_dataloader, model, states):
    """Run inference once per checkpoint and collect the predictions.

    Parameters
    ----------
    test_dataloader
        Iterable of batches, each a dict with ``'ids'``, ``'mask'`` and
        ``'token_type_ids'`` tensors.
    model
        Module whose weights are replaced by each entry of ``states`` in turn.
    states
        Iterable of state dicts accepted by ``model.load_state_dict``.

    Returns
    -------
    list
        One NumPy array per state, holding the concatenated batch outputs in
        dataloader order.
    """
    fold_predictions = []

    for weights in states:
        model.load_state_dict(weights)
        model.to(device)
        model.eval()

        batch_outputs = []
        with torch.no_grad():
            for batch in test_dataloader:
                token_ids = batch['ids'].to(device, dtype=torch.long)
                attn_mask = batch['mask'].to(device, dtype=torch.long)
                segment_ids = batch['token_type_ids'].to(device, dtype=torch.long)
                # Positional on purpose: the model's forward() relays its 2nd and
                # 3rd arguments to BERT in the same order, so the mask goes second.
                # NOTE(review): confirm before converting this call to keywords.
                scores = model(token_ids, attn_mask, segment_ids)
                batch_outputs.append(scores.cpu().numpy())

        fold_predictions.append(np.concatenate(batch_outputs))

    return fold_predictions

In [18]:
# Checkpoint files to ensemble, one per entry.
pathes = [
    '../input/model-1-063/model_epoch_6_loss_0.63.pt',
    '../input/model-2-0628/model_epoch_5_loss_0.628.pt',
    '../input/model-3-0624/model_epoch_4_loss_0.624.pt',
    '../input/model-4-0626/model_epoch_6_loss_0.626.pt',
    '../input/model-5-064/model_epoch_5_loss_0.64.pt',
]

In [19]:
# Load every checkpoint onto the CPU. Without map_location, torch.load restores
# tensors to the device they were saved from, which fails on a CPU-only kernel
# and otherwise keeps all five checkpoints resident on the GPU at once.
# model.load_state_dict() later copies the values into the model's own parameters.
# NOTE(review): torch.load unpickles the file — only load checkpoints you trust.
states = [torch.load(s, map_location="cpu") for s in pathes]

In [20]:
# One prediction array per checkpoint, in the same order as `states`.
all_preds = predicting(test_dataloader, model, states)

In [21]:
# Each checkpoint's array becomes a row; transposing gives one row per test
# excerpt and one column per checkpoint.
predictions = pd.DataFrame(all_preds).T

In [22]:
# Ensemble by averaging across the checkpoint columns.
predictions = predictions.mean(axis="columns")

In [23]:
# Pair every test id with its averaged prediction.
submission = pd.DataFrame(
    {
        'id': test['id'],
        'target': predictions,
    }
)

In [24]:
# Write the submission file without the DataFrame index column.
submission.to_csv(
    'submission.csv',
    index=False,
)

In [25]:
# Display the submission frame as this cell's output for a visual sanity check.
submission

Unnamed: 0,id,target
0,c0f722661,-0.125981
1,f0953f0a5,0.114056
2,0df072751,-0.108495
3,04caf4e0c,-2.000549
4,0e63f8bea,-1.332202
5,12537fe78,-0.439675
6,965e592c0,0.612411


In [26]:
# Inference is finished: ask PyTorch to release its cached, currently unused CUDA memory.
torch.cuda.empty_cache()