In [96]:
import string
%matplotlib inline
import os
# os.listdir("../input/")
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
os.system('pip install pytorch_pretrained_bert --no-index --find-links="../input/pytorch-pretrained-bert/pytorch_pretrained_bert" ')
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers.models.bert.modeling_bert import BertPreTrainedModel, BertModel
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
import random
from transformers import get_linear_schedule_with_warmup
import warnings
warnings.filterwarnings("ignore")

SEED = 42
BATCH_SIZE = 32
MAX_LENGTH = 256

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if n_gpu > 0:
    torch.cuda.manual_seed_all(SEED)

In [97]:
def children(m):
    return m if isinstance(m, (list, tuple)) else list(m.children())


def set_trainable_attr(m, b):
    m.trainable = b
    for p in m.parameters():
        p.requires_grad = b


def apply_leaf(m, f):
    c = children(m)
    if isinstance(m, nn.Module):
        f(m)
    if len(c) > 0:
        for l in c:
            apply_leaf(l, f)


def set_trainable(l, b):
    apply_leaf(l, lambda m: set_trainable_attr(m, b))

In [98]:
class BertForSequenceRegression(BertPreTrainedModel):
    def __init__(self, config):
        super(BertForSequenceRegression, self).__init__(config)
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.linear1 = nn.Linear(config.hidden_size, 256)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(256, 1)

    def forward(self, ids,  token_type_ids=None, attention_mask=None, targets=None):
        _, pooled_output = self.bert(ids, token_type_ids, attention_mask, return_dict=False)
        pooled_output = self.dropout(pooled_output)
        pooled_output = self.linear1(pooled_output)
        pooled_output = self.relu(pooled_output)
        outputs = self.linear2(pooled_output)
        return outputs.view(-1)

In [99]:
def RMSELoss(outputs, targets):
    return torch.sqrt(nn.MSELoss()(outputs, targets))

In [100]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import KBinsDiscretizer

class regressor_stratified_cv:
    def __init__(self, n_splits = 10, n_repeats = 2, group_count = 10,
                 random_state = 0, strategy = 'quantile'):
        self.group_count = group_count
        self.strategy = strategy
        self.cvkwargs = dict(n_splits = n_splits, n_repeats = n_repeats, 
                             random_state = random_state)
        self.cv = RepeatedStratifiedKFold(**self.cvkwargs)
        self.discretizer = KBinsDiscretizer(n_bins = self.group_count, encode = 'ordinal',
                                            strategy = self.strategy)  
            
    def split(self, X, y, groups = None):
        kgroups=self.discretizer.fit_transform(y[:, None])[:, 0]
        return self.cv.split(X, kgroups, groups)
    
    def get_n_splits(self, X, y, groups = None):
        return self.cv.get_n_splits(X, y, groups)

In [101]:
def text_preprocessing(excerpt):
    
    # lower casing
    excerpt = excerpt.lower()

    # removal of punctuation
    excerpt = excerpt.translate(str.maketrans('', '', string.punctuation))

        
    # removal of stopwords
    from nltk.corpus import stopwords
    ", ".join(stopwords.words('english'))
    STOPWORDS = set(stopwords.words('english'))
    excerpt = " ".join([word for word in str(excerpt).split() if word not in STOPWORDS])
        
    # lemmatization 
    from nltk.stem import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    excerpt = " ".join([lemmatizer.lemmatize(word) for word in excerpt.split()])
        
                
    return excerpt

In [102]:
df = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
df

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845
...,...,...,...,...,...,...
2829,25ca8f498,https://sites.ehe.osu.edu/beyondpenguins/files...,CC BY-SA 3.0,When you think of dinosaurs and where they liv...,1.711390,0.646900
2830,2c26db523,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,So what is a solid? Solids are usually hard be...,0.189476,0.535648
2831,cd19e2350,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,The second state of matter we will discuss is ...,0.255209,0.483866
2832,15e2e9e7a,https://en.wikibooks.org/wiki/Geometry_for_Ele...,CC BY-SA 3.0,Solids are shapes that you can actually touch....,-0.215279,0.514128


In [103]:
df["text"] = df["excerpt"].apply(lambda x: text_preprocessing(x))

In [104]:
excerpts = df.text.values
targets = df.target.values

In [105]:
BERT_FP = '../input/bert-base-uncased'

In [106]:
# Load the BERT tokenizer.
tokenizer = BertTokenizer.from_pretrained(BERT_FP, do_lower_case=True)

In [107]:
# convert sentences into tokens
input_ids = [tokenizer.encode(excerpt, add_special_tokens = True, max_length = MAX_LENGTH,
                              padding='max_length') for excerpt in excerpts]

input_ids = np.array(input_ids)
attention_masks = []
# create a mask of 1 for all input tokens and 0 for all padding tokens
attention_masks = [[float(i>0) for i in seq] for seq in input_ids]
attention_masks = np.array(attention_masks)
# create token type ids
token_type_ids = [[0 for i in seq] for seq in input_ids]
token_type_ids = np.array(token_type_ids)

In [108]:
n_splits = 5
n_repeats = 1
group_count = 10
# n_epochs_stop = 3
epochs = 5
# epochs_no_improve = 0
cv = regressor_stratified_cv(n_splits = n_splits, n_repeats = n_repeats,
                           group_count = group_count, random_state = 0, strategy = 'quantile')

i = 0
eval_losses = []
for train_index, test_index in cv.split(input_ids, targets):
    print('======== Iter {:}  ========'.format(i))
    train_inputs, test_inputs = input_ids[train_index], input_ids[test_index]
    train_targets, test_targets = targets[train_index], targets[test_index]
    train_masks, test_masks = attention_masks[train_index], attention_masks[test_index]
    train_type_ids, test_type_ids = token_type_ids[train_index], token_type_ids[test_index]
    
    train_inputs = torch.tensor(train_inputs, dtype=torch.long)
    test_inputs = torch.tensor(test_inputs, dtype=torch.long)
    train_targets = torch.tensor(train_targets, dtype=torch.float)
    test_targets = torch.tensor(test_targets, dtype=torch.float)
    train_masks = torch.tensor(train_masks, dtype=torch.long)
    test_masks = torch.tensor(test_masks, dtype=torch.long)
    train_type_ids = torch.tensor(train_type_ids, dtype=torch.long)
    test_type_ids = torch.tensor(test_type_ids, dtype=torch.long)
    
    train_data = TensorDataset(train_inputs, train_masks, train_type_ids, train_targets)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler = train_sampler, batch_size = BATCH_SIZE)

    test_data = TensorDataset(test_inputs, test_masks, test_type_ids, test_targets)
    test_sampler = RandomSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler = test_sampler, batch_size = BATCH_SIZE)
    
    model = BertForSequenceRegression.from_pretrained(BERT_FP)
    model.to(device)
    set_trainable(model, True)
    set_trainable(model.bert.embeddings, False)
    set_trainable(model.bert.encoder, False)
    optimizer = AdamW(model.parameters(),
                  lr = 5e-4,
                  eps = 1e-6 
                )
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)
    iter_eval_loss = []
#     min_eval_loss = np.Inf
    for epoch_i in range(0, epochs):
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        # training
        model.train()
        tr_loss = []
        
        for step, batch in enumerate(train_dataloader):
            batch = tuple(t.to(device) for t in batch)
            ids, input_mask, type_ids, target = batch
            output = model(ids, input_mask, type_ids, target)
            loss = RMSELoss(output, target)
            tr_loss.append(loss.cpu().detach().numpy().tolist())
            loss.backward()  
            optimizer.step()
            optimizer.zero_grad()  
            scheduler.step()
            
        train_losses = np.mean(tr_loss)  
        print("Train loss: ", train_losses)
        all_targets, all_preds = [], []
        model.eval()   
        eval_loss = []
        # evaluation
        # disable gradients 
        with torch.no_grad(): 
            for batch in test_dataloader:
                batch = tuple(t.to(device) for t in batch)
                ids, input_mask, type_ids, target = batch
                output = model(ids, input_mask, type_ids, target)
                loss = RMSELoss(output, target)
            eval_loss.append(loss.cpu().detach().numpy().tolist())
            
        epoch_eval_loss = np.mean(eval_loss)
        print("Eval loss: ", epoch_eval_loss)
        iter_eval_loss.append(epoch_eval_loss) 
#         if epoch_eval_loss < min_eval_loss:
#             epochs_no_improve = 0
#             min_eval_loss = epoch_eval_loss
#         else:
#             epochs_no_improve += 1
#         if epoch_i > 4 and epochs_no_improve >= n_epochs_stop:
#             print('Early stopping! Epoch {:}'.format(epoch_i) )
#             break
#         else:
#             continue
          
    iter_eval_loss = np.mean(iter_eval_loss)
    print("Iter eval loss: ", iter_eval_loss)
    eval_losses.append(iter_eval_loss)     
    i += 1    
    torch.cuda.empty_cache()
mean_eval_loss = np.mean(eval_losses)
print(mean_eval_loss)



Some weights of the model checkpoint at ../input/bert-base-uncased were not used when initializing BertForSequenceRegression: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceRegression from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceRegression from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceRegression were not initialized from the model checkpoint at ../inp


Train loss:  0.8519517334414201
Eval loss:  0.6968808770179749

Train loss:  0.6969663878561745
Eval loss:  0.8589928150177002

Train loss:  0.6483965054364271
Eval loss:  0.6154387593269348

Train loss:  0.6539863654425446
Eval loss:  0.7765675783157349

Train loss:  0.6372475619886963
Eval loss:  0.736697256565094
Iter eval loss:  0.7369154572486878


Some weights of the model checkpoint at ../input/bert-base-uncased were not used when initializing BertForSequenceRegression: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceRegression from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceRegression from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceRegression were not initialized from the model checkpoint at ../inp


Train loss:  0.8481592326097085
Eval loss:  0.609836757183075

Train loss:  0.6841135679836005
Eval loss:  0.8185145854949951

Train loss:  0.6561007848088171
Eval loss:  0.47506988048553467

Train loss:  0.6438469857397214
Eval loss:  0.4910270571708679

Train loss:  0.6350655106591506
Eval loss:  0.6351885795593262
Iter eval loss:  0.6059273719787598


Some weights of the model checkpoint at ../input/bert-base-uncased were not used when initializing BertForSequenceRegression: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceRegression from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceRegression from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceRegression were not initialized from the model checkpoint at ../inp


Train loss:  0.8555132091884882
Eval loss:  0.6127740144729614

Train loss:  0.6942685615848487
Eval loss:  0.668502926826477

Train loss:  0.6664743012105915
Eval loss:  0.4267936050891876

Train loss:  0.6447444251725372
Eval loss:  0.5891205072402954

Train loss:  0.6437784920276051
Eval loss:  0.8199254870414734
Iter eval loss:  0.623423308134079


Some weights of the model checkpoint at ../input/bert-base-uncased were not used when initializing BertForSequenceRegression: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceRegression from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceRegression from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceRegression were not initialized from the model checkpoint at ../inp


Train loss:  0.8159941082269373
Eval loss:  0.6420449018478394

Train loss:  0.6784639320742916
Eval loss:  0.6168662905693054

Train loss:  0.6641563386984275
Eval loss:  0.5887134075164795

Train loss:  0.6391049442996442
Eval loss:  0.6296753287315369

Train loss:  0.6337927932470617
Eval loss:  0.724498450756073
Iter eval loss:  0.6403596758842468


Some weights of the model checkpoint at ../input/bert-base-uncased were not used when initializing BertForSequenceRegression: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceRegression from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceRegression from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceRegression were not initialized from the model checkpoint at ../inp


Train loss:  0.8602627331102398
Eval loss:  0.5353209972381592

Train loss:  0.6843631552978301
Eval loss:  0.5619345307350159

Train loss:  0.6505449896966907
Eval loss:  0.5930908918380737

Train loss:  0.6484680049855944
Eval loss:  0.6828595995903015

Train loss:  0.6357929249044875
Eval loss:  0.8085628151893616
Iter eval loss:  0.6363537669181824
0.6485959160327912


In [109]:
n_splits = 5
n_repeats = 2
group_count = 10
cv = regressor_stratified_cv(n_splits = n_splits, n_repeats = n_repeats,
                           group_count = group_count, random_state = 0, strategy = 'quantile')

for train_index, test_index in cv.split(input_ids, targets):
    train_inputs, test_inputs = input_ids[train_index], input_ids[test_index]
    train_targets, test_targets = targets[train_index], targets[test_index]
    train_masks, test_masks = attention_masks[train_index], attention_masks[test_index]
    train_type_ids, test_type_ids = token_type_ids[train_index], token_type_ids[test_index]

In [110]:
train_inputs = torch.tensor(train_inputs, dtype=torch.long)
test_inputs = torch.tensor(test_inputs, dtype=torch.long)
train_targets = torch.tensor(train_targets, dtype=torch.float)
test_targets = torch.tensor(test_targets, dtype=torch.float)
train_masks = torch.tensor(train_masks, dtype=torch.long)
test_masks = torch.tensor(test_masks, dtype=torch.long)
train_type_ids = torch.tensor(train_type_ids, dtype=torch.long)
test_type_ids = torch.tensor(test_type_ids, dtype=torch.long)

In [111]:
train_data = TensorDataset(train_inputs, train_masks, train_type_ids, train_targets)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler = train_sampler, batch_size = BATCH_SIZE)

test_data = TensorDataset(test_inputs, test_masks, test_type_ids, test_targets)
test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data, sampler = test_sampler, batch_size = BATCH_SIZE)

In [112]:
set_trainable(model, True)
set_trainable(model.bert.embeddings, True)    
set_trainable(model.bert.encoder, True)

In [126]:
epochs = 5
optimizer = AdamW(model.parameters(),
                  lr = 1e-6,
                  eps = 1e-6 
                )
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)
eval_losses = []
for epoch_i in range(0, epochs):
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    # training
    model.train()
    tr_loss = []
    
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        ids, input_mask, type_ids, target = batch
        output = model(ids, input_mask, type_ids, target)
        loss = RMSELoss(output, target)
        tr_loss.append(loss.cpu().detach().numpy().tolist())
        loss.backward()  
        optimizer.step()
        optimizer.zero_grad()  
        scheduler.step()
            
    train_losses = np.mean(tr_loss)  
    print("Train loss: ", train_losses)
    all_targets, all_preds = [], []
    model.eval()   
    eval_loss = []
    # evaluation
    # disable gradients 
    with torch.no_grad(): 
        for batch in test_dataloader:
            batch = tuple(t.to(device) for t in batch)
            ids, input_mask, type_ids, target = batch
            output = model(ids, input_mask, type_ids, target)
            loss = RMSELoss(output, target)
        eval_loss.append(loss.cpu().detach().numpy().tolist())
            
    epoch_eval_loss = np.mean(eval_loss)
    print("Eval loss: ", epoch_eval_loss)

    eval_losses.append(epoch_eval_loss)   
torch.cuda.empty_cache()
mean_eval_loss = np.mean(eval_losses)
print(mean_eval_loss)


Train loss:  0.5632944694707092
Eval loss:  0.5638938546180725

Train loss:  0.5484871620863256
Eval loss:  0.5631497502326965

Train loss:  0.5405800640583038
Eval loss:  0.6666775345802307

Train loss:  0.5323134526400499
Eval loss:  0.6047433614730835

Train loss:  0.5210321541403381
Eval loss:  0.5654462575912476

Train loss:  0.527106869388634
Eval loss:  0.6008443236351013

Train loss:  0.5125665102206486
Eval loss:  0.6662572026252747
0.6044303263936724


In [114]:
test = pd.read_csv('../input/commonlitreadabilityprize/test.csv')

In [115]:
test["text"] = test["excerpt"].apply(lambda x: text_preprocessing(x))

In [116]:
excerpts = test.text.values

In [117]:
# convert sentences into tokens
input_ids = [tokenizer.encode(excerpt, add_special_tokens = True, max_length = MAX_LENGTH,
                              padding='max_length') for excerpt in excerpts]

input_ids = np.array(input_ids)
attention_masks = []
# create a mask of 1 for all input tokens and 0 for all padding tokens
attention_masks = [[float(i>0) for i in seq] for seq in input_ids]
attention_masks = np.array(attention_masks)
# create token type ids
token_type_ids = [[0 for i in seq] for seq in input_ids]
token_type_ids = np.array(token_type_ids)

In [118]:
input_ids = torch.tensor(input_ids, dtype=torch.long)
attention_masks = torch.tensor(attention_masks, dtype=torch.long)
token_type_ids = torch.tensor(token_type_ids, dtype=torch.long)

In [119]:
prediction_data = TensorDataset(input_ids, attention_masks, token_type_ids)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=BATCH_SIZE)

In [120]:
model.eval()   
predictions , true_labels = [], []
# evaluation
for batch in prediction_dataloader:
    # disable gradients 
    batch = tuple(t.to(device) for t in batch)
    ids, input_mask, type_ids = batch
    with torch.no_grad():    
        output = model(ids, input_mask, type_ids) 
    output = output.cpu().detach().numpy().tolist()
    predictions += output

In [121]:
submission = pd.DataFrame({'id':test['id'],'target':predictions})

In [122]:
submission.to_csv('submission.csv',index=False)

In [123]:
torch.cuda.empty_cache()