# Versions 
* Version1 : using ragnars method of reducing step size CV: 0.3299  LB: 0.491
* Version 2: removing early stopping CV: 0.2315 LB: 0.497
* Version 3 : using roberta large CV: 0.2217  LB:

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold,StratifiedKFold,train_test_split
from sklearn.metrics import mean_squared_error as mse
from sklearn.svm import SVR

import h5py

import tensorflow as tf 
from tensorflow.keras.layers import Layer,Input,LSTM,Bidirectional,Embedding,Dense, Conv1D, Dropout , MaxPool1D , MaxPooling1D, GlobalAveragePooling2D , GlobalAveragePooling1D
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.models import Model,load_model,save_model, model_from_json , Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau,ModelCheckpoint, EarlyStopping ,LearningRateScheduler
from tensorflow.keras import backend as K
from tensorflow.keras.activations import tanh, softmax
import tensorflow_addons as tfa

from transformers import RobertaTokenizerFast,TFRobertaModel,TFAutoModel,AutoConfig


# Hyper parameters

In [2]:
max_len = 250
batch_size = 24
AUTOTUNE = tf.data.AUTOTUNE
SEED = 123
MODEL=['bert-base-uncased' ,'roberta-base' , 'roberta-large']

model_name = MODEL[2]

import os
os.makedirs("./result")

save_dir="./result"

In [3]:
paths=["/kaggle/input/commonlitreadabilityprize/sample_submission.csv",
"/kaggle/input/commonlitreadabilityprize/train.csv",
"/kaggle/input/commonlitreadabilityprize/test.csv"]

df_train=pd.read_csv(paths[1])
df_test=pd.read_csv(paths[2])
df_ss=pd.read_csv(paths[0])

In [4]:
df_train = df_train.drop(['url_legal','license','standard_error'],axis='columns')
df_test = df_test.drop(['url_legal','license'],axis='columns')

In [5]:
df_train.columns, df_test.columns

(Index(['id', 'excerpt', 'target'], dtype='object'),
 Index(['id', 'excerpt'], dtype='object'))

In [6]:
X= df_train['excerpt']
y=df_train['target']

X_test = df_test['excerpt']

# Define Tokenizer

In [7]:
tokenizer_path = "../input/huggingface-roberta/roberta-large"
tokenizer = RobertaTokenizerFast.from_pretrained(tokenizer_path)
tokenizer.save_pretrained("./result/roberta-tokenizer")

('./result/roberta-tokenizer/tokenizer_config.json',
 './result/roberta-tokenizer/special_tokens_map.json',
 './result/roberta-tokenizer/vocab.json',
 './result/roberta-tokenizer/merges.txt',
 './result/roberta-tokenizer/added_tokens.json',
 './result/roberta-tokenizer/tokenizer.json')

# Dataset Prep

In [8]:
@tf.function
def map_function(encodings , target):
    input_ids = encodings['input_ids']
    attention_mask = encodings['attention_mask']
    
    target = tf.cast(target, tf.float32 )
    
    return {'input_ids': input_ids , 'attention_mask': attention_mask}, target

# Modelling

In [9]:
# Detect hardware, return appropriate distribution strategy
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

Running on TPU  grpc://10.0.0.2:8470
REPLICAS:  8


In [10]:
class RobertaBlock(Layer):
    def __init__(self, max_len ,name):
        super(RobertaBlock , self).__init__()
        self.config = AutoConfig.from_pretrained(name)
        self.config.update({"output_hidden_states":True,"hidden_dropout_prob": 0.0, "layer_norm_eps": 1e-7})
        self.roberta_model = TFAutoModel.from_pretrained(name , config = self.config)
        self.dense = Dense(1, activation='linear')
        
    def call(self,input_tensors):
        input_id = input_tensors[0]
        attention_mask = input_tensors[1]
        roberta_output = self.roberta_model.roberta(input_ids = input_id , attention_mask = attention_mask)
        roberta_output = roberta_output.hidden_states[-1]
        return roberta_output
    
class RegressionHead(Layer):
    def __init__(self):
        super(RegressionHead , self).__init__()
        self.dense = Dense(1, activation="linear")
    
    def call(self , input_tensors):
        x = self.dense(input_tensors)
        return x
        
class AttentionHead(Layer):
    def __init__(self):
        super(AttentionHead , self).__init__()
        self.dense1 = Dense(512)
        self.tanh =  tanh
        self.softmax = softmax
        self.dense2 = Dense(1,activation="softmax")
    
    def call(self , input_tensors):
        x = self.dense1(input_tensors)
        x = self.tanh(x)
        x = self.dense2(x)
        x = self.softmax(x , axis = 1)
        return x    


class CLRModel(Model):
    def __init__(self,max_len,name):
        super(CLRModel, self).__init__()
        self.roberta_model = RobertaBlock(max_len , name)
        self.attentionhead = AttentionHead()
        self.regressionhead = RegressionHead()
    
    def call(self,input_tensors):
        roberta_output = self.roberta_model(input_tensors)
        #print('shape of roberta output is' , roberta_output.shape)
        weights = self.attentionhead(roberta_output)
        #print('shape of attention head is',weights.shape)
        context_vector = tf.reduce_sum(weights * roberta_output, axis=1)
        #print("shape of context vector" , context_vector.shape)
        x = self.regressionhead(context_vector)
        return x
    
    def model(self):
        input_id = Input(shape = (max_len, ) ,dtype=tf.int32, name = 'input_ids')
        attention_mask = Input(shape=(max_len,) ,dtype=tf.int32, name = 'attention_mask')
        
        return Model(inputs = [input_id , attention_mask] , outputs = self.call([input_id , attention_mask]))



In [11]:
model_name

'roberta-large'

In [12]:
model_name = "../input/huggingface-roberta/roberta-large"
with strategy.scope():
    model = CLRModel(max_len , model_name).model()

    model.compile(
          optimizer = tfa.optimizers.AdamW(learning_rate=  2e-5 , weight_decay = 1e-7 ),
          metrics = RootMeanSquaredError(),
          loss= "mse"
        )
    


Some layers from the model checkpoint at ../input/huggingface-roberta/roberta-large were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at ../input/huggingface-roberta/roberta-large.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


In [13]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 250)]        0                                            
__________________________________________________________________________________________________
attention_mask (InputLayer)     [(None, 250)]        0                                            
__________________________________________________________________________________________________
roberta_block (RobertaBlock)    (None, 250, 1024)    355359744   input_ids[0][0]                  
                                                                 attention_mask[0][0]             
__________________________________________________________________________________________________
attention_head (AttentionHead)  (None, 250, 1)       525313      roberta_block[0][0]          

# Kfold Training

In [14]:
scores=[]
iterations = 1
kfold = KFold(n_splits=5, shuffle= True , random_state = SEED)
for train_idx, test_idx in kfold.split(X,y):
    print("************** iteration",iterations,"**************")
    X_train = X.loc[train_idx]
    X_test = X.loc[test_idx]
    y_train = y.loc[train_idx]
    y_test = y.loc[test_idx]
    
    X_train = X_train.tolist()
    X_test = X_test.tolist()

    y_train = y_train.tolist()
    y_test = y_test.tolist()
    
    #tokenization
    print('tokenization')
    train_embeddings = tokenizer(X_train, truncation = True , padding = True , max_length=max_len)
    test_embeddings = tokenizer(X_test , truncation = True , padding =True , max_length = max_len)
    
    #print(train_embeddings.keys())
    train = tf.data.Dataset.from_tensor_slices((train_embeddings,y_train))

    train = (
            train
            .repeat()
            .shuffle(2048)
            .map(map_function, num_parallel_calls=AUTOTUNE)
            .batch(batch_size)
            .prefetch(AUTOTUNE)
        )
    
    test = tf.data.Dataset.from_tensor_slices((test_embeddings , y_test))
    test = (
        test
        .map(map_function, num_parallel_calls = AUTOTUNE)
        .batch(batch_size)
        .prefetch(AUTOTUNE)
    )

    train_step = np.ceil(len(X_train)/(batch_size*16))    
    
    #Clearing backend session
    K.clear_session()
    print("Backend Cleared")
    
    #early_stopping=EarlyStopping(monitor="val_root_mean_squared_error",min_delta=0,patience=20,verbose=1,mode="min",restore_best_weights=True)
    #reduce_lr=ReduceLROnPlateau(monitor="val_root_mean_squared_error",factor=0.1,patience=2,min_lr= 1e-8 , verbose=1)
    model_checkpoint = ModelCheckpoint(f'{save_dir}/roberta_weight_fold_{iterations}.h5',
                                                        monitor = 'val_root_mean_squared_error', 
                                                 verbose = 1, 
                                                        save_best_only = True,
                                                        save_weights_only = True, 
                                                        mode = 'min')

    hist=model.fit(train, steps_per_epoch = train_step,validation_data=test,epochs= 70,callbacks = [model_checkpoint])

    #prediction
    print("predicting")
    model.load_weights(f'{save_dir}/roberta_weight_fold_{iterations}.h5')
    y_pred = model.predict(test)
    print(np.sqrt(mse(y_pred,y_test)))
    scores.append(np.sqrt(mse(y_pred,y_test)))
    
    #saving model
    #print("saving model")
    #localhost_save_option = tf.saved_model.SaveOptions(experimental_io_device="/job:localhost")
    #model.save_weights(f'{save_dir}/roberta_weight_fold_{iterations}.h5', options=localhost_save_option)
    iterations+=1
    
print("the final average rmse is ", np.mean(scores))

************** iteration 1 **************
tokenization
Backend Cleared
Epoch 1/70

Epoch 00001: val_root_mean_squared_error improved from inf to 1.11859, saving model to ./result/roberta_weight_fold_1.h5
Epoch 2/70

Epoch 00002: val_root_mean_squared_error improved from 1.11859 to 0.96645, saving model to ./result/roberta_weight_fold_1.h5
Epoch 3/70

Epoch 00003: val_root_mean_squared_error improved from 0.96645 to 0.96271, saving model to ./result/roberta_weight_fold_1.h5
Epoch 4/70

Epoch 00004: val_root_mean_squared_error improved from 0.96271 to 0.86183, saving model to ./result/roberta_weight_fold_1.h5
Epoch 5/70

Epoch 00005: val_root_mean_squared_error did not improve from 0.86183
Epoch 6/70

Epoch 00006: val_root_mean_squared_error improved from 0.86183 to 0.71851, saving model to ./result/roberta_weight_fold_1.h5
Epoch 7/70

Epoch 00007: val_root_mean_squared_error improved from 0.71851 to 0.62356, saving model to ./result/roberta_weight_fold_1.h5
Epoch 8/70

Epoch 00008: val_