# Versions 
* Version 2 : using ragnars method of reducing step size CV: 0.3299 LB: 0.491
* Version 3: removing early stopping CV: 0.2315 LB: 0.497
* Version 4: used roberta large CV:- 0.2217 LB:-

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from glob import glob

from sklearn.model_selection import KFold,StratifiedKFold,train_test_split
from sklearn.metrics import mean_squared_error as mse

import tensorflow as tf 
from tensorflow.keras.layers import Layer,Input,LSTM,Bidirectional,Embedding,Dense
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.models import Model,load_model,save_model,model_from_json
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau,ModelCheckpoint, EarlyStopping
from tensorflow.keras.activations import tanh,softmax

from transformers import TFRobertaModel, RobertaTokenizerFast , AutoConfig , TFAutoModel



# Hyper parameters

In [2]:
max_len = 250
batch_size = 24
AUTOTUNE = tf.data.AUTOTUNE

In [3]:
path=["/kaggle/input/commonlitreadabilityprize/sample_submission.csv",
"/kaggle/input/commonlitreadabilityprize/train.csv",
"/kaggle/input/commonlitreadabilityprize/test.csv"]

df_train = pd.read_csv(path[1])
df_test = pd.read_csv(path[2])
df_ss = pd.read_csv(path[0])

In [4]:
df_train = df_train.drop(['url_legal','license','standard_error'],axis='columns')
df_test = df_test.drop(['url_legal','license'],axis='columns')

In [5]:
df_train.columns, df_test.columns

(Index(['id', 'excerpt', 'target'], dtype='object'),
 Index(['id', 'excerpt'], dtype='object'))

In [6]:
X= df_train['excerpt']
y=df_train['target']

X_test = df_test['excerpt']

In [7]:
X = X.tolist()
y = y.tolist()
X_test = X_test.tolist()

# Initialise Tokenizer

In [8]:
tokenizer = RobertaTokenizerFast.from_pretrained("../input/robertabaseweights2/result/roberta-tokenizer")
tokenizer

PreTrainedTokenizerFast(name_or_path='../input/robertabaseweights2/result/roberta-tokenizer', vocab_size=50265, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)})

# Tokenize words

In [9]:
train_embeddings = tokenizer(X, truncation = True , padding ='max_length' , max_length=max_len)
test_embeddings = tokenizer(X_test,  truncation = True , padding = 'max_length'  ,max_length =max_len)

In [10]:
len(train_embeddings['input_ids'][6]),len(test_embeddings['input_ids'][6])

(250, 250)

In [11]:
@tf.function
def map_function(encodings , target):
    input_ids = encodings['input_ids']
    attention_mask = encodings['attention_mask']
    
    target = tf.cast(target, tf.float32 )
    
    return {'input_ids': input_ids , 'attention_mask': attention_mask}, target



@tf.function
def map_function_test(encodings):
    input_ids = encodings['input_ids']
    attention_mask = encodings['attention_mask']
    
    return {'input_ids': input_ids , 'attention_mask': attention_mask}

In [12]:
train = tf.data.Dataset.from_tensor_slices((train_embeddings,y))

train = (
        train
        .map(map_function, num_parallel_calls=AUTOTUNE)
        .batch(32)
        .prefetch(AUTOTUNE)
    )


test = tf.data.Dataset.from_tensor_slices((test_embeddings))

test = (
    test
    .map(map_function_test, num_parallel_calls = AUTOTUNE)
    .batch(32)
    .prefetch(AUTOTUNE)
)

# Loading Model

In [13]:
class RobertaBlock(Layer):
    def __init__(self, max_len ,name):
        super(RobertaBlock , self).__init__()
        self.config = AutoConfig.from_pretrained(name)
        self.config.update({"output_hidden_states":True})
        self.roberta_model = TFAutoModel.from_pretrained(name , config = self.config)
        self.dense = Dense(1, activation='linear')
        
    def call(self,input_tensors):
        input_id = input_tensors[0]
        attention_mask = input_tensors[1]
        roberta_output = self.roberta_model.roberta(input_ids = input_id , attention_mask = attention_mask)
        roberta_output = roberta_output.hidden_states[-1]
        return roberta_output

class RegressionHead(Layer):
    def __init__(self):
        super(RegressionHead , self).__init__()
        self.dense = Dense(1, activation="linear")
    
    def call(self , input_tensors):
        x = self.dense(input_tensors)
        return x
        
class AttentionHead(Layer):
    def __init__(self):
        super(AttentionHead , self).__init__()
        self.dense1 = Dense(512)
        self.tanh =  tanh
        self.softmax = softmax
        self.dense2 = Dense(1,activation="softmax")

    def call(self , input_tensors):
        x = self.dense1(input_tensors)
        x = self.tanh(x)
        x = self.dense2(x)
        x = self.softmax(x , axis = 1)
        return x  

class CLRModel(Model):
    def __init__(self,max_len,name):
        super(CLRModel, self).__init__()
        self.roberta_model = RobertaBlock(max_len , name)
        self.attentionhead = AttentionHead()
        self.regressionhead = RegressionHead()
    
    def call(self,input_tensors):
        roberta_output = self.roberta_model(input_tensors)
        #print('shape of roberta output is' , roberta_output.shape)
        weights = self.attentionhead(roberta_output)
        #print('shape of attention head is',weights.shape)
        context_vector = tf.reduce_sum(weights * roberta_output, axis=1)
        #print("shape of context vector" , context_vector.shape)
        x = self.regressionhead(context_vector)
        return x
    
    def model(self):
        input_id = Input(shape = (max_len, ) ,dtype=tf.int32, name = 'input_ids')
        attention_mask = Input(shape=(max_len,) ,dtype=tf.int32, name = 'attention_mask')
        
        return Model(inputs = [input_id , attention_mask] , outputs = self.call([input_id , attention_mask]))

In [14]:
def load_model_weights(model, path):
    print("loading weights")
    model.load_weights(path)
    return model

In [15]:
file_path = "../input/robertabaseweights2/result/*.h5"
model_weight_paths = glob(file_path)
model_weight_paths = sorted(model_weight_paths)


# json_path = "../input/robertabaseweights/model_structure.json"
# model_structure= load_model_from_json(json_path)

model_path = "../input/huggingface-roberta/roberta-large"
model = CLRModel(max_len , model_path).model()

model1 = load_model_weights(model , model_weight_paths[0])
model2 = load_model_weights(model , model_weight_paths[1])
model3 = load_model_weights(model , model_weight_paths[2])
model4 = load_model_weights(model , model_weight_paths[3])
model5 = load_model_weights(model , model_weight_paths[4])

Some layers from the model checkpoint at ../input/huggingface-roberta/roberta-large were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at ../input/huggingface-roberta/roberta-large.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


loading weights
loading weights
loading weights
loading weights
loading weights


In [16]:
y_pred_1 = model1.predict(test , verbose = 1)
y_pred_2 = model2.predict(test , verbose = 1)
y_pred_3 = model3.predict(test, verbose = 1)
y_pred_4 = model4.predict(test, verbose = 1)
y_pred_5 = model5.predict(test, verbose = 1)



In [17]:
sub_scores=[]
for i,j,k,l,m in zip(y_pred_1,y_pred_2,y_pred_3,y_pred_4,y_pred_5):
    sub_scores.append(np.mean([i[0],j[0],k[0],l[0],m[0]]))

In [18]:
sub=pd.DataFrame({'id':df_ss['id'],'target':sub_scores})
sub.to_csv('submission.csv',index=False)
sub.head()

Unnamed: 0,id,target
0,c0f722661,-0.24749
1,f0953f0a5,-0.380616
2,0df072751,-0.463892
3,04caf4e0c,-2.186117
4,0e63f8bea,-1.851066
