# TF Model
This notebook is a fork of Xiang Zhang's notebook [here][1]. It trains a TensorFlow deberta-v3-base model and saves the model weights for each of the 5 folds. Because this model differs from the other models in the ensemble, it adds diversity and helps improve the ensemble of the 3rd place solution to the Feedback Prize - English Language Learning competition. Use the ensemble inference notebook to produce the submission CSV file.

[1]: https://www.kaggle.com/code/electro/deberta-layerwiselr-lastlayerreinit-tensorflow

# Imports

In [1]:
import os, gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
print(f'TF version: {tf.__version__}')
import tensorflow_addons as tfa
from tensorflow.keras import layers

import transformers
print(f'transformers version: {transformers.__version__}')
from transformers import logging as hf_logging
hf_logging.set_verbosity_error()

import sys
!pip install iterative-stratification
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

TF version: 2.6.4
transformers version: 4.20.1
Collecting iterative-stratification
  Downloading iterative_stratification-0.1.7-py3-none-any.whl (8.5 kB)
Installing collected packages: iterative-stratification
Successfully installed iterative-stratification-0.1.7
[0m

In [2]:
def set_seed(seed=42):
    """Seed the Python, NumPy and TensorFlow random number generators.

    Args:
        seed: Integer seed applied to every generator (default 42).
    """
    # Local import: the notebook's import cell does not bring in `random`.
    import random

    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    # NOTE(review): PYTHONHASHSEED is normally read when the interpreter
    # starts, so setting it here presumably only affects child processes —
    # confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Left disabled by the original author:
#     os.environ['TF_DETERMINISTIC_OPS'] = '1'
set_seed(42)

# Load DataFrame

In [3]:
# Read the training CSV (path is relative to the notebook's working directory)
# and show the first rows plus a column/dtype summary.
df = pd.read_csv('../input/feedback-prize-english-language-learning/train.csv')
display(df.head())
print('\n---------DataFrame Summary---------')
# df.info() prints its report itself, so it is not wrapped in print().
df.info()

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5



---------DataFrame Summary---------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3911 entries, 0 to 3910
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   text_id      3911 non-null   object 
 1   full_text    3911 non-null   object 
 2   cohesion     3911 non-null   float64
 3   syntax       3911 non-null   float64
 4   vocabulary   3911 non-null   float64
 5   phraseology  3911 non-null   float64
 6   grammar      3911 non-null   float64
 7   conventions  3911 non-null   float64
dtypes: float64(6), object(2)
memory usage: 244.6+ KB


# CV Split

In [4]:
# Number of cross-validation folds and the six score columns the model predicts.
N_FOLD = 5
TARGET_COLS = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

# Stratify on all six targets at once; the fixed random_state keeps the
# assignment the same between runs.
skf = MultilabelStratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=42)
for n, (train_index, val_index) in enumerate(skf.split(df, df[TARGET_COLS])):
    # Rows held out in split n get fold id n.
    # NOTE(review): .loc is label-based, so this presumably relies on df
    # having the default RangeIndex — confirm if df is ever filtered first.
    df.loc[val_index, 'fold'] = int(n)
# Cast the column to int once every row has been assigned a fold.
df['fold'] = df['fold'].astype(int)
df['fold'].value_counts()

1    783
0    782
4    782
3    782
2    782
Name: fold, dtype: int64

In [5]:
# Write the frame, now including the `fold` column, to disk without the index.
df.to_csv('./df_folds.csv', index=False)

# Model Config

In [6]:
MAX_LENGTH = 512  # token length every text is padded/truncated to in deberta_encode
BATCH_SIZE = 4  # batch size passed to model.fit and model.predict
DEBERTA_MODEL = 'microsoft/deberta-v3-base'  # checkpoint name used for the tokenizer, config and weights

In [7]:
# Load the tokenizer for DEBERTA_MODEL and store a copy under ./tokenizer/.
tokenizer = transformers.AutoTokenizer.from_pretrained(DEBERTA_MODEL)
tokenizer.save_pretrained('./tokenizer/')

# Load the model config with hidden states exposed (get_model pools the last
# four of them) and set both dropout probabilities to 0.
cfg = transformers.AutoConfig.from_pretrained(DEBERTA_MODEL, output_hidden_states=True)
cfg.hidden_dropout_prob = 0
cfg.attention_probs_dropout_prob = 0
# The config is written into the same ./tokenizer/ directory as the tokenizer files.
cfg.save_pretrained('./tokenizer/')

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

  "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"


# Data Process Function

In [8]:
def deberta_encode(texts, tokenizer=tokenizer):
    """Tokenize texts into fixed-length model inputs.

    Every text gets special tokens added and is truncated or padded to
    exactly MAX_LENGTH tokens. All texts are tokenized in a single batched
    tokenizer call instead of one call per text.

    Args:
        texts: Sequence of strings, e.g. a pandas Series, NumPy array or list.
        tokenizer: Tokenizer to apply; defaults to the notebook-level
            `tokenizer` (bound when this function is defined).

    Returns:
        Tuple `(input_ids, attention_mask)` of int32 NumPy arrays, each of
        shape `(len(texts), MAX_LENGTH)`.
    """
    text_list = texts.tolist() if hasattr(texts, 'tolist') else list(texts)

    # Nothing to tokenize: return correctly shaped empty arrays rather than
    # handing an empty batch to the tokenizer.
    if not text_list:
        empty = np.zeros((0, MAX_LENGTH), dtype="int32")
        return empty, empty.copy()

    encoded = tokenizer(text_list,
                        add_special_tokens=True,
                        max_length=MAX_LENGTH,
                        return_attention_mask=True,
                        return_tensors="np",
                        truncation=True,
                        padding='max_length')

    # The model inputs are declared as int32, so cast whatever integer dtype
    # the tokenizer produced.
    input_ids = np.asarray(encoded['input_ids'], dtype="int32")
    attention_mask = np.asarray(encoded['attention_mask'], dtype="int32")
    return input_ids, attention_mask

In [9]:
def get_dataset(df):
    """Convert a DataFrame of essays into model-ready arrays.

    Args:
        df: Frame with a 'full_text' column and every column in TARGET_COLS.

    Returns:
        Tuple `(inputs, targets)`: `inputs` is the result of
        `deberta_encode` on the texts, `targets` is a float32 array of the
        TARGET_COLS values.
    """
    encoded_texts = deberta_encode(df['full_text'])
    scores = df[TARGET_COLS].to_numpy(dtype="float32")
    return encoded_texts, scores

# Model

In [10]:
class MeanPool(tf.keras.layers.Layer):
    """Average embeddings over axis 1, optionally ignoring masked positions."""

    def call(self, inputs, mask=None):
        """Mean-pool `inputs` along axis 1.

        Args:
            inputs: Tensor whose axis 1 is reduced; in this notebook it is a
                hidden state of the DeBERTa encoder.
            mask: Optional tensor with one entry per position on the first
                two axes of `inputs`; positions whose entry is 0 are left out
                of the average. When None, every position counts equally.

        Returns:
            `inputs` averaged over axis 1 (that axis is removed).
        """
        if mask is None:
            # No mask supplied: every position contributes, i.e. a plain mean.
            return tf.reduce_mean(inputs, axis=1)

        broadcast_mask = tf.expand_dims(tf.cast(mask, "float32"), -1)
        embedding_sum = tf.reduce_sum(inputs * broadcast_mask, axis=1)
        mask_sum = tf.reduce_sum(broadcast_mask, axis=1)
        # Keep the denominator strictly positive so a fully masked row does
        # not divide by zero.
        mask_sum = tf.math.maximum(mask_sum, tf.constant([1e-9]))
        return embedding_sum / mask_sum

In [11]:
class WeightsSumOne(tf.keras.constraints.Constraint):
    """Constraint that passes a weight tensor through a softmax over axis 0."""

    def __call__(self, w):
        # Softmax output is non-negative and sums to one along axis 0.
        normalized = tf.nn.softmax(w, axis=0)
        return normalized

In [12]:
def get_model():
    """Build and compile the DeBERTa regression model.

    The model takes two int32 inputs of length MAX_LENGTH (`input_ids` and
    `attention_masks`), runs them through the pretrained DEBERTA_MODEL
    encoder loaded with `cfg`, mean-pools each of the last four hidden
    states, combines the four pooled vectors with a learned weighting
    (a bias-free Dense(1) constrained by WeightsSumOne) and maps the result
    to six outputs through a sigmoid rescaled to the range 1..5.

    Before the forward graph is built, the Dense and LayerNormalization
    sub-layers of the last REINIT_LAYERS encoder blocks are re-initialized.

    The model is compiled with a tfa MultiOptimizer that gives every encoder
    block (and the embeddings) its own Adam optimizer with a layer-wise
    decayed learning rate, plus a separate Adam for the last four Keras
    layers of the model.

    Returns:
        The compiled `tf.keras.Model`.
    """
    input_ids = tf.keras.layers.Input(
        shape=(MAX_LENGTH,), dtype=tf.int32, name="input_ids"
    )
    
    attention_masks = tf.keras.layers.Input(
        shape=(MAX_LENGTH,), dtype=tf.int32, name="attention_masks"
    )
   
    deberta_model = transformers.TFAutoModel.from_pretrained(DEBERTA_MODEL, config=cfg)
    
    # Last-layer re-initialization (partial re-initialization of the encoder)
#     Uncomment next three lines to check deberta encoder block
#     print('DeBERTa Encoder Block:')
#     for layer in deberta_model.deberta.encoder.layer:
#         print(layer)
        
    # Gotcha: the slice below is [-REINIT_LAYERS:], so a value of 0 would
    # select every encoder block rather than none.
    REINIT_LAYERS = 1
    # Despite its name, this is a Glorot *uniform* initializer.
    normal_initializer = tf.keras.initializers.GlorotUniform()
    zeros_initializer = tf.keras.initializers.Zeros()
    ones_initializer = tf.keras.initializers.Ones()

#     print(f'\nRe-initializing encoder block:')
    for encoder_block in deberta_model.deberta.encoder.layer[-REINIT_LAYERS:]:
#         print(f'{encoder_block}')
        for layer in encoder_block.submodules:
            if isinstance(layer, tf.keras.layers.Dense):
                # Dense: fresh kernel, zero bias (when the layer has one).
                layer.kernel.assign(normal_initializer(shape=layer.kernel.shape, dtype=layer.kernel.dtype))
                if layer.bias is not None:
                    layer.bias.assign(zeros_initializer(shape=layer.bias.shape, dtype=layer.bias.dtype))

            elif isinstance(layer, tf.keras.layers.LayerNormalization):
                # LayerNormalization: reset to the identity transform
                # (beta = 0, gamma = 1).
                layer.beta.assign(zeros_initializer(shape=layer.beta.shape, dtype=layer.beta.dtype))
                layer.gamma.assign(ones_initializer(shape=layer.gamma.shape, dtype=layer.gamma.dtype))

    deberta_output = deberta_model.deberta(
        input_ids, attention_mask=attention_masks
    )
    # Available because cfg was loaded with output_hidden_states=True.
    hidden_states = deberta_output.hidden_states
    
    # WeightedLayerPool + MeanPool of the last 4 hidden states:
    # each hidden state is mean-pooled with the attention mask, and the four
    # pooled tensors are stacked along a new axis 2.
    stack_meanpool = tf.stack(
        [MeanPool()(hidden_s, mask=attention_masks) for hidden_s in hidden_states[-4:]], 
        axis=2)
    
    # A bias-free Dense(1) over the stacked axis mixes the four pooled states;
    # WeightsSumOne is applied to its kernel as a Keras kernel constraint.
    weighted_layer_pool = layers.Dense(1,
                                       use_bias=False,
                                       kernel_constraint=WeightsSumOne())(stack_meanpool)
    
    # Drop the size-1 axis produced by Dense(1).
    weighted_layer_pool = tf.squeeze(weighted_layer_pool, axis=-1)
    
    # Sigmoid yields values in (0, 1); Rescaling maps them to 4 * x + 1.
    x = layers.Dense(6, activation='sigmoid')(weighted_layer_pool)
    output = layers.Rescaling(scale=4.0, offset=1.0)(x)
    model = tf.keras.Model(inputs=[input_ids, attention_masks], outputs=output)
    
    # Compile model with Layer-wise Learning Rate Decay.
    # After the reverse, index 0 is the last encoder block and the final
    # index is the embeddings, so index i gets initial LR INIT_LR * LLRDR ** i.
    layer_list = [deberta_model.deberta.embeddings] + list(deberta_model.deberta.encoder.layer)
    layer_list.reverse()
    
    INIT_LR = 1e-5
    LLRDR = 0.9
    LR_SCH_DECAY_STEPS = 1600 # 2 * len(train_df) // BATCH_SIZE
    
    # One exponential-decay schedule per entry of layer_list.
    lr_schedules = [tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=INIT_LR * LLRDR ** i, 
        decay_steps=LR_SCH_DECAY_STEPS, 
        decay_rate=0.3) for i in range(len(layer_list))]
    # The head uses the same decay settings with a larger initial LR.
    lr_schedule_head = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=1e-4, 
        decay_steps=LR_SCH_DECAY_STEPS, 
        decay_rate=0.3)
    
    optimizers = [tf.keras.optimizers.Adam(learning_rate=lr_sch) for lr_sch in lr_schedules]
    
    # First pair: head optimizer for the last four layers of the Keras model;
    # remaining pairs: one optimizer per entry of layer_list.
    # NOTE(review): variables that belong to neither model.layers[-4:] nor
    # layer_list are not listed here — confirm how MultiOptimizer treats them.
    optimizers_and_layers = [(tf.keras.optimizers.Adam(learning_rate=lr_schedule_head), model.layers[-4:])] +\
        list(zip(optimizers, layer_list))
    
#     Uncomment next three lines to check optimizers_and_layers
#     print('\nLayer-wise Learning Rate Decay Initial LR:')
#     for o,l in optimizers_and_layers:
#         print(f'{o._decayed_lr("float32").numpy()} for {l}')
        
    optimizer = tfa.optimizers.MultiOptimizer(optimizers_and_layers)
    
    # Huber loss for training; RMSE is tracked as the reported metric.
    model.compile(optimizer=optimizer,
                 loss='huber_loss',
                 metrics=[tf.keras.metrics.RootMeanSquaredError()],
                 )
    return model

In [13]:
# Build one model here only to print its summary; the training loop further
# down creates a fresh model for every fold.
tf.keras.backend.clear_session()
model = get_model()
model.summary()

2022-12-14 18:25:33.220494: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-14 18:25:33.221595: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-14 18:25:33.222268: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-14 18:25:33.223132: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

Downloading:   0%|          | 0.00/702M [00:00<?, ?B/s]

2022-12-14 18:26:29.732517: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 512)]        0                                            
__________________________________________________________________________________________________
attention_masks (InputLayer)    [(None, 512)]        0                                            
__________________________________________________________________________________________________
deberta (TFDebertaV2MainLayer)  TFBaseModelOutput(la 183831552   input_ids[0][0]                  
                                                                 attention_masks[0][0]            
__________________________________________________________________________________________________
mean_pool (MeanPool)            (None, 768)          0           deberta[0][9]                

# 5 Folds Training Loop

In [14]:
# Train one model per fold. For each fold the weights of the epoch with the
# lowest validation loss are saved, and that same epoch's validation RMSE is
# recorded in valid_rmses.
valid_rmses = []
for fold in range(N_FOLD):
    print(f'\n-----------FOLD {fold} ------------')

    #Create dataset
    train_df = df[df['fold'] != fold].reset_index(drop=True)
    valid_df = df[df['fold'] == fold].reset_index(drop=True)
    train_dataset = get_dataset(train_df)
    valid_dataset = get_dataset(valid_df)

    print('Data prepared.')
    print(f'Training data input_ids shape: {train_dataset[0][0].shape} dtype: {train_dataset[0][0].dtype}')
    print(f'Training data attention_mask shape: {train_dataset[0][1].shape} dtype: {train_dataset[0][1].dtype}')
    print(f'Training data targets shape: {train_dataset[1].shape} dtype: {train_dataset[1].dtype}')
    print(f'Validation data input_ids shape: {valid_dataset[0][0].shape} dtype: {valid_dataset[0][0].dtype}')
    print(f'Validation data attention_mask shape: {valid_dataset[0][1].shape} dtype: {valid_dataset[0][1].dtype}')
    print(f'Validation data targets shape: {valid_dataset[1].shape} dtype: {valid_dataset[1].dtype}')

    #Create model
    # Start every fold from a clean Keras session and a freshly built model.
    tf.keras.backend.clear_session()
    model = get_model()
    print('Model prepared.')

    #Training model
    print('Start training...')
    callbacks = [
        # Keep only the weights of the epoch with the lowest val_loss.
        tf.keras.callbacks.ModelCheckpoint(f"best_model_fold{fold}.h5",
                                           monitor="val_loss",
                                           mode="min",
                                           save_best_only=True,
                                           verbose=1,
                                           save_weights_only=True,),
        # Stop once val_loss has not improved by at least min_delta for
        # three consecutive epochs.
        tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                         min_delta=1e-5,
                                         patience=3,
                                         verbose=1,
                                         mode='min',),
    ]
    history = model.fit(x=train_dataset[0],
                        y=train_dataset[1],
                        validation_data=valid_dataset,
                        epochs=10,
                        shuffle=True,
                        batch_size=BATCH_SIZE,
                        callbacks=callbacks
                       )

    # The checkpoint is selected by val_loss, so report the RMSE of that same
    # epoch. Taking the minimum RMSE over all epochs could describe an epoch
    # whose weights were never saved. argmin returns the first minimum, which
    # matches the checkpoint only being overwritten on a strict improvement.
    best_epoch = int(np.argmin(history.history['val_loss']))
    valid_rmses.append(history.history['val_root_mean_squared_error'][best_epoch])
    print('Training finished.')
    # Release the encoded arrays before the next fold is prepared.
    del train_dataset, valid_dataset, train_df, valid_df
    gc.collect()


-----------FOLD 0 ------------
Data prepared.
Training data input_ids shape: (3129, 512) dtype: int32
Training data attention_mask shape: (3129, 512) dtype: int32
Training data targets shape: (3129, 6) dtype: float32
Validation data input_ids shape: (782, 512) dtype: int32
Validation data attention_mask shape: (782, 512) dtype: int32
Validation data targets shape: (782, 6) dtype: float32
Model prepared.
Start training...
Epoch 1/10


2022-12-14 18:26:52.930942: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)



Epoch 00001: val_loss improved from inf to 0.10474, saving model to best_model_fold0.h5
Epoch 2/10

Epoch 00002: val_loss did not improve from 0.10474
Epoch 3/10

Epoch 00003: val_loss improved from 0.10474 to 0.10110, saving model to best_model_fold0.h5
Epoch 4/10

Epoch 00004: val_loss did not improve from 0.10110
Epoch 5/10

Epoch 00005: val_loss did not improve from 0.10110
Epoch 6/10

Epoch 00006: val_loss did not improve from 0.10110
Epoch 00006: early stopping
Training finished.

-----------FOLD 1 ------------
Data prepared.
Training data input_ids shape: (3128, 512) dtype: int32
Training data attention_mask shape: (3128, 512) dtype: int32
Training data targets shape: (3128, 6) dtype: float32
Validation data input_ids shape: (783, 512) dtype: int32
Validation data attention_mask shape: (783, 512) dtype: int32
Validation data targets shape: (783, 6) dtype: float32
Model prepared.
Start training...
Epoch 1/10

Epoch 00001: val_loss improved from inf to 0.10719, saving model to be

In [15]:
# Show the per-fold validation RMSE values collected during training and
# their plain average across folds.
print(f'{len(valid_rmses)} Folds validation RMSE:\n{valid_rmses}')
print(f'Local CV Average score: {np.mean(valid_rmses)}')

5 Folds validation RMSE:
[0.4505632817745209, 0.459128201007843, 0.46264514327049255, 0.45495015382766724, 0.45215511322021484]
Local CV Average score: 0.4558883786201477


# Inference and Submission



In [16]:
# Read the test CSV (path is relative to the notebook's working directory)
# and preview its first rows.
test_df = pd.read_csv('../input/feedback-prize-english-language-learning/test.csv')
test_df.head()

Unnamed: 0,text_id,full_text
0,0000C359D63E,when a person has no experience on a job their...
1,000BAD50D026,Do you think students would benefit from being...
2,00367BB2546B,"Thomas Jefferson once states that ""it is wonde..."


In [17]:
# Tokenize the test essays once; the same encoded arrays are reused for every fold model.
test_dataset = deberta_encode(test_df['full_text'])

# 5 Folds ensemble prediction

In [18]:
# Predict the test set once per fold, using the weights saved for that fold.
fold_preds = []
for fold in range(N_FOLD):
    # Rebuild the architecture in a clean session, then replace its weights
    # with the checkpoint written during training.
    tf.keras.backend.clear_session()
    model = get_model()
    model.load_weights(f'best_model_fold{fold}.h5')
    print(f'\nFold {fold} inference...')
    pred = model.predict(test_dataset, batch_size=BATCH_SIZE)
    fold_preds.append(pred)
    gc.collect()


Fold 0 inference...

Fold 1 inference...

Fold 2 inference...

Fold 3 inference...

Fold 4 inference...


In [19]:
# Average the fold predictions element-wise (axis 0 indexes the folds),
# then clamp every value to the range 1..5.
preds = np.mean(fold_preds, axis=0)
preds = np.clip(preds, 1, 5)

In [20]:
# Place text_id next to the six predicted columns (column-wise concat, aligned
# on the row index) and write the result without the index.
sub_df = pd.concat([test_df[['text_id']], pd.DataFrame(preds, columns=TARGET_COLS)], axis=1)
sub_df.to_csv('submission.csv', index=False)

In [21]:
# Preview the first rows of the submission frame.
sub_df.head()

Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,2.92725,2.759098,3.090906,2.983907,2.65453,2.625527
1,000BAD50D026,2.697297,2.502063,2.743973,2.359509,2.12162,2.607434
2,00367BB2546B,3.708724,3.460325,3.653716,3.513729,3.37132,3.262542
