# DistilBERT <br/>
Reference tutorial: https://towardsdatascience.com/hugging-face-transformers-fine-tuning-distilbert-for-binary-classification-tasks-490f1d192379

In [11]:
import numpy as np
import pandas as pd
import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
from sklearn.model_selection import train_test_split
from transformers import TFDistilBertModel, DistilBertTokenizerFast

In [5]:
def batch_encode(tokenizer, texts, batch_size=256, max_length=100):
    """
    Tokenize texts in batches and return fixed-width input ids and attention masks
    that are ready to be fed into a pre-trained transformer model.

    Every text is truncated and padded to exactly ``max_length`` tokens, so rows
    produced by different batches always have the same width and can be stacked
    into one rectangular tensor.

    Input:
        - tokenizer:   Tokenizer object exposing ``batch_encode_plus``
        - texts:       List of strings where each string represents a text
        - batch_size:  Integer controlling number of texts tokenized per call
        - max_length:  Integer number of tokens (not words) each text is
                       truncated/padded to
    Output:
        - input_ids:       token ids as a tf.Tensor; for non-empty ``texts`` the
                           shape is (len(texts), max_length)
        - attention_mask:  the attention mask returned by the tokenizer, as a
                           tf.Tensor with the same shape as ``input_ids``
    """

    input_ids = []
    attention_mask = []

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        # Pad to max_length rather than to the longest text in this batch:
        # 'longest' gives each batch its own width, which makes the accumulated
        # rows ragged and breaks tf.convert_to_tensor below.
        inputs = tokenizer.batch_encode_plus(batch, max_length=max_length, padding='max_length', truncation=True,
                                             return_attention_mask=True, return_token_type_ids=False)
        input_ids.extend(inputs['input_ids'])
        attention_mask.extend(inputs['attention_mask'])

    return tf.convert_to_tensor(input_ids), tf.convert_to_tensor(attention_mask)

In [23]:
# Load the labelled training comments; the label columns are listed in `categories`.
df = pd.read_csv('data/train.csv')
categories = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
num_classes = len(categories)
comments = list(df['comment_text'].values)

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    df['comment_text'].values,
    df[categories].values,
    train_size=0.8,
    shuffle=True,
    random_state=42,
)
print(X_train.shape, y_train.shape)
print(X_valid.shape, y_valid.shape)

(127656,) (127656, 6)
(31915,) (31915, 6)


In [24]:
# Read the test comments and record the column layout before labels are attached.
df_test = pd.read_csv('data/test.csv')
cols = df_test.columns
label_cols = list(cols[2:])

# Attach the test labels by comment id (left join keeps every test comment).
test_labels_df = pd.read_csv('data/test_labels.csv')
df_test = pd.merge(df_test, test_labels_df, on='id', how='left')
test_label_cols = list(df_test.columns[2:])

# Remove irrelevant rows/comments: keep only rows where no label column equals -1.
has_no_minus_one = df_test[test_label_cols].ne(-1).all(axis=1)
df_test = df_test[has_no_minus_one]

X_test = df_test['comment_text'].values
y_test = df_test[categories].values

print(X_test.shape, y_test.shape)

(63978,) (63978, 6)


In [12]:
# Tokenizer and encoder are loaded from the same pretrained checkpoint.
pretrained_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizerFast.from_pretrained(pretrained_name)
distilBERT = TFDistilBertModel.from_pretrained(pretrained_name)

# Freeze each top-level layer of the encoder so its weights are not updated in training.
for layer in distilBERT.layers:
    layer.trainable = False

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_projector', 'vocab_layer_norm', 'activation_13', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [9]:
# Encode X_train
X_train_ids, X_train_attention = batch_encode(tokenizer, X_train.tolist())
# Encode X_valid
X_valid_ids, X_valid_attention = batch_encode(tokenizer, X_valid.tolist())
# Encode X_test
X_test_ids, X_test_attention = batch_encode(tokenizer, X_test.tolist())

In [27]:
lr = 0.001
# Sequence width fed to the encoder. It was previously never defined at notebook
# scope (NameError on a fresh kernel); read it from the encoded training ids so
# the Input layers always match the tensors passed to fit().
max_length = X_train_ids.shape[1]

# Two integer inputs per example: token ids and the matching attention mask.
input_ids_layer = tf.keras.layers.Input(shape=(max_length,), name='input_ids', dtype='int32')
input_attention_layer = tf.keras.layers.Input(shape=(max_length,), name='input_attention', dtype='int32')

# First element of the encoder output is the last hidden state; position 0 of
# each sequence is taken as the pooled representation of the whole comment.
last_hidden_state = distilBERT([input_ids_layer, input_attention_layer])[0]
cls_token = last_hidden_state[:, 0, :]

# Multi-label head: the categories are not mutually exclusive, so each label gets
# an independent sigmoid probability. Softmax would force the six outputs to sum
# to 1, which is inconsistent with the binary cross-entropy loss used below.
output = tf.keras.layers.Dense(num_classes, activation='sigmoid')(cls_token)
model = tf.keras.Model([input_ids_layer, input_attention_layer], output)

# `learning_rate` is the supported argument name; the `lr` alias is deprecated.
model.compile(tf.keras.optimizers.Adam(learning_rate=lr), loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Training schedule.
epochs = 6
batch_size = 64
num_steps = len(X_train) // batch_size  # number of whole batches in the training split

# Fit on the encoded training split and evaluate on the validation split;
# the returned History object is kept in `history`.
history = model.fit(
    x=[X_train_ids, X_train_attention],
    y=y_train,
    validation_data=([X_valid_ids, X_valid_attention], y_valid),
    batch_size=batch_size,
    epochs=epochs,
    steps_per_epoch=num_steps,
    verbose=1,
)