# distilBERT <br/>
https://towardsdatascience.com/hugging-face-transformers-fine-tuning-distilbert-for-binary-classification-tasks-490f1d192379

In [25]:
from google.colab import drive
drive.mount('/content/drive') # /content/drive/MyDrive/HLT

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [26]:
!pip install transformers



In [27]:
!git clone https://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@github.com/antodima/toxic-comment.git
!mv toxic-comment/data/ .

fatal: destination path 'toxic-comment' already exists and is not an empty directory.
mv: cannot stat 'toxic-comment/data/': No such file or directory


In [28]:
import numpy as np
import pandas as pd
import datetime
from sklearn.metrics import accuracy_score, coverage_error
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
from sklearn.model_selection import train_test_split
from transformers import TFDistilBertModel, DistilBertTokenizerFast

tf.compat.v1.enable_eager_execution()

In [29]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [30]:
def batch_encode(tokenizer, texts, batch_size=256, max_length=100):
    """""""""
    A function that encodes a batch of texts and returns the texts'
    corresponding encodings and attention masks that are ready to be fed 
    into a pre-trained transformer model.
    
    Input:
        - tokenizer:   Tokenizer object from the PreTrainedTokenizer Class
        - texts:       List of strings where each string represents a text
        - batch_size:  Integer controlling number of texts in a batch
        - max_length:  Integer controlling max number of words to tokenize in a given text
    Output:
        - input_ids:       sequence of texts encoded as a tf.Tensor object
        - attention_mask:  the texts' attention mask encoded as a tf.Tensor object
    """""""""
    
    input_ids = []
    attention_mask = []
    
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        inputs = tokenizer.batch_encode_plus(batch, max_length=max_length, padding='longest', truncation=True, 
                                             return_attention_mask=True, return_token_type_ids=False)
        input_ids.extend(inputs['input_ids'])
        attention_mask.extend(inputs['attention_mask'])
        
    return tf.convert_to_tensor(input_ids), tf.convert_to_tensor(attention_mask)

In [31]:
df = pd.read_csv('data/train.csv')
categories = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
num_classes = len(categories)
comments = list(df.comment_text.values)

X_train, X_valid, y_train, y_valid = train_test_split(df['comment_text'].values, df[categories].values, 
                                                      train_size=0.8, shuffle=True, random_state=42)
print(X_train.shape,y_train.shape)
print(X_valid.shape,y_valid.shape)

(127656,) (127656, 6)
(31915,) (31915, 6)


In [32]:
df_test = pd.read_csv('data/test.csv')
cols = df_test.columns
label_cols = list(cols[2:])
test_labels_df = pd.read_csv('data/test_labels.csv')
df_test = df_test.merge(test_labels_df, on='id', how='left')
test_label_cols = list(df_test.columns[2:])
df_test = df_test[~df_test[test_label_cols].eq(-1).any(axis=1)] # remove irrelevant rows/comments with -1 values

X_test = df_test['comment_text'].values
y_test = df_test[categories].values

print(X_test.shape,y_test.shape)

(63978,) (63978, 6)


In [33]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
distilBERT = TFDistilBertModel.from_pretrained('distilbert-base-uncased')
for layer in distilBERT.layers:
    layer.trainable = False

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_layer_norm', 'vocab_projector', 'vocab_transform', 'activation_13']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [34]:
max_length = 100
# Encode X_train
X_train_ids, X_train_attention = batch_encode(tokenizer, X_train.tolist(), max_length=max_length)
# Encode X_valid
X_valid_ids, X_valid_attention = batch_encode(tokenizer, X_valid.tolist(), max_length=max_length)
# Encode X_test
X_test_ids, X_test_attention = batch_encode(tokenizer, X_test.tolist(), max_length=max_length)

In [35]:
input_ids_layer = tf.keras.layers.Input(shape=(max_length,), name='input_ids', dtype='int32')
input_attention_layer = tf.keras.layers.Input(shape=(max_length,), name='input_attention', dtype='int32')
last_hidden_state = distilBERT([input_ids_layer, input_attention_layer])[0]
cls_token = last_hidden_state[:, 0, :]
output = tf.keras.layers.Dense(50, activation="relu", kernel_initializer='he_uniform')(cls_token)
output = tf.keras.layers.Dropout(0.1)(output)
output = tf.keras.layers.Dense(num_classes, activation='sigmoid', name='classifier', kernel_initializer='glorot_uniform')(output)

model = tf.keras.Model([input_ids_layer, input_attention_layer], output, name='')
model.compile(optimizer='adam', 
              loss='binary_crossentropy', 
              metrics=[
                  'accuracy',
                  'binary_accuracy', 
                  tf.keras.metrics.AUC(multi_label=True),
                  tf.keras.metrics.Precision(),
                  tf.keras.metrics.Recall()
              ])
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 100)]        0                                            
__________________________________________________________________________________________________
input_attention (InputLayer)    [(None, 100)]        0                                            
__________________________________________________________________________________________________
tf_distil_bert_model_2 (TFDisti TFBaseModelOutput(la 66362880    input_ids[0][0]                  
                                                                 input_attention[0][0]            
__________________________________________________________________________________________________
tf.__operators__.getitem_2 (Sli (None, 768)          0           tf_distil_bert_model_2[0][0

In [36]:
mode = 0  # 0: train, 1: evaluate, 2: finetuning

if mode == 0:  # training
  now = datetime.datetime.now()
  ckp_dir = f"/content/drive/MyDrive/HLT/checkpoint_{now.strftime('%Y%m%d_%H%M%S')}/distil-bert"
  print(f"Training {ckp_dir} . . .")
  epochs = 4
  batch_size = 64
  num_steps = len(X_train) // batch_size

  with tf.device('/GPU:0'):
    history = model.fit(
        x = [X_train_ids, X_train_attention],
        y = y_train,
        epochs = epochs,
        batch_size = batch_size,
        steps_per_epoch = num_steps,
        validation_data = ([X_valid_ids, X_valid_attention], y_valid),
        verbose=1,
        callbacks=[
          ModelCheckpoint(filepath=ckp_dir, save_weights_only=True, monitor='val_binary_accuracy', mode='max', save_best_only=True)
        ]
    )

    scores = model.evaluate([X_test_ids, X_test_attention], y_test, verbose=2)
    print(f"Test {model.metrics_names}: {scores}")
    scores = model.evaluate([X_valid_ids, X_valid_attention], y_valid, verbose=2)
    print(f"Validation {model.metrics_names}: {scores}")

elif mode == 1:  # evaluate
  path = '/content/drive/MyDrive/HLT/checkpoint_20210920_105809/'
  print(f"Evaluating {path} . . .")
  model.load_weights(tf.train.latest_checkpoint(path))

  scores = model.evaluate([X_test_ids, X_test_attention], y_test, verbose=2)
  print(f"Test {model.metrics_names}: {scores}")
  scores = model.evaluate([X_valid_ids, X_valid_attention], y_valid, verbose=2)
  print(f"Validation {model.metrics_names}: {scores}")

elif mode == 2:  # fnetuning
  path = '/content/drive/MyDrive/HLT/checkpoints_9751/'
  model.load_weights(tf.train.latest_checkpoint(path))
  
  scores = model.evaluate([X_test_ids, X_test_attention], y_test, verbose=2)
  print(f"Test {model.metrics_names} (pre): {scores}")
  scores = model.evaluate([X_valid_ids, X_valid_attention], y_valid, verbose=2)
  print(f"Validation {model.metrics_names} (pre): {scores}")

  print(f"Finetuning {path} . . .")
  for layer in model.layers[-3].layers:
    layer.trainable = True
  model.summary()
  epochs = 2
  batch_size = 64
  num_steps = len(X_train) // batch_size

  now = datetime.datetime.now()
  ckp_dir = f"/content/drive/MyDrive/HLT/checkpoint_ft_{now.strftime('%Y%m%d_%H%M%S')}/distil-bert-finetuned"

  with tf.device('/GPU:0'):
    history = model.fit(
        x = [X_train_ids, X_train_attention],
        y = y_train,
        epochs = epochs,
        batch_size = batch_size,
        steps_per_epoch = num_steps,
        validation_data = ([X_valid_ids, X_valid_attention], y_valid),
        verbose=1,
        callbacks=[
          ModelCheckpoint(filepath=ckp_dir, save_weights_only=True, monitor='val_binary_accuracy', mode='max', save_best_only=True)
        ]
    )

  scores = model.evaluate([X_test_ids, X_test_attention], y_test, verbose=2)
  print(f"Test {model.metrics_names}: {scores}")
  scores = model.evaluate([X_valid_ids, X_valid_attention], y_valid, verbose=2)
  print(f"Validation {model.metrics_names}: {scores}")

Training /content/drive/MyDrive/HLT/checkpoint_20210920_195726/distil-bert . . .
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
2000/2000 - 386s - loss: 0.0790 - accuracy: 0.9914 - binary_accuracy: 0.9713 - auc_2: 0.9411 - precision_2: 0.6544 - recall_2: 0.5109
Test ['loss', 'accuracy', 'binary_accuracy', 'auc_2', 'precision_2', 'recall_2']: [0.07895965129137039, 0.991387665271759, 0.9713393449783325, 0.941145122051239, 0.6544442772865295, 0.5108980536460876]
998/998 - 193s - loss: 0.0569 - accuracy: 0.9889 - binary_accuracy: 0.9791 - auc_2: 0.9567 - precision_2: 0.8315 - recall_2: 0.5441
Validation ['loss', 'accuracy', 'binary_accuracy', 'auc_2', 'precision_2', 'recall_2']: [0.05693058669567108, 0.9888767004013062, 0.9790856242179871, 0.9567497372627258, 0.831497073173523, 0.5441051721572876]
