# Transformer Based Enconding and Model


In [1]:
!pip install transformers==2.3.0

Collecting transformers==2.3.0
[?25l  Downloading https://files.pythonhosted.org/packages/50/10/aeefced99c8a59d828a92cc11d213e2743212d3641c87c82d61b035a7d5c/transformers-2.3.0-py3-none-any.whl (447kB)
[K     |████████████████████████████████| 450kB 4.2MB/s eta 0:00:01
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/28/78/fef8d089db5b97546fd6d1ff2e813b8544e85670bf3a8c378c9d0250b98d/sacremoses-0.0.53.tar.gz (880kB)
[K     |████████████████████████████████| 880kB 32.8MB/s eta 0:00:01
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25ldone
[?25h  Created wheel for sacremoses: filename=sacremoses-0.0.53-cp36-none-any.whl size=895254 sha256=3fc4a7282b5ca286abd4c8f0fc9e020a7329c09369115751c4d7ce756deacad9
  Stored in directory: /root/.cache/pip/wheels/56/d5/b2/bc878b2bbddfbcc8fd62ca73c4fd842bd28c1fd3dbdf424c74
Successfully built sacremoses
Installing collected packages: sacremoses, transformers
Successf

In [2]:
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report

### BERT Embedding

In [9]:
# Loading the data

train_df = pd.read_csv('/kaggle/input/fea-eng-toxiccomments/final_data.csv')


# features = ['sentence_count', 'word_count', 'unique_word_count', 
#             'length', 'punctuation_count', 'upper_case_count', 
#             'stopword_count', '#_count', 'unique_word_count_percent', 
#             'Punctuation_percent', 'ip_count','link_count', 
#             'article_id_count', 'username_count', 'clean_comment']

target_col = ['toxic', 'severe_toxic', 'obscene', 'threat',
               'insult', 'identity_hate']
train_df = train_df[['id', 'clean_comment', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate']]
train_df = train_df.dropna()

In [10]:
from transformers import BertTokenizer
from keras.preprocessing.sequence import pad_sequences

bert_model_name = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=True)
MAX_LEN = 128

def tokenize_sentences(sentences, tokenizer, max_seq_len = 128):
    tokenized_sentences = []

    for sentence in tqdm(sentences):
        tokenized_sentence = tokenizer.encode(
                            sentence,                  # Sentence to encode.
                            add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                            max_length = max_seq_len,  # Truncate all sentences.
                    )
        
        tokenized_sentences.append(tokenized_sentence)

    return tokenized_sentences

def create_attention_masks(tokenized_and_padded_sentences):
    attention_masks = []

    for sentence in tokenized_and_padded_sentences:
        att_mask = [int(token_id > 0) for token_id in sentence]
        attention_masks.append(att_mask)

    return np.asarray(attention_masks)

input_ids = tokenize_sentences(train_df['clean_comment'], tokenizer, MAX_LEN)
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")
attention_masks = create_attention_masks(input_ids)

HBox(children=(IntProgress(value=0, max=159508), HTML(value='')))




### Splitting train and test dataset

In [11]:
# splitting train and test dataset

from sklearn.model_selection import train_test_split

labels =  train_df[target_col].values

train_inputs, test_inputs, train_labels, test_labels = train_test_split(input_ids, labels, random_state=0, test_size=0.1)
train_masks, test_masks, train_labels2, test_labels2 = train_test_split(attention_masks, labels, random_state=0, test_size=0.1)

train_size = len(train_inputs)
test_size = len(test_inputs)

### Creating TensorFlow dataset

In [12]:
BATCH_SIZE = 32
NR_EPOCHS = 1

def create_dataset(data_tuple, epochs=1, batch_size=32, buffer_size=10000, train=True):
    dataset = tf.data.Dataset.from_tensor_slices(data_tuple)
    if train:
        dataset = dataset.shuffle(buffer_size=buffer_size)
    dataset = dataset.repeat(epochs)
    dataset = dataset.batch(batch_size)
    if train:
        dataset = dataset.prefetch(1)
    
    return dataset

train_dataset = create_dataset((train_inputs, train_masks, train_labels), epochs=NR_EPOCHS, batch_size=BATCH_SIZE)
test_dataset = create_dataset((test_inputs, test_masks, test_labels), epochs=NR_EPOCHS, batch_size=BATCH_SIZE)

#### BERT Modelling

In [13]:
from transformers import TFBertModel
from tensorflow.keras.layers import Dense, Flatten

class BertClassifier(tf.keras.Model):    
    def __init__(self, bert: TFBertModel, num_classes: int):
        super().__init__()
        self.bert = bert
        self.classifier = Dense(num_classes, activation='sigmoid')
        
    @tf.function
    # Loading pre-trained BERT model
    def call(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
        outputs = self.bert(input_ids,
                               attention_mask=attention_mask,
                               token_type_ids=token_type_ids,
                               position_ids=position_ids,
                               head_mask=head_mask)
        cls_output = outputs[1]
        cls_output = self.classifier(cls_output)
                
        return cls_output

model = BertClassifier(TFBertModel.from_pretrained(bert_model_name), len(target_col))

In [14]:
import time
from transformers import create_optimizer

steps_per_epoch = train_size // BATCH_SIZE
test_steps = test_size // BATCH_SIZE

# loss Function
loss_object = tf.keras.losses.BinaryCrossentropy(from_logits=False)
train_loss = tf.keras.metrics.Mean(name='train_loss')
test_loss = tf.keras.metrics.Mean(name='test_loss')

# Optimizer 
warmup_steps = steps_per_epoch // 3
total_steps = steps_per_epoch * NR_EPOCHS - warmup_steps
optimizer = create_optimizer(init_lr=2e-5, num_train_steps=total_steps, num_warmup_steps=warmup_steps)

# Metrics
train_auc_metrics = [tf.keras.metrics.AUC() for i in range(len(target_col))]
test_auc_metrics = [tf.keras.metrics.AUC() for i in range(len(target_col))]


@tf.function
def train_step(model, token_ids, masks, labels):
    labels = tf.dtypes.cast(labels, tf.float32)

    with tf.GradientTape() as tape:
        predictions = model(token_ids, attention_mask=masks)
        loss = loss_object(labels, predictions)

    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables), 1.0)

    train_loss(loss)

    for i, auc in enumerate(train_auc_metrics):
        auc.update_state(labels[:,i], predictions[:,i])
        
@tf.function
def predict(model, token_ids, masks, labels):
    labels = tf.dtypes.cast(labels, tf.float32)

    predictions = model(token_ids, attention_mask=masks, training=False)
    v_loss = loss_object(labels, predictions)

    test_loss(v_loss)
    for i, auc in enumerate(test_auc_metrics):
        auc.update_state(labels[:,i], predictions[:,i])
        
    return predictions
                                              
def train(model, train_dataset, test_dataset, train_steps_per_epoch, test_steps_per_epoch, epochs):
    for epoch in range(epochs):
        for i, (token_ids, masks, labels) in enumerate(tqdm(train_dataset, total=train_steps_per_epoch)):
            train_step(model, token_ids, masks, labels)
            if i % 1000 == 0:
                print(f'\nTrain Step: {i}, Loss: {train_loss.result()}')
        
        


In [15]:
# Training the Model
train(model, train_dataset, test_dataset, train_steps_per_epoch=steps_per_epoch, 
      test_steps_per_epoch=test_steps, epochs=NR_EPOCHS)

HBox(children=(IntProgress(value=0, max=4486), HTML(value='')))


Train Step: 0, Loss: 0.8382728099822998

Train Step: 1000, Loss: 0.14716094732284546

Train Step: 2000, Loss: 0.09841518849134445

Train Step: 3000, Loss: 0.08033288270235062

Train Step: 4000, Loss: 0.07092995196580887



#### Prediction and Evaluation

In [16]:
TEST_BATCH_SIZE = 32
test_steps = len(test_inputs) // TEST_BATCH_SIZE
result = []

for i, (token_ids, masks, labels) in enumerate(tqdm(test_dataset, total=test_steps)):
    predictions = predict(model, token_ids, masks, labels)
    result.append(predictions)

flat_list = [item for sublist in result for item in sublist]
predictions = []
for res in flat_list:
    predictions.append(np.array(res))
    
predictions = pd.DataFrame(predictions, columns=target_col)

# Converting the score to label using thresholding
for target_label in target_col:
    predictions.loc[predictions[target_label] >= 0.5, target_label] = 1
    predictions.loc[predictions[target_label] < 0.5, target_label] = 0
test_labels = pd.DataFrame(test_labels, columns=target_col)

HBox(children=(IntProgress(value=0, max=498), HTML(value='')))




In [17]:
def evaluation_metrics(y_test, y_pred):
    result = {}
    result['Accuracy'] = accuracy_score(y_test, y_pred)
    result['Precision'] = precision_score(y_test, y_pred, average='weighted')
    result['Recall'] = recall_score(y_test, y_pred, average='weighted')
    result['F1 Score'] = f1_score(y_test, y_pred, average='weighted')
    return result

In [18]:
result_df = pd.DataFrame(['Target Variable','Accuracy', 'Precision', 'Recall', 'F1 Score'])
result_list = []

for target_label in target_col:
    res = evaluation_metrics(test_labels[target_label],predictions[target_label])
    res = list(res.values())
    res.insert(0, target_label)
    result_list.append(res) 

pd.DataFrame(result_list, columns=['Target Variable','Accuracy', 'Precision', 'Recall', 'F1 Score'])

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Unnamed: 0,Target Variable,Accuracy,Precision,Recall,F1 Score
0,toxic,0.838004,0.831326,0.838004,0.834632
1,severe_toxic,0.986208,0.979383,0.986208,0.982784
2,obscene,0.906526,0.901649,0.906526,0.904072
3,threat,0.997116,0.994241,0.997116,0.995676
4,insult,0.908156,0.907985,0.908156,0.908071
5,identity_hate,0.988277,0.98515,0.988277,0.986704
