<a href="https://colab.research.google.com/github/bryanbayup/phising-detection/blob/main/test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import os
import json
import numpy as np
import tensorflow as tf
from transformers import (
    BertTokenizer,
    TFAutoModelForMaskedLM,
    create_optimizer,
    AutoTokenizer
)
from tensorflow.keras.utils import Sequence

In [11]:
class ConversationDataset(Sequence):
    def __init__(self,
                 data_path,
                 tokenizer,
                 max_len=128,
                 batch_size=16,
                 mlm_probability=0.15,
                 nsp_ratio=0.5):
        self.data_path = data_path
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.batch_size = batch_size
        self.mlm_probability = mlm_probability
        self.nsp_ratio = nsp_ratio

        with open(self.data_path, 'r', encoding='utf-8') as f:
            self.data = json.load(f)

        self.samples = self.create_samples(self.data)

    def create_samples(self, data):
        samples = []
        for conv in data:
            turns = conv['turns']
            for i in range(len(turns)-1):
                current_utt = turns[i]['utterance']
                next_utt = turns[i+1]['utterance']

                if np.random.rand() < self.nsp_ratio:
                    is_next = 1
                else:
                    random_conv = np.random.choice(data)
                    random_turn = np.random.choice(random_conv['turns'])
                    next_utt = random_turn['utterance']
                    is_next = 0

                encoded = self.tokenizer.encode_plus(
                    current_utt,
                    next_utt,
                    max_length=self.max_len,
                    truncation=True,
                    padding='max_length',
                    return_tensors='np'
                )

                input_ids = encoded['input_ids'][0]
                attention_mask = encoded['attention_mask'][0]
                token_type_ids = encoded['token_type_ids'][0]

                input_ids_masked, mlm_labels = self.mask_tokens(input_ids)
                dialog_context = np.zeros((768,), dtype=np.float32)

                samples.append({
                    'input_ids': input_ids_masked,
                    'attention_mask': attention_mask,
                    'token_type_ids': token_type_ids,
                    'mlm_labels': mlm_labels,
                    'nsp_label': is_next,
                    'dialog_context': dialog_context
                })
        return samples

    def mask_tokens(self, input_ids):
        input_ids = input_ids.copy()
        mlm_labels = np.full_like(input_ids, -100)
        special_ids = {101, 102, 0}
        candidate_positions = [i for i, token_id in enumerate(input_ids) if token_id not in special_ids]

        num_to_mask = max(1, int(len(candidate_positions)*self.mlm_probability))
        mask_positions = np.random.choice(candidate_positions, num_to_mask, replace=False)

        for pos in mask_positions:
            mlm_labels[pos] = input_ids[pos]
            rand = np.random.rand()
            if rand < 0.8:
                input_ids[pos] = 103
            elif rand < 0.9:
                input_ids[pos] = np.random.randint(999, 30000)
            else:
                pass
        return input_ids, mlm_labels

    def __len__(self):
        return int(np.ceil(len(self.samples)/self.batch_size))

    def __getitem__(self, idx):
        batch = self.samples[idx*self.batch_size: (idx+1)*self.batch_size]
        input_ids = np.array([s['input_ids'] for s in batch], dtype=np.int32)
        attention_mask = np.array([s['attention_mask'] for s in batch], dtype=np.int32)
        token_type_ids = np.array([s['token_type_ids'] for s in batch], dtype=np.int32)
        mlm_labels = np.array([s['mlm_labels'] for s in batch], dtype=np.int32)
        nsp_labels = np.array([s['nsp_label'] for s in batch], dtype=np.int32)
        dialog_context = np.array([s['dialog_context'] for s in batch], dtype=np.float32)

        return (input_ids, attention_mask, token_type_ids, dialog_context), {'labels': mlm_labels, 'next_sentence_label': nsp_labels}

In [12]:
# Load tokenizer & model
tokenizer = AutoTokenizer.from_pretrained("cahya/bert-base-indonesian-522M")
# Gunakan model TensorFlow
base_model = TFAutoModelForMaskedLM.from_pretrained("cahya/bert-base-indonesian-522M", from_pt=True)

class CustomDialogModel(tf.keras.Model):
    def __init__(self, base_model):
        super(CustomDialogModel, self).__init__()
        self.bert_pretrain = base_model
        self.context_dense = tf.keras.layers.Dense(768, activation='relu')

    def call(self, inputs, training=False):
        input_ids, attention_mask, token_type_ids, dialog_context = inputs
        # TFAutoModelForMaskedLM memiliki model TF di dalamnya
        outputs = self.bert_pretrain.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )

        sequence_output = outputs.last_hidden_state
        pooled_output = outputs.pooler_output

        dialog_ctx_emb = self.context_dense(dialog_context)
        combined_pooled = pooled_output + dialog_ctx_emb

        prediction_scores, seq_relationship_score = self.bert_pretrain.cls(
            sequence_output,
            combined_pooled
        )

        return prediction_scores, seq_relationship_score

class CustomTrainer(tf.keras.Model):
    def __init__(self, dialog_model):
        super(CustomTrainer, self).__init__()
        self.dialog_model = dialog_model
        self.loss_fct_mlm = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True, reduction=tf.keras.losses.Reduction.NONE)
        self.loss_fct_nsp = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True, reduction=tf.keras.losses.Reduction.NONE)

    def call(self, inputs, training=False):
        (input_ids, attention_mask, token_type_ids, dialog_context) = inputs
        prediction_scores, seq_relationship_score = self.dialog_model(
            (input_ids, attention_mask, token_type_ids, dialog_context),
            training=training
        )
        return prediction_scores, seq_relationship_score

    def train_step(self, data):
        (input_ids, attention_mask, token_type_ids, dialog_context), labels = data
        mlm_labels = labels['labels']
        nsp_labels = labels['next_sentence_label']

        with tf.GradientTape() as tape:
            prediction_scores, seq_relationship_score = self(
                (input_ids, attention_mask, token_type_ids, dialog_context),
                training=True
            )

            mlm_active_loss = tf.not_equal(mlm_labels, -100)
            mlm_loss = self.loss_fct_mlm(mlm_labels, prediction_scores)
            mlm_loss = (tf.reduce_sum(mlm_loss * tf.cast(mlm_active_loss, dtype=mlm_loss.dtype)) /
                         (tf.reduce_sum(tf.cast(mlm_active_loss, tf.float32)) + 1e-5))

            nsp_loss = self.loss_fct_nsp(nsp_labels, seq_relationship_score)
            nsp_loss = tf.reduce_mean(nsp_loss)

            total_loss = mlm_loss + nsp_loss

        gradients = tape.gradient(total_loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
        self.compiled_metrics.update_state(total_loss)
        return {"loss": total_loss, "mlm_loss": mlm_loss, "nsp_loss": nsp_loss}

    def test_step(self, data):
        (input_ids, attention_mask, token_type_ids, dialog_context), labels = data
        mlm_labels = labels['labels']
        nsp_labels = labels['next_sentence_label']

        prediction_scores, seq_relationship_score = self(
            (input_ids, attention_mask, token_type_ids, dialog_context),
            training=False
        )

        mlm_active_loss = tf.not_equal(mlm_labels, -100)
        mlm_loss = self.loss_fct_mlm(mlm_labels, prediction_scores)
        mlm_loss = (tf.reduce_sum(mlm_loss * tf.cast(mlm_active_loss, dtype=mlm_loss.dtype)) /
                     (tf.reduce_sum(tf.cast(mlm_active_loss, tf.float32)) + 1e-5))

        nsp_loss = self.loss_fct_nsp(nsp_labels, seq_relationship_score)
        nsp_loss = tf.reduce_mean(nsp_loss)

        total_loss = mlm_loss + nsp_loss
        return {"loss": total_loss, "mlm_loss": mlm_loss, "nsp_loss": nsp_loss}

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForMaskedLM: ['bert.embeddings.position_ids', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing TFBertForMaskedLM from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForMaskedLM from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertForMaskedLM were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


In [13]:
data_path = "data2.json"
train_dataset = ConversationDataset(data_path=data_path, tokenizer=tokenizer, max_len=128, batch_size=8)
val_dataset = ConversationDataset(data_path=data_path, tokenizer=tokenizer, max_len=128, batch_size=8)

num_epochs = 2
initial_lr = 3e-5
optimizer = tf.keras.optimizers.Adam(learning_rate=initial_lr)

dialog_model = CustomDialogModel(base_model)
trainer_model = CustomTrainer(dialog_model)
trainer_model.compile(optimizer=optimizer)

history = trainer_model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=num_epochs
)

Epoch 1/2


1. The `call()` method of your layer may be crashing. Try to `__call__()` the layer eagerly on some test input first to see if it works. E.g. `x = np.random.random((3, 4)); y = layer(x)`
2. If the `call()` method is correct, then you may need to implement the `def build(self, input_shape)` method on your layer. It should create all variables used by the layer (e.g. by calling `layer.build()` on all its children layers).
Exception encountered: ''None values not supported.''
1. The `call()` method of your layer may be crashing. Try to `__call__()` the layer eagerly on some test input first to see if it works. E.g. `x = np.random.random((3, 4)); y = layer(x)`
2. If the `call()` method is correct, then you may need to implement the `def build(self, input_shape)` method on your layer. It should create all variables used by the layer (e.g. by calling `layer.build()` on all its children layers).
Exception encountered: ''Exception encountered when calling CustomDialogModel.call().

[1mNone va

ValueError: Exception encountered when calling CustomDialogModel.call().

[1mNone values not supported.[0m

Arguments received by CustomDialogModel.call():
  • inputs=('tf.Tensor(shape=(None, 128), dtype=int32)', 'tf.Tensor(shape=(None, 128), dtype=int32)', 'tf.Tensor(shape=(None, 128), dtype=int32)', 'tf.Tensor(shape=(None, 768), dtype=float32)')
  • training=True