<a href="https://colab.research.google.com/github/bryanbayup/phising-detection/blob/main/test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import json
import numpy as np
import tensorflow as tf
from transformers import (
    BertTokenizer,
    TFBertForPreTraining,
    create_optimizer,
    AutoTokenizer,
    AutoModelForMaskedLM
)
from tensorflow.keras.utils import Sequence

In [2]:
class ConversationDataset(Sequence):
    """
    Contoh class Dataset untuk meload data percakapan.
    Anda harus menyesuaikan:
    - Path dataset
    - Cara mengambil utterance, melakukan masking,
      dan menghasilkan label NSP.
    - Mengolah konteks multi-turn.
    """

    def __init__(self,
                 data_path,
                 tokenizer,
                 max_len=128,
                 batch_size=16,
                 mlm_probability=0.15,
                 nsp_ratio=0.5):
        self.data_path = data_path
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.batch_size = batch_size
        self.mlm_probability = mlm_probability
        self.nsp_ratio = nsp_ratio

        # Load data dari JSON
        with open(self.data_path, 'r', encoding='utf-8') as f:
            self.data = json.load(f)

        # Expecting self.data to be a list of conversations
        # Each conversation: { "conversation_id": ..., "turns": [...] }

        self.samples = self.create_samples(self.data)

    def create_samples(self, data):
        # Contoh pembuatan sampel:
        # Kita akan mengambil setiap turn dan memasangkannya dengan turn berikutnya
        # untuk keperluan NSP. Untuk MLM, kita akan mask input_ids.
        samples = []
        for conv in data:
            turns = conv['turns']
            for i in range(len(turns)-1):
                current_utt = turns[i]['utterance']
                next_utt = turns[i+1]['utterance']

                # Label NSP: 1 jika next_utt memang kelanjutan, 0 jika kita ambil kalimat random
                # (contoh sederhana: dengan probabilitas nsp_ratio kita gunakan next_utt benar,
                # sisanya gunakan utterance dari percakapan lain).
                if np.random.rand() < self.nsp_ratio:
                    # next_utt sesuai urutan -> NSP = 1
                    is_next = 1
                else:
                    # next_utt acak dari tempat lain -> NSP = 0
                    random_conv = np.random.choice(data)
                    random_turn = np.random.choice(random_conv['turns'])
                    next_utt = random_turn['utterance']
                    is_next = 0

                # Gabungkan current_utt [SEP] next_utt sesuai format NSP BERT
                encoded = self.tokenizer.encode_plus(
                    current_utt,
                    next_utt,
                    max_length=self.max_len,
                    truncation=True,
                    padding='max_length',
                    return_tensors='np'
                )

                input_ids = encoded['input_ids'][0]
                attention_mask = encoded['attention_mask'][0]
                token_type_ids = encoded['token_type_ids'][0]

                # Buat label MLM
                input_ids_masked, mlm_labels = self.mask_tokens(input_ids)

                # Di sini kita belum memasukkan konteks dialog yang kompleks.
                # Misalnya, kita dapat mengambil beberapa turn sebelumnya untuk dialog_context.
                # Untuk sederhana, gunakan vektor nol sebagai konteks.
                dialog_context = np.zeros((768,), dtype=np.float32)  # Dummy context

                samples.append({
                    'input_ids': input_ids_masked,
                    'attention_mask': attention_mask,
                    'token_type_ids': token_type_ids,
                    'mlm_labels': mlm_labels,
                    'nsp_label': is_next,
                    'dialog_context': dialog_context
                })

        return samples

    def mask_tokens(self, input_ids):
        # Masking MLM secara sederhana:
        # 1. Tentukan indeks mana yang akan di-mask.
        # 2. Ganti dengan [MASK] (id=103), atau random token.
        # Pastikan untuk tidak memask token khusus (CLS, SEP).
        # Ini adalah simplifikasi.
        input_ids = input_ids.copy()
        mlm_labels = np.full_like(input_ids, -100)

        # Cari indeks token yang bisa di-mask (bukan [CLS]=101 dan bukan [SEP]=102, bukan padding=0)
        special_ids = {101, 102, 0}
        candidate_positions = [i for i, token_id in enumerate(input_ids) if token_id not in special_ids]

        num_to_mask = max(1, int(len(candidate_positions)*self.mlm_probability))
        mask_positions = np.random.choice(candidate_positions, num_to_mask, replace=False)

        for pos in mask_positions:
            mlm_labels[pos] = input_ids[pos]
            # 80% ganti dengan [MASK]
            # 10% ganti dengan token random
            # 10% biarkan token asli
            rand = np.random.rand()
            if rand < 0.8:
                input_ids[pos] = 103  # [MASK]
            elif rand < 0.9:
                input_ids[pos] = np.random.randint(999, 30000)  # random token id
            else:
                # dibiarkan sama
                pass

        return input_ids, mlm_labels

    def __len__(self):
        return int(np.ceil(len(self.samples)/self.batch_size))

    def __getitem__(self, idx):
        batch = self.samples[idx*self.batch_size: (idx+1)*self.batch_size]
        max_len = self.max_len

        input_ids = np.array([s['input_ids'] for s in batch], dtype=np.int32)
        attention_mask = np.array([s['attention_mask'] for s in batch], dtype=np.int32)
        token_type_ids = np.array([s['token_type_ids'] for s in batch], dtype=np.int32)
        mlm_labels = np.array([s['mlm_labels'] for s in batch], dtype=np.int32)
        nsp_labels = np.array([s['nsp_label'] for s in batch], dtype=np.int32)
        dialog_context = np.array([s['dialog_context'] for s in batch], dtype=np.float32)

        # Output untuk pretraining BERT adalah: (logits_mlm, logits_nsp)
        # Kita akan menyusun label sesuai kebutuhan:
        # Biasanya TFBertForPreTraining menggunakan label:
        # {'labels': mlm_labels, 'next_sentence_label': nsp_labels}
        return (input_ids, attention_mask, token_type_ids, dialog_context), {'labels': mlm_labels, 'next_sentence_label': nsp_labels}

In [5]:
# Load tokenizer & model backbone
tokenizer = AutoTokenizer.from_pretrained("cahya/bert-base-indonesian-522M")
base_model = AutoModelForMaskedLM.from_pretrained("cahya/bert-base-indonesian-522M")

# Kita perlu membungkus model ini agar dapat memasukkan dialog_context
class CustomDialogModel(tf.keras.Model):
    def __init__(self, base_model):
        super(CustomDialogModel, self).__init__()
        self.bert_pretrain = base_model
        # Dense untuk memproses dialog_context
        self.context_dense = tf.keras.layers.Dense(768, activation='relu')

    def call(self, inputs, training=False):
        input_ids, attention_mask, token_type_ids, dialog_context = inputs

        # Pass to BERT
        outputs = self.bert_pretrain.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            training=training
        )
        sequence_output = outputs.last_hidden_state  # [batch, seq_len, hidden_size]
        pooled_output = outputs.pooler_output        # [batch, hidden_size]

        # Proses dialog context, gabung ke pooled_output (contoh sederhana)
        dialog_ctx_emb = self.context_dense(dialog_context)  # [batch, hidden_size]
        # Gabung dengan pooled_output
        combined_pooled = pooled_output + dialog_ctx_emb  # [batch, hidden_size]

        # Gunakan head MLM & NSP dari model asli:
        prediction_scores, seq_relationship_score = self.bert_pretrain.cls(
            sequence_output,
            combined_pooled,
            training=training
        )

        return prediction_scores, seq_relationship_score

# Define loss dan train_step custom, karena kita punya dua label: MLM & NSP
class CustomTrainer(tf.keras.Model):
    def __init__(self, dialog_model):
        super(CustomTrainer, self).__init__()
        self.dialog_model = dialog_model

        # Loss untuk MLM (SparseCategoricalCrossentropy) dan NSP (SparseCategoricalCrossentropy)
        self.loss_fct_mlm = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True, reduction=tf.keras.losses.Reduction.NONE)
        self.loss_fct_nsp = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True, reduction=tf.keras.losses.Reduction.NONE)

    def call(self, inputs, training=False):
        (input_ids, attention_mask, token_type_ids, dialog_context), labels = inputs
        mlm_labels = labels['labels']
        nsp_labels = labels['next_sentence_label']

        prediction_scores, seq_relationship_score = self.dialog_model(
            (input_ids, attention_mask, token_type_ids, dialog_context),
            training=training
        )

        # Hitung MLM loss: ignore_index = -100
        mlm_active_loss = tf.not_equal(mlm_labels, -100)
        mlm_loss = self.loss_fct_mlm(mlm_labels, prediction_scores)
        mlm_loss = tf.reduce_sum(mlm_loss * tf.cast(mlm_active_loss, dtype=mlm_loss.dtype)) / (tf.reduce_sum(tf.cast(mlm_active_loss, tf.float32)) + 1e-5)

        # Hitung NSP loss
        nsp_loss = self.loss_fct_nsp(nsp_labels, seq_relationship_score)
        nsp_loss = tf.reduce_mean(nsp_loss)

        total_loss = mlm_loss + nsp_loss
        self.add_metric(mlm_loss, name='mlm_loss', aggregation='mean')
        self.add_metric(nsp_loss, name='nsp_loss', aggregation='mean')
        return total_loss

tokenizer_config.json:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/468 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/230k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at cahya/bert-base-indonesian-522M were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClas

In [7]:
data_path = "data2.json"
train_dataset = ConversationDataset(data_path=data_path, tokenizer=tokenizer, max_len=128, batch_size=8)
val_dataset = ConversationDataset(data_path=data_path, tokenizer=tokenizer, max_len=128, batch_size=8)  # Contoh sama, seharusnya beda data

train_steps = len(train_dataset)
val_steps = len(val_dataset)

num_epochs = 2
batch_size = 8
initial_lr = 3e-5
optimizer = tf.keras.optimizers.Adam(learning_rate=initial_lr)

dialog_model = CustomDialogModel(base_model)
trainer_model = CustomTrainer(dialog_model)
trainer_model.compile(optimizer=optimizer)

history = trainer_model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=num_epochs
)

Epoch 1/2


1. The `call()` method of your layer may be crashing. Try to `__call__()` the layer eagerly on some test input first to see if it works. E.g. `x = np.random.random((3, 4)); y = layer(x)`
2. If the `call()` method is correct, then you may need to implement the `def build(self, input_shape)` method on your layer. It should create all variables used by the layer (e.g. by calling `layer.build()` on all its children layers).
Exception encountered: ''too many values to unpack (expected 2)''


ValueError: Exception encountered when calling CustomTrainer.call().

[1mtoo many values to unpack (expected 2)[0m

Arguments received by CustomTrainer.call():
  • inputs=('tf.Tensor(shape=(None, 128), dtype=int32)', 'tf.Tensor(shape=(None, 128), dtype=int32)', 'tf.Tensor(shape=(None, 128), dtype=int32)', 'tf.Tensor(shape=(None, 768), dtype=float32)')
  • training=True