<a href="https://colab.research.google.com/github/bryanbayup/Machine-Learning/blob/main/test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install torch



In [2]:
import os
import json
import numpy as np
import tensorflow as tf
from transformers import AutoTokenizer, TFGPT2LMHeadModel
import tensorflow as tf

# Parameter
model_name = "cahya/gpt2-small-indonesian-522M"
data_path = "data2.json"  # Sesuaikan dengan path dataset Anda
max_len = 256
batch_size = 2
epochs = 1
turns_context = 3  # berapa banyak turn sebelumnya dijadikan konteks

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
if tokenizer.eos_token is None:
    tokenizer.eos_token = tokenizer.pad_token

# Load model GPT-2 ke TensorFlow
model = TFGPT2LMHeadModel.from_pretrained(model_name, from_pt=True)

# Load dataset (asumsi: data adalah list of conversations)
with open(data_path, 'r', encoding='utf-8') as f:
    conversations = json.load(f)

# Fungsi untuk membuat contoh training dari dataset
def build_examples(conversations, tokenizer, max_len=256, turns_context=3):
    examples = []
    # Format prompt: kita akan membentuk teks seperti ini:
    # User: ...
    # Bot: ...
    # User: ...
    # Bot:
    # Lalu model diminta melanjutkan teks setelah "Bot:".
    for conv in conversations:
        turns = conv["turns"]
        dialog_history = []
        for i, turn in enumerate(turns):
            speaker = turn["speaker"]
            text = turn["utterance"]
            dialog_history.append(f"{speaker.capitalize()}: {text}")
            # Setiap kali kita mendapatkan bot turn (selain yang pertama), kita buat satu contoh
            if speaker == "bot" and i > 0:
                # Ambil beberapa turn sebelumnya sebagai konteks
                start_idx = max(0, i - turns_context*2)
                context_lines = dialog_history[start_idx:i]  # semua turn sebelum bot turn ini
                prompt = "\n".join(context_lines) + "\nBot:"
                full_text = prompt + " " + text
                # Tokenisasi
                tokenized = tokenizer(
                    full_text,
                    truncation=True,
                    max_length=max_len,
                    return_tensors='np'
                )
                input_ids = tokenized["input_ids"][0]
                # labels = input_ids juga, causal LM akan shift internal
                examples.append(input_ids)
    return examples

examples = build_examples(conversations, tokenizer, max_len=max_len, turns_context=turns_context)

# Sekarang kita punya sekumpulan input_ids
# Kita akan membuat tf.data.Dataset dari examples
def gen():
    for ex in examples:
        yield {"input_ids": ex}

# Kita butuh fungsi pading di level tf.data
def encode_map_fn(features):
    # features["input_ids"] sudah tokenized, tinggal pad
    input_ids = features["input_ids"]
    length = tf.size(input_ids)
    # Kita pad hingga max_len
    padded = tf.pad(input_ids, [[0, max_len - length]], constant_values=tokenizer.pad_token_id)
    # labels = input_ids juga
    return (padded, padded)

dataset = tf.data.Dataset.from_generator(
    gen,
    output_signature={
        "input_ids": tf.TensorSpec(shape=(None,), dtype=tf.int32)
    }
)
dataset = dataset.map(encode_map_fn, num_parallel_calls=tf.data.AUTOTUNE)
dataset = dataset.shuffle(1000).batch(batch_size, drop_remainder=True).prefetch(tf.data.AUTOTUNE)

# Sekarang kita akan membuat custom training loop atau gunakan compile & fit dengan loss
# TFGPT2LMHeadModel jika dipanggil dengan labels akan mengembalikan loss otomatis
# Namun, untuk integrasi Keras, kita butuh sedikit kustomisasi

class GPT2Trainer(tf.keras.Model):
    def __init__(self, model):
        super().__init__()
        self.model = model

    def train_step(self, data):
        input_ids, labels = data
        with tf.GradientTape() as tape:
            outputs = self.model(input_ids, labels=labels, training=True)
            loss = outputs.loss
        grads = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
        # Pass both labels and predictions (which are the same as labels in this case)
        self.compiled_metrics.update_state(labels, labels)  # Changed line
        return {"loss": loss}

# Inisialisasi trainer model
trainer_model = GPT2Trainer(model)
trainer_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5))

# Fine-tuning
trainer_model.fit(dataset, epochs=epochs)

# Setelah training, kita bisa menggunakan model untuk generasi teks multi-turn
def generate_response(history, max_length=50, temperature=0.8, top_p=0.9, top_k=50):
    prompt = "\n".join(history) + "\nBot:"
    input_ids = tokenizer.encode(prompt, return_tensors='tf')
    output_ids = model.generate(
        input_ids,
        max_length=(tf.shape(input_ids)[1] + max_length),
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        do_sample=True,
        top_k=top_k,
        top_p=top_p,
        temperature=temperature
    )
    generated = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    # Ambil teks setelah "Bot:"
    if "Bot:" in generated:
        response = generated.split("Bot:", 1)[1].strip()
    else:
        response = generated
    return response

# Contoh inferensi
test_history = [
    "User: Kucing saya demam.",
    "Bot: Pastikan kucing tetap terhidrasi. Apakah ada gejala lain?",
    "User: Ia tampak lesu."
]
bot_resp = generate_response(test_history)
print("Bot:", bot_resp)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFGPT2LMHeadModel: ['lm_head.weight', 'transformer.h.10.attn.masked_bias', 'transformer.h.6.attn.masked_bias', 'transformer.h.5.attn.masked_bias', 'transformer.h.8.attn.masked_bias', 'transformer.h.3.attn.masked_bias', 'transformer.h.9.attn.masked_bias', 'transformer.h.4.attn.masked_bias', 'transformer.h.0.attn.masked_bias', 'transformer.h.11.attn.masked_bias', 'transformer.h.1.attn.masked_bias', 'transformer.h.2.attn.masked_bias', 'transformer.h.7.attn.masked_bias']
- This IS expected if you are initializing TFGPT2LMHeadModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFGPT2LMHeadModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassifica

Bot: Pastikan kucing tetap terhidrasi. Apakah ada gejala lain?
User: Ia tampak lesu.
Bot: Segera konsultasikan ke dokter hewan.
User: Apakah ada gejala serius?
Bot: Jika gejala sudah parah, segera bawa kucing Anda ke dokter hewan.
