<a href="https://colab.research.google.com/github/bryanbayup/Machine-Learning/blob/main/Untitled15.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install tensorflow==2.13.0
!pip install sentencepiece



In [2]:
import tensorflow as tf
import numpy as np
import json
import re
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [3]:
# Memuat dataset dari file JSON
with open('dataaa.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [4]:
utterances = []
intents = []
entities_list = []

for item in data:
    utterances.append(item['utterances'])
    intents.append(item['intent'])
    entities_list.append(item['entities'])

In [5]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    return text

cleaned_utterances = [clean_text(utt) for utt in utterances]

In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(cleaned_utterances)
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1

In [7]:
sequences = tokenizer.texts_to_sequences(cleaned_utterances)
max_seq_length = max([len(seq) for seq in sequences])
padded_sequences = pad_sequences(sequences, maxlen=max_seq_length, padding='post')

In [8]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
intents_encoded = label_encoder.fit_transform(intents)
num_intents = len(label_encoder.classes_)
intents_one_hot = to_categorical(intents_encoded, num_classes=num_intents)

In [9]:
# Membuat daftar semua entitas unik
unique_entities = set()
for entities in entities_list:
    for ent in entities:
        unique_entities.add(ent['entity'])

entity_to_id = {entity: idx + 1 for idx, entity in enumerate(unique_entities)}  # Mulai dari 1
entity_to_id['O'] = 0  # Label untuk 'Outside'
num_entities = len(entity_to_id)

# Membuat label NER
ner_labels = []

for i, entities in enumerate(entities_list):
    seq_len = len(padded_sequences[i])
    labels = ['O'] * seq_len
    for ent in entities:
        start = ent['start']
        end = ent['end']
        value = ent['value']
        entity = ent['entity']
        # Temukan indeks token yang sesuai
        token_seq = sequences[i]
        tokens = tokenizer.texts_to_sequences([clean_text(value)])[0]
        for idx in range(len(token_seq)):
            if token_seq[idx:idx+len(tokens)] == tokens:
                for j in range(len(tokens)):
                    labels[idx + j] = entity
                break
    # Konversi label ke ID
    labels_id = [entity_to_id[label] for label in labels]
    ner_labels.append(labels_id)

ner_labels = pad_sequences(ner_labels, maxlen=max_seq_length, padding='post')
ner_labels = [to_categorical(label_seq, num_classes=num_entities) for label_seq in ner_labels]
ner_labels = np.array(ner_labels)

In [10]:
class PetPointEmbedding(tf.keras.layers.Layer):
    def __init__(self, vocab_size, embedding_size, hidden_size, max_seq_length):
        super(PetPointEmbedding, self).__init__()
        self.word_embeddings = tf.keras.layers.Embedding(vocab_size, embedding_size, name='word_embeddings')
        self.position_embeddings = tf.keras.layers.Embedding(max_seq_length, embedding_size, name='position_embeddings')
        self.embedding_hidden_mapping_in = tf.keras.layers.Dense(hidden_size, name='embedding_hidden_mapping_in')
        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name='LayerNorm')
        self.dropout = tf.keras.layers.Dropout(0.1)

    def call(self, input_ids, training=False):
        seq_length = tf.shape(input_ids)[1]
        position_ids = tf.range(seq_length)[tf.newaxis, :]

        word_embeddings = self.word_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)

        embeddings = word_embeddings + position_embeddings
        embeddings = self.embedding_hidden_mapping_in(embeddings)
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings, training=training)
        return embeddings

In [11]:
class PetPointTransformerBlock(tf.keras.layers.Layer):
    def __init__(self, hidden_size, num_attention_heads, intermediate_size):
        super(PetPointTransformerBlock, self).__init__()
        self.attention = tf.keras.layers.MultiHeadAttention(num_heads=num_attention_heads, key_dim=hidden_size // num_attention_heads, name='self_attention')
        self.attention_dropout = tf.keras.layers.Dropout(0.1)
        self.attention_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12)

        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(intermediate_size, activation='gelu'),
            tf.keras.layers.Dense(hidden_size)
        ])
        self.ffn_dropout = tf.keras.layers.Dropout(0.1)
        self.ffn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12)

    def call(self, inputs, training=False):
        attention_output = self.attention(inputs, inputs)
        attention_output = self.attention_dropout(attention_output, training=training)
        attention_output = self.attention_layer_norm(inputs + attention_output)

        ffn_output = self.ffn(attention_output)
        ffn_output = self.ffn_dropout(ffn_output, training=training)
        ffn_output = self.ffn_layer_norm(attention_output + ffn_output)

        return ffn_output

In [12]:
class PetPointModel(tf.keras.Model):
    def __init__(self, vocab_size, embedding_size, hidden_size, num_hidden_layers, num_attention_heads, intermediate_size, max_seq_length):
        super(PetPointModel, self).__init__()
        self.embedding = PetPointEmbedding(vocab_size, embedding_size, hidden_size, max_seq_length)
        self.encoder = PetPointTransformerBlock(hidden_size, num_attention_heads, intermediate_size)
        self.num_hidden_layers = num_hidden_layers

    def call(self, input_ids, training=False):
        hidden_states = self.embedding(input_ids, training=training)
        for _ in range(self.num_hidden_layers):
            hidden_states = self.encoder(hidden_states, training=training)
        return hidden_states

In [13]:
class PetPointForIntentClassification(tf.keras.Model):
    def __init__(self, petpoint_model, num_labels):
        super(PetPointForIntentClassification, self).__init__()
        self.petpoint = petpoint_model
        self.dropout = tf.keras.layers.Dropout(0.1)
        self.classifier = tf.keras.layers.Dense(num_labels, name='classifier')

    def call(self, input_ids, training=False):
        outputs = self.petpoint(input_ids, training=training)
        pooled_output = tf.reduce_mean(outputs, axis=1)
        pooled_output = self.dropout(pooled_output, training=training)
        logits = self.classifier(pooled_output)
        return logits

In [14]:
class PetPointForNER(tf.keras.Model):
    def __init__(self, petpoint_model, num_labels):
        super(PetPointForNER, self).__init__()
        self.petpoint = petpoint_model
        self.dropout = tf.keras.layers.Dropout(0.1)
        self.classifier = tf.keras.layers.Dense(num_labels, name='classifier')

    def call(self, input_ids, training=False):
        outputs = self.petpoint(input_ids, training=training)
        sequence_output = self.dropout(outputs, training=training)
        logits = self.classifier(sequence_output)
        return logits

In [15]:
embedding_size = 128
hidden_size = 768
num_hidden_layers = 12
num_attention_heads = 12
intermediate_size = 3072

petpoint_model = PetPointModel(
    vocab_size=vocab_size,
    embedding_size=embedding_size,
    hidden_size=hidden_size,
    num_hidden_layers=num_hidden_layers,
    num_attention_heads=num_attention_heads,
    intermediate_size=intermediate_size,
    max_seq_length=max_seq_length
)

In [16]:
model_intent = PetPointForIntentClassification(petpoint_model, num_labels=num_intents)

In [17]:
model_intent.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

In [18]:
history_intent = model_intent.fit(
    padded_sequences,
    intents_one_hot,
    validation_split=0.1,
    epochs=5,
    batch_size=16
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [19]:
model_ner = PetPointForNER(petpoint_model, num_labels=num_entities)

In [20]:
model_ner.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

In [22]:
history_ner = model_ner.fit(
    padded_sequences,
    ner_labels,
    validation_split=0.1,
    epochs=5,
    batch_size=16
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [23]:
loss, accuracy = model_intent.evaluate(padded_sequences, intents_one_hot)
print(f'Akurasi Model Klasifikasi Intent: {accuracy * 100:.2f}%')

Akurasi Model Klasifikasi Intent: 69.83%


In [24]:
loss, accuracy = model_ner.evaluate(padded_sequences, ner_labels)
print(f'Akurasi Model NER: {accuracy * 100:.2f}%')

Akurasi Model NER: 97.09%
