<a href="https://colab.research.google.com/github/bryanbayup/Machine-Learning/blob/main/Untitled7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install Sastrawi
!pip install gensim

Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1


In [3]:
# Import Libraries
import json
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dense, Input, Dropout, Bidirectional, LSTM, Conv1D, GlobalMaxPooling1D, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from gensim.models import KeyedVectors
import re
import os
import pickle
from sklearn.model_selection import train_test_split

In [5]:
# # Download FastText
!wget -O id.tar.gz "https://www.dropbox.com/scl/fi/sju4o3keikox69euw51vy/id.tar.gz?rlkey=5jr3ijtbdwfahq7xcgig28qvy&e=1&st=gntzkzeo&dl=1"
!tar -xzf id.tar.gz

--2024-12-02 14:11:31--  https://www.dropbox.com/scl/fi/sju4o3keikox69euw51vy/id.tar.gz?rlkey=5jr3ijtbdwfahq7xcgig28qvy&e=1&st=gntzkzeo&dl=1
Resolving www.dropbox.com (www.dropbox.com)... 162.125.5.18, 2620:100:601d:18::a27d:512
Connecting to www.dropbox.com (www.dropbox.com)|162.125.5.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://uc3e9de2d8c78ba56429d0ccf7bb.dl.dropboxusercontent.com/cd/0/inline/Cfc-n03DTvA-tvGxdfuaBtx1FYn7WID4H4MbpqWMF0X_a0c-hvnwvNmnN7VewMOqTLzBji9jKKCwZvNfEbrUl6SaHhZxrznLPxuI0rCu8WfSW4Dx0D-V7JtJKQGcPkdU16w/file?dl=1# [following]
--2024-12-02 14:11:32--  https://uc3e9de2d8c78ba56429d0ccf7bb.dl.dropboxusercontent.com/cd/0/inline/Cfc-n03DTvA-tvGxdfuaBtx1FYn7WID4H4MbpqWMF0X_a0c-hvnwvNmnN7VewMOqTLzBji9jKKCwZvNfEbrUl6SaHhZxrznLPxuI0rCu8WfSW4Dx0D-V7JtJKQGcPkdU16w/file?dl=1
Resolving uc3e9de2d8c78ba56429d0ccf7bb.dl.dropboxusercontent.com (uc3e9de2d8c78ba56429d0ccf7bb.dl.dropboxusercontent.com)... 162.125.5.15, 2620:100:601d:15::a2

In [7]:
# 1. Load FastText Embedding
fasttext_model = KeyedVectors.load_word2vec_format('id.vec', binary=False)

# 2. Load Dataset
with open('data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# 3. Preprocessing
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    return text

factory = StemmerFactory()
stemmer = factory.create_stemmer()

def preprocess_text(text):
    text = clean_text(text)
    tokens = text.split()
    tokens = [stemmer.stem(token) for token in tokens]
    return ' '.join(tokens)

utterances = []
intents = []
entities = []

for conversation in data:
    for turn in conversation:
        if turn['speaker'] == 'user':
            utterances.append(preprocess_text(turn['utterance']))
            intents.append(turn.get('intent', 'unknown'))
            entities.append(turn.get('entities', []))

# 4. Tokenization and Padding
tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(utterances)
vocab_size = len(tokenizer.word_index) + 1

sequences = tokenizer.texts_to_sequences(utterances)
max_seq_length = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_seq_length, padding='post')

# 5. Prepare Intent Labels
unique_intents = sorted(set(intents))
intent_to_idx = {intent: idx for idx, intent in enumerate(unique_intents)}
intent_labels = [intent_to_idx[intent] for intent in intents]
intent_labels_cat = to_categorical(intent_labels, num_classes=len(unique_intents))

# 6. Create Embedding Matrix
embedding_dim = fasttext_model.vector_size
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, idx in tokenizer.word_index.items():
    if word in fasttext_model:
        embedding_matrix[idx] = fasttext_model[word]

# 7. Split Data for Intent Classification
X_train, X_val, y_train, y_val = train_test_split(
    padded_sequences, intent_labels_cat, test_size=0.2, random_state=42
)

In [8]:
# 8. Build Intent Classification Model
def build_intent_model():
    inputs = Input(shape=(max_seq_length,))
    embedding = tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        input_length=max_seq_length,
        trainable=False
    )(inputs)
    conv = Conv1D(filters=128, kernel_size=3, activation='relu')(embedding)
    global_pool = GlobalMaxPooling1D()(conv)
    dense = Dense(64, activation='relu')(global_pool)
    dropout = Dropout(0.5)(dense)
    outputs = Dense(len(unique_intents), activation='softmax')(dropout)
    model = Model(inputs=inputs, outputs=outputs)
    return model

model_intent = build_intent_model()
model_intent.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])



In [9]:
# 9. Train Intent Classification Model
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model_intent.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=20, batch_size=16,
    callbacks=[early_stopping]
)

Epoch 1/20


  return self.fn(y_true, y_pred, **self._fn_kwargs)


[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 30ms/step - accuracy: 1.0000 - loss: 0.0000e+00 - val_accuracy: 1.0000 - val_loss: 0.0000e+00
Epoch 2/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 1.0000 - loss: 0.0000e+00 - val_accuracy: 1.0000 - val_loss: 0.0000e+00
Epoch 3/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 1.0000 - loss: 0.0000e+00 - val_accuracy: 1.0000 - val_loss: 0.0000e+00
Epoch 4/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 1.0000 - loss: 0.0000e+00 - val_accuracy: 1.0000 - val_loss: 0.0000e+00


<keras.src.callbacks.history.History at 0x7ca02742fee0>

In [11]:
# 10. Prepare Data for NER
def prepare_ner_data(entities, tokenizer, max_seq_length):
    texts = []
    labels = []
    all_labels = set()

    # Buat label encoder untuk semua label NER
    for entity_list in entities:
        for entity in entity_list:
            all_labels.add(f"B-{entity['entity']}")
            all_labels.add(f"I-{entity['entity']}")
    all_labels.add("O")
    ner_label_encoder = {label: idx for idx, label in enumerate(sorted(all_labels))}
    ner_label_decoder = {idx: label for label, idx in ner_label_encoder.items()}

    # Iterasi setiap utterance
    for i, utterance in enumerate(utterances):
        tokens = tokenizer.texts_to_sequences([utterance])[0]
        tokenized_text = tokenizer.sequences_to_texts([tokens])[0].split()  # Token berdasarkan tokenizer
        label_seq = ["O"] * len(tokenized_text)

        for entity in entities[i]:
            entity_text = preprocess_text(entity['value'])
            entity_tokens = tokenizer.texts_to_sequences([entity_text])[0]
            entity_len = len(entity_tokens)

            # Cari kecocokan entitas dalam teks tokenized
            for idx in range(len(tokenized_text) - entity_len + 1):
                if tokens[idx:idx + entity_len] == entity_tokens:
                    label_seq[idx] = f"B-{entity['entity']}"
                    for j in range(1, entity_len):
                        label_seq[idx + j] = f"I-{entity['entity']}"
                    break

        # Tambahkan ke dataset
        texts.append(tokens)
        labels.append([ner_label_encoder[label] for label in label_seq])

    # Padding sequences
    texts_padded = pad_sequences(texts, maxlen=max_seq_length, padding='post')
    labels_padded = pad_sequences(labels, maxlen=max_seq_length, padding='post')
    labels_cat = to_categorical(labels_padded, num_classes=len(ner_label_encoder))

    return texts_padded, labels_cat, ner_label_encoder, ner_label_decoder

X_ner, y_ner, ner_label_encoder, ner_label_decoder = prepare_ner_data(entities, tokenizer, max_seq_length)

X_train_ner, X_val_ner, y_train_ner, y_val_ner = train_test_split(
    X_ner, y_ner, test_size=0.2, random_state=42
)

In [13]:
# 11. Build NER Model
def build_ner_model():
    inputs = Input(shape=(max_seq_length,))
    embedding = tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        input_length=max_seq_length,
        trainable=False
    )(inputs)
    lstm = Bidirectional(LSTM(64, return_sequences=True))(embedding)
    dropout = Dropout(0.5)(lstm)
    outputs = TimeDistributed(Dense(len(ner_label_encoder), activation='softmax'))(dropout)
    model = Model(inputs=inputs, outputs=outputs)
    return model

model_ner = build_ner_model()
model_ner.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# 12. Train NER Model
model_ner.fit(
    X_train_ner, y_train_ner,
    validation_data=(X_val_ner, y_val_ner),
    epochs=20, batch_size=16,
    callbacks=[early_stopping]
)

Epoch 1/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 83ms/step - accuracy: 0.4226 - loss: 2.1344 - val_accuracy: 0.8494 - val_loss: 1.1819
Epoch 2/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - accuracy: 0.8089 - loss: 1.1045 - val_accuracy: 0.8222 - val_loss: 0.7242
Epoch 3/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step - accuracy: 0.8093 - loss: 0.7690 - val_accuracy: 0.8691 - val_loss: 0.5488


<keras.src.callbacks.history.History at 0x7ca01f4d10c0>