<a href="https://colab.research.google.com/github/bryanbayup/Machine-Learning/blob/main/Untitled15.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=31291aad39b2dd7138bb96742a1c7f94fe27d0060d5c9baed04b289b062c5f56
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [3]:
import json
import pandas as pd
import numpy as np
import tensorflow as tf
import re
from sklearn.utils import resample
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from seqeval.metrics import classification_report as seq_classification_report
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import random
import ipywidgets as widgets
from IPython.display import display, clear_output

In [4]:
# Memuat dataset dari file JSON
with open('dataaa.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Mengubah dataset menjadi DataFrame
df = pd.DataFrame(data)

In [5]:
# Menentukan jumlah sampel maksimum per intent
max_samples = 50

df_list = []
for intent in df['intent'].unique():
    df_intent = df[df['intent'] == intent]
    if len(df_intent) > max_samples:
        df_intent = resample(df_intent, replace=False, n_samples=max_samples, random_state=42)
    df_list.append(df_intent)

df_balanced = pd.concat(df_list).reset_index(drop=True)

In [6]:
# Encode intents
label_encoder = LabelEncoder()
df_balanced['intent_label'] = label_encoder.fit_transform(df_balanced['intent'])

# Simpan mapping label untuk penggunaan nanti
intent_mapping = dict(zip(df_balanced['intent_label'], df_balanced['intent']))

In [7]:
# Memfilter data yang memiliki entitas
df_ner = df[df['entities'].map(lambda d: len(d)) > 0].reset_index(drop=True)

In [8]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    return text

In [9]:
texts = df_balanced['utterances'].apply(clean_text).tolist()
labels = df_balanced['intent_label'].tolist()

# Split the data
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels
)

In [10]:
# Membuat tokenizer
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(train_texts)
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1

# Mengonversi teks ke sequences
train_sequences = tokenizer.texts_to_sequences(train_texts)
val_sequences = tokenizer.texts_to_sequences(val_texts)

# Padding sequences
max_seq_length = max(max(len(seq) for seq in train_sequences), max(len(seq) for seq in val_sequences))
train_padded = pad_sequences(train_sequences, maxlen=max_seq_length, padding='post')
val_padded = pad_sequences(val_sequences, maxlen=max_seq_length, padding='post')

# Mengonversi labels ke categorical
num_classes = len(label_encoder.classes_)
train_labels_cat = to_categorical(train_labels, num_classes=num_classes)
val_labels_cat = to_categorical(val_labels, num_classes=num_classes)

In [11]:
def prepare_ner_data(df, tokenizer, max_seq_length):
    texts = []
    labels = []
    for index, row in df.iterrows():
        text = clean_text(row['utterances'])
        entities = row['entities']
        tokens = tokenizer.texts_to_sequences([text])[0]
        label_seq = ['O'] * len(tokens)
        for ent in entities:
            ent_text = clean_text(ent['value'])
            ent_tokens = tokenizer.texts_to_sequences([ent_text])[0]
            for i in range(len(tokens)):
                if tokens[i:i+len(ent_tokens)] == ent_tokens:
                    label_seq[i] = 'B-' + ent['entity']
                    for j in range(1, len(ent_tokens)):
                        label_seq[i+j] = 'I-' + ent['entity']
                    break
        texts.append(tokens)
        labels.append(label_seq)
    # Padding
    texts_padded = pad_sequences(texts, maxlen=max_seq_length, padding='post')
    return texts_padded, labels

# Membuat label encoder untuk NER
all_labels = set()
for labels in df_ner['entities']:
    for ent in labels:
        all_labels.add('B-' + ent['entity'])
        all_labels.add('I-' + ent['entity'])
all_labels.add('O')
ner_label_encoder = {label: idx for idx, label in enumerate(sorted(all_labels))}
ner_label_decoder = {idx: label for label, idx in ner_label_encoder.items()}

# Siapkan data NER
train_texts_ner, train_labels_ner = prepare_ner_data(df_ner, tokenizer, max_seq_length)

In [12]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional

model_intent = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128, input_length=max_seq_length),
    Bidirectional(LSTM(64)),
    Dense(64, activation='relu'),
    Dense(num_classes, activation='softmax')
])

model_intent.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])



In [19]:
# Callbacks
callbacks = [
    EarlyStopping(monitor='val_loss', patience=2),
    ModelCheckpoint(
        filepath='best_model_intent.h5',  # Nama file untuk menyimpan seluruh model
        save_best_only=True,
        save_weights_only=False  # Bisa dihapus karena default-nya sudah False
    )
]

history_intent = model_intent.fit(
    train_padded,
    train_labels_cat,
    validation_data=(val_padded, val_labels_cat),
    epochs=10,
    batch_size=16,
    callbacks=callbacks
)

ValueError: The filepath provided must end in `.keras` (Keras model format). Received: filepath=best_model_intent.h5

In [25]:
# Callbacks
callbacks = [
    EarlyStopping(monitor='val_loss', patience=2),
    ModelCheckpoint(
        filepath='best_model_intent.weights.h5',  # Ubah nama file di sini
        save_best_only=True,
        save_weights_only=True
    )
]

history_intent = model_intent.fit(
    train_padded,
    train_labels_cat,
    validation_data=(val_padded, val_labels_cat),
    epochs=10,
    batch_size=16,
    callbacks=callbacks
)

Epoch 1/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 1.0000 - loss: 0.0097 - val_accuracy: 0.9615 - val_loss: 0.0990
Epoch 2/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 1.0000 - loss: 0.0071 - val_accuracy: 0.9487 - val_loss: 0.0981
Epoch 3/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 1.0000 - loss: 0.0056 - val_accuracy: 0.9615 - val_loss: 0.1066
Epoch 4/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 1.0000 - loss: 0.0043 - val_accuracy: 0.9615 - val_loss: 0.1105


In [26]:
def encode_ner_labels(labels, max_seq_length, ner_label_encoder):
    labels_encoded = []
    for label_seq in labels:
        label_ids = [ner_label_encoder[label] for label in label_seq]
        label_ids = label_ids + [ner_label_encoder['O']] * (max_seq_length - len(label_ids))
        labels_encoded.append(label_ids)
    labels_encoded = np.array(labels_encoded)
    labels_encoded = to_categorical(labels_encoded, num_classes=len(ner_label_encoder))
    return labels_encoded

train_labels_ner_encoded = encode_ner_labels(train_labels_ner, max_seq_length, ner_label_encoder)

In [27]:
from tensorflow.keras.layers import TimeDistributed, Bidirectional

model_ner = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128, input_length=max_seq_length),
    Bidirectional(LSTM(64, return_sequences=True)),
    TimeDistributed(Dense(len(ner_label_encoder), activation='softmax'))
])

model_ner.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [28]:
# Callbacks
callbacks_ner = [
    EarlyStopping(monitor='val_loss', patience=2),
    ModelCheckpoint(
        filepath='best_model_ner.weights.h5',
        save_best_only=True,
        save_weights_only=True
    )
]

history_ner = model_ner.fit(
    train_texts_ner,
    train_labels_ner_encoded,
    validation_split=0.1,
    epochs=10,
    batch_size=16,
    callbacks=callbacks_ner
)

Epoch 1/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 71ms/step - accuracy: 0.5653 - loss: 2.1907 - val_accuracy: 0.9120 - val_loss: 0.7124
Epoch 2/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.7411 - loss: 1.0118 - val_accuracy: 0.9120 - val_loss: 0.6026
Epoch 3/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.7386 - loss: 0.8122 - val_accuracy: 0.9120 - val_loss: 0.5604
Epoch 4/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.7260 - loss: 0.7389 - val_accuracy: 0.9120 - val_loss: 0.4325
Epoch 5/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.7561 - loss: 0.6261 - val_accuracy: 0.9120 - val_loss: 0.3436
Epoch 6/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.8188 - loss: 0.5329 - val_accuracy: 0.9120 - val_loss: 0.2563
Epoch 7/10
[1m13/13[0m [32m━━━━

In [29]:
# Load best weights
model_intent.load_weights('best_model_intent.weights.h5')

# Evaluasi
loss, accuracy = model_intent.evaluate(val_padded, val_labels_cat)
print(f'Akurasi Model Klasifikasi Intent: {accuracy * 100:.2f}%')

# Predict on validation data
val_preds = model_intent.predict(val_padded)
val_preds = np.argmax(val_preds, axis=1)
val_true = np.argmax(val_labels_cat, axis=1)

# Classification report
print(classification_report(val_true, val_preds, target_names=label_encoder.classes_))

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.9548 - loss: 0.1035 
Akurasi Model Klasifikasi Intent: 94.87%
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 92ms/step
                        precision    recall  f1-score   support

        cat_healthcare       1.00      1.00      1.00         6
disease_prevention_cat       1.00      1.00      1.00        10
disease_prevention_dog       0.89      1.00      0.94         8
        dog_healthcare       1.00      0.88      0.93         8
  general_conversation       0.90      0.90      0.90        10
   medical_inquiry_cat       1.00      1.00      1.00        10
   medical_inquiry_dog       0.91      1.00      0.95        10
  science_conversation       0.88      0.88      0.88         8
  symptom_analysis_cat       1.00      1.00      1.00         4
  symptom_analysis_dog       1.00      0.75      0.86         4

              accuracy                           0.95        78
           

In [30]:
# Load best weights
model_ner.load_weights('best_model_ner.weights.h5')

# Evaluasi
loss, accuracy = model_ner.evaluate(train_texts_ner, train_labels_ner_encoded)
print(f'Akurasi Model NER: {accuracy * 100:.2f}%')

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.9107 - loss: 0.2630
Akurasi Model NER: 92.45%


In [31]:
# Simpan model dan tokenizer
model_intent.save('model_intent.h5')
model_ner.save('model_ner.h5')
import pickle
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('label_encoder.pickle', 'wb') as handle:
    pickle.dump(label_encoder, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('ner_label_encoder.pickle', 'wb') as handle:
    pickle.dump(ner_label_encoder, handle, protocol=pickle.HIGHEST_PROTOCOL)



In [32]:
# Memuat model dan tokenizer
from tensorflow.keras.models import load_model
model_intent = load_model('model_intent.h5')
model_ner = load_model('model_ner.h5')

import pickle
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)
with open('label_encoder.pickle', 'rb') as handle:
    label_encoder = pickle.load(handle)
with open('ner_label_encoder.pickle', 'rb') as handle:
    ner_label_encoder = pickle.load(handle)
ner_label_decoder = {idx: label for label, idx in ner_label_encoder.items()}



In [33]:
def predict_intent(text):
    text_clean = clean_text(text)
    seq = tokenizer.texts_to_sequences([text_clean])
    padded_seq = pad_sequences(seq, maxlen=max_seq_length, padding='post')
    pred = model_intent.predict(padded_seq)
    predicted_label = np.argmax(pred, axis=1)[0]
    intent = label_encoder.inverse_transform([predicted_label])[0]
    return intent

In [34]:
def predict_entities(text):
    text_clean = clean_text(text)
    seq = tokenizer.texts_to_sequences([text_clean])
    padded_seq = pad_sequences(seq, maxlen=max_seq_length, padding='post')
    pred = model_ner.predict(padded_seq)
    pred_labels = np.argmax(pred, axis=-1)[0]
    tokens = tokenizer.sequences_to_texts(seq)[0].split()
    entities = []
    for idx, label_id in enumerate(pred_labels[:len(tokens)]):
        label = ner_label_decoder[label_id]
        if label != 'O':
            entities.append({'entity': label.split('-')[1], 'value': tokens[idx]})
    return entities

In [35]:
# Membuat DataFrame utterances dan responses
df_utterances = df_balanced[['utterances', 'responses']].reset_index(drop=True)
df_utterances['utterances_clean'] = df_utterances['utterances'].apply(clean_text)

# Menghitung TF-IDF matrix
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df_utterances['utterances_clean'])

In [36]:
def get_response(user_input):
    # Clean user input
    user_input_clean = clean_text(user_input)
    # Transform input pengguna
    user_tfidf = vectorizer.transform([user_input_clean])

    # Hitung cosine similarity
    similarities = cosine_similarity(user_tfidf, tfidf_matrix)

    # Dapatkan indeks dengan similarity tertinggi
    most_similar_idx = np.argmax(similarities[0])

    # Jika similarity rendah, berikan respon default
    if similarities[0][most_similar_idx] < 0.2:
        return "Maaf, saya tidak memahami pertanyaan Anda."

    # Ambil respon yang sesuai
    response = df_utterances.iloc[most_similar_idx]['responses']

    return response

In [37]:
def chatbot_response(user_input):
    # Prediksi intent dan entitas
    intent = predict_intent(user_input)
    entities = predict_entities(user_input)
    response = get_response(user_input)
    # Modifikasi respon berdasarkan entitas jika diperlukan
    return response

In [38]:
# Membuat widget input dan output
input_box = widgets.Text(
    value='',
    placeholder='Ketik pesan Anda...',
    description='Anda:',
    disabled=False
)

output_area = widgets.Output()

def on_submit(sender):
    user_input = input_box.value
    input_box.value = ''
    response = chatbot_response(user_input)
    with output_area:
        print(f"Anda: {user_input}")
        print(f"Chatbot: {response}\n")

input_box.on_submit(on_submit)

display(input_box, output_area)

Text(value='', description='Anda:', placeholder='Ketik pesan Anda...')

Output()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 188ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 301ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 

In [40]:
# Contoh kalimat
test_sentence = "kucing saya sering muntah dan tidak mau makan."

# Prediksi intent
predicted_intent = predict_intent(test_sentence)
print(f"Intent yang diprediksi: {predicted_intent}")

# Prediksi entitas
predicted_entities = predict_entities(test_sentence)
print("Entitas yang ditemukan:")
for entity in predicted_entities:
    print(f"- {entity['entity']}: {entity['value']}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
Intent yang diprediksi: symptom_analysis_cat
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
Entitas yang ditemukan:
- animal: kucing
- symptom: sering
- symptom: muntah
- symptom: tidak
- symptom: mau
- symptom: makan
