In [1]:
# Import library yang diperlukan
import numpy as np
import pandas as pd
import json
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load data
with open('Data_New.json', encoding='utf-8') as content:
    data = json.load(content)

# Persiapkan data
patterns = []
tags = []
responses = []

for intent in data['intents']:
    for pattern in intent['patterns']:
        patterns.append(pattern.lower())  # Lowercase untuk konsistensi
        tags.append(intent['tag'])
        responses.append(intent['responses'])

df = pd.DataFrame({
    'patterns': patterns,
    'tag': tags,
    'responses': responses
})

# Tampilkan beberapa data untuk memastikan struktur yang benar
print("Sample data:")
print(df.head())
print("\nTotal patterns:", len(patterns))
print("Unique tags:", len(set(tags)))

Sample data:
  patterns       tag                                          responses
0    hallo  greeting  [Hai! Unibot di sini. Ada yang bisa saya bantu...
1      hai  greeting  [Hai! Unibot di sini. Ada yang bisa saya bantu...
2     halo  greeting  [Hai! Unibot di sini. Ada yang bisa saya bantu...
3      hei  greeting  [Hai! Unibot di sini. Ada yang bisa saya bantu...
4       hi  greeting  [Hai! Unibot di sini. Ada yang bisa saya bantu...

Total patterns: 2922
Unique tags: 750


In [3]:
# Tokenisasi patterns
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(df['patterns'])
total_words = len(tokenizer.word_index) + 1

In [4]:
# Convert patterns ke sequences
X = tokenizer.texts_to_sequences(df['patterns'])
max_seq_len = max([len(x) for x in X])
X_pad = pad_sequences(X, maxlen=max_seq_len, padding='post')

In [5]:
# Encode tags
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['tag'])

print("\nVocabulary size:", total_words)
print("Max sequence length:", max_seq_len)
print("Number of classes:", len(set(y)))


Vocabulary size: 318
Max sequence length: 12
Number of classes: 750


In [6]:
# Buat model
model = Sequential([
    Embedding(total_words, 128, input_length=max_seq_len),
    LSTM(64, return_sequences=True),
    Dropout(0.2),
    LSTM(32),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(len(set(y)), activation='softmax')
])

model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)



In [7]:
# Training model
history = model.fit(
    X_pad, y,
    epochs=450,
    batch_size=64,
    validation_split=0.2,
    verbose=1
)

Epoch 1/450
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 42ms/step - accuracy: 0.0030 - loss: 6.6200 - val_accuracy: 0.0000e+00 - val_loss: 6.6548
Epoch 2/450
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.0054 - loss: 6.5978 - val_accuracy: 0.0000e+00 - val_loss: 7.3094
Epoch 3/450
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.0107 - loss: 6.5008 - val_accuracy: 0.0000e+00 - val_loss: 7.8653
Epoch 4/450
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.0072 - loss: 6.4495 - val_accuracy: 0.0000e+00 - val_loss: 8.6157
Epoch 5/450
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.0107 - loss: 6.3489 - val_accuracy: 0.0000e+00 - val_loss: 9.4822
Epoch 6/450
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.0061 - loss: 6.1446 - val_accuracy: 0.0000e+00 - val_loss: 10.0432
Epo

In [24]:
model.summary()

In [25]:
# Fungsi untuk generate response
def generate_response(user_input):
    # Preprocess input
    user_input = user_input.lower()
    input_sequence = tokenizer.texts_to_sequences([user_input])
    input_padded = pad_sequences(input_sequence, maxlen=max_seq_len, padding='post')

    # Predict tag
    prediction = model.predict(input_padded, verbose=0)
    predicted_tag_index = np.argmax(prediction, axis=-1)[0]
    predicted_tag = label_encoder.inverse_transform([predicted_tag_index])[0]

    # Get confidence score
    confidence = np.max(prediction)

    # Get response if confidence is above threshold
    if confidence > 0.9:  # You can adjust this threshold
        matching_responses = df[df['tag'] == predicted_tag]['responses'].values
        if len(matching_responses) > 0:
            responses = matching_responses[0]
            return np.random.choice(responses), confidence

    return "Maaf, saya tidak yakin dengan jawaban untuk pertanyaan ini.", confidence

In [26]:
# Test the model
test_inputs = [
    "hi",
    "Kampus yang berada di Jakarta Barat?",
    "Unibot, yang buat kamu siapa sih?",
    "Berapa biaya kuliah di Universitas Muhammadiyah Jakarta?",
    "bye"
]

print("\nTest Results:")
for input_text in test_inputs:
    response, confidence = generate_response(input_text)
    print(f"\nInput: {input_text}")
    print(f"Response: {response}")
    print(f"Confidence: {confidence:.2%}")


Test Results:

Input: hi
Response: Hai! Selamat datang di layanan Unibot. Ada info kampus yang ingin kamu cari?
Confidence: 100.00%

Input: Kampus yang berada di Jakarta Barat?
Response: Berikut adalah daftar universitas di Jakarta Barat:
1. Universitas Kristen Krida Wacana
2. Universitas Satyagama
3. Universitas Bina Nusantara
4. Universitas Dian Nusantara
5. Universitas Media Nusantara Citra
6. Universitas Esa Unggul
7. Universitas Agung Podomoro
8. Universitas Trisakti
9. Universitas Tarumanagara
10. Universitas Mercu Buana
11. Universitas Timbul Nusantara
Confidence: 100.00%

Input: Unibot, yang buat kamu siapa sih?
Response: Penciptaku adalah mahasiswa dari Universitas Tarumanagara. Berkat ide-ide nya, saya bisa hadir dan membantu kamu saat ini!
Confidence: 100.00%

Input: Berapa biaya kuliah di Universitas Muhammadiyah Jakarta?
Response: Biaya per semester di Universitas Muhammadiyah Jakarta (UMJ) berkisar antara Rp275.000 hingga Rp21.600.000, tergantung program studi yang dipil

In [27]:
# simpan model dan komponen yang diperlukan
import pickle
import json
from tensorflow.keras.models import load_model

# Simpan model
model.save('chatbot_model.h5')

# Simpan tokenizer dan label encoder
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('label_encoder.pickle', 'wb') as handle:
    pickle.dump(label_encoder, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Simpan max_seq_len dan responses
model_config = {
    'max_seq_len': max_seq_len,
}
with open('model_config.json', 'w') as f:
    json.dump(model_config, f)

# Simpan responses dictionary
responses_dict = {}
for _, row in df.iterrows():
    tag = row['tag']
    if tag not in responses_dict:
        responses_dict[tag] = row['responses']

with open('responses.json', 'w') as f:
    json.dump(responses_dict, f)

print("Model dan komponen pendukung telah disimpan!")



Model dan komponen pendukung telah disimpan!
