In [1]:
import numpy as np
import pandas as pd
import nltk
import json
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [None]:
# Load JSON data
with open('Data_New.json', encoding='utf-8') as content:
    data1 = json.load(content)
df = pd.DataFrame(data1["intents"])

In [73]:
df

Unnamed: 0,tag,patterns,responses
0,greeting,"[hallo, hai, halo, hei, hi, hy, pagi, morning,...",[Hai! Unibot di sini. Mau tau informasi tentan...
1,Unibot,"[Apa itu Unibot?, Siapa Unibot?]",[Unibot adalah aplikasi chatbot informasi tent...
2,pencipta_Unibot,"[Siapa pembuatmu?, Unibot, yang buat kamu siap...",[UnibotBot diciptakan oleh mahasiswa dan mahas...
3,goodbye,"[Dah, Dadah, Bye, Byee, Good bye, Selamat ting...","[Bye!, Dadahh!, Good bye!, Dahh, semoga harimu..."
4,Kampus_Jakarta_Pusat,"[Kampus yang berada di Jakarta Pusat?, Univers...",[Beberapa universitas di Jakarta Pusat antara ...
...,...,...,...
678,Link_Website_Universitas_Cakrawala,"[Apa website resmi Universitas Cakrawala?, Web...",[www.cakrawala.ac.id]
679,Program_Studi_Universitas_Cakrawala,[Program studi apa saja di Universitas Cakrawa...,"[Bahasa Inggris, Bisnis Digital, Ekonomi Keuan..."
680,Akreditasi_Program_Studi_Universitas_Cakrawala,[Apa akreditasi masing-masing program studi di...,"[-, -, -, -, -, -, -, -]"
681,Latar_Belakang_Universitas_Cakrawala,[Apa latar belakang dari Universitas Cakrawala...,[Tentang Universitas Cakrawala Universitas Cak...


Pemrosessan data

In [5]:
# Tokenisasi
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['patterns'].tolist() + df['responses'].tolist())
total_words= len(tokenizer.word_index) + 1

In [6]:
#mengonversi input dan mengatur respons ke urutan token
input_sequences = tokenizer.texts_to_sequences(df['patterns'])
response_sequences = tokenizer.texts_to_sequences(df['responses'])

In [7]:
#padding bertujuan agar semua memiliki panjang yang sama
max_seq_len = max([len(x) for x in input_sequences + response_sequences])
input_padded = pad_sequences(input_sequences, maxlen=max_seq_len, padding='post')
response_padded = pad_sequences(response_sequences, maxlen=max_seq_len, padding='post')

In [8]:
# melakukan encoding tags maxlen tentukan sesuai tag
#encoding tags
tag_tokenizer = Tokenizer()
tag_tokenizer.fit_on_texts(df['tag'])
tag_sequences = tag_tokenizer.texts_to_sequences(df['tag'])
tag_padded = pad_sequences(tag_sequences, maxlen=1, padding='post')

In [9]:
#membangun model LSTM
model = Sequential([
    Embedding(total_words, 128, input_length=max_seq_len),
    LSTM(128, return_sequences=True),
    LSTM(32),
    Dense(32, activation='relu'),
    Dense(len(tag_tokenizer.word_index) + 1, activation='softmax')
])



In [10]:
model.compile(loss = 'sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary

<bound method Model.summary of <Sequential name=sequential, built=False>>

In [11]:
#melatih model
X = input_padded
y = np.array(tag_padded)

In [12]:
#fitting
model.fit(X,y, epochs= 200, batch_size=2, verbose=1)

Epoch 1/200
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 14ms/step - accuracy: 0.1391 - loss: 4.5415
Epoch 2/200
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 14ms/step - accuracy: 0.1360 - loss: 3.9019
Epoch 3/200
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 14ms/step - accuracy: 0.1477 - loss: 3.8173
Epoch 4/200
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 14ms/step - accuracy: 0.1496 - loss: 3.7765
Epoch 5/200
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 14ms/step - accuracy: 0.1352 - loss: 3.5251
Epoch 6/200
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 14ms/step - accuracy: 0.1244 - loss: 3.3838
Epoch 7/200
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 14ms/step - accuracy: 0.1148 - loss: 3.3200
Epoch 8/200
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 14ms/step - accuracy: 0.1442 - loss: 3.2428
Epoch 9/200
[1m342/342

<keras.src.callbacks.history.History at 0x247e5f517c0>

In [13]:
model.summary()

In [14]:
# untuk megnhasilkan respon
def generate_response(text):
    sequences = tokenizer.texts_to_sequences([text])
    padded =  pad_sequences(sequences,maxlen=max_seq_len, padding='post')
    predicted = model.predict(padded, verbose=0)
    tag = tag_tokenizer.index_word[np.argmax(predicted)]

    response = df[df['tag'] == tag]['responses'].values
    return np.random.choice(response)

In [74]:
model.save('chatbot_model.h5')



In [75]:
# simpan tokenisasi
import pickle

with open ('tokenizer.pickle','wb') as handle:
    pickle.dump(tokenizer,handle,protocol=pickle.HIGHEST_PROTOCOL)
with open ('tag_tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer,handle,protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(input_padded, tag_padded, test_size=0.2, random_state=42)

model = Sequential([
    Embedding(total_words, 128, input_length=max_seq_len),
    LSTM(128, return_sequences=True),
    LSTM(64),
    Dense(64, activation='relu'),
    Dense(len(tag_tokenizer.word_index) + 1, activation='softmax')
])
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Melatih model
model.fit(X_train, y_train, epochs=200, batch_size=2, verbose=1)



Epoch 1/200
[1m273/273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 12ms/step - accuracy: 0.1076 - loss: 4.4872
Epoch 2/200
[1m273/273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - accuracy: 0.1220 - loss: 3.9381
Epoch 3/200
[1m273/273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - accuracy: 0.1056 - loss: 3.8811
Epoch 4/200
[1m273/273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - accuracy: 0.1122 - loss: 3.7053
Epoch 5/200
[1m273/273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - accuracy: 0.1215 - loss: 3.6974
Epoch 6/200
[1m273/273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - accuracy: 0.1389 - loss: 3.3172
Epoch 7/200
[1m273/273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - accuracy: 0.1621 - loss: 3.1787
Epoch 8/200
[1m273/273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - accuracy: 0.1496 - loss: 3.0500
Epoch 9/200
[1m273/273

<keras.src.callbacks.history.History at 0x247f1e43ad0>

In [79]:
from sklearn.metrics import classification_report
# Memprediksi tag untuk data uji
predictions = model.predict(X_test)
predicted_tags = [np.argmax(pred) for pred in predictions]

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 138ms/step


In [95]:
import numpy as np
from sklearn.metrics import classification_report

# Memprediksi tag untuk data uji
predictions = model.predict(X_test)

# Mengonversi prediksi menjadi tag
predicted_tags = [np.argmax(pred) for pred in predictions]

# Menentukan kelas yang ada dalam y_test
unique_classes = np.unique(y_test)

# Menghasilkan laporan klasifikasi dengan nama target sesuai kelas yang ada
report = classification_report(y_test, predicted_tags,
                            target_names=list(tag_tokenizer.word_index.keys()),
                            zero_division=0)
print(report)


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 


ValueError: Number of classes, 51, does not match size of target_names, 134. Try specifying the labels parameter

In [84]:
print("Ukuran y_test_labels:", y_test_labels.shape)
print("Ukuran predicted_labels:", predicted_labels.shape)

Ukuran y_test_labels: (137,)
Ukuran predicted_labels: (18495,)
