In [1]:
import json
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import SGD
import random
import nltk
from nltk.stem import WordNetLemmatizer
import pickle
import os

lemmatizer = WordNetLemmatizer()

# Accumulators filled while walking the intents file:
#   words     - every token seen in any pattern (later lemmatized and de-duplicated)
#   classes   - the distinct intent tags
#   documents - (token list, intent tag) pairs, one per pattern
words = []
classes = []
documents = []
ignore_words = ['?', '!']

# Adjust the data.json path to match its actual location.
# The file is read inside a context manager so the handle is closed right
# away, and it is decoded explicitly as UTF-8 (the encoding JSON is defined
# in) instead of whatever the platform default happens to be.
with open('C:/Users/chady/Downloads/chatbo/model/data.json', encoding='utf-8') as data_handle:
    data_file = data_handle.read()
intents = json.loads(data_file)

for intent in intents['intents']:
    for pattern in intent['patterns']:
        # tokenize each word
        w = nltk.word_tokenize(pattern)
        words.extend(w)
        # add to documents
        documents.append((w, intent['tag']))
        # add to classes
        if intent['tag'] not in classes:
            classes.append(intent['tag'])

# Drop the ignored punctuation tokens, lower-case and lemmatize the rest,
# then keep one sorted copy of each distinct word as the vocabulary.
words = [lemmatizer.lemmatize(w.lower()) for w in words if w not in ignore_words]
words = sorted(set(words))

classes = sorted(set(classes))

print(len(documents), "documents")
print(len(classes), "classes", classes)
print(len(words), "unique lemmatized words", words)

# Create the 'model' directory if it does not exist yet. exist_ok=True makes
# this a single call with no gap between checking and creating.
os.makedirs('C:/Users/chady/Downloads/chatbo/model', exist_ok=True)

# Persist the vocabulary and the class labels. Each file is written inside a
# context manager so it is flushed and closed before anything else reads it.
with open('C:/Users/chady/Downloads/chatbo/model/texts.pkl', 'wb') as texts_file:
    pickle.dump(words, texts_file)
with open('C:/Users/chady/Downloads/chatbo/model/labels.pkl', 'wb') as labels_file:
    pickle.dump(classes, labels_file)

# Build one [bag-of-words, one-hot label] pair per document.
training = []
output_empty = [0] * len(classes)

for doc in documents:
    # Lemmatized, lower-cased tokens of this pattern; a set gives constant-time
    # membership tests while scanning the whole vocabulary below.
    pattern_words = {lemmatizer.lemmatize(word.lower()) for word in doc[0]}

    # 1 where the vocabulary word occurs in the pattern, 0 otherwise.
    bag = [1 if w in pattern_words else 0 for w in words]

    # One-hot row: a copy of the all-zero template with the document's class set.
    output_row = list(output_empty)
    output_row[classes.index(doc[1])] = 1

    training.append([bag, output_row])

random.shuffle(training)

# Extract features and labels into separate arrays, preserving the shuffled order
train_x = np.array([feature for feature, _ in training])
train_y = np.array([label for _, label in training])

print("Training data created")

# Create model - 3 layers. The first layer has 128 neurons, the second has 64,
# and the output layer has one neuron per intent, with softmax so the outputs
# can be read as a distribution over intents.
model = Sequential()
model.add(Dense(128, input_shape=(len(train_x[0]),), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(len(train_y[0]), activation='softmax'))

# Compile model. Stochastic gradient descent with Nesterov accelerated gradient gives good results for this model
# NOTE(review): `decay` is not accepted by every Keras optimizer release -
# confirm it is still honoured by the installed version.
sgd = SGD(learning_rate=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])

# Fit the model
hist = model.fit(train_x, train_y, epochs=200, batch_size=5, verbose=1)

# Save only the model. The training history is not an argument of save();
# passing it positionally would land in the `overwrite` parameter.
model.save('C:/Users/chady/Downloads/chatbo/model/models.h5')

print("model created")


81 documents
34 classes ['beasiswa', 'biaya_kuliah', 'biaya_kuliah_d3', 'biaya_kuliah_d4', 'fasilitas_kampus', 'greeting1', 'greeting2', 'greeting3', 'greeting4', 'greeting5', 'greeting6', 'greeting_malam', 'info_kampus', 'jadwal_kuliah', 'jurusan', 'kegiatan_kampus', 'kontak', 'lokasi', 'misi', 'noanswer', 'pendaftaran', 'perpustakaan', 'prodi_administrasi_bisnis', 'prodi_akuntansi', 'prodi_pariwisata', 'prodi_teknik_elektro', 'prodi_teknik_mesin', 'prodi_teknik_sipil', 'prodi_teknologi_informasi', 'program_studi', 'tidak', 'visi', 'visi_dan_misi', 'ya']
70 unique lemmatized words ['ab', 'administrasi', 'aktivitas', 'akuntansi', 'alamat', 'beasiswa', 'biaya', 'bisnis', 'cara', 'contact', 'd3', 'd4', 'daftar', 'dan', 'elektro', 'event', 'fasilitas', 'ga', 'hai', 'halo', 'harga', 'hello', 'hubungi', 'info', 'informasi', 'iya', 'jadwal', 'jrsn', 'jurusan', 'kampus', 'kegiatan', 'kontak', 'kuliah', 'lembaga', 'lokasi', 'malam', 'malem', 'mendaftar', 'mesin', 'misi', 'ngga', 'nggak', 'no',