<a href="https://colab.research.google.com/github/ayumawaddawarohma/ML_exercise/blob/main/Chatbot_kebudayaan_Indonesia_(BUDI).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Import Library dan Download Package**

Reference 
[code](https://projectgurukul.org/deep-learning-python-chatbot/)

**Retrieval based model : Chatbot_kebudayaan Indonesia (BUDI)**

In [None]:
import json
import nltk
import random
import string
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf
import IPython.display as ipd 
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.models import Model
from keras.utils.vis_utils import plot_model
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout , Activation, Flatten , Conv2D, MaxPooling2D
from tensorflow.keras.optimizers import SGD

# kalau file untuk data exploration nya mau dipisah dengan Modelling harusnya library nya gk sebanyak ini ya

In [None]:
# Fetch the NLTK resources this notebook relies on.
# 'punkt': tokenizer models (nltk.word_tokenize is called when the patterns are read)
nltk.download('punkt') 
# 'wordnet': WordNet data (a WordNetLemmatizer is created in the preprocessing section)
nltk.download('wordnet')
# 'omw-1.4': multilingual WordNet data
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

# **Load Dataset**

File dataset budi.json https://drive.google.com/file/d/1sLJpvSFVOSYKuZR2TufW_YuTTHo5j0gq/view?usp=sharing

In [None]:
from urllib import request
# Raw GitHub URL of the intents dataset (budi.json)
Dataset= 'https://raw.githubusercontent.com/ayumawaddawarohma/ML_exercise/main/Chatbot%20kebudayaan%20Indonesia%20(BUDI)%20_PA_Kel_4/budi.json'
# Save a local copy named budi.json in the current working directory
request.urlretrieve(Dataset, "budi.json")

('budi.json', <http.client.HTTPMessage at 0x7fee4772a490>)

In [None]:
# Parse the local copy saved by the download cell instead of fetching the same URL a second time
df = pd.read_json("budi.json")

In [None]:
tags = [] # tag of every pattern (filled in parallel with `inputs`)
inputs = [] # input sentences, i.e. the patterns
responses = {} # responses keyed by tag
words = [] # word tokens collected from the patterns
classes = [] # classes, i.e. the distinct tags
documents = [] # (token list, tag) pairs
ignore_words = ['?','!'] # special characters to ignore. NOTE(review): the original note said the exclamation mark should be left out because the JSON file contains exclamation marks, yet '!' is listed here — confirm which is intended

In [None]:
# Walk every intent once and register each of its patterns exactly one time.
# The tokenising loop used to be nested inside the pattern loop, so an intent
# with n patterns contributed n*n entries to `words` and `documents`.
for intent in df['intents']:
  tag = intent['tag']
  responses[tag] = intent['responses']
  for pattern in intent['patterns']:
    # Raw sentence and its tag, later turned into the `data` DataFrame
    inputs.append(pattern)
    tags.append(tag)
    # Tokenised sentence for the bag-of-words corpus
    w = nltk.word_tokenize(pattern)
    words.extend(w)
    documents.append((w, tag))
    # add to our classes list
    if tag not in classes:
      classes.append(tag)

In [None]:
# Convert the JSON data into a DataFrame: one row per pattern with its tag
data = pd.DataFrame({"patterns":inputs, "tags":tags})

In [None]:
data

Unnamed: 0,patterns,tags
0,hallo,greetings
1,hai,greetings
2,halo,greetings
3,hei,greetings
4,hi,greetings
...,...,...
3088,Nama Senjata dari Maluku Utara?,senjata_maluku_utara
3089,Nama Senjata asal Maluku Utara?,senjata_maluku_utara
3090,Apa Senjata tradisional Maluku Utara?,senjata_maluku_utara
3091,Apa nama Senjata tradisional Maluku Utara?,senjata_maluku_utara


# **Tahap Data Preprocessing**

In [None]:
# Removing punctuation: drop every punctuation character and lowercase the rest
def _strip_punctuation(text):
    """Return `text` lowercased character by character, without punctuation."""
    return ''.join(ch.lower() for ch in text if ch not in string.punctuation)

data['patterns'] = data['patterns'].apply(_strip_punctuation)

In [None]:
# Lemmatization: lowercase and lemmatize every token that is not an ignored
# character, then keep the distinct results in sorted order
lemmatizer = WordNetLemmatizer()
unique_lemmas = {
    lemmatizer.lemmatize(token.lower())
    for token in words
    if token not in ignore_words
}
words = sorted(unique_lemmas)

print (len(words), "Kata Unik", words)

123 Kata Unik ['aceh', 'adat', 'afternoon', 'alat', 'apa', 'asal', 'babel', 'bai', 'bali', 'bangka', 'banten', 'banyak', 'barat', 'belitung', 'bengkulu', 'berasal', 'bro', 'budi', 'bye', 'byee', 'dadah', 'daerah', 'dah', 'dari', 'di', 'diambil', 'dihasilkan', 'diy', 'dki', 'good', 'gorontalo', 'hai', 'hallo', 'halo', 'hei', 'hi', 'hy', 'informasi', 'istimewa', 'itu', 'jabar', 'jakarta', 'jambi', 'jateng', 'jatim', 'jawa', 'jogja', 'jumpa', 'kalbar', 'kalimantan', 'kalsel', 'kaltara', 'kalteng', 'kaltim', 'kasih', 'kawan', 'kenal', 'kepri', 'kepualauan', 'kepulauan', 'kerajinan', 'khas', 'khast', 'lagu', 'lampung', 'makanan', 'makasih', 'malam', 'maluku', 'malut', 'mana', 'morning', 'musik', 'nama', 'ntb', 'ntt', 'nusa', 'oleh', 'pagi', 'pakaian', 'papua', 'pegunungan', 'provinsi', 'referensi', 'riau', 'riau/kepri', 'rumah', 'saja', 'salam', 'sampai', 'see', 'selamat', 'selatan', 'senjata', 'si', 'siang', 'sore', 'sulawesi', 'sulbar', 'sulsel', 'sulteng', 'sultra', 'sulut', 'sumatera', 

In [None]:
# Sort the tags (duplicates removed)
classes = sorted(set(classes))
print(len(classes), "Label", classes)

278 Label ['Pakaian_Daerah_Bali', 'Pakaian_Daerah_Bangka_Belitung', 'Pakaian_Daerah_Banten', 'Pakaian_Daerah_Bengkulu', 'Pakaian_Daerah_Gorontalo', 'Pakaian_Daerah_Jakarta', 'Pakaian_Daerah_Jambi', 'Pakaian_Daerah_Jawa_Barat', 'Pakaian_Daerah_Jawa_Tengah', 'Pakaian_Daerah_Jawa_Timur', 'Pakaian_Daerah_Kalimantan_Barat', 'Pakaian_Daerah_Kalimantan_Selatan', 'Pakaian_Daerah_Kalimantan_Tengah', 'Pakaian_Daerah_Kalimantan_Timur', 'Pakaian_Daerah_Kalimantan_Utara', 'Pakaian_Daerah_Kepulauan_Riau', 'Pakaian_Daerah_Lampung', 'Pakaian_Daerah_Maluku', 'Pakaian_Daerah_Maluku_Utara', 'Pakaian_Daerah_Nanggroe_Aceh_Darussalam', 'Pakaian_Daerah_Nusa_Tenggara_Barat', 'Pakaian_Daerah_Nusa_Tenggara_Timur', 'Pakaian_Daerah_Papua', 'Pakaian_Daerah_Papua_Barat', 'Pakaian_Daerah_Riau', 'Pakaian_Daerah_Sulawesi_Barat', 'Pakaian_Daerah_Sulawesi_Selatan', 'Pakaian_Daerah_Sulawesi_Tengah', 'Pakaian_Daerah_Sulawesi_Tenggara', 'Pakaian_Daerah_Sulawesi_Utara', 'Pakaian_Daerah_Sumatera_Barat', 'Pakaian_Daerah_Sumat

In [None]:
# Melihat keseluruhan data teks
print (len(documents), "documents")

36555 documents


In [None]:
# Tokenize the data: fit a Keras Tokenizer on the cleaned patterns and convert
# every pattern into a list of integer word ids
tokenizer = Tokenizer(num_words=2000)
tokenizer.fit_on_texts(data['patterns'])
train = tokenizer.texts_to_sequences(data['patterns'])
train

[[80],
 [81],
 [82],
 [88],
 [89],
 [90],
 [91],
 [92],
 [93],
 [94],
 [95],
 [96],
 [80, 79],
 [81, 79],
 [82, 79],
 [97],
 [98],
 [99],
 [100, 101],
 [3, 102, 79],
 [5, 103, 104, 105, 44, 106],
 [26, 107],
 [108],
 [109],
 [83],
 [110],
 [111, 83],
 [112, 113],
 [114, 115],
 [116],
 [117, 84],
 [118],
 [119, 84],
 [85, 86],
 [120],
 [85, 86, 121],
 [3, 15, 5, 41],
 [3, 49, 15, 26, 41],
 [3, 1, 15, 6, 41],
 [3, 1, 15, 5, 9, 41],
 [3, 1, 15, 41],
 [15, 41],
 [15, 6, 41],
 [15, 9, 41],
 [15, 2, 41],
 [15, 26, 41],
 [15, 44, 50, 51, 41],
 [3, 15, 5, 24, 19],
 [3, 49, 15, 26, 24, 19],
 [3, 1, 15, 6, 24, 19],
 [3, 1, 15, 5, 9, 24, 19],
 [3, 1, 15, 24, 19],
 [15, 24, 19],
 [15, 6, 24, 19],
 [15, 9, 24, 19],
 [15, 2, 24, 19],
 [15, 26, 24, 19],
 [15, 44, 50, 51, 24, 19],
 [3, 15, 5, 24, 12],
 [3, 49, 15, 26, 24, 12],
 [3, 1, 15, 6, 24, 12],
 [3, 1, 15, 5, 9, 24, 12],
 [3, 1, 15, 24, 12],
 [15, 24, 12],
 [15, 6, 24, 12],
 [15, 9, 24, 12],
 [15, 2, 24, 12],
 [15, 26, 24, 12],
 [15, 44, 50, 51,

In [None]:
# Apply padding so that all sequences end up with the same length
x_train = pad_sequences(train)
print(x_train)

[[ 0  0  0 ...  0  0 80]
 [ 0  0  0 ...  0  0 81]
 [ 0  0  0 ...  0  0 82]
 ...
 [ 0  0  0 ...  7 30 19]
 [ 0  0  0 ...  7 30 19]
 [ 0  0  0 ...  7 30 19]]


In [None]:
# Encoding the outputs: map every tag string to an integer class id
le = LabelEncoder()
y_train = le.fit_transform(data['tags'])
print(y_train) #Label Encodings

[102 102 102 ... 263 263 263]


In [None]:
# input length: number of columns of the padded sequence matrix
input_shape = x_train.shape[1]
print(input_shape)

10


In [None]:
# labels length: shape tuple of the encoded label vector
input_shape1 = y_train.shape
print(input_shape1)

(3093,)


In [None]:
# Define the vocabulary size: number of entries in the tokenizer's word index
vocabulary = len(tokenizer.word_index)
print("Jumlah kata unik : ", vocabulary)

# output length: number of distinct classes known to the label encoder
output_length = le.classes_.shape[0]
print("output length: ", output_length)

Jumlah kata unik :  123
output length:  278


# **Menyimpan model words dan labels**

In [None]:
# Persist the vocabulary and the label list. The `with` blocks close each file,
# so the pickles are completely written before anything tries to read them.
# NOTE(review): these are absolute /content paths while the testing section loads
# 'words.pkl' / 'labels.pkl' relative to the working directory — confirm they match.
with open('/content/words.pkl','wb') as words_file:
    pickle.dump(words, words_file)
with open('/content/labels.pkl','wb') as labels_file:
    pickle.dump(classes, labels_file)

# **Menyimpan label Encoder dan Tokenizer**

In [None]:
# Persist the fitted label encoder and tokenizer. The `with` blocks close each
# file, so the pickles are completely written to disk.
with open('/content/le.pkl','wb') as encoder_file:
    pickle.dump(le, encoder_file)
with open('/content/tokenizers.pkl','wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

# ***Modelling***

Reference :
1. [Dropout layers](https://towardsdatascience.com/dropout-in-neural-networks-47a162d621d9)
2. [Early stopping - code](https://keras.io/api/callbacks/early_stopping/)
3. [Early stopping - explain](https://towardsdatascience.com/a-practical-introduction-to-early-stopping-in-machine-learning-550ac88bc8fd) & [Early stopping - explain 2](https://machinelearningmastery.com/dropout-regularization-deep-learning-models-keras/)



In [None]:
# Build the training pairs: one [bag-of-words vector, one-hot label] per document
training_data = []
# all-zero template with one slot per label; copied for every document
output = [0]*len(classes)
for tokens, tag in documents:
    # lemmatize the document's tokens the same way the vocabulary was built
    lemmas = [lemmatizer.lemmatize(token.lower()) for token in tokens]

    # 1 where the vocabulary word occurs in this document, 0 elsewhere
    bag_of_words = [1 if vocab_word in lemmas else 0 for vocab_word in words]

    # one-hot label: switch on the slot that belongs to this document's tag
    output_row = list(output)
    output_row[classes.index(tag)] = 1

    training_data.append([bag_of_words, output_row])

In [None]:
# convert training_data to numpy array and shuffle the data:
random.shuffle(training_data)
# Every row pairs a bag-of-words vector with a one-hot label, and the two can
# differ in length, so the array is ragged. dtype=object has to be requested
# explicitly: without it older NumPy emits a deprecation warning and
# NumPy >= 1.24 raises ValueError.
training_data = np.array(training_data, dtype=object)

  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
# Split the pairs into the feature list and the label list
x_train = [pair[0] for pair in training_data]
y_train = [pair[1] for pair in training_data]

In [None]:
len(y_train[1])

278

In [None]:
len(y_train)

36555

In [None]:
len(x_train)

36555

In [None]:
# Stops training as soon as the desired accuracy has been reached.
class myCallback(tf.keras.callbacks.Callback) :
  """Keras callback that ends training once training accuracy hits a target.

  Parameters
  ----------
  target_accuracy : float, default 0.97
      Value of the 'accuracy' entry in the epoch logs at which training stops.
  """

  def __init__(self, target_accuracy=0.97):
    super().__init__()
    self.target_accuracy = target_accuracy

  def on_epoch_end(self, epoch, logs=None):
    # `logs` may be None, and 'accuracy' may be absent from it; comparing None
    # with a float would raise TypeError, so both cases are skipped.
    accuracy = (logs or {}).get('accuracy')
    if accuracy is not None and accuracy >= self.target_accuracy:
      # Report the threshold actually used (the message used to say 95%).
      print('\nReached {:.0%} accuracy so canceling the training !'.format(self.target_accuracy))
      self.model.stop_training = True

callbacks = myCallback()

In [None]:
# Same network as the reference code, declared as a list of layers:
# bag-of-words input -> 512 ReLU -> dropout -> 64 ReLU -> dropout -> softmax over the labels

model = Sequential([
    Dense(512, input_shape=(len(x_train[0]),), activation='relu'),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(len(y_train[0]), activation='softmax'),
])

In [None]:
# # Model with early stopping 
# # create Early stopping --> https://keras.io/api/callbacks/early_stopping/
# from tensorflow.keras.callbacks import EarlyStopping
# early_stopping = EarlyStopping() # If want to change parameters, open the link above 

# # Creating Model:

# model = Sequential()
# model.add(Dense(512, input_shape=(len(x_train[0]),), activation='relu'))
# model.add(Dropout(0.5))
# model.add(Dense(256, activation='relu'))
# model.add(Dropout(0.5))
# model.add(Dense(len(y_train[0]), activation='softmax'))

In [None]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 512)               63488     
                                                                 
 dropout (Dropout)           (None, 512)               0         
                                                                 
 dense_1 (Dense)             (None, 64)                32832     
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 278)               18070     
                                                                 
Total params: 114,390
Trainable params: 114,390
Non-trainable params: 0
_________________________________________________________________


In [None]:
# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
# NOTE(review): `decay` is kept to preserve the training schedule — confirm the
# installed Keras optimizer still accepts it.
sgd_optimizer = SGD(learning_rate=0.01, decay=1e-6, momentum=0.8, nesterov=True)
model.compile(loss='categorical_crossentropy', optimizer=sgd_optimizer, metrics=['accuracy']) #using a Stochastic gradient descent(sgd) optimizer with Nesterov accelerated gradient.

  super(SGD, self).__init__(name, **kwargs)


In [None]:
# # fit the model --> using earlystoping, 
# history = model.fit(np.array(x_train),
#                     np.array(y_train),
#                     epochs=100,
#                     batch_size=8,
#                     validation_split=0.3,
#                     verbose=1,
#                     callbacks=[early_stopping])

In [None]:
# Train for at most 300 epochs in batches of 5, with validation_split=0.3;
# the accuracy callback can end training earlier.
history = model.fit(np.array(x_train),
                    np.array(y_train),
                    epochs=300,
                    batch_size=5,
                    validation_split=0.3,
                    verbose=1,
                    callbacks=[callbacks])

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Reached 95% accuracy so canceling the training !


In [None]:
# Visualise the training accuracy and loss side by side
fig, (ax_accuracy, ax_loss) = plt.subplots(1, 2, figsize=(14, 5))
# Accuracy panel
ax_accuracy.plot(history.history['accuracy'], label='Training Set Accuracy')
ax_accuracy.legend(loc='lower right')
ax_accuracy.set_title('Accuracy')
# Loss panel
ax_loss.plot(history.history['loss'], label='Training Set Loss')
ax_loss.legend(loc='upper right')
ax_loss.set_title('Loss')
plt.show()

In [None]:
def show_final_history(history, metric="accuracy"):
    """Plot training and validation curves for the loss and for one metric.

    Parameters
    ----------
    history : object returned by ``model.fit``
        Its ``epoch`` list and ``history`` dict are read.
    metric : str, default "accuracy"
        Key in ``history.history`` drawn on the right-hand panel. The model in
        this notebook is compiled with ``metrics=['accuracy']``, so "mae" (used
        previously) is not recorded and raised KeyError.

    Raises
    ------
    KeyError
        If "loss" or ``metric`` is missing from ``history.history``.
    """
    # figsize must be passed as a keyword; `figsize(15,5)` was a call to an undefined name.
    fig, ax = plt.subplots(1, 2, figsize=(15, 5))
    panels = (
        ("loss", "LOSS", "Loss"),
        (metric, metric.upper(), metric.capitalize()),
    )
    for axis, (key, title, label) in zip(ax, panels):
        axis.set_title(title)  # the Axes method is set_title, not set_titles
        axis.plot(history.epoch, history.history[key], label="Train " + label)
        validation_key = "val_" + key
        # Validation curves are drawn only when the history recorded them.
        if validation_key in history.history:
            axis.plot(history.epoch, history.history[validation_key], label="Validation " + label)
        axis.set_xlabel("Epoch")
        axis.legend()


In [None]:
show_final_history(history)

In [None]:
# Save the model
# Only the file path is passed: save() treats a second positional argument as
# `overwrite`, so the `history` object given there before was never stored.
model.save('model_budi.h5')

# **Testing Chatbot**

To test the chatbot, save the code below in a file with a `.py` extension and run it locally.

In [None]:
# load model,words list, labels list 
import nltk
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
import pickle
import numpy as np
import json
import random
from keras.models import load_model

# Load the trained model, the intents file, the vocabulary and the label list.
model = load_model('model_budi.h5')
# Context managers close each file after reading; the encoding is explicit so
# the JSON is decoded the same way on every platform.
with open('budi.json', encoding='utf-8') as intents_file:
    intents = json.load(intents_file)
# NOTE(review): unpickling can execute arbitrary code — only load pickle files
# that were produced by this notebook.
with open('words.pkl','rb') as words_file:
    words = pickle.load(words_file)
with open('labels.pkl','rb') as labels_file:
    labels = pickle.load(labels_file)

In [None]:
#  function which will perform text operations and then predict the label
def bank_of_words(s,words, show_details=True):
    """Encode sentence `s` as a 0/1 vector over the vocabulary `words`.

    The sentence is tokenized, lowercased and lemmatized; position i of the
    returned numpy array is 1 when words[i] equals one of the resulting lemmas
    and 0 otherwise. `show_details` is accepted but not used.
    """
    sentence_lemmas = {
        lemmatizer.lemmatize(token.lower()) for token in nltk.word_tokenize(s)
    }
    return np.array([1 if vocab_word in sentence_lemmas else 0 for vocab_word in words])

def predict_label(s, model):
    """Return the intents predicted for sentence `s`, most probable first.

    Only classes whose score is strictly above ERROR_THRESHOLD are kept. Each
    entry is a dict with the label under "intent" and the score, converted to
    a string, under "probability".
    """
    ERROR_THRESHOLD = 0.25
    features = bank_of_words(s, words,show_details=False)
    scores = model.predict(np.array([features]))[0]
    ranked = sorted(
        ((index, score) for index, score in enumerate(scores) if score > ERROR_THRESHOLD),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return [{"intent": labels[index], "probability": str(score)} for index, score in ranked]

In [None]:
# function which will give responses from the list of intents
def Response(ints, intents_json, fallback="Maaf, Budi belum mengerti. Coba tanyakan dengan kata lain ya."):
    """Pick a random response for the top-ranked predicted intent.

    Parameters
    ----------
    ints : list of dict
        Predictions ordered best first; the "intent" key of the first entry is used.
    intents_json : dict
        Its "intents" list holds dicts with a "tag" and a list of "responses".
    fallback : str
        Returned when `ints` is empty or no intent carries the predicted tag.
        Both cases used to crash (IndexError / UnboundLocalError).

    Returns
    -------
    str
        One of the matching intent's responses, or `fallback`.
    """
    # An empty list means there is no prediction to look up.
    if not ints:
        return fallback
    tag = ints[0]['intent']
    for intent in intents_json['intents']:
        if intent['tag'] == tag:
            return random.choice(intent['responses'])
    return fallback

def chatbot_response(msg):
    """Predict the intents of `msg` with the loaded model and return the reply chosen for them."""
    return Response(predict_label(msg, model), intents)

In [None]:
def chat():
    """Run an interactive console chat until the user types 'quit' (any letter case)."""
    print("Start chat with ChatBot of ProjectGurukul")
    # Read a message, stop on the quit keyword, otherwise print the bot's reply.
    while (user_text := input("You: ")).lower() != 'quit':
        print("\n BOT: " + chatbot_response(user_text) + '\n\n')

chat()

Start chat with ChatBot of ProjectGurukul
You: hi

 BOT: Hai! Salam Kenal aku Budi.


You: jakarta

 BOT: Hello, Lagu Daerah dari daerah Jakarta adalah Kicir-Kicir.


You: morning

 BOT: Hai! Salam Kenal aku Budi.


You: Apa itu Budi?

 BOT: Hai! Salam Kenal aku Budi.


You: Apa itu Budi

 BOT: Hai! Salam Kenal aku Budi.


You: Apa itu Budi?

 BOT: Hai! Salam Kenal aku Budi.


You: Apa Saja Kerajinan asal Aceh?

 BOT: Terima kasih yaa telah bertanya....Baiklah akan Budi jawab. Kerajinan asal provinsi Aceh antara lain Kupiah Meuketop, Kupiah Riman, Batik Aceh, Songket Aceh, dan Sulam Kasab.


You: Apa Saja Kerajinan asal Aceh?

 BOT: Terima kasih yaa telah bertanya....Baiklah akan Budi jawab. Kerajinan asal provinsi Aceh antara lain Kupiah Meuketop, Kupiah Riman, Batik Aceh, Songket Aceh, dan Sulam Kasab.


You: Kerajinan Khas Riau?

 BOT: Terima kasih yaa telah bertanya....Baiklah akan Budi jawab. Kerajinan asal provinsi Riau yaitu Tenun Lejo Bengkalis, Songket Indragiri, Anyaman Bambu