In [2]:
# prepare dataset
import json
import pandas as pd

with open("data/intents.json") as data_file:
    data = json.load(data_file)

In [9]:
text_input = []
intents = []

for intent in data['intents']:
    for pattern in intent['patterns']:
        text_input.append(pattern)
        intents.append(intent['tag'])

df = pd.DataFrame({'text_input': text_input,
                    'intents': intents})

df.head()

Unnamed: 0,text_input,intents
0,Hai,salam
1,Hi,salam
2,Halo,salam
3,Apa Kabar,salam
4,Selamat Pagi,salam


In [10]:
df.intents.value_counts()

salam                 12
bye                    8
nama                   7
ngodingpython_typo     7
komunitas              7
pekerjaan              5
ngodingpython          5
youtube                5
kemampuan              5
Name: intents, dtype: int64

In [11]:
# data cleansing
import string

# convert lowercase
df.text_input = df.text_input.apply(lambda x: x.lower())

# remove punctuation
exclude = set(string.punctuation)
df.text_input = df.text_input.apply(lambda x: ''.join(ch for ch in x if ch not in exclude))

In [13]:
# label encoding
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

le = LabelEncoder()
y_train = le.fit_transform(df.intents)
y_train = to_categorical(y_train)

In [20]:
all_vocab = []
length = []

for idx, row in df.iterrows():
    sent = row['text_input']
    [all_vocab.append(i) for i in sent.split()]
    length.append(len(sent.split()))

In [19]:
len(all_vocab)

166

In [21]:
max(length)

6

In [18]:
len(set(all_vocab))

86

In [23]:
from tensorflow.keras.layers import TextVectorization

max_vocab_length = 86
max_length = 6

text_vectorization = TextVectorization(max_tokens=max_vocab_length,
                                       standardize='lower_and_strip_punctuation',
                                       split='whitespace',
                                       ngrams=None,
                                       output_mode='int',
                                       output_sequence_length=max_length
                                       )


In [24]:
text_vectorization.adapt(df.text_input)

In [26]:
text_vectorization.get_vocabulary()

['',
 '[UNK]',
 'apa',
 'bisa',
 'emang',
 'ngoding',
 'lo',
 'kamu',
 'tuh',
 'selamat',
 'ngapain',
 'lu',
 'sih',
 'siapa',
 'nya',
 'ngodingpython',
 'ngodingpithon',
 'nama',
 'ga',
 'ya',
 'wah',
 'tugas',
 'si',
 'sape',
 'python',
 'ok',
 'mau',
 'kodingan',
 'kemana',
 'kalau',
 'dimana',
 'dah',
 'ada',
 'youtube',
 'yang',
 'wui',
 'woy',
 'urlnya',
 'tinggal',
 'tanyatanya',
 'siang',
 'semoga',
 'sampai',
 'salam',
 'saha',
 'piton',
 'ping',
 'phyton',
 'pekerjaan',
 'pagi',
 'paan',
 'p',
 'ngodingpiton',
 'ngodingphython',
 'nanya',
 'minta',
 'menyenangkan',
 'mana',
 'malam',
 'makasih',
 'linknya',
 'link',
 'lakukan',
 'lagi',
 'komunitasnya',
 'kerja',
 'kemampuan',
 'kabar',
 'jumpa',
 'hi',
 'harimu',
 'halo',
 'hai',
 'grupnya',
 'grup',
 'eta',
 'dong',
 'diliat',
 'dadah',
 'daah',
 'channel',
 'bye',
 'bantuin',
 'bantu',
 'apasih',
 'apaan']

In [28]:
text_vectorization('halo nama kamu siapa')

<tf.Tensor: shape=(6,), dtype=int64, numpy=array([71, 17,  7, 13,  0,  0])>

In [33]:
text_vectorization.get_vocabulary()[0]

''

In [34]:
from tensorflow.keras.layers import Embedding
embedding = Embedding(input_dim=max_vocab_length,
                      output_dim=16,
                      embeddings_initializer="uniform",
                      input_length=max_length)

In [40]:
import numpy as np
res_embed = embedding(np.array([[71, 17,  7, 13,  0,  0]]))
res_embed

<tf.Tensor: shape=(1, 6, 16), dtype=float32, numpy=
array([[[-0.0148086 , -0.02316591, -0.0318022 , -0.04923364,
         -0.01750685,  0.03336764,  0.03463038,  0.0322554 ,
         -0.00094187,  0.02602686, -0.04140023, -0.04352396,
         -0.00929301,  0.03723199,  0.03281195, -0.03474935],
        [ 0.04710625, -0.00286432,  0.0388199 ,  0.00017405,
         -0.01137855, -0.02908495, -0.02423978, -0.02978312,
          0.01146308, -0.01361241,  0.04300782, -0.02926438,
          0.0360395 , -0.04072239,  0.03278048,  0.04314229],
        [-0.01441199, -0.04212294,  0.01821918, -0.0077691 ,
          0.0293701 , -0.0288891 ,  0.02955926,  0.0297921 ,
         -0.04988505,  0.00222541, -0.02723522, -0.01464278,
          0.04548428, -0.03990283,  0.00988837,  0.03327673],
        [-0.02177813, -0.04225926,  0.02909348, -0.01372105,
         -0.02806585,  0.01055714, -0.03106246, -0.01567892,
          0.04908412, -0.00063174,  0.0429321 ,  0.01953385,
          0.0150297 , -0.00761

In [42]:
# modelling
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Dense, LSTM
inputs = Input(shape=(1,), dtype='string')
x = text_vectorization(inputs)
x = embedding(x)
x = LSTM(12)(x)
outputs = Dense(9, activation='softmax')(x)
model_lstm = Model(inputs, outputs, name="LSTM_model")

In [43]:
# compile model
model_lstm.compile(loss='categorical_crossentropy',
                   optimizer='adam',
                   metrics=["accuracy"])

In [44]:
model_lstm.fit(df.text_input,
               y_train,
               epochs=200,
               verbose=0)

<keras.callbacks.History at 0x7fbc845d1fd0>

In [45]:
model_lstm.evaluate(df.text_input, y_train)



[0.1274145096540451, 1.0]

In [46]:
model_lstm.save("bot_model.tf")

2021-12-04 04:38:26.924363: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: bot_model.tf/assets


INFO:tensorflow:Assets written to: bot_model.tf/assets


In [47]:
import pickle
le_filename = open("label_encoder.pickle", "wb")
pickle.dump(le, le_filename)
le_filename.close()