<a href="https://colab.research.google.com/github/aladino24/artificial_intelligence_orbit/blob/main/Chatbot_Kampus_Merdeka.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Download Dataset

In [None]:
!wget https://raw.githubusercontent.com/fendy07/chatbot-AI/master/kampus_merdeka.json

--2023-10-09 03:15:08--  https://raw.githubusercontent.com/fendy07/chatbot-AI/master/kampus_merdeka.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 16366 (16K) [text/plain]
Saving to: ‘kampus_merdeka.json’


2023-10-09 03:15:08 (29.2 MB/s) - ‘kampus_merdeka.json’ saved [16366/16366]



In [None]:
# Import Libraries
import json
import nltk
import random
import string
import numpy as np
import pandas as pd
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.models import Model
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Input, Embedding, LSTM, Flatten, Dense, GlobalMaxPool1D
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Download Packages

In [None]:
# NLTK resources fetched by this cell:
#   punkt   - package for tokenizing data
#   wordnet - package for lemmatization
for resource in ('punkt', 'wordnet'):
  nltk.download(resource)

# omw-1.4 - package for multilingual wordnet. Kept as the cell's last
# expression so its return value is still displayed as the cell output.
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

# 1. Read Data

In [None]:
# Load the intents dataset (the file fetched by the download cell at the top
# of the notebook). The encoding is explicit so non-ASCII text is decoded the
# same way on every platform.
with open('kampus_merdeka.json', encoding='utf-8') as content:
  data1 = json.load(content)

# Containers filled while walking the intents
tags = []                  # tag of each pattern, aligned index-by-index with `inputs`
inputs = []                # every pattern sentence
responses = {}             # tag -> list of candidate responses
words = []                 # tokens of every pattern
classes = []               # not populated in this cell
documents = []             # not populated in this cell
ignore_words = ['?', '!']  # not used in this cell


for intent in data1['intents']:
  # Collect the responses of this intent
  responses[intent['tag']] = intent['responses']

  # Collect every pattern together with its tag and its tokens
  for pattern in intent['patterns']:
    inputs.append(pattern)
    tags.append(intent['tag'])

    # Tokenize each pattern exactly once. Looping over the patterns again at
    # this level would add every token len(patterns) times.
    words.extend(nltk.word_tokenize(pattern))


# Convert the collected patterns and tags into a DataFrame
data = pd.DataFrame({"patterns": inputs, "tags": tags})

In [None]:
# Display the rows at positions 15 through 19
data.iloc[15:20]

Unnamed: 0,patterns,tags
15,Apa itu KadekBot?,kadekbot
16,Siapa KadekBot?,kadekbot
17,Siapa pembuatmu?,pencipta_kadekbot
18,"Kadek, yang buat kamu siapa sih?",pencipta_kadekbot
19,"Siapa penciptamu, Kadek?",pencipta_kadekbot


# 2. Data Preprocessing

## Removing Punctuation

In [None]:
# Character punctuation: display the characters that the following cell
# filters out of the patterns
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [None]:
# Lowercase every pattern and drop punctuation characters.
# Each pattern is rebuilt in a single pass: characters found in
# string.punctuation are skipped, the remaining ones are lowercased and joined.
data['patterns'] = data['patterns'].apply(
    lambda text: ''.join(ch.lower() for ch in text if ch not in string.punctuation)
)

# Display the result
data

## Tokenizer

In [None]:
# Tokenize the data; num_words caps the vocabulary size used when encoding
tokenizer = Tokenizer(num_words=2000)

# Build the word index from the cleaned patterns
tokenizer.fit_on_texts(data['patterns'])
# Convert every pattern into its sequence of word indices
tokenized_data = tokenizer.texts_to_sequences(data['patterns'])

# Show patterns 15 through 19 before tokenization
print('Sebelum dilakukan tokenisasi:\n')
print(data['patterns'][15 : 20])

# Show the tokenized form of the same patterns
print('\n\nSesudah dilakukan tokenisasi:\n')
print(tokenized_data[15 : 20])

## Padding

In [None]:
# Pad the tokenized sequences to a common length so they form one matrix
# that can be fed to the model
x_train = pad_sequences(tokenized_data)

# Show the padded form of patterns 15 through 19
print('Hasil padding:\n')
print(x_train[15:20])

Hasil padding:

[[ 0  0  0  0  0  0  0  1  6 32]
 [ 0  0  0  0  0  0  0  0 29 32]
 [ 0  0  0  0  0  0  0  0 29 55]
 [ 0  0  0  0 33 21 56 57 29 58]
 [ 0  0  0  0  0  0  0 29 59 33]]


## Encoding Label

In [None]:
# Encoding the label: map every tag string to an integer class id
le = LabelEncoder()

# Fit the encoder on the tags column and encode it in one step
y_train = le.fit_transform(data['tags'])

# Show the labels before encoding
print('Label sebelum di-encoding:\n')
print(data['tags'])

# Show the labels after encoding
print('\n\nLabel setelah di-encoding:\n')
print(y_train)

Label sebelum di-encoding:

0                         greeting
1                         greeting
2                         greeting
3                         greeting
4                         greeting
                  ...             
75                 penjelasan_PMMB
76               persyaratan_IISMA
77    periode_dan_pendaftaran_PMMB
78                    manfaat_PMMB
79                      mitra_PMMB
Name: tags, Length: 80, dtype: object


Label setelah di-encoding:

[ 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  2  2 14 14 14  0  0  0  0
  0  0  0  0  0 38 38 38 38 38 18 39 32 37 37 37 37 37 37 37 37 37 37 37
 22 36 29  9 13 19 33 26  6 12 17 31 25  5 15 30 23  3 10 21 35 28  8 20
 34 27  7 16 30 24  4 11]


# 3. Modelling and Evaluation

## Defining Input and Output

In [None]:
# Derive the three sizes the model needs before reporting them:
#   input_shape   - length of each padded sequence (second dimension of x_train)
#   vocabulary    - number of entries in the tokenizer's word index
#   output_length - number of distinct classes known to the label encoder
input_shape = np.shape(x_train)[1]
vocabulary = len(tokenizer.word_index)
output_length = len(le.classes_)

# Report the sizes
print('input shape:', input_shape)
print("number of unique words : ", vocabulary)
print("output length: ", output_length)

input shape: 10
number of unique words :  77
output length:  40


## Build Model

In [None]:
# Input layer: one value per position of the padded sequence.
# Named `input_layer` rather than `input` so that the Python builtin input(),
# which the testing cell uses to read the user's message, is not shadowed.
input_layer = Input(shape=(input_shape,))

# Hidden layers: embed the token ids, run an LSTM that returns the full
# sequence, then flatten it for the dense classifier.
# vocabulary+1 leaves room for index 0, which the padded sequences use as filler.
layer = Embedding(vocabulary+1,10)(input_layer)
layer = LSTM(10, return_sequences=True)(layer)
layer = Flatten()(layer)

# Output layer: one softmax unit per class
layer = Dense(output_length, activation="softmax")(layer)

# Tie the input and the output tensors together into a Model
model  = Model(inputs=input_layer, outputs=layer)

# Compiling the model; the sparse loss matches the integer-encoded labels
model.compile(loss="sparse_categorical_crossentropy", optimizer='adam', metrics=['accuracy'])

## Training
Data yang dibutuhkan untuk training adalah data hasil preprocessing dan data label hasil encoding

In [None]:
# Train the model on the padded sequences and the encoded labels for 200 epochs.
# The returned History object is kept in `train` for the evaluation plots.
train = model.fit(x_train, y_train, epochs=200)

## Evaluation

In [None]:
# The notebook's import cell does not bring in `plt`, so import it here;
# without this the first plt call raises NameError.
import matplotlib.pyplot as plt

# One figure with two side-by-side panels: accuracy (left) and loss (right)
fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(14, 5))

# Accuracy plot
ax_acc.plot(train.history['accuracy'], label='Training Set Accuracy')
ax_acc.legend(loc='lower right')
ax_acc.set_title('Accuracy')
ax_acc.set_xlabel('Epoch')

# Loss plot
ax_loss.plot(train.history['loss'], label='Training Set Loss')
ax_loss.legend(loc='upper right')
ax_loss.set_title('Loss')
ax_loss.set_xlabel('Epoch')

plt.show()

# 4. Testing

In [None]:
import builtins

# Interactive chat loop: read a message, predict its tag, print one of the
# responses stored for that tag, and stop once the predicted tag is "terimakasih".
while True:
  # builtins.input is called explicitly so the prompt still works if the
  # name `input` has been rebound in the notebook namespace.
  raw_text = builtins.input('Kamu : ')

  # Remove punctuation and convert to lowercase
  cleaned_text = ''.join(ch.lower() for ch in raw_text if ch not in string.punctuation)
  texts_p = [cleaned_text]

  # Tokenize and pad to the sequence length the model was built with
  prediction_input = pad_sequences(tokenizer.texts_to_sequences(texts_p), maxlen=input_shape)

  # Get the index of the highest-scoring class; verbose=0 keeps Keras'
  # progress bar out of the conversation
  output = model.predict(prediction_input, verbose=0)
  output = output.argmax()

  # Map the class index back to its tag and print a random response for it
  response_tag = le.inverse_transform([output])[0]
  print("Kadekbot : ", random.choice(responses[response_tag]))
  print("="*60 + "\n")

  if response_tag == "terimakasih":
    break

Kamu : selamat pagi
Dedecorins :  Halo! Saya Kadekbot, salam kenal ya! Mau tau tentang kampus merdeka, kan?

greeting
Kamu : apa itu kadekbot?
Dedecorins :  KadekBot itu adalah teman informasi kamu tentang Kampus Merdeka!

kadekbot
Kamu : apa itu kampus merdeka?
Dedecorins :  Kampus Merdeka merupakan kebijakan Menteri Pendidikan dan Kebudayaan Nadiem Makarim yang membebaskan mahasiswa untuk mengikuti kegiatan di luar program studinya selama 1 semester atau setara 20 sks dan paling lama 2 semester atau setara 40 sks.

penjelasan_kampus_merdeka
Kamu : bagaimana cara mengikuti kampus merdeka
Dedecorins :  Mau tau tentang apa nih?

program_kampus_merdeka
Kamu : cara mendaftar kampus merdeka
Dedecorins :  Mau tau tentang apa nih?

program_kampus_merdeka
Kamu : terima kasih
Dedecorins :  Dengan senang hati :)

terimakasih
