<a href="https://colab.research.google.com/github/bryanbayup/chatbot_project/blob/main/finalisasi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
# Instalasi Library Tambahan
!pip install gensim
!pip install keras-tuner --upgrade
!pip install imbalanced-learn
!pip install Sastrawi
!pip install sentencepiece
!pip install seqeval
!pip install rapidfuzz



In [9]:
# Import Libraries
import json
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dense, Input, Dropout, Bidirectional, LSTM, Conv1D, GlobalMaxPooling1D, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import RandomOverSampler
import matplotlib.pyplot as plt
import re
import pickle
import os
import random
import nltk
import gensim
from gensim.models import KeyedVectors
from keras_tuner import HyperModel, RandomSearch
from tensorflow.keras.callbacks import EarlyStopping
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import sentencepiece as spm
from nltk.corpus import stopwords
from seqeval.metrics import classification_report as seq_classification_report
import ipywidgets as widgets
from IPython.display import display, clear_output
from rapidfuzz import process, fuzz

# Download NLTK stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
# # Download FastText
# !wget -O id.tar.gz "https://www.dropbox.com/scl/fi/sju4o3keikox69euw51vy/id.tar.gz?dl=1"
# !tar -xzf id.tar.gz

# Load FastText
try:
    fasttext_model = KeyedVectors.load_word2vec_format('id.vec', binary=False)
    print("Model FastText 'id.vec' berhasil dimuat.")
except Exception as e:
    print(f"Gagal memuat 'id.vec': {e}")
    raise ValueError("Gagal memuat model FastText. Pastikan file 'id.vec' dalam format yang benar.")

Model FastText 'id.vec' berhasil dimuat.


In [11]:
# Load dataset from JSON file
with open('dataaa.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Convert dataset to DataFrame
df = pd.DataFrame(data)

# Ensure consistent column naming
df.rename(columns={'utterance': 'utterances', 'response': 'responses'}, inplace=True)

In [12]:
# Encode intents
label_encoder = LabelEncoder()
df['intent_label'] = label_encoder.fit_transform(df['intent'])

# Save intent mapping
intent_mapping = dict(zip(df['intent_label'], df['intent']))

# Handle imbalanced classes with oversampling
ros = RandomOverSampler(random_state=42)
X = df.index.values.reshape(-1, 1)
y = df['intent_label']
X_ros, y_ros = ros.fit_resample(X, y)

# Buat DataFrame baru dengan data yang telah di-oversample
df_balanced = df.loc[X_ros.flatten()].reset_index(drop=True)
df_balanced['intent_label'] = y_ros
df_balanced['intent'] = label_encoder.inverse_transform(df_balanced['intent_label'])

In [13]:
# Text cleaning function
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    return text

# Load custom stopwords
with open('stopword_list_tala.txt', 'r', encoding='utf-8') as f:
    stop_words = f.read().splitlines()
stop_words = set(stop_words)

# Stemming using Sastrawi
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Daftar kata yang dikenal dari data
all_text = ' '.join(df_balanced['utterances'])
tokenizer_vocab = set(all_text.split())

def correct_typo(text):
    tokens = text.split()
    corrected_tokens = []
    for token in tokens:
        if token not in tokenizer_vocab:
            matches = process.extractOne(token, tokenizer_vocab)
            if matches and matches[1] > 80:  # Threshold kecocokan
                corrected_tokens.append(matches[0])
            else:
                corrected_tokens.append(token)
        else:
            corrected_tokens.append(token)
    return ' '.join(corrected_tokens)

def preprocess_text(text):
    text = clean_text(text)
    text = correct_typo(text)
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    text = ' '.join(tokens)
    text = stemmer.stem(text)
    return text

# Apply preprocessing
df_balanced['utterances_clean'] = df_balanced['utterances'].apply(preprocess_text)

In [14]:
# Prepare texts and labels
texts = df_balanced['utterances_clean'].tolist()
labels = df_balanced['intent_label'].tolist()

# Split data for Intent Classification
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.1,
    random_state=42,
    stratify=labels
)

# Tokenization
tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='')
tokenizer.fit_on_texts(train_texts)
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1

# Convert texts to sequences
train_sequences = tokenizer.texts_to_sequences(train_texts)
val_sequences = tokenizer.texts_to_sequences(val_texts)

# Padding sequences
max_seq_length = max(max(len(seq) for seq in train_sequences), max(len(seq) for seq in val_sequences))
train_padded = pad_sequences(train_sequences, maxlen=max_seq_length, padding='post')
val_padded = pad_sequences(val_sequences, maxlen=max_seq_length, padding='post')

# Convert labels to categorical
num_classes = len(label_encoder.classes_)
train_labels_cat = to_categorical(train_labels, num_classes=num_classes)
val_labels_cat = to_categorical(val_labels, num_classes=num_classes)

# Create embedding matrix using FastText
embedding_dim = fasttext_model.vector_size
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, idx in word_index.items():
    if word in fasttext_model:
        embedding_matrix[idx] = fasttext_model[word]
    else:
        embedding_matrix[idx] = np.random.normal(scale=0.6, size=(embedding_dim,))

In [15]:
# Filter data with entities
df_ner = df[df['entities'].map(lambda d: len(d)) > 0].reset_index(drop=True)
df_ner['utterances_clean'] = df_ner['utterances'].apply(preprocess_text)

def prepare_ner_data(df, tokenizer, max_seq_length):
    texts = []
    labels = []
    for index, row in df.iterrows():
        text = row['utterances_clean']
        entities = row['entities']
        tokens = tokenizer.texts_to_sequences([text])[0]
        label_seq = ['O'] * len(tokens)
        for ent in entities:
            ent_text = preprocess_text(ent['value'])
            ent_tokens = tokenizer.texts_to_sequences([ent_text])[0]
            ent_len = len(ent_tokens)
            for i in range(len(tokens) - ent_len + 1):
                if tokens[i:i+ent_len] == ent_tokens:
                    label_seq[i] = 'B-' + ent['entity']
                    for j in range(1, ent_len):
                        label_seq[i+j] = 'I-' + ent['entity']
                    break
        texts.append(tokens)
        labels.append(label_seq)
    # Padding
    texts_padded = pad_sequences(texts, maxlen=max_seq_length, padding='post')
    # Padding labels
    labels_padded = [label + ['O']*(max_seq_length - len(label)) for label in labels]
    return texts_padded, labels_padded

# Create label encoder for NER
all_labels = set()
for label_list in df_ner['entities']:
    for ent in label_list:
        all_labels.add('B-' + ent['entity'])
        all_labels.add('I-' + ent['entity'])
all_labels.add('O')
ner_label_encoder = {label: idx for idx, label in enumerate(sorted(all_labels))}
ner_label_decoder = {idx: label for label, idx in ner_label_encoder.items()}

# Prepare NER data
texts_ner, labels_ner = prepare_ner_data(df_ner, tokenizer, max_seq_length)

# Convert labels to numerical and categorical format
def encode_ner_labels(labels, ner_label_encoder):
    labels_encoded = []
    for label_seq in labels:
        label_ids = [ner_label_encoder[label] for label in label_seq]
        labels_encoded.append(label_ids)
    labels_encoded = np.array(labels_encoded)
    labels_encoded = to_categorical(labels_encoded, num_classes=len(ner_label_encoder))
    return labels_encoded

labels_ner_encoded = encode_ner_labels(labels_ner, ner_label_encoder)

# Split data for NER
train_texts_ner, val_texts_ner, train_labels_ner, val_labels_ner = train_test_split(
    texts_ner,
    labels_ner_encoded,
    test_size=0.1,
    random_state=42,
)

In [16]:
def build_intent_model_with_cnn(embedding_matrix, max_seq_length, num_classes, l2_reg=0.001):
    inputs = Input(shape=(max_seq_length,))
    embedding = tf.keras.layers.Embedding(
        input_dim=embedding_matrix.shape[0],
        output_dim=embedding_matrix.shape[1],
        weights=[embedding_matrix],
        input_length=max_seq_length,
        trainable=True
    )(inputs)
    conv = Conv1D(filters=128, kernel_size=3, activation='relu')(embedding)
    global_pool = GlobalMaxPooling1D()(conv)
    dense = Dense(64, activation='relu', kernel_regularizer=l2(l2_reg))(global_pool)
    dropout = Dropout(0.5)(dense)
    outputs = Dense(num_classes, activation='softmax')(dropout)
    model = Model(inputs=inputs, outputs=outputs)
    return model

model_intent_cnn = build_intent_model_with_cnn(embedding_matrix, max_seq_length, num_classes, l2_reg=0.001)
model_intent_cnn.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model_intent_cnn.summary()

# Early stopping
callbacks_intent = [
    EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
]

# Train the model
history_intent_cnn = model_intent_cnn.fit(
    train_padded,
    train_labels_cat,
    validation_data=(val_padded, val_labels_cat),
    epochs=20,
    batch_size=16,
    callbacks=callbacks_intent
)

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 22)]              0         
                                                                 
 embedding (Embedding)       (None, 22, 300)           167700    
                                                                 
 conv1d (Conv1D)             (None, 20, 128)           115328    
                                                                 
 global_max_pooling1d (Glob  (None, 128)               0         
 alMaxPooling1D)                                                 
                                                                 
 dense (Dense)               (None, 64)                8256      
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                             

In [17]:
class NERHyperModel(HyperModel):
    def __init__(self, embedding_matrix, max_seq_length, num_entities):
        self.embedding_matrix = embedding_matrix
        self.max_seq_length = max_seq_length
        self.num_entities = num_entities

    def build(self, hp):
        l2_reg = hp.Choice('l2_reg', values=[1e-4, 1e-3, 1e-2])
        dropout_rate = hp.Float('dropout_rate', 0.3, 0.7, step=0.1)
        lstm_units = hp.Int('lstm_units', min_value=32, max_value=128, step=32)

        inputs = Input(shape=(self.max_seq_length,))
        embedding = tf.keras.layers.Embedding(
            input_dim=self.embedding_matrix.shape[0],
            output_dim=self.embedding_matrix.shape[1],
            weights=[self.embedding_matrix],
            input_length=self.max_seq_length,
            trainable=True
        )(inputs)
        lstm = Bidirectional(LSTM(lstm_units, kernel_regularizer=l2(l2_reg), return_sequences=True))(embedding)
        dropout = Dropout(dropout_rate)(lstm)
        outputs = TimeDistributed(Dense(self.num_entities, activation='softmax'))(dropout)
        model = Model(inputs=inputs, outputs=outputs)
        model.compile(
            optimizer='adam',
            loss='categorical_crossentropy',
            metrics=['accuracy']
        )
        return model

# Initialize HyperModel
ner_hypermodel = NERHyperModel(embedding_matrix, max_seq_length, len(ner_label_encoder))

# Initialize RandomSearch
tuner_ner = RandomSearch(
    ner_hypermodel,
    objective='val_accuracy',
    max_trials=10,
    executions_per_trial=1,
    directory='ner_tuner_dir',
    project_name='ner_tuning'
)

# Hyperparameter search for NER
tuner_ner.search(
    train_texts_ner,
    train_labels_ner,
    epochs=10,
    validation_data=(val_texts_ner, val_labels_ner),
    callbacks=[
        EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    ]
)

# Get the best model for NER
best_model_ner = tuner_ner.get_best_models(num_models=1)[0]
best_hp_ner = tuner_ner.get_best_hyperparameters(num_trials=1)[0]
print(f"Best Hyperparameters for NER: {best_hp_ner.values}")

Trial 10 Complete [00h 00m 22s]
val_accuracy: 0.9424715638160706

Best val_accuracy So Far: 0.9438920617103577
Total elapsed time: 00h 02m 34s
Best Hyperparameters for NER: {'l2_reg': 0.0001, 'dropout_rate': 0.3, 'lstm_units': 96}


In [18]:
# Evaluasi Model Intent Classification
loss_intent, accuracy_intent = model_intent_cnn.evaluate(val_padded, val_labels_cat)
print(f'Akurasi Model Klasifikasi Intent: {accuracy_intent * 100:.2f}%')

# Evaluasi Model NER
loss_ner, accuracy_ner = best_model_ner.evaluate(val_texts_ner, val_labels_ner)
print(f'Akurasi Model NER: {accuracy_ner * 100:.2f}%')

Akurasi Model Klasifikasi Intent: 96.62%
Akurasi Model NER: 94.39%


In [19]:
# Create directories if not exist
os.makedirs('models', exist_ok=True)
os.makedirs('encoders', exist_ok=True)
os.makedirs('data', exist_ok=True)

# Save intent model
model_intent_cnn.save('models/model_intent.keras')

# Save NER model
best_model_ner.save('models/model_ner.keras')

# Save tokenizer
with open('encoders/tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Save label encoder
with open('encoders/label_encoder.pickle', 'wb') as handle:
    pickle.dump(label_encoder, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Save NER label encoder
with open('encoders/ner_label_encoder.pickle', 'wb') as handle:
    pickle.dump(ner_label_encoder, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [20]:
# Create DataFrame for utterances and responses
df_utterances = df_balanced[['utterances', 'responses', 'intent']].reset_index(drop=True)
df_utterances['utterances_clean'] = df_utterances['utterances'].apply(preprocess_text)

# Create TF-IDF Vectorizer
vectorizer = TfidfVectorizer()
vectorizer.fit(df_utterances['utterances_clean'])

# Save vectorizer
with open('data/vectorizer.pickle', 'wb') as handle:
    pickle.dump(vectorizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [21]:
def predict_intent(text):
    text_clean = preprocess_text(text)
    seq = tokenizer.texts_to_sequences([text_clean])
    padded_seq = pad_sequences(seq, maxlen=max_seq_length, padding='post')
    pred = model_intent_cnn.predict(padded_seq)
    predicted_label = np.argmax(pred, axis=1)[0]
    intent = label_encoder.inverse_transform([predicted_label])[0]
    return intent

def predict_entities(text):
    text_clean = preprocess_text(text)
    seq = tokenizer.texts_to_sequences([text_clean])
    padded_seq = pad_sequences(seq, maxlen=max_seq_length, padding='post')
    pred = best_model_ner.predict(padded_seq)
    pred_labels = np.argmax(pred, axis=-1)[0]
    tokens = tokenizer.sequences_to_texts(seq)[0].split()
    entities = []
    for idx, label_id in enumerate(pred_labels[:len(tokens)]):
        label = ner_label_decoder[label_id]
        if label != 'O':
            entities.append({'entity': label.split('-')[1], 'value': tokens[idx]})
    return entities

In [22]:
# Mapping antara intent dan hewan terkait
intent_animal_mapping = {
    'medical_inquiry_dog': 'anjing',
    'medical_inquiry_cat': 'kucing',
    'symptom_analysis_dog': 'anjing',
    'symptom_analysis_cat': 'kucing',
    'disease_prevention_dog': 'anjing',
    'disease_prevention_cat': 'kucing',
    'dog_healthcare': 'anjing',
    'cat_healthcare': 'kucing',
    'animal_health_issue': ['anjing', 'kucing'],
    # Tambahkan mapping untuk intent lain yang relevan
}

# Fungsi untuk menyesuaikan intent berdasarkan entitas
def adjust_intent(intent, entities):
    # Dapatkan hewan dari intent yang diprediksi
    predicted_animal = intent_animal_mapping.get(intent, None)

    # Ekstrak entitas 'animal' dari input pengguna
    entity_animals = [ent['value'].lower() for ent in entities if ent['entity'] == 'animal']

    # Jika ada entitas 'animal' dalam input
    if entity_animals:
        user_animal = entity_animals[0]
        # Jika hewan dari intent tidak sesuai dengan entitas 'animal' pengguna, sesuaikan intentnya
        if predicted_animal:
            if isinstance(predicted_animal, list):
                if user_animal not in predicted_animal:
                    intent = None
            else:
                if predicted_animal != user_animal:
                    # Cari intent lain yang sesuai
                    for intent_name, animal in intent_animal_mapping.items():
                        if animal == user_animal and intent_name != intent:
                            intent = intent_name
                            break
                    else:
                        intent = None
    else:
        # Jika tidak ada entitas 'animal', dan intent membutuhkan hewan tertentu, set intent ke None
        if intent in intent_animal_mapping:
            intent = None
    return intent

# Fungsi untuk mendapatkan respon berdasarkan intent yang disesuaikan
def get_response(user_input, intent=None, entities=None):
    user_input_clean = preprocess_text(user_input)
    print(f"Input yang Dipreproses: {user_input_clean}")

    if intent:
        # Filter dataset berdasarkan intent yang disesuaikan
        df_intent = df_utterances[df_utterances['intent'] == intent]
        if df_intent.empty:
            print("Intent tidak ditemukan dalam dataset.")
            return get_default_response()
        else:
            # Vectorize ulang utterances yang difilter
            tfidf_matrix_intent = vectorizer.transform(df_intent['utterances_clean'])
            user_tfidf = vectorizer.transform([user_input_clean])
            similarities = cosine_similarity(user_tfidf, tfidf_matrix_intent)
            most_similar_idx = np.argmax(similarities[0])
            highest_similarity = similarities[0][most_similar_idx]
            print(f"Kemiripan Tertinggi: {highest_similarity}")
            if highest_similarity < 0.2:  # Sesuaikan threshold sesuai kebutuhan
                print("Kemiripan di bawah threshold.")
                return get_default_response()
            else:
                # Ambil respon yang sesuai dengan utterance paling mirip
                response = df_intent.iloc[most_similar_idx]['responses']
                print(f"Respon yang Dipilih: {response}")
                return response
    else:
        print("Intent tidak tersedia.")
        return get_default_response()

In [23]:
def chatbot_response(user_input):
    # Prediksi intent dan entitas
    intent = predict_intent(user_input)
    entities = predict_entities(user_input)

    # Sesuaikan intent berdasarkan entitas
    adjusted_intent = adjust_intent(intent, entities)

    print(f"Intent yang Diprediksi: {intent}")
    print(f"Entitas yang Diekstrak: {entities}")
    print(f"Intent yang Disesuaikan: {adjusted_intent}")

    # Jika intent setelah disesuaikan adalah None, berikan respon default
    if adjusted_intent is None:
        response = get_default_response()
    else:
        # Dapatkan respon berdasarkan intent yang disesuaikan
        response = get_response(user_input, adjusted_intent, entities)
        # Jika tidak ada respon yang ditemukan, gunakan respon default
        if not response:
            response = get_default_response()
    return response

In [24]:
# Fungsi untuk mendapatkan respon default
def get_default_response():
    default_responses = [
        "Maaf, saya belum bisa menjawab pertanyaan Anda.",
        "Maaf, mohon diperjelas apa yang Anda maksud.",
        "Maaf, saya hanya diprogram untuk menjawab pertanyaan mengenai kucing dan anjing.",
        "Mohon maaf, saya tidak mengerti. Bisa dijelaskan lebih detail?",
        "Saya belum memiliki informasi mengenai hal tersebut."
    ]
    return random.choice(default_responses)

In [31]:
# Contoh Pengujian
test_inputs = [
    "hai",
    "halo selamat pagi",
    "kucing saya muntah dan diare",
    "anjing saya matanya bengkak",
    "apa itu toxoplasmosis",
    "AI itu apa",
    "anjing tetangga suka menggonggong",
    "Saya melihat seekor anjing tua tanpa kalung di kompleks. Apa yang harus saya lakukan?",
    "Saya menemukan kucing dengan mata tertutup kotoran di depan pasar. Apa yang harus saya lakukan?",
    "Apakah saya bisa membawa anjing ke dalam kereta api jarak jauh? Jika ya, apa yang harus disiapkan?",
    "Saya ingin membawa kucing saya dalam perjalanan ke luar kota menggunakan pesawat. Apa saja yang perlu dipersiapkan?",
    "Kucing saya seperti atlet parkour, suka melompat ke rak dapur dan menjatuhkan barang. Bagaimana saya bisa menghentikannya?",
    "Saya melihat kucing di gang kecil yang terus mondar-mandir dengan ekspresi kebingungan",
    "Ada seekor anjing besar yang tampak sakit di depan kantor saya",
    "pasar itu apa",
    "apa yang dimaksud dengan analisa fundamental",
    "terimakasih",
    "makasih ya"
]

for input_text in test_inputs:
    print(f"Anda: {input_text}")
    response = chatbot_response(input_text)
    print(f"Chatbot: {response}\n")

Anda: hai
Intent yang Diprediksi: intro_chat
Entitas yang Diekstrak: [{'entity': 'disease', 'value': 'hati'}]
Intent yang Disesuaikan: intro_chat
Input yang Dipreproses: hati
Kemiripan Tertinggi: 1.0
Respon yang Dipilih: Hai! Apa yang ingin Anda tanyakan?
Chatbot: Hai! Apa yang ingin Anda tanyakan?

Anda: halo selamat pagi
Intent yang Diprediksi: intro_chat
Entitas yang Diekstrak: []
Intent yang Disesuaikan: intro_chat
Input yang Dipreproses: pagi
Kemiripan Tertinggi: 1.0
Respon yang Dipilih: Selamat pagi! Bagaimana kabar Anda hari ini?
Chatbot: Selamat pagi! Bagaimana kabar Anda hari ini?

Anda: kucing saya muntah dan diare
Intent yang Diprediksi: cat_healthcare
Entitas yang Diekstrak: [{'entity': 'animal', 'value': 'kucing'}, {'entity': 'symptom', 'value': 'muntah'}, {'entity': 'symptom', 'value': 'diare'}]
Intent yang Disesuaikan: cat_healthcare
Input yang Dipreproses: kucing muntah diare
Kemiripan Tertinggi: 0.5629355754261958
Respon yang Dipilih: Jika muntah hanya sekali dan kucin