<a href="https://colab.research.google.com/github/bryanbayup/petpoint/blob/main/Copy_of_PetpointWithGeneralConver.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Jika Anda menggunakan Google Colab, pastikan untuk meng-clone repository terlebih dahulu
!git clone https://github.com/bryanbayup/petpoint

# Pastikan directory kerja Anda berada di tempat yang benar
import os
os.chdir('/content/petpoint')

Cloning into 'petpoint'...
remote: Enumerating objects: 54, done.[K
remote: Counting objects: 100% (54/54), done.[K
remote: Compressing objects: 100% (46/46), done.[K
remote: Total 54 (delta 10), reused 28 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (54/54), 87.49 KiB | 668.00 KiB/s, done.
Resolving deltas: 100% (10/10), done.


In [2]:
import json
import pandas as pd
import glob
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import random
import itertools
from collections import Counter

In [3]:
# Install library yang diperlukan
!pip install stanza transformers sentencepiece

import stanza
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

# Inisialisasi device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Collecting stanza
  Downloading stanza-1.9.2-py3-none-any.whl.metadata (13 kB)
Collecting emoji (from stanza)
  Downloading emoji-2.14.0-py3-none-any.whl.metadata (5.7 kB)
Downloading stanza-1.9.2-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading emoji-2.14.0-py3-none-any.whl (586 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m586.9/586.9 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji, stanza
Successfully installed emoji-2.14.0 stanza-1.9.2


In [4]:
# List untuk menyimpan pasangan percakapan
conversation_pairs = []

In [5]:
# Fungsi untuk mengekstrak pasangan percakapan dari intents
def extract_conversation_pairs(intents):
    pairs = []
    for intent in intents:
        for utterance in intent['utterances']:
            for answer in intent['answers']:
                pairs.append((utterance, answer))
    return pairs

In [6]:
# Load semua JSON files dari corpus/id/
intents = []
for file in glob.glob('corpus/id/*.json'):
    with open(file, 'r', encoding='utf-8') as f:
        data = json.load(f)
        intents.extend(data)

# Load domain-specific data
with open('kucing_anjing/kucing_anjing.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
    intents.extend(data)

In [7]:
# Ekstrak pasangan percakapan
conversation_pairs.extend(extract_conversation_pairs(intents))

# Membuat DataFrame
df = pd.DataFrame(conversation_pairs, columns=['input', 'response'])

In [8]:
# Load normalization dictionary
normalization_dict = {}
with open('normalization/normalization.txt', 'r', encoding='utf-8') as f:
    for line in f:
        # Check if the line contains the delimiter before splitting
        if '\t' in line:
            slang, normal = line.strip().split('\t')
            normalization_dict[slang] = normal
        else:
            print(f"Warning: Skipping line '{line.strip()}', no tab delimiter found.")



In [9]:
# Function to normalize text
def normalize_text(text):
    words = text.split()
    normalized_words = [normalization_dict.get(word, word) for word in words]
    return ' '.join(normalized_words)

# Load stopwords
with open('normalization/stopword.txt', 'r', encoding='utf-8') as f:
    stopwords = set(f.read().splitlines())

# Function to remove stopwords
def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word not in stopwords]
    return ' '.join(filtered_words)

In [10]:
# Initialize the Indonesian pipeline
stanza.download('id')  # Jika sudah pernah di-download, bisa dilewatkan
nlp = stanza.Pipeline('id', processors='tokenize,pos,lemma', use_gpu=True)

def tokenize_and_lemmatize(text):
    doc = nlp(text)
    lemmas = [word.lemma for sentence in doc.sentences for word in sentence.words]
    return ' '.join(lemmas)

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: id (Indonesian) ...


Downloading https://huggingface.co/stanfordnlp/stanza-id/resolve/v1.9.0/models/default.zip:   0%|          | 0…

INFO:stanza:Downloaded file to /root/stanza_resources/id/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: id (Indonesian):
| Processor | Package      |
----------------------------
| tokenize  | gsd          |
| mwt       | gsd          |
| pos       | gsd_charlm   |
| lemma     | gsd_nocharlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
  checkpoint = torch.load(filename, lambda storage, loc: storage)
INFO:stanza:Loading: mwt
  checkpoint = torch.load(filename, lambda storage, loc: storage)
INFO:stanza:Loading: pos
  checkpoint = torch.load(filename, lambda storage, loc: storage)
  data = torch.load(self.filename, lambda storage, loc: storage)
  state = torch.load(filename, lambda storage, loc: storage)
INFO:stanza:Loading: lemma
  checkpoint = torch.load(filename, lambda storage, loc: storage)
INFO:stanza:Done loading processors!


In [11]:
# Function for preprocessing
def preprocess_text(text):
    # Normalisasi
    text = normalize_text(text.lower())
    # Hapus stopwords
    text = remove_stopwords(text)
    # Tokenisasi dan Lematisasi
    text = tokenize_and_lemmatize(text)
    return text

# Terapkan preprocessing
df['input'] = df['input'].apply(preprocess_text)
df['response'] = df['response'].apply(preprocess_text)

In [12]:
# Fungsi untuk membangun vocabulary dengan token khusus yang benar
def build_vocab(sentences, max_vocab_size=5000):
    word_counts = Counter(itertools.chain(*[s.split() for s in sentences]))
    most_common = word_counts.most_common(max_vocab_size)
    idx2word = ['<PAD>', '<SOS>', '<EOS>', '<UNK>'] + [word for word, _ in most_common]
    word2idx = {word: idx for idx, word in enumerate(idx2word)}
    return word2idx, idx2word

In [13]:
# Bangun vocabulary untuk input dan output
input_word2idx, input_idx2word = build_vocab(df['input'])
output_word2idx, output_idx2word = build_vocab(df['response'])

def sentence_to_indices(sentence, word2idx):
    indices = [word2idx.get(word, word2idx['<UNK>']) for word in sentence.split()]
    return indices

In [14]:
# Menambahkan indeks ke DataFrame
df['input_indices'] = df['input'].apply(lambda x: sentence_to_indices(x, input_word2idx))
df['response_indices'] = df['response'].apply(lambda x: [output_word2idx['<SOS>']] + sentence_to_indices(x, output_word2idx) + [output_word2idx['<EOS>']])

In [15]:
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

class ChatDataset(Dataset):
    def __init__(self, inputs, outputs):
        self.inputs = inputs
        self.outputs = outputs

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return torch.tensor(self.inputs[idx], dtype=torch.long), torch.tensor(self.outputs[idx], dtype=torch.long)

In [16]:
# Training Dataset dan DataLoader
train_dataset = ChatDataset(train_df['input_indices'].tolist(), train_df['response_indices'].tolist())
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=lambda x: x)

In [17]:
# Validation Dataset dan DataLoader
val_dataset = ChatDataset(val_df['input_indices'].tolist(), val_df['response_indices'].tolist())
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=lambda x: x)

In [18]:
# Menggunakan pre-trained embeddings Bahasa Indonesia
import gensim.downloader as api

# Load pre-trained FastText embeddings Bahasa Indonesia
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.id.300.vec.gz
!gzip -d cc.id.300.vec.gz

--2024-11-04 23:43:54--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.id.300.vec.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 13.226.210.78, 13.226.210.25, 13.226.210.111, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|13.226.210.78|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1227018698 (1.1G) [binary/octet-stream]
Saving to: ‘cc.id.300.vec.gz’


2024-11-04 23:44:29 (33.2 MB/s) - ‘cc.id.300.vec.gz’ saved [1227018698/1227018698]



In [19]:
from gensim.models import KeyedVectors
fasttext_model = KeyedVectors.load_word2vec_format('cc.id.300.vec')

# Membuat embedding matrix
embedding_dim = 300
vocab_size = len(input_word2idx)
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, idx in input_word2idx.items():
    try:
        embedding_vector = fasttext_model[word]
        embedding_matrix[idx] = embedding_vector
    except KeyError:
        embedding_matrix[idx] = np.random.normal(scale=0.05, size=(embedding_dim, ))

In [20]:
# Definisikan Encoder dan Decoder dengan embedding size yang sesuai
class Encoder(nn.Module):
    def __init__(self, input_size, embed_size, hidden_size, num_layers=1):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(input_size, embed_size)
        self.embedding.weight.data.copy_(torch.from_numpy(embedding_matrix))
        self.embedding.weight.requires_grad = False  # Optional: Freeze embeddings
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)

    def forward(self, x):
        # x: [batch_size, seq_length]
        embedding = self.embedding(x)
        outputs, (hidden, cell) = self.lstm(embedding)
        return hidden, cell

class Decoder(nn.Module):
    def __init__(self, output_size, embed_size, hidden_size, num_layers=1):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(output_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden, cell):
        # x: [batch_size]
        x = x.unsqueeze(1)  # [batch_size, 1]
        embedding = self.embedding(x)
        outputs, (hidden, cell) = self.lstm(embedding, (hidden, cell))
        predictions = self.fc(outputs.squeeze(1))
        return predictions, hidden, cell

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, output_word2idx):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.output_word2idx = output_word2idx

    def forward(self, source, target, teacher_forcing_ratio=0.5):
        batch_size = source.size(0)
        target_len = target.size(1)
        target_vocab_size = len(self.output_word2idx)

        outputs = torch.zeros(batch_size, target_len, target_vocab_size).to(device)
        hidden, cell = self.encoder(source)

        x = target[:, 0]

        for t in range(1, target_len):
            output, hidden, cell = self.decoder(x, hidden, cell)
            outputs[:, t] = output
            best_guess = output.argmax(1)
            x = target[:, t] if random.random() < teacher_forcing_ratio else best_guess
        return outputs

In [21]:
# Hyperparameters
input_size_encoder = len(input_word2idx)
input_size_decoder = len(output_word2idx)
output_size = len(output_word2idx)
embed_size = 300  # Sesuaikan dengan embedding_dim
hidden_size = 512
num_layers = 1
learning_rate = 0.001
num_epochs = 30

encoder_net = Encoder(input_size_encoder, embed_size, hidden_size, num_layers).to(device)
decoder_net = Decoder(output_size, embed_size, hidden_size, num_layers).to(device)

model = Seq2Seq(encoder_net, decoder_net, output_word2idx).to(device)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss(ignore_index=output_word2idx['<PAD>'])

In [22]:
# Training dan Evaluasi Model
for epoch in range(num_epochs):
    print(f'Epoch [{epoch+1}/{num_epochs}]')
    model.train()
    total_loss = 0
    for batch_idx, batch in enumerate(train_dataloader):
        # Ambil batch data
        inputs, targets = zip(*batch)
        inputs = nn.utils.rnn.pad_sequence(inputs, batch_first=True, padding_value=input_word2idx['<PAD>']).to(device)
        targets = nn.utils.rnn.pad_sequence(targets, batch_first=True, padding_value=output_word2idx['<PAD>']).to(device)

        # Forward pass
        outputs = model(inputs, targets)

        # Reshape untuk menghitung loss
        outputs = outputs[:, 1:].reshape(-1, outputs.shape[2])
        targets = targets[:, 1:].reshape(-1)

        optimizer.zero_grad()
        loss = criterion(outputs, targets)
        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        optimizer.step()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch+1}, Training Loss: {avg_train_loss:.4f}')

    # Evaluasi pada Validation Set
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch_idx, batch in enumerate(val_dataloader):
            inputs, targets = zip(*batch)
            inputs = nn.utils.rnn.pad_sequence(inputs, batch_first=True, padding_value=input_word2idx['<PAD>']).to(device)
            targets = nn.utils.rnn.pad_sequence(targets, batch_first=True, padding_value=output_word2idx['<PAD>']).to(device)

            outputs = model(inputs, targets, teacher_forcing_ratio=0)  # No teacher forcing during evaluation

            outputs = outputs[:, 1:].reshape(-1, outputs.shape[2])
            targets = targets[:, 1:].reshape(-1)

            loss = criterion(outputs, targets)
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_dataloader)
    print(f'Epoch {epoch+1}, Validation Loss: {avg_val_loss:.4f}')

Epoch [1/30]
Epoch 1, Training Loss: 5.0312
Epoch 1, Validation Loss: 4.6763
Epoch [2/30]
Epoch 2, Training Loss: 4.3469
Epoch 2, Validation Loss: 4.4721
Epoch [3/30]
Epoch 3, Training Loss: 3.9947
Epoch 3, Validation Loss: 4.3352
Epoch [4/30]
Epoch 4, Training Loss: 3.6622
Epoch 4, Validation Loss: 4.1236
Epoch [5/30]
Epoch 5, Training Loss: 3.2777
Epoch 5, Validation Loss: 4.2169
Epoch [6/30]
Epoch 6, Training Loss: 3.0520
Epoch 6, Validation Loss: 4.2864
Epoch [7/30]
Epoch 7, Training Loss: 2.7159
Epoch 7, Validation Loss: 4.2221
Epoch [8/30]
Epoch 8, Training Loss: 2.4963
Epoch 8, Validation Loss: 4.2899
Epoch [9/30]
Epoch 9, Training Loss: 2.2851
Epoch 9, Validation Loss: 4.6126
Epoch [10/30]
Epoch 10, Training Loss: 2.0246
Epoch 10, Validation Loss: 4.5047
Epoch [11/30]
Epoch 11, Training Loss: 1.8574
Epoch 11, Validation Loss: 4.9426
Epoch [12/30]
Epoch 12, Training Loss: 1.8588
Epoch 12, Validation Loss: 4.6702
Epoch [13/30]
Epoch 13, Training Loss: 1.8639
Epoch 13, Validation 

In [38]:
# Load sentiment scores
sentiment_scores = {}

# Load positive sentiment words
with open('sentiment/tsv/positive.tsv', 'r', encoding='utf-8') as f:
    for line in f:
        word, score = line.strip().split('\t')
        sentiment_scores[word] = int(score)

# Load negative sentiment words
with open('sentiment/tsv/negative.tsv', 'r', encoding='utf-8') as f:
    for line in f:
        word, score = line.strip().split('\t')
        sentiment_scores[word] = int(score)

def analyze_sentiment(text):
    words = text.split()
    score = 0
    for word in words:
        score += sentiment_scores.get(word, 0)
    return score

def adjust_response_by_sentiment(response, sentiment_score):
    if sentiment_score < 0:
        response = "Maaf mendengarnya. " + response
    elif sentiment_score > 0:
        response = "Senang mendengarnya! " + response
    return response

def extract_entities(text):
    doc = nlp(text)
    entities = [(ent.text, ent.type) for ent in doc.ents]
    return entities

def generate_template_response(entities):
    if entities:
        for entity, ent_type in entities:
            if ent_type == 'PER':
                return f"Halo {entity}, apa kabar?"
            elif ent_type == 'ORG':
                return f"Apa kabar dengan {entity}?"
    return None

In [40]:
# Fungsi Utama Chatbot
def chatbot_response(user_input):
    # Analisis sentimen
    sentiment_score = analyze_sentiment(user_input)

    # Ekstrak entitas
    entities = extract_entities(user_input)

    # Coba generate response berbasis template
    response = generate_template_response(entities)
    if response is None:
        # Jika tidak ada entitas, gunakan model
        response = translate_sentence(model, user_input, input_word2idx, output_idx2word)

    # Sesuaikan respon berdasarkan sentimen
    response = adjust_response_by_sentiment(response, sentiment_score)

    return response

In [43]:
# Load Pre-trained Model
tokenizer = MBart50TokenizerFast.from_pretrained('facebook/mbart-large-50-many-to-many-mmt')
model_transformer = MBartForConditionalGeneration.from_pretrained('facebook/mbart-large-50-many-to-many-mmt').to(device)
tokenizer.src_lang = "id_ID"
tokenizer.tgt_lang = "id_ID"

def generate_transformer_response(input_text):
    # Preprocessing
    input_text = preprocess_text(input_text)
    # Tokenisasi
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)
    # Generate
    output_ids = model_transformer.generate(input_ids, num_beams=5, max_length=50, early_stopping=True)
    # Decode
    response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return response

# Modifikasi fungsi chatbot_response untuk menggunakan Transformer
def chatbot_response_transformer(user_input):
    # Analisis sentimen
    sentiment_score = analyze_sentiment(user_input)

    # Ekstrak entitas
    entities = extract_entities(user_input)

    # Coba generate response berbasis template
    response = generate_template_response(entities)
    if response is None:
        # Jika tidak ada entitas, gunakan model Transformer
        response = generate_transformer_response(user_input)

    # Sesuaikan respon berdasarkan sentimen
    response = adjust_response_by_sentiment(response, sentiment_score)

    return response

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

In [45]:
# Penggunaan Chatbot
print("Selamat datang di Chatbot! Ketik 'exit' untuk keluar.")
while True:
    user_input = input("Anda: ")
    if user_input.lower() == 'exit':
        print("Chatbot: Sampai jumpa!")
        break
    bot_response = chatbot_response(user_input)
    print(f"Chatbot: {bot_response}")

Selamat datang di Chatbot! Ketik 'exit' untuk keluar.
Anda: hai
Chatbot: a <sos>a , sikat kaya nutrisi rancang khusus anjing hamil .
Anda: siapa kamu


RuntimeError: Expected sequence length to be larger than 0 in RNN